Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatFeatureAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/RepeatFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,511 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor + +=head1 SYNOPSIS + + $rfa = $database_adaptor->get_RepeatFeatureAdaptor(); + + my $repeat = $rfa->fetch_by_dbID(1234); + my @repeats = @{ $rfa->fetch_all_by_Slice($slice) }; + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of RepeatFeature +objects from the database. Most of the implementation is in the +superclass BaseFeatureAdaptor. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::RepeatFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/; + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : (optional) string $logic_name + Limits RepeatFeatures obtained to those having an Analysis with + of the specified logic_name. If no logic name is specified + Repeats of all analysis types are retrieved. + Arg [3] : (optional) string/array $repeat_type + Limits RepeatFeatures obtained to those of specified + repeat_type + Example : @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, 'Type II Transposons')}; + @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, ['Type II Transposons', 'RNA repeats'])}; + Description: Retrieves repeat features overlapping the area designated by + the provided slice argument. Returned features will be in + in the same coordinate system as the provided slice and will + have coordinates relative to the slice start. + Returntype : reference to a list of Bio::EnsEMBL::RepeatFeatures. + Exceptions : throw on bad argument + Caller : Slice::get_all_RepeatFeatures + Status : Stable + +=cut + +sub fetch_all_by_Slice { + my $self = shift; + my $slice = shift; + my $logic_name = shift; + my $repeat_type = shift; + + my $constraint = ''; + + # MySQL was optimising the query the incorrect way when joining to + # the repeat_consensus table on type + $self->_straight_join(1); + + if($repeat_type) { + my $rta = wrap_array($repeat_type); + if(scalar(@{$rta}) > 1) { + $constraint .= sprintf('rc.repeat_type IN (%s)', join(q{,}, map {"'${_}'"} @{$rta})); + } + else { + $constraint .= "rc.repeat_type = '${repeat_type}'"; + } + } + + my $result = + $self->fetch_all_by_Slice_constraint($slice,$constraint,$logic_name); + + + $self->_straight_join(0); + + return $result; +} + + +# _tablename +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the tables to query +# Returntype : string +# Exceptions : none +# Caller : internal + + +sub _tables { + my $self = shift; + + return (['repeat_feature', 'r'], ['repeat_consensus', 'rc']); +} + + +# _columns +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the columns to query +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + return qw (r.repeat_feature_id + r.seq_region_id + r.seq_region_start + r.seq_region_end + r.seq_region_strand + r.repeat_consensus_id + r.repeat_start + r.repeat_end + r.analysis_id + r.score + rc.repeat_name + rc.repeat_class + rc.repeat_type + rc.repeat_consensus); +} + + +# _default_where_clause +# Arg [1] : none +# Example : none +# Description: Overrides superclass method to provide an additional +# table joining constraint before the SQL query is performed. +# Returntype : string +# Exceptions : none +# Caller : generic_fetch +# + +sub _default_where_clause { + my $self = shift; + + return 'r.repeat_consensus_id = rc.repeat_consensus_id'; +} + + + +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of RepeatFeatures from a +# hashref generated from an SQL query + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $rca = $self->db()->get_RepeatConsensusAdaptor(); + my $sa = $self->db()->get_SliceAdaptor(); + my $aa = $self->db->get_AnalysisAdaptor(); + + my @features; + my %rc_hash; + my %analysis_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($repeat_feature_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $repeat_consensus_id, $repeat_start, $repeat_end, + $analysis_id, $score, $repeat_name, $repeat_class, $repeat_type, + $repeat_consensus); + + $sth->bind_columns( \$repeat_feature_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, + \$repeat_consensus_id, \$repeat_start,\$repeat_end, + \$analysis_id, \$score, \$repeat_name, \$repeat_class, + \$repeat_type, \$repeat_consensus ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + FEATURE: while($sth->fetch()) { + #create a repeat consensus object + + my $rc = $rc_hash{$repeat_consensus_id} ||= + Bio::EnsEMBL::RepeatConsensus->new_fast + ({'dbID' => $repeat_consensus_id, + 'adaptor' => $rca, + 'name' => $repeat_name, + 'repeat_class' => $repeat_class, + 'repeat_type' => $repeat_type, + 'repeat_consensus' => $repeat_consensus, + 'length' => length($repeat_consensus)}); + + #get the analysis object + my $analysis = $analysis_hash{$analysis_id} ||= + $aa->fetch_by_dbID($analysis_id); + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if(!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if($mapper) { + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start, + $seq_region_end, $seq_region_strand ) + = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + next FEATURE if(!defined($seq_region_id)); + + #get a slice in the coord system we just mapped to +# if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if($dest_slice) { + if($dest_slice_start != 1 || $dest_slice_strand != 1) { + if($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + + #throw away features off the end of the requested slice + if($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + next FEATURE; + } + $slice = $dest_slice; + } + + # Finally, create the new RepeatFeature. + push( @features, + $self->_create_feature_fast( 'Bio::EnsEMBL::RepeatFeature', { + 'dbID' => $repeat_feature_id, + 'analysis' => $analysis, + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'score' => $score, + 'hstart' => $repeat_start, + 'hend' => $repeat_end, + 'repeat_consensus' => $rc, + 'adaptor' => $self, + 'slice' => $slice + } ) ); + + } + + return \@features; +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::RepeatFeatures $repeat_feature_id + the list of repeat features to store in the database + Example : $repeat_feature_adaptor->store(@repeat_features); + Description: stores a repeat feature in the database + Returntype : none + Exceptions : if the repeat features do not have attached sequences + or if repeat_consensus are not present + Caller : general + Status : Stable + +=cut + +sub store { + my( $self, @repeats ) = @_; + + my $db = $self->db(); + my $rca = $db->get_RepeatConsensusAdaptor(); + my $sa = $db->get_SliceAdaptor(); + my ($cons, $db_id); + + my $sth = $self->prepare(qq{ + INSERT into repeat_feature( repeat_feature_id + , seq_region_id + , seq_region_start + , seq_region_end + , seq_region_strand + , repeat_consensus_id + , repeat_start + , repeat_end + , score + , analysis_id ) + VALUES(NULL, ?,?,?,?,?,?,?,?,?) + }); + + FEATURE: foreach my $rf (@repeats) { + if(!ref($rf) || !$rf->isa('Bio::EnsEMBL::RepeatFeature')) { + throw('Expected RepeatFeature argument not [' . ref($rf) .'].'); + } + + if($rf->is_stored($db)) { + warning("RepeatFeature [".$rf->dbID."] is already stored in this DB."); + next FEATURE; + } + + my $cons = $rf->repeat_consensus(); + throw("Must have a RepeatConsensus attached") if(!defined($cons)); + + # for tandem repeats - simply store consensus and repeat + # one pair per hit. don't need to check consensi stored + # already. consensus has name and class set to 'trf' + + if ($cons->repeat_class eq 'trf') { + + # Look for matches already stored + my @match = @{$rca->fetch_all_by_class_seq('trf', $cons->repeat_consensus)}; + if (@match) { + $cons->dbID($match[0]->dbID()); + } + else { + $rca->store($cons); + } + + } elsif ($cons->repeat_class eq 'Simple_repeat') { + + my $rcon = $cons->name; + $rcon =~ s/\((\S+)\)n/$1/; # get repeat element + $cons->repeat_consensus($rcon); + + # Look for matches already stored + my $match = $rca->fetch_by_name_class($cons->name, 'Simple_repeat'); + if ($match) { + $cons->dbID($match->dbID()); + } + else { + $rca->store($cons); + } + + } else { + + # for other repeats - need to see if a consensus is stored already + if(!$cons->dbID) { + my $match = ($rca->fetch_by_name($cons->name)); + + if($match) { + #set the consensus dbID to be the same as the database one + $cons->dbID($match->dbID()); + } else { + # if we don't match a consensus already stored create a fake one + # and set consensus to 'N' as null seq not allowed + # FIXME: not happy with this, but ho hum ... + warning("Can't find " . $cons->name . "\n"); + $cons->repeat_consensus("N"); + $rca->store($cons); + } + } + + #if (@match > 1) { + #multiple consensi were matched + # $self->warn(@match . " consensi for " . $cons->name . "\n"); + #} + } + + my $slice = $rf->slice(); + if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw("RepeatFeature cannot be stored without an associated slice."); + } + + my $original = $rf; + my $seq_region_id; + ($rf, $seq_region_id) = $self->_pre_store($rf); + + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$rf->start,SQL_INTEGER); + $sth->bind_param(3,$rf->end,SQL_INTEGER); + $sth->bind_param(4,$rf->strand,SQL_TINYINT); + $sth->bind_param(5,$rf->repeat_consensus->dbID,SQL_INTEGER); + $sth->bind_param(6,$rf->hstart,SQL_INTEGER); + $sth->bind_param(7,$rf->hend,SQL_INTEGER); + $sth->bind_param(8,$rf->score,SQL_DOUBLE); + $sth->bind_param(9,$rf->analysis->dbID,SQL_INTEGER); + + $sth->execute(); + + my $db_id = $sth->{'mysql_insertid'} + or throw("Didn't get an insertid from the INSERT statement"); + + $original->dbID($db_id); + $original->adaptor($self); + } +} + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$repeat_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all repeat features in the current db + Arg[1] : <optional> int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self, $ordered) = @_; + + return $self->_list_dbIDs("repeat_feature", undef, $ordered); +} + +1; + + + + +
