Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,648 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor + +=head1 SYNOPSIS + + $mfa = $database_adaptor->get_MiscFeatureAdaptor(); + + # retrieve a misc feature by its dbID + my $misc_feat = $mfa->fetch_by_dbID(1234); + + # retrieve all misc features in a given region + my @misc_feats = @{ $mfa->fetch_all_by_Slice($slice) }; + + # retrieve all misc features in a given region with a given set code + my @misc_clones = + @{ $mfa->fetch_all_by_Slice_and_set_code('cloneset') }; + + # store some misc features in the database + $mfa->store(@misc_features); + +=head1 DESCRIPTION + +This is an adaptor for the retrieval and storage of MiscFeatures. +Misc Features are extremely generic features that can be added with +minimal effort to the database. Currently misc features are used to +describe the locations of clone sets and tiling path information, +but arbitrary features can be stored. Misc features are grouped +into sets and can be fetched according to their grouping using the +fetch_all_by_Slice_and_set_code and fetch_all_by_set_code methods. +MiscFeatures may belong to more than one set. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::MiscFeatureAdaptor; + +use strict; +use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; +use Bio::EnsEMBL::MiscFeature; +use Bio::EnsEMBL::Attribute; +use Bio::EnsEMBL::MiscSet; +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + +use vars qw(@ISA); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor); + + + +=head2 fetch_all_by_Slice_and_set_code + + Arg [1] : Bio::EnsEMBL::Slice $slice + A slice representing the region to fetch from + Arg [2...] : string $set_code + The code of the set to retrieve features from + Example : @feats = @{$mfa->fetch_all_by_Slice_and_set_code('cloneset')}; + Description: Retrieves a set of MiscFeatures which have a particular set code + and which lie in a particular region. All features with the + provide set code and which overlap the given slice are returned. + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : throw if set_code is not provided + warning if no set for provided set code exists + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_Slice_and_set_code { + my $self = shift; + my $slice = shift; + + throw('Set code argument is required.') unless @_; + + my $msa = $self->db->get_MiscSetAdaptor(); + my @sets = (); + my $max_len = 0; + foreach my $set_code (@_) { + my $set = $msa->fetch_by_code($set_code); + if($set) { + $max_len = $set->longest_feature if $set->longest_feature > $max_len; + push @sets, $set->dbID; + } else { + warning("No misc_set with code [$set_code] exists"); + } + } + my $constraint; + if( @sets > 1 ) { + $constraint = " mfms.misc_set_id in ( @{[join ',',@sets]} ) "; + } elsif( @sets == 1 ) { + $constraint = " mfms.misc_set_id = $sets[0] "; + } else { + return []; + } + + $self->_max_feature_length($max_len); + + my $results = $self->fetch_all_by_Slice_constraint($slice, $constraint); + + $self->_max_feature_length(undef); + + return $results; +} + + + +=head2 fetch_all_by_attribute_type_value + + Arg [1] : string $attrib_type_code + The code of the attribute type to fetch features for + Arg [2] : (optional) string $attrib_value + The value of the attribute to fetch features for + Example : + #get all misc features that have an embl accession + @feats = @{$mfa->fetch_all_by_attrib_type_value('embl_acc')}; + #get the misc feature with synonym 'AL014121' + ($feat)=@{$mfa->fetch_all_by_attrib_type_value('synonym','AL014121'); + Description: Retrieves MiscFeatures which have a particular attribute. + If the attribute value argument is also provided only + features which have the attribute AND a particular value + are returned. The features are returned in their native + coordinate system (i.e. the coordinate system that they + are stored in). + Returntype : listref of Bio::EnsEMBL::MiscFeatures + Exceptions : throw if attrib_type code arg is not provided + Caller : general + Status : Stable + +=cut + +sub fetch_all_by_attribute_type_value { + my $self = shift; + my $attrib_type_code = shift; + my $attrib_value = shift; + + throw("Attrib type code argument is required.") + if ( !$attrib_type_code ); + + # Need to do 2 queries so that all of the ids come back with the + # features. The problem with adding attrib constraints to filter the + # misc_features which come back is that not all of the attributes will + # come back + + my $sql = qq( + SELECT DISTINCT + ma.misc_feature_id + FROM misc_attrib ma, + attrib_type at, + misc_feature mf, + seq_region sr, + coord_system cs + WHERE ma.attrib_type_id = at.attrib_type_id + AND at.code = ? + AND ma.misc_feature_id = mf.misc_feature_id + AND mf.seq_region_id = sr.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + if ($attrib_value) { + $sql .= " AND ma.value = ?"; + } + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $attrib_type_code, SQL_VARCHAR ); + $sth->bind_param( 2, $self->species_id(), SQL_INTEGER ); + if ($attrib_value) { + $sth->bind_param( 3, $attrib_value, SQL_VARCHAR ); + } + + $sth->execute(); + + my @ids = map { $_->[0] } @{ $sth->fetchall_arrayref() }; + + $sth->finish(); + + # Construct constraints from the list of ids. Split ids into groups + # of 1000 to ensure that the query is not too big. + my @constraints; + while (@ids) { + my @subset = splice( @ids, 0, 1000 ); + if ( @subset == 1 ) { + push @constraints, "mf.misc_feature_id = $subset[0]"; + } else { + my $id_str = join( ',', @subset ); + push @constraints, "mf.misc_feature_id in ($id_str)"; + } + } + + my @results; + foreach my $constraint (@constraints) { + push @results, @{ $self->generic_fetch($constraint) }; + } + + return \@results; +} ## end sub fetch_all_by_attribute_type_value + + +#_tables +# +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the tables to query +# Returntype : string +# Exceptions : none +# Caller : internal + + +sub _tables { + my $self = shift; + + return (['misc_feature', 'mf'], + ['misc_feature_misc_set', 'mfms'], + ['misc_attrib', 'ma'], + ['attrib_type', 'at']); +} + + +#_columns + +# Arg [1] : none +# Example : none +# Description: PROTECTED Implementation of abstract superclass method to +# provide the name of the columns to query +# Returntype : list of strings +# Exceptions : none +# Caller : internal + +sub _columns { + my $self = shift; + + #warning _objs_from_sth implementation depends on ordering + return qw (mf.misc_feature_id + mf.seq_region_id + mf.seq_region_start + mf.seq_region_end + mf.seq_region_strand + ma.value + at.code + mfms.misc_set_id + at.name + at.description); +} + + + +# _default_where_clause + +# Arg [1] : none +# Example : none +# Description: Overrides superclass method to provide an additional +# table joining constraint before the SQL query is performed. +# Returntype : string +# Exceptions : none +# Caller : generic_fetch + +sub _default_where_clause { + my $self = shift; + + return ''; +} + + +sub _left_join { + my $self = shift; + + return( + ['misc_feature_misc_set','mf.misc_feature_id = mfms.misc_feature_id'], + ['misc_attrib', 'mf.misc_feature_id = ma.misc_feature_id'], + ['attrib_type','ma.attrib_type_id = at.attrib_type_id']); +} + + +sub _final_clause { + my $self = shift; + + return " ORDER BY mf.misc_feature_id"; +} + + +# _objs_from_sth + +# Arg [1] : StatementHandle $sth +# Example : none +# Description: PROTECTED implementation of abstract superclass method. +# responsible for the creation of MiscFeatures from a +# hashref generated from an SQL query +# Returntype : listref of Bio::EnsEMBL::MiscFeatures +# Exceptions : none +# Caller : internal + +sub _objs_from_sth { + my ($self, $sth, $mapper, $dest_slice) = @_; + + # + # This code is ugly because an attempt has been made to remove as many + # function calls as possible for speed purposes. Thus many caches and + # a fair bit of gymnastics is used. + # + + my $sa = $self->db()->get_SliceAdaptor(); + my $msa = $self->db->get_MiscSetAdaptor(); + + my @features; + my %ms_hash; + my %slice_hash; + my %sr_name_hash; + my %sr_cs_hash; + + my($misc_feature_id, $seq_region_id, $seq_region_start, $seq_region_end, + $seq_region_strand, $attrib_value, $attrib_type_code, $misc_set_id, + $attrib_type_name, $attrib_type_description ); + + $sth->bind_columns( \$misc_feature_id, \$seq_region_id, \$seq_region_start, + \$seq_region_end, \$seq_region_strand, + \$attrib_value, \$attrib_type_code,\$misc_set_id, + \$attrib_type_name, \$attrib_type_description ); + + my $asm_cs; + my $cmp_cs; + my $asm_cs_vers; + my $asm_cs_name; + my $cmp_cs_vers; + my $cmp_cs_name; + if($mapper) { + $asm_cs = $mapper->assembled_CoordSystem(); + $cmp_cs = $mapper->component_CoordSystem(); + $asm_cs_name = $asm_cs->name(); + $asm_cs_vers = $asm_cs->version(); + $cmp_cs_name = $cmp_cs->name(); + $cmp_cs_vers = $cmp_cs->version(); + } + + my $dest_slice_start; + my $dest_slice_end; + my $dest_slice_strand; + my $dest_slice_length; + my $dest_slice_sr_name; + my $dest_slice_sr_id; + if($dest_slice) { + $dest_slice_start = $dest_slice->start(); + $dest_slice_end = $dest_slice->end(); + $dest_slice_strand = $dest_slice->strand(); + $dest_slice_length = $dest_slice->length(); + $dest_slice_sr_name = $dest_slice->seq_region_name(); + $dest_slice_sr_id = $dest_slice->get_seq_region_id(); + } + + my $current = -1; + my $throw_away = -1; + my $feat; + my $feat_misc_sets; + my $feat_attribs; + my $seen_attribs; + + FEATURE: while($sth->fetch()) { + #if this feature is not being used, skip all rows related to it + next if($throw_away == $misc_feature_id); + + if ($current == $misc_feature_id) { + #still working on building up attributes and sets for current feature + + #if there is a misc_set, add it to the current feature + if ($misc_set_id) { + my $misc_set = $ms_hash{$misc_set_id} ||= + $msa->fetch_by_dbID($misc_set_id); + if ( ! exists $feat_misc_sets->{$misc_set->{'code'}} ) { + $feat->add_MiscSet( $misc_set ); + $feat_misc_sets->{$misc_set->{'code'}} = $misc_set; + } + } + + #if there is a new attribute add it to the current feature + if ($attrib_value && $attrib_type_code && + !$seen_attribs->{"$attrib_type_code:$attrib_value"}) { + my $attrib = Bio::EnsEMBL::Attribute->new + ( -CODE => $attrib_type_code, + -NAME => $attrib_type_name, + -DESC => $attrib_type_description, + -VALUE => $attrib_value + ); + + + $feat_attribs ||= []; + push @$feat_attribs, $attrib; + $seen_attribs->{"$attrib_type_code:$attrib_value"} = 1; + } + + } else { + if ($feat) { + #start working on a new feature, discard references to last one + $feat = {}; + $feat_attribs = []; + $feat_misc_sets = {}; + $seen_attribs = {}; + } + + $current = $misc_feature_id; + #need to get the internal_seq_region, if present + $seq_region_id = $self->get_seq_region_id_internal($seq_region_id); + my $slice = $slice_hash{"ID:".$seq_region_id}; + + if (!$slice) { + $slice = $sa->fetch_by_seq_region_id($seq_region_id); + $slice_hash{"ID:".$seq_region_id} = $slice; + $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); + $sr_cs_hash{$seq_region_id} = $slice->coord_system(); + } + + my $sr_name = $sr_name_hash{$seq_region_id}; + my $sr_cs = $sr_cs_hash{$seq_region_id}; + # + # remap the feature coordinates to another coord system + # if a mapper was provided + # + if ($mapper) { + + + if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) { + ( $seq_region_id, $seq_region_start,$seq_region_end, $seq_region_strand ) = + $mapper->map( $sr_name, $seq_region_start, $seq_region_end, + $seq_region_strand, $sr_cs, 1, $dest_slice); + + } else { + + ( $seq_region_id, $seq_region_start,$seq_region_end, $seq_region_strand ) = + $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end,$seq_region_strand, $sr_cs ); + } + + #skip features that map to gaps or coord system boundaries + if(!defined($seq_region_id)) { + $throw_away = $misc_feature_id; + next FEATURE; + } + + #get a slice in the coord system we just mapped to +# if ($asm_cs == $sr_cs || +# ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { + $slice = $slice_hash{"ID:".$seq_region_id} ||= + $sa->fetch_by_seq_region_id($seq_region_id); +# } else { +# $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= +# $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, +# $asm_cs_vers); +# } + } + + # + # If a destination slice was provided convert the coords + # If the dest_slice starts at 1 and is foward strand, nothing needs doing + # + if ($dest_slice) { + if ($dest_slice_start != 1 || $dest_slice_strand != 1) { + if ($dest_slice_strand == 1) { + $seq_region_start = $seq_region_start - $dest_slice_start + 1; + $seq_region_end = $seq_region_end - $dest_slice_start + 1; + } else { + my $tmp_seq_region_start = $seq_region_start; + $seq_region_start = $dest_slice_end - $seq_region_end + 1; + $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; + $seq_region_strand *= -1; + } + } + #throw away features off the end of the requested slice + if ($seq_region_end < 1 || $seq_region_start > $dest_slice_length || + ( $dest_slice_sr_id ne $seq_region_id )) { + #flag this feature as one to throw away + $throw_away = $misc_feature_id; + next FEATURE; + } + $slice = $dest_slice; + } + + + if ($attrib_value && $attrib_type_code) { + my $attrib = Bio::EnsEMBL::Attribute->new + ( -CODE => $attrib_type_code, + -NAME => $attrib_type_name, + -DESC => $attrib_type_description, + -VALUE => $attrib_value + ); + $feat_attribs = [$attrib]; + $seen_attribs->{"$attrib_type_code:$attrib_value"} = 1; + } + + $feat = + $self->_create_feature_fast( 'Bio::EnsEMBL::MiscFeature', { + 'start' => $seq_region_start, + 'end' => $seq_region_end, + 'strand' => $seq_region_strand, + 'slice' => $slice, + 'adaptor' => $self, + 'dbID' => $misc_feature_id, + 'attributes' => $feat_attribs + } ); + + push @features, $feat; + + if ($misc_set_id) { + #get the misc_set object + my $misc_set = $ms_hash{$misc_set_id} ||= + $msa->fetch_by_dbID($misc_set_id); + if ( ! exists $feat_misc_sets->{$misc_set->{'code'}} ) { + $feat->add_MiscSet( $misc_set ); + $feat_misc_sets->{$misc_set->{'code'}} = $misc_set; + } + } + } + } + + return \@features; +} + + + +=head2 list_dbIDs + + Arg [1] : none + Example : @feature_ids = @{$misc_feature_adaptor->list_dbIDs()}; + Description: Gets an array of internal ids for all misc_features in the + current db + Arg[1] : <optional> int. not 0 for the ids to be sorted by the seq_region. + Returntype : list of ints + Exceptions : none + Caller : ? + Status : Stable + +=cut + +sub list_dbIDs { + my ($self,$ordered) = @_; + + return $self->_list_dbIDs("misc_feature",undef,$ordered); +} + + +=head2 store + + Arg [1] : list of Bio::EnsEMBL::MiscFeatures @misc_features + Example : $misc_feature_adaptor->store(@misc_features); + Description: Stores a list of MiscFeatures in this database. The stored + features will have their + Returntype : none + Exceptions : throw on invalid arguments + warning if misc feature is already stored in this database + throw if start/end/strand attribs are not valid + Caller : general + Status : Stable + +=cut + +sub store { + my $self = shift; + my @misc_features = @_; + + my $db = $self->db(); + + my $feature_sth = $self->prepare + ("INSERT INTO misc_feature SET " . + " seq_region_id = ?, " . + " seq_region_start = ?, " . + " seq_region_end = ?, " . + " seq_region_strand = ?"); + + my $feature_set_sth = $self->prepare + ("INSERT IGNORE misc_feature_misc_set SET " . + " misc_feature_id = ?, " . + " misc_set_id = ?"); + + my $msa = $db->get_MiscSetAdaptor(); + my $aa = $db->get_AttributeAdaptor(); + + FEATURE: + foreach my $mf (@misc_features) { + if(!ref($mf) || !$mf->isa('Bio::EnsEMBL::MiscFeature')) { + throw("List of MiscFeature arguments expeceted"); + } + + if($mf->is_stored($db)) { + warning("MiscFeature [" .$mf->dbID."] is already stored in database."); + next FEATURE; + } + + # do some checking of the start/end and convert to seq_region coords + my $original = $mf; + my $seq_region_id; + ($mf, $seq_region_id) = $self->_pre_store($mf); + + # store the actual MiscFeature + $feature_sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $feature_sth->bind_param(2,$mf->start,SQL_INTEGER); + $feature_sth->bind_param(3,$mf->end,SQL_INTEGER); + $feature_sth->bind_param(4,$mf->strand,SQL_TINYINT); + $feature_sth->execute(); + + my $dbID = $feature_sth->{'mysql_insertid'}; + + $mf->dbID($dbID); + $mf->adaptor($self); + + # store all the attributes + my $attribs = $mf->get_all_Attributes(); + $aa->store_on_MiscFeature($mf, $attribs); + + # store all the sets that have not been stored yet + my $sets = $mf->get_all_MiscSets(); + foreach my $set (@$sets) { + $msa->store($set) if(!$set->is_stored($db)); + + # update the misc_feat_misc_set table to store the set relationship + $feature_set_sth->bind_param(1,$dbID,SQL_INTEGER); + $feature_set_sth->bind_param(2,$set->dbID,SQL_INTEGER); + + $feature_set_sth->execute(); + } + } + + return; +} + +1; + + + + +