Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitVariationClass.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
package Bio::EnsEMBL::Variation::Pipeline::InitVariationClass;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);

use POSIX qw(ceil);
use Scalar::Util qw(looks_like_number);

# NOTE(review): not referenced anywhere in this file; kept as-is.
my $DEBUG = 0;

# fetch_input
#
# Prepares the variation-class computation:
#   1. resets class_attrib_id to the 'sequence_alteration' attrib for every
#      variation whose source is not 'HGMD-PUBLIC';
#   2. (re)creates two empty temp tables shaped like variation and
#      variation_feature, with keys disabled;
#   3. splits the ordered list of variation_ids into contiguous ranges and
#      stores one output id per range in the 'chunk_output_ids' param, plus a
#      single 'finish_var_class' output id naming the temp tables.
#
# Params read:  num_chunks (must be a positive number).
# Dies if num_chunks is not a positive number, or if no attrib_id exists for
# the SO term 'sequence_alteration'.
sub fetch_input {
    my $self = shift;

    my $num_chunks = $self->required_param('num_chunks');

    # Validate before touching the database: 0 or a non-numeric value would
    # otherwise die with a division by zero *after* the UPDATE below has run,
    # and a negative value would make the chunking loop spin forever.
    unless (defined $num_chunks
        && looks_like_number($num_chunks)
        && $num_chunks > 0)
    {
        my $shown = defined $num_chunks ? $num_chunks : 'undef';
        die "num_chunks must be a positive number, got '$shown'";
    }

    my $var_dba = $self->get_species_adaptor('variation');

    my $aa = $var_dba->get_AttributeAdaptor;

    my $dbc = $var_dba->dbc();

    # first set everything in variation (except HGMDs) to 'sequence_alteration'
    # by default because sometimes we miss them because there is no
    # variation_feature or any alleles (though this should become unnecessary
    # as we move to the new approach to failing for all species)

    my $default_attrib_id = $aa->attrib_id_for_type_value('SO_term', 'sequence_alteration');

    die "No attrib_id for 'sequence_alteration'" unless defined $default_attrib_id;

    $dbc->do(qq{
        UPDATE  variation v, source s
        SET     v.class_attrib_id = $default_attrib_id
        WHERE   v.source_id = s.source_id
        AND     s.name != 'HGMD-PUBLIC'
    });

    # now create some temp tables to store the class attribs

    my $temp_var_table      = 'temp_variation_class';
    my $temp_var_feat_table = 'temp_variation_feature_class';

    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_table});
    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_feat_table});

    $dbc->do(qq{CREATE TABLE $temp_var_table LIKE variation});
    $dbc->do(qq{CREATE TABLE $temp_var_feat_table LIKE variation_feature});

    $dbc->do(qq{ALTER TABLE $temp_var_table DISABLE KEYS});
    $dbc->do(qq{ALTER TABLE $temp_var_feat_table DISABLE KEYS});

    # now get an ordered list of all the variation_ids

    my $get_var_ids_sth = $dbc->prepare(qq{
        SELECT variation_id FROM variation ORDER BY variation_id
    });

    $get_var_ids_sth->execute;

    my @var_ids;

    while (my ($var_id) = $get_var_ids_sth->fetchrow_array) {
        push @var_ids, $var_id;
    }

    # and split them up into as many chunks as requested

    my @output_ids;

    for my $range (_chunk_id_ranges(\@var_ids, $num_chunks)) {
        my ($start, $stop) = @{$range};

        push @output_ids, {
            variation_id_start  => $start,
            variation_id_stop   => $stop,
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        };
    }

    $self->param('chunk_output_ids', \@output_ids);

    $self->param(
        'finish_var_class', [{
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        }]
    );
}

# _chunk_id_ranges(\@sorted_ids, $num_chunks)
#
# Pure helper: partitions an already-ordered list of ids into consecutive
# runs of ceil(N / $num_chunks) elements and returns one [first_id, last_id]
# pair per run (the final run may be shorter). Returns an empty list when
# there are no ids. The input array is not modified.
sub _chunk_id_ranges {
    my ($var_ids, $num_chunks) = @_;

    my $num_vars = scalar @{$var_ids};
    return unless $num_vars;

    my $chunk_size = ceil($num_vars / $num_chunks);

    # Guard against a degenerate size (e.g. an infinite num_chunks yields 0),
    # which would otherwise never advance through the list.
    $chunk_size = 1 if $chunk_size < 1;

    my $last_index = $num_vars - 1;
    my @ranges;

    for (my $first = 0; $first <= $last_index; $first += $chunk_size) {
        my $last = $first + $chunk_size - 1;
        $last = $last_index if $last > $last_index;

        push @ranges, [ $var_ids->[$first], $var_ids->[$last] ];
    }

    return @ranges;
}

# write_output
#
# Flows the output ids prepared by fetch_input: the single 'finish_var_class'
# id on branch 1 and the per-chunk ids on branch 2.
sub write_output {
    my $self = shift;

    $self->dataflow_output_id($self->param('finish_var_class'), 1);
    $self->dataflow_output_id($self->param('chunk_output_ids'), 2);
}

1;