view variant_effect_predictor/Bio/EnsEMBL/Variation/Pipeline/InitVariationClass.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

package Bio::EnsEMBL::Variation::Pipeline::InitVariationClass;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);

use POSIX qw(ceil);

# File-scoped debug switch, off by default. No code visible in this file
# reads it.
my $DEBUG = 0;

# Prepares the variation class calculation:
#   1. resets class_attrib_id to the 'sequence_alteration' attrib for every
#      variation whose source is not 'HGMD-PUBLIC';
#   2. (re)creates the two temporary class tables;
#   3. partitions the ordered variation_ids into contiguous ranges.
#
# Reads the required parameter 'num_chunks', which must be a positive number.
#
# Stores two parameters for write_output:
#   chunk_output_ids - arrayref of hashes, one per variation_id range, with
#                      the keys variation_id_start, variation_id_stop,
#                      temp_var_table and temp_var_feat_table
#   finish_var_class - one-element arrayref holding a hash with the two
#                      temporary table names
#
# Dies if 'num_chunks' is not positive, or if there is no attrib_id for the
# SO term 'sequence_alteration'.
sub fetch_input {

    my $self = shift;

    my $num_chunks = $self->required_param('num_chunks');

    # validate before touching the database: zero would only fail at the
    # division further down (after the tables have already been modified),
    # and a negative value can yield a chunk size of 0, which never makes
    # progress through the list of ids

    unless (defined $num_chunks && $num_chunks > 0) {
        my $shown = defined $num_chunks ? "'$num_chunks'" : 'undef';
        die "Parameter 'num_chunks' must be a positive number (got $shown)";
    }

    my $var_dba = $self->get_species_adaptor('variation');

    my $aa = $var_dba->get_AttributeAdaptor;

    my $dbc = $var_dba->dbc();

    # first set everything in variation (except HGMDs) to 'sequence_alteration'
    # by default, because a variation is sometimes missed when it has no
    # variation_feature or no alleles (though this should become unnecessary
    # as we move to the new approach to failing for all species)

    my $default_attrib_id = $aa->attrib_id_for_type_value('SO_term', 'sequence_alteration');

    die "No attrib_id for 'sequence_alteration'" unless defined $default_attrib_id;

    $dbc->do(qq{
        UPDATE  variation v, source s
        SET     v.class_attrib_id = $default_attrib_id
        WHERE   v.source_id = s.source_id
        AND     s.name != 'HGMD-PUBLIC'
    });

    # now create some temp tables to store the class attribs; they copy the
    # structure of variation and variation_feature and start with keys disabled

    my $temp_var_table = 'temp_variation_class';
    my $temp_var_feat_table = 'temp_variation_feature_class';

    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_table});
    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_feat_table});

    $dbc->do(qq{CREATE TABLE $temp_var_table LIKE variation});
    $dbc->do(qq{CREATE TABLE $temp_var_feat_table LIKE variation_feature});

    $dbc->do(qq{ALTER TABLE $temp_var_table DISABLE KEYS});
    $dbc->do(qq{ALTER TABLE $temp_var_feat_table DISABLE KEYS});

    # now get an ordered list of all the variation_ids

    my $get_var_ids_sth = $dbc->prepare(qq{
        SELECT variation_id FROM variation ORDER BY variation_id
    });

    $get_var_ids_sth->execute;

    my @var_ids;

    while (my ($var_id) = $get_var_ids_sth->fetchrow_array) {
        push @var_ids, $var_id;
    }

    # and split them up into ranges of at most $chunk_size ids each; the ids
    # need not be consecutive, so each range is described by the first and
    # last id that actually fall inside it

    my $num_vars = scalar @var_ids;

    my $chunk_size = ceil($num_vars / $num_chunks);

    my @output_ids;

    for (my $first = 0; $first < $num_vars; $first += $chunk_size) {

        # the final range may be shorter than $chunk_size
        my $last = $first + $chunk_size - 1;
        $last = $num_vars - 1 if $last > $num_vars - 1;

        push @output_ids, {
            variation_id_start  => $var_ids[$first],
            variation_id_stop   => $var_ids[$last],
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        };
    }

    $self->param('chunk_output_ids', \@output_ids);

    $self->param(
        'finish_var_class', [{
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        }]
    );
}

# Flows the job descriptions prepared by fetch_input: the single
# 'finish_var_class' entry goes out first with branch code 1, then the
# per-range 'chunk_output_ids' entries with branch code 2.
sub write_output {
    my ($self) = @_;

    my ($finish_branch, $chunk_branch) = (1, 2);

    $self->dataflow_output_id(
        $self->param('finish_var_class'),
        $finish_branch,
    );

    return $self->dataflow_output_id(
        $self->param('chunk_output_ids'),
        $chunk_branch,
    );
}

1;