0
|
1 package Bio::EnsEMBL::Variation::Pipeline::InitVariationClass;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 use base qw(Bio::EnsEMBL::Variation::Pipeline::BaseVariationProcess);
|
|
7
|
|
8 use POSIX qw(ceil);
|
|
9
|
|
10 my $DEBUG = 0;
|
|
11
|
|
# Prepare a variation-class run and work out the per-job variation_id chunks.
#
# Steps, in order:
#   1. Validate the 'num_chunks' parameter.
#   2. Set class_attrib_id to the attrib_id of the SO term
#      'sequence_alteration' for every variation whose source is not
#      'HGMD-PUBLIC'.
#   3. Drop and re-create two temp tables with the structure of `variation`
#      and `variation_feature`, and disable their keys.
#   4. Read every variation_id in ascending order and cut the list into
#      consecutive ranges of ceil(total / num_chunks) ids each.
#
# Parameters read:
#   num_chunks - how many chunks to aim for; must be a number > 0.
#                (required_param presumably rejects a missing value itself;
#                the check below also covers undef - confirm against the
#                base class.)
#
# Parameters set (read back by write_output):
#   chunk_output_ids - arrayref with one hashref per chunk, holding
#                      variation_id_start, variation_id_stop and the two
#                      temp table names. Empty when `variation` has no rows.
#   finish_var_class - one-element arrayref holding the two temp table names.
#
# Dies when num_chunks is not greater than zero, or when there is no
# attrib_id for 'sequence_alteration'.
sub fetch_input {
    my $self = shift;

    my $num_chunks = $self->required_param('num_chunks');

    # Reject unusable chunk counts before touching the database: 0 would hit
    # a bare "Illegal division by zero" further down, and a negative count
    # would yield a negative chunk size that can never consume the id list.
    my $num_chunks_is_valid = do {
        no warnings 'numeric';    # a non-numeric string should die, not just warn
        defined $num_chunks && $num_chunks > 0;
    };
    die "num_chunks must be a number greater than zero, got "
        . (defined $num_chunks ? "'$num_chunks'" : 'undef')
        unless $num_chunks_is_valid;

    my $var_dba = $self->get_species_adaptor('variation');
    my $aa      = $var_dba->get_AttributeAdaptor;
    my $dbc     = $var_dba->dbc();

    # First set everything in variation (except HGMDs) to
    # 'sequence_alteration' by default, because sometimes we miss them when
    # there is no variation_feature or any alleles (though this should become
    # unnecessary as we move to the new approach to failing for all species).
    my $default_attrib_id = $aa->attrib_id_for_type_value('SO_term', 'sequence_alteration');

    die "No attrib_id for 'sequence_alteration'" unless defined $default_attrib_id;

    $dbc->do(qq{
        UPDATE  variation v, source s
        SET     v.class_attrib_id = $default_attrib_id
        WHERE   v.source_id = s.source_id
        AND     s.name != 'HGMD-PUBLIC'
    });

    # Now create some temp tables to store the class attribs.
    my $temp_var_table      = 'temp_variation_class';
    my $temp_var_feat_table = 'temp_variation_feature_class';

    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_table});
    $dbc->do(qq{DROP TABLE IF EXISTS $temp_var_feat_table});

    $dbc->do(qq{CREATE TABLE $temp_var_table LIKE variation});
    $dbc->do(qq{CREATE TABLE $temp_var_feat_table LIKE variation_feature});

    # NOTE(review): keys are presumably disabled to speed up bulk loading by
    # the downstream jobs - confirm against the code that fills these tables.
    $dbc->do(qq{ALTER TABLE $temp_var_table DISABLE KEYS});
    $dbc->do(qq{ALTER TABLE $temp_var_feat_table DISABLE KEYS});

    # Now get an ordered list of all the variation_ids.
    my $get_var_ids_sth = $dbc->prepare(qq{
        SELECT variation_id FROM variation ORDER BY variation_id
    });

    $get_var_ids_sth->execute;

    my @var_ids;

    while (my ($var_id) = $get_var_ids_sth->fetchrow_array) {
        push @var_ids, $var_id;
    }

    # And split them up into as many chunks as requested. Each chunk is
    # described by the first and last variation_id it contains, so gaps in
    # the id sequence are harmless.
    my $num_vars   = scalar @var_ids;
    my $chunk_size = ceil($num_vars / $num_chunks);

    my @output_ids;

    # With no variations both $num_vars and $chunk_size are 0 and the loop
    # body never runs; otherwise $chunk_size is at least 1.
    for (my $first = 0; $first < $num_vars; $first += $chunk_size) {

        # The final chunk may be shorter than $chunk_size.
        my $last = $first + $chunk_size - 1;
        $last = $num_vars - 1 if $last > $num_vars - 1;

        push @output_ids, {
            variation_id_start  => $var_ids[$first],
            variation_id_stop   => $var_ids[$last],
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        };
    }

    $self->param('chunk_output_ids', \@output_ids);

    $self->param(
        'finish_var_class', [{
            temp_var_table      => $temp_var_table,
            temp_var_feat_table => $temp_var_feat_table,
        }]
    );
}
|
|
100
|
|
# Emit the job descriptions stored by fetch_input.
#
# Flows the 'finish_var_class' parameter on branch 1 and then the
# 'chunk_output_ids' parameter on branch 2, in that order. Returns whatever
# the second dataflow_output_id call returns.
sub write_output {
    my ($self) = @_;

    # Branch 1: the single entry naming the two temp tables.
    $self->dataflow_output_id(
        $self->param('finish_var_class'),
        1,
    );

    # Branch 2: one entry per variation_id chunk.
    return $self->dataflow_output_id(
        $self->param('chunk_output_ids'),
        2,
    );
}
|
|
107
|
|
108 1;
|
|
109
|