view variant_effect_predictor/Bio/EnsEMBL/Pipeline/SpeciesFactory.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=pod

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Pipeline::SpeciesFactory

=head1 DESCRIPTION

A module which generates dump jobs for each species it finds in the Ensembl
Registry. The species we run the code on can be controlled by specifying
the I<species> parameter or by reducing the number of DBAdaptors loaded into
the registry. 

Allowed parameters are:

=over 8

=item species - Can be an array of species to perform dumps for or a single
                species name. If specified only jobs will be created for
                those species. Defaults to nothing so all species are processed

=item db_types - Specify the types of database to dump. Defaults to core and
                should be an array.

=back

The code flows once per species to branch 2.

=cut

package Bio::EnsEMBL::Pipeline::SpeciesFactory;

use strict;
use warnings;

use base qw/Bio::EnsEMBL::Pipeline::Base/;

use Bio::EnsEMBL::Registry;

sub param_defaults {
  my ($self) = @_;
  return {
    db_types => [qw/core/],
    species => []
  };
}

sub fetch_input {
  my ($self) = @_;
  
  $self->reset_empty_array_param('db_types');
  
  my $core_dbas = $self->get_DBAdaptors();
  $self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas}));
  $self->param('dbas', $core_dbas);
  
  my %species_lookup = 
    map { $_ => 1 } 
    map { Bio::EnsEMBL::Registry->get_alias($_)  } 
    @{$self->param('species')};
  $self->param('species_lookup', \%species_lookup);
  
  return;
}
  
sub run {
  my ($self) = @_;
  my @dna;
  my @genes;
  my @species;
  foreach my $dba (@{$self->param('dbas')}) {
    if(!$self->process_dba($dba)) {
      $self->fine('Skipping %s', $dba->species());
      next;
    }
    my $input_id = $self->input_id($dba);
    push(@species, [ $input_id, 2 ]);
  }
  $self->param('species', \@species);
  return;
}

sub write_output {
  my ($self) = @_;
  $self->do_flow('species');
  return;
}

sub get_DBAdaptors {
  my ($self) = @_;
  return Bio::EnsEMBL::Registry->get_all_DBAdaptors(-GROUP => 'core');
}

sub do_flow {
  my ($self, $key) = @_;
  my $targets = $self->param($key);
  foreach my $entry (@{$targets}) {
    my ($input_id, $flow) = @{$entry};
    $self->fine('Flowing %s to %d for %s', $input_id->{species}, $flow, $key);
    $self->dataflow_output_id($input_id, $flow);
  }
  return;
}

sub process_dba {
  my ($self, $dba) = @_;
  
  #Reject if DB was ancestral sequences
  return 0 if $dba->species() =~ /ancestral/i;
  
  #If species is defined then make sure we only allow those species through
  if(@{$self->param('species')}) {
    my $lookup = $self->param('species_lookup');
    my $name = $dba->species();
    my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name);
    push(@{$aliases}, $name);
    my $found = 0;
    foreach my $alias (@{$aliases}) {
      if($lookup->{$alias}) {
        $found = 1;
        last;
      }
    }
    return $found;
  }
  
  #Otherwise just accept
  return 1;
}

sub input_id {
  my ($self, $dba, $type) = @_;
  my $mc = $dba->get_MetaContainer();
  my $input_id = {
    db_types => $self->db_types($dba),
    species => $mc->get_production_name(),
  };
  return $input_id;
}

sub db_types {
  my ($self, $dba) = @_;
  return $self->param('db_types');
}

1;