view variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ConcatFiles.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=pod

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles

=head1 DESCRIPTION

Performs a find in the DNA dumps directory for the given species and then
concatenates the files which match a specified name pattern. We only allow
three types of concats; DNA, RM DNA and SM DNA. The concat file is built by
appending all the other Gzipped FASTA dumps one after another (allowed under
the GZip specification).

Allowed parameters are:

=over 8

=item release - Needed to build the target path

=item species - Required to indicate which species we are working with

=item data_type - The type of data to work with. Can be I<dna>, I<dna_sm> or I<dna_rm>

=item base_path - The base of the dumps

=back

=cut

package Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles;

use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;

use File::Spec;
use File::stat;

# Default parameters for this runnable.
#
# Returns a hash reference with one entry per supported data_type (dna,
# dna_rm and dna_sm). Each entry is itself a hash reference holding the
# compiled pattern, under the key 'regex', used to recognise the gzipped
# FASTA dumps of that type.
sub param_defaults {
  my ($self) = @_;

  my %regex_for = (
    dna    => qr/.+\.dna\..+\.fa\.gz$/,
    dna_rm => qr/.+\.dna_rm\..+\.fa\.gz$/,
    dna_sm => qr/.+\.dna_sm\..+\.fa\.gz$/,
  );

  # Wrap every pattern in its own hash so callers can look up ->{regex}.
  my %defaults;
  foreach my $type (keys %regex_for) {
    $defaults{$type} = { regex => $regex_for{$type} };
  }

  return \%defaults;
}

# Checks that every mandatory parameter has been given a true value and
# throws, naming the offending parameter, as soon as one has not.
sub fetch_input {
  my ($self) = @_;

  my @required = qw/data_type species release base_path/;
  for my $name (@required) {
    next if $self->param($name);
    $self->throw("Cannot find the required parameter $name");
  }

  return;
}

# Sticks the ends of the matching dump files together into one big file.
#
# The files returned by get_dna_files() are appended, in order, to the path
# returned by target_file(). Any pre-existing target is removed first. The
# bytes are copied in-process rather than through a shell "cat" so that
# paths containing whitespace or shell metacharacters are handled safely.
#
# Once everything has been written the size of the target is compared with
# the summed sizes of the inputs; a mismatch is treated as a failure. On
# success the target path is stored in the 'target_file' parameter.
#
# Throws if there are no files to concat, if the old target cannot be
# removed, if any file cannot be stat-ed, opened, read or written, or if
# the final size check fails. Returns nothing.
sub run {
  my ($self) = @_;

  my @file_list = @{$self->get_dna_files()};
  my $count = scalar(@file_list);

  if(! $count) {
    $self->throw("Cannot continue as we found no files to concat");
    return;
  }

  my $target_file = $self->target_file();
  $self->info("Concatting type %s with %d file(s) into %s", $self->param('data_type'), $count, $target_file);

  if(-f $target_file) {
    $self->info("Target already exists. Removing");
    unlink $target_file or $self->throw("Could not remove $target_file: $!");
  }

  $self->info('Running concat');

  # Three-argument open: the path is never interpreted by a shell.
  open(my $target_fh, '>>', $target_file)
    or $self->throw("Cannot open $target_file for writing: $!");
  # The inputs are gzipped, so no newline or encoding translation is wanted.
  binmode $target_fh;

  my $chunk_size = 1024 * 1024;
  my $running_total_size = 0;

  foreach my $file (@file_list) {
    $self->fine('Processing %s', $file);

    # File::stat's stat() returns undef on failure; check before using it.
    my $file_stat = stat($file)
      or $self->throw("Cannot stat $file: $!");
    $running_total_size += $file_stat->size;

    open(my $source_fh, '<', $file)
      or $self->throw(sprintf('Cannot concat %s into %s. Could not open for reading: %s', $file, $target_file, $!));
    binmode $source_fh;

    while(1) {
      my $bytes_read = read($source_fh, my $buffer, $chunk_size);
      if(! defined $bytes_read) {
        $self->throw(sprintf('Cannot concat %s into %s. Read failed: %s', $file, $target_file, $!));
      }
      last unless $bytes_read;
      print {$target_fh} $buffer
        or $self->throw(sprintf('Cannot concat %s into %s. Write failed: %s', $file, $target_file, $!));
    }

    close($source_fh)
      or $self->throw("Could not close $file: $!");
  }

  # Buffered write errors only surface at close, so this must be checked.
  close($target_fh)
    or $self->throw("Could not close $target_file: $!");

  $self->info("Catted files together");

  my $target_stat = stat($target_file)
    or $self->throw("Cannot stat $target_file: $!");
  my $catted_size = $target_stat->size;

  if($running_total_size != $catted_size) {
    $self->throw(sprintf('The total size of the files catted together should be %d but was in fact %d. Failing as we expect the catted size to be the same', $running_total_size, $catted_size));
  }

  $self->param('target_file', $target_file);
  return;
}

# Flows the concatenated file, together with the species it belongs to, on
# branch 1. Nothing is flowed when no 'target_file' parameter has been set.
sub write_output {
  my ($self) = @_;

  my $target = $self->param('target_file');
  return unless $target;

  my %output_id = (
    file    => $target,
    species => $self->param('species'),
  );
  $self->dataflow_output_id(\%output_id, 1);

  return;
}

# Finds the dump files to concat for the configured data_type.
#
# The pattern is looked up from the parameter named after the data_type;
# a missing entry is reported through throw. Files are searched for under
# fasta_path('dna') and kept when they match the pattern and do not have
# '.toplevel.' in their name. Returns an array reference of the matching
# files in default (string) sort order.
sub get_dna_files {
  my ($self) = @_;

  my $dna_dir = $self->fasta_path('dna');
  my $type    = $self->param('data_type');
  my $lookup  = $self->param($type);
  $self->throw("We do not have an entry for the data_type $type in our regex lookup hash. Edit this module")
    unless $lookup;

  my $pattern = $lookup->{regex};
  my $is_wanted = sub {
    my ($candidate) = @_;
    return 0 unless $candidate =~ $pattern;
    return 0 if $candidate =~ /\.toplevel\./;
    return 1;
  };

  my @sorted = sort @{ $self->find_files($dna_dir, $is_wanted) };
  return \@sorted;
}


# Builds the full path of the concatenated file inside fasta_path('dna').
#
# The file name is the dot-joined sequence
#   <web name>.<assembly>.<release>.<data_type>.toplevel.fa.gz
# e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz
sub target_file {
  my ($self) = @_;

  my $file_name = join('.',
    $self->web_name(),
    $self->assembly(),
    $self->param('release'),
    $self->param('data_type'),
    qw/toplevel fa gz/,
  );

  my $dir = $self->fasta_path('dna');
  return File::Spec->catfile($dir, $file_name);
}

1;