Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,160 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer + +=head1 DESCRIPTION + +Creates WUBlast indexes of the given GZipped file. The resulting index +is created under the parameter location I<base_path> in blast and then in a +directory defined by the type of dump. The type of dump also changes the file +name generated. Genomic dumps have their release number replaced with the +last repeat masked date. + +Allowed parameters are: + +=over 8 + +=item file - The file to index + +=item program - The location of the xdformat program + +=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed + +=item type - Type of index we are creating. I<genomic> and I<genes> are allowed + +=item base_path - The base of the dumps + +=item release - Required for correct DB naming + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; + +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use File::Copy qw/copy/; +use File::Spec; +use POSIX qw/strftime/; + +sub param_defaults { + my ($self) = @_; + return { + program => 'xdformat', +# molecule => 'pep', #pep or dna +# type => 'genes' #genes or genomic + }; +} + +sub fetch_input { + my ($self) = @_; + my $mol = $self->param('molecule'); + if($mol ne 'dna' && $mol ne 'pep') { + throw "param 'molecule' must be set to 'dna' or 'pep'"; + } + my $type = $self->param('type'); + if($type ne 'genomic' && $type ne 'genes') { + throw "param 'type' must be set to 'genomic' or 'genes'"; + } + $self->assert_executable($self->param('program')); + $self->assert_executable('gunzip'); +} + +sub write_output { + my ($self) = @_; + $self->dataflow_output_id({ + species => $self->param('species'), + type => $self->param('type'), + molecule => $self->param('molecule'), + index_base => $self->param('index_base') + }, 1); + return; +} + +sub index_file { + my ($self, $file) = @_; + my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ; + my $silence = ($self->debug()) ? 0 : 1; + my $target_dir = $self->target_dir(); + my $target_file = $self->target_file($file); + my $db_title = $self->db_title($file); + my $date = $self->db_date(); + + my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, + $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file); + + $self->info('About to run "%s"', $cmd); + my $output = `$cmd 2>&1`; + my $rc = $? >> 8; + throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; + unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; + $self->param('index_base', $target_file); + return; +} + +sub target_file { + my ($self, $file) = @_; + my $target_dir = $self->target_dir(); + my $target_filename = $self->target_filename($file); + return File::Spec->catfile($target_dir, $target_filename); +} + +# Produce a dir like /nfs/path/to/blast/genes/XXX && /nfs/path/to/blast/dna/XXX +sub target_dir { + my ($self) = @_; + return $self->get_dir('blast', $self->param('type')); +} + +sub db_title { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + my $release = $self->param('release'); + my $title = $file; + $title =~ s/$release\.//; + return $title; +} + +sub db_date { + my ($self) = @_; + return strftime('%d-%m-%Y', gmtime()); +} + +#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa +#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa +sub target_filename { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + if($self->param('type') eq 'genomic') { + my @split = split(/\./, $file); + my $rm_date = $self->repeat_mask_date(); + $split[-4] = $rm_date; + return join(q{.}, @split); + } + return $file; +} + +1;