Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,236 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer
+
+=head1 DESCRIPTION
+
+Creates 2bit file of the given GZipped file. The resulting index
+is created under the parameter location I<base_path> in blat/index. The filename
+is prefixed with the port number of the blat server this file should be
+run on.
+
+The module also performs filtering of non-reference sequence regions
+and can filter the redundant Y chromosome piece for human (as 2bit does
+not like repeated sequence region names).
+
+Allowed parameters are:
+
+=over 8
+
+=item file - The file to index
+
+=item program - The location of the faToTwoBit program
+
+=item port_offset - Value to add onto the species_id from the website DB
+                    to name the file correctly
+
+=item base_path - The base of the dumps
+
+=item index     - The type of file to index; supported values are empty, 
+                  I<dna>, I<dna_sm> or I<dna_rm>. If specified we will look for this
+                  string in the filename surrounded by '.' e.g. .dna.
+
+=back
+
+The registry should also have a DBAdaptor for the website schema 
+registered under the species B<multi> and the group B<web> for species_id to
+Blat port number. 
+
+=cut
+
+
+package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
+
+use File::Spec;
+use File::stat;
+use Bio::EnsEMBL::Utils::IO qw/work_with_file/;
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use Bio::EnsEMBL::Registry;
+
+sub param_defaults {
+  my ($self) = @_;
+  return {
+    program => 'faToTwoBit',
+    port_offset => 30000,
+    'index' => 'dna', #or dna_rm and dna_sm
+  };
+}
+
+sub fetch_input {
+  my ($self) = @_;
+  $self->assert_executable($self->param('program'));
+  $self->assert_executable('zcat');
+  $self->assert_executable('gunzip');
+  return;
+}
+
+sub run {
+  my ($self) = @_;
+  if($self->run_indexing()) {
+    $self->SUPER::run();
+  }
+  return;
+}
+
+sub run_indexing {
+  my ($self) = @_;
+  my $index = $self->param('index');
+  if($index) {
+    my $file = $self->param('file');
+    return (index($file, ".${index}.") > -1) ? 1 : 0;
+  }
+  return 1;
+}
+
+sub index_file {
+  my ($self, $file) = @_;
+  
+  my $target_file = $self->target_file();
+  my $cmd = sprintf(q{%s %s %s}, 
+    $self->param('program'), $file, $target_file);
+  
+  $self->info('About to run "%s"', $cmd);
+  my $output = `$cmd 2>&1`;
+  my $rc = $? >> 8;
+  throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
+  unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
+  
+  #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit
+  my $filesize = stat($target_file)->size();
+  if($filesize <= 16) {
+    unlink $file;
+    my $msg = sprintf(
+      'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file',
+      $file, $filesize
+    );
+    $self->throw($msg);
+  }
+  
+  return;
+}
+
+sub decompress {
+  my ($self) = @_;
+  
+  #If we have no non-reference seq regions then use normal decompress
+  if(! $self->has_non_refs()) {
+    return $self->SUPER::decompress();
+  }
+  
+  #Filter for non-refs
+  my $source = $self->param('file');
+  my $target_dir = $self->target_dir();
+  my ($vol, $dir, $file) = File::Spec->splitpath($source);
+  $file =~ s/.gz$//;
+  my $target = File::Spec->catdir($target_dir, $file);
+  
+  my $allowed_regions = $self->allowed_regions();
+  
+  $self->info('Decompressing %s to %s', $source, $target);
+  
+  open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
+  work_with_file($target, 'w', sub {
+    my ($trg_fh) = @_;
+    my $transfer = 0;
+    while(my $line = <$src_fh>) {
+      #HEADER
+      if(index($line, '>') == 0) {
+        #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
+        my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; 
+        $transfer = ($allowed_regions->{$name}) ? 1 : 0;
+        if($transfer) {
+          $self->info('%s was an allowed Slice', $name);
+        }
+        else {
+          $self->info('%s will be skipped; not a reference Slice', $name);
+        }
+      }
+      print $trg_fh $line if $transfer;
+    }
+  });
+  close($src_fh);
+
+  return $target;
+}
+
+sub allowed_regions {
+  my ($self) = @_;
+  my $filter_human = 1;
+  my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
+  my %hash = map { $_->name() => 1 } @slices;
+  return \%hash;
+}
+
+#Filename like 30061.Homo_sapiens.GRCh37.2bit
+sub target_filename {
+  my ($self) = @_;
+  my $port = $self->blat_port();
+  my $name = $self->web_name();
+  my $assembly = $self->assembly();
+  return join(q{.}, $port, $name, $assembly, '2bit');
+}
+
+sub target_file {
+  my ($self) = @_;
+  my $target_dir = $self->target_dir();
+  my $target_filename = $self->target_filename();
+  return File::Spec->catfile($target_dir, $target_filename);
+  return;
+}
+
+sub target_dir {
+  my ($self) = @_;
+  return $self->get_dir('blat', $self->param('index'));
+}
+
+sub blat_port {
+  my ($self) = @_;
+  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web');
+  my $id = $dba->dbc()->sql_helper()->execute_single_result(
+    -SQL => 'select species_id from species where name =?',
+    -PARAMS => [$self->web_name()]
+  );
+  return $id + $self->param('port_offset');
+}
+
+sub has_non_refs {
+  my ($self) = @_;
+  my $sql = <<'SQL';
+select count(*)
+from attrib_type at
+join seq_region_attrib sra using (attrib_type_id)
+join seq_region sr using (seq_region_id)
+join coord_system cs using (coord_system_id)
+where cs.species_id =?
+and at.code =?
+SQL
+  my $dba = $self->get_DBAdaptor();
+  return $dba->dbc()->sql_helper()->execute_single_result(
+    -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children