view variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Utils::EFGUtils

=head1 DESCRIPTION

This module collates a variety of miscellaneous methods.


=head1 SYNOPSIS

  BEGIN
  {
    unshift(@INC,"/path/of/local/src/modules");
  }

  use Utils;

  &Utils::send_mail($to_address, $title, $message);

=cut


# No API/Object based methods in here

###############################################################################

package Bio::EnsEMBL::Funcgen::Utils::EFGUtils;
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(get_date species_name get_month_number species_chr_num 
				open_file median mean run_system_cmd backup_file 
				is_gzipped is_sam is_bed get_file_format strip_param_args 
				generate_slices_from_names strip_param_flags
				get_current_regulatory_input_names add_external_db);

use Bio::EnsEMBL::Utils::Exception qw( throw );
use File::Path qw (mkpath);
use File::Basename qw (dirname);
use strict;
use Time::Local;
use FileHandle;
use Carp;

sub get_date{
	my ($format, $file) = @_;

	my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst);	


	throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file;


	($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ? 
	  localtime((stat($file))[9]) : localtime();

	#print "	($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n";
	
	if((! defined $format && ! defined $file) || $format eq "date"){
		$time = ($year+1900)."-".$mday."-".($mon+1);	
	}
	elsif($format eq "time"){#not working!
		$time = "${hour}:${min}:${sec}";
	}
	elsif($format eq "timedate"){#
	  $time = localtime();
	}
	else{#add mysql formats here, datetime etc...
		croak("get_date does not handle format:\t$format");
	}

	return $time;
}


#migrate this data to defs file!!??
#must contain all E! species and any other species which are used in local DB extractions
#NEED TO ADD FLY!!

sub species_name{
  my($species) = @_;
  my %species_names = (
		       "HOMO_SAPIENS", "human",
		       "MUS_MUSCULUS", "mouse",
		       "RATTUS_NORVEGICUS", "rat",
		       "CANIS_FAMILIARIS", "dog",
		       "PAN_TROGOLODYTES", "chimp",
		       "GALLUS_GALLUS", "chicken",
		       "SACCHAROMYCES_CEREVISIAE", "yeast",
		       "HUMAN",  "HOMO_SAPIENS",
		       "MOUSE", "MUS_MUSCULUS",
		       "RAT","RATTUS_NORVEGICUS",
		       "DOG", "CANIS_FAMILIARIS",
		       "CHIMP", "PAN_TROGOLODYTES",
		       "CHICKEN", "GALLUS_GALLUS",
		       "YEAST", "SACCHAROMYCES_CEREVISIAE",
		      );

  return $species_names{uc($species)};
}

sub get_month_number{
  my($mon) = @_;
  my %month_nos =(
		  "jan", "01",
		  "feb", "02",
		  "mar", "03",
		  "apr", "04",
		  "may", "05",
		  "jun", "06",
		  "jul", "07",
		  "aug", "08",
		  "sep", "09",
		  "oct", "10",
		  "nov", "11",
		  "dec", "12",
		 );
  return $month_nos{lc($mon)};
}


sub species_chr_num{
	my ($species, $val) = @_;

	($species = lc($species)) =~ s/ /_/;

	my %species_chrs = (
						homo_sapiens => {(
										  'x' => 23,
										  'y' => 24,
										  'mt' => 25, 
										 )},
						
						mus_musculus => {(
										  'x'  => 20,
										  'y'  => 21,
										  'mt' => 22,
										   )},
						
						rattus_norvegicus =>  {(
												'x'  => 21,
												'y'  => 22,
												'mt' => 23,
											   )},
					   );

	die("species not defined in chromosome hash") if(! exists $species_chrs{$species});

	return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val;
}

#Sort should always be done in the caller if required

sub median{
  my ($scores, $sort) = shift;

  return undef if (! @$scores);


  my ($median);
  my $count = scalar(@$scores);
  my $index = $count-1;
  #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  #deal with one score fastest
  return  $scores->[0] if ($count == 1);

	 
  if($sort){
	#This is going to sort the reference here, so will affect
	#The array in the caller
	#We need to deref to avoid this
  }
  
  #taken from Statistics::Descriptive
  #remeber we're dealing with size starting with 1 but indices starting at 0
  
  if ($count % 2) { #odd number of scores
    $median = $scores->[($index+1)/2];
  }
  else { #even, get mean of flanks
    $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2;
  }


  return $median;
}


sub mean{
  my $scores = shift;
  
  my $total = 0;

  map $total+= $_, @$scores;
  my $mean = $total/(scalar(@$scores));

  return $mean;

}

#Should really extend this to detect previous file?
#Or do in caller?

sub open_file{
  my ($file, $operator, $file_permissions) = @_;
  
  my $dir_permissions = $file_permissions || 0755;

  $operator ||= '<';

  if($operator !~ /%/){
  $operator = "$operator $file";
  }
  else{
  	#We have some piping to do
  	$operator = sprintf($operator, $file);
  }

  #Get dir here and create if not exists
  my $dir = dirname($file);  
  mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir);
  my $fh = new FileHandle "$operator";


  #This does not catch incorrectly defined named pipes

  if(! defined $fh){
	croak("Failed to open $operator");
  }


  #Have to chmod here as umask will over-ride permissions passed to FileHandle
  if(defined $file_permissions){

	#Catch non-numeric here as chmod still returns true
	if($file_permissions =~ /[^0-9]/){
	  croak("Failed to change $file permissions using:\t$file_permissions");
	}

	#chmod requires a literal octal number e.g. 0775 not '0775'
	#should catch numbers as strings here, but perl makes this very hard to test
	#Can't even system this as if we build the cmd line with an octal it will be converted to a decimal
 	#These is still no way of testing for non-octal number or string
	#eval/sprintf will also not fail if there are non-octal digits i.e. 1999
	
	#eval will treat octal number and string as true octal number
	#else will pass non-octal string/number which we can't catch
	chmod(eval($file_permissions), $file);
  }

  return $fh;
}



################################################################################

=head2 run_system_cmd

 Description : Method to control the execution of the standard system() command

 ReturnType  : none

 Example     : $Helper->debug(2,"dir=$dir file=$file");

 Exceptions  : throws exception if system command returns none zero

=cut

################################################################################

sub run_system_cmd{
  my ($command, $no_exit) = @_;

  my $redirect = '';

  #$self->debug(3, "system($command)");
  
  # decide where the command line output should be redirected

  #This should account for redirects

  #if ($self->{_debug_level} >= 3){

  #  if (defined $self->{_debug_file}){
  #    $redirect = " >>".$self->{_debug_file}." 2>&1";
  #  }
  #  else{
  #    $redirect = "";
  #  }
  #}
  #else{
    #$redirect = " > /dev/null 2>&1";
  #}

  # execute the passed system command
  my $status = system("$command $redirect");
  my $exit_code = $status >> 8; 
 

  if ($status == -1) {	
	warn "Failed to execute: $!\n";
  }    
  elsif ($status & 127) {
	warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without');
  }    
  elsif($status != 0) {	
	warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code
  }
 
  #We're not catchign error message here!

  if ($exit_code != 0){
		  
    if (! $no_exit){
      throw("System command failed:\t$command\n");
    }
    else{
      warn("System command returned non-zero exit code:\t$command\n");
    }
  }
  
  #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/

  return $exit_code;
}


sub backup_file{
  my $file_path = shift;

  throw("Must define a file path to backup") if(! $file_path);

  if (-f $file_path) {
    #$self->log("Backing up:\t$file_path");
    system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0;
  }

  return 1;

}


sub get_file_format{
  my $file = shift;

  my $format = &is_bed($file);

  if(! $format){
	$format =  &is_sam($file);

	#Add more testes here
  }
  
  
  return $format;
}

sub is_gzipped {
  my ($file, $fail_if_compressed) = @_;

  throw ("File does not exist:\t$file") if ! -e $file;

  open(FILE, "file -L $file |")
	or throw("Can't execute command 'file' on '$file'");
  my $file_info = <FILE>;
  close FILE;

  my $gzip = 0;

  if($file_info =~ /compressed data/){

	if($file_info =~ /gzip/){
	  $gzip = 1;
	}
	else{
	  throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info");
	}
  }

  return $gzip;
}

#Change these to also return the gz status

sub is_sam{
  my $file = shift;

  warn "Only checking file suffix for is_sam";
  #Could check for header here altho this is not mandatory!
  #Can we use web format guessing code?

  my $gz = (&is_gzipped($file, 1)) ? '.gz' : '';

  return ($file =~ /.sam${gz}/) ? 'sam' : 0;
}

#need is bam here too!

sub is_bed {
  my $file = shift;

  #Use open_file here!
  
  if(&is_gzipped($file, 1)){
	
    open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file");
  } 
  else{
    open(FILE, $file) or throw("Can't open file:\t$file");
  }

  my @line;
  #$verbose =1;


  while (<FILE>) {
	chomp;
	@line = split("\t", $_);
	last;
  }
  close FILE;
  
  if (scalar @line < 6) {
	warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t".
	  "CHROM START END NAME SCORE STRAND.\n";
	return 0;
	#} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) {
	#    warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'");
	#    return 0;
	#Commented this out for now due to HSCHR_RANDOM seqs
	#How does the webcode handle this?
  } 
  elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) {
	warn "2nd and 3rd column must contain start and end respectively in '$file'\n";
	return 0;
  } 
  elsif ($line[5] !~ m/^[+-\.]$/) {
	warn "6th column must define strand (either +, - or .) in '$file'\n";
	return 0;
  }

  return 'bed';
}


#These subs are useful for implementing
#a farm mode in a run script, where a script can
#submit itself to the farm as slice based jobs

#strip cmd line params and associated arguments from a list 
#should not be used to remove flag options i.e. no following args
#as this may cause removal of any following @ARGV;
#Can this be used on flattened args hash?

sub strip_param_args{
  my ($args, @strip_params) = @_;

  my $param_name;
  my $seen_opt = 0;

  foreach my $i(0..$#{$args}){

	if($args->[$i] =~ /^[-]+/){
	  $seen_opt = 0;#Reset seen opt if we seen a new one
	  
	  ($param_name = $args->[$i]) =~ s/^[-]+//;

	  if(grep/^${param_name}$/, @strip_params){
		$seen_opt = 1;
	  }
	}

	#$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag
	#$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/;
	#$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice
	
	$args->[$i] = '' if $seen_opt;#Remove option and args following option
  }

  return $args;
}


sub strip_param_flags{
  my ($args, @strip_params) = @_;

  my @args = @$args;

  foreach my $flag(@strip_params){
	@args = grep(!/[-]+${flag}$/, @args);
  }

  return \@args;
}

#Generates slices from names or optionally alll default top level nonref

sub generate_slices_from_names{
  my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_;

  #Test if $assembly is old?



  my (@slices, $slice, $sr_name);

  if(@$slice_names){
	
	foreach my $name(@$slice_names){
	  $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly);

	  #WHy is this failing for hap regions?
	
	  if(! $slice){

		#Need to eval this as it will break with incorrect formating
		
		eval { $slice = $slice_adaptor->fetch_by_name($name) };

		if(! $slice){
		  throw("Could not fetch slice by region or name:\t".$name);
		}
	  }

	  $sr_name = $slice->seq_region_name;

	  next if(grep/^${sr_name}$/, @$skip_slices);
	  push @slices, $slice;
	}
  }
  elsif($highestlevel){

	my $level = 'toplevel';

	if($assembly){
	  $level = 'chromosome';
	  warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level";
	  #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies
	  #No need for projection on non-ref unassembled seqs as these will/should be identical
	  #Only need need to project assembled seq i.e. haps(lrgs?).
	  #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data.
	  #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs)
	}

	my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)};

	if(@$skip_slices){

	  foreach $slice(@tmp_slices){
		$sr_name = $slice->seq_region_name;
		push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices;
	  }
	}
	else{
	  @slices = @tmp_slices;
	}
  }
  else{
	throw('You must either pass an arrayref of slice names or specify the toplevel flag');
  }


  if(! @slices){
	throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices));
  }

  return \@slices;
}


# Tracking DB methods
# Move to DBAdaptor? Can we add this as a separate package in the same module?

sub get_current_regulatory_input_names{
  my ($tdb, $efg_db, $focus) = @_;

  #Validate is production?
  my $sql;

  

  if($focus){
	$focus = 'Focus';
	$sql   = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"';
  }
  else{
	$focus = 'Non-focus';
	#0 rather than false so we don't get NULLs
	$sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"';
  }
 

  #Currently efgdb_set_name can either be data_set or feature_set name!
  #Need to standardise this

  my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)};
  my @names;
  my @failed_sets;

  foreach my $prd_name(@prd_names){

	$sql = "SELECT name from feature_set where name like '${prd_name}%'";
	my @tmp_names =  @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};

	#This is causing problems with multiple feature sets with differing analyses

	#Do this via InputSets(using query extension?) instead of using like?

	#This is very hacky right now to get it to work
	#Need to standardise and review tracking db data.

	if(scalar(@tmp_names) > 1){

	  $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'";
	  @tmp_names =  @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};

	  if(scalar(@tmp_names) == 1){
		push @names, $tmp_names[0];
	  }else{
		push @failed_sets, $prd_name;
	  }
	}
	elsif(scalar(@tmp_names) == 0){
	  push @failed_sets, $prd_name;
	}
	else{
	  push @names, $tmp_names[0];
	}

  }

  if(@failed_sets){
	throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t".
		  join("\n\t", @failed_sets)."\n");
  }

  return @names;
}

#Handy function to add an external_db entry when needed
sub add_external_db{
  my ($efg_db, $db_name,$db_release,$db_display_name) = @_;
  my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'";
  my ($db_id) =  $efg_db->dbc->db_handle->selectrow_array($sql);
  if($db_id){ 
    warn "External DB $db_name $db_release already exists in db with db_id $db_id\n";
  } else {
    #TODO check if it there was a failure
    $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')");
  }

}

1;