Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,673 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::Utils::EFGUtils
+
+=head1 DESCRIPTION
+
+This module collates a variety of miscellaneous methods.
+
+
+=head1 SYNOPSIS
+
+  BEGIN
+  {
+    unshift(@INC,"/path/of/local/src/modules");
+  }
+
+  use Utils;
+
+  &Utils::send_mail($to_address, $title, $message);
+
+=cut
+
+
+# No API/Object based methods in here
+
+###############################################################################
+
+package Bio::EnsEMBL::Funcgen::Utils::EFGUtils;
+require Exporter;
+@ISA = qw(Exporter);
+@EXPORT_OK = qw(get_date species_name get_month_number species_chr_num 
+				open_file median mean run_system_cmd backup_file 
+				is_gzipped is_sam is_bed get_file_format strip_param_args 
+				generate_slices_from_names strip_param_flags
+				get_current_regulatory_input_names add_external_db);
+
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+use File::Path qw (mkpath);
+use File::Basename qw (dirname);
+use strict;
+use Time::Local;
+use FileHandle;
+use Carp;
+
+sub get_date{
+	my ($format, $file) = @_;
+
+	my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst);	
+
+
+	throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file;
+
+
+	($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ? 
+	  localtime((stat($file))[9]) : localtime();
+
+	#print "	($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n";
+	
+	if((! defined $format && ! defined $file) || $format eq "date"){
+		$time = ($year+1900)."-".$mday."-".($mon+1);	
+	}
+	elsif($format eq "time"){#not working!
+		$time = "${hour}:${min}:${sec}";
+	}
+	elsif($format eq "timedate"){#
+	  $time = localtime();
+	}
+	else{#add mysql formats here, datetime etc...
+		croak("get_date does not handle format:\t$format");
+	}
+
+	return $time;
+}
+
+
+#migrate this data to defs file!!??
+#must contain all E! species and any other species which are used in local DB extractions
+#NEED TO ADD FLY!!
+
+sub species_name{
+  my($species) = @_;
+  my %species_names = (
+		       "HOMO_SAPIENS", "human",
+		       "MUS_MUSCULUS", "mouse",
+		       "RATTUS_NORVEGICUS", "rat",
+		       "CANIS_FAMILIARIS", "dog",
+		       "PAN_TROGOLODYTES", "chimp",
+		       "GALLUS_GALLUS", "chicken",
+		       "SACCHAROMYCES_CEREVISIAE", "yeast",
+		       "HUMAN",  "HOMO_SAPIENS",
+		       "MOUSE", "MUS_MUSCULUS",
+		       "RAT","RATTUS_NORVEGICUS",
+		       "DOG", "CANIS_FAMILIARIS",
+		       "CHIMP", "PAN_TROGOLODYTES",
+		       "CHICKEN", "GALLUS_GALLUS",
+		       "YEAST", "SACCHAROMYCES_CEREVISIAE",
+		      );
+
+  return $species_names{uc($species)};
+}
+
+sub get_month_number{
+  my($mon) = @_;
+  my %month_nos =(
+		  "jan", "01",
+		  "feb", "02",
+		  "mar", "03",
+		  "apr", "04",
+		  "may", "05",
+		  "jun", "06",
+		  "jul", "07",
+		  "aug", "08",
+		  "sep", "09",
+		  "oct", "10",
+		  "nov", "11",
+		  "dec", "12",
+		 );
+  return $month_nos{lc($mon)};
+}
+
+
+sub species_chr_num{
+	my ($species, $val) = @_;
+
+	($species = lc($species)) =~ s/ /_/;
+
+	my %species_chrs = (
+						homo_sapiens => {(
+										  'x' => 23,
+										  'y' => 24,
+										  'mt' => 25, 
+										 )},
+						
+						mus_musculus => {(
+										  'x'  => 20,
+										  'y'  => 21,
+										  'mt' => 22,
+										   )},
+						
+						rattus_norvegicus =>  {(
+												'x'  => 21,
+												'y'  => 22,
+												'mt' => 23,
+											   )},
+					   );
+
+	die("species not defined in chromosome hash") if(! exists $species_chrs{$species});
+
+	return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val;
+}
+
+#Sort should always be done in the caller if required
+
+sub median{
+  my ($scores, $sort) = shift;
+
+  return undef if (! @$scores);
+
+
+  my ($median);
+  my $count = scalar(@$scores);
+  my $index = $count-1;
+  #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+  #deal with one score fastest
+  return  $scores->[0] if ($count == 1);
+
+	 
+  if($sort){
+	#This is going to sort the reference here, so will affect
+	#The array in the caller
+	#We need to deref to avoid this
+  }
+  
+  #taken from Statistics::Descriptive
+  #remeber we're dealing with size starting with 1 but indices starting at 0
+  
+  if ($count % 2) { #odd number of scores
+    $median = $scores->[($index+1)/2];
+  }
+  else { #even, get mean of flanks
+    $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2;
+  }
+
+
+  return $median;
+}
+
+
+sub mean{
+  my $scores = shift;
+  
+  my $total = 0;
+
+  map $total+= $_, @$scores;
+  my $mean = $total/(scalar(@$scores));
+
+  return $mean;
+
+}
+
+#Should really extend this to detect previous file?
+#Or do in caller?
+
+sub open_file{
+  my ($file, $operator, $file_permissions) = @_;
+  
+  my $dir_permissions = $file_permissions || 0755;
+
+  $operator ||= '<';
+
+  if($operator !~ /%/){
+  $operator = "$operator $file";
+  }
+  else{
+  	#We have some piping to do
+  	$operator = sprintf($operator, $file);
+  }
+
+  #Get dir here and create if not exists
+  my $dir = dirname($file);  
+  mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir);
+  my $fh = new FileHandle "$operator";
+
+
+  #This does not catch incorrectly defined named pipes
+
+  if(! defined $fh){
+	croak("Failed to open $operator");
+  }
+
+
+  #Have to chmod here as umask will over-ride permissions passed to FileHandle
+  if(defined $file_permissions){
+
+	#Catch non-numeric here as chmod still returns true
+	if($file_permissions =~ /[^0-9]/){
+	  croak("Failed to change $file permissions using:\t$file_permissions");
+	}
+
+	#chmod requires a literal octal number e.g. 0775 not '0775'
+	#should catch numbers as strings here, but perl makes this very hard to test
+	#Can't even system this as if we build the cmd line with an octal it will be converted to a decimal
+ 	#These is still no way of testing for non-octal number or string
+	#eval/sprintf will also not fail if there are non-octal digits i.e. 1999
+	
+	#eval will treat octal number and string as true octal number
+	#else will pass non-octal string/number which we can't catch
+	chmod(eval($file_permissions), $file);
+  }
+
+  return $fh;
+}
+
+
+
+################################################################################
+
+=head2 run_system_cmd
+
+ Description : Method to control the execution of the standard system() command
+
+ ReturnType  : none
+
+ Example     : $Helper->debug(2,"dir=$dir file=$file");
+
+ Exceptions  : throws exception if system command returns none zero
+
+=cut
+
+################################################################################
+
+sub run_system_cmd{
+  my ($command, $no_exit) = @_;
+
+  my $redirect = '';
+
+  #$self->debug(3, "system($command)");
+  
+  # decide where the command line output should be redirected
+
+  #This should account for redirects
+
+  #if ($self->{_debug_level} >= 3){
+
+  #  if (defined $self->{_debug_file}){
+  #    $redirect = " >>".$self->{_debug_file}." 2>&1";
+  #  }
+  #  else{
+  #    $redirect = "";
+  #  }
+  #}
+  #else{
+    #$redirect = " > /dev/null 2>&1";
+  #}
+
+  # execute the passed system command
+  my $status = system("$command $redirect");
+  my $exit_code = $status >> 8; 
+ 
+
+  if ($status == -1) {	
+	warn "Failed to execute: $!\n";
+  }    
+  elsif ($status & 127) {
+	warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without');
+  }    
+  elsif($status != 0) {	
+	warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code
+  }
+ 
+  #We're not catchign error message here!
+
+  if ($exit_code != 0){
+		  
+    if (! $no_exit){
+      throw("System command failed:\t$command\n");
+    }
+    else{
+      warn("System command returned non-zero exit code:\t$command\n");
+    }
+  }
+  
+  #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/
+
+  return $exit_code;
+}
+
+
+sub backup_file{
+  my $file_path = shift;
+
+  throw("Must define a file path to backup") if(! $file_path);
+
+  if (-f $file_path) {
+    #$self->log("Backing up:\t$file_path");
+    system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0;
+  }
+
+  return 1;
+
+}
+
+
+sub get_file_format{
+  my $file = shift;
+
+  my $format = &is_bed($file);
+
+  if(! $format){
+	$format =  &is_sam($file);
+
+	#Add more testes here
+  }
+  
+  
+  return $format;
+}
+
+sub is_gzipped {
+  my ($file, $fail_if_compressed) = @_;
+
+  throw ("File does not exist:\t$file") if ! -e $file;
+
+  open(FILE, "file -L $file |")
+	or throw("Can't execute command 'file' on '$file'");
+  my $file_info = <FILE>;
+  close FILE;
+
+  my $gzip = 0;
+
+  if($file_info =~ /compressed data/){
+
+	if($file_info =~ /gzip/){
+	  $gzip = 1;
+	}
+	else{
+	  throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info");
+	}
+  }
+
+  return $gzip;
+}
+
+#Change these to also return the gz status
+
+sub is_sam{
+  my $file = shift;
+
+  warn "Only checking file suffix for is_sam";
+  #Could check for header here altho this is not mandatory!
+  #Can we use web format guessing code?
+
+  my $gz = (&is_gzipped($file, 1)) ? '.gz' : '';
+
+  return ($file =~ /.sam${gz}/) ? 'sam' : 0;
+}
+
+#need is bam here too!
+
+sub is_bed {
+  my $file = shift;
+
+  #Use open_file here!
+  
+  if(&is_gzipped($file, 1)){
+	
+    open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file");
+  } 
+  else{
+    open(FILE, $file) or throw("Can't open file:\t$file");
+  }
+
+  my @line;
+  #$verbose =1;
+
+
+  while (<FILE>) {
+	chomp;
+	@line = split("\t", $_);
+	last;
+  }
+  close FILE;
+  
+  if (scalar @line < 6) {
+	warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t".
+	  "CHROM START END NAME SCORE STRAND.\n";
+	return 0;
+	#} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) {
+	#    warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'");
+	#    return 0;
+	#Commented this out for now due to HSCHR_RANDOM seqs
+	#How does the webcode handle this?
+  } 
+  elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) {
+	warn "2nd and 3rd column must contain start and end respectively in '$file'\n";
+	return 0;
+  } 
+  elsif ($line[5] !~ m/^[+-\.]$/) {
+	warn "6th column must define strand (either +, - or .) in '$file'\n";
+	return 0;
+  }
+
+  return 'bed';
+}
+
+
+#These subs are useful for implementing
+#a farm mode in a run script, where a script can
+#submit itself to the farm as slice based jobs
+
+#strip cmd line params and associated arguments from a list 
+#should not be used to remove flag options i.e. no following args
+#as this may cause removal of any following @ARGV;
+#Can this be used on flattened args hash?
+
+sub strip_param_args{
+  my ($args, @strip_params) = @_;
+
+  my $param_name;
+  my $seen_opt = 0;
+
+  foreach my $i(0..$#{$args}){
+
+	if($args->[$i] =~ /^[-]+/){
+	  $seen_opt = 0;#Reset seen opt if we seen a new one
+	  
+	  ($param_name = $args->[$i]) =~ s/^[-]+//;
+
+	  if(grep/^${param_name}$/, @strip_params){
+		$seen_opt = 1;
+	  }
+	}
+
+	#$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag
+	#$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/;
+	#$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice
+	
+	$args->[$i] = '' if $seen_opt;#Remove option and args following option
+  }
+
+  return $args;
+}
+
+
+sub strip_param_flags{
+  my ($args, @strip_params) = @_;
+
+  my @args = @$args;
+
+  foreach my $flag(@strip_params){
+	@args = grep(!/[-]+${flag}$/, @args);
+  }
+
+  return \@args;
+}
+
+#Generates slices from names or optionally alll default top level nonref
+
+sub generate_slices_from_names{
+  my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_;
+
+  #Test if $assembly is old?
+
+
+
+  my (@slices, $slice, $sr_name);
+
+  if(@$slice_names){
+	
+	foreach my $name(@$slice_names){
+	  $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly);
+
+	  #WHy is this failing for hap regions?
+	
+	  if(! $slice){
+
+		#Need to eval this as it will break with incorrect formating
+		
+		eval { $slice = $slice_adaptor->fetch_by_name($name) };
+
+		if(! $slice){
+		  throw("Could not fetch slice by region or name:\t".$name);
+		}
+	  }
+
+	  $sr_name = $slice->seq_region_name;
+
+	  next if(grep/^${sr_name}$/, @$skip_slices);
+	  push @slices, $slice;
+	}
+  }
+  elsif($highestlevel){
+
+	my $level = 'toplevel';
+
+	if($assembly){
+	  $level = 'chromosome';
+	  warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level";
+	  #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies
+	  #No need for projection on non-ref unassembled seqs as these will/should be identical
+	  #Only need need to project assembled seq i.e. haps(lrgs?).
+	  #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data.
+	  #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs)
+	}
+
+	my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)};
+
+	if(@$skip_slices){
+
+	  foreach $slice(@tmp_slices){
+		$sr_name = $slice->seq_region_name;
+		push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices;
+	  }
+	}
+	else{
+	  @slices = @tmp_slices;
+	}
+  }
+  else{
+	throw('You must either pass an arrayref of slice names or specify the toplevel flag');
+  }
+
+
+  if(! @slices){
+	throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices));
+  }
+
+  return \@slices;
+}
+
+
+# Tracking DB methods
+# Move to DBAdaptor? Can we add this as a separate package in the same module?
+
+sub get_current_regulatory_input_names{
+  my ($tdb, $efg_db, $focus) = @_;
+
+  #Validate is production?
+  my $sql;
+
+  
+
+  if($focus){
+	$focus = 'Focus';
+	$sql   = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"';
+  }
+  else{
+	$focus = 'Non-focus';
+	#0 rather than false so we don't get NULLs
+	$sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"';
+  }
+ 
+
+  #Currently efgdb_set_name can either be data_set or feature_set name!
+  #Need to standardise this
+
+  my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)};
+  my @names;
+  my @failed_sets;
+
+  foreach my $prd_name(@prd_names){
+
+	$sql = "SELECT name from feature_set where name like '${prd_name}%'";
+	my @tmp_names =  @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};
+
+	#This is causing problems with multiple feature sets with differing analyses
+
+	#Do this via InputSets(using query extension?) instead of using like?
+
+	#This is very hacky right now to get it to work
+	#Need to standardise and review tracking db data.
+
+	if(scalar(@tmp_names) > 1){
+
+	  $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'";
+	  @tmp_names =  @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};
+
+	  if(scalar(@tmp_names) == 1){
+		push @names, $tmp_names[0];
+	  }else{
+		push @failed_sets, $prd_name;
+	  }
+	}
+	elsif(scalar(@tmp_names) == 0){
+	  push @failed_sets, $prd_name;
+	}
+	else{
+	  push @names, $tmp_names[0];
+	}
+
+  }
+
+  if(@failed_sets){
+	throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t".
+		  join("\n\t", @failed_sets)."\n");
+  }
+
+  return @names;
+}
+
+#Handy function to add an external_db entry when needed
+sub add_external_db{
+  my ($efg_db, $db_name,$db_release,$db_display_name) = @_;
+  my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'";
+  my ($db_id) =  $efg_db->dbc->db_handle->selectrow_array($sql);
+  if($db_id){ 
+    warn "External DB $db_name $db_release already exists in db with db_id $db_id\n";
+  } else {
+    #TODO check if it there was a failure
+    $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')");
+  }
+
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children