Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
=head1 LICENSE Copyright (c) 1999-2011 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 NAME Bio::EnsEMBL::Funcgen::Utils::EFGUtils =head1 DESCRIPTION This module collates a variety of miscellaneous methods. =head1 SYNOPSIS BEGIN { unshift(@INC,"/path/of/local/src/modules"); } use Utils; &Utils::send_mail($to_address, $title, $message); =cut # No API/Object based methods in here ############################################################################### package Bio::EnsEMBL::Funcgen::Utils::EFGUtils; require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw(get_date species_name get_month_number species_chr_num open_file median mean run_system_cmd backup_file is_gzipped is_sam is_bed get_file_format strip_param_args generate_slices_from_names strip_param_flags get_current_regulatory_input_names add_external_db); use Bio::EnsEMBL::Utils::Exception qw( throw ); use File::Path qw (mkpath); use File::Basename qw (dirname); use strict; use Time::Local; use FileHandle; use Carp; sub get_date{ my ($format, $file) = @_; my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst); throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file; ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ? localtime((stat($file))[9]) : localtime(); #print " ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n"; if((! defined $format && ! defined $file) || $format eq "date"){ $time = ($year+1900)."-".$mday."-".($mon+1); } elsif($format eq "time"){#not working! $time = "${hour}:${min}:${sec}"; } elsif($format eq "timedate"){# $time = localtime(); } else{#add mysql formats here, datetime etc... croak("get_date does not handle format:\t$format"); } return $time; } #migrate this data to defs file!!?? #must contain all E! species and any other species which are used in local DB extractions #NEED TO ADD FLY!! sub species_name{ my($species) = @_; my %species_names = ( "HOMO_SAPIENS", "human", "MUS_MUSCULUS", "mouse", "RATTUS_NORVEGICUS", "rat", "CANIS_FAMILIARIS", "dog", "PAN_TROGOLODYTES", "chimp", "GALLUS_GALLUS", "chicken", "SACCHAROMYCES_CEREVISIAE", "yeast", "HUMAN", "HOMO_SAPIENS", "MOUSE", "MUS_MUSCULUS", "RAT","RATTUS_NORVEGICUS", "DOG", "CANIS_FAMILIARIS", "CHIMP", "PAN_TROGOLODYTES", "CHICKEN", "GALLUS_GALLUS", "YEAST", "SACCHAROMYCES_CEREVISIAE", ); return $species_names{uc($species)}; } sub get_month_number{ my($mon) = @_; my %month_nos =( "jan", "01", "feb", "02", "mar", "03", "apr", "04", "may", "05", "jun", "06", "jul", "07", "aug", "08", "sep", "09", "oct", "10", "nov", "11", "dec", "12", ); return $month_nos{lc($mon)}; } sub species_chr_num{ my ($species, $val) = @_; ($species = lc($species)) =~ s/ /_/; my %species_chrs = ( homo_sapiens => {( 'x' => 23, 'y' => 24, 'mt' => 25, )}, mus_musculus => {( 'x' => 20, 'y' => 21, 'mt' => 22, )}, rattus_norvegicus => {( 'x' => 21, 'y' => 22, 'mt' => 23, )}, ); die("species not defined in chromosome hash") if(! exists $species_chrs{$species}); return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val; } #Sort should always be done in the caller if required sub median{ my ($scores, $sort) = shift; return undef if (! @$scores); my ($median); my $count = scalar(@$scores); my $index = $count-1; #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #deal with one score fastest return $scores->[0] if ($count == 1); if($sort){ #This is going to sort the reference here, so will affect #The array in the caller #We need to deref to avoid this } #taken from Statistics::Descriptive #remeber we're dealing with size starting with 1 but indices starting at 0 if ($count % 2) { #odd number of scores $median = $scores->[($index+1)/2]; } else { #even, get mean of flanks $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2; } return $median; } sub mean{ my $scores = shift; my $total = 0; map $total+= $_, @$scores; my $mean = $total/(scalar(@$scores)); return $mean; } #Should really extend this to detect previous file? #Or do in caller? sub open_file{ my ($file, $operator, $file_permissions) = @_; my $dir_permissions = $file_permissions || 0755; $operator ||= '<'; if($operator !~ /%/){ $operator = "$operator $file"; } else{ #We have some piping to do $operator = sprintf($operator, $file); } #Get dir here and create if not exists my $dir = dirname($file); mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir); my $fh = new FileHandle "$operator"; #This does not catch incorrectly defined named pipes if(! defined $fh){ croak("Failed to open $operator"); } #Have to chmod here as umask will over-ride permissions passed to FileHandle if(defined $file_permissions){ #Catch non-numeric here as chmod still returns true if($file_permissions =~ /[^0-9]/){ croak("Failed to change $file permissions using:\t$file_permissions"); } #chmod requires a literal octal number e.g. 0775 not '0775' #should catch numbers as strings here, but perl makes this very hard to test #Can't even system this as if we build the cmd line with an octal it will be converted to a decimal #These is still no way of testing for non-octal number or string #eval/sprintf will also not fail if there are non-octal digits i.e. 1999 #eval will treat octal number and string as true octal number #else will pass non-octal string/number which we can't catch chmod(eval($file_permissions), $file); } return $fh; } ################################################################################ =head2 run_system_cmd Description : Method to control the execution of the standard system() command ReturnType : none Example : $Helper->debug(2,"dir=$dir file=$file"); Exceptions : throws exception if system command returns none zero =cut ################################################################################ sub run_system_cmd{ my ($command, $no_exit) = @_; my $redirect = ''; #$self->debug(3, "system($command)"); # decide where the command line output should be redirected #This should account for redirects #if ($self->{_debug_level} >= 3){ # if (defined $self->{_debug_file}){ # $redirect = " >>".$self->{_debug_file}." 2>&1"; # } # else{ # $redirect = ""; # } #} #else{ #$redirect = " > /dev/null 2>&1"; #} # execute the passed system command my $status = system("$command $redirect"); my $exit_code = $status >> 8; if ($status == -1) { warn "Failed to execute: $!\n"; } elsif ($status & 127) { warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without'); } elsif($status != 0) { warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code } #We're not catchign error message here! if ($exit_code != 0){ if (! $no_exit){ throw("System command failed:\t$command\n"); } else{ warn("System command returned non-zero exit code:\t$command\n"); } } #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/ return $exit_code; } sub backup_file{ my $file_path = shift; throw("Must define a file path to backup") if(! $file_path); if (-f $file_path) { #$self->log("Backing up:\t$file_path"); system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0; } return 1; } sub get_file_format{ my $file = shift; my $format = &is_bed($file); if(! $format){ $format = &is_sam($file); #Add more testes here } return $format; } sub is_gzipped { my ($file, $fail_if_compressed) = @_; throw ("File does not exist:\t$file") if ! -e $file; open(FILE, "file -L $file |") or throw("Can't execute command 'file' on '$file'"); my $file_info = <FILE>; close FILE; my $gzip = 0; if($file_info =~ /compressed data/){ if($file_info =~ /gzip/){ $gzip = 1; } else{ throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info"); } } return $gzip; } #Change these to also return the gz status sub is_sam{ my $file = shift; warn "Only checking file suffix for is_sam"; #Could check for header here altho this is not mandatory! #Can we use web format guessing code? my $gz = (&is_gzipped($file, 1)) ? '.gz' : ''; return ($file =~ /.sam${gz}/) ? 'sam' : 0; } #need is bam here too! sub is_bed { my $file = shift; #Use open_file here! if(&is_gzipped($file, 1)){ open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file"); } else{ open(FILE, $file) or throw("Can't open file:\t$file"); } my @line; #$verbose =1; while (<FILE>) { chomp; @line = split("\t", $_); last; } close FILE; if (scalar @line < 6) { warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t". "CHROM START END NAME SCORE STRAND.\n"; return 0; #} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) { # warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'"); # return 0; #Commented this out for now due to HSCHR_RANDOM seqs #How does the webcode handle this? } elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) { warn "2nd and 3rd column must contain start and end respectively in '$file'\n"; return 0; } elsif ($line[5] !~ m/^[+-\.]$/) { warn "6th column must define strand (either +, - or .) in '$file'\n"; return 0; } return 'bed'; } #These subs are useful for implementing #a farm mode in a run script, where a script can #submit itself to the farm as slice based jobs #strip cmd line params and associated arguments from a list #should not be used to remove flag options i.e. no following args #as this may cause removal of any following @ARGV; #Can this be used on flattened args hash? sub strip_param_args{ my ($args, @strip_params) = @_; my $param_name; my $seen_opt = 0; foreach my $i(0..$#{$args}){ if($args->[$i] =~ /^[-]+/){ $seen_opt = 0;#Reset seen opt if we seen a new one ($param_name = $args->[$i]) =~ s/^[-]+//; if(grep/^${param_name}$/, @strip_params){ $seen_opt = 1; } } #$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag #$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/; #$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice $args->[$i] = '' if $seen_opt;#Remove option and args following option } return $args; } sub strip_param_flags{ my ($args, @strip_params) = @_; my @args = @$args; foreach my $flag(@strip_params){ @args = grep(!/[-]+${flag}$/, @args); } return \@args; } #Generates slices from names or optionally alll default top level nonref sub generate_slices_from_names{ my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_; #Test if $assembly is old? my (@slices, $slice, $sr_name); if(@$slice_names){ foreach my $name(@$slice_names){ $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly); #WHy is this failing for hap regions? if(! $slice){ #Need to eval this as it will break with incorrect formating eval { $slice = $slice_adaptor->fetch_by_name($name) }; if(! $slice){ throw("Could not fetch slice by region or name:\t".$name); } } $sr_name = $slice->seq_region_name; next if(grep/^${sr_name}$/, @$skip_slices); push @slices, $slice; } } elsif($highestlevel){ my $level = 'toplevel'; if($assembly){ $level = 'chromosome'; warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level"; #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies #No need for projection on non-ref unassembled seqs as these will/should be identical #Only need need to project assembled seq i.e. haps(lrgs?). #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data. #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs) } my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)}; if(@$skip_slices){ foreach $slice(@tmp_slices){ $sr_name = $slice->seq_region_name; push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices; } } else{ @slices = @tmp_slices; } } else{ throw('You must either pass an arrayref of slice names or specify the toplevel flag'); } if(! @slices){ throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices)); } return \@slices; } # Tracking DB methods # Move to DBAdaptor? Can we add this as a separate package in the same module? sub get_current_regulatory_input_names{ my ($tdb, $efg_db, $focus) = @_; #Validate is production? my $sql; if($focus){ $focus = 'Focus'; $sql = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"'; } else{ $focus = 'Non-focus'; #0 rather than false so we don't get NULLs $sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"'; } #Currently efgdb_set_name can either be data_set or feature_set name! #Need to standardise this my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)}; my @names; my @failed_sets; foreach my $prd_name(@prd_names){ $sql = "SELECT name from feature_set where name like '${prd_name}%'"; my @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; #This is causing problems with multiple feature sets with differing analyses #Do this via InputSets(using query extension?) instead of using like? #This is very hacky right now to get it to work #Need to standardise and review tracking db data. if(scalar(@tmp_names) > 1){ $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'"; @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; if(scalar(@tmp_names) == 1){ push @names, $tmp_names[0]; }else{ push @failed_sets, $prd_name; } } elsif(scalar(@tmp_names) == 0){ push @failed_sets, $prd_name; } else{ push @names, $tmp_names[0]; } } if(@failed_sets){ throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t". join("\n\t", @failed_sets)."\n"); } return @names; } #Handy function to add an external_db entry when needed sub add_external_db{ my ($efg_db, $db_name,$db_release,$db_display_name) = @_; my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'"; my ($db_id) = $efg_db->dbc->db_handle->selectrow_array($sql); if($db_id){ warn "External DB $db_name $db_release already exists in db with db_id $db_id\n"; } else { #TODO check if it there was a failure $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')"); } } 1;