Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <ensembl-dev@ebi.ac.uk>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 NAME | |
20 | |
21 Bio::EnsEMBL::Funcgen::Utils::EFGUtils | |
22 | |
23 =head1 DESCRIPTION | |
24 | |
25 This module collates a variety of miscellaneous methods. | |
26 | |
27 | |
28 =head1 SYNOPSIS | |
29 | |
30 BEGIN | |
31 { | |
32 unshift(@INC,"/path/of/local/src/modules"); | |
33 } | |
34 | |
35 use Utils; | |
36 | |
37 &Utils::send_mail($to_address, $title, $message); | |
38 | |
39 =cut | |
40 | |
41 | |
42 # No API/Object based methods in here | |
43 | |
44 ############################################################################### | |
45 | |
46 package Bio::EnsEMBL::Funcgen::Utils::EFGUtils; | |
47 require Exporter; | |
48 @ISA = qw(Exporter); | |
49 @EXPORT_OK = qw(get_date species_name get_month_number species_chr_num | |
50 open_file median mean run_system_cmd backup_file | |
51 is_gzipped is_sam is_bed get_file_format strip_param_args | |
52 generate_slices_from_names strip_param_flags | |
53 get_current_regulatory_input_names add_external_db); | |
54 | |
55 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
56 use File::Path qw (mkpath); | |
57 use File::Basename qw (dirname); | |
58 use strict; | |
59 use Time::Local; | |
60 use FileHandle; | |
61 use Carp; | |
62 | |
63 sub get_date{ | |
64 my ($format, $file) = @_; | |
65 | |
66 my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst); | |
67 | |
68 | |
69 throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file; | |
70 | |
71 | |
72 ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ? | |
73 localtime((stat($file))[9]) : localtime(); | |
74 | |
75 #print " ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n"; | |
76 | |
77 if((! defined $format && ! defined $file) || $format eq "date"){ | |
78 $time = ($year+1900)."-".$mday."-".($mon+1); | |
79 } | |
80 elsif($format eq "time"){#not working! | |
81 $time = "${hour}:${min}:${sec}"; | |
82 } | |
83 elsif($format eq "timedate"){# | |
84 $time = localtime(); | |
85 } | |
86 else{#add mysql formats here, datetime etc... | |
87 croak("get_date does not handle format:\t$format"); | |
88 } | |
89 | |
90 return $time; | |
91 } | |
92 | |
93 | |
94 #migrate this data to defs file!!?? | |
95 #must contain all E! species and any other species which are used in local DB extractions | |
96 #NEED TO ADD FLY!! | |
97 | |
98 sub species_name{ | |
99 my($species) = @_; | |
100 my %species_names = ( | |
101 "HOMO_SAPIENS", "human", | |
102 "MUS_MUSCULUS", "mouse", | |
103 "RATTUS_NORVEGICUS", "rat", | |
104 "CANIS_FAMILIARIS", "dog", | |
105 "PAN_TROGOLODYTES", "chimp", | |
106 "GALLUS_GALLUS", "chicken", | |
107 "SACCHAROMYCES_CEREVISIAE", "yeast", | |
108 "HUMAN", "HOMO_SAPIENS", | |
109 "MOUSE", "MUS_MUSCULUS", | |
110 "RAT","RATTUS_NORVEGICUS", | |
111 "DOG", "CANIS_FAMILIARIS", | |
112 "CHIMP", "PAN_TROGOLODYTES", | |
113 "CHICKEN", "GALLUS_GALLUS", | |
114 "YEAST", "SACCHAROMYCES_CEREVISIAE", | |
115 ); | |
116 | |
117 return $species_names{uc($species)}; | |
118 } | |
119 | |
120 sub get_month_number{ | |
121 my($mon) = @_; | |
122 my %month_nos =( | |
123 "jan", "01", | |
124 "feb", "02", | |
125 "mar", "03", | |
126 "apr", "04", | |
127 "may", "05", | |
128 "jun", "06", | |
129 "jul", "07", | |
130 "aug", "08", | |
131 "sep", "09", | |
132 "oct", "10", | |
133 "nov", "11", | |
134 "dec", "12", | |
135 ); | |
136 return $month_nos{lc($mon)}; | |
137 } | |
138 | |
139 | |
140 sub species_chr_num{ | |
141 my ($species, $val) = @_; | |
142 | |
143 ($species = lc($species)) =~ s/ /_/; | |
144 | |
145 my %species_chrs = ( | |
146 homo_sapiens => {( | |
147 'x' => 23, | |
148 'y' => 24, | |
149 'mt' => 25, | |
150 )}, | |
151 | |
152 mus_musculus => {( | |
153 'x' => 20, | |
154 'y' => 21, | |
155 'mt' => 22, | |
156 )}, | |
157 | |
158 rattus_norvegicus => {( | |
159 'x' => 21, | |
160 'y' => 22, | |
161 'mt' => 23, | |
162 )}, | |
163 ); | |
164 | |
165 die("species not defined in chromosome hash") if(! exists $species_chrs{$species}); | |
166 | |
167 return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val; | |
168 } | |
169 | |
170 #Sort should always be done in the caller if required | |
171 | |
172 sub median{ | |
173 my ($scores, $sort) = shift; | |
174 | |
175 return undef if (! @$scores); | |
176 | |
177 | |
178 my ($median); | |
179 my $count = scalar(@$scores); | |
180 my $index = $count-1; | |
181 #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
182 #deal with one score fastest | |
183 return $scores->[0] if ($count == 1); | |
184 | |
185 | |
186 if($sort){ | |
187 #This is going to sort the reference here, so will affect | |
188 #The array in the caller | |
189 #We need to deref to avoid this | |
190 } | |
191 | |
192 #taken from Statistics::Descriptive | |
193 #remeber we're dealing with size starting with 1 but indices starting at 0 | |
194 | |
195 if ($count % 2) { #odd number of scores | |
196 $median = $scores->[($index+1)/2]; | |
197 } | |
198 else { #even, get mean of flanks | |
199 $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2; | |
200 } | |
201 | |
202 | |
203 return $median; | |
204 } | |
205 | |
206 | |
207 sub mean{ | |
208 my $scores = shift; | |
209 | |
210 my $total = 0; | |
211 | |
212 map $total+= $_, @$scores; | |
213 my $mean = $total/(scalar(@$scores)); | |
214 | |
215 return $mean; | |
216 | |
217 } | |
218 | |
219 #Should really extend this to detect previous file? | |
220 #Or do in caller? | |
221 | |
222 sub open_file{ | |
223 my ($file, $operator, $file_permissions) = @_; | |
224 | |
225 my $dir_permissions = $file_permissions || 0755; | |
226 | |
227 $operator ||= '<'; | |
228 | |
229 if($operator !~ /%/){ | |
230 $operator = "$operator $file"; | |
231 } | |
232 else{ | |
233 #We have some piping to do | |
234 $operator = sprintf($operator, $file); | |
235 } | |
236 | |
237 #Get dir here and create if not exists | |
238 my $dir = dirname($file); | |
239 mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir); | |
240 my $fh = new FileHandle "$operator"; | |
241 | |
242 | |
243 #This does not catch incorrectly defined named pipes | |
244 | |
245 if(! defined $fh){ | |
246 croak("Failed to open $operator"); | |
247 } | |
248 | |
249 | |
250 #Have to chmod here as umask will over-ride permissions passed to FileHandle | |
251 if(defined $file_permissions){ | |
252 | |
253 #Catch non-numeric here as chmod still returns true | |
254 if($file_permissions =~ /[^0-9]/){ | |
255 croak("Failed to change $file permissions using:\t$file_permissions"); | |
256 } | |
257 | |
258 #chmod requires a literal octal number e.g. 0775 not '0775' | |
259 #should catch numbers as strings here, but perl makes this very hard to test | |
260 #Can't even system this as if we build the cmd line with an octal it will be converted to a decimal | |
261 #These is still no way of testing for non-octal number or string | |
262 #eval/sprintf will also not fail if there are non-octal digits i.e. 1999 | |
263 | |
264 #eval will treat octal number and string as true octal number | |
265 #else will pass non-octal string/number which we can't catch | |
266 chmod(eval($file_permissions), $file); | |
267 } | |
268 | |
269 return $fh; | |
270 } | |
271 | |
272 | |
273 | |
274 ################################################################################ | |
275 | |
276 =head2 run_system_cmd | |
277 | |
278 Description : Method to control the execution of the standard system() command | |
279 | |
280 ReturnType : none | |
281 | |
282 Example : $Helper->debug(2,"dir=$dir file=$file"); | |
283 | |
284 Exceptions : throws exception if system command returns none zero | |
285 | |
286 =cut | |
287 | |
288 ################################################################################ | |
289 | |
290 sub run_system_cmd{ | |
291 my ($command, $no_exit) = @_; | |
292 | |
293 my $redirect = ''; | |
294 | |
295 #$self->debug(3, "system($command)"); | |
296 | |
297 # decide where the command line output should be redirected | |
298 | |
299 #This should account for redirects | |
300 | |
301 #if ($self->{_debug_level} >= 3){ | |
302 | |
303 # if (defined $self->{_debug_file}){ | |
304 # $redirect = " >>".$self->{_debug_file}." 2>&1"; | |
305 # } | |
306 # else{ | |
307 # $redirect = ""; | |
308 # } | |
309 #} | |
310 #else{ | |
311 #$redirect = " > /dev/null 2>&1"; | |
312 #} | |
313 | |
314 # execute the passed system command | |
315 my $status = system("$command $redirect"); | |
316 my $exit_code = $status >> 8; | |
317 | |
318 | |
319 if ($status == -1) { | |
320 warn "Failed to execute: $!\n"; | |
321 } | |
322 elsif ($status & 127) { | |
323 warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without'); | |
324 } | |
325 elsif($status != 0) { | |
326 warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code | |
327 } | |
328 | |
329 #We're not catchign error message here! | |
330 | |
331 if ($exit_code != 0){ | |
332 | |
333 if (! $no_exit){ | |
334 throw("System command failed:\t$command\n"); | |
335 } | |
336 else{ | |
337 warn("System command returned non-zero exit code:\t$command\n"); | |
338 } | |
339 } | |
340 | |
341 #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/ | |
342 | |
343 return $exit_code; | |
344 } | |
345 | |
346 | |
347 sub backup_file{ | |
348 my $file_path = shift; | |
349 | |
350 throw("Must define a file path to backup") if(! $file_path); | |
351 | |
352 if (-f $file_path) { | |
353 #$self->log("Backing up:\t$file_path"); | |
354 system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0; | |
355 } | |
356 | |
357 return 1; | |
358 | |
359 } | |
360 | |
361 | |
362 sub get_file_format{ | |
363 my $file = shift; | |
364 | |
365 my $format = &is_bed($file); | |
366 | |
367 if(! $format){ | |
368 $format = &is_sam($file); | |
369 | |
370 #Add more testes here | |
371 } | |
372 | |
373 | |
374 return $format; | |
375 } | |
376 | |
377 sub is_gzipped { | |
378 my ($file, $fail_if_compressed) = @_; | |
379 | |
380 throw ("File does not exist:\t$file") if ! -e $file; | |
381 | |
382 open(FILE, "file -L $file |") | |
383 or throw("Can't execute command 'file' on '$file'"); | |
384 my $file_info = <FILE>; | |
385 close FILE; | |
386 | |
387 my $gzip = 0; | |
388 | |
389 if($file_info =~ /compressed data/){ | |
390 | |
391 if($file_info =~ /gzip/){ | |
392 $gzip = 1; | |
393 } | |
394 else{ | |
395 throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info"); | |
396 } | |
397 } | |
398 | |
399 return $gzip; | |
400 } | |
401 | |
402 #Change these to also return the gz status | |
403 | |
404 sub is_sam{ | |
405 my $file = shift; | |
406 | |
407 warn "Only checking file suffix for is_sam"; | |
408 #Could check for header here altho this is not mandatory! | |
409 #Can we use web format guessing code? | |
410 | |
411 my $gz = (&is_gzipped($file, 1)) ? '.gz' : ''; | |
412 | |
413 return ($file =~ /.sam${gz}/) ? 'sam' : 0; | |
414 } | |
415 | |
416 #need is bam here too! | |
417 | |
418 sub is_bed { | |
419 my $file = shift; | |
420 | |
421 #Use open_file here! | |
422 | |
423 if(&is_gzipped($file, 1)){ | |
424 | |
425 open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file"); | |
426 } | |
427 else{ | |
428 open(FILE, $file) or throw("Can't open file:\t$file"); | |
429 } | |
430 | |
431 my @line; | |
432 #$verbose =1; | |
433 | |
434 | |
435 while (<FILE>) { | |
436 chomp; | |
437 @line = split("\t", $_); | |
438 last; | |
439 } | |
440 close FILE; | |
441 | |
442 if (scalar @line < 6) { | |
443 warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t". | |
444 "CHROM START END NAME SCORE STRAND.\n"; | |
445 return 0; | |
446 #} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) { | |
447 # warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'"); | |
448 # return 0; | |
449 #Commented this out for now due to HSCHR_RANDOM seqs | |
450 #How does the webcode handle this? | |
451 } | |
452 elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) { | |
453 warn "2nd and 3rd column must contain start and end respectively in '$file'\n"; | |
454 return 0; | |
455 } | |
456 elsif ($line[5] !~ m/^[+-\.]$/) { | |
457 warn "6th column must define strand (either +, - or .) in '$file'\n"; | |
458 return 0; | |
459 } | |
460 | |
461 return 'bed'; | |
462 } | |
463 | |
464 | |
465 #These subs are useful for implementing | |
466 #a farm mode in a run script, where a script can | |
467 #submit itself to the farm as slice based jobs | |
468 | |
469 #strip cmd line params and associated arguments from a list | |
470 #should not be used to remove flag options i.e. no following args | |
471 #as this may cause removal of any following @ARGV; | |
472 #Can this be used on flattened args hash? | |
473 | |
474 sub strip_param_args{ | |
475 my ($args, @strip_params) = @_; | |
476 | |
477 my $param_name; | |
478 my $seen_opt = 0; | |
479 | |
480 foreach my $i(0..$#{$args}){ | |
481 | |
482 if($args->[$i] =~ /^[-]+/){ | |
483 $seen_opt = 0;#Reset seen opt if we seen a new one | |
484 | |
485 ($param_name = $args->[$i]) =~ s/^[-]+//; | |
486 | |
487 if(grep/^${param_name}$/, @strip_params){ | |
488 $seen_opt = 1; | |
489 } | |
490 } | |
491 | |
492 #$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag | |
493 #$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/; | |
494 #$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice | |
495 | |
496 $args->[$i] = '' if $seen_opt;#Remove option and args following option | |
497 } | |
498 | |
499 return $args; | |
500 } | |
501 | |
502 | |
503 sub strip_param_flags{ | |
504 my ($args, @strip_params) = @_; | |
505 | |
506 my @args = @$args; | |
507 | |
508 foreach my $flag(@strip_params){ | |
509 @args = grep(!/[-]+${flag}$/, @args); | |
510 } | |
511 | |
512 return \@args; | |
513 } | |
514 | |
515 #Generates slices from names or optionally alll default top level nonref | |
516 | |
517 sub generate_slices_from_names{ | |
518 my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_; | |
519 | |
520 #Test if $assembly is old? | |
521 | |
522 | |
523 | |
524 my (@slices, $slice, $sr_name); | |
525 | |
526 if(@$slice_names){ | |
527 | |
528 foreach my $name(@$slice_names){ | |
529 $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly); | |
530 | |
531 #WHy is this failing for hap regions? | |
532 | |
533 if(! $slice){ | |
534 | |
535 #Need to eval this as it will break with incorrect formating | |
536 | |
537 eval { $slice = $slice_adaptor->fetch_by_name($name) }; | |
538 | |
539 if(! $slice){ | |
540 throw("Could not fetch slice by region or name:\t".$name); | |
541 } | |
542 } | |
543 | |
544 $sr_name = $slice->seq_region_name; | |
545 | |
546 next if(grep/^${sr_name}$/, @$skip_slices); | |
547 push @slices, $slice; | |
548 } | |
549 } | |
550 elsif($highestlevel){ | |
551 | |
552 my $level = 'toplevel'; | |
553 | |
554 if($assembly){ | |
555 $level = 'chromosome'; | |
556 warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level"; | |
557 #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies | |
558 #No need for projection on non-ref unassembled seqs as these will/should be identical | |
559 #Only need need to project assembled seq i.e. haps(lrgs?). | |
560 #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data. | |
561 #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs) | |
562 } | |
563 | |
564 my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)}; | |
565 | |
566 if(@$skip_slices){ | |
567 | |
568 foreach $slice(@tmp_slices){ | |
569 $sr_name = $slice->seq_region_name; | |
570 push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices; | |
571 } | |
572 } | |
573 else{ | |
574 @slices = @tmp_slices; | |
575 } | |
576 } | |
577 else{ | |
578 throw('You must either pass an arrayref of slice names or specify the toplevel flag'); | |
579 } | |
580 | |
581 | |
582 if(! @slices){ | |
583 throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices)); | |
584 } | |
585 | |
586 return \@slices; | |
587 } | |
588 | |
589 | |
590 # Tracking DB methods | |
591 # Move to DBAdaptor? Can we add this as a separate package in the same module? | |
592 | |
593 sub get_current_regulatory_input_names{ | |
594 my ($tdb, $efg_db, $focus) = @_; | |
595 | |
596 #Validate is production? | |
597 my $sql; | |
598 | |
599 | |
600 | |
601 if($focus){ | |
602 $focus = 'Focus'; | |
603 $sql = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"'; | |
604 } | |
605 else{ | |
606 $focus = 'Non-focus'; | |
607 #0 rather than false so we don't get NULLs | |
608 $sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"'; | |
609 } | |
610 | |
611 | |
612 #Currently efgdb_set_name can either be data_set or feature_set name! | |
613 #Need to standardise this | |
614 | |
615 my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)}; | |
616 my @names; | |
617 my @failed_sets; | |
618 | |
619 foreach my $prd_name(@prd_names){ | |
620 | |
621 $sql = "SELECT name from feature_set where name like '${prd_name}%'"; | |
622 my @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; | |
623 | |
624 #This is causing problems with multiple feature sets with differing analyses | |
625 | |
626 #Do this via InputSets(using query extension?) instead of using like? | |
627 | |
628 #This is very hacky right now to get it to work | |
629 #Need to standardise and review tracking db data. | |
630 | |
631 if(scalar(@tmp_names) > 1){ | |
632 | |
633 $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'"; | |
634 @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)}; | |
635 | |
636 if(scalar(@tmp_names) == 1){ | |
637 push @names, $tmp_names[0]; | |
638 }else{ | |
639 push @failed_sets, $prd_name; | |
640 } | |
641 } | |
642 elsif(scalar(@tmp_names) == 0){ | |
643 push @failed_sets, $prd_name; | |
644 } | |
645 else{ | |
646 push @names, $tmp_names[0]; | |
647 } | |
648 | |
649 } | |
650 | |
651 if(@failed_sets){ | |
652 throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t". | |
653 join("\n\t", @failed_sets)."\n"); | |
654 } | |
655 | |
656 return @names; | |
657 } | |
658 | |
659 #Handy function to add an external_db entry when needed | |
660 sub add_external_db{ | |
661 my ($efg_db, $db_name,$db_release,$db_display_name) = @_; | |
662 my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'"; | |
663 my ($db_id) = $efg_db->dbc->db_handle->selectrow_array($sql); | |
664 if($db_id){ | |
665 warn "External DB $db_name $db_release already exists in db with db_id $db_id\n"; | |
666 } else { | |
667 #TODO check if it there was a failure | |
668 $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')"); | |
669 } | |
670 | |
671 } | |
672 | |
673 1; |