comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/EFGUtils.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <ensembl-dev@ebi.ac.uk>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 NAME
20
21 Bio::EnsEMBL::Funcgen::Utils::EFGUtils
22
23 =head1 DESCRIPTION
24
25 This module collates a variety of miscellaneous methods.
26
27
28 =head1 SYNOPSIS
29
30 BEGIN
31 {
32 unshift(@INC,"/path/of/local/src/modules");
33 }
34
35 use Utils;
36
37 &Utils::send_mail($to_address, $title, $message);
38
39 =cut
40
41
42 # No API/Object based methods in here
43
44 ###############################################################################
45
46 package Bio::EnsEMBL::Funcgen::Utils::EFGUtils;
47 require Exporter;
48 @ISA = qw(Exporter);
49 @EXPORT_OK = qw(get_date species_name get_month_number species_chr_num
50 open_file median mean run_system_cmd backup_file
51 is_gzipped is_sam is_bed get_file_format strip_param_args
52 generate_slices_from_names strip_param_flags
53 get_current_regulatory_input_names add_external_db);
54
55 use Bio::EnsEMBL::Utils::Exception qw( throw );
56 use File::Path qw (mkpath);
57 use File::Basename qw (dirname);
58 use strict;
59 use Time::Local;
60 use FileHandle;
61 use Carp;
62
63 sub get_date{
64 my ($format, $file) = @_;
65
66 my ($time, $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst);
67
68
69 throw("File does not exist or is not a regular file:\t$file") if $file && ! -f $file;
70
71
72 ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = (defined $file) ?
73 localtime((stat($file))[9]) : localtime();
74
75 #print " ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst)\n";
76
77 if((! defined $format && ! defined $file) || $format eq "date"){
78 $time = ($year+1900)."-".$mday."-".($mon+1);
79 }
80 elsif($format eq "time"){#not working!
81 $time = "${hour}:${min}:${sec}";
82 }
83 elsif($format eq "timedate"){#
84 $time = localtime();
85 }
86 else{#add mysql formats here, datetime etc...
87 croak("get_date does not handle format:\t$format");
88 }
89
90 return $time;
91 }
92
93
94 #migrate this data to defs file!!??
95 #must contain all E! species and any other species which are used in local DB extractions
96 #NEED TO ADD FLY!!
97
98 sub species_name{
99 my($species) = @_;
100 my %species_names = (
101 "HOMO_SAPIENS", "human",
102 "MUS_MUSCULUS", "mouse",
103 "RATTUS_NORVEGICUS", "rat",
104 "CANIS_FAMILIARIS", "dog",
105 "PAN_TROGOLODYTES", "chimp",
106 "GALLUS_GALLUS", "chicken",
107 "SACCHAROMYCES_CEREVISIAE", "yeast",
108 "HUMAN", "HOMO_SAPIENS",
109 "MOUSE", "MUS_MUSCULUS",
110 "RAT","RATTUS_NORVEGICUS",
111 "DOG", "CANIS_FAMILIARIS",
112 "CHIMP", "PAN_TROGOLODYTES",
113 "CHICKEN", "GALLUS_GALLUS",
114 "YEAST", "SACCHAROMYCES_CEREVISIAE",
115 );
116
117 return $species_names{uc($species)};
118 }
119
120 sub get_month_number{
121 my($mon) = @_;
122 my %month_nos =(
123 "jan", "01",
124 "feb", "02",
125 "mar", "03",
126 "apr", "04",
127 "may", "05",
128 "jun", "06",
129 "jul", "07",
130 "aug", "08",
131 "sep", "09",
132 "oct", "10",
133 "nov", "11",
134 "dec", "12",
135 );
136 return $month_nos{lc($mon)};
137 }
138
139
140 sub species_chr_num{
141 my ($species, $val) = @_;
142
143 ($species = lc($species)) =~ s/ /_/;
144
145 my %species_chrs = (
146 homo_sapiens => {(
147 'x' => 23,
148 'y' => 24,
149 'mt' => 25,
150 )},
151
152 mus_musculus => {(
153 'x' => 20,
154 'y' => 21,
155 'mt' => 22,
156 )},
157
158 rattus_norvegicus => {(
159 'x' => 21,
160 'y' => 22,
161 'mt' => 23,
162 )},
163 );
164
165 die("species not defined in chromosome hash") if(! exists $species_chrs{$species});
166
167 return (exists $species_chrs{$species}{lc($val)}) ? $species_chrs{$species}{lc($val)} : $val;
168 }
169
170 #Sort should always be done in the caller if required
171
172 sub median{
173 my ($scores, $sort) = shift;
174
175 return undef if (! @$scores);
176
177
178 my ($median);
179 my $count = scalar(@$scores);
180 my $index = $count-1;
181 #need to deal with lines with no results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
182 #deal with one score fastest
183 return $scores->[0] if ($count == 1);
184
185
186 if($sort){
187 #This is going to sort the reference here, so will affect
188 #The array in the caller
189 #We need to deref to avoid this
190 }
191
192 #taken from Statistics::Descriptive
193 #remeber we're dealing with size starting with 1 but indices starting at 0
194
195 if ($count % 2) { #odd number of scores
196 $median = $scores->[($index+1)/2];
197 }
198 else { #even, get mean of flanks
199 $median = ($scores->[($index)/2] + $scores->[($index/2)+1] ) / 2;
200 }
201
202
203 return $median;
204 }
205
206
207 sub mean{
208 my $scores = shift;
209
210 my $total = 0;
211
212 map $total+= $_, @$scores;
213 my $mean = $total/(scalar(@$scores));
214
215 return $mean;
216
217 }
218
219 #Should really extend this to detect previous file?
220 #Or do in caller?
221
222 sub open_file{
223 my ($file, $operator, $file_permissions) = @_;
224
225 my $dir_permissions = $file_permissions || 0755;
226
227 $operator ||= '<';
228
229 if($operator !~ /%/){
230 $operator = "$operator $file";
231 }
232 else{
233 #We have some piping to do
234 $operator = sprintf($operator, $file);
235 }
236
237 #Get dir here and create if not exists
238 my $dir = dirname($file);
239 mkpath($dir, {verbose => 1, mode => $dir_permissions}) if(! -d $dir);
240 my $fh = new FileHandle "$operator";
241
242
243 #This does not catch incorrectly defined named pipes
244
245 if(! defined $fh){
246 croak("Failed to open $operator");
247 }
248
249
250 #Have to chmod here as umask will over-ride permissions passed to FileHandle
251 if(defined $file_permissions){
252
253 #Catch non-numeric here as chmod still returns true
254 if($file_permissions =~ /[^0-9]/){
255 croak("Failed to change $file permissions using:\t$file_permissions");
256 }
257
258 #chmod requires a literal octal number e.g. 0775 not '0775'
259 #should catch numbers as strings here, but perl makes this very hard to test
260 #Can't even system this as if we build the cmd line with an octal it will be converted to a decimal
261 #These is still no way of testing for non-octal number or string
262 #eval/sprintf will also not fail if there are non-octal digits i.e. 1999
263
264 #eval will treat octal number and string as true octal number
265 #else will pass non-octal string/number which we can't catch
266 chmod(eval($file_permissions), $file);
267 }
268
269 return $fh;
270 }
271
272
273
274 ################################################################################
275
276 =head2 run_system_cmd
277
278 Description : Method to control the execution of the standard system() command
279
280 ReturnType : none
281
282 Example : $Helper->debug(2,"dir=$dir file=$file");
283
284 Exceptions : throws exception if system command returns none zero
285
286 =cut
287
288 ################################################################################
289
290 sub run_system_cmd{
291 my ($command, $no_exit) = @_;
292
293 my $redirect = '';
294
295 #$self->debug(3, "system($command)");
296
297 # decide where the command line output should be redirected
298
299 #This should account for redirects
300
301 #if ($self->{_debug_level} >= 3){
302
303 # if (defined $self->{_debug_file}){
304 # $redirect = " >>".$self->{_debug_file}." 2>&1";
305 # }
306 # else{
307 # $redirect = "";
308 # }
309 #}
310 #else{
311 #$redirect = " > /dev/null 2>&1";
312 #}
313
314 # execute the passed system command
315 my $status = system("$command $redirect");
316 my $exit_code = $status >> 8;
317
318
319 if ($status == -1) {
320 warn "Failed to execute: $!\n";
321 }
322 elsif ($status & 127) {
323 warn sprintf("Child died with signal %d, %s coredump\nError:\t$!",($status & 127),($status & 128) ? 'with' : 'without');
324 }
325 elsif($status != 0) {
326 warn sprintf("Child exited with value %d\nError:\t$!\n", $exit_code); #get the true exit code
327 }
328
329 #We're not catchign error message here!
330
331 if ($exit_code != 0){
332
333 if (! $no_exit){
334 throw("System command failed:\t$command\n");
335 }
336 else{
337 warn("System command returned non-zero exit code:\t$command\n");
338 }
339 }
340
341 #reverse boolean logic for perl...can't do this anymore due to tab2mage successful non-zero exit codes :/
342
343 return $exit_code;
344 }
345
346
347 sub backup_file{
348 my $file_path = shift;
349
350 throw("Must define a file path to backup") if(! $file_path);
351
352 if (-f $file_path) {
353 #$self->log("Backing up:\t$file_path");
354 system ("mv ${file_path} ${file_path}.".`date '+%T'`) == 0 || return 0;
355 }
356
357 return 1;
358
359 }
360
361
362 sub get_file_format{
363 my $file = shift;
364
365 my $format = &is_bed($file);
366
367 if(! $format){
368 $format = &is_sam($file);
369
370 #Add more testes here
371 }
372
373
374 return $format;
375 }
376
377 sub is_gzipped {
378 my ($file, $fail_if_compressed) = @_;
379
380 throw ("File does not exist:\t$file") if ! -e $file;
381
382 open(FILE, "file -L $file |")
383 or throw("Can't execute command 'file' on '$file'");
384 my $file_info = <FILE>;
385 close FILE;
386
387 my $gzip = 0;
388
389 if($file_info =~ /compressed data/){
390
391 if($file_info =~ /gzip/){
392 $gzip = 1;
393 }
394 else{
395 throw("File is compressed but not with gzip, please unzip or gzip:\t$file_info");
396 }
397 }
398
399 return $gzip;
400 }
401
402 #Change these to also return the gz status
403
404 sub is_sam{
405 my $file = shift;
406
407 warn "Only checking file suffix for is_sam";
408 #Could check for header here altho this is not mandatory!
409 #Can we use web format guessing code?
410
411 my $gz = (&is_gzipped($file, 1)) ? '.gz' : '';
412
413 return ($file =~ /.sam${gz}/) ? 'sam' : 0;
414 }
415
416 #need is bam here too!
417
418 sub is_bed {
419 my $file = shift;
420
421 #Use open_file here!
422
423 if(&is_gzipped($file, 1)){
424
425 open(FILE, "zcat $file 2>&1 |") or throw("Can't open file via zcat:\t$file");
426 }
427 else{
428 open(FILE, $file) or throw("Can't open file:\t$file");
429 }
430
431 my @line;
432 #$verbose =1;
433
434
435 while (<FILE>) {
436 chomp;
437 @line = split("\t", $_);
438 last;
439 }
440 close FILE;
441
442 if (scalar @line < 6) {
443 warn "Infile '$file' does not have 6 or more columns. We expect bed format:\t".
444 "CHROM START END NAME SCORE STRAND.\n";
445 return 0;
446 #} elsif ($line[0] !~ m/^((chr)?[MTXYNT_\d]+)$/) {
447 # warn ("1st column must contain name of seq_region (e.g. chr1 or 1) in '$file'");
448 # return 0;
449 #Commented this out for now due to HSCHR_RANDOM seqs
450 #How does the webcode handle this?
451 }
452 elsif ($line[1] !~ m/^\d+$/ && $line[2] =~ m/^\d+$/) {
453 warn "2nd and 3rd column must contain start and end respectively in '$file'\n";
454 return 0;
455 }
456 elsif ($line[5] !~ m/^[+-\.]$/) {
457 warn "6th column must define strand (either +, - or .) in '$file'\n";
458 return 0;
459 }
460
461 return 'bed';
462 }
463
464
465 #These subs are useful for implementing
466 #a farm mode in a run script, where a script can
467 #submit itself to the farm as slice based jobs
468
469 #strip cmd line params and associated arguments from a list
470 #should not be used to remove flag options i.e. no following args
471 #as this may cause removal of any following @ARGV;
472 #Can this be used on flattened args hash?
473
474 sub strip_param_args{
475 my ($args, @strip_params) = @_;
476
477 my $param_name;
478 my $seen_opt = 0;
479
480 foreach my $i(0..$#{$args}){
481
482 if($args->[$i] =~ /^[-]+/){
483 $seen_opt = 0;#Reset seen opt if we seen a new one
484
485 ($param_name = $args->[$i]) =~ s/^[-]+//;
486
487 if(grep/^${param_name}$/, @strip_params){
488 $seen_opt = 1;
489 }
490 }
491
492 #$args->[$i] = '' if $args->[$i] =~ /^[-]+farm/;#Only remove current flag
493 #$seen_opt = 1 if $args->[$i] =~ /^[-]+skip_slices/;
494 #$seen_opt = 1 if $args->[$i] =~ /^[-]+slice/;#Don't have full param name incase we have just specified -slice
495
496 $args->[$i] = '' if $seen_opt;#Remove option and args following option
497 }
498
499 return $args;
500 }
501
502
503 sub strip_param_flags{
504 my ($args, @strip_params) = @_;
505
506 my @args = @$args;
507
508 foreach my $flag(@strip_params){
509 @args = grep(!/[-]+${flag}$/, @args);
510 }
511
512 return \@args;
513 }
514
515 #Generates slices from names or optionally alll default top level nonref
516
517 sub generate_slices_from_names{
518 my ($slice_adaptor, $slice_names, $skip_slices, $highestlevel, $non_ref, $inc_dups, $assembly) = @_;
519
520 #Test if $assembly is old?
521
522
523
524 my (@slices, $slice, $sr_name);
525
526 if(@$slice_names){
527
528 foreach my $name(@$slice_names){
529 $slice = $slice_adaptor->fetch_by_region(undef, $name, undef, undef, undef, $assembly);
530
531 #WHy is this failing for hap regions?
532
533 if(! $slice){
534
535 #Need to eval this as it will break with incorrect formating
536
537 eval { $slice = $slice_adaptor->fetch_by_name($name) };
538
539 if(! $slice){
540 throw("Could not fetch slice by region or name:\t".$name);
541 }
542 }
543
544 $sr_name = $slice->seq_region_name;
545
546 next if(grep/^${sr_name}$/, @$skip_slices);
547 push @slices, $slice;
548 }
549 }
550 elsif($highestlevel){
551
552 my $level = 'toplevel';
553
554 if($assembly){
555 $level = 'chromosome';
556 warn "Cannot get toplevel for old assembly version $assembly, defaulting to 'chromosome' level";
557 #Would ignore old assembly and just fetch current assembly otherwise as there is no toplevel for old assemblies
558 #No need for projection on non-ref unassembled seqs as these will/should be identical
559 #Only need need to project assembled seq i.e. haps(lrgs?).
560 #Only rollback toplevel data when cleaning after projection, otherwise we may lose some data.
561 #Change default delete to use all toplevel ref seqs (and non-ref with cs version e.g. haps but not lrgs)
562 }
563
564 my @tmp_slices = @{$slice_adaptor->fetch_all($level, $assembly, $non_ref, $inc_dups)};
565
566 if(@$skip_slices){
567
568 foreach $slice(@tmp_slices){
569 $sr_name = $slice->seq_region_name;
570 push @slices, $slice if ! grep/^${sr_name}$/, @$skip_slices;
571 }
572 }
573 else{
574 @slices = @tmp_slices;
575 }
576 }
577 else{
578 throw('You must either pass an arrayref of slice names or specify the toplevel flag');
579 }
580
581
582 if(! @slices){
583 throw("You have specified slice_names and skip_slices paramters which have generated no slices.\nslice_names:\t".join(' ',@$slice_names)."\nskip_slices:\t".join(' ', @$skip_slices));
584 }
585
586 return \@slices;
587 }
588
589
590 # Tracking DB methods
591 # Move to DBAdaptor? Can we add this as a separate package in the same module?
592
593 sub get_current_regulatory_input_names{
594 my ($tdb, $efg_db, $focus) = @_;
595
596 #Validate is production?
597 my $sql;
598
599
600
601 if($focus){
602 $focus = 'Focus';
603 $sql = 'SELECT efgdb_set_name from dataset where is_focus=true and is_current=true and species="'.$efg_db->species.'"';
604 }
605 else{
606 $focus = 'Non-focus';
607 #0 rather than false so we don't get NULLs
608 $sql = 'SELECT efgdb_set_name from dataset where is_focus=0 and is_current=true and species="'.$efg_db->species.'"';
609 }
610
611
612 #Currently efgdb_set_name can either be data_set or feature_set name!
613 #Need to standardise this
614
615 my @prd_names = @{$tdb->db_handle->selectcol_arrayref($sql)};
616 my @names;
617 my @failed_sets;
618
619 foreach my $prd_name(@prd_names){
620
621 $sql = "SELECT name from feature_set where name like '${prd_name}%'";
622 my @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};
623
624 #This is causing problems with multiple feature sets with differing analyses
625
626 #Do this via InputSets(using query extension?) instead of using like?
627
628 #This is very hacky right now to get it to work
629 #Need to standardise and review tracking db data.
630
631 if(scalar(@tmp_names) > 1){
632
633 $sql = "SELECT name from feature_set where name ='${prd_name}_ccat_histone'";
634 @tmp_names = @{$efg_db->dbc->db_handle->selectcol_arrayref($sql)};
635
636 if(scalar(@tmp_names) == 1){
637 push @names, $tmp_names[0];
638 }else{
639 push @failed_sets, $prd_name;
640 }
641 }
642 elsif(scalar(@tmp_names) == 0){
643 push @failed_sets, $prd_name;
644 }
645 else{
646 push @names, $tmp_names[0];
647 }
648
649 }
650
651 if(@failed_sets){
652 throw("Failed to find unique $focus FeatureSets for production dataset names:\n\t".
653 join("\n\t", @failed_sets)."\n");
654 }
655
656 return @names;
657 }
658
659 #Handy function to add an external_db entry when needed
660 sub add_external_db{
661 my ($efg_db, $db_name,$db_release,$db_display_name) = @_;
662 my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$db_release'";
663 my ($db_id) = $efg_db->dbc->db_handle->selectrow_array($sql);
664 if($db_id){
665 warn "External DB $db_name $db_release already exists in db with db_id $db_id\n";
666 } else {
667 #TODO check if it there was a failure
668 $efg_db->dbc->do("insert into external_db (db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type) values('$db_name', '$db_release', 'KNOWNXREF', '1', '5', '$db_display_name', 'MISC')");
669 }
670
671 }
672
673 1;