Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,286 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT + +=head1 DESCRIPTION + +'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set. +Assumes Files are organized and named with a specific convention +($repository_folder)/experiment_name/cell-type_feature-type/ +unless specific files are given + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT; + +use warnings; +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); + + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); + +sub fetch_input { # fetch and preprocess the input file plus do some more checking + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $efgdba = $self->_efgdba(); + my $fsa = $efgdba->get_FeatureSetAdaptor(); + if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){ + throw "Feature Set was not Created"; + } + + my $bin_dir = $self->_bin_dir(); + + my $analysis = $self->_analysis(); + #Use the definitions that are on the database + + my $cell_type = $self->_cell_type()->name; + my $feature_type = $self->_feature_type()->name; + my $experiment_name = $self->_experiment_name(); + + my $file_type = $self->_file_type(); + + if(($file_type ne 'sam') && ($file_type ne 'bed') ){ + throw "Only sam and bed currently supported for CCAT"; + } + + my $output_dir = $self->_output_dir(); + + my $size_file = $output_dir."/".$self->_set_name.".sizes"; + + #Get the size file... similar to the sam header... + open(FILE, $self->_sam_header); + #Consider having it pregenerated... + open(SIZES,">".$size_file); + while(<FILE>){ + chomp; + /^(\S+)\s+(\d+)\s+/; + my $slice = $1; + my $slice_size = $2; + if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; } + + #Mouse Hack!! + next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/)); + print SIZES $slice."\t".$slice_size."\n"; + } + close SIZES; + close FILE; + + $self->_size_file($size_file); + + my $input_dir = $self->_input_dir(); + + my $file_name = $self->_input_file(); + my $input_file = $input_dir."/".$file_name; + if(-e $input_file){ + my $output_file = $output_dir."/".$file_name; + if(! $self->param('reenter')){ + #TODO Validate if existent file is ok. + $self->_preprocess_file($input_file, $output_file, $file_type) || + throw "Error processing data file $input_file"; + } else { + if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; } + } + $self->_input_file($output_file); + + if(!$self->_results_file($self->param('results_file'))){ + $self->_results_file($output_file.".".$analysis->logic_name); + } + } else { throw "No valid data file was given: ".$input_file; } + + #Always require a control file... + my $control_file = $output_dir."/".$self->_control_file(); + if(! -e $control_file){ throw "No valid control file was given: ".$control_file; } + $self->_control_file($control_file); + + #May need to convert it... + if($file_type eq 'sam'){ + + my $input_file_bed = $self->_input_file; + $input_file_bed =~ s/\.sam/\.bed/; + + #Mouse hack + if($self->_species eq 'mus_musculus'){ + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}"); + } + } else { + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}"); + } + } + $self->_input_file($input_file_bed); + + my $control_file_bed = $self->_control_file; + $control_file_bed =~ s/\.sam/\.bed/; + #Mouse Hack + if($self->_species eq 'mus_musculus'){ + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}"); + } + } else { + if(! $self->param('reenter')){ + run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}"); + } + } + $self->_control_file($control_file_bed); + + } + + return 1; +} + +sub run { # call SWEmbl + my $self = shift @_; + + if($self->param('reenter')){ return 1; } + + my $analysis = $self->_analysis; + #<CCAT path>/bin/CCAT <ChIP library read file name> <control library read file name> + # <chromosome length file name> <config file name> <project name> + my $bin_dir = $self->_bin_dir(); + my $command = $bin_dir."/".$analysis->program_file . + " ".$self->_input_file() . + " ".$self->_control_file() . + " ". $self->_size_file() . + " ".$bin_dir."/ccat_config/".$analysis->parameters . + " " . $self->_results_file; + warn "Running analysis:\t$command"; + run_system_cmd($command); + + return 1; +} + +sub write_output { # Store results + my $self = shift @_; + + $self->_parse_result_file(); + + return 1; +} + +sub _parse_result_file { + + my $self = shift @_; + + ### annotated features + my $fset = $self->_feature_set(); + + my $efgdba = $self->_efgdba(); + my $sa = $efgdba->get_SliceAdaptor(); + + #Cache slices and features... + my %slice; + my @af; + + my %cache_af; + + open(FILE,$self->_results_file().".significant.region"); + while(<FILE>){ + chomp; + + #Content of CCAT output file + my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t"); + #Hardcode a minimum fdr... may pass as parameter + next if ($fdr>0.05); + my $score = $fold; + + #This seqid may vary depending on the input given to SWEmbl... + # handle it manually at least for the moment... namely the sam seqid... + #Make sure to test thoroughly to see if it works... + #e.g. chromosome:GRCh37:15:1:102531392:1 + if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; } + #In case UCSC input is used... + $seqid =~ s/^chr//i; + + if($self->param('slice') && ($seqid ne $self->param('slice'))){ + warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:". + $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + next; + } + + #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits. + next if(defined($cache_af{$seqid."_".$start})); + $cache_af{$seqid."_".$start} = 1; + + #next if ($seqid =~ m/^M/); + + # filtering is done as a post-processing e.g. FilterBlacklist.pm + #$summit = int($summit);#Round up? + + unless (exists $slice{"$seqid"}) { + $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid); + } + + if( ($start < 1) || ($end > $slice{"$seqid"}->end)){ + warn "Feature being ignored due to coordinates out of slice: Region:". + $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; + } + + #Gracefully handle errors... + my $af; + eval{ + $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new + ( + -slice => $slice{"$seqid"}, + -start => $start, + -end => $end, + -strand => 0, + -score => $score, + -summit => $summit, + -feature_set => $fset, + ); + }; + if($@) { warn($@); next; } + if(!$af) { warn("Could not create feature - Region:". + $seqid." Start:".$start." End:".$end." Score:".$score. + " Summit:".$summit); next; } + + push(@af, $af); + } + close FILE; + + # Batch store features... + if(scalar(@af>0)){ + $efgdba->get_AnnotatedFeatureAdaptor->store(@af); + } else { + warn "No significant features detected!"; + } + #Do this on a wrapup runnable...so it will only be visible after reads are loaded... + $fset->adaptor->set_imported_states_by_Set($fset); + + # Status should not be set at this stage + +} + + +#Private getter / setter to the feature set +sub _feature_set { + return $_[0]->_getter_setter('feature_set',$_[1]); +} + +#Private getter / setter to the results file +sub _results_file { + return $_[0]->_getter_setter('results_file',$_[1]); +} + +#Private getter / setter to the size file +sub _size_file { + return $_[0]->_getter_setter('size_file',$_[1]); +} + + +1; +