diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,286 @@
+=pod 
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT
+
+=head1 DESCRIPTION
+
+'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set.
+Assumes Files are organized and named with a specific convention 
+($repository_folder)/experiment_name/cell-type_feature-type/
+unless specific files are given
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT;
+
+use warnings;
+use strict;
+use Bio::EnsEMBL::Funcgen::Utils::Helper;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; 
+use Bio::EnsEMBL::Funcgen::InputSet;
+use Bio::EnsEMBL::Funcgen::DataSet;
+use Bio::EnsEMBL::Funcgen::FeatureSet;
+use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
+
+
+use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');
+
+sub fetch_input {   # fetch and preprocess the input file plus do some more checking
+  my $self = shift @_;
+
+  $self->SUPER::fetch_input();
+
+  my $efgdba = $self->_efgdba();
+  my $fsa = $efgdba->get_FeatureSetAdaptor();
+  if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){
+    throw "Feature Set was not Created";
+  }
+
+  my $bin_dir = $self->_bin_dir();
+
+  my $analysis = $self->_analysis();
+  #Use the definitions that are on the database
+  
+  my $cell_type = $self->_cell_type()->name;
+  my $feature_type = $self->_feature_type()->name;
+  my $experiment_name = $self->_experiment_name();
+  
+  my $file_type = $self->_file_type();
+
+  if(($file_type ne 'sam') && ($file_type ne 'bed') ){ 
+    throw "Only sam and bed currently supported for CCAT"; 
+  }
+
+  my $output_dir = $self->_output_dir();  
+
+  my $size_file = $output_dir."/".$self->_set_name.".sizes";
+
+  #Get the size file... similar to the sam header... 
+  open(FILE, $self->_sam_header);
+  #Consider having it pregenerated...
+  open(SIZES,">".$size_file);
+  while(<FILE>){
+    chomp;
+    /^(\S+)\s+(\d+)\s+/;
+    my $slice = $1;
+    my $slice_size = $2;
+    if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; }
+
+    #Mouse Hack!!
+    next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/));
+    print SIZES $slice."\t".$slice_size."\n";
+  }
+  close SIZES;
+  close FILE;
+
+  $self->_size_file($size_file);
+
+  my $input_dir = $self->_input_dir();  
+   
+  my $file_name = $self->_input_file();
+  my $input_file =  $input_dir."/".$file_name;
+  if(-e $input_file){ 
+    my $output_file = $output_dir."/".$file_name;
+    if(! $self->param('reenter')){
+      #TODO Validate if existent file is ok. 
+      $self->_preprocess_file($input_file, $output_file, $file_type) || 
+	throw "Error processing data file $input_file";
+    } else {
+      if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; }
+    }
+    $self->_input_file($output_file); 
+    
+    if(!$self->_results_file($self->param('results_file'))){ 
+      $self->_results_file($output_file.".".$analysis->logic_name);
+    }
+  } else { throw "No valid data file was given: ".$input_file; }
+
+  #Always require a control file...
+  my $control_file = $output_dir."/".$self->_control_file();
+  if(! -e $control_file){ throw "No valid control file was given: ".$control_file; }
+  $self->_control_file($control_file); 
+  
+  #May need to convert it...
+  if($file_type eq 'sam'){
+
+    my $input_file_bed = $self->_input_file;
+    $input_file_bed =~ s/\.sam/\.bed/;
+
+    #Mouse hack
+    if($self->_species eq 'mus_musculus'){
+      if(! $self->param('reenter')){
+	run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}");
+      }
+    } else {
+      if(! $self->param('reenter')){
+	run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}");
+      }
+    }
+    $self->_input_file($input_file_bed);
+
+    my $control_file_bed = $self->_control_file;
+    $control_file_bed =~ s/\.sam/\.bed/;
+    #Mouse Hack
+    if($self->_species eq 'mus_musculus'){
+      if(! $self->param('reenter')){
+	run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}");     
+      }
+    } else {
+      if(! $self->param('reenter')){
+	run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}");
+      }
+    }
+    $self->_control_file($control_file_bed); 
+
+  }
+
+  return 1;
+}
+
+sub run {   # call SWEmbl
+  my $self = shift @_;
+
+  if($self->param('reenter')){ return 1; }
+
+  my $analysis = $self->_analysis;
+  #<CCAT path>/bin/CCAT  <ChIP library read file name>  <control library read file name> 
+  # <chromosome length file name>  <config file name>  <project name>
+  my $bin_dir = $self->_bin_dir();
+  my $command =  $bin_dir."/".$analysis->program_file .
+    " ".$self->_input_file() .  
+      " ".$self->_control_file() .  
+	" ". $self->_size_file() .
+	  " ".$bin_dir."/ccat_config/".$analysis->parameters .
+	    " " .  $self->_results_file;    
+  warn "Running analysis:\t$command";
+  run_system_cmd($command);
+  
+  return 1;
+}
+
+sub write_output {  # Store results
+  my $self = shift @_;
+  
+  $self->_parse_result_file();
+
+  return 1;
+}
+
+sub _parse_result_file {
+  
+  my $self = shift @_;
+  
+  ### annotated features
+  my $fset = $self->_feature_set();	
+  
+  my $efgdba = $self->_efgdba();
+  my $sa = $efgdba->get_SliceAdaptor();
+  
+  #Cache slices and features...
+  my %slice;
+  my @af;
+  
+  my %cache_af;
+
+  open(FILE,$self->_results_file().".significant.region");
+  while(<FILE>){
+    chomp;
+
+    #Content of CCAT output file
+    my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t");
+    #Hardcode a minimum fdr... may pass as parameter
+    next if ($fdr>0.05);
+    my $score = $fold;
+
+    #This seqid may vary depending on the input given to SWEmbl... 
+    # handle it manually at least for the moment... namely the sam seqid...
+    #Make sure to test thoroughly to see if it works...
+    #e.g. chromosome:GRCh37:15:1:102531392:1
+    if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; }
+    #In case UCSC input is used... 
+    $seqid =~ s/^chr//i;
+
+    if($self->param('slice') && ($seqid ne $self->param('slice'))){
+      warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:".
+	$seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
+      next;
+    }
+
+    #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits.
+    next if(defined($cache_af{$seqid."_".$start}));
+    $cache_af{$seqid."_".$start} = 1;
+
+    #next if ($seqid =~ m/^M/);
+
+    # filtering is done as a post-processing e.g. FilterBlacklist.pm
+    #$summit = int($summit);#Round up?	  
+  
+    unless (exists $slice{"$seqid"}) {
+      $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid);
+    }
+
+    if( ($start < 1) || ($end > $slice{"$seqid"}->end)){
+      warn "Feature being ignored due to coordinates out of slice: Region:".
+	$seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
+    }
+  
+    #Gracefully handle errors...
+    my $af;
+    eval{
+      $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new
+    	(
+	 -slice         => $slice{"$seqid"},
+	 -start         => $start,
+	 -end           => $end,
+	 -strand        => 0,
+	 -score         => $score,
+	 -summit        => $summit,
+	 -feature_set   => $fset,
+	);
+    }; 
+    if($@) { warn($@); next; }
+    if(!$af) { warn("Could not create feature - Region:".
+		    $seqid." Start:".$start." End:".$end." Score:".$score.
+		    " Summit:".$summit); next; }
+    
+    push(@af, $af);
+  } 
+  close FILE;
+ 
+  # Batch store features...
+  if(scalar(@af>0)){
+    $efgdba->get_AnnotatedFeatureAdaptor->store(@af);
+  } else {
+    warn "No significant features detected!";
+  }
+  #Do this on a wrapup runnable...so it will only be visible after reads are loaded...
+  $fset->adaptor->set_imported_states_by_Set($fset);
+
+  # Status should not be set at this stage
+
+}
+
+
+#Private getter / setter to the feature set
+sub _feature_set {
+  return $_[0]->_getter_setter('feature_set',$_[1]);
+}
+
+#Private getter / setter to the results file 
+sub _results_file {
+  return $_[0]->_getter_setter('results_file',$_[1]);
+}
+
+#Private getter / setter to the size file
+sub _size_file {
+  return $_[0]->_getter_setter('size_file',$_[1]);
+}
+
+
+1;
+