diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,169 @@
+=pod 
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile
+
+=head1 DESCRIPTION
+
+=cut
+
+
+package Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile;
+
+use warnings;
+use strict;
+use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
+use base ('Bio::EnsEMBL::Hive::ProcessWithParams');
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
+use Data::Dumper;
+
+sub fetch_input {   # nothing to fetch... just the parameters...
+  my $self = shift @_;
+
+  $self->SUPER::fetch_input();
+  
+  my $work_dir = $self->param('work_dir');
+  my $matrix = $self->param('matrix');
+  my $dnase = $self->param('dnase');
+  #throw "Dnase file $dnase does not exist in $work_dir" if(! -e $work_dir."/".$dnase);
+  throw "Matrix file $matrix does not exist in $work_dir" if(! -e $work_dir."/matches/".$matrix);
+  my $is_male = $self->param('is_male');
+  throw "Need to define is_male" if(!defined($is_male));
+  #throw "mapability.bedGraph file expected in $work_dir" if(! -e $work_dir."/mapability.bedGraph");
+
+  return 1;
+}
+
+sub run {   # Create Groups and Analysis and stuff...
+  my $self = shift @_;
+
+  my $hlen = 100;
+  my $work_dir = $self->param('work_dir');
+  my $matrix = $self->param('matrix');
+  my $dnase = $self->param('dnase');
+  my $is_male = $self->param('is_male');
+
+  my $motif_size;
+  my %motifs;
+  #Or remove // from matrix name
+  open(FILE,$work_dir."/matches_vf/".$matrix);
+  <FILE>; #title
+  while(<FILE>){
+    chomp;  
+    #10	71532	71542	Gata1	8.55866	1	MA0035.2
+    my ($chr,$start,$end,$name,$score,$strand)=split("\t");
+    next if(!$is_male && ($chr =~ /Y/));
+    #Ignore Mitochondria...
+    next if($chr =~ /MT/);
+    if(!defined($motif_size)){ $motif_size = $end - $start; }
+    $motifs{$chr}{$start}{$strand}=$score;
+  }
+  close FILE;
+
+  #Maybe pass this as parameters
+  my $out_file = $work_dir."/output/".$matrix."_".$dnase.".counts";
+  open(FO,">".$out_file);
+  
+  foreach my $chr (sort keys %motifs){
+    my %datap = ();
+    my %datan = ();
+    foreach my $start (sort keys %{$motifs{$chr}}){
+      for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
+	$datap{$i} = 0;
+	$datan{$i} = 0;
+      }
+    }
+
+    #Add a mappability score to extra filter...
+    #A lot more time and didn't see much evidence of improvement...
+    #my %mappability = ();
+    #foreach my $start (sort keys %{$motifs{$chr}}){
+    #  for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
+    #	$mappability{$i} = 0;
+    #  }
+    #}
+    #open(FILE,$work_dir."/mapability.bedGraph");
+    #while(<FILE>){
+    #  chomp;
+    #  my ($cur_chr,$start,$end,$score)=split();
+    #  next if($cur_chr ne $chr);
+    #for(my $i=$start; $i<$end; $i++){
+    #	if(defined($mappability{$i})){ $mappability{$i}=$score; }
+    #  }
+    #}
+    #close FILE;
+    
+    my $folder;
+    if($is_male){
+      $folder = "male";
+    } else {
+      $folder = "female";
+    }
+    #open(FILE,$work_dir."/dnase/".$folder."/".$dnase);
+    open(FILE,"gzip -dc ${work_dir}/dnase/${folder}/${dnase}".'AlnRep*.bam.unique.tagAlign.gz |');
+    while(<FILE>){
+      chomp;
+      my ($cur_chr,$start,$end,undef,undef,$strand)=split("\t");
+      next if(($cur_chr ne $chr) || !defined($datap{$start}));
+      if($strand eq '+'){
+	$datap{$start}++;
+      } else {
+	$datan{$end-1}++;
+      }
+    }
+    close FILE;
+    
+    foreach my $start (sort keys %{$motifs{$chr}}){
+      foreach my $strand (keys %{$motifs{$chr}{$start}}){
+	my $score = $motifs{$chr}{$start}{$strand};
+	
+	#filter out those that have no mappability in more than 20% of their extension...
+	#my $count = 0;
+	#for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
+	#  if($mappability{$i} <= 0.5){ $count++; }
+	#}
+	#Remove if more than 20% of region is "non_mappable" (approach from Centipede paper)
+	#if(($count / (2*$hlen + $motif_size)) > 0.2){
+	#  warn "Motif at chr:  $chr start: $start strand: $strand was ignored due to low mappability";
+	#  next;
+	#}
+
+	print  FO $chr."\t".$start."\t".($start+$motif_size)."\t".$matrix."\t".$score."\t".$strand;
+	for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
+	  if($strand eq '+'){
+	    print FO "\t".$datap{$i};
+	  } else {
+	    print FO "\t".$datan{$i};
+	  }
+	}
+	for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
+	  if($strand eq '+'){
+	    print FO "\t".$datan{$i};
+	  } else {
+	    print FO "\t".$datap{$i};
+	  }
+	}
+	print FO "\n";
+      }
+    }
+  }
+  
+  close FO;
+
+  return 1;
+}
+
+
+sub write_output {  # Nothing to do here
+  my $self = shift @_;
+  
+  $self->dataflow_output_id($self->input_id, 2);
+
+  return 1;
+
+
+}
+
+1;