Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,169 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile + +=head1 DESCRIPTION + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile; + +use warnings; +use strict; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use base ('Bio::EnsEMBL::Hive::ProcessWithParams'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + #throw "Dnase file $dnase does not exist in $work_dir" if(! -e $work_dir."/".$dnase); + throw "Matrix file $matrix does not exist in $work_dir" if(! -e $work_dir."/matches/".$matrix); + my $is_male = $self->param('is_male'); + throw "Need to define is_male" if(!defined($is_male)); + #throw "mapability.bedGraph file expected in $work_dir" if(! -e $work_dir."/mapability.bedGraph"); + + return 1; +} + +sub run { # Create Groups and Analysis and stuff... + my $self = shift @_; + + my $hlen = 100; + my $work_dir = $self->param('work_dir'); + my $matrix = $self->param('matrix'); + my $dnase = $self->param('dnase'); + my $is_male = $self->param('is_male'); + + my $motif_size; + my %motifs; + #Or remove // from matrix name + open(FILE,$work_dir."/matches_vf/".$matrix); + <FILE>; #title + while(<FILE>){ + chomp; + #10 71532 71542 Gata1 8.55866 1 MA0035.2 + my ($chr,$start,$end,$name,$score,$strand)=split("\t"); + next if(!$is_male && ($chr =~ /Y/)); + #Ignore Mitochondria... + next if($chr =~ /MT/); + if(!defined($motif_size)){ $motif_size = $end - $start; } + $motifs{$chr}{$start}{$strand}=$score; + } + close FILE; + + #Maybe pass this as parameters + my $out_file = $work_dir."/output/".$matrix."_".$dnase.".counts"; + open(FO,">".$out_file); + + foreach my $chr (sort keys %motifs){ + my %datap = (); + my %datan = (); + foreach my $start (sort keys %{$motifs{$chr}}){ + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + $datap{$i} = 0; + $datan{$i} = 0; + } + } + + #Add a mappability score to extra filter... + #A lot more time and didn't see much evidence of improvement... + #my %mappability = (); + #foreach my $start (sort keys %{$motifs{$chr}}){ + # for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + # $mappability{$i} = 0; + # } + #} + #open(FILE,$work_dir."/mapability.bedGraph"); + #while(<FILE>){ + # chomp; + # my ($cur_chr,$start,$end,$score)=split(); + # next if($cur_chr ne $chr); + #for(my $i=$start; $i<$end; $i++){ + # if(defined($mappability{$i})){ $mappability{$i}=$score; } + # } + #} + #close FILE; + + my $folder; + if($is_male){ + $folder = "male"; + } else { + $folder = "female"; + } + #open(FILE,$work_dir."/dnase/".$folder."/".$dnase); + open(FILE,"gzip -dc ${work_dir}/dnase/${folder}/${dnase}".'AlnRep*.bam.unique.tagAlign.gz |'); + while(<FILE>){ + chomp; + my ($cur_chr,$start,$end,undef,undef,$strand)=split("\t"); + next if(($cur_chr ne $chr) || !defined($datap{$start})); + if($strand eq '+'){ + $datap{$start}++; + } else { + $datan{$end-1}++; + } + } + close FILE; + + foreach my $start (sort keys %{$motifs{$chr}}){ + foreach my $strand (keys %{$motifs{$chr}{$start}}){ + my $score = $motifs{$chr}{$start}{$strand}; + + #filter out those that have no mappability in more than 20% of their extension... + #my $count = 0; + #for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + # if($mappability{$i} <= 0.5){ $count++; } + #} + #Remove if more than 20% of region is "non_mappable" (approach from Centipede paper) + #if(($count / (2*$hlen + $motif_size)) > 0.2){ + # warn "Motif at chr: $chr start: $start strand: $strand was ignored due to low mappability"; + # next; + #} + + print FO $chr."\t".$start."\t".($start+$motif_size)."\t".$matrix."\t".$score."\t".$strand; + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + if($strand eq '+'){ + print FO "\t".$datap{$i}; + } else { + print FO "\t".$datan{$i}; + } + } + for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ + if($strand eq '+'){ + print FO "\t".$datan{$i}; + } else { + print FO "\t".$datap{$i}; + } + } + print FO "\n"; + } + } + } + + close FO; + + return 1; +} + + +sub write_output { # Nothing to do here + my $self = shift @_; + + $self->dataflow_output_id($self->input_id, 2); + + return 1; + + +} + +1;