#!/usr/bin/perl
# combine microarray pre-preprocessed data and binding data

use Getopt::Long;
$|++;

#
# command line arguments
#
$options = "Usage: ./Integrate_Data.pl <OPTIONS>
	-b  binding data (required)
	-bf binding data format (default 1)
	     1 - .bed format (<chr> <pkposn> <other information>)
	         first two columns are required, 
		 <other information> may contain peak ids, enrichments, etc, and is optional but will be retained
	     2 - Dinner group \"annotation\" format
	-e  pre-processed expression data (required)
	-o  output (required)
	-d  distance (positive number, in kbp, default 100)
	-dt distance type (1 = to gene boundaries, 2 = to TSS, default 1)
	-v  verbose (print progress to stdout: y or n, default y)
\n";

$b  = "";
$bf = 1;
$e  = "";
$o  = "";
$d  = 100;
$dt = "1";
$v = "y";

GetOptions('b=s'  => \$b,
	   'bf=i' => \$bf,
	   'e=s'  => \$e,
	   'o=s'  => \$o,
	   'd=f'  => \$d,
	   'dt=i' => \$dt,
	   'v=s'  => \$v,
) || die "\n$options\n";

if( $b eq "" ){
	print "\nError: set a value for -b\n\n$options";
	exit;
}
if( $bf != 1 && $bf != 2 ){
	print "\nError: set -bf to be 1 or 2\n\n$options";
	exit;
}
if( $e eq "" ){
	print "\nError: set a value for -e\n\n$options";
	exit;
}
if( $o eq "" ){
	print "\nError: set a value for -o\n\n$options";
	exit;
}

if( $d < 0 ){
	print "\nError: choose -d to be positive\n\n$options";
	exit;
}
if( $dt != 1 && $dt != 2 ){
	print "\nError: choose -dt to be 1 or 2\n\n$options";
	exit;
}
if( $v ne "y" && $v ne "n" ){
	print "\nError: choose -v to be y or n\n\n$options";
	exit;
}

#
# read in binding data
#
if( $v eq "y" ){
	print "\nReading in binding and expression data\n";
}
@peaks = ();
open(IN,"$b") || die "Error: can't open file $b\n";
while($line = <IN>){
	if( $line !~ /#/ ){
		chomp($line);
		@parts = split(' ',$line);
		$info = {};
		if( $bf == 1 ){
			$info->{chr} = $parts[0];
			$info->{pk} = $parts[1];
			$val = "";
			if( $#parts >= 2 ){
				$val = $parts[2];
				for($i=3; $i<= $#parts; $i++){
					$val = sprintf("%s %s", $val, $parts[$i]);
				}
			}
			$info->{rest} = $val;
			push(@peaks, $info);
		}
		if( $bf == 2 ){
			if( $#parts == 19 ){
				$info->{chr} = $parts[2];
				$info->{pk} = $parts[3];
				$info->{rest} = sprintf("%s %s %s", $parts[0], $parts[1], $parts[6]);
				push(@peaks, $info);
			}
		}
	}
}
close(IN);

#
# read in expression data
#
@expr = ();
open(IN,"$e") || die "Error: can't open file $e\n";
while($line = <IN>){
	chomp($line);
	@parts = split(' ',$line);
	$info = {};
	$info->{chr} = $parts[2];
	$info->{start} = $parts[3];
	$info->{end} = $parts[4];
	$info->{strand} = $parts[5];
	$info->{line} = $line;
	push(@expr, $info);
}
close(IN);

if( $v eq "y" ){
	printf("   %i peaks, %i genes\n", $#peaks+1, $#expr+1);
	printf("\nSorting peaks and genes, indexing chromosomes\n");
}

#
# sort binding and expression data by chr to save some time
#
@speaks = sort{ $a->{chr} cmp $b->{chr} } @peaks;
@sexpr = sort{ $a->{chr} cmp $b->{chr} } @expr;

#
# index starting elements for each chromosome in sexpr
#
@chrinds = ();
$chr = $sexpr[0]->{chr};
$info = {};
$info->{chr} = $chr;
$info->{start} = 0;
for($i=0; $i<= $#sexpr; $i++){
	if( $sexpr[$i]->{chr} ne $chr ){
		$info->{end} = $i-1;
		push(@chrinds, $info);
		$info = {};
		$chr = $sexpr[$i]->{chr};
		$info->{chr} = $chr;
		$info->{start} = $i;
	}
}
$info->{end} = $i-1;
push(@chrinds, $info);

#
# compile potential targets for each peak and print out
#
if( $v eq "y" ){
	printf("\nFinding potential targets\n");
}
open(OUT,">$o");
$count=0;
for($i=0; $i<= $#speaks; $i++){
	if( $v eq "y" ){
		printf("\r   peak %i of %i", $i+1, $#speaks+1);
	}
	# print out peak info
	printf OUT ("PEAK: %s %s %s\n", $speaks[$i]->{chr}, $speaks[$i]->{pk}, $speaks[$i]->{rest} );
	
	# find the right chromosome and search through all the genes on the same chromosome for possible matches
	$j=0;
	$end=$#chrinds;
	while( $chrinds[$j]->{chr} ne $speaks[$i]->{chr} && $j<= $end ){$j++;}
	if( $chrinds[$j]->{chr} eq $speaks[$i]->{chr} ){
		$start = $chrinds[$j]->{start};
		$end = $chrinds[$j]->{end};
		for($k=$start; $k<= $end; $k++){
			if( &distance( $i, $k ) <= $d ){
				$count++;
				printf OUT ("GENE: %s\n", $sexpr[$k]->{line});
			}
		}
	}
}
close(OUT);
if( $v eq "y" ){
	print "\n\nGot $count potential targets\n\n";
}

exit;


# compute distance between peak $_[0] and gene $_[1]
sub distance{
	my $dist;
	my $tss;
	my $ii;
	my $jj;
	$ii = $_[0];
	$jj = $_[1];
	
	if( $speaks[$ii]->{chr} ne $sexpr[$jj]->{chr}){
		print "\nError: somehow computing distance between peak/gene on different chromosomes\n";
		print "   peak chr: $speaks[$ii]->{chr}, gene chr: $sexpr[$jj]->{chr}\n";
		exit;
	}
	if( $dt == 1 ){
		# distance to closest side of gene
		if( $speaks[$ii]->{pk} < $sexpr[$jj]->{start} ){
			$dist = $sexpr[$jj]->{start} - $speaks[$ii]->{pk};
		}
		elsif( $speaks[$ii]->{pk} > $sexpr[$jj]->{end} ){
			$dist = $speaks[$ii]->{pk} - $sexpr[$jj]->{end};
		}
		else{
			$dist = 0;
		}
	}
	elsif( $dt == 2 ){
		# distance to TSS
		if( $sexpr[$jj]->{strand} == 1 ){
			$tss = $sexpr[$jj]->{start};
		}
		elsif( $sexpr[$jj]->{strand} == -1 ){
			$tss = $sexpr[$jj]->{end};
		}
		else{
			print "\nError: did not have strand of +/-1 (got $sexpr[$jj]->{strand})\n";
			exit;
		}
		$dist = abs($speaks[$ii]->{pk} - $tss);
	}
	else{
		print "\nError: wrong value for dt ($dt)\n";
		exit;
	}
	if( $dist < 0 ){
		print "\nError: negative distance\n";
		exit;
	}
	# convert to kbp
	$dist/=1000;
	return $dist;
}




