view run_cf0611.pl @ 22:9a26daf268da draft

Uploaded
author geert-vandeweyer
date Tue, 29 Jul 2014 10:50:35 -0400
parents e79b771cf3a9
children 9f96e2fa94ed
line wrap: on
line source

#!/usr/bin/perl

#Copyright 2012-2013 Haley Abel
#
#This file is part of ClusterFAST.
#
#ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
#
#ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License along with ClusterFAST.  If not, see <http://www.gnu.org/licenses/>.


use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;
use File::Path qw(make_path);
use Cwd ;
use Cwd 'abs_path';
use File::Basename;

my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs,$insert,$sdinsert);

$minct1=2;
$minct2=1;
$dist=50000;
$rmtmp=1;
$insert=230;
$sdinsert=140;
my $known_partners=0;
my $partnerfile='';

my $cwd = getcwd();
my $scriptspath = '';
my $cffile = '';
# Galaxy installer puts scripts and jar into a folder, which is available under the environment variable CF_PATH
# this is future work (once github of clusterfast gets updated. For now, it's a placeholder.
if (defined($ENV{'CF_PATH'}) && -d $ENV{'CF_PATH'}) {
	$scriptspath = dirname(abs_path($0))."/cf_scripts";
	$cffile = dirname(abs_path($0))."/cf_v1.1.jar";
}
else {
	$scriptspath="$cwd/cf_scripts";
	$cffile="$cwd/cf_v1.1.jar";
}
my $help='';


#YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. 
my $SAMTOOLS='';##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/;
my $NOVOALIGN='';##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php;
my $VELVET='';##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet;
my $BLAT='';##INSET PATH TO BLAT gfServer/gfClient tools; 




GetOptions(
    "b|inbam=s"=> \$inbam,
    "o|outdir=s"=> \$outdir,
    "t|targets=s"=> \$targets,
    "f|fasta=s"=> \$FASTA,
    "m1|minct1=i"=> \$minct1,
    "m2|minct2=i"=> \$minct2,
    "d|distance=i"=>\$dist,
    "i|novoindex=s"=> \$novoindex,
    "r|rmtmp=i"=> \$rmtmp,
    "h|help"=>\$help,
    "2|twobit=s"=>\$twobit,
    "p|refpaths=s"=>\$refpaths,
    "c|contigs=s"=>\$contigs,
    "y|bp1=s"=>\$bp1,
    "z|bp2=s"=>\$bp2,
    "i|insert=i"=>\$insert,
    "s|sdinsert=i"=>\$sdinsert,
    ) or die "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";

if($help) {
	print "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets  -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
	print "targets=BED style interval file containing the regions to search for SV\n";
	print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n";
	print "distance=max distance from target to search (default=50000)\n";
	print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n";
	print "rmtmp=remove temp directory (default=1, true)\n";
	print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n";
	print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n";
	print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n";
	print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n";
	print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n";
	exit(1);
}

# if refpaths is defined => split into seperate variables.
if (defined($refpaths) && $refpaths ne '') {
	($novoindex, $twobit, $FASTA) = split(/;/,$refpaths);
}

if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) {
	print STDERR "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets  -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
	exit(1);
}
elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) {
	print STDERR "Critical file missing.\n";
	exit(1);
}

make_empty_output();
$outdir = abs_path($outdir);

## convert Targets.bed to targets.picard
my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard";
print STDOUT "$cmd\n";
system($cmd);
$cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard";
print STDOUT "$cmd\n";
system($cmd);
$targets = "$outdir/Targets.picard";

## check all reads are paired. Combination of paired and single end libraries crashes the tool.
my @fs = `${SAMTOOLS}samtools flagstat $inbam`;
my $total = 0;
my $paired = 0;
foreach(@fs) {
	chomp;
	if ($_ =~ m/^(\d+).*in total/) {
		$total = $1;
	}
	elsif ($_ =~ m/^(\d+).*paired in sequencing/) {
		$paired = $1;
	}
}
if ($total != $paired) {
	print "Removing single-read data from the input bam file\n";
	system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam");
	$inbam = "$outdir/Paired.only.bam";
}

## process.
$cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1 2>&1";
print STDOUT "$cmd\n";
system($cmd);

chdir("${outdir}/temp");

if(!(-s "toRemap1.fq")) {
    print STDOUT "No breakpoints found.\n";
    chdir("$outdir");
    clean_up();
    exit(1);
}
my $r = int(rand(100));
while (-e "/tmp/na.stderr.$r") {
	$r = int(rand(100));
}

$cmd="${NOVOALIGN}novoalign -o SAM -i $insert $sdinsert -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam 2>/tmp/na.stderr.$r";
print STDOUT "$cmd\n";
system($cmd);
my $na = `cat /tmp/na.stderr.$r`;
print STDOUT $na;
unlink("/tmp/na.stderr.$r");

$cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2 2>/dev/null";
print STDOUT "$cmd\n";
system($cmd);


chdir("${outdir}/temp");
print STDOUT "Assembling contigs...\n";
&assemble();
print STDOUT "Done\n";


system("mv  contigs.txt $outdir");
chdir("$outdir");
clean_up();




#########################

sub make_empty_output {
    make_path($outdir);
    open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt";   close(FH);
    open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt";  close(FH);
    open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH);
    
}

sub assemble {
    system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam  | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq");
    if(!(-s "pair1.fq")) {
	print STDOUT "No breakpoints supported by split reads.\n";
	exit(1);
    }
	system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null 2>&1");
	system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100  -ins_length_sd 60 > /dev/null 2>&1");
	# launch blat server at random unused port
	my $port = 10000 + int(rand(5000));
	my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
	while ($status !~ m/Error/) {
		$port = 10000 + int(rand(5000));
		$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
	}
	print "Waiting for BLAT server to come online.\n";
	system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &");
	# wait for startup
	$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
	while ($status =~ m/Error/) {
		sleep 15;
		$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
	}

	# run query
	system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead");
	# take blat server down.
	system("${BLAT}gfServer stop 127.0.0.1 $port");
	# continue with analysis.
	my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt";
	print STDOUT "$command\n";
	system($command);
	system("mv velvet.txt contigs.txt");
}
    
sub clean_up {
    ## move files to output locations.
    if (defined $contigs) {
	$contigs = abs_path($contigs);
	system("cp '$outdir/contigs.txt' '$contigs'");
    }
    if (defined $bp1) {
	$bp1 = abs_path($bp1);
	system("cp '$outdir/breakpoints.1.txt' '$bp1'"); 
    }
    if (defined $bp2) {
	$bp2 = abs_path($bp2);
	system("cp '$outdir/breakpoints.2.txt' '$bp2'");
    }
    if ($rmtmp) {
        system("rm -R temp");
    }
}