diff run_cf0611.pl @ 1:4a3afa90ff7f draft

Uploaded
author geert-vandeweyer
date Mon, 28 Jul 2014 05:53:55 -0400
parents
children 233be956ae78
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/run_cf0611.pl	Mon Jul 28 05:53:55 2014 -0400
@@ -0,0 +1,235 @@
+#!/usr/bin/perl
+
+#Copyright 2012-2013 Haley Abel
+#
+#This file is part of ClusterFAST.
+#
+#ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+#
+#ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License along with ClusterFAST.  If not, see <http://www.gnu.org/licenses/>.
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use File::Path qw(make_path);
+use Cwd ;
+use Cwd 'abs_path';
+
+my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs);
+
+$minct1=2;
+$minct2=1;
+$dist=50000;
+$rmtmp=1;
+my $known_partners=0;
+my $partnerfile='';
+
+my $cwd = getcwd();
+my $scriptspath="$cwd/cf_scripts";
+my $cffile="$cwd/cf_v1.1.jar";
+my $help='';
+
+
+#YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. 
+my $SAMTOOLS=''##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/;
+my $NOVOALIGN=''##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php;
+my $VELVET=''##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet;
+my $BLAT=''##INSET PATH TO BLAT gfServer/gfClient tools; 
+
+
+
+
+GetOptions(
+    "b|inbam=s"=> \$inbam,
+    "o|outdir=s"=> \$outdir,
+    "t|targets=s"=> \$targets,
+    "f|fasta=s"=> \$FASTA,
+    "m1|minct1=i"=> \$minct1,
+    "m2|minct2=i"=> \$minct2,
+    "d|distance=i"=>\$dist,
+    "i|novoindex=s"=> \$novoindex,
+    "r|rmtmp=i"=> \$rmtmp,
+    "h|help"=>\$help,
+    "2|twobit=s"=>\$twobit,
+    "p|refpaths=s"=>\$refpaths,
+    "c|contigs=s"=>\$contigs,
+    "y|bp1=s"=>\$bp1,
+    "z|bp2=s"=>\$bp2,
+    ) or die "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
+
+if($help) {
+	print "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets  -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
+	print "targets=BED style interval file containing the regions to search for SV\n";
+	print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n";
+	print "distance=max distance from target to search (default=50000)\n";
+	print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n";
+	print "rmtmp=remove temp directory (default=1, true)\n";
+	print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n";
+	print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n";
+	print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n";
+	print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n";
+	print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n";
+	exit(1);
+}
+
+# if refpaths is defined => split into seperate variables.
+if (defined($refpaths) && $refpaths ne '') {
+	($novoindex, $twobit, $FASTA) = split(/;/,$refpaths);
+}
+
+if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) {
+	print STDERR "Usage:  run_cf_0611.pl -b inbam -o outdir -f fasta -t targets  -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
+	exit(1);
+}
+elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) {
+	print STDERR "Critical file missing.\n";
+	exit(1);
+}
+
+make_empty_output();
+$outdir = abs_path($outdir);
+
+## convert Targets.bed to targets.picard
+my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard";
+print STDOUT "$cmd\n";
+system($cmd);
+$cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard";
+print STDOUT "$cmd\n";
+system($cmd);
+$targets = "$outdir/Targets.picard";
+
+## check all reads are paired. Combination of paired and single end libraries crashes the tool.
+my @fs = `${SAMTOOLS}samtools flagstat $inbam`;
+my $total = 0;
+my $paired = 0;
+foreach(@fs) {
+	chomp;
+	if ($_ =~ m/^(\d+).*in total/) {
+		$total = $1;
+	}
+	elsif ($_ =~ m/^(\d+).*paired in sequencing/) {
+		$paired = $1;
+	}
+}
+if ($total != $paired) {
+	print "Removing single-read data from the input bam file\n";
+	system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam");
+	$inbam = "$outdir/Paired.only.bam";
+}
+
+## process.
+$cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1";
+print STDOUT "$cmd\n";
+system($cmd);
+
+chdir("${outdir}/temp");
+
+if(!(-s "toRemap1.fq")) {
+    print STDOUT "No breakpoints found.\n";
+    chdir("$outdir");
+    clean_up();
+    exit(1);
+}
+
+$cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam";
+print STDOUT "$cmd\n";
+system($cmd);
+
+$cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2";
+print STDOUT "$cmd\n";
+system($cmd);
+
+
+chdir("${outdir}/temp");
+print STDOUT "Assembling contigs...\n";
+&assemble();
+print STDOUT "Done\n";
+
+
+system("mv  contigs.txt $outdir");
+chdir("$outdir");
+clean_up();
+
+
+
+
+#########################
+
+sub make_empty_output {
+    make_path($outdir);
+    open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt";   close(FH);
+    open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt";  close(FH);
+    open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH);
+    
+}
+
+sub assemble {
+    system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam  | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq");
+    if(!(-s "pair1.fq")) {
+	print STDERR "No breakpoints supported by split reads.\n";
+	exit(1);
+    }
+	system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null");
+	system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100  -ins_length_sd 60 > /dev/null");
+	# launch blat server at random unused port
+	my $port = 10000 + int(rand(5000));
+	my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
+	while ($status !~ m/Error/) {
+		$port = 10000 + int(rand(5000));
+		$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
+	}
+	print "Waiting for BLAT server to come online.\n";
+	system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &");
+	# wait for startup
+	$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
+	while ($status =~ m/Error/) {
+		sleep 15;
+		$status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
+	}
+
+	# run query
+	system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead");
+	# take blat server down.
+	system("${BLAT}gfServer stop 127.0.0.1 $port");
+	# continue with analysis.
+	my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt";
+	print STDOUT "$command\n";
+	system($command);
+	system("mv velvet.txt contigs.txt");
+}
+    
+sub clean_up {
+    ## move files to output locations.
+    if (defined $contigs) {
+	$contigs = abs_path($contigs);
+	system("cp '$outdir/contigs.txt' '$contigs'");
+    }
+    if (defined $bp1) {
+	$bp1 = abs_path($bp1);
+	system("cp '$outdir/breakpoints.1.txt' '$bp1'"); 
+    }
+    if (defined $bp2) {
+	$bp2 = abs_path($bp2);
+	system("cp '$outdir/breakpoints.2.txt' '$bp2'");
+    }
+    if ($rmtmp) {
+        system("rm -R temp");
+    }
+}
+
+
+	
+
+	
+
+	
+
+	
+
+
+