Mercurial > repos > geert-vandeweyer > clusterfast
diff run_cf0611.pl @ 1:4a3afa90ff7f draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Mon, 28 Jul 2014 05:53:55 -0400 |
parents | |
children | 233be956ae78 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/run_cf0611.pl Mon Jul 28 05:53:55 2014 -0400 @@ -0,0 +1,235 @@ +#!/usr/bin/perl + +#Copyright 2012-2013 Haley Abel +# +#This file is part of ClusterFAST. +# +#ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +# +#ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +#You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>. + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; +use File::Path qw(make_path); +use Cwd ; +use Cwd 'abs_path'; + +my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs); + +$minct1=2; +$minct2=1; +$dist=50000; +$rmtmp=1; +my $known_partners=0; +my $partnerfile=''; + +my $cwd = getcwd(); +my $scriptspath="$cwd/cf_scripts"; +my $cffile="$cwd/cf_v1.1.jar"; +my $help=''; + + +#YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. +my $SAMTOOLS=''##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/; +my $NOVOALIGN=''##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php; +my $VELVET=''##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet; +my $BLAT=''##INSET PATH TO BLAT gfServer/gfClient tools; + + + + +GetOptions( + "b|inbam=s"=> \$inbam, + "o|outdir=s"=> \$outdir, + "t|targets=s"=> \$targets, + "f|fasta=s"=> \$FASTA, + "m1|minct1=i"=> \$minct1, + "m2|minct2=i"=> \$minct2, + "d|distance=i"=>\$dist, + "i|novoindex=s"=> \$novoindex, + "r|rmtmp=i"=> \$rmtmp, + "h|help"=>\$help, + "2|twobit=s"=>\$twobit, + "p|refpaths=s"=>\$refpaths, + "c|contigs=s"=>\$contigs, + "y|bp1=s"=>\$bp1, + "z|bp2=s"=>\$bp2, + ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; + +if($help) { + print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; + print "targets=BED style interval file containing the regions to search for SV\n"; + print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n"; + print "distance=max distance from target to search (default=50000)\n"; + print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n"; + print "rmtmp=remove temp directory (default=1, true)\n"; + print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n"; + print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n"; + print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n"; + print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n"; + print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n"; + exit(1); +} + +# if refpaths is defined => split into seperate variables. +if (defined($refpaths) && $refpaths ne '') { + ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths); +} + +if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) { + print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; + exit(1); +} +elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) { + print STDERR "Critical file missing.\n"; + exit(1); +} + +make_empty_output(); +$outdir = abs_path($outdir); + +## convert Targets.bed to targets.picard +my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard"; +print STDOUT "$cmd\n"; +system($cmd); +$cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard"; +print STDOUT "$cmd\n"; +system($cmd); +$targets = "$outdir/Targets.picard"; + +## check all reads are paired. Combination of paired and single end libraries crashes the tool. +my @fs = `${SAMTOOLS}samtools flagstat $inbam`; +my $total = 0; +my $paired = 0; +foreach(@fs) { + chomp; + if ($_ =~ m/^(\d+).*in total/) { + $total = $1; + } + elsif ($_ =~ m/^(\d+).*paired in sequencing/) { + $paired = $1; + } +} +if ($total != $paired) { + print "Removing single-read data from the input bam file\n"; + system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam"); + $inbam = "$outdir/Paired.only.bam"; +} + +## process. +$cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1"; +print STDOUT "$cmd\n"; +system($cmd); + +chdir("${outdir}/temp"); + +if(!(-s "toRemap1.fq")) { + print STDOUT "No breakpoints found.\n"; + chdir("$outdir"); + clean_up(); + exit(1); +} + +$cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam"; +print STDOUT "$cmd\n"; +system($cmd); + +$cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2"; +print STDOUT "$cmd\n"; +system($cmd); + + +chdir("${outdir}/temp"); +print STDOUT "Assembling contigs...\n"; +&assemble(); +print STDOUT "Done\n"; + + +system("mv contigs.txt $outdir"); +chdir("$outdir"); +clean_up(); + + + + +######################### + +sub make_empty_output { + make_path($outdir); + open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH); + open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH); + open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH); + +} + +sub assemble { + system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq"); + if(!(-s "pair1.fq")) { + print STDERR "No breakpoints supported by split reads.\n"; + exit(1); + } + system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null"); + system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null"); + # launch blat server at random unused port + my $port = 10000 + int(rand(5000)); + my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; + while ($status !~ m/Error/) { + $port = 10000 + int(rand(5000)); + $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; + } + print "Waiting for BLAT server to come online.\n"; + system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &"); + # wait for startup + $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; + while ($status =~ m/Error/) { + sleep 15; + $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; + } + + # run query + system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead"); + # take blat server down. + system("${BLAT}gfServer stop 127.0.0.1 $port"); + # continue with analysis. + my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt"; + print STDOUT "$command\n"; + system($command); + system("mv velvet.txt contigs.txt"); +} + +sub clean_up { + ## move files to output locations. + if (defined $contigs) { + $contigs = abs_path($contigs); + system("cp '$outdir/contigs.txt' '$contigs'"); + } + if (defined $bp1) { + $bp1 = abs_path($bp1); + system("cp '$outdir/breakpoints.1.txt' '$bp1'"); + } + if (defined $bp2) { + $bp2 = abs_path($bp2); + system("cp '$outdir/breakpoints.2.txt' '$bp2'"); + } + if ($rmtmp) { + system("rm -R temp"); + } +} + + + + + + + + + + + +