Mercurial > repos > geert-vandeweyer > clusterfast
view run_cf0611.pl @ 12:e4000df38772 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Mon, 28 Jul 2014 09:49:10 -0400 |
parents | 233be956ae78 |
children | aaf5c557af13 |
line wrap: on
line source
#!/usr/bin/perl #Copyright 2012-2013 Haley Abel # #This file is part of ClusterFAST. # #ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. # #ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. # #You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>. use warnings; use strict; use Getopt::Long; use Pod::Usage; use File::Path qw(make_path); use Cwd ; use Cwd 'abs_path'; my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs); $minct1=2; $minct2=1; $dist=50000; $rmtmp=1; my $known_partners=0; my $partnerfile=''; my $cwd = getcwd(); my $scriptspath = $cffile = ''; # Galaxy installer puts scripts and jar into a folder, which is available under the environment variable CF_PATH if (defined($ENV{'CF_PATH'}) && -d $ENV{'CF_PATH'}) { $scriptspath = $ENV{'CF_PATH'}."/cf_scripts"; $cffile = $ENV{'CF_PATH'}."/cf_v1.1.jar"; } else { $scriptspath="$cwd/cf_scripts"; $cffile="$cwd/cf_v1.1.jar"; } my $help=''; #YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. my $SAMTOOLS='';##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/; my $NOVOALIGN='';##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php; my $VELVET='';##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet; my $BLAT='';##INSET PATH TO BLAT gfServer/gfClient tools; GetOptions( "b|inbam=s"=> \$inbam, "o|outdir=s"=> \$outdir, "t|targets=s"=> \$targets, "f|fasta=s"=> \$FASTA, "m1|minct1=i"=> \$minct1, "m2|minct2=i"=> \$minct2, "d|distance=i"=>\$dist, "i|novoindex=s"=> \$novoindex, "r|rmtmp=i"=> \$rmtmp, "h|help"=>\$help, "2|twobit=s"=>\$twobit, "p|refpaths=s"=>\$refpaths, "c|contigs=s"=>\$contigs, "y|bp1=s"=>\$bp1, "z|bp2=s"=>\$bp2, ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; if($help) { print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; print "targets=BED style interval file containing the regions to search for SV\n"; print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n"; print "distance=max distance from target to search (default=50000)\n"; print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n"; print "rmtmp=remove temp directory (default=1, true)\n"; print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n"; print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n"; print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n"; print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n"; print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n"; exit(1); } # if refpaths is defined => split into seperate variables. if (defined($refpaths) && $refpaths ne '') { ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths); } if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) { print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; exit(1); } elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) { print STDERR "Critical file missing.\n"; exit(1); } make_empty_output(); $outdir = abs_path($outdir); ## convert Targets.bed to targets.picard my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard"; print STDOUT "$cmd\n"; system($cmd); $cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard"; print STDOUT "$cmd\n"; system($cmd); $targets = "$outdir/Targets.picard"; ## check all reads are paired. Combination of paired and single end libraries crashes the tool. my @fs = `${SAMTOOLS}samtools flagstat $inbam`; my $total = 0; my $paired = 0; foreach(@fs) { chomp; if ($_ =~ m/^(\d+).*in total/) { $total = $1; } elsif ($_ =~ m/^(\d+).*paired in sequencing/) { $paired = $1; } } if ($total != $paired) { print "Removing single-read data from the input bam file\n"; system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam"); $inbam = "$outdir/Paired.only.bam"; } ## process. $cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1"; print STDOUT "$cmd\n"; system($cmd); chdir("${outdir}/temp"); if(!(-s "toRemap1.fq")) { print STDOUT "No breakpoints found.\n"; chdir("$outdir"); clean_up(); exit(1); } $cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam"; print STDOUT "$cmd\n"; system($cmd); $cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2"; print STDOUT "$cmd\n"; system($cmd); chdir("${outdir}/temp"); print STDOUT "Assembling contigs...\n"; &assemble(); print STDOUT "Done\n"; system("mv contigs.txt $outdir"); chdir("$outdir"); clean_up(); ######################### sub make_empty_output { make_path($outdir); open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH); open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH); open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH); } sub assemble { system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq"); if(!(-s "pair1.fq")) { print STDERR "No breakpoints supported by split reads.\n"; exit(1); } system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null"); system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null"); # launch blat server at random unused port my $port = 10000 + int(rand(5000)); my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; while ($status !~ m/Error/) { $port = 10000 + int(rand(5000)); $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; } print "Waiting for BLAT server to come online.\n"; system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &"); # wait for startup $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; while ($status =~ m/Error/) { sleep 15; $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; } # run query system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead"); # take blat server down. system("${BLAT}gfServer stop 127.0.0.1 $port"); # continue with analysis. my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt"; print STDOUT "$command\n"; system($command); system("mv velvet.txt contigs.txt"); } sub clean_up { ## move files to output locations. if (defined $contigs) { $contigs = abs_path($contigs); system("cp '$outdir/contigs.txt' '$contigs'"); } if (defined $bp1) { $bp1 = abs_path($bp1); system("cp '$outdir/breakpoints.1.txt' '$bp1'"); } if (defined $bp2) { $bp2 = abs_path($bp2); system("cp '$outdir/breakpoints.2.txt' '$bp2'"); } if ($rmtmp) { system("rm -R temp"); } }