comparison run_cf0611.pl @ 1:4a3afa90ff7f draft

Uploaded
author geert-vandeweyer
date Mon, 28 Jul 2014 05:53:55 -0400
parents
children 233be956ae78
comparison
equal deleted inserted replaced
0:1b008b4b05f3 1:4a3afa90ff7f
1 #!/usr/bin/perl
2
3 #Copyright 2012-2013 Haley Abel
4 #
5 #This file is part of ClusterFAST.
6 #
7 #ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
8 #
9 #ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
10 #
11 #You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>.
12
13
14 use warnings;
15 use strict;
16
17 use Getopt::Long;
18 use Pod::Usage;
19 use File::Path qw(make_path);
20 use Cwd ;
21 use Cwd 'abs_path';
22
23 my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs);
24
25 $minct1=2;
26 $minct2=1;
27 $dist=50000;
28 $rmtmp=1;
29 my $known_partners=0;
30 my $partnerfile='';
31
32 my $cwd = getcwd();
33 my $scriptspath="$cwd/cf_scripts";
34 my $cffile="$cwd/cf_v1.1.jar";
35 my $help='';
36
37
38 #YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty.
39 my $SAMTOOLS=''##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/;
40 my $NOVOALIGN=''##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php;
41 my $VELVET=''##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet;
42 my $BLAT=''##INSET PATH TO BLAT gfServer/gfClient tools;
43
44
45
46
47 GetOptions(
48 "b|inbam=s"=> \$inbam,
49 "o|outdir=s"=> \$outdir,
50 "t|targets=s"=> \$targets,
51 "f|fasta=s"=> \$FASTA,
52 "m1|minct1=i"=> \$minct1,
53 "m2|minct2=i"=> \$minct2,
54 "d|distance=i"=>\$dist,
55 "i|novoindex=s"=> \$novoindex,
56 "r|rmtmp=i"=> \$rmtmp,
57 "h|help"=>\$help,
58 "2|twobit=s"=>\$twobit,
59 "p|refpaths=s"=>\$refpaths,
60 "c|contigs=s"=>\$contigs,
61 "y|bp1=s"=>\$bp1,
62 "z|bp2=s"=>\$bp2,
63 ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
64
65 if($help) {
66 print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
67 print "targets=BED style interval file containing the regions to search for SV\n";
68 print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n";
69 print "distance=max distance from target to search (default=50000)\n";
70 print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n";
71 print "rmtmp=remove temp directory (default=1, true)\n";
72 print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n";
73 print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n";
74 print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n";
75 print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n";
76 print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n";
77 exit(1);
78 }
79
80 # if refpaths is defined => split into seperate variables.
81 if (defined($refpaths) && $refpaths ne '') {
82 ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths);
83 }
84
85 if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) {
86 print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
87 exit(1);
88 }
89 elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) {
90 print STDERR "Critical file missing.\n";
91 exit(1);
92 }
93
94 make_empty_output();
95 $outdir = abs_path($outdir);
96
97 ## convert Targets.bed to targets.picard
98 my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard";
99 print STDOUT "$cmd\n";
100 system($cmd);
101 $cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard";
102 print STDOUT "$cmd\n";
103 system($cmd);
104 $targets = "$outdir/Targets.picard";
105
106 ## check all reads are paired. Combination of paired and single end libraries crashes the tool.
107 my @fs = `${SAMTOOLS}samtools flagstat $inbam`;
108 my $total = 0;
109 my $paired = 0;
110 foreach(@fs) {
111 chomp;
112 if ($_ =~ m/^(\d+).*in total/) {
113 $total = $1;
114 }
115 elsif ($_ =~ m/^(\d+).*paired in sequencing/) {
116 $paired = $1;
117 }
118 }
119 if ($total != $paired) {
120 print "Removing single-read data from the input bam file\n";
121 system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam");
122 $inbam = "$outdir/Paired.only.bam";
123 }
124
125 ## process.
126 $cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1";
127 print STDOUT "$cmd\n";
128 system($cmd);
129
130 chdir("${outdir}/temp");
131
132 if(!(-s "toRemap1.fq")) {
133 print STDOUT "No breakpoints found.\n";
134 chdir("$outdir");
135 clean_up();
136 exit(1);
137 }
138
139 $cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam";
140 print STDOUT "$cmd\n";
141 system($cmd);
142
143 $cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2";
144 print STDOUT "$cmd\n";
145 system($cmd);
146
147
148 chdir("${outdir}/temp");
149 print STDOUT "Assembling contigs...\n";
150 &assemble();
151 print STDOUT "Done\n";
152
153
154 system("mv contigs.txt $outdir");
155 chdir("$outdir");
156 clean_up();
157
158
159
160
161 #########################
162
163 sub make_empty_output {
164 make_path($outdir);
165 open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH);
166 open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH);
167 open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH);
168
169 }
170
171 sub assemble {
172 system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq");
173 if(!(-s "pair1.fq")) {
174 print STDERR "No breakpoints supported by split reads.\n";
175 exit(1);
176 }
177 system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null");
178 system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null");
179 # launch blat server at random unused port
180 my $port = 10000 + int(rand(5000));
181 my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
182 while ($status !~ m/Error/) {
183 $port = 10000 + int(rand(5000));
184 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
185 }
186 print "Waiting for BLAT server to come online.\n";
187 system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &");
188 # wait for startup
189 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
190 while ($status =~ m/Error/) {
191 sleep 15;
192 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
193 }
194
195 # run query
196 system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead");
197 # take blat server down.
198 system("${BLAT}gfServer stop 127.0.0.1 $port");
199 # continue with analysis.
200 my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt";
201 print STDOUT "$command\n";
202 system($command);
203 system("mv velvet.txt contigs.txt");
204 }
205
206 sub clean_up {
207 ## move files to output locations.
208 if (defined $contigs) {
209 $contigs = abs_path($contigs);
210 system("cp '$outdir/contigs.txt' '$contigs'");
211 }
212 if (defined $bp1) {
213 $bp1 = abs_path($bp1);
214 system("cp '$outdir/breakpoints.1.txt' '$bp1'");
215 }
216 if (defined $bp2) {
217 $bp2 = abs_path($bp2);
218 system("cp '$outdir/breakpoints.2.txt' '$bp2'");
219 }
220 if ($rmtmp) {
221 system("rm -R temp");
222 }
223 }
224
225
226
227
228
229
230
231
232
233
234
235