1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #Copyright 2012-2013 Haley Abel
|
|
4 #
|
|
5 #This file is part of ClusterFAST.
|
|
6 #
|
|
7 #ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
|
8 #
|
|
9 #ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
10 #
|
|
11 #You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>.
|
|
12
|
|
13
|
|
14 use warnings;
|
|
15 use strict;
|
|
16
|
|
17 use Getopt::Long;
|
|
18 use Pod::Usage;
|
|
19 use File::Path qw(make_path);
|
|
20 use Cwd ;
|
|
21 use Cwd 'abs_path';
|
|
22
|
|
23 my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs);
|
|
24
|
|
25 $minct1=2;
|
|
26 $minct2=1;
|
|
27 $dist=50000;
|
|
28 $rmtmp=1;
|
|
29 my $known_partners=0;
|
|
30 my $partnerfile='';
|
|
31
|
|
32 my $cwd = getcwd();
|
|
33 my $scriptspath="$cwd/cf_scripts";
|
|
34 my $cffile="$cwd/cf_v1.1.jar";
|
|
35 my $help='';
|
|
36
|
|
37
|
|
38 #YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty.
|
|
39 my $SAMTOOLS=''##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/;
|
|
40 my $NOVOALIGN=''##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php;
|
|
41 my $VELVET=''##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet;
|
|
42 my $BLAT=''##INSET PATH TO BLAT gfServer/gfClient tools;
|
|
43
|
|
44
|
|
45
|
|
46
|
|
47 GetOptions(
|
|
48 "b|inbam=s"=> \$inbam,
|
|
49 "o|outdir=s"=> \$outdir,
|
|
50 "t|targets=s"=> \$targets,
|
|
51 "f|fasta=s"=> \$FASTA,
|
|
52 "m1|minct1=i"=> \$minct1,
|
|
53 "m2|minct2=i"=> \$minct2,
|
|
54 "d|distance=i"=>\$dist,
|
|
55 "i|novoindex=s"=> \$novoindex,
|
|
56 "r|rmtmp=i"=> \$rmtmp,
|
|
57 "h|help"=>\$help,
|
|
58 "2|twobit=s"=>\$twobit,
|
|
59 "p|refpaths=s"=>\$refpaths,
|
|
60 "c|contigs=s"=>\$contigs,
|
|
61 "y|bp1=s"=>\$bp1,
|
|
62 "z|bp2=s"=>\$bp2,
|
|
63 ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
|
|
64
|
|
65 if($help) {
|
|
66 print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
|
|
67 print "targets=BED style interval file containing the regions to search for SV\n";
|
|
68 print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n";
|
|
69 print "distance=max distance from target to search (default=50000)\n";
|
|
70 print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n";
|
|
71 print "rmtmp=remove temp directory (default=1, true)\n";
|
|
72 print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n";
|
|
73 print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n";
|
|
74 print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n";
|
|
75 print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n";
|
|
76 print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n";
|
|
77 exit(1);
|
|
78 }
|
|
79
|
|
80 # if refpaths is defined => split into seperate variables.
|
|
81 if (defined($refpaths) && $refpaths ne '') {
|
|
82 ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths);
|
|
83 }
|
|
84
|
|
85 if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) {
|
|
86 print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n";
|
|
87 exit(1);
|
|
88 }
|
|
89 elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) {
|
|
90 print STDERR "Critical file missing.\n";
|
|
91 exit(1);
|
|
92 }
|
|
93
|
|
94 make_empty_output();
|
|
95 $outdir = abs_path($outdir);
|
|
96
|
|
97 ## convert Targets.bed to targets.picard
|
|
98 my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard";
|
|
99 print STDOUT "$cmd\n";
|
|
100 system($cmd);
|
|
101 $cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard";
|
|
102 print STDOUT "$cmd\n";
|
|
103 system($cmd);
|
|
104 $targets = "$outdir/Targets.picard";
|
|
105
|
|
106 ## check all reads are paired. Combination of paired and single end libraries crashes the tool.
|
|
107 my @fs = `${SAMTOOLS}samtools flagstat $inbam`;
|
|
108 my $total = 0;
|
|
109 my $paired = 0;
|
|
110 foreach(@fs) {
|
|
111 chomp;
|
|
112 if ($_ =~ m/^(\d+).*in total/) {
|
|
113 $total = $1;
|
|
114 }
|
|
115 elsif ($_ =~ m/^(\d+).*paired in sequencing/) {
|
|
116 $paired = $1;
|
|
117 }
|
|
118 }
|
|
119 if ($total != $paired) {
|
|
120 print "Removing single-read data from the input bam file\n";
|
|
121 system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam");
|
|
122 $inbam = "$outdir/Paired.only.bam";
|
|
123 }
|
|
124
|
|
125 ## process.
|
|
126 $cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1";
|
|
127 print STDOUT "$cmd\n";
|
|
128 system($cmd);
|
|
129
|
|
130 chdir("${outdir}/temp");
|
|
131
|
|
132 if(!(-s "toRemap1.fq")) {
|
|
133 print STDOUT "No breakpoints found.\n";
|
|
134 chdir("$outdir");
|
|
135 clean_up();
|
|
136 exit(1);
|
|
137 }
|
|
138
|
|
139 $cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam";
|
|
140 print STDOUT "$cmd\n";
|
|
141 system($cmd);
|
|
142
|
|
143 $cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2";
|
|
144 print STDOUT "$cmd\n";
|
|
145 system($cmd);
|
|
146
|
|
147
|
|
148 chdir("${outdir}/temp");
|
|
149 print STDOUT "Assembling contigs...\n";
|
|
150 &assemble();
|
|
151 print STDOUT "Done\n";
|
|
152
|
|
153
|
|
154 system("mv contigs.txt $outdir");
|
|
155 chdir("$outdir");
|
|
156 clean_up();
|
|
157
|
|
158
|
|
159
|
|
160
|
|
161 #########################
|
|
162
|
|
163 sub make_empty_output {
|
|
164 make_path($outdir);
|
|
165 open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH);
|
|
166 open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH);
|
|
167 open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH);
|
|
168
|
|
169 }
|
|
170
|
|
171 sub assemble {
|
|
172 system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq");
|
|
173 if(!(-s "pair1.fq")) {
|
|
174 print STDERR "No breakpoints supported by split reads.\n";
|
|
175 exit(1);
|
|
176 }
|
|
177 system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null");
|
|
178 system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null");
|
|
179 # launch blat server at random unused port
|
|
180 my $port = 10000 + int(rand(5000));
|
|
181 my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
|
|
182 while ($status !~ m/Error/) {
|
|
183 $port = 10000 + int(rand(5000));
|
|
184 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
|
|
185 }
|
|
186 print "Waiting for BLAT server to come online.\n";
|
|
187 system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &");
|
|
188 # wait for startup
|
|
189 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
|
|
190 while ($status =~ m/Error/) {
|
|
191 sleep 15;
|
|
192 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ;
|
|
193 }
|
|
194
|
|
195 # run query
|
|
196 system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead");
|
|
197 # take blat server down.
|
|
198 system("${BLAT}gfServer stop 127.0.0.1 $port");
|
|
199 # continue with analysis.
|
|
200 my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt";
|
|
201 print STDOUT "$command\n";
|
|
202 system($command);
|
|
203 system("mv velvet.txt contigs.txt");
|
|
204 }
|
|
205
|
|
206 sub clean_up {
|
|
207 ## move files to output locations.
|
|
208 if (defined $contigs) {
|
|
209 $contigs = abs_path($contigs);
|
|
210 system("cp '$outdir/contigs.txt' '$contigs'");
|
|
211 }
|
|
212 if (defined $bp1) {
|
|
213 $bp1 = abs_path($bp1);
|
|
214 system("cp '$outdir/breakpoints.1.txt' '$bp1'");
|
|
215 }
|
|
216 if (defined $bp2) {
|
|
217 $bp2 = abs_path($bp2);
|
|
218 system("cp '$outdir/breakpoints.2.txt' '$bp2'");
|
|
219 }
|
|
220 if ($rmtmp) {
|
|
221 system("rm -R temp");
|
|
222 }
|
|
223 }
|
|
224
|
|
225
|
|
226
|
|
227
|
|
228
|
|
229
|
|
230
|
|
231
|
|
232
|
|
233
|
|
234
|
|
235
|