Mercurial > repos > geert-vandeweyer > clusterfast
annotate run_cf0611.pl @ 26:9f96e2fa94ed draft
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
author | geert-vandeweyer |
---|---|
date | Mon, 11 Aug 2014 05:22:08 -0400 |
parents | 9a26daf268da |
children |
rev | line source |
---|---|
1 | 1 #!/usr/bin/perl |
2 | |
3 #Copyright 2012-2013 Haley Abel | |
4 # | |
5 #This file is part of ClusterFAST. | |
6 # | |
7 #ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | |
8 # | |
9 #ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | |
10 # | |
11 #You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>. | |
12 | |
13 | |
14 use warnings; | |
15 use strict; | |
16 | |
17 use Getopt::Long; | |
18 use Pod::Usage; | |
19 use File::Path qw(make_path); | |
20 use Cwd ; | |
21 use Cwd 'abs_path'; | |
17 | 22 use File::Basename; |
1 | 23 |
22 | 24 my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs,$insert,$sdinsert); |
1 | 25 |
26 $minct1=2; | |
27 $minct2=1; | |
28 $dist=50000; | |
29 $rmtmp=1; | |
22 | 30 $insert=230; |
31 $sdinsert=140; | |
1 | 32 my $known_partners=0; |
33 my $partnerfile=''; | |
34 | |
35 my $cwd = getcwd(); | |
14 | 36 my $scriptspath = ''; |
37 my $cffile = ''; | |
12 | 38 # Galaxy installer puts scripts and jar into a folder, which is available under the environment variable CF_PATH |
17 | 39 # this is future work (once github of clusterfast gets updated. For now, it's a placeholder. |
12 | 40 if (defined($ENV{'CF_PATH'}) && -d $ENV{'CF_PATH'}) { |
17 | 41 $scriptspath = dirname(abs_path($0))."/cf_scripts"; |
42 $cffile = dirname(abs_path($0))."/cf_v1.1.jar"; | |
12 | 43 } |
44 else { | |
45 $scriptspath="$cwd/cf_scripts"; | |
46 $cffile="$cwd/cf_v1.1.jar"; | |
47 } | |
1 | 48 my $help=''; |
49 | |
50 | |
51 #YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. | |
6 | 52 my $SAMTOOLS='';##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/; |
53 my $NOVOALIGN='';##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php; | |
54 my $VELVET='';##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet; | |
55 my $BLAT='';##INSET PATH TO BLAT gfServer/gfClient tools; | |
1 | 56 |
57 | |
58 | |
59 | |
60 GetOptions( | |
61 "b|inbam=s"=> \$inbam, | |
62 "o|outdir=s"=> \$outdir, | |
63 "t|targets=s"=> \$targets, | |
64 "f|fasta=s"=> \$FASTA, | |
65 "m1|minct1=i"=> \$minct1, | |
66 "m2|minct2=i"=> \$minct2, | |
67 "d|distance=i"=>\$dist, | |
68 "i|novoindex=s"=> \$novoindex, | |
69 "r|rmtmp=i"=> \$rmtmp, | |
70 "h|help"=>\$help, | |
71 "2|twobit=s"=>\$twobit, | |
72 "p|refpaths=s"=>\$refpaths, | |
73 "c|contigs=s"=>\$contigs, | |
74 "y|bp1=s"=>\$bp1, | |
75 "z|bp2=s"=>\$bp2, | |
22 | 76 "i|insert=i"=>\$insert, |
77 "s|sdinsert=i"=>\$sdinsert, | |
1 | 78 ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; |
79 | |
80 if($help) { | |
81 print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; | |
82 print "targets=BED style interval file containing the regions to search for SV\n"; | |
83 print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n"; | |
84 print "distance=max distance from target to search (default=50000)\n"; | |
85 print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n"; | |
86 print "rmtmp=remove temp directory (default=1, true)\n"; | |
87 print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n"; | |
88 print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n"; | |
89 print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n"; | |
90 print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n"; | |
91 print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n"; | |
92 exit(1); | |
93 } | |
94 | |
95 # if refpaths is defined => split into seperate variables. | |
96 if (defined($refpaths) && $refpaths ne '') { | |
97 ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths); | |
98 } | |
99 | |
26
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
100 if(! ( defined $inbam && defined $targets && defined $FASTA && defined $novoindex && defined $twobit )) { |
1 | 101 print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; |
102 exit(1); | |
103 } | |
104 elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) { | |
105 print STDERR "Critical file missing.\n"; | |
106 exit(1); | |
107 } | |
108 | |
26
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
109 # output dir specified? |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
110 if (!defined($outdir)) { |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
111 $outdir = "/tmp/cf_tmp_".int(rand(1000)); |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
112 while (-d $outdir) { |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
113 $outdir = "/tmp/cf_tmp_".int(rand(1000)); |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
114 } |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
115 } |
9f96e2fa94ed
Create local tmp folder under /tmp instead of in galaxy_new_file_path location
geert-vandeweyer
parents:
22
diff
changeset
|
116 # prepare output dir |
1 | 117 make_empty_output(); |
118 $outdir = abs_path($outdir); | |
119 | |
120 ## convert Targets.bed to targets.picard | |
121 my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard"; | |
122 print STDOUT "$cmd\n"; | |
123 system($cmd); | |
124 $cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard"; | |
125 print STDOUT "$cmd\n"; | |
126 system($cmd); | |
127 $targets = "$outdir/Targets.picard"; | |
128 | |
129 ## check all reads are paired. Combination of paired and single end libraries crashes the tool. | |
130 my @fs = `${SAMTOOLS}samtools flagstat $inbam`; | |
131 my $total = 0; | |
132 my $paired = 0; | |
133 foreach(@fs) { | |
134 chomp; | |
135 if ($_ =~ m/^(\d+).*in total/) { | |
136 $total = $1; | |
137 } | |
138 elsif ($_ =~ m/^(\d+).*paired in sequencing/) { | |
139 $paired = $1; | |
140 } | |
141 } | |
142 if ($total != $paired) { | |
143 print "Removing single-read data from the input bam file\n"; | |
144 system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam"); | |
145 $inbam = "$outdir/Paired.only.bam"; | |
146 } | |
147 | |
148 ## process. | |
19 | 149 $cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1 2>&1"; |
1 | 150 print STDOUT "$cmd\n"; |
151 system($cmd); | |
152 | |
153 chdir("${outdir}/temp"); | |
154 | |
155 if(!(-s "toRemap1.fq")) { | |
156 print STDOUT "No breakpoints found.\n"; | |
157 chdir("$outdir"); | |
158 clean_up(); | |
159 exit(1); | |
160 } | |
18 | 161 my $r = int(rand(100)); |
162 while (-e "/tmp/na.stderr.$r") { | |
163 $r = int(rand(100)); | |
164 } | |
1 | 165 |
22 | 166 $cmd="${NOVOALIGN}novoalign -o SAM -i $insert $sdinsert -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam 2>/tmp/na.stderr.$r"; |
1 | 167 print STDOUT "$cmd\n"; |
168 system($cmd); | |
18 | 169 my $na = `cat /tmp/na.stderr.$r`; |
170 print STDOUT $na; | |
171 unlink("/tmp/na.stderr.$r"); | |
1 | 172 |
19 | 173 $cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2 2>/dev/null"; |
1 | 174 print STDOUT "$cmd\n"; |
175 system($cmd); | |
176 | |
177 | |
178 chdir("${outdir}/temp"); | |
179 print STDOUT "Assembling contigs...\n"; | |
180 &assemble(); | |
181 print STDOUT "Done\n"; | |
182 | |
183 | |
184 system("mv contigs.txt $outdir"); | |
185 chdir("$outdir"); | |
186 clean_up(); | |
187 | |
188 | |
189 | |
190 | |
191 ######################### | |
192 | |
193 sub make_empty_output { | |
194 make_path($outdir); | |
195 open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH); | |
196 open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH); | |
197 open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH); | |
198 | |
199 } | |
200 | |
201 sub assemble { | |
202 system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq"); | |
203 if(!(-s "pair1.fq")) { | |
20 | 204 print STDOUT "No breakpoints supported by split reads.\n"; |
1 | 205 exit(1); |
206 } | |
19 | 207 system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null 2>&1"); |
208 system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null 2>&1"); | |
1 | 209 # launch blat server at random unused port |
210 my $port = 10000 + int(rand(5000)); | |
211 my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
212 while ($status !~ m/Error/) { | |
213 $port = 10000 + int(rand(5000)); | |
214 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
215 } | |
216 print "Waiting for BLAT server to come online.\n"; | |
217 system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &"); | |
218 # wait for startup | |
219 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
220 while ($status =~ m/Error/) { | |
221 sleep 15; | |
222 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
223 } | |
224 | |
225 # run query | |
226 system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead"); | |
227 # take blat server down. | |
228 system("${BLAT}gfServer stop 127.0.0.1 $port"); | |
229 # continue with analysis. | |
230 my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt"; | |
231 print STDOUT "$command\n"; | |
232 system($command); | |
233 system("mv velvet.txt contigs.txt"); | |
234 } | |
235 | |
236 sub clean_up { | |
237 ## move files to output locations. | |
238 if (defined $contigs) { | |
239 $contigs = abs_path($contigs); | |
240 system("cp '$outdir/contigs.txt' '$contigs'"); | |
241 } | |
242 if (defined $bp1) { | |
243 $bp1 = abs_path($bp1); | |
244 system("cp '$outdir/breakpoints.1.txt' '$bp1'"); | |
245 } | |
246 if (defined $bp2) { | |
247 $bp2 = abs_path($bp2); | |
248 system("cp '$outdir/breakpoints.2.txt' '$bp2'"); | |
249 } | |
250 if ($rmtmp) { | |
251 system("rm -R temp"); | |
252 } | |
253 } | |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | |
260 | |
261 | |
262 | |
263 | |
264 | |
265 |