Mercurial > repos > geert-vandeweyer > clusterfast
comparison run_cf0611.pl @ 1:4a3afa90ff7f draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Mon, 28 Jul 2014 05:53:55 -0400 |
parents | |
children | 233be956ae78 |
comparison
equal
deleted
inserted
replaced
0:1b008b4b05f3 | 1:4a3afa90ff7f |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 #Copyright 2012-2013 Haley Abel | |
4 # | |
5 #This file is part of ClusterFAST. | |
6 # | |
7 #ClusterFAST is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. | |
8 # | |
9 #ClusterFAST is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | |
10 # | |
11 #You should have received a copy of the GNU General Public License along with ClusterFAST. If not, see <http://www.gnu.org/licenses/>. | |
12 | |
13 | |
14 use warnings; | |
15 use strict; | |
16 | |
17 use Getopt::Long; | |
18 use Pod::Usage; | |
19 use File::Path qw(make_path); | |
20 use Cwd ; | |
21 use Cwd 'abs_path'; | |
22 | |
23 my ($inbam, $outdir, $targets, $minct1, $minct2, $dist, $novoindex, $rmtmp, $FASTA,$twobit,$refpaths,$bp1,$bp2,$contigs); | |
24 | |
25 $minct1=2; | |
26 $minct2=1; | |
27 $dist=50000; | |
28 $rmtmp=1; | |
29 my $known_partners=0; | |
30 my $partnerfile=''; | |
31 | |
32 my $cwd = getcwd(); | |
33 my $scriptspath="$cwd/cf_scripts"; | |
34 my $cffile="$cwd/cf_v1.1.jar"; | |
35 my $help=''; | |
36 | |
37 | |
38 #YOU MUST ADD LOCAL FILE PATHS HERE FOR STANDALONE USE (including trailing slashes!). Galaxy adds the binaries to the path, so these can be left empty. | |
39 my $SAMTOOLS=''##INSERT PATH TO SAMTOOLS; http://sourceforge.net/projects/samtools/files/samtools/; | |
40 my $NOVOALIGN=''##INSERT PATH TO NOVOALIGN; http://www.novocraft.com/main/downloadpage.php; | |
41 my $VELVET=''##INSERT PATH TO VELVET;http://www.molecularevolution.org/software/genomics/velvet; | |
42 my $BLAT=''##INSET PATH TO BLAT gfServer/gfClient tools; | |
43 | |
44 | |
45 | |
46 | |
47 GetOptions( | |
48 "b|inbam=s"=> \$inbam, | |
49 "o|outdir=s"=> \$outdir, | |
50 "t|targets=s"=> \$targets, | |
51 "f|fasta=s"=> \$FASTA, | |
52 "m1|minct1=i"=> \$minct1, | |
53 "m2|minct2=i"=> \$minct2, | |
54 "d|distance=i"=>\$dist, | |
55 "i|novoindex=s"=> \$novoindex, | |
56 "r|rmtmp=i"=> \$rmtmp, | |
57 "h|help"=>\$help, | |
58 "2|twobit=s"=>\$twobit, | |
59 "p|refpaths=s"=>\$refpaths, | |
60 "c|contigs=s"=>\$contigs, | |
61 "y|bp1=s"=>\$bp1, | |
62 "z|bp2=s"=>\$bp2, | |
63 ) or die "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; | |
64 | |
65 if($help) { | |
66 print "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; | |
67 print "targets=BED style interval file containing the regions to search for SV\n"; | |
68 print "fasta=reference fasta file\nnovoindex=reference index for novoalign\n"; | |
69 print "distance=max distance from target to search (default=50000)\n"; | |
70 print "min_pairs=minimum number of read pairs supporting breakpoint (default=2)\nmin_splits=minimum number of split reads supporting breakpoint (default=1)\n"; | |
71 print "rmtmp=remove temp directory (default=1, true)\n"; | |
72 print "genome.2bit = 2bit index fasta file of reference genome, used by the blat server (which is launched on the localhost)\n"; | |
73 print "refpaths = semicolon seperated list of reference genomes paths (used by galaxy). The order is mandatory to be novoalign;2bit;fasta\n"; | |
74 print "contigs = (Optional) output file for the contigs (default: outdir/contigs.txt)\n"; | |
75 print "bp1 = (Optional) output file for the breakpoints.1.txt file (default: outdir/breakpoints.1.txt)\n"; | |
76 print "bp2 = (Optional) output file for the breakpoints.2.txt file (default: outdir/breakpoints.2.txt)\n"; | |
77 exit(1); | |
78 } | |
79 | |
80 # if refpaths is defined => split into seperate variables. | |
81 if (defined($refpaths) && $refpaths ne '') { | |
82 ($novoindex, $twobit, $FASTA) = split(/;/,$refpaths); | |
83 } | |
84 | |
85 if(! ( defined $inbam && defined $targets && defined $FASTA && defined $outdir && defined $novoindex && defined $twobit )) { | |
86 print STDERR "Usage: run_cf_0611.pl -b inbam -o outdir -f fasta -t targets -d distance -i novoindex -r rmtmp -m1 min_pairs -m2 min_splits -t genome.2bit\n"; | |
87 exit(1); | |
88 } | |
89 elsif (! (-e $inbam && -e $targets && -e $FASTA && -e $novoindex && -e $twobit)) { | |
90 print STDERR "Critical file missing.\n"; | |
91 exit(1); | |
92 } | |
93 | |
94 make_empty_output(); | |
95 $outdir = abs_path($outdir); | |
96 | |
97 ## convert Targets.bed to targets.picard | |
98 my $cmd="${SAMTOOLS}samtools view -H $inbam > $outdir/Targets.picard"; | |
99 print STDOUT "$cmd\n"; | |
100 system($cmd); | |
101 $cmd = "awk '{OFS=\"\\t\"; print \$1,\$2,\$3,\$6,\$4 }' $targets >> $outdir/Targets.picard"; | |
102 print STDOUT "$cmd\n"; | |
103 system($cmd); | |
104 $targets = "$outdir/Targets.picard"; | |
105 | |
106 ## check all reads are paired. Combination of paired and single end libraries crashes the tool. | |
107 my @fs = `${SAMTOOLS}samtools flagstat $inbam`; | |
108 my $total = 0; | |
109 my $paired = 0; | |
110 foreach(@fs) { | |
111 chomp; | |
112 if ($_ =~ m/^(\d+).*in total/) { | |
113 $total = $1; | |
114 } | |
115 elsif ($_ =~ m/^(\d+).*paired in sequencing/) { | |
116 $paired = $1; | |
117 } | |
118 } | |
119 if ($total != $paired) { | |
120 print "Removing single-read data from the input bam file\n"; | |
121 system("${SAMTOOLS}samtools view -f 1 -h -b -o $outdir/Paired.only.bam $inbam"); | |
122 $inbam = "$outdir/Paired.only.bam"; | |
123 } | |
124 | |
125 ## process. | |
126 $cmd="java -Xmx8g -Xms6g -jar $cffile $inbam $targets $minct1 $dist $outdir 1"; | |
127 print STDOUT "$cmd\n"; | |
128 system($cmd); | |
129 | |
130 chdir("${outdir}/temp"); | |
131 | |
132 if(!(-s "toRemap1.fq")) { | |
133 print STDOUT "No breakpoints found.\n"; | |
134 chdir("$outdir"); | |
135 clean_up(); | |
136 exit(1); | |
137 } | |
138 | |
139 $cmd="${NOVOALIGN}novoalign -o SAM -i 230 140 -r all -e 999 -c2 -d $novoindex -F STDFQ -f toRemap1.fq toRemap2.fq > novoout.2.sam"; | |
140 print STDOUT "$cmd\n"; | |
141 system($cmd); | |
142 | |
143 $cmd="java -Xmx6g -Xms4g -jar $cffile novoout.2.sam $targets $minct2 $dist $outdir 2"; | |
144 print STDOUT "$cmd\n"; | |
145 system($cmd); | |
146 | |
147 | |
148 chdir("${outdir}/temp"); | |
149 print STDOUT "Assembling contigs...\n"; | |
150 &assemble(); | |
151 print STDOUT "Done\n"; | |
152 | |
153 | |
154 system("mv contigs.txt $outdir"); | |
155 chdir("$outdir"); | |
156 clean_up(); | |
157 | |
158 | |
159 | |
160 | |
161 ######################### | |
162 | |
163 sub make_empty_output { | |
164 make_path($outdir); | |
165 open(FH, ">${outdir}/breakpoints.1.txt") or die "Can't open ${outdir}/breakpoints.1.txt"; close(FH); | |
166 open(FH, ">${outdir}/breakpoints.2.txt") or die "Can't open ${outdir}/breakpoints.2.txt"; close(FH); | |
167 open(FH, ">${outdir}/contigs.txt") or die "Can't open ${outdir}/contigs.txt"; close(FH); | |
168 | |
169 } | |
170 | |
171 sub assemble { | |
172 system("${SAMTOOLS}samtools view merged.sorted.final.temp.bam | sort -k 1,1 -k 2,2n -u | tee merged.sorted.final.unique.sam | perl ${scriptspath}/split_for_velvet.pl pair1.fq pair2.fq single.fq"); | |
173 if(!(-s "pair1.fq")) { | |
174 print STDERR "No breakpoints supported by split reads.\n"; | |
175 exit(1); | |
176 } | |
177 system("${VELVET}velveth velvet_out 31 -fastq -shortPaired -separate pair1.fq pair2.fq -short single.fq > /dev/null"); | |
178 system("${VELVET}velvetg velvet_out -cov_cutoff 3 -ins_length 300 -exp_cov 100 -ins_length_sd 60 > /dev/null"); | |
179 # launch blat server at random unused port | |
180 my $port = 10000 + int(rand(5000)); | |
181 my $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
182 while ($status !~ m/Error/) { | |
183 $port = 10000 + int(rand(5000)); | |
184 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
185 } | |
186 print "Waiting for BLAT server to come online.\n"; | |
187 system("(${BLAT}gfServer start 127.0.0.1 $port -canStop $twobit >/dev/null 2>&1) &"); | |
188 # wait for startup | |
189 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
190 while ($status =~ m/Error/) { | |
191 sleep 15; | |
192 $status = `${BLAT}gfServer status 127.0.0.1 $port 2>&1` ; | |
193 } | |
194 | |
195 # run query | |
196 system("${BLAT}gfClient 127.0.0.1 $port / velvet_out/contigs.fa velvet.psl -minScore=2 -minIdentity=90 -nohead"); | |
197 # take blat server down. | |
198 system("${BLAT}gfServer stop 127.0.0.1 $port"); | |
199 # continue with analysis. | |
200 my $command = "perl ${scriptspath}/filter_contigs.0403.pl velvet.psl ${outdir}/breakpoints.2.txt velvet_out/contigs.fa velvet > velvet.txt"; | |
201 print STDOUT "$command\n"; | |
202 system($command); | |
203 system("mv velvet.txt contigs.txt"); | |
204 } | |
205 | |
206 sub clean_up { | |
207 ## move files to output locations. | |
208 if (defined $contigs) { | |
209 $contigs = abs_path($contigs); | |
210 system("cp '$outdir/contigs.txt' '$contigs'"); | |
211 } | |
212 if (defined $bp1) { | |
213 $bp1 = abs_path($bp1); | |
214 system("cp '$outdir/breakpoints.1.txt' '$bp1'"); | |
215 } | |
216 if (defined $bp2) { | |
217 $bp2 = abs_path($bp2); | |
218 system("cp '$outdir/breakpoints.2.txt' '$bp2'"); | |
219 } | |
220 if ($rmtmp) { | |
221 system("rm -R temp"); | |
222 } | |
223 } | |
224 | |
225 | |
226 | |
227 | |
228 | |
229 | |
230 | |
231 | |
232 | |
233 | |
234 | |
235 |