comparison defuse.xml @ 0:b75ea9927793

Uploaded
author jjohnson
date Fri, 04 Jan 2013 11:53:33 -0500
parents
children 6ee9d8b45113
comparison
equal deleted inserted replaced
-1:000000000000 0:b75ea9927793
1 <tool id="defuse" name="DeFuse" version="1.5">
2 <description>identify fusion transcripts</description>
3 <requirements>
4 <requirement type="package" version="0.5.0">defuse</requirement>
5 <requirement type="package">bowtie</requirement>
6 <requirement type="package">blat</requirement>
7 <requirement type="package">fatotwobit</requirement>
8 </requirements>
9 <command interpreter="command"> /bin/bash $shscript </command>
10 <inputs>
11 <param name="left_pairendreads" type="data" format="fastq" label="left part of read pairs" help="The left and right reads pairs must be in the same order, and not have any unpaired reads. (FASTQ interlacer will pair reads and remove the unpaired. FASTQ de-interlacer will separate the result into left and right reads.)"/>
12 <param name="right_pairendreads" type="data" format="fastq" label="right part of read pairs" help="In the same order as the left reads"/>
13 <conditional name="refGenomeSource">
14 <param name="genomeSource" type="select" label="Will you select a built-in DeFuse Reference Dataset, or supply a configuration from your history" help="">
15 <option value="indexed">Use a built-in DeFuse Reference Dataset</option>
16 <option value="history">Use a configuration from your history that specifies the DeFuse Reference Dataset</option>
17 </param>
18 <when value="indexed">
19 <param name="index" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
20 <options from_file="defuse.loc">
21 <column name="name" index="1"/>
22 <column name="value" index="2"/>
23 <filter type="sort_by" column="0" />
24 <validator type="no_options" message="No indexes are available" />
25 </options>
26 </param>
27 <conditional name="defuse_param">
28 <param name="settings" type="select" label="Defuse parameter settings" help="">
29 <option value="preSet">Default settings</option>
30 <option value="full">Full parameter list</option>
31 </param>
32 <when value="preSet" />
33 <when value="full">
34 <param name="max_insert_size" type="integer" value="500" optional="true" label="Bowtie max_insert_size" />
35 <param name="dna_concordant_length" type="integer" value="2000" optional="true" label="Minimum gene fusion range dna_concordant_length" />
36 <param name="discord_read_trim" type="integer" value="50" optional="true" label="Trim length for discordant reads discord_read_trim" help="(split reads are not trimmed)" />
37 <param name="clustering_precision" type="float" value=".95" optional="true" label="Filter clustering_precision">
38 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
39 </param>
40 <param name="span_count_threshold" type="integer" value="5" optional="true" label="Filter span_count_threshold" />
41 <param name="split_count_threshold" type="integer" value="3" optional="true" label="Filter split_count_threshold" />
42 <param name="percent_identity_threshold" type="float" value=".90" optional="true" label="Filter percent_identity_threshold">
43 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
44 </param>
45 <param name="max_dist_pos" type="integer" value="600" optional="true" label="Filter max_dist_pos" />
46 <param name="num_dist_genes" type="integer" value="500" optional="true" label="Filter num_dist_genes" />
47 <param name="split_min_anchor" type="integer" value="4" optional="true" label="Filter split_min_anchor" />
48 <param name="max_concordant_ratio" type="float" value="0.1" optional="true" label="Filter max_concordant_ratio">
49 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
50 </param>
51 <param name="splice_bias" type="integer" value="10" optional="true" label="Filter splice_bias" />
52 <param name="probability_threshold" type="float" value="0.50" optional="true" label="Filter probability_threshold">
53 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
54 </param>
55 <param name="covariance_sampling_density" type="float" value="0.01" optional="true" label="covariance_sampling_density">
56 <help>Position density when calculating covariance</help>
57 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
58 </param>
59 <param name="denovo_assembly" type="select" label="denovo_assembly" help="">
60 <option value="">Use Default</option>
61 <option value="no">no</option>
62 <option value="yes">yes</option>
63 </param>
64 <!--
65 <param name="positive_controls" type="data" format="txt" optional=true label="Defuse positive_controls" help=""/>
66 -->
67 </when> <!-- full -->
68 </conditional> <!-- defuse_param -->
69 </when>
70 <when value="history">
71 <param name="config" type="data" format="txt" label="Defuse Config file" help=""/>
72 </when> <!-- history -->
73 </conditional> <!-- refGenomeSource -->
74 <param name="keep_output" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Save DeFuse working directory files"/>
75 <param name="do_get_reads" type="boolean" checked="false" truevalue="yes" falsevalue="no" label="Run get_reads on each cluster"/>
76 </inputs>
77 <configfiles>
78 <configfile name="defuse_config">
79 #import ast
80 #if $refGenomeSource.genomeSource == "history":
81 #include raw $refGenomeSource.config.__str__
82 #else
83 #set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value))
84 #
85 # Configuration file for defuse
86 #
87 # At a minimum, change all values enclosed by []
88 #
89
90 # Directory where the defuse code was unpacked
91 ## Default location in the tool/defuse directory
92 # source_directory = ${__root_dir__}/tools/defuse
93 source_directory = #slurp
94 #try
95 $ref_dict['source_directory']
96 #except
97 __DEFUSE_PATH__
98 #end try
99
100 # Directory where you want your dataset
101 dataset_directory = #slurp
102 #try
103 $ref_dict['dataset_directory']
104 #except
105 /project/db/genomes/Hsapiens/hg19/defuse
106 #end try
107
108 # Input genome and gene models
109 gene_models = #slurp
110 #try
111 $ref_dict['gene_models']
112 #except
113 \$(dataset_directory)/Homo_sapiens.GRCh37.62.gtf
114 #end try
115 genome_fasta = #slurp
116 #try
117 $ref_dict['genome_fasta']
118 #except
119 \$(dataset_directory)/Homo_sapiens.GRCh37.62.dna.chromosome.fa
120 #end try
121
122 # Repeat table from ucsc genome browser
123 repeats_filename = #slurp
124 #try
125 $ref_dict['repeats_filename']
126 #except
127 \$(dataset_directory)/rmsk.txt
128 #end try
129
130 # EST info downloaded from ucsc genome browser
131 est_fasta = #slurp
132 #try
133 $ref_dict['est_fasta']
134 #except
135 \$(dataset_directory)/est.fa
136 #end try
137 est_alignments = #slurp
138 #try
139 $ref_dict['est_alignments']
140 #except
141 \$(dataset_directory)/intronEst.txt
142 #end try
143
144 # Unigene clusters downloaded from ncbi
145 unigene_fasta = #slurp
146 #try
147 $ref_dict['unigene_fasta']
148 #except
149 \$(dataset_directory)/Hs.seq.uniq
150 #end try
151
152 # Paths to external tools
153 bowtie_bin = #slurp
154 #try
155 $ref_dict['bowtie_bin']
156 #except
157 __BOWTIE_BIN__
158 #end try
159 bowtie_build_bin = #slurp
160 #try
161 $ref_dict['bowtie_build_bin']
162 #except
163 __BOWTIE_BUILD_BIN__
164 #end try
165 blat_bin = #slurp
166 #try
167 $ref_dict['blat_bin']
168 #except
169 __BLAT_BIN__
170 #end try
171 fatotwobit_bin = #slurp
172 #try
173 $ref_dict['fatotwobit_bin']
174 #except
175 __FATOTWOBIT_BIN__
176 #end try
177 r_bin = #slurp
178 #try
179 $ref_dict['r_bin']
180 #except
181 __R_BIN__
182 #end try
183 rscript_bin = #slurp
184 #try
185 $ref_dict['rscript_bin']
186 #except
187 __RSCRIPT_BIN__
188 #end try
189
190 #raw
191 # Dataset files
192 dataset_prefix = $(dataset_directory)/defuse
193 chromosome_prefix = $(dataset_prefix).dna.chromosomes
194 exons_fasta = $(dataset_prefix).exons.fa
195 cds_fasta = $(dataset_prefix).cds.fa
196 cdna_regions = $(dataset_prefix).cdna.regions
197 cdna_fasta = $(dataset_prefix).cdna.fa
198 reference_fasta = $(dataset_prefix).reference.fa
199 rrna_fasta = $(dataset_prefix).rrna.fa
200 ig_gene_list = $(dataset_prefix).ig.gene.list
201 repeats_regions = $(dataset_directory)/repeats.regions
202 est_split_fasta1 = $(dataset_directory)/est.1.fa
203 est_split_fasta2 = $(dataset_directory)/est.2.fa
204 est_split_fasta3 = $(dataset_directory)/est.3.fa
205 est_split_fasta4 = $(dataset_directory)/est.4.fa
206 est_split_fasta5 = $(dataset_directory)/est.5.fa
207 est_split_fasta6 = $(dataset_directory)/est.6.fa
208 est_split_fasta7 = $(dataset_directory)/est.7.fa
209 est_split_fasta8 = $(dataset_directory)/est.8.fa
210 est_split_fasta9 = $(dataset_directory)/est.9.fa
211
212 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
213 prefilter1 = $(unigene_fasta)
214
215 # deFuse scripts and tools
216 scripts_directory = $(source_directory)/scripts
217 tools_directory = $(source_directory)/tools
218 data_directory = $(source_directory)/data
219 #end raw
220
221 # Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk
222 samtools_bin = #slurp
223 #try
224 $ref_dict['samtools_bin']
225 #except
226 \$(source_directory)/external/samtools-0.1.8/samtools
227 #end try
228
229 # Bowtie parameters
230 bowtie_threads = #slurp
231 #try
232 $ref_dict['bowtie_threads']
233 #except
234 4
235 #end try
236 bowtie_quals = #slurp
237 #try
238 $ref_dict['bowtie_quals']
239 #except
240 --phred33-quals
241 #end try
242 max_insert_size = #slurp
243 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_insert_size.__str__ != "":
244 $refGenomeSource.defuse_param.max_insert_size
245 #else
246 #try
247 $ref_dict['max_insert_size']
248 #except
249 500
250 #end try
251 #end if
252
253 # Parameters for building the dataset
254 chromosomes = #slurp
255 #try
256 $ref_dict['chromosomes']
257 #except
258 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
259 #end try
260 mt_chromosome = #slurp
261 #try
262 $ref_dict['mt_chromosome']
263 #except
264 MT
265 #end try
266 gene_sources = #slurp
267 #try
268 $ref_dict['gene_sources']
269 #except
270 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding
271 #end try
272 ig_gene_sources = #slurp
273 #try
274 $ref_dict['ig_gene_sources']
275 #except
276 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene
277 #end try
278 rrna_gene_sources = #slurp
279 #try
280 $ref_dict['rrna_gene_sources']
281 #except
282 Mt_rRNA,rRNA,rRNA_pseudogene
283 #end try
284
285 # Blat sequences per job
286 num_blat_sequences = #slurp
287 #try
288 $ref_dict['num_blat_sequences']
289 #except
290 10000
291 #end try
292
293 # Minimum gene fusion range
294 dna_concordant_length = #slurp
295 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "":
296 $refGenomeSource.defuse_param.dna_concordant_length
297 #else
298 #try
299 $ref_dict['dna_concordant_length']
300 #except
301 2000
302 #end try
303 #end if
304
305 # Trim length for discordant reads (split reads are not trimmed)
306 discord_read_trim = #slurp
307 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "":
308 $refGenomeSource.defuse_param.discord_read_trim
309 #else
310 #try
311 $ref_dict['discord_read_trim']
312 #except
313 50
314 #end try
315 #end if
316
317 # Filtering parameters
318 clustering_precision = #slurp
319 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != ""
320 $refGenomeSource.defuse_param.clustering_precision
321 #else
322 #try
323 $ref_dict['clustering_precision']
324 #except
325 0.95
326 #end try
327 #end if
328 span_count_threshold = #slurp
329 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != ""
330 $refGenomeSource.defuse_param.span_count_threshold
331 #else
332 #try
333 $ref_dict['span_count_threshold']
334 #except
335 5
336 #end try
337 #end if
338 split_count_threshold = #slurp
339 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != ""
340 $refGenomeSource.defuse_param.split_count_threshold
341 #else
342 #try
343 $ref_dict['split_count_threshold']
344 #except
345 3
346 #end try
347 #end if
348 percent_identity_threshold = #slurp
349 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != ""
350 $refGenomeSource.defuse_param.percent_identity_threshold
351 #else
352 #try
353 $ref_dict['percent_identity_threshold']
354 #except
355 0.90
356 #end try
357 #end if
358 max_dist_pos = #slurp
359 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != ""
360 $refGenomeSource.defuse_param.max_dist_pos
361 #else
362 #try
363 $ref_dict['max_dist_pos']
364 #except
365 600
366 #end try
367 #end if
368 num_dist_genes = #slurp
369 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != ""
370 $refGenomeSource.defuse_param.num_dist_genes
371 #else
372 #try
373 $ref_dict['num_dist_genes']
374 #except
375 500
376 #end try
377 #end if
378 split_min_anchor = #slurp
379 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != ""
380 $refGenomeSource.defuse_param.split_min_anchor
381 #else
382 #try
383 $ref_dict['split_min_anchor']
384 #except
385 4
386 #end try
387 #end if
388 max_concordant_ratio = #slurp
389 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != ""
390 $refGenomeSource.defuse_param.max_concordant_ratio
391 #else
392 #try
393 $ref_dict['max_concordant_ratio']
394 #except
395 0.1
396 #end try
397 #end if
398 splice_bias = #slurp
399 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != ""
400 $refGenomeSource.defuse_param.splice_bias
401 #else
402 #try
403 $ref_dict['splice_bias']
404 #except
405 10
406 #end try
407 #end if
408 denovo_assembly = #slurp
409 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != ""
410 $refGenomeSource.defuse_param.denovo_assembly
411 #else
412 #try
413 $ref_dict['denovo_assembly']
414 #except
415 no
416 #end try
417 #end if
418 probability_threshold = #slurp
419 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != ""
420 $refGenomeSource.defuse_param.probability_threshold
421 #else
422 #try
423 $ref_dict['probability_threshold']
424 #except
425 0.50
426 #end try
427 #end if
428 positive_controls = \$(data_directory)/controls.txt
429
430 # Position density when calculating covariance
431 covariance_sampling_density = #slurp
432 #if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != ""
433 $refGenomeSource.defuse_param.covariance_sampling_density
434 #else
435 #try
436 $ref_dict['covariance_sampling_density']
437 #except
438 0.01
439 #end try
440 #end if
441
442
443 # Number of reads for each job in split
444 reads_per_job = 1000000
445
446 # Number of regions for each breakpoint sequence job in split
447 regions_per_job = 20
448
449 #raw
450 # If you have command line 'mail' and wish to be notified
451 # mailto = andrew.mcpherson@gmail.com
452
453 # Remove temp files
454 remove_job_files = yes
455 remove_job_temp_files = yes
456
457 # Converting to fastq
458 # Fastq converter config format 1 for reads stored in separate files for each end
459 # data_lane_regex_N is a perl regex which stores the lane id in $1
460 # data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1
461 # data_compress_regex_N is a perl regex which stores the compression extension in $1
462 # data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout
463 # Fastq converter config format 2 for reads stored in separate files for each end
464 # data_lane_regex_N is a perl regex which stores the lane id in $1
465 # data_compress_regex_N is a perl regex which stores the compression extension in $1
466 # data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout
467 # data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout
468
469 data_lane_regex_1 = ^(.+)_[12]_export\.txt.*$
470 data_end_regex_1 = ^.+_([12])_export\.txt.*$
471 data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$
472 data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std
473
474 data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$
475 data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$
476 data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$
477 data_converter_2 = $(scripts_directory)/qseq2fastq.pl
478
479 data_lane_regex_3 = ^(.+)\.bam.*$
480 data_compress_regex_3 = ^.+\.bam(.*)$
481 data_end1_converter_3 = samtools view - | filter_sam_mate.pl 1 | sam_to_fastq.pl
482 data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl
483
484 data_lane_regex_4 = ^(.+).[12].fastq.*$
485 data_end_regex_4 = ^.+.([12]).fastq.*$
486 data_compress_regex_4 = ^.+.[12].fastq(.*)$
487 data_converter_4 = cat
488 #end raw
489
490 #end if
491
492 </configfile>
493 <configfile name="shscript">
494 #!/bin/bash
495 ## define some things for cheetah processing
496 #set $ds = chr(36)
497 #set $amp = chr(38)
498 #set $gt = chr(62)
499 #set $lt = chr(60)
500 #set $echo_cmd = 'echo'
501 ## Find the defuse.pl in the galaxy tool path
502 #import Cheetah.FileUtils
503 ## declare a bash function for converting a results tsv into html with links to the get_reads output files
504 results2html() {
505 rlts=${ds}1
506 rslt_name=`basename ${ds}rlts`
507 html=${ds}2
508 echo '${lt}html${gt}${lt}head${gt}${lt}title${gt}Defuse '${ds}rslt_name'${lt}/title${gt}${lt}/head${gt}${lt}body${gt}' ${gt} ${ds}html
509 echo '${lt}h2${gt}Defuse '${ds}rslt_name'${lt}/h2${gt}${lt}table${gt}' ${gt}${gt} ${ds}html
510 if [ -z "${ds}3" ]
511 then
512 awk '${ds}1 ~ /cluster_id/{printf("${lt}tr${gt}");for (i = 1; i ${lt}= NF; i++) {printf("${lt}th${gt}%s${lt}/th${gt}", ${ds}i);}; printf("${lt}/tr${gt}\n");}\
513 ${ds}1 ~ /[1-9][0-9]*/{printf("${lt}tr${gt}");for (i = 1; i ${lt}= NF; i++) {printf("${lt}td${gt}%s${lt}/td${gt}", ${ds}i);}; printf("${lt}/tr${gt}\n");}' ${ds}rlts ${gt}${gt} ${ds}html
514 echo '${lt}/table${gt}' ${gt}${gt} ${ds}html
515 echo '${lt}/body${gt}${lt}/html${gt}' ${gt}${gt} ${ds}html
516 else
517 export _EFP=${ds}3
518 mkdir -p ${ds}_EFP
519 awk '${ds}1 ~ /cluster_id/{printf("${lt}tr${gt}");for (i = 1; i ${lt}= NF; i++) {printf("${lt}th${gt}%s${lt}/th${gt}", ${ds}i);}; printf("${lt}/tr${gt}\n");}\
520 ${ds}1 ~ /[1-9][0-9]*/{fn="cluster_"${ds}1"_reads.txt"; \
521 printf("${lt}tr${gt}${lt}td${gt}${lt}a href=\"%s\"${gt}%s${lt}/a${gt}${lt}/td${gt}",fn, ${ds}1);for (i = 2; i ${lt}= NF; i++) {printf("${lt}td${gt}%s${lt}/td${gt}", ${ds}i);}; printf("${lt}/tr${gt}\n");}' ${ds}rlts ${gt}${gt} ${ds}html
522 echo '${lt}/table${gt}' ${gt}${gt} ${ds}html
523 echo '${lt}/body${gt}${lt}/html${gt}' ${gt}${gt} ${ds}html
524 for i in `awk '${ds}1 ~ /[1-9][0-9]*/{print ${ds}1}' ${ds}rlts`;
525 do fn=cluster_${ds}{i}_reads.txt;
526 pn=${ds}_EFP/${ds}fn;
527 perl \${DEFUSE_PATH}/scripts/get_reads.pl -c $defuse_config -o output_dir -i ${ds}i ${gt} ${ds}pn;
528 done
529 fi
530 }
531 ## substitute pathnames into config file: every __NAME__ placeholder still present is replaced with the matching path; grep -q and sed -i.tmp (backup suffix attached to -i) behave the same with GNU and BSD tools, and the UCSC binary is looked up under its real name faToTwoBit before the lowercase spelling
532 if grep -q __DEFUSE_PATH__ $defuse_config; then sed -i.tmp "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
533 if grep -q __SAMTOOLS_BIN__ $defuse_config ${amp}${amp} SAMTOOLS_BIN=`which samtools`; then sed -i.tmp "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
534 if grep -q __BOWTIE_BIN__ $defuse_config ${amp}${amp} BOWTIE_BIN=`which bowtie`; then sed -i.tmp "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
535 if grep -q __BOWTIE_BUILD_BIN__ $defuse_config ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`; then sed -i.tmp "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
536 if grep -q __BLAT_BIN__ $defuse_config ${amp}${amp} BLAT_BIN=`which blat`; then sed -i.tmp "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
537 if grep -q __FATOTWOBIT_BIN__ $defuse_config ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit || which fatotwobit`; then sed -i.tmp "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
538 if grep -q __R_BIN__ $defuse_config ${amp}${amp} R_BIN=`which R`; then sed -i.tmp "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
539 if grep -q __RSCRIPT_BIN__ $defuse_config ${amp}${amp} RSCRIPT_BIN=`which Rscript`; then sed -i.tmp "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
540
541
542 ## copy config to output
543 cp $defuse_config $config_txt
544 ## make a data_dir and ln -s the input fastq
545 mkdir -p data_dir
546 ln -s $left_pairendreads data_dir/reads_1.fastq
547 ln -s $right_pairendreads data_dir/reads_2.fastq
548 ## ln to output_dir in from_work_dir
549 #if $defuse_out.__str__ != 'None':
550 mkdir -p $defuse_out.extra_files_path
551 ln -s $defuse_out.extra_files_path output_dir
552 #else
553 mkdir -p output_dir
554 #end if
555 ## run defuse.pl; the parallel job count follows GALAXY_SLOTS when the job environment exports it, otherwise it stays at the previous fixed value of 8
556 perl \${DEFUSE_PATH}/scripts/defuse.pl -c $defuse_config -d data_dir -o output_dir -p \${GALAXY_SLOTS:-8}
557 ## copy primary results to output datasets
558 if [ -e output_dir/log/defuse.log ]; then cp output_dir/log/defuse.log $defuse_log; fi
559 if [ -e output_dir/results.tsv ]; then cp output_dir/results.tsv $results_tsv; fi
560 if [ -e output_dir/results.filtered.tsv ]; then cp output_dir/results.filtered.tsv $results_filtered_tsv; fi
561 if [ -e output_dir/results.classify.tsv ]; then cp output_dir/results.classify.tsv $results_classify_tsv; fi
562 ## create html with links for output_dir
563 #if $defuse_out.__str__ != 'None':
564 if [ -e $defuse_out ]
565 then
566 echo '${lt}html${gt}${lt}head${gt}${lt}title${gt}Defuse Output${lt}/title${gt}${lt}/head${gt}${lt}body${gt}' ${gt} $defuse_out
567 echo '${lt}h2${gt}Defuse Output Files${lt}/h2${gt}${lt}ul${gt}' ${gt}${gt} $defuse_out
568 pushd $defuse_out.extra_files_path
569 for f in `find -L . -maxdepth 1 -type f`;
570 do fn=`basename ${ds}f`; echo '${lt}li${gt}${lt}a href="'${ds}fn'"${gt}'${ds}fn'${lt}/a${gt}${lt}/li${gt}' ${gt}${gt} $defuse_out;
571 done
572 popd
573 echo '${lt}/ul${gt}' ${gt}${gt} $defuse_out
574 echo '${lt}/body${gt}${lt}/html${gt}' ${gt}${gt} $defuse_out
575 fi
576 #end if
577 ## run get_reads.pl on each cluster
578 #if $fusion_reads.__str__ != 'None':
579 if [ -e output_dir/results.filtered.tsv -a -e $fusion_reads ]
580 then
581 mkdir -p $fusion_reads.extra_files_path
582 results2html output_dir/results.filtered.tsv $fusion_reads $fusion_reads.extra_files_path
583 fi
584 #end if
585 </configfile>
586 </configfiles>
587 <outputs>
588 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
589 <data format="txt" name="defuse_log" label="${tool.name} on ${on_string}: defuse.log" />
590 <data format="html" name="defuse_out" label="${tool.name} on ${on_string}: defuse_output">
591 <filter>keep_output == True</filter>
592 </data>
593 <data format="html" name="fusion_reads" label="${tool.name} on ${on_string}: fusion_reads">
594 <filter>do_get_reads == True</filter>
595 </data>
596 <data format="tabular" name="results_tsv" label="${tool.name} on ${on_string}: results.tsv" />
597 <data format="tabular" name="results_filtered_tsv" label="${tool.name} on ${on_string}: results.filtered.tsv" />
598 <data format="tabular" name="results_classify_tsv" label="${tool.name} on ${on_string}: results.classify.tsv" />
599 </outputs>
600 <tests>
601 </tests>
602 <help>
603 **DeFuse**
604
605 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
606
607 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
608
609 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
610
611 ------
612
613 **Inputs**
614
615 DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
616
617 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
618
619 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
620 - genome_fasta from Ensembl
621 - gene_models from Ensembl
622 - repeats_filename from UCSC RepeatMasker rmsk.txt
623 - est_fasta from UCSC
624 - est_alignments from UCSC intronEst.txt
625 - unigene_fasta from NCBI
626
627 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
628
629 ------
630
631 **Outputs**
632
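633 The Galaxy history will contain at least 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that DeFuse generates. Two optional HTML outputs, defuse_output (the saved DeFuse working directory files) and fusion_reads (the get_reads output for each cluster), are added when the corresponding options are selected.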
634
635 DeFuse generates 3 results files: results.tsv, results.filtered.tsv, and results.classify.tsv. All three files have the same format, though results.classify.tsv has a probability column from the application of the classifier to results.tsv, and results.filtered.tsv has been filtered according to the threshold probability as set in config.txt.
636
637 The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
638
639 - **Identification**
640 - cluster_id : random identifier assigned to each prediction
641 - library_name : library name given on the command line of defuse
642 - gene1 : ensembl id of gene 1
643 - gene2 : ensembl id of gene 2
644 - gene_name1 : name of gene 1
645 - gene_name2 : name of gene 2
646 - **Evidence**
647 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
648 - concordant_ratio : proportion of spanning reads considered concordant by blat
649 - denovo_min_count : minimum kmer count across denovo assembled sequence
650 - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
651 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
652 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
653 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
654 - min_map_count : minimum of the number of genomic mappings for each spanning read
655 - max_map_count : maximum of the number of genomic mappings for each spanning read
656 - mean_map_count : average of the number of genomic mappings for each spanning read
657 - num_multi_map : number of spanning reads that map to more than one genomic location
658 - span_count : number of spanning reads supporting the fusion
659 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
660 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
661 - span_coverage_min : minimum of span_coverage1 and span_coverage2
662 - span_coverage_max : maximum of span_coverage1 and span_coverage2
663 - splitr_count : number of split reads supporting the prediction
664 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
665 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
666 - splitr_sequence : fusion sequence predicted by split reads
667 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
668 - **Annotation**
669 - adjacent : fusion between adjacent genes
670 - altsplice : fusion likely the product of alternative splicing between adjacent genes
671 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
672 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
673 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
674 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
675 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
676 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
677 - deletion : fusion produced by a genomic deletion
678 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
679 - eversion : fusion produced by a genomic eversion
680 - exonboundaries : fusion splice at exon boundaries
681 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
682 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
683 - gene_chromosome1 : chromosome of gene 1
684 - gene_chromosome2 : chromosome of gene 2
685 - gene_end1 : end position for gene 1
686 - gene_end2 : end position for gene 2
687 - gene_location1 : location of breakpoint in gene 1
688 - gene_location2 : location of breakpoint in gene 2
689 - gene_start1 : start of gene 1
690 - gene_start2 : start of gene 2
691 - gene_strand1 : strand of gene 1
692 - gene_strand2 : strand of gene 2
693 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
694 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
695 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
696 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
697 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
698 - interchromosomal : fusion produced by an interchromosomal translocation
699 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
700 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
701 - inversion : fusion produced by genomic inversion
702 - orf : fusion combines genes in a way that preserves a reading frame
703 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.tsv)
704 - read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
705 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
706 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
707 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
708 - splice_score : number of nucleotides similar to GTAG at fusion splice
709 - num_splice_variants : number of potential splice variants for this gene pair
710 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
711 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
712
713
714 **Example**
715
716 results.tsv::
717
718 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
719 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
720 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
721
722 </help>
723 </tool>