comparison create_reference_dataset.xml @ 18:547d8db4673e

Update create_reference_dataset for non human genome builds
author Jim Johnson <jj@umn.edu>
date Sat, 15 Jun 2013 14:36:47 -0500
parents d975e466d443
children 1af6f32ff592
comparison
equal deleted inserted replaced
17:fc35b7b993b1 18:547d8db4673e
7 <requirement type="package" version="2013-05-09">gmap</requirement> 7 <requirement type="package" version="2013-05-09">gmap</requirement>
8 <requirement type="package" version="latest">kent</requirement> 8 <requirement type="package" version="latest">kent</requirement>
9 </requirements> 9 </requirements>
10 <command interpreter="command"> /bin/bash $shscript </command> 10 <command interpreter="command"> /bin/bash $shscript </command>
11 <inputs> 11 <inputs>
12 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/> 12 <conditional name="genome">
13 <param name="ensembl_version" type="integer" value="" label="Esembl Release Version" help="Example: 71"/> 13 <param name="choice" type="select" label="Select a Genome Build">
14 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/> 14 <option value="GRCh37">Homo_sapiens GRCh37 hg19</option>
15 <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/> 15 <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
16 <param name="mt_chromosome" type="text" value="MT" label="Mitochonrial Chromosome" /> 16 <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
17 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" /> 17 <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
18 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" /> 18 <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
19 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" /> 19 <option value="user_specified">User specified</option>
20 </param>
21 <when value="GRCh37">
22 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
23 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
24 <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
25 <param name="ensembl_version" type="hidden" value="71"/>
26 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
27 <param name="ncbi_prefix" type="hidden" value="Hs"/>
28 <param name="ucsc_genome_version" type="hidden" value="hg19"/>
29 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
30 <param name="mt_chromosome" type="hidden" value="MT"/>
31 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
32 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
33 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
34 </when>
35 <when value="NCBI36">
36 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
37 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
38 <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
39 <param name="ensembl_version" type="hidden" value="54"/>
40 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
41 <param name="ncbi_prefix" type="hidden" value="Hs"/>
42 <param name="ucsc_genome_version" type="hidden" value="hg18"/>
43 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
44 <param name="mt_chromosome" type="hidden" value="MT"/>
45 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
46 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
47 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
48 </when>
49 <when value="GRCm38">
50 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
51 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
52 <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
53 <param name="ensembl_version" type="hidden" value="71"/>
54 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
55 <param name="ncbi_prefix" type="hidden" value="Mm"/>
56 <param name="ucsc_genome_version" type="hidden" value="mm10"/>
57 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
58 <param name="mt_chromosome" type="hidden" value="MT"/>
59 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
60 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
61 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
62 </when>
63 <when value="NCBIM37">
64 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
65 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
66 <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
67 <param name="ensembl_version" type="hidden" value="67"/>
68 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
69 <param name="ncbi_prefix" type="hidden" value="Mm"/>
70 <param name="ucsc_genome_version" type="hidden" value="mm9"/>
71 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
72 <param name="mt_chromosome" type="hidden" value="MT"/>
73 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
74 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
75 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
76 </when>
77 <when value="Rnor_5.0">
78 <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
79 <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
80 <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
81 <param name="ensembl_version" type="hidden" value="71"/>
82 <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
83 <param name="ncbi_prefix" type="hidden" value="Rn"/>
84 <param name="ucsc_genome_version" type="hidden" value="rn5"/>
85 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
86 <param name="mt_chromosome" type="hidden" value="MT"/>
87 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
88 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
89 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
90 </when>
91 <when value="user_specified">
92 <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name" help="Examples: homo_sapiens, mus_musculus, rattus_norvegicus"/>
93 <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
94 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh37, GRCm38, Rnor_5.0"/>
95 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
96 <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
97 <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
98 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg19, mm10, rn5"/>
99 <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
100 <help> Examples:
101 Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
102 Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
103 Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
104 ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
105 </help>
106 </param>
107 <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochondrial Chromosome name" />
108 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
109 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
110 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
111 </when>
112 </conditional>
20 </inputs> 113 </inputs>
21 <outputs> 114 <outputs>
22 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/> 115 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
23 </outputs> 116 </outputs>
24 <stdio> 117 <stdio>
41 # Directory where the defuse code was unpacked 134 # Directory where the defuse code was unpacked
42 ## Default location in the tool/defuse directory 135 ## Default location in the tool/defuse directory
43 # source_directory = ${__root_dir__}/tools/defuse 136 # source_directory = ${__root_dir__}/tools/defuse
44 source_directory = __DEFUSE_PATH__ 137 source_directory = __DEFUSE_PATH__
45 138
46 ensembl_version = $ensembl_version 139 ensembl_organism = $genome.ensembl_organism
47 ensembl_genome_version = $ensembl_genome_version 140 ensembl_prefix = $genome.ensembl_prefix
48 ucsc_genome_version = $ucsc_genome_version 141 ensembl_version = $genome.ensembl_version
142 ensembl_genome_version = $genome.ensembl_genome_version
143 ucsc_genome_version = $genome.ucsc_genome_version
144 ncbi_organism = $genome.ncbi_organism
145 ncbi_prefix = $genome.ncbi_prefix
49 146
50 # Directory where you want your dataset 147 # Directory where you want your dataset
51 dataset_directory = $config_txt.extra_files_path 148 dataset_directory = $config_txt.extra_files_path
52 149
53 #raw 150 #raw
54 # Input genome and gene models 151 # Input genome and gene models
55 gene_models = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf 152 gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
56 genome_fasta = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa 153 genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
57 154
58 # Repeat table from ucsc genome browser 155 # Repeat table from ucsc genome browser
59 repeats_filename = $(dataset_directory)/repeats.txt 156 repeats_filename = $(dataset_directory)/repeats.txt
60 157
61 # EST info downloaded from ucsc genome browser 158 # EST info downloaded from ucsc genome browser
62 est_fasta = $(dataset_directory)/est.fa 159 est_fasta = $(dataset_directory)/est.fa
63 est_alignments = $(dataset_directory)/intronEst.txt 160 est_alignments = $(dataset_directory)/intronEst.txt
64 161
65 # Unigene clusters downloaded from ncbi 162 # Unigene clusters downloaded from ncbi
66 unigene_fasta = $(dataset_directory)/Hs.seq.uniq 163 unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq
67 #end raw 164 #end raw
68 165
69 # Paths to external tools 166 # Paths to external tools
70 samtools_bin = __SAMTOOLS_BIN__ 167 samtools_bin = __SAMTOOLS_BIN__
71 bowtie_bin = __BOWTIE_BIN__ 168 bowtie_bin = __BOWTIE_BIN__
119 bowtie_quals = --phred33-quals 216 bowtie_quals = --phred33-quals
120 max_insert_size = 500 217 max_insert_size = 500
121 #end raw 218 #end raw
122 219
123 # Parameters for building the dataset 220 # Parameters for building the dataset
124 chromosomes = $chromosomes 221 chromosomes = $genome.chromosomes
125 mt_chromosome = $mt_chromosome 222 mt_chromosome = $genome.mt_chromosome
126 gene_sources = $gene_sources 223 gene_sources = $genome.gene_sources
127 ig_gene_sources = $ig_gene_sources 224 ig_gene_sources = $genome.ig_gene_sources
128 rrna_gene_sources = $rrna_gene_sources 225 rrna_gene_sources = $genome.rrna_gene_sources
129 226
130 #raw 227 #raw
131 # Blat sequences per job 228 # Blat sequences per job
132 num_blat_sequences = 10000 229 num_blat_sequences = 10000
133 230
164 #end raw 261 #end raw
165 </configfile> 262 </configfile>
166 <configfile name="shscript"> 263 <configfile name="shscript">
167 #!/bin/bash 264 #!/bin/bash
168 ## define some things for cheetah processing 265 ## define some things for cheetah processing
169 #set $ds = chr(36)
170 #set $amp = chr(38) 266 #set $amp = chr(38)
171 #set $gt = chr(62) 267 #set $gt = chr(62)
172 #set $lt = chr(60)
173 #set $echo_cmd = 'echo'
174 ## Find the defuse.pl in the galaxy tool path
175 #import Cheetah.FileUtils
176 ## substitute pathnames into config file 268 ## substitute pathnames into config file
177 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi 269 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
178 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi 270 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
179 if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi 271 if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
180 if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi 272 if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
183 if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi 275 if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
184 if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi 276 if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
185 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi 277 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
186 if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi 278 if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
187 if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi 279 if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
188
189 ## copy config to output 280 ## copy config to output
190 cp $defuse_config $config_txt 281 cp $defuse_config $config_txt
191 ## make the dataset directory 282 ## make the dataset directory
192 mkdir -p $config_txt.extra_files_path 283 mkdir -p $config_txt.extra_files_path
193 ## run defuse.pl 284 ## create_reference_dataset.pl
194 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config 285 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
195 </configfile> 286 </configfile>
196 </configfiles> 287 </configfiles>
197 288
198 <tests> 289 <tests>
199 </tests> 290 </tests>
200 <help> 291 <help>
201 **DeFuse** 292 **DeFuse**
202 293
203 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. 294 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details.
204 295
205 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 296 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
206 297 - genome_fasta from Ensembl
207 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page 298 - gene_models from Ensembl
208
209 ------
210
211 **Inputs**
212
213 DeFuse requires 2 fastq files for paried reads, one with the left mate of the paired reads, and a second fastq with the the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
214
215 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
216
217 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
218 - genome_fasta from Ensembl
219 - gene_models from Ensembl
220 - repeats_filename from UCSC RepeatMasker rmsk.txt 299 - repeats_filename from UCSC RepeatMasker rmsk.txt
221 - est_fasta from UCSC 300 - est_fasta from UCSC
222 - est_alignments from UCSC intronEst.txt 301 - est_alignments from UCSC intronEst.txt
223 - unigene_fasta from NCBI 302 - unigene_fasta from NCBI
224 303
225 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 304 The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
305
306
307 It will generate a config.txt file that can be input into the deFuse Galaxy tool.
308
309 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
310
311 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
312
313 .. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
226 314
227 ------ 315 ------
228 316
229 **Outputs** 317 **Outputs**
230 318
231 The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates. 319 The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths.
232
233 DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt.
234
235 The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
236
237 - **Identification**
238 - cluster_id : random identifier assigned to each prediction
239 - library_name : library name given on the command line of defuse
240 - gene1 : ensembl id of gene 1
241 - gene2 : ensembl id of gene 2
242 - gene_name1 : name of gene 1
243 - gene_name2 : name of gene 2
244 - **Evidence**
245 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
246 - concordant_ratio : proportion of spanning reads considered concordant by blat
247 - denovo_min_count : minimum kmer count across denovo assembled sequence
248 - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
249 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
250 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
251 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
252 - min_map_count : minimum of the number of genomic mappings for each spanning read
253 - max_map_count : maximum of the number of genomic mappings for each spanning read
254 - mean_map_count : average of the number of genomic mappings for each spanning read
255 - num_multi_map : number of spanning reads that map to more than one genomic location
256 - span_count : number of spanning reads supporting the fusion
257 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
258 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
259 - span_coverage_min : minimum of span_coverage1 and span_coverage2
260 - span_coverage_max : maximum of span_coverage1 and span_coverage2
261 - splitr_count : number of split reads supporting the prediction
262 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
263 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
264 - splitr_sequence : fusion sequence predicted by split reads
265 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
266 - **Annotation**
267 - adjacent : fusion between adjacent genes
268 - altsplice : fusion likely the product of alternative splicing between adjacent genes
269 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
270 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
271 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
272 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
273 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
274 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
275 - deletion : fusion produced by a genomic deletion
276 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
277 - eversion : fusion produced by a genomic eversion
278 - exonboundaries : fusion splice at exon boundaries
279 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
280 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
281 - gene_chromosome1 : chromosome of gene 1
282 - gene_chromosome2 : chromosome of gene 2
283 - gene_end1 : end position for gene 1
284 - gene_end2 : end position for gene 2
285 - gene_location1 : location of breakpoint in gene 1
286 - gene_location2 : location of breakpoint in gene 2
287 - gene_start1 : start of gene 1
288 - gene_start2 : start of gene 2
289 - gene_strand1 : strand of gene 1
290 - gene_strand2 : strand of gene 2
291 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
292 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
293 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
294 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
295 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
296 - interchromosomal : fusion produced by an interchromosomal translocation
297 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
298 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
299 - inversion : fusion produced by genomic inversion
300 - orf : fusion combines genes in a way that preserves a reading frame
301 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classified.txt)
302 - read_through : fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement
303 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
304 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
305 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
306 - splice_score : number of nucleotides similar to GTAG at fusion splice
307 - num_splice_variants : number of potential splice variants for this gene pair
308 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
309 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
310
311
312 **Example**
313
314 results.tsv::
315
316 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
317 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
318 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
319 320
320 </help> 321 </help>
321 </tool> 322 </tool>