changeset 18:547d8db4673e

Update create_reference_dataset for non human genome builds
author Jim Johnson <jj@umn.edu>
date Sat, 15 Jun 2013 14:36:47 -0500
parents fc35b7b993b1
children 1af6f32ff592
files README create_reference_dataset.xml defuse.xml defuse_bamfastq.xml test-data/tophat_out2h.bam tool_dependencies.xml
diffstat 6 files changed, 177 insertions(+), 157 deletions(-) [+]
line wrap: on
line diff
--- a/README	Wed Jun 12 21:03:18 2013 -0500
+++ b/README	Sat Jun 15 14:36:47 2013 -0500
@@ -5,7 +5,7 @@
 
 
 Manual:
-http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.0
+http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
 
 The included tool_dependencies.xml will download and install the defuse code.  
 It will set the environment variable: "DEFUSE_PATH" to the location of the defuse install.  
@@ -34,6 +34,9 @@
 
 These datasets should be referenced in the tool-data/defuse.loc file. 
 
+The create_reference_dataset tool will run the create_reference_dataset.pl script to generate deFuse genome reference data in a Galaxy dataset.
+This should be made available in the future as a Galaxy DataManager.
+
 
 Galaxy will try to auto-install dependencies:
 
--- a/create_reference_dataset.xml	Wed Jun 12 21:03:18 2013 -0500
+++ b/create_reference_dataset.xml	Sat Jun 15 14:36:47 2013 -0500
@@ -9,14 +9,107 @@
  </requirements>
   <command interpreter="command"> /bin/bash $shscript </command>
  <inputs>
-  <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
-  <param name="ensembl_version" type="integer" value="" label="Esembl Release Version" help="Example: 71"/>
-  <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
-  <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
-  <param name="mt_chromosome" type="text" value="MT" label="Mitochonrial Chromosome" />
-  <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
-  <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
-  <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
+  <conditional name="genome">
+    <param name="choice" type="select" label="Select a Genome Build">
+      <option value="GRCh37">Homo_sapiens GRCh37  hg19</option>
+      <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
+      <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
+      <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
+      <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
+      <option value="user_specified">User specified</option>
+    </param>
+    <when value="GRCh37">
+      <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
+      <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
+      <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
+      <param name="ensembl_version" type="hidden" value="71"/>
+      <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
+      <param name="ncbi_prefix" type="hidden" value="Hs"/>
+      <param name="ucsc_genome_version" type="hidden" value="hg19"/>
+      <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
+      <param name="mt_chromosome" type="hidden" value="MT"/>
+      <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
+      <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
+      <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
+    </when>
+    <when value="NCBI36">
+      <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
+      <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
+      <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
+      <param name="ensembl_version" type="hidden" value="54"/>
+      <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
+      <param name="ncbi_prefix" type="hidden" value="Hs"/>
+      <param name="ucsc_genome_version" type="hidden" value="hg18"/>
+      <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
+      <param name="mt_chromosome" type="hidden" value="MT"/>
+      <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
+      <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
+      <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
+    </when>
+    <when value="GRCm38">
+      <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
+      <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
+      <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
+      <param name="ensembl_version" type="hidden" value="71"/>
+      <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
+      <param name="ncbi_prefix" type="hidden" value="Mm"/>
+      <param name="ucsc_genome_version" type="hidden" value="mm10"/>
+      <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
+      <param name="mt_chromosome" type="hidden" value="MT"/>
+      <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
+      <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
+      <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
+    </when>
+    <when value="NCBIM37">
+      <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
+      <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
+      <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
+      <param name="ensembl_version" type="hidden" value="67"/>
+      <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
+      <param name="ncbi_prefix" type="hidden" value="Mm"/>
+      <param name="ucsc_genome_version" type="hidden" value="mm9"/>
+      <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
+      <param name="mt_chromosome" type="hidden" value="MT"/>
+      <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
+      <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
+      <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
+    </when>
+    <when value="Rnor_5.0">
+      <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
+      <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
+      <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
+      <param name="ensembl_version" type="hidden" value="71"/>
+      <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
+      <param name="ncbi_prefix" type="hidden" value="Rn"/>
+      <param name="ucsc_genome_version" type="hidden" value="rn5"/>
+      <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
+      <param name="mt_chromosome" type="hidden" value="MT"/>
+      <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
+      <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
+      <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
+    </when>
+    <when value="user_specified">
+      <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name" help="Examples: homo_sapiens, mus_musculus, rattus_norvegicus"/>
+      <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
+      <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh37, GRCm38, Rnor_5.0"/>
+      <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
+      <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
+      <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
+      <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg19, mm10, rn5"/>
+      <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
+       <help>  Examples: 
+         Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
+         Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
+         Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
+         ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
+       </help>
+      </param>
+      <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochonrial Chromosome name" />
+      <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
+      <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
+      <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
+    </when>
+  </conditional>
  </inputs>
  <outputs>
   <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
@@ -43,17 +136,21 @@
 # source_directory = ${__root_dir__}/tools/defuse
 source_directory = __DEFUSE_PATH__
 
-ensembl_version = $ensembl_version
-ensembl_genome_version = $ensembl_genome_version
-ucsc_genome_version = $ucsc_genome_version
+ensembl_organism = $genome.ensembl_organism
+ensembl_prefix = $genome.ensembl_prefix
+ensembl_version = $genome.ensembl_version
+ensembl_genome_version = $genome.ensembl_genome_version
+ucsc_genome_version = $genome.ucsc_genome_version
+ncbi_organism = $genome.ncbi_organism
+ncbi_prefix = $genome.ncbi_prefix
 
 # Directory where you want your dataset
 dataset_directory = $config_txt.extra_files_path
 
 #raw
 # Input genome and gene models
-gene_models                                 = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
-genome_fasta                                = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
+gene_models                                 = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
+genome_fasta                                = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
 
 # Repeat table from ucsc genome browser
 repeats_filename                            = $(dataset_directory)/repeats.txt
@@ -63,7 +160,7 @@
 est_alignments                              = $(dataset_directory)/intronEst.txt
 
 # Unigene clusters downloaded from ncbi
-unigene_fasta                               = $(dataset_directory)/Hs.seq.uniq
+unigene_fasta                               = $(dataset_directory)/$(ncbi_prefix).seq.uniq
 #end raw
 
 # Paths to external tools
@@ -121,11 +218,11 @@
 #end raw
 
 # Parameters for building the dataset
-chromosomes = $chromosomes
-mt_chromosome = $mt_chromosome
-gene_sources = $gene_sources
-ig_gene_sources = $ig_gene_sources
-rrna_gene_sources = $rrna_gene_sources
+chromosomes = $genome.chromosomes
+mt_chromosome = $genome.mt_chromosome
+gene_sources = $genome.gene_sources
+ig_gene_sources = $genome.ig_gene_sources
+rrna_gene_sources = $genome.rrna_gene_sources
 
 #raw
 # Blat sequences per job
@@ -166,13 +263,8 @@
   <configfile name="shscript">
 #!/bin/bash
 ## define some things for cheetah proccessing
-#set $ds = chr(36)
 #set $amp = chr(38)
 #set $gt = chr(62)
-#set $lt = chr(60)
-#set $echo_cmd = 'echo'
-## Find the defuse.pl in the galaxy tool path
-#import Cheetah.FileUtils
 ## substitute pathnames into config file
 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
@@ -185,12 +277,11 @@
 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
 if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
 if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
-
 ## copy config to output
 cp $defuse_config $config_txt
 ## make a data_dir  and ln -s the input fastq
 mkdir -p $config_txt.extra_files_path
-## run defuse.pl
+## run create_reference_dataset.pl to build the reference dataset
 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config 
   </configfile>
  </configfiles>
@@ -200,122 +291,32 @@
  <help>
 **DeFuse**
 
-DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.  
+DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.  See the DeFuse_Version_0.6_ manual for details.
+
+DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
+    - genome_fasta from Ensembl
+    - gene_models from Ensembl
+    - repeats_filename from UCSC RepeatMasker rmsk.txt
+    - est_fasta from UCSC
+    - est_alignments from UCSC intronEst.txt
+    - unigene_fasta from NCBI
+
+The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
+
+
+It will generate a config.txt file that can be input into the deFuse Galaxy tool.  
 
 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
 
 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
 
-------
-
-**Inputs**
-
-DeFuse requires 2 fastq files for paried reads, one with the left mate of the paired reads, and a second fastq with the the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).   
-
-If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
-
-DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
-    - genome_fasta from Ensembl 
-    - gene_models from Ensembl 
-    - repeats_filename from UCSC RepeatMasker rmsk.txt
-    - est_fasta from UCSC
-    - est_alignments from UCSC intronEst.txt
-    - unigene_fasta from NCBI
-
-.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+.. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
 
 ------
 
 **Outputs**
 
-The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters,  the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.  
-
-DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt. 
-
-The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
-
- - **Identification**
-    - cluster_id : random identifier assigned to each prediction
-    - library_name : library name given on the command line of defuse
-    - gene1 : ensembl id of gene 1
-    - gene2 : ensembl id of gene 2
-    - gene_name1 : name of gene 1
-    - gene_name2 : name of gene 2
- - **Evidence**
-    - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
-    - concordant_ratio : proportion of spanning reads considered concordant by blat
-    - denovo_min_count : minimum kmer count across denovo assembled sequence
-    - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
-    - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
-    - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
-    - min_map_count : minimum of the number of genomic mappings for each spanning read
-    - max_map_count : maximum of the number of genomic mappings for each spanning read
-    - mean_map_count : average of the number of genomic mappings for each spanning read
-    - num_multi_map : number of spanning reads that map to more than one genomic location
-    - span_count : number of spanning reads supporting the fusion
-    - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
-    - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
-    - span_coverage_min : minimum of span_coverage1 and span_coverage2
-    - span_coverage_max : maximum of span_coverage1 and span_coverage2
-    - splitr_count : number of split reads supporting the prediction
-    - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
-    - splitr_sequence : fusion sequence predicted by split reads
-    - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
- - **Annotation**
-    - adjacent : fusion between adjacent genes
-    - altsplice : fusion likely the product of alternative splicing between adjacent genes
-    - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
-    - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
-    - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
-    - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
-    - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
-    - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
-    - deletion : fusion produced by a genomic deletion
-    - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
-    - eversion : fusion produced by a genomic eversion
-    - exonboundaries : fusion splice at exon boundaries
-    - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
-    - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
-    - gene_chromosome1 : chromosome of gene 1
-    - gene_chromosome2 : chromosome of gene 2
-    - gene_end1 : end position for gene 1
-    - gene_end2 : end position for gene 2
-    - gene_location1 : location of breakpoint in gene 1
-    - gene_location2 : location of breakpoint in gene 2
-    - gene_start1 : start of gene 1
-    - gene_start2 : start of gene 2
-    - gene_strand1 : strand of gene 1
-    - gene_strand2 : strand of gene 2
-    - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
-    - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
-    - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
-    - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
-    - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
-    - interchromosomal : fusion produced by an interchromosomal translocation
-    - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
-    - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
-    - inversion : fusion produced by genomic inversion
-    - orf : fusion combines genes in a way that preserves a reading frame
-    - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classified.txt)
-    - read_through : fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement
-    - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
-    - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
-    - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
-    - splice_score : number of nucleotides similar to GTAG at fusion splice
-    - num_splice_variants : number of potential splice variants for this gene pair
-    - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
-    - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
-
-
-**Example**
-
-results.tsv::
-
-  cluster_id	splitr_sequence	splitr_count	splitr_span_pvalue	splitr_pos_pvalue	splitr_min_pvalue	adjacent	altsplice	break_adj_entropy1	break_adj_entropy2	break_adj_entropy_min	break_predict	breakpoint_homology	breakseqs_estislands_percident	cdna_breakseqs_percident	concordant_ratio	deletion	est_breakseqs_percident	eversion	exonboundaries	expression1	expression2	gene1	gene2	gene_align_strand1	gene_align_strand2	gene_chromosome1	gene_chromosome2	gene_end1	gene_end2	gene_location1	gene_location2	gene_name1	gene_name2	gene_start1	gene_start2	gene_strand1	gene_strand2	genome_breakseqs_percident	genomic_break_pos1	genomic_break_pos2	genomic_strand1	genomic_strand2	interchromosomal	interrupted_index1	interrupted_index2	inversion	library_name	max_map_count	max_repeat_proportion	mean_map_count	min_map_count	num_multi_map	num_splice_variants	orf	read_through	repeat_proportion1	repeat_proportion2	span_count	span_coverage1	span_coverage2	span_coverage_max	span_coverage_min	splice_score	splicing_index1	splicing_index2	
-  1169	GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT	2	0.000436307890680442	0.110748295953850	0.0880671602973091	N	Y	3.19872427442695	3.48337348351473	3.19872427442695	splitr	0	0	0	0	Y	0	N	N	0	0	ENSG00000105549	ENSG00000213753	+	-	19	19	376013	59111168	intron	upstream	THEG	AC016629.2	361750	59084870	-	+	0	375099	386594	+	-	N	8.34107429512245	-	N	output_dir	82	0.677852348993289	40.6666666666667	1	11	1	N	N	0.361271676300578	0.677852348993289	12	0.758602776578432	0.569678713445872	0.758602776578432	0.569678713445872	2	0.416666666666667	-	
-  3596	TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG	250	7.00711162298275e-72	0.00912124762512338	0.00684237452309549	N	N	3.31745197152461	3.47233119514066	3.31745197152461	splitr	7	0.0157657657657656	0	0	N	0.0135135135135136	N	N	0	0	ENSG00000156860	ENSG00000212932	-	+	16	21	30682131	48111157	coding	upstream	FBRS	RPL23AP4	30670289	48110676	+	+	0.0157657657657656	30680678	9827473	-	+	Y	-	-	N	output_dir	2	1	1.11111111111111	1	1	1	N	N	0	1	9	0.325530693397641	0.296465452915709	0.325530693397641	0.296465452915709	2	-	-	
+The Galaxy history will contain the config.txt file, which provides DeFuse with the reference data paths.
 
  </help>
 </tool>
--- a/defuse.xml	Wed Jun 12 21:03:18 2013 -0500
+++ b/defuse.xml	Sat Jun 15 14:36:47 2013 -0500
@@ -650,7 +650,7 @@
 
 If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
 
-DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
+DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
     - genome_fasta from Ensembl 
     - gene_models from Ensembl 
     - repeats_filename from UCSC RepeatMasker rmsk.txt
@@ -658,7 +658,7 @@
     - est_alignments from UCSC intronEst.txt
     - unigene_fasta from NCBI
 
-.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+.. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
 
 ------
 
--- a/defuse_bamfastq.xml	Wed Jun 12 21:03:18 2013 -0500
+++ b/defuse_bamfastq.xml	Sat Jun 15 14:36:47 2013 -0500
@@ -1,61 +1,60 @@
 <?xml version="1.0"?>
 <tool id="defuse_bamfastq" name="Defuse BamFastq" version="0.6.1">
   <description>converts a bam file to fastq files.</description>
-  
   <requirements>
     <requirement type="package" version="0.6.1">defuse</requirement>
   </requirements>
-  
   <version_command>bamfastq --version</version_command>
-
   <command>bamfastq
     #if $pair == True :
       $pair
     #end if
-
     #if $multiple == True :
       $multiple
     #end if
-
     #if $rename == True :
       $rename
     #end if
-    
-    -2 $fastq2
+    -b $bamfile
     -1 $fastq1
-    -b $bamname
-
-
+    -2 $fastq2
   </command>
-
   <inputs>
-    <param name="bamname" type="data" format="bam" label="Bam filename."/> 
+    <param name="bamfile" type="data" format="bam" label="Bam file"/> 
     <param name="pair" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Name contains pair info as /1 /2."/>
     <param name="multiple" type="boolean" truevalue="-m" falsevalue="" checked="true" label="Bam contains multiple mappings per read."/>
     <param name="rename" type="boolean" truevalue="-r" falsevalue="" checked="true" label="Rename with integer IDs."/>
   </inputs>
-
   <stdio>
     <exit_code range="1:" level="fatal" description="Error" />
   </stdio>
-
   <outputs>
-    <data format="fastqsanger,fastqillumina" name="fastq1" label="fastq1"  />
-    <data format="fastqsanger,fastqillumina" name="fastq2" label="fastq2"  />
+    <data format="fastqsanger" name="fastq1" label="fastq1"  />
+    <data format="fastqsanger" name="fastq2" label="fastq2"  />
   </outputs>
-
   <tests>
     <test>
-      <param name="bamname" ftype="bam" value="tophat_out2h.bam" />
+      <param name="bamfile" ftype="bam" value="tophat_out2h.bam" />
       <param name="pair" value="True" />
       <param name="multiple" value="True" />
       <param name="rename" value="True" />
-      <output name="fastq1" file="testout_defuse1.fq" />
-      <output name="fastq2" file="testout_defuse2.fq" />
+      <output name="fastq1">
+        <assert_contents>
+          <has_text text="@test_mRNA_36_146_27/1" />
+          <not_has_text text="@test_mRNA_36_146_27/2" />
+          <not_has_text text="test_mRNA_150_290_0" />
+        </assert_contents>
+      </output>
+      <output name="fastq2">
+        <assert_contents>
+          <has_text text="@test_mRNA_36_146_27/2" />
+          <not_has_text text="@test_mRNA_36_146_27/1" />
+          <not_has_text text="test_mRNA_150_290_0" />
+        </assert_contents>
+      </output>
     </test>
   </tests>
-
   <help>
-    Turn your dry, flavorless BAM file into delicious fastq files!
+    bamfastq converts a bam file input into a pair of fastq files that can be used as input to deFuse.
   </help>
 </tool>
Binary file test-data/tophat_out2h.bam has changed
--- a/tool_dependencies.xml	Wed Jun 12 21:03:18 2013 -0500
+++ b/tool_dependencies.xml	Sat Jun 15 14:36:47 2013 -0500
@@ -26,6 +26,23 @@
                     </repository>
                 </action>
                 <action type="shell_command">export CPLUS_INCLUDE_PATH=$BOOST_ROOT_DIR:$CPLUS_INCLUDE_PATH &amp;&amp; cd tools &amp;&amp; make</action>
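+                <!-- modify create_reference_dataset.pl to handle more than just human genomes; the sed stages below use 'Q' as a newline placeholder that tr converts afterwards, which also converts any literal Q already present in the script -->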
+                <action type="shell_command">
+                   cd scripts &amp;&amp; 
+                   cp create_reference_dataset.pl create_reference_dataset.pl.orig &amp;&amp; 
+                   cat create_reference_dataset.pl.orig |
+                   sed 's#wget_gunzip("ftp://hgdownload.cse.ucsc.edu/goldenPath/$ucsc_genome_version/database/rmsk.txt.gz", $repeats_filename);##' |
+                   sed 's#wget_gunzip("ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/intronEst.txt.gz", $est_alignments);##' |
+                   sed 's#^\(my .*ensembl_genome_version.*config.*get_value.*;\)#\1Qmy $ensembl_organism = $config->get_value("ensembl_organism");Qmy $ensembl_prefix = $config->get_value("ensembl_prefix");Qmy $ncbi_organism = $config->get_value("ncbi_organism");Qmy $ncbi_prefix = $config->get_value("ncbi_prefix");#' |
+                   sed 's/^\(sub wget_gunzip\)/sub try_wgetQ{Q my $url = shift;Q my $filename = shift;Q my $filename_gz = $filename.".gz";Q my $rslt = system "wget $url -O $filename_gz";Q if($rslt == 0)Q {Q  $rslt = system "gunzip $filename_gz";Q }Q return $rslt;Q}QQ\1/' |
+                   tr 'Q' '\n' |
+                   awk 'BEGIN{pfx="p1";fn="p2";}/if \(not -e \$repeats_filename\)/{pfx="rmsk";fn="repeats_filename";} /if \(not -e \$est_alignments\)/{pfx="intronEst";fn="est_alignments"} /ucsc_genome_version eq "hg18"/{printf("\tif (try_wget(\"ftp://hgdownload.cse.ucsc.edu/goldenPath/$ucsc_genome_version/database/%s.txt.gz\", \$%s) != 0)\n",pfx,fn);} $0 !~ /ucsc_genome_version eq "hg18/{print $0;}' |
+                   sed 's#UniGene/Homo_sapiens#UniGene/$ncbi_organism#' |
+                   sed 's/Hs.seq.uniq.gz/$ncbi_prefix.seq.uniq.gz/' |
+                   sed 's/homo_sapiens/$ensembl_organism/' |
+                   sed 's/Homo_sapiens/$ensembl_prefix/' |
+                   sed 's/hg19/$ucsc_genome_version/' > create_reference_dataset.pl
+                </action>
                 <action type="move_directory_files">
                     <source_directory>.</source_directory>
                     <destination_directory>$INSTALL_DIR</destination_directory>