view create_reference_dataset.xml @ 12:33e2235bf003

Add create_reference_dataset.xml
author Jim Johnson <jj@umn.edu>
date Sun, 09 Jun 2013 20:30:21 -0500
parents
children 85693cb5339f
line wrap: on
line source

<tool id="create_defusei_reference" name="Create DeFuse Reference" version="1.6.1">
 <description>create a defuse reference from Ensembl and UCSC sources</description>
 <!-- Tool dependencies. The generated shell script resolves samtools, bowtie,
      bowtie-build, blat, faToTwoBit, gmap and gmap_setup with `which`, so the
      packages below must put those executables on PATH.
      NOTE(review): the script also looks up R and Rscript, but no R package is
      declared here; confirm R is expected to come from the host environment.
      NOTE(review): kent is requested as version "latest" rather than a pinned
      release; presumably it supplies blat and faToTwoBit; verify. -->
 <requirements>
  <requirement type="package" version="0.6.1">defuse</requirement>
  <requirement type="package" version="0.1.18">samtools</requirement>
  <requirement type="package" version="1.0.0">bowtie</requirement>
  <requirement type="package" version="2013-05-09">gmap</requirement>
  <requirement type="package" version="latest">kent</requirement>
 </requirements>
  <!-- Run the generated "shscript" configfile with bash. No interpreter
       attribute is used: that attribute is meant for naming the interpreter of
       a script shipped in the tool directory, whereas /bin/bash is already an
       absolute executable. -->
  <command>/bin/bash $shscript</command>
 <!-- User-supplied settings. Each value is written unchanged into the
      generated deFuse config file by the defuse_config template. -->
 <inputs>
  <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
  <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
  <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
  <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
  <param name="mt_chromosome" type="text" value="MT" label="Mitochondrial Chromosome" />
  <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
  <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
  <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
 </inputs>
 <!-- Single output: the filled-in deFuse config file. The shell script copies
      the generated config onto this dataset and creates its extra files
      directory, which the config names as dataset_directory. -->
 <outputs>
  <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
 </outputs>
 <configfiles>
  <configfile name="defuse_config">
## Cheetah template for the deFuse configuration file.
## Lines starting with a single hash are deFuse config comments and are emitted.
## Text inside raw blocks is emitted verbatim, so the deFuse-style
## dollar-parenthesis references are left for deFuse to expand, not Cheetah.
## The __NAME__ tokens are placeholders that the companion shell script
## replaces with real paths (via sed) before deFuse reads this file.
#
# Configuration file for defuse
#
# At a minimum, change all values enclosed by []
#

# Directory where the defuse code was unpacked
## Default location in the tool/defuse directory  
# source_directory = ${__root_dir__}/tools/defuse
source_directory = __DEFUSE_PATH__

ensembl_version = $ensembl_version
ensembl_genome_version = $ensembl_genome_version
ucsc_genome_version = $ucsc_genome_version

# Directory where you want your dataset
dataset_directory = $config_txt.extra_files_path

#raw
# Input genome and gene models
gene_models                                 = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
genome_fasta                                = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa

# Repeat table from ucsc genome browser
repeats_filename                            = $(dataset_directory)/repeats.txt

# EST info downloaded from ucsc genome browser
est_fasta                                   = $(dataset_directory)/est.fa
est_alignments                              = $(dataset_directory)/intronEst.txt

# Unigene clusters downloaded from ncbi
unigene_fasta                               = $(dataset_directory)/Hs.seq.uniq
#end raw

# Paths to external tools
samtools_bin =  __SAMTOOLS_BIN__
bowtie_bin = __BOWTIE_BIN__
bowtie_build_bin = __BOWTIE_BUILD_BIN__
blat_bin = __BLAT_BIN__
fatotwobit_bin = __FATOTWOBIT_BIN__
gmap_bin = __GMAP_BIN__
gmap_setup_bin = __GMAP_SETUP_BIN__
r_bin = __R_BIN__
rscript_bin = __RSCRIPT_BIN__

#raw
# Directory for the gmap index
gmap_index_directory                        = $(dataset_directory)/gmap
#end raw

#raw
# Dataset files
dataset_prefix       = $(dataset_directory)/defuse
chromosome_prefix    = $(dataset_prefix).dna.chromosomes
exons_fasta          = $(dataset_prefix).exons.fa
cds_fasta            = $(dataset_prefix).cds.fa
cdna_regions         = $(dataset_prefix).cdna.regions
cdna_fasta           = $(dataset_prefix).cdna.fa
reference_fasta      = $(dataset_prefix).reference.fa
rrna_fasta           = $(dataset_prefix).rrna.fa
ig_gene_list         = $(dataset_prefix).ig.gene.list
repeats_regions      = $(dataset_directory)/repeats.regions
est_split_fasta1     = $(dataset_directory)/est.1.fa
est_split_fasta2     = $(dataset_directory)/est.2.fa
est_split_fasta3     = $(dataset_directory)/est.3.fa
est_split_fasta4     = $(dataset_directory)/est.4.fa
est_split_fasta5     = $(dataset_directory)/est.5.fa
est_split_fasta6     = $(dataset_directory)/est.6.fa
est_split_fasta7     = $(dataset_directory)/est.7.fa
est_split_fasta8     = $(dataset_directory)/est.8.fa
est_split_fasta9     = $(dataset_directory)/est.9.fa

# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
prefilter1           = $(unigene_fasta)

# deFuse scripts and tools
scripts_directory    = $(source_directory)/scripts
tools_directory      = $(source_directory)/tools
data_directory       = $(source_directory)/data
#end raw

#raw
# Bowtie parameters
bowtie_threads                              = 1
bowtie_quals                                = --phred33-quals
max_insert_size                             = 500
#end raw

# Parameters for building the dataset
chromosomes = $chromosomes
mt_chromosome = $mt_chromosome
gene_sources = $gene_sources
ig_gene_sources = $ig_gene_sources
rrna_gene_sources = $rrna_gene_sources

## NOTE(review): the mailto value in the block below is a fixed third-party
## address carried in this template; confirm whether it should be emitted.
#raw
# Blat sequences per job
num_blat_sequences                          = 10000

# Minimum gene fusion range
dna_concordant_length                       = 2000

# Trim length for discordant reads (split reads are not trimmed)
discord_read_trim                           = 50

# Calculate extra annotations, fusion splice index and interrupted index
calculate_extra_annotations                 = no

# Filtering parameters
clustering_precision                        = 0.95
span_count_threshold                        = 5
percent_identity_threshold                  = 0.90
split_min_anchor                            = 4
splice_bias                                 = 10
positive_controls                           = $(data_directory)/controls.txt
probability_threshold                       = 0.50

# Position density when calculating covariance
covariance_sampling_density                 = 0.01

# Number of reads for each job in split
reads_per_job                               = 1000000

# If you have command line 'mail' and wish to be notified
mailto                                      = andrew.mcpherson@gmail.com

# Remove temp files
remove_job_files                            = yes
remove_job_temp_files                       = yes
#end raw
  </configfile>
  <configfile name="shscript">
#!/bin/bash
## Cheetah helper: a literal ampersand is not allowed in XML text, so the
## shell "and" operator is spelled with this variable.
#set $amp = chr(38)
## Substitute real paths for the __NAME__ placeholders in the config file.
## A placeholder is rewritten only when it is present in the file and, for the
## external tools, only when `which` finds the executable on PATH; otherwise
## the placeholder is left in place.
if grep -q __DEFUSE_PATH__ "$defuse_config"; then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" "$defuse_config"; fi
if grep -q __SAMTOOLS_BIN__ "$defuse_config" ${amp}${amp} SAMTOOLS_BIN=`which samtools`; then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" "$defuse_config"; fi
if grep -q __BOWTIE_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BIN=`which bowtie`; then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" "$defuse_config"; fi
if grep -q __BOWTIE_BUILD_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`; then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" "$defuse_config"; fi
if grep -q __BLAT_BIN__ "$defuse_config" ${amp}${amp} BLAT_BIN=`which blat`; then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" "$defuse_config"; fi
if grep -q __FATOTWOBIT_BIN__ "$defuse_config" ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`; then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" "$defuse_config"; fi
if grep -q __GMAP_BIN__ "$defuse_config" ${amp}${amp} GMAP_BIN=`which gmap`; then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" "$defuse_config"; fi
if grep -q __GMAP_SETUP_BIN__ "$defuse_config" ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`; then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" "$defuse_config"; fi
if grep -q __R_BIN__ "$defuse_config" ${amp}${amp} R_BIN=`which R`; then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" "$defuse_config"; fi
if grep -q __RSCRIPT_BIN__ "$defuse_config" ${amp}${amp} RSCRIPT_BIN=`which Rscript`; then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" "$defuse_config"; fi

## copy the filled-in config to the output dataset
cp "$defuse_config" "$config_txt"
## create the dataset directory (the output's extra files path)
mkdir -p "$config_txt.extra_files_path"
## build the reference dataset with deFuse's create_reference_dataset.pl
perl "\${DEFUSE_PATH}/scripts/create_reference_dataset.pl" -c "$defuse_config"
  </configfile>
 </configfiles>

 <tests>
 </tests>
 <help>
**DeFuse**

DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.  

Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138

.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page

------

**Inputs**

DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).

If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.

DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
    - genome_fasta from Ensembl 
    - gene_models from Ensembl 
    - repeats_filename from UCSC RepeatMasker rmsk.txt
    - est_fasta from UCSC
    - est_alignments from UCSC intronEst.txt
    - unigene_fasta from NCBI

.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2

------

**Outputs**

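A DeFuse run adds 5 outputs to the galaxy history: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates. (This reference-creation tool itself produces only the config.txt dataset; the reference files are written to that dataset's extra files directory.)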

DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt. 

The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):

 - **Identification**
    - cluster_id : random identifier assigned to each prediction
    - library_name : library name given on the command line of defuse
    - gene1 : ensembl id of gene 1
    - gene2 : ensembl id of gene 2
    - gene_name1 : name of gene 1
    - gene_name2 : name of gene 2
 - **Evidence**
    - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
    - concordant_ratio : proportion of spanning reads considered concordant by blat
    - denovo_min_count : minimum kmer count across denovo assembled sequence
    - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
    - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
    - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
    - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
    - min_map_count : minimum of the number of genomic mappings for each spanning read
    - max_map_count : maximum of the number of genomic mappings for each spanning read
    - mean_map_count : average of the number of genomic mappings for each spanning read
    - num_multi_map : number of spanning reads that map to more than one genomic location
    - span_count : number of spanning reads supporting the fusion
    - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
    - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
    - span_coverage_min : minimum of span_coverage1 and span_coverage2
    - span_coverage_max : maximum of span_coverage1 and span_coverage2
    - splitr_count : number of split reads supporting the prediction
    - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
    - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
    - splitr_sequence : fusion sequence predicted by split reads
    - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
 - **Annotation**
    - adjacent : fusion between adjacent genes
    - altsplice : fusion likely the product of alternative splicing between adjacent genes
    - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
    - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
    - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
    - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
    - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
    - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
    - deletion : fusion produced by a genomic deletion
    - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
    - eversion : fusion produced by a genomic eversion
    - exonboundaries : fusion splice at exon boundaries
    - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
    - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
    - gene_chromosome1 : chromosome of gene 1
    - gene_chromosome2 : chromosome of gene 2
    - gene_end1 : end position for gene 1
    - gene_end2 : end position for gene 2
    - gene_location1 : location of breakpoint in gene 1
    - gene_location2 : location of breakpoint in gene 2
    - gene_start1 : start of gene 1
    - gene_start2 : start of gene 2
    - gene_strand1 : strand of gene 1
    - gene_strand2 : strand of gene 2
    - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
    - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
    - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
    - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
    - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
    - interchromosomal : fusion produced by an interchromosomal translocation
    - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
    - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
    - inversion : fusion produced by genomic inversion
    - orf : fusion combines genes in a way that preserves a reading frame
    - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt)
    - read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
    - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
    - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
    - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
    - splice_score : number of nucleotides similar to GTAG at fusion splice
    - num_splice_variants : number of potential splice variants for this gene pair
    - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
    - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1


**Example**

results.tsv::

  cluster_id	splitr_sequence	splitr_count	splitr_span_pvalue	splitr_pos_pvalue	splitr_min_pvalue	adjacent	altsplice	break_adj_entropy1	break_adj_entropy2	break_adj_entropy_min	break_predict	breakpoint_homology	breakseqs_estislands_percident	cdna_breakseqs_percident	concordant_ratio	deletion	est_breakseqs_percident	eversion	exonboundaries	expression1	expression2	gene1	gene2	gene_align_strand1	gene_align_strand2	gene_chromosome1	gene_chromosome2	gene_end1	gene_end2	gene_location1	gene_location2	gene_name1	gene_name2	gene_start1	gene_start2	gene_strand1	gene_strand2	genome_breakseqs_percident	genomic_break_pos1	genomic_break_pos2	genomic_strand1	genomic_strand2	interchromosomal	interrupted_index1	interrupted_index2	inversion	library_name	max_map_count	max_repeat_proportion	mean_map_count	min_map_count	num_multi_map	num_splice_variants	orf	read_through	repeat_proportion1	repeat_proportion2	span_count	span_coverage1	span_coverage2	span_coverage_max	span_coverage_min	splice_score	splicing_index1	splicing_index2	
  1169	GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT	2	0.000436307890680442	0.110748295953850	0.0880671602973091	N	Y	3.19872427442695	3.48337348351473	3.19872427442695	splitr	0	0	0	0	Y	0	N	N	0	0	ENSG00000105549	ENSG00000213753	+	-	19	19	376013	59111168	intron	upstream	THEG	AC016629.2	361750	59084870	-	+	0	375099	386594	+	-	N	8.34107429512245	-	N	output_dir	82	0.677852348993289	40.6666666666667	1	11	1	N	N	0.361271676300578	0.677852348993289	12	0.758602776578432	0.569678713445872	0.758602776578432	0.569678713445872	2	0.416666666666667	-	
  3596	TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG	250	7.00711162298275e-72	0.00912124762512338	0.00684237452309549	N	N	3.31745197152461	3.47233119514066	3.31745197152461	splitr	7	0.0157657657657656	0	0	N	0.0135135135135136	N	N	0	0	ENSG00000156860	ENSG00000212932	-	+	16	21	30682131	48111157	coding	upstream	FBRS	RPL23AP4	30670289	48110676	+	+	0.0157657657657656	30680678	9827473	-	+	Y	-	-	N	output_dir	2	1	1.11111111111111	1	1	1	N	N	0	1	9	0.325530693397641	0.296465452915709	0.325530693397641	0.296465452915709	2	-	-	

 </help>
</tool>