# HG changeset patch # User Jim Johnson # Date 1371843971 18000 # Node ID 1af6f32ff592a708385103f872ca02dc340ec14b # Parent 547d8db4673ebe3223fc14bf007e91736e1c21f2 Add datamanager, move to defuse_reference.loc diff -r 547d8db4673e -r 1af6f32ff592 create_reference_dataset.xml --- a/create_reference_dataset.xml Sat Jun 15 14:36:47 2013 -0500 +++ b/create_reference_dataset.xml Fri Jun 21 14:46:11 2013 -0500 @@ -7,7 +7,7 @@ gmap kent - /bin/bash $shscript + /bin/bash $defuse_script @@ -112,7 +112,7 @@ - + @@ -124,18 +124,17 @@ -#import ast # # Configuration file for defuse # -# At a minimum, change all values enclused by [] +# Variables that desiganate the PATH to an application, e.g. __SAMTOOLS_BIN__ +# will be set by the runtime script using the ENV PATH # # Directory where the defuse code was unpacked -## Default location in the tool/defuse directory -# source_directory = ${__root_dir__}/tools/defuse source_directory = __DEFUSE_PATH__ +# Organism IDs ensembl_organism = $genome.ensembl_organism ensembl_prefix = $genome.ensembl_prefix ensembl_version = $genome.ensembl_version @@ -210,13 +209,6 @@ data_directory = $(source_directory)/data #end raw -#raw -# Bowtie parameters -bowtie_threads = 1 -bowtie_quals = --phred33-quals -max_insert_size = 500 -#end raw - # Parameters for building the dataset chromosomes = $genome.chromosomes mt_chromosome = $genome.mt_chromosome @@ -225,42 +217,12 @@ rrna_gene_sources = $genome.rrna_gene_sources #raw -# Blat sequences per job -num_blat_sequences = 10000 - -# Minimum gene fusion range -dna_concordant_length = 2000 - -# Trim length for discordant reads (split reads are not trimmed) -discord_read_trim = 50 - -# Calculate extra annotations, fusion splice index and interrupted index -calculate_extra_annotations = no - -# Filtering parameters -clustering_precision = 0.95 -span_count_threshold = 5 -percent_identity_threshold = 0.90 -split_min_anchor = 4 -splice_bias = 10 -positive_controls = $(data_directory)/controls.txt 
-probability_threshold = 0.50 - -# Position density when calculating covariance -covariance_sampling_density = 0.01 - -# Number of reads for each job in split -reads_per_job = 1000000 - -# If you have command line 'mail' and wish to be notified -mailto = andrew.mcpherson@gmail.com - # Remove temp files remove_job_files = yes remove_job_temp_files = yes #end raw - + #!/bin/bash ## define some things for cheetah proccessing #set $amp = chr(38) @@ -291,7 +253,7 @@ **DeFuse** -DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6.1_ manual for details. +DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details. DeFuse uses a Reference Dataset to search for gene fusions. 
The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_: - genome_fasta from Ensembl diff -r 547d8db4673e -r 1af6f32ff592 data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Fri Jun 21 14:46:11 2013 -0500 @@ -0,0 +1,23 @@ + + + + + + + + + + + ${dbkey}/defuse + + + ${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/defuse/${dbkey}.config + + + + + + + diff -r 547d8db4673e -r 1af6f32ff592 datamanager_create_reference.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datamanager_create_reference.py Fri Jun 21 14:46:11 2013 -0500 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +import sys +import os +import re +import tempfile +import subprocess +import fileinput +import shutil +import optparse +import urllib2 +from ftplib import FTP +import tarfile + +from galaxy.util.json import from_json_string, to_json_string + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit(1) + +def get_config_dict(config,dataset_directory=None): + keys = ['dataset_directory','ensembl_organism','ensembl_prefix','ensembl_version','ensembl_genome_version','ucsc_genome_version','ncbi_organism','ncbi_prefix','chromosomes','mt_chromosome','gene_sources','ig_gene_sources','rrna_gene_sources'] + pat = '^([^=]+?)\s*=\s*(.*)$' + config_dict = {} + try: + fh = open(config) + for i,l in enumerate(fh): + line = l.strip() + if line.startswith('#'): + continue + m = re.match(pat,line) + if m and len(m.groups()) == 2: + (k,v) = m.groups() + if k in keys: + config_dict[k] = v + except Exception, e: + stop_err( 'Error parsing %s %s\n' % (config,str( e )) ) + else: + fh.close() + if dataset_directory: + config_dict['dataset_directory'] = dataset_directory + return config_dict + +def run_defuse_script(data_manager_dict, params, target_directory, dbkey, description, config, script): + if not os.path.isdir(target_directory): + os.makedirs(target_directory) + ## Name the config consistently with data_manager_conf.xml + # copy the config file to 
the target_directory + # when DataManager moves files to their tool-data location, the config will get moved as well, + # and the value_translation in data_manager_conf.xml will tell us the new location + # defuse.xml will use the path to this config file to set the dataset_directory + config_name = '%s.config' % dbkey + defuse_config = os.path.join( target_directory, config_name) + shutil.copyfile(config,defuse_config) + cmd = "/bin/bash %s %s" % (script,target_directory) + # Run + try: + tmp_out = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp_out, 'wb' ) + tmp_err = tempfile.NamedTemporaryFile().name + tmp_stderr = open( tmp_err, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_err, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stdout.close() + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + + # TODO: look for errors in program output. 
+ except Exception, e: + stop_err( 'Error creating defuse reference:\n' + str( e ) ) + config_dict = get_config_dict(config, dataset_directory=target_directory) + data_table_entry = dict(unique_id=dbkey, dbkey=dbkey, name=description, path=config_name) + _add_data_table_entry( data_manager_dict, data_table_entry ) +def _add_data_table_entry( data_manager_dict, data_table_entry ): + data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) + data_manager_dict['data_tables']['defuse'] = data_manager_dict['data_tables'].get( 'defuse', [] ) + data_manager_dict['data_tables']['defuse'].append( data_table_entry ) + return data_manager_dict + +def main(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-k', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey' ) + parser.add_option( '-d', '--description', dest='description', action='store', type="string", default=None, help='description' ) + parser.add_option( '-c', '--defuse_config', dest='defuse_config', action='store', type="string", default=None, help='defuse_config' ) + parser.add_option( '-s', '--defuse_script', dest='defuse_script', action='store', type="string", default=None, help='defuse_script' ) + (options, args) = parser.parse_args() + + filename = args[0] + + params = from_json_string( open( filename ).read() ) + target_directory = params[ 'output_data' ][0]['extra_files_path'] + os.mkdir( target_directory ) + data_manager_dict = {} + + + #Create Defuse Reference Data + run_defuse_script( data_manager_dict, params, target_directory, options.dbkey, options.description,options.defuse_config,options.defuse_script) + + #save info to json file + open( filename, 'wb' ).write( to_json_string( data_manager_dict ) ) + +if __name__ == "__main__": main() + diff -r 547d8db4673e -r 1af6f32ff592 datamanager_create_reference.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datamanager_create_reference.xml Fri Jun 21 14:46:11 2013 -0500 @@ 
-0,0 +1,289 @@ + + create a defuse reference from Ensembl and UCSC sources + + defuse + samtools + bowtie + gmap + kent + + datamanager_create_reference.py + --dbkey $genome.ensembl_genome_version + --description "$genome.ensembl_prefix $genome.ensembl_genome_version ($genome.ucsc_genome_version)" + --defuse_config $defuse_config + --defuse_script $defuse_script + $out_file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Examples: + Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT + Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT + Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT + ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ ) + + + + + + + + + + + + + + + + + + + +# +# Configuration file for defuse +# +# Variables that desiganate the PATH to an application, e.g. __SAMTOOLS_BIN__ +# will be set by the runtime script using the ENV PATH +# + +# Directory where the defuse code was unpacked +source_directory = __DEFUSE_PATH__ + +# Organism IDs +ensembl_organism = $genome.ensembl_organism +ensembl_prefix = $genome.ensembl_prefix +ensembl_version = $genome.ensembl_version +ensembl_genome_version = $genome.ensembl_genome_version +ucsc_genome_version = $genome.ucsc_genome_version +ncbi_organism = $genome.ncbi_organism +ncbi_prefix = $genome.ncbi_prefix + +# Directory where you want your dataset +dataset_directory = __DATASET_DIRECTORY__ + +#raw +# Input genome and gene models +gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf +genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa + +# Repeat table from ucsc genome browser +repeats_filename = $(dataset_directory)/repeats.txt + +# EST info downloaded from ucsc genome browser +est_fasta = 
$(dataset_directory)/est.fa +est_alignments = $(dataset_directory)/intronEst.txt + +# Unigene clusters downloaded from ncbi +unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq +#end raw + +# Paths to external tools +samtools_bin = __SAMTOOLS_BIN__ +bowtie_bin = __BOWTIE_BIN__ +bowtie_build_bin = __BOWTIE_BUILD_BIN__ +blat_bin = __BLAT_BIN__ +fatotwobit_bin = __FATOTWOBIT_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_setup_bin = __GMAP_SETUP_BIN__ +r_bin = __R_BIN__ +rscript_bin = __RSCRIPT_BIN__ + +#raw +# Directory where you want your dataset +gmap_index_directory = $(dataset_directory)/gmap +#end raw + +#raw +# Dataset files +dataset_prefix = $(dataset_directory)/defuse +chromosome_prefix = $(dataset_prefix).dna.chromosomes +exons_fasta = $(dataset_prefix).exons.fa +cds_fasta = $(dataset_prefix).cds.fa +cdna_regions = $(dataset_prefix).cdna.regions +cdna_fasta = $(dataset_prefix).cdna.fa +reference_fasta = $(dataset_prefix).reference.fa +rrna_fasta = $(dataset_prefix).rrna.fa +ig_gene_list = $(dataset_prefix).ig.gene.list +repeats_regions = $(dataset_directory)/repeats.regions +est_split_fasta1 = $(dataset_directory)/est.1.fa +est_split_fasta2 = $(dataset_directory)/est.2.fa +est_split_fasta3 = $(dataset_directory)/est.3.fa +est_split_fasta4 = $(dataset_directory)/est.4.fa +est_split_fasta5 = $(dataset_directory)/est.5.fa +est_split_fasta6 = $(dataset_directory)/est.6.fa +est_split_fasta7 = $(dataset_directory)/est.7.fa +est_split_fasta8 = $(dataset_directory)/est.8.fa +est_split_fasta9 = $(dataset_directory)/est.9.fa + +# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs +prefilter1 = $(unigene_fasta) + +# deFuse scripts and tools +scripts_directory = $(source_directory)/scripts +tools_directory = $(source_directory)/tools +data_directory = $(source_directory)/data +#end raw + +# Parameters for building the dataset +chromosomes = $genome.chromosomes +mt_chromosome = $genome.mt_chromosome +gene_sources = $genome.gene_sources 
+ig_gene_sources = $genome.ig_gene_sources +rrna_gene_sources = $genome.rrna_gene_sources + +#raw +# Remove temp files +remove_job_files = yes +remove_job_temp_files = yes +#end raw + + +#!/bin/bash +## define some things for cheetah proccessing +#set $amp = chr(38) +#set $gt = chr(62) +## substitute pathnames into config file +if `grep __DATASET_DIRECTORY__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DATASET_DIRECTORY__#\$1#" $defuse_config; fi +if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi +if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi +if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi +if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi +if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi +if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi +if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi +if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" 
$defuse_config; fi +if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi +if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi +## copy config to output +cp $defuse_config \$1/defuse_config.txt +## Run the create_reference_dataset.pl +perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config + + + + + + +**DeFuse** + +DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details. + +DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_: + - genome_fasta from Ensembl + - gene_models from Ensembl + - repeats_filename from UCSC RepeatMasker rmsk.txt + - est_fasta from UCSC + - est_alignments from UCSC intronEst.txt + - unigene_fasta from NCBI + +The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours. + + +It will generate the reference data for the deFuse Galaxy tool. + +Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138 + +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page + +.. 
_DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1 + +------ + +**Outputs** + +The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths. + + + diff -r 547d8db4673e -r 1af6f32ff592 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Fri Jun 21 14:46:11 2013 -0500 @@ -0,0 +1,6 @@ + + + + + + diff -r 547d8db4673e -r 1af6f32ff592 defuse.xml --- a/defuse.xml Sat Jun 15 14:36:47 2013 -0500 +++ b/defuse.xml Fri Jun 21 14:46:11 2013 -0500 @@ -12,66 +12,66 @@ - - - + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Position density when calculating covariance - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Position density when calculating covariance + + + + + + + + + + + @@ -83,20 +83,50 @@ keep_output == True - do_get_reads == True + -#import ast +#import re +#set $ds = chr(36) #if $refGenomeSource.genomeSource == "history": -#include raw $refGenomeSource.config.__str__ +#set config_file = $refGenomeSource.config.__str__ +#set #else -#set $ref_dict = dict($ast.literal_eval($refGenomeSource.index.value)) +#set config_file = $refGenomeSource.index.value +#end if +#set pat = '^\s*([^#=][^=]*?)\s*=\s*(.*?)\s*$' +#set fh = open() +#set keys = ['dataset_directory','ensembl_organism','ensembl_prefix','ensembl_version','ensembl_genome_version','ucsc_genome_version','ncbi_organism','ncbi_prefix','chromosomes','mt_chromosome','gene_sources','ig_gene_sources','rrna_gene_sources'] +#set kv = [] +#for $line in $fh: + #set m = $re.match($pat,$line) + #if $m and len($m.groups()) == 2: + ## #echo $line + #if $m.groups()[0] in keys: + #set k = $m.groups()[0] + #if k == 'dataset_directory' and $refGenomeSource.genomeSource == "indexed": + ## The DataManager is conifgured to place the config file in the same directory as the defuse_data: dataset_directory + #set 
v = $os.path.dirname($config_file) + #else: + #set v = $m.groups()[1] + #end if + #set kv = $kv + [[$k, $v]] + #end if + #end if +#end for +## #echo $kv +#set ref_dict = dict($kv) +## #echo $ref_dict +## include raw $refGenomeSource.config.__str__ # # Configuration file for defuse # @@ -106,12 +136,7 @@ # Directory where the defuse code was unpacked ## Default location in the tool/defuse directory # source_directory = ${__root_dir__}/tools/defuse -source_directory = #slurp -#try -$ref_dict['source_directory'] -#except -__DEFUSE_PATH__ -#end try +source_directory = __DEFUSE_PATH__ # Directory where you want your dataset dataset_directory = #slurp @@ -166,60 +191,15 @@ #end try # Paths to external tools -bowtie_bin = #slurp -#try -$ref_dict['bowtie_bin'] -#except -__BOWTIE_BIN__ -#end try -bowtie_build_bin = #slurp -#try -$ref_dict['bowtie_build_bin'] -#except -__BOWTIE_BUILD_BIN__ -#end try -blat_bin = #slurp -#try -$ref_dict['blat_bin'] -#except -__BLAT_BIN__ -#end try -fatotwobit_bin = #slurp -#try -$ref_dict['fatotwobit_bin'] -#except -__FATOTWOBIT_BIN__ -#end try -gmap_bin = #slurp -#try -$ref_dict['gmap_bin'] -#except -__GMAP_BIN__ -#end try -gmap_bin = #slurp -#try -$ref_dict['gmap_bin'] -#except -__GMAP_BIN__ -#end try -gmap_setup_bin = #slurp -#try -$ref_dict['gmap_setup_bin'] -#except -__GMAP_SETUP_BIN__ -#end try -r_bin = #slurp -#try -$ref_dict['r_bin'] -#except -__R_BIN__ -#end try -rscript_bin = #slurp -#try -$ref_dict['rscript_bin'] -#except -__RSCRIPT_BIN__ -#end try +bowtie_bin = __BOWTIE_BIN__ +bowtie_build_bin = __BOWTIE_BUILD_BIN__ +blat_bin = __BLAT_BIN__ +fatotwobit_bin = __FATOTWOBIT_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_bin = __GMAP_BIN__ +gmap_setup_bin = __GMAP_SETUP_BIN__ +r_bin = __R_BIN__ +rscript_bin = __RSCRIPT_BIN__ # Directory where you want your dataset gmap_index_directory = #slurp @@ -284,8 +264,8 @@ --phred33-quals #end try max_insert_size = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and 
$refGenomeSource.defuse_param.max_insert_size.__str__ != "": -$refGenomeSource.defuse_param.max_insert_size +#if $defuse_param.settings == "full" and $defuse_param.max_insert_size.__str__ != "": +$defuse_param.max_insert_size #else #try $ref_dict['max_insert_size'] @@ -336,8 +316,8 @@ # Minimum gene fusion range dna_concordant_length = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.dna_concordant_length.__str__ != "": -$refGenomeSource.defuse_param.dna_concordant_length +#if $defuse_param.settings == "full" and $defuse_param.dna_concordant_length.__str__ != "": +$defuse_param.dna_concordant_length #else #try $ref_dict['dna_concordant_length'] @@ -348,8 +328,8 @@ # Trim length for discordant reads (split reads are not trimmed) discord_read_trim = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.discord_read_trim.__str__ != "": -$refGenomeSource.defuse_param.discord_read_trim +#if $defuse_param.settings == "full" and $defuse_param.discord_read_trim.__str__ != "": +$defuse_param.discord_read_trim #else #try $ref_dict['discord_read_trim'] @@ -357,11 +337,21 @@ 50 #end try #end if - +# Calculate extra annotations, fusion splice index and interrupted index +calculate_extra_annotations = #slurp +#if $defuse_param.settings == "full" and $defuse_param.calculate_extra_annotations.__str__ != "": +$defuse_param.calculate_extra_annotations +#else +#try +$ref_dict['calculate_extra_annotations'] +#except +no +#end try +#end if # Filtering parameters clustering_precision = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.clustering_precision.__str__ != "" -$refGenomeSource.defuse_param.clustering_precision +#if $defuse_param.settings == "full" and $defuse_param.clustering_precision.__str__ != "" +$defuse_param.clustering_precision #else #try $ref_dict['clustering_precision'] @@ -370,8 +360,8 @@ #end try #end if span_count_threshold = #slurp 
-#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.span_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.span_count_threshold +#if $defuse_param.settings == "full" and $defuse_param.span_count_threshold.__str__ != "" +$defuse_param.span_count_threshold #else #try $ref_dict['span_count_threshold'] @@ -379,19 +369,9 @@ 5 #end try #end if -split_count_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_count_threshold.__str__ != "" -$refGenomeSource.defuse_param.split_count_threshold -#else -#try -$ref_dict['split_count_threshold'] -#except -3 -#end try -#end if percent_identity_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.percent_identity_threshold.__str__ != "" -$refGenomeSource.defuse_param.percent_identity_threshold +#if $defuse_param.settings == "full" and $defuse_param.percent_identity_threshold.__str__ != "" +$defuse_param.percent_identity_threshold #else #try $ref_dict['percent_identity_threshold'] @@ -399,29 +379,9 @@ 0.90 #end try #end if -max_dist_pos = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_dist_pos.__str__ != "" -$refGenomeSource.defuse_param.max_dist_pos -#else -#try -$ref_dict['max_dist_pos'] -#except -600 -#end try -#end if -num_dist_genes = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.num_dist_genes.__str__ != "" -$refGenomeSource.defuse_param.num_dist_genes -#else -#try -$ref_dict['num_dist_genes'] -#except -500 -#end try -#end if split_min_anchor = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.split_min_anchor.__str__ != "" -$refGenomeSource.defuse_param.split_min_anchor +#if $defuse_param.settings == "full" and $defuse_param.split_min_anchor.__str__ != "" +$defuse_param.split_min_anchor #else #try 
$ref_dict['split_min_anchor'] @@ -429,19 +389,9 @@ 4 #end try #end if -max_concordant_ratio = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.max_concordant_ratio.__str__ != "" -$refGenomeSource.defuse_param.max_concordant_ratio -#else -#try -$ref_dict['max_concordant_ratio'] -#except -0.1 -#end try -#end if splice_bias = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.splice_bias.__str__ != "" -$refGenomeSource.defuse_param.splice_bias +#if $defuse_param.settings == "full" and $defuse_param.splice_bias.__str__ != "" +$defuse_param.splice_bias #else #try $ref_dict['splice_bias'] @@ -450,8 +400,8 @@ #end try #end if denovo_assembly = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.denovo_assembly.__str__ != "" -$refGenomeSource.defuse_param.denovo_assembly +#if $defuse_param.settings == "full" and $defuse_param.denovo_assembly.__str__ != "" +$defuse_param.denovo_assembly #else #try $ref_dict['denovo_assembly'] @@ -460,8 +410,8 @@ #end try #end if probability_threshold = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.probability_threshold.__str__ != "" -$refGenomeSource.defuse_param.probability_threshold +#if $defuse_param.settings == "full" and $defuse_param.probability_threshold.__str__ != "" +$defuse_param.probability_threshold #else #try $ref_dict['probability_threshold'] @@ -473,8 +423,8 @@ # Position density when calculating covariance covariance_sampling_density = #slurp -#if $refGenomeSource.defuse_param.settings == "full" and $refGenomeSource.defuse_param.covariance_sampling_density.__str__ != "" -$refGenomeSource.defuse_param.covariance_sampling_density +#if $defuse_param.settings == "full" and $defuse_param.covariance_sampling_density.__str__ != "" +$defuse_param.covariance_sampling_density #else #try $ref_dict['covariance_sampling_density'] @@ -482,13 +432,17 @@ 0.01 
#end try #end if - - # Number of reads for each job in split -reads_per_job = 1000000 - -# Number of regions for each breakpoint sequence job in split -regions_per_job = 20 +reads_per_job = #slurp +#if $defuse_param.settings == "full" and $defuse_param.reads_per_job.__str__ != "" +$defuse_param.reads_per_job +#else +#try +$ref_dict['reads_per_job'] +#except +1000000 +#end try +#end if #raw # If you have command line 'mail' and wish to be notified @@ -498,40 +452,8 @@ remove_job_files = yes remove_job_temp_files = yes -# Converting to fastq -# Fastq converter config format 1 for reads stored in separate files for each end -# data_lane_rexex_N is a perl regex which stores the lane id in $1 -# data_end_regex_N is a perl regex which stores the end, 1 or 2, in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_convert_N is the associated conversion utility that takes data at stdin and outputs fastq at stdout -# Fastq converter config format 2 for reads stored in separate files for each end -# data_lane_regex_N is a perl regex which stores the lane id in $1 -# data_compress_regex_N is a perl regex which stores the compression extension in $1 -# data_end1_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 1 at stdout -# data_end2_converter_N is the associated conversion utility that takes data at stdin and outputs fastq for end 2 at stdout - -data_lane_regex_1 = ^(.+)_[12]_export\.txt.*$ -data_end_regex_1 = ^.+_([12])_export\.txt.*$ -data_compress_regex_1 = ^.+_[12]_export\.txt(.*)$ -data_converter_1 = $(scripts_directory)/fq_all2std.pl export2std - -data_lane_regex_2 = ^(.+)_[12]_concat_qseq\.txt.*$ -data_end_regex_2 = ^.+_([12])_concat_qseq\.txt.*$ -data_compress_regex_2 = ^.+_[12]_concat_qseq\.txt(.*)$ -data_converter_2 = $(scripts_directory)/qseq2fastq.pl - -data_lane_regex_3 = ^(.+)\.bam.*$ -data_compress_regex_3 = ^.+\.bam(.*)$ -data_end1_converter_3 = samtools view - | 
filter_sam_mate.pl 1 | sam_to_fastq.pl -data_end2_converter_3 = samtools view - | filter_sam_mate.pl 2 | sam_to_fastq.pl - -data_lane_regex_4 = ^(.+).[12].fastq.*$ -data_end_regex_4 = ^.+.([12]).fastq.*$ -data_compress_regex_4 = ^.+.[12].fastq(.*)$ -data_converter_4 = cat #end raw -#end if @@ -602,7 +524,7 @@ perl \${DEFUSE_PATH}/scripts/defuse.pl -c $defuse_config -1 data_dir/reads_1.fastq -2 data_dir/reads_2.fastq -o output_dir -p 8 ## copy primary results to output datasets if [ -e output_dir/log/defuse.log ]; then cp output_dir/log/defuse.log $defuse_log; fi -if [ -e output_dir/results.tsv ]; then cp output_dir/results.tsv $results_tsv; fi +## if [ -e output_dir/results.tsv ]; then cp output_dir/results.tsv $results_tsv; fi if [ -e output_dir/results.filtered.tsv ]; then cp output_dir/results.filtered.tsv $results_filtered_tsv; fi if [ -e output_dir/results.classify.tsv ]; then cp output_dir/results.classify.tsv $results_classify_tsv; fi ## create html with links for output_dir @@ -650,7 +572,7 @@ If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq. -DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_: +DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_: - genome_fasta from Ensembl - gene_models from Ensembl - repeats_filename from UCSC RepeatMasker rmsk.txt @@ -658,7 +580,7 @@ - est_alignments from UCSC intronEst.txt - unigene_fasta from NCBI -.. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1 +.. 
_DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2 ------