Mercurial > repos > jjohnson > defuse
view create_reference_dataset.xml @ 12:33e2235bf003
Add create_reference_dataset.xml
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Sun, 09 Jun 2013 20:30:21 -0500 |
parents | |
children | 85693cb5339f |
line wrap: on
line source
<tool id="create_defusei_reference" name="Create DeFuse Reference" version="1.6.1">
 <!--
   Builds a deFuse reference dataset.
   Two Cheetah config files are rendered per job:
     defuse_config : a deFuse config.txt whose dataset_directory is the extra-files
                     directory of the single output dataset (config_txt)
     shscript      : a bash script that fills the __PLACEHOLDER__ tool paths in
                     defuse_config, copies it to the output, and runs
                     create_reference_dataset.pl
 -->
 <description>create a defuse reference from Ensembl and UCSC sources</description>
 <requirements>
  <requirement type="package" version="0.6.1">defuse</requirement>
  <requirement type="package" version="0.1.18">samtools</requirement>
  <requirement type="package" version="1.0.0">bowtie</requirement>
  <requirement type="package" version="2013-05-09">gmap</requirement>
  <requirement type="package" version="latest">kent</requirement>
 </requirements>
 <!-- The job simply executes the rendered shscript config file with bash. -->
 <command interpreter="command"> /bin/bash $shscript </command>
 <inputs>
  <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
  <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
  <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
  <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
  <param name="mt_chromosome" type="text" value="MT" label="Mitochondrial Chromosome" />
  <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
  <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
  <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
 </inputs>
 <outputs>
  <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
 </outputs>
 <configfiles>
  <configfile name="defuse_config">
## Lines starting with a single hash are emitted into the generated config as comments.
## Text between the raw directives is passed through untouched so that deFuse's own
## $(name) references are not interpreted by Cheetah.
## The __NAME__ tokens are placeholders that shscript replaces with real paths at run time.
#
# Configuration file for defuse
#
# At a minimum, change all values enclosed by []
#

# Directory where the defuse code was unpacked
## Default location in the tool/defuse directory
# source_directory = ${__root_dir__}/tools/defuse
source_directory = __DEFUSE_PATH__

ensembl_version = $ensembl_version
ensembl_genome_version = $ensembl_genome_version
ucsc_genome_version = $ucsc_genome_version

# Directory where you want your dataset
dataset_directory = $config_txt.extra_files_path

#raw
# Input genome and gene models
gene_models = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
genome_fasta = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa

# Repeat table from ucsc genome browser
repeats_filename = $(dataset_directory)/repeats.txt

# EST info downloaded from ucsc genome browser
est_fasta = $(dataset_directory)/est.fa
est_alignments = $(dataset_directory)/intronEst.txt

# Unigene clusters downloaded from ncbi
unigene_fasta = $(dataset_directory)/Hs.seq.uniq
#end raw

# Paths to external tools
samtools_bin = __SAMTOOLS_BIN__
bowtie_bin = __BOWTIE_BIN__
bowtie_build_bin = __BOWTIE_BUILD_BIN__
blat_bin = __BLAT_BIN__
fatotwobit_bin = __FATOTWOBIT_BIN__
gmap_bin = __GMAP_BIN__
gmap_setup_bin = __GMAP_SETUP_BIN__
r_bin = __R_BIN__
rscript_bin = __RSCRIPT_BIN__

#raw
# Directory where you want your dataset
gmap_index_directory = $(dataset_directory)/gmap
#end raw

#raw
# Dataset files
dataset_prefix       = $(dataset_directory)/defuse
chromosome_prefix    = $(dataset_prefix).dna.chromosomes
exons_fasta          = $(dataset_prefix).exons.fa
cds_fasta            = $(dataset_prefix).cds.fa
cdna_regions         = $(dataset_prefix).cdna.regions
cdna_fasta           = $(dataset_prefix).cdna.fa
reference_fasta      = $(dataset_prefix).reference.fa
rrna_fasta           = $(dataset_prefix).rrna.fa
ig_gene_list         = $(dataset_prefix).ig.gene.list
repeats_regions      = $(dataset_directory)/repeats.regions
est_split_fasta1     = $(dataset_directory)/est.1.fa
est_split_fasta2     = $(dataset_directory)/est.2.fa
est_split_fasta3     = $(dataset_directory)/est.3.fa
est_split_fasta4     = $(dataset_directory)/est.4.fa
est_split_fasta5     = $(dataset_directory)/est.5.fa
est_split_fasta6     = $(dataset_directory)/est.6.fa
est_split_fasta7     = $(dataset_directory)/est.7.fa
est_split_fasta8     = $(dataset_directory)/est.8.fa
est_split_fasta9     = $(dataset_directory)/est.9.fa

# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
prefilter1           = $(unigene_fasta)

# deFuse scripts and tools
scripts_directory    = $(source_directory)/scripts
tools_directory      = $(source_directory)/tools
data_directory       = $(source_directory)/data
#end raw

#raw
# Bowtie parameters
bowtie_threads       = 1
bowtie_quals         = --phred33-quals
max_insert_size      = 500
#end raw

# Parameters for building the dataset
chromosomes = $chromosomes
mt_chromosome = $mt_chromosome
gene_sources = $gene_sources
ig_gene_sources = $ig_gene_sources
rrna_gene_sources = $rrna_gene_sources

#raw
# Blat sequences per job
num_blat_sequences   = 10000

# Minimum gene fusion range
dna_concordant_length = 2000

# Trim length for discordant reads (split reads are not trimmed)
discord_read_trim = 50

# Calculate extra annotations, fusion splice index and interrupted index
calculate_extra_annotations = no

# Filtering parameters
clustering_precision = 0.95
span_count_threshold = 5
percent_identity_threshold = 0.90
split_min_anchor = 4
splice_bias = 10
positive_controls    = $(data_directory)/controls.txt
probability_threshold = 0.50

# Position density when calculating covariance
covariance_sampling_density = 0.01

# Number of reads for each job in split
reads_per_job = 1000000

# If you have command line 'mail' and wish to be notified
mailto = andrew.mcpherson@gmail.com

# Remove temp files
remove_job_files = yes
remove_job_temp_files = yes
#end raw
  </configfile>
  <configfile name="shscript">
#!/bin/bash
## An ampersand or a greater-than sign cannot appear literally in this XML text node,
## so both are produced from their character codes and spliced in where the shell needs them.
#set $amp = chr(38)
#set $gt = chr(62)
## Every step below depends on DEFUSE_PATH (the source_directory substitution and the
## final perl call), so fail loudly instead of writing an empty path into the config.
if [ -z "\${DEFUSE_PATH}" ]; then
  echo "DEFUSE_PATH is not set; cannot locate the deFuse scripts" 1${gt}${amp}2
  exit 1
fi
## Substitute real pathnames for the __NAME__ placeholders in the config file.
## Each line only edits the file when the placeholder is present and the program is found on PATH.
if grep -q __DEFUSE_PATH__ "$defuse_config"; then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" "$defuse_config"; fi
if grep -q __SAMTOOLS_BIN__ "$defuse_config" ${amp}${amp} SAMTOOLS_BIN=`which samtools`; then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" "$defuse_config"; fi
if grep -q __BOWTIE_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BIN=`which bowtie`; then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" "$defuse_config"; fi
if grep -q __BOWTIE_BUILD_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`; then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" "$defuse_config"; fi
if grep -q __BLAT_BIN__ "$defuse_config" ${amp}${amp} BLAT_BIN=`which blat`; then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" "$defuse_config"; fi
if grep -q __FATOTWOBIT_BIN__ "$defuse_config" ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`; then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" "$defuse_config"; fi
if grep -q __GMAP_BIN__ "$defuse_config" ${amp}${amp} GMAP_BIN=`which gmap`; then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" "$defuse_config"; fi
if grep -q __GMAP_SETUP_BIN__ "$defuse_config" ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`; then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" "$defuse_config"; fi
if grep -q __GMAP_INDEX_DIR__ "$defuse_config" ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap; then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" "$defuse_config"; fi
if grep -q __R_BIN__ "$defuse_config" ${amp}${amp} R_BIN=`which R`; then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" "$defuse_config"; fi
if grep -q __RSCRIPT_BIN__ "$defuse_config" ${amp}${amp} RSCRIPT_BIN=`which Rscript`; then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" "$defuse_config"; fi
## copy the resolved config to the history output
cp "$defuse_config" "$config_txt"
## create the dataset directory that dataset_directory in the config points at
mkdir -p "$config_txt.extra_files_path"
## build the reference dataset
perl "\${DEFUSE_PATH}/scripts/create_reference_dataset.pl" -c "$defuse_config"
  </configfile>
 </configfiles>

 <tests>
 </tests>
 <help>
**DeFuse**

DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.

Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138

.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page

**This tool** runs the DeFuse create_reference_dataset.pl script. Its history output is the generated config.txt; the reference dataset files are written to that dataset's extra files directory. The Inputs and Outputs sections below describe the DeFuse analysis that uses such a reference.

------

**Inputs**

DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).

If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.

DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:

    - genome_fasta from Ensembl
    - gene_models from Ensembl
    - repeats_filename from UCSC RepeatMasker rmsk.txt
    - est_fasta from UCSC
    - est_alignments from UCSC intronEst.txt
    - unigene_fasta from NCBI

.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2

------

**Outputs**

The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.

DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt.
All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt. The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order): - **Identification** - cluster_id : random identifier assigned to each prediction - library_name : library name given on the command line of defuse - gene1 : ensembl id of gene 1 - gene2 : ensembl id of gene 2 - gene_name1 : name of gene 1 - gene_name2 : name of gene 2 - **Evidence** - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable - concordant_ratio : proportion of spanning reads considered concordant by blat - denovo_min_count : minimum kmer count across denovo assembled sequence - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive - gene_align_strand1 : alignment strand for spanning read alignments to gene 1 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2 - min_map_count : minimum of the number of genomic mappings for each spanning read - max_map_count : maximum of the number of genomic mappings for each spanning read - mean_map_count : average of the number of genomic mappings for each spanning read - num_multi_map : number of spanning reads that map to more than one genomic location - span_count : number of spanning reads supporting the fusion - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage - span_coverage_min : minimum of span_coverage1 and span_coverage2 - span_coverage_max : maximum of span_coverage1 and span_coverage2 - 
splitr_count : number of split reads supporting the prediction - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive - splitr_sequence : fusion sequence predicted by split reads - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive - **Annotation** - adjacent : fusion between adjacent genes - altsplice : fusion likely the product of alternative splicing between adjacent genes - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna - deletion : fusion produced by a genomic deletion - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est - eversion : fusion produced by a genomic eversion - exonboundaries : fusion splice at exon boundaries - expression1 : expression of gene 1 as number of concordant pairs aligned to exons - expression2 : expression of gene 2 as number of concordant pairs aligned to exons - gene_chromosome1 : chromosome of gene 1 - gene_chromosome2 : chromosome of gene 2 - gene_end1 : end position for gene 1 - gene_end2 : end position for gene 2 - gene_location1 : location of breakpoint in gene 1 - gene_location2 : location of breakpoint in gene 2 - gene_start1 : start of gene 1 - gene_start2 : start of gene 2 - gene_strand1 : strand of gene 1 - gene_strand2 : strand of gene 2 - genome_breakseqs_percident : maximum 
percent identity of fusion sequence alignments to genome - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream - interchromosomal : fusion produced by an interchromosomal translocation - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2 - inversion : fusion produced by genomic inversion - orf : fusion combines genes in a way that preserves a reading frame - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt) - read_through : fusion involving adjacent genes potentially resulting from co-transcription rather than genome rearrangement - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2 - splice_score : number of nucleotides similar to GTAG at fusion splice - num_splice_variants : number of potential splice variants for this gene pair - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1 **Example** results.tsv:: cluster_id splitr_sequence splitr_count splitr_span_pvalue 
splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 
0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 - 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - - </help> </tool>