annotate create_reference_dataset.xml @ 12:33e2235bf003

Add create_reference_dataset.xml
author Jim Johnson <jj@umn.edu>
date Sun, 09 Jun 2013 20:30:21 -0500
parents
children 85693cb5339f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
1 <tool id="create_defusei_reference" name="Create DeFuse Reference" version="1.6.1">
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
3 <requirements>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
4 <requirement type="package" version="0.6.1">defuse</requirement>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
5 <requirement type="package" version="0.1.18">samtools</requirement>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
6 <requirement type="package" version="1.0.0">bowtie</requirement>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
7 <requirement type="package" version="2013-05-09">gmap</requirement>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
8 <requirement type="package" version="latest">kent</requirement>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
9 </requirements>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
10 <command interpreter="command"> /bin/bash $shscript </command>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
11 <inputs>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
12 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
13 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
14 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
15 <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
16 <param name="mt_chromosome" type="text" value="MT" label="Mitochondrial Chromosome" help="Example: MT"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
17 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
18 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
19 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
20 </inputs>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
21 <outputs>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
22 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
23 </outputs>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
24 <configfiles>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
25 <configfile name="defuse_config">
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
26 #import ast
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
27 #
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
28 # Configuration file for defuse
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
29 #
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
30 # At a minimum, change all values enclosed by []
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
31 #
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
32
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
33 # Directory where the defuse code was unpacked
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
34 ## Default location in the tool/defuse directory
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
35 # source_directory = ${__root_dir__}/tools/defuse
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
36 source_directory = __DEFUSE_PATH__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
37
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
38 ensembl_version = $ensembl_version
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
39 ensembl_genome_version = $ensembl_genome_version
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
40 ucsc_genome_version = $ucsc_genome_version
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
41
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
42 # Directory where you want your dataset
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
43 dataset_directory = $config_txt.extra_files_path
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
44
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
45 #raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
46 # Input genome and gene models
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
47 gene_models = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
48 genome_fasta = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
49
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
50 # Repeat table from ucsc genome browser
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
51 repeats_filename = $(dataset_directory)/repeats.txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
52
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
53 # EST info downloaded from ucsc genome browser
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
54 est_fasta = $(dataset_directory)/est.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
55 est_alignments = $(dataset_directory)/intronEst.txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
56
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
57 # Unigene clusters downloaded from ncbi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
58 unigene_fasta = $(dataset_directory)/Hs.seq.uniq
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
59 #end raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
60
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
61 # Paths to external tools
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
62 samtools_bin = __SAMTOOLS_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
63 bowtie_bin = __BOWTIE_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
64 bowtie_build_bin = __BOWTIE_BUILD_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
65 blat_bin = __BLAT_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
66 fatotwobit_bin = __FATOTWOBIT_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
67 gmap_bin = __GMAP_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
68 gmap_setup_bin = __GMAP_SETUP_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
69 r_bin = __R_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
70 rscript_bin = __RSCRIPT_BIN__
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
71
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
72 #raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
73 # Directory where you want your dataset
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
74 gmap_index_directory = $(dataset_directory)/gmap
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
75 #end raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
76
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
77 #raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
78 # Dataset files
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
79 dataset_prefix = $(dataset_directory)/defuse
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
80 chromosome_prefix = $(dataset_prefix).dna.chromosomes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
81 exons_fasta = $(dataset_prefix).exons.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
82 cds_fasta = $(dataset_prefix).cds.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
83 cdna_regions = $(dataset_prefix).cdna.regions
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
84 cdna_fasta = $(dataset_prefix).cdna.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
85 reference_fasta = $(dataset_prefix).reference.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
86 rrna_fasta = $(dataset_prefix).rrna.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
87 ig_gene_list = $(dataset_prefix).ig.gene.list
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
88 repeats_regions = $(dataset_directory)/repeats.regions
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
89 est_split_fasta1 = $(dataset_directory)/est.1.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
90 est_split_fasta2 = $(dataset_directory)/est.2.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
91 est_split_fasta3 = $(dataset_directory)/est.3.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
92 est_split_fasta4 = $(dataset_directory)/est.4.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
93 est_split_fasta5 = $(dataset_directory)/est.5.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
94 est_split_fasta6 = $(dataset_directory)/est.6.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
95 est_split_fasta7 = $(dataset_directory)/est.7.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
96 est_split_fasta8 = $(dataset_directory)/est.8.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
97 est_split_fasta9 = $(dataset_directory)/est.9.fa
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
98
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
99 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
100 prefilter1 = $(unigene_fasta)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
101
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
102 # deFuse scripts and tools
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
103 scripts_directory = $(source_directory)/scripts
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
104 tools_directory = $(source_directory)/tools
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
105 data_directory = $(source_directory)/data
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
106 #end raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
107
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
108 #raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
109 # Bowtie parameters
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
110 bowtie_threads = 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
111 bowtie_quals = --phred33-quals
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
112 max_insert_size = 500
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
113 #end raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
114
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
115 # Parameters for building the dataset
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
116 chromosomes = $chromosomes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
117 mt_chromosome = $mt_chromosome
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
118 gene_sources = $gene_sources
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
119 ig_gene_sources = $ig_gene_sources
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
120 rrna_gene_sources = $rrna_gene_sources
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
121
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
122 #raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
123 # Blat sequences per job
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
124 num_blat_sequences = 10000
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
125
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
126 # Minimum gene fusion range
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
127 dna_concordant_length = 2000
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
128
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
129 # Trim length for discordant reads (split reads are not trimmed)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
130 discord_read_trim = 50
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
131
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
132 # Calculate extra annotations, fusion splice index and interrupted index
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
133 calculate_extra_annotations = no
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
134
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
135 # Filtering parameters
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
136 clustering_precision = 0.95
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
137 span_count_threshold = 5
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
138 percent_identity_threshold = 0.90
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
139 split_min_anchor = 4
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
140 splice_bias = 10
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
141 positive_controls = $(data_directory)/controls.txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
142 probability_threshold = 0.50
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
143
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
144 # Position density when calculating covariance
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
145 covariance_sampling_density = 0.01
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
146
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
147 # Number of reads for each job in split
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
148 reads_per_job = 1000000
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
149
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
150 # If you have command line 'mail' and wish to be notified
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
151 mailto = andrew.mcpherson@gmail.com
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
152
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
153 # Remove temp files
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
154 remove_job_files = yes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
155 remove_job_temp_files = yes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
156 #end raw
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
157 </configfile>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
158 <configfile name="shscript">
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
159 #!/bin/bash
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
160 ## define some things for cheetah processing
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
161 #set $ds = chr(36)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
162 #set $amp = chr(38)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
163 #set $gt = chr(62)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
164 #set $lt = chr(60)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
165 #set $echo_cmd = 'echo'
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
166 ## Find the defuse.pl in the galaxy tool path
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
167 #import Cheetah.FileUtils
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
168 ## substitute pathnames into config file
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
169 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
170 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
171 if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
172 if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
173 if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
174 if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
175 if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
176 if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
177 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
178 if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
179 if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
180
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
181 ## copy config to output
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
182 cp $defuse_config $config_txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
183 ## make the dataset directory that will hold the reference files
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
184 mkdir -p $config_txt.extra_files_path
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
185 ## run create_reference_dataset.pl
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
186 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
187 </configfile>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
188 </configfiles>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
189
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
190 <tests>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
191 </tests>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
192 <help>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
193 **DeFuse**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
194
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
195 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
196
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
197 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
198
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
199 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
200
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
201 ------
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
202
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
203 **Inputs**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
204
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
205 DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
206
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
207 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
208
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
209 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
210 - genome_fasta from Ensembl
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
211 - gene_models from Ensembl
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
212 - repeats_filename from UCSC RepeatMasker rmsk.txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
213 - est_fasta from UCSC
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
214 - est_alignments from UCSC intronEst.txt
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
215 - unigene_fasta from NCBI
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
216
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
217 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
218
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
219 ------
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
220
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
221 **Outputs**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
222
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
223 The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
224
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
225 DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt.
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
226
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
227 The file format is tab-delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
228
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
229 - **Identification**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
230 - cluster_id : random identifier assigned to each prediction
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
231 - library_name : library name given on the command line of defuse
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
232 - gene1 : ensembl id of gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
233 - gene2 : ensembl id of gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
234 - gene_name1 : name of gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
235 - gene_name2 : name of gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
236 - **Evidence**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
237 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
238 - concordant_ratio : proportion of spanning reads considered concordant by blat
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
239 - denovo_min_count : minimum kmer count across denovo assembled sequence
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
240 - denovo_sequence : fusion sequence predicted by de Bruijn-based de novo sequence assembly
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
241 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
242 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
243 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
244 - min_map_count : minimum of the number of genomic mappings for each spanning read
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
245 - max_map_count : maximum of the number of genomic mappings for each spanning read
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
246 - mean_map_count : average of the number of genomic mappings for each spanning read
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
247 - num_multi_map : number of spanning reads that map to more than one genomic location
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
248 - span_count : number of spanning reads supporting the fusion
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
249 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
250 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
251 - span_coverage_min : minimum of span_coverage1 and span_coverage2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
252 - span_coverage_max : maximum of span_coverage1 and span_coverage2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
253 - splitr_count : number of split reads supporting the prediction
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
254 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
255 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
256 - splitr_sequence : fusion sequence predicted by split reads
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
257 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
258 - **Annotation**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
259 - adjacent : fusion between adjacent genes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
260 - altsplice : fusion likely the product of alternative splicing between adjacent genes
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
261 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
262 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
263 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
264 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
265 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
266 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
267 - deletion : fusion produced by a genomic deletion
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
268 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
269 - eversion : fusion produced by a genomic eversion
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
270 - exonboundaries : fusion splice at exon boundaries
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
271 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
272 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
273 - gene_chromosome1 : chromosome of gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
274 - gene_chromosome2 : chromosome of gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
275 - gene_end1 : end position for gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
276 - gene_end2 : end position for gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
277 - gene_location1 : location of breakpoint in gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
278 - gene_location2 : location of breakpoint in gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
279 - gene_start1 : start of gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
280 - gene_start2 : start of gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
281 - gene_strand1 : strand of gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
282 - gene_strand2 : strand of gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
283 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
284 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
285 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
286 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
287 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
288 - interchromosomal : fusion produced by an interchromosomal translocation
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
289 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
290 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
291 - inversion : fusion produced by genomic inversion
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
292 - orf : fusion combines genes in a way that preserves a reading frame
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
293 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt)
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
294 - read_through : fusion involving adjacent genes potentially resulting from co-transcription rather than genome rearrangement
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
295 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
296 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
297 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
298 - splice_score : number of nucleotides similar to GTAG at fusion splice
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
299 - num_splice_variants : number of potential splice variants for this gene pair
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
300 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
301 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
302
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
303
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
304 **Example**
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
305
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
306 results.tsv::
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
307
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
308 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
309 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
310 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
311
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
312 </help>
33e2235bf003 Add create_reference_dataset.xml
Jim Johnson <jj@umn.edu>
parents:
diff changeset
313 </tool>