# HG changeset patch # User david-hoover # Date 1347987549 14400 # Node ID a2c1575ba537ee3b9962889ce396202f66519b23 # Parent 94152a913ac98e124288f14941dc1e3564d730b7 Uploaded diff -r 94152a913ac9 -r a2c1575ba537 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,12 @@ +The gatk2_sorted_picard_index.loc and gatk2_annotations.txt files must be +copied into the tool-data directory. The file tool_data_table_conf.xml must +be edited to include references to these two new files. + +Additionally, copies of or links to the GenomeAnalysisTK.jar and key file +must be made within the directory tool-data/shared/jars/gatk2. + + cd ${GALAXY_DATA_INDEX_DIR}/shared/jars + mkdir gatk2 + cd gatk2 + ln -s /path/to/wherever/GenomeAnalysisTK.jar . + ln -s /path/to/wherever/key.file gatk2_key_file diff -r 94152a913ac9 -r a2c1575ba537 depth_of_coverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/depth_of_coverage.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,1027 @@ + + on BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $i, $input_bam in enumerate( $reference_source.input_bams ): + -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" + #if str( $input_bam.input_bam.metadata.bam_index ) != "None": + -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index + #end if + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "DepthOfCoverage" + ##--num_threads 4 ##hard coded, for now + + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + --calculateCoverageOverGenes 
"${input_calculate_coverage_over_genes}" + #end if + #if str( $partition_type ) != "None": + #for $pt in str( $partition_type ).split( ',' ): + --partitionType "${pt}" + #end for + #end if + --out "${output_per_locus_coverage}" + + #for $ct_group in $summary_coverage_threshold_group: + --summaryCoverageThreshold "${ct_group.summary_coverage_threshold}" + #end for + --outputFormat "${output_format}" + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( 
$gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + ${analysis_param_type.ignore_deletion_sites} + ${analysis_param_type.include_deletions} + --maxBaseQuality "${analysis_param_type.max_base_quality}" + --maxMappingQuality "${analysis_param_type.max_mapping_quality}" + --minBaseQuality "${analysis_param_type.min_base_quality}" + --minMappingQuality "${analysis_param_type.min_mapping_quality}" + --nBins "${analysis_param_type.n_bins}" + ${analysis_param_type.omit_depth_output_at_each_base} + 
${analysis_param_type.omit_interval_statistics} + ${analysis_param_type.omit_locus_table} + ${analysis_param_type.omit_per_sample_stats} + ${analysis_param_type.print_base_counts} + ${analysis_param_type.print_bin_endpoints_and_exit} + --start "${analysis_param_type.start}" + --stop "${analysis_param_type.stop}" + ' + #end if + ##Move additional files to final location + #if str( $partition_type ) != "None": + #set $partition_types = str( $partition_type ).split( ',' ) + #else: + #set $partition_types = [ 'sample' ] + #end if + #if 'sample' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": + && mv ${output_per_locus_coverage}.sample_summary ${output_summary_sample} + && mv ${output_per_locus_coverage}.sample_statistics ${output_statistics_sample} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.sample_interval_summary ${output_interval_summary_sample} + && mv ${output_per_locus_coverage}.sample_interval_statistics ${output_interval_statistics_sample} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.sample_gene_summary ${output_gene_summary_sample} + && mv ${output_per_locus_coverage}.sample_gene_statistics ${output_gene_statistics_sample} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.sample_cumulative_coverage_counts ${output_cumulative_coverage_counts_sample} + && mv 
${output_per_locus_coverage}.sample_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_sample} + #end if + #end if + + #if 'readgroup' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": + && mv ${output_per_locus_coverage}.read_group_summary ${output_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_statistics ${output_statistics_readgroup} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.read_group_interval_summary ${output_interval_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_interval_statistics ${output_interval_statistics_readgroup} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.read_group_gene_summary ${output_gene_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_gene_statistics ${output_gene_statistics_readgroup} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.read_group_cumulative_coverage_counts ${output_cumulative_coverage_counts_readgroup} + && mv ${output_per_locus_coverage}.read_group_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_readgroup} + #end if + #end if + + #if 'library' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( 
$analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": + && mv ${output_per_locus_coverage}.library_summary ${output_summary_library} + && mv ${output_per_locus_coverage}.library_statistics ${output_statistics_library} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.library_interval_summary ${output_interval_summary_library} + && mv ${output_per_locus_coverage}.library_interval_statistics ${output_interval_statistics_library} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.library_gene_summary ${output_gene_summary_library} + && mv ${output_per_locus_coverage}.library_gene_statistics ${output_gene_statistics_library} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.library_cumulative_coverage_counts ${output_cumulative_coverage_counts_library} + && mv ${output_per_locus_coverage}.library_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_library} + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and ( 'sample' in partition_type or not partition_type ) + + + + + + + + + + + + + + +
analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and ( 'sample' in partition_type or not partition_type ) + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + +
analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or
analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or
analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by mapping or base quality score. + +For more information on the GATK Depth of Coverage, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Depth_of_Coverage>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: DepthOfCoverage accepts aligned BAM input files. + + +**Outputs** + +The output is in various table formats. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + calculateCoverageOverGenes File NA Calculate the coverage statistics over this list of genes. Currently accepts RefSeq. + ignoreDeletionSites boolean false Ignore sites consisting only of deletions + includeDeletions boolean false Include information on deletions + maxBaseQuality byte 127 Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE). 
+ maxMappingQuality int 2147483647 Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE). + minBaseQuality byte -1 Minimum quality of bases to count towards depth. Defaults to -1. + minMappingQuality int -1 Minimum mapping quality of reads to count towards depth. Defaults to -1. + nBins int 499 Number of bins to use for granular binning + omitDepthOutputAtEachBase boolean false Will omit the output of the depth of coverage at each base, which should result in speedup + omitIntervalStatistics boolean false Will omit the per-interval statistics section, which should result in speedup + omitLocusTable boolean false Will not calculate the per-sample per-depth counts of loci, which should result in speedup + omitPerSampleStats boolean false Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime. + outputFormat String rtable the format of the output file (e.g. csv, table, rtable); defaults to r-readable table + partitionType Set[Partition] [sample] Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library. + printBaseCounts boolean false Will add base counts to per-locus output. + printBinEndpointsAndExit boolean false Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data. + start int 1 Starting (left endpoint) for granular binning + stop int 500 Ending (right endpoint) for granular binning + summaryCoverageThreshold int[] [15] for summary file outputs, report the % of bases covered to >= this number. Defaults to 15; can take multiple arguments. + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ.
A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 print_reads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/print_reads.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,425 @@ + + from BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $i, $input_bam in enumerate( $reference_source.input_bams ): + -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" + #if str( $input_bam.input_bam.metadata.bam_index ) != "None": + -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index + #end if + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "PrintReads" + ##--num_threads 4 ##hard coded, for now + --out "${output_bam}" + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --number "${number}" + #if $platform: + --platform "${platform}" + #end if + #if $read_group: + --readGroup "${read_group}" + #end if + #for $sample_file in $sample_file_repeat: + --sample_file "${sample_file.input_sample_file}" + #end for + #for $sample_name in $sample_name_repeat: + --sample_name "${sample_name.sample_name}" + #end for + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + 
-p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in 
enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +PrintReads can dynamically merge the contents of multiple input BAM files, resulting in merged output sorted in coordinate order. + +For more information on the GATK Print Reads Walker, see this `tool specific page <http://www.broadinstitute.org/gsa/gatkdocs/release/org_broadinstitute_sting_gatk_walkers_PrintReadsWalker.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: PrintReads accepts one or more BAM or SAM input files. 
+ + +**Outputs** + +The output is in BAM format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + number int -1 Print the first n reads from the file, discarding the rest + platform String NA Exclude all reads with this platform from the output + readGroup String NA Exclude all reads with this read group from the output + sample_file Set[File] [] File containing a list of samples (one per line). Can be specified multiple times + sample_name Set[String] [] Sample name to be included in the analysis. Can be specified multiple times. + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 realigner_target_creator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/realigner_target_creator.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,449 @@ + + for use in local realignment + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "RealignerTargetCreator" + -o "${output_interval}" + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + --num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "-known:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in 
$gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + 
${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --minReadsAtLocus "${analysis_param_type.minReadsAtLocus}" + --windowSize "${analysis_param_type.windowSize}" + --mismatchFraction "${analysis_param_type.mismatchFraction}" + --maxIntervalSize "${analysis_param_type.maxIntervalSize}" + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. 
+ +For more information on local realignment around indels using the GATK, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Local_realignment_around_indels>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file. + + +**Outputs** + +The output is in GATK Interval format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + windowSize window size for calculating entropy or SNP clusters + mismatchFraction fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1 + minReadsAtLocus minimum reads at a locus to enable using the entropy calculation + maxIntervalSize maximum interval size + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 table_recalibration.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table_recalibration.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,516 @@ + + on BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "TableRecalibration" + -o "${output_bam}" + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --recal_file "${input_recal}" + --disable_bam_indexing + ' + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} 
"${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( 
$reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": + --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" + #end if + #if str( $analysis_param_type.default_platform ) != "default": + --default_platform "${analysis_param_type.default_platform}" + #end if + #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": + --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" + #end if + #if str( $analysis_param_type.force_platform ) != "default": + --force_platform "${analysis_param_type.force_platform}" + #end if + ${analysis_param_type.exception_if_no_tile} + #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": + #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": + --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" + #end if + #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": + --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" + #end if + #end if + ${analysis_param_type.simplify_bam} + --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}" + --smoothing "${analysis_param_type.smoothing}" + --max_quality_score "${analysis_param_type.max_quality_score}" + --window_size_nqs "${analysis_param_type.window_size_nqs}" + --homopolymer_nback "${analysis_param_type.homopolymer_nback}" + ${analysis_param_type.do_not_write_original_quals} + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads. Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + +For more information on base quality score recalibration using the GATK, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: TableRecalibration accepts an aligned BAM and a recalibration CSV input files. + + +**Outputs** + +The output is in BAM format. 
+ + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + default_read_group If a read has no read group then default to the provided String. + default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. + force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. + force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. + window_size_nqs The window size used by MinimumNQSCovariate for its calculation + homopolymer_nback The number of previous bases to look at in HomopolymerCovariate + exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 + solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) + solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) + recal_file Filename for the input covariates table recalibration .csv file + out The output BAM file + bam_compression Compression level to use for writing BAM files + disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier + preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases + smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1 + max_quality_score The integer value at which to cap the quality scores, default=50 + doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al.
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 unified_genotyper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unified_genotyper.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,611 @@ + + SNP and indel caller + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $i, $input_bam in enumerate( $reference_source.input_bams ): + -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" + #if str( $input_bam.input_bam.metadata.bam_index ) != "None": + -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index + #end if + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "UnifiedGenotyper" + --num_threads 4 ##hard coded, for now + --out "${output_vcf}" + --metrics_file "${output_metrics}" + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --genotype_likelihoods_model "${genotype_likelihoods_model}" + --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}" + --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}" + ' + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "--dbsnp:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" 
"${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + 
--baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --p_nonref_model "${analysis_param_type.p_nonref_model}" + --heterozygosity "${analysis_param_type.heterozygosity}" + --pcr_error_rate "${analysis_param_type.pcr_error_rate}" + --genotyping_mode "${analysis_param_type.genotyping_mode_type.genotyping_mode}" + #if str( $analysis_param_type.genotyping_mode_type.genotyping_mode ) == 'GENOTYPE_GIVEN_ALLELES': + --alleles "${analysis_param_type.genotyping_mode_type.input_alleles_rod}" + #end if + --output_mode "${analysis_param_type.output_mode}" + ${analysis_param_type.compute_SLOD} + --min_base_quality_score "${analysis_param_type.min_base_quality_score}" + --max_deletion_fraction "${analysis_param_type.max_deletion_fraction}" + --max_alternate_alleles "${analysis_param_type.max_alternate_alleles}" + 
--min_indel_count_for_genotyping "${analysis_param_type.min_indel_count_for_genotyping}" + --indel_heterozygosity "${analysis_param_type.indel_heterozygosity}" + --indelGapContinuationPenalty "${analysis_param_type.indelGapContinuationPenalty}" + --indelGapOpenPenalty "${analysis_param_type.indelGapOpenPenalty}" + --indelHaplotypeSize "${analysis_param_type.indelHaplotypeSize}" + ${analysis_param_type.doContextDependentGapPenalties} + #if str( $analysis_param_type.annotation ) != "None": + #for $annotation in str( $analysis_param_type.annotation.fields.gatk_value ).split( ','): + --annotation "${annotation}" + #end for + #end if + #for $additional_annotation in $analysis_param_type.additional_annotations: + --annotation "${additional_annotation.additional_annotation_name}" + #end for + #if str( $analysis_param_type.group ) != "None": + #for $group in str( $analysis_param_type.group ).split( ','): + --group "${group}" + #end for + #end if + #if str( $analysis_param_type.exclude_annotations ) != "None": + #for $annotation in str( $analysis_param_type.exclude_annotations.fields.gatk_value ).split( ','): + --excludeAnnotation "${annotation}" + #end for + #end if + ${analysis_param_type.multiallelic} + ' +## #if str( $analysis_param_type.snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff': +## -p '--annotation "SnpEff"' +## -d "--snpEffFile:${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod}" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name}" +## #else: +## -p '--excludeAnnotation "SnpEff"' +## #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +A variant caller which unifies the approaches of several disparate callers. Works for single-sample and multi-sample data. The user can choose from several different incorporated calculation models. + +For more information on the GATK Unified Genotyper, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Unified_genotyper>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + genotype_likelihoods_model Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH) + p_nonref_model Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available. 
(EXACT|GRID_SEARCH) + heterozygosity Heterozygosity value used to compute prior likelihoods for any locus + pcr_error_rate The PCR error rate to be used for computing fragment-based likelihoods + genotyping_mode Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES) + output_mode Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES) + standard_min_confidence_threshold_for_calling The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called + standard_min_confidence_threshold_for_emitting The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold) + noSLOD If provided, we will not calculate the SLOD + min_base_quality_score Minimum base quality required to consider a base for calling + max_deletion_fraction Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05] + min_indel_count_for_genotyping Minimum number of consensus indels required to trigger genotyping run + indel_heterozygosity Heterozygosity for indel calling + indelGapContinuationPenalty Indel gap continuation penalty + indelGapOpenPenalty Indel gap open penalty + indelHaplotypeSize Indel haplotype size + doContextDependentGapPenalties Vary gap penalties by context + indel_recal_file Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE + indelDebug Output indel debug info + out File to which variants should be written + annotation One or more specific annotations to apply to variant calls + group One or more classes/groups of annotations to apply to variant calls + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C,
Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_annotator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_annotator.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,540 @@ + + + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #if str( $reference_source.input_bam ) != "None": + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + #end if + -d "--variant" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + ##--list + -T "VariantAnnotator" + ##--num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + -o "${output_vcf}" + #if str( $annotations_type.annotations_type_selector ) == "use_all_annotations": + --useAllAnnotations + #else: + #if $annotations_type.annotations: + #for $annotation in str( $annotations_type.annotations.fields.gatk_value ).split( ',' ): + --annotation "${annotation}" + #end for + #end if + #end if + #if $exclude_annotations: 
+ #for $annotation in str( $exclude_annotations.fields.gatk_value ).split( ',' ): + --excludeAnnotation "${annotation}" + #end for + #end if + #for $additional_annotation in $additional_annotations: + --annotation "${additional_annotation.additional_annotation_name}" + #end for + ' + #if $reference_source.input_variant_bti: + -d "--intervals" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant_bti" + #end if + + #for $rod_binding in $comp_rod_bind: + -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}" + #end for + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #end if + + + #for $rod_binding in $resource_rod_bind: + -d "--resource:${rod_binding.resource_rod_name},%(file_type)s" "${rod_binding.resource_input_rod}" "${rod_binding.resource_input_rod.ext}" "input_resource_${rod_binding.resource_rod_name}" + #end for + + #if str( $snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff': + -p '--annotation "SnpEff"' + -d "--snpEffFile:${snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${snpEff_rod_bind_type.snpEff_input_rod}" "${snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${snpEff_rod_bind_type.snpEff_rod_name}" + #else: + -p '--excludeAnnotation "SnpEff"' + #end if + + #for $expression in $expressions: + -p '--expression "${expression.expression}"' + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString 
"${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + 
#for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + -p ' + #if str( $annotation_group ) != "None": + #for $group in str( $annotation_group ).split( ',' ): + --group "${group}" + #end for + #end if + #if str( $family_string ) != "": + --family_string "${family_string}" + #end if + --MendelViolationGenotypeQualityThreshold "${mendel_violation_genotype_quality_threshold}" + ' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Annotates variant calls with context information. Users can specify which of the available annotations to use. 
+ +For more information on using the VariantAnnotator, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/VariantAnnotator>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + + +**Inputs** + +GenomeAnalysisTK: VariantAnnotator accepts a variant input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + sampleName The sample (NA-ID) corresponding to the variant input (for non-VCF input only) + annotation One or more specific annotations to apply to variant calls + group One or more classes/groups of annotations to apply to variant calls + expression One or more specific expressions to apply to variant calls; see documentation for more details + useAllAnnotations Use all possible annotations (not for the faint of heart) + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_apply_recalibration.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_apply_recalibration.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,414 @@ + + + + gatk2 + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--input:input_${var_count},%(file_type)s" "${variant.input_variants}" "${variant.input_variants.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "ApplyRecalibration" + ##--num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --recal_file "${reference_source.input_recal}" + --tranches_file "${reference_source.input_tranches}" + --out "${output_variants}" + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( 
$gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" 
"${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + -p ' + --mode "${mode}" + + #for $ignore_filter in $ignore_filters: + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector ) + #if $ignore_filter_name == "custom": + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name ) + #end if + --ignore_filter "${ignore_filter_name}" + #end for + --ts_filter_level "${ts_filter_level}" + ' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + +For more information on using the ApplyRecalibration module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: ApplyRecalibration accepts a variant input file, a recalibration file and a tranches file. + + +**Outputs** + +The output is in VCF format. 
+ + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + recal_file The input recal file used by ApplyRecalibration + tranches_file The input tranches file describing where to cut the data + out The output filtered, recalibrated VCF file + ts_filter_level The truth sensitivity level at which to start filtering + ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file + mode Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously. (SNP|INDEL|BOTH) + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_combine.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_combine.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,455 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + + #set $priority_order = [] + #for $input_variant in $reference_source.input_variants: + -d "--variant:${input_variant.input_variant_name},%(file_type)s" "${input_variant.input_variant}" "${input_variant.input_variant.ext}" "input_variant_${input_variant.input_variant_name}" + #set $input_variant_name = str( $input_variant.input_variant_name ) + #assert $input_variant_name not in $priority_order, "Variant Names must be unique" ##this should be handled by a validator + #silent $priority_order.append( $input_variant_name ) + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "CombineVariants" + --out "${output_variants}" + ##--num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --genotypemergeoption "${genotype_merge_option}" + --rod_priority_list "${ ','.join( $priority_order ) }" + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + 
###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" 
"${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --filteredrecordsmergetype "${analysis_param_type.filtered_records_merge_type}" + ${analysis_param_type.print_complex_merges} + ${analysis_param_type.filtered_are_uncalled} + ${analysis_param_type.minimal_vcf} + ${analysis_param_type.assume_identical_samples} + + #if str( $analysis_param_type.set_key ): + --setKey "${analysis_param_type.set_key}" + #end if + + --minimumN "${analysis_param_type.minimum_n}" + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Combines VCF records from different sources; supports both full merges and set unions. Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. Union: assumes each rod represents the same set of samples (although this is not enforced); using the priority list (if provided), emits a single record instance at every position represented in the rods. + +For more information on using the CombineVariants module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/CombineVariants>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. 
+ +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: CombineVariants accepts variant files as input. + +------ + +**Outputs** + +The output is a combined vcf file. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + out File to which variants should be written + genotypemergeoption How should we merge genotype records for samples shared across the ROD files? (UNIQUIFY|PRIORITIZE|UNSORTED|REQUIRE_UNIQUE) + filteredrecordsmergetype How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered (KEEP_IF_ANY_UNFILTERED|KEEP_IF_ALL_UNFILTERED) + rod_priority_list When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided + printComplexMerges Print out interesting sites requiring complex compatibility merging + filteredAreUncalled If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF + minimalVCF If true, then the output VCF will contain no INFO or genotype INFO field + setKey Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted. + assumeIdenticalSamples If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime. + minimumN Combine variants and output site only if variant is present in at least N input files. 
+ +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_eval.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_eval.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,572 @@ + + + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--eval:input_${var_count},%(file_type)s" "${variant.input_variant}" "${variant.input_variant.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "VariantEval" + --out "${output_report}" + --num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + + #for $rod_binding in $comp_rod_bind: + -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}" + #if str( $rod_binding.comp_known_names ): + -p '--known_names "${rod_binding.comp_rod_name}"' + #end if + #end for + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d 
"--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #if str( $dbsnp_rod_bind_type.dbsnp_known_names ): + -p '--known_names "${dbsnp_rod_bind_type.dbsnp_rod_name}"' + #end if + #end if + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p 
'--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + #for $stratification in $analysis_param_type.stratifications: + #set $select_string = "--select_exps '%s' --select_names '%s'" % ( str( $stratification.select_exps ), str( $stratification.select_name ) ) + -o '${ hexlify( $select_string ) }' + #end for + -p ' + + #for $sample in $analysis_param_type.samples: + --sample "${sample.sample}" + #end for + + #if str( $analysis_param_type.stratification_modules ) != "None": + #for $stratification_module in str( $analysis_param_type.stratification_modules).split( ',' ): + --stratificationModule 
"${stratification_module}" + #end for + #end if + + ${analysis_param_type.do_not_use_all_standard_stratifications} + + #for $variant_type in $analysis_param_type.only_variants_of_type: + --onlyVariantsOfType "${variant_type.variant_type}" + #end for + + #if str( $analysis_param_type.eval_modules ) != "None": + #for $eval_module in str( $analysis_param_type.eval_modules).split( ',' ): + --evalModule "${eval_module}" + #end for + #end if + + ${analysis_param_type.do_not_use_all_standard_modules} + + #if str( $analysis_param_type.num_samples ) != "0": + --numSamples "${analysis_param_type.num_samples}" + #end if + + --minPhaseQuality "${analysis_param_type.min_phase_quality}" + + #if str( $analysis_param_type.family ): + --family_structure "${analysis_param_type.family}" + #end if + + --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}" + + #if str( $analysis_param_type.ancestral_alignments ) != "None": + --ancestralAlignments "${analysis_param_type.ancestral_alignments}" + #end if + ' + #if str( $analysis_param_type.known_cnvs ) != "None": + -d "--knownCNVs" "${analysis_param_type.known_cnvs}" "${analysis_param_type.known_cnvs.ext}" "input_known_cnvs" + #end if + + #if str( $analysis_param_type.strat_intervals ) != "None": + -d "--stratIntervals" "${analysis_param_type.strat_intervals}" "${analysis_param_type.strat_intervals.ext}" "input_strat_intervals" + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more) + +For more information on using the VariantEval module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/VariantEval>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantEval accepts variant files as input. + + +**Outputs** + +The output is a table of variant evaluation. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + + +------- + +**Settings**:: + + out An output file presented to the walker. Will overwrite contents if file exists. 
+ list List the available eval modules and exit + select_exps One or more stratifications to use when evaluating the data + select_names Names to use for the list of stratifications (must be a 1-to-1 mapping) + sample Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context + known_names Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets + stratificationModule One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified) + doNotUseAllStandardStratifications Do not use the standard stratification modules by default (instead, only those that are specified with the -S option) + onlyVariantsOfType If provided, only variants of these types will be considered during the evaluation + evalModule One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified) + doNotUseAllStandardModules Do not use the standard modules by default (instead, only those that are specified with the -E option) + numSamples Number of samples (used if no samples are available in the VCF file) + minPhaseQuality Minimum phasing quality + family_structure If provided, genotypes will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined + mendelianViolationQualThreshold Minimum genotype QUAL score for each trio member required to accept a site as a violation + ancestralAlignments Fasta file with ancestral alleles + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. 
A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_filtration.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_filtration.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,457 @@ + + on VCF files + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "VariantFiltration" + ##--num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + -o "${output_vcf}" + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + #for $variant_filter in $variant_filters: + #set $variant_filter = "--%sExpression '%s' --%sName '%s'" % ( str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_expression ), str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_name ) ) + -o '${ hexlify( $variant_filter ) }' + #end for + + #if str( $mask_rod_bind_type.mask_rod_bind_type_selector ) == 'set_mask': + -d "--mask:${mask_rod_bind_type.mask_rod_name},%(file_type)s" "${mask_rod_bind_type.input_mask_rod}" "${mask_rod_bind_type.input_mask_rod.ext}" "input_mask_${mask_rod_bind_type.mask_rod_name}" + -p ' + --maskExtension "${mask_rod_bind_type.mask_extension}" + --maskName "${mask_rod_bind_type.mask_rod_name}" + ' + #end if + + ##start standard gatk options + #if 
$gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities 
"${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $cluster_snp_type.cluster_snp_type_selector == "cluster_snp": + -p ' + --clusterSize "${cluster_snp_type.cluster_size}" + --clusterWindowSize "${cluster_snp_type.cluster_window_size}" + ' + #end if + -p '${missing_values_in_expressions_should_evaluate_as_failing}' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Filters variant calls using a number of user-selectable, 
parameterizable criteria. + +For more information on using the VariantFiltration module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/VariantFiltrationWalker>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantFiltration accepts a VCF input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + filterExpression One or more expressions used with INFO fields to filter (see wiki docs for more info) + filterName Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered + genotypeFilterExpression One or more expressions used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info) + genotypeFilterName Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered + clusterSize The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3] + clusterWindowSize The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0] + maskName The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask'] + missingValuesInExpressionsShouldEvaluateAsFailing When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?
+ +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_recalibrator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_recalibrator.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,715 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--input:input_${var_count},%(file_type)s" "${variant.input_variants}" "${variant.input_variants.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "VariantRecalibrator" + --num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file"##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --recal_file "${output_recal}" + --tranches_file "${output_tranches}" + --rscript_file "${output_rscript}" + ' + + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #elif str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'comp': + #set $rod_bind_name = "comp" + $rod_binding.rod_bind_type.custom_rod_name + 
#else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + #if $rod_binding.rod_bind_type.rod_training_type.rod_training_type_selector == "not_training_truth_known": + -d "--resource:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #else: + -d "--resource:${rod_bind_name},%(file_type)s,known=${rod_binding.rod_bind_type.rod_training_type.known},training=${rod_binding.rod_bind_type.rod_training_type.training},truth=${rod_binding.rod_bind_type.rod_training_type.truth},bad=${rod_binding.rod_bind_type.rod_training_type.bad},prior=${rod_binding.rod_bind_type.rod_training_type.prior}" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end if + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d 
"--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard 
gatk options + + ##start analysis specific options + -p ' + #if str( $annotations ) != "None": + #for $annotation in str( $annotations.fields.gatk_value ).split( ',' ): + --use_annotation "${annotation}" + #end for + #end if + #for $additional_annotation in $additional_annotations: + --use_annotation "${additional_annotation.additional_annotation_name}" + #end for + --mode "${mode}" + ' + + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --maxGaussians "${analysis_param_type.max_gaussians}" + --maxIterations "${analysis_param_type.max_iterations}" + --numKMeans "${analysis_param_type.num_k_means}" + --stdThreshold "${analysis_param_type.std_threshold}" + --qualThreshold "${analysis_param_type.qual_threshold}" + --shrinkage "${analysis_param_type.shrinkage}" + --dirichlet "${analysis_param_type.dirichlet}" + --priorCounts "${analysis_param_type.prior_counts}" + #if str( $analysis_param_type.bad_variant_selector.bad_variant_selector_type ) == 'percent': + --percentBadVariants "${analysis_param_type.bad_variant_selector.percent_bad_variants}" + #else: + --minNumBadVariants "${analysis_param_type.bad_variant_selector.min_num_bad_variants}" + #end if + --target_titv "${analysis_param_type.target_titv}" + #for $tranche in [ $tranche.strip() for $tranche in str( $analysis_param_type.ts_tranche ).split( ',' ) if $tranche.strip() ] + --TStranche "${tranche}" + #end for + #for $ignore_filter in $analysis_param_type.ignore_filters: + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector ) + #if $ignore_filter_name == "custom": + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name ) + #end if + --ignore_filter "${ignore_filter_name}" + #end for + --ts_filter_level "${analysis_param_type.ts_filter_level}" + ' + #end if + + + && + mv "${output_rscript}.pdf" "${output_tranches_pdf}" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score + +For more information on using the VariantRecalibrator module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantRecalibrator accepts a variant input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. 
+ +------- + +**Settings**:: + + + tranches_file The output tranches file used by ApplyRecalibration + use_annotation The names of the annotations which should be used for calculations + mode Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both snps and indels simultaneously. (SNP|INDEL|BOTH) + maxGaussians The maximum number of Gaussians to try during variational Bayes algorithm + maxIterations The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected. + numKMeans The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model. + stdThreshold If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model. + qualThreshold If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model. + shrinkage The shrinkage parameter in variational Bayes algorithm. + dirichlet The dirichlet parameter in variational Bayes algorithm. + priorCounts The number of prior counts to use in variational Bayes algorithm. + percentBadVariants What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent. + minNumBadVariants The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary. + recal_file The output recal file used by ApplyRecalibration + target_titv The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!
+ TStranche The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent) + ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file + path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript + rscript_file The output rscript file generated by the VQSR to aid in visualization of the input data and learned model + path_to_resources Path to resources folder holding the Sting R scripts. + ts_filter_level The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variant_select.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_select.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,574 @@ + + from VCF files + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "SelectVariants" + --num_threads 4 ##hard coded, for now + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + -o "${output_vcf}" + + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + -p ' + #if $input_concordance: + --concordance "${input_concordance}" + #end if + #if $input_discordance: + --discordance "${input_discordance}" + #end if + + #for $exclude_sample_name in $exclude_sample_name_repeat: + --exclude_sample_name "${exclude_sample_name.exclude_sample_name}" + #end for + + ${exclude_filtered} + + #for $sample_name in $sample_name_repeat: + --sample_name "${sample_name.sample_name}" + #end for + + ' + + #for $select_expressions in $select_expressions_repeat: + #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) ) + -o '${ hexlify( $select_expression ) }' + #end for + + ##start tool specific options + #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced': + -p ' + #for $exclude_sample_file in $analysis_param_type.exclude_sample_file_repeat: + --exclude_sample_file "${exclude_sample_file.exclude_sample_file}" + #end for + + #for $sample_file in $analysis_param_type.sample_file_repeat: + --sample_file "${sample_file.sample_file}" + #end for + + #if $analysis_param_type.input_keep_ids: + --keepIDs
"${analysis_param_type.input_keep_ids}" + #end if + + ${analysis_param_type.keep_original_AC} + + ${analysis_param_type.mendelian_violation} + + --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}" + + --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}" + + --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}" + + #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction': + --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}" + #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number': + --select_random_number "${analysis_param_type.select_random_type.select_random_number}" + #end if + + #if $analysis_param_type.select_type_to_include: + #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ): + --selectTypeToInclude "${type_to_include}" + #end for + #end if + + ${analysis_param_type.exclude_non_variants} + ' + + #for $sample_expressions in $analysis_param_type.sample_expressions_repeat: + #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) ) + -o '${ hexlify( $sample_expression ) }' + #end for + + #end if + ##end tool specific options + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in 
$read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" 
"input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions). 
One can optionally include concordance or discordance tracks for use in selecting overlapping variants. + +For more information on using the SelectVariants module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/SelectVariants>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: SelectVariants accepts a VCF input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + out VCFWriter stdout File to which variants should be written + variant RodBinding[VariantContext] NA Input VCF file + concordance RodBinding[VariantContext] none Output variants that were also called in this comparison track + discordance RodBinding[VariantContext] none Output variants that were not called in this comparison track + exclude_sample_file Set[File] [] File containing a list of samples (one per line) to exclude. Can be specified multiple times + exclude_sample_name Set[String] [] Exclude genotypes from this sample. 
Can be specified multiple times + excludeFiltered boolean false Don't include filtered loci in the analysis + excludeNonVariants boolean false Don't include loci found to be non-variant after the subsetting procedure + keepIDs File NA Only emit sites whose ID is found in this file (one ID per line) + keepOriginalAC boolean false Don't update the AC, AF, or AN values in the INFO field after selecting + mendelianViolation Boolean false output mendelian violation sites only + mvq double 0.0 Minimum genotype QUAL score for each trio member required to accept a site as a violation + remove_fraction_genotypes double 0.0 Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall + restrictAllelesTo NumberAlleleRestriction ALL Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC + sample_expressions Set[String] NA Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times + sample_file Set[File] NA File containing a list of samples (one per line) to include. Can be specified multiple times + sample_name Set[String] [] Include genotypes from this sample. Can be specified multiple times + select_expressions ArrayList[String] [] One or more criteria to use when selecting the data + select_random_fraction double 0.0 Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track + select_random_number int 0 Selects a number of variants at random from the variant track + selectTypeToInclude List[Type] [] Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. 
Can be specified multiple times + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + + diff -r 94152a913ac9 -r a2c1575ba537 variants_validate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variants_validate.xml Tue Sep 18 12:59:09 2012 -0400 @@ -0,0 +1,398 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/GenomeAnalysisTK.jar" + -T "ValidateVariants" + + -et "NO_ET" -K "/data/galaxy/galaxy3/tool-data/shared/jars/gatk2/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ${warn_on_errors} + ${do_not_validate_filtered_records} + ' + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #end if + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree 
"${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging 
"${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Validates a variants file. + +For more information on using the ValidateVariants module, see this `tool specific page <http://www.broadinstitute.org/gsa/wiki/index.php/VariantValidator>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gsa/wiki/index.php/Best_Practice_Variant_Detection_with_the_GATK_v3>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gsa/wiki/index.php/Frequently_Asked_Questions>`_. 
+ +------ + +**Inputs** + +GenomeAnalysisTK: ValidateVariants accepts variant files as input. + + +**Outputs** + +The output is a log of variant validation. + + +Go `here <http://www.broadinstitute.org/gsa/wiki/index.php/Input_files_for_the_GATK>`_ for details on GATK file formats. + +------- + +**Settings**:: + + doNotValidateFilteredRecords should we skip validation on filtered records? + warnOnErrors should we just emit warnings on errors instead of terminating the run? + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.* + + +