# HG changeset patch # User jjohnson # Date 1351792402 14400 # Node ID 74c05070a3f8a57002982c21db7d9d72946efad6 Uploaded diff -r 000000000000 -r 74c05070a3f8 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,5 @@ +The gatk2_sorted_picard_index.loc and gatk2_annotations.txt files must be +copied into the tool-data directory. The file tool_data_table_conf.xml must +be edited to include references to these two new files. + + diff -r 000000000000 -r 74c05070a3f8 base_recalibrator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/base_recalibrator.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,578 @@ + + on BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "BaseRecalibrator" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --out "${output_recal}" + ${standard_covs} + #if str( $covariates ) != "None": + #for $cov in str( $covariates ).split( ',' ): + -cov "${cov}" + #end for + #end if + ' + + #set $snp_dataset_provided = False + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = 
$rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'dbsnp': + #set $snp_dataset_provided = True + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "--knownSites:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type 
"${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": + --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" + #end if + #if str( $analysis_param_type.default_platform ) != "default": + --default_platform "${analysis_param_type.default_platform}" + #end if + #if str( 
$analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": + --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" + #end if + #if str( $analysis_param_type.force_platform ) != "default": + --force_platform "${analysis_param_type.force_platform}" + #end if + ${analysis_param_type.exception_if_no_tile} + #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": + #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": + --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" + #end if + #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": + --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" + #end if + #end if + --window_size_nqs "${analysis_param_type.window_size_nqs}" + --homopolymer_nback "${analysis_param_type.homopolymer_nback}" + ' + #end if + #if not $snp_dataset_provided: + -p '--run_without_dbsnp_potentially_ruining_quality' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: warningmark + +"This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation." +However, if you do not provide this file, the '--run_without_dbsnp_potentially_ruining_quality' flag will be automatically used, and the command will be allowed to run. + +**What it does** + +This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative of poor base quality. 
This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide). Since there is a large amount of data, one can then calculate an empirical probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score). The first non-comment line of the output file gives the name of the covariates that were used for this calculation. Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. + +For more information on base quality score recalibration using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_bqsr_BaseRecalibrator.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: BaseRecalibrator accepts an aligned BAM input file. + + +**Outputs** + +The output is in CSV format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + default_read_group If a read has no read group then default to the provided String. + default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. + force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. 
+ force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. + window_size_nqs The window size used by MinimumNQSCovariate for its calculation + homopolymer_nback The number of previous bases to look at in HomopolymerCovariate + exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 + solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) + solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) + recal_file Filename for the input covariates table recalibration .csv file + out The output CSV file + standard_covs Use the standard set of covariates in addition to the ones listed using the -cov argument + run_without_dbsnp_potentially_ruining_quality If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only. + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. 
Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 depth_of_coverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/depth_of_coverage.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,1030 @@ + + on BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $i, $input_bam in enumerate( $reference_source.input_bams ): + -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" + #if str( $input_bam.input_bam.metadata.bam_index ) != "None": + -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index + #end if + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "DepthOfCoverage" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + --calculateCoverageOverGenes "${input_calculate_coverage_over_genes}" + #end if + #if str( $partition_type ) != "None": + #for $pt in str( $partition_type ).split( ',' ): + --partitionType "${pt}" + #end for + #end if + --out "${output_per_locus_coverage}" + + #for $ct_group in $summary_coverage_threshold_group: + --summaryCoverageThreshold "${ct_group.summary_coverage_threshold}" + #end for + --outputFormat "${output_format}" + ' + + ##start standard gatk options + #if 
$gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities 
"${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + ${analysis_param_type.ignore_deletion_sites} + ${analysis_param_type.include_deletions} + --maxBaseQuality "${analysis_param_type.max_base_quality}" + --maxMappingQuality "${analysis_param_type.max_mapping_quality}" + --minBaseQuality "${analysis_param_type.min_base_quality}" + --minMappingQuality "${analysis_param_type.min_mapping_quality}" + --nBins "${analysis_param_type.n_bins}" + ${analysis_param_type.omit_depth_output_at_each_base} + ${analysis_param_type.omit_interval_statistics} + ${analysis_param_type.omit_locus_table} + ${analysis_param_type.omit_per_sample_stats} + ${analysis_param_type.print_base_counts} + ${analysis_param_type.print_bin_endpoints_and_exit} + --start "${analysis_param_type.start}" + --stop "${analysis_param_type.stop}" + ' + #end if + ##Move additional files to final location + #if str( $partition_type ) != "None": + #set $partition_types = str( 
$partition_type ).split( ',' ) + #else: + #set $partition_types = [ 'sample' ] + #end if + #if 'sample' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": + && mv ${output_per_locus_coverage}.sample_summary ${output_summary_sample} + && mv ${output_per_locus_coverage}.sample_statistics ${output_statistics_sample} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.sample_interval_summary ${output_interval_summary_sample} + && mv ${output_per_locus_coverage}.sample_interval_statistics ${output_interval_statistics_sample} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.sample_gene_summary ${output_gene_summary_sample} + && mv ${output_per_locus_coverage}.sample_gene_statistics ${output_gene_statistics_sample} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.sample_cumulative_coverage_counts ${output_cumulative_coverage_counts_sample} + && mv ${output_per_locus_coverage}.sample_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_sample} + #end if + #end if + + #if 'readgroup' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": 
+ && mv ${output_per_locus_coverage}.read_group_summary ${output_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_statistics ${output_statistics_readgroup} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.read_group_interval_summary ${output_interval_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_interval_statistics ${output_interval_statistics_readgroup} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.read_group_gene_summary ${output_gene_summary_readgroup} + && mv ${output_per_locus_coverage}.read_group_gene_statistics ${output_gene_statistics_readgroup} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.read_group_cumulative_coverage_counts ${output_cumulative_coverage_counts_readgroup} + && mv ${output_per_locus_coverage}.read_group_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_readgroup} + #end if + #end if + + #if 'library' in $partition_types and ( str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.print_bin_endpoints_and_exit ) == "" ): + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_per_sample_stats ) == "": + && mv ${output_per_locus_coverage}.library_summary ${output_summary_library} + && mv ${output_per_locus_coverage}.library_statistics ${output_statistics_library} + #end if + #if $gatk_param_type.gatk_param_type_selector == "advanced" and len( $gatk_param_type.input_interval_repeat ) and ( str( $analysis_param_type.analysis_param_type_selector 
) == "basic" or str( $analysis_param_type.omit_interval_statistics ) == "" ): + && mv ${output_per_locus_coverage}.library_interval_summary ${output_interval_summary_library} + && mv ${output_per_locus_coverage}.library_interval_statistics ${output_interval_statistics_library} + #end if + #if str( $input_calculate_coverage_over_genes ) != "None": + && mv ${output_per_locus_coverage}.library_gene_summary ${output_gene_summary_library} + && mv ${output_per_locus_coverage}.library_gene_statistics ${output_gene_statistics_library} + #end if + #if str( $analysis_param_type.analysis_param_type_selector ) == "basic" or str( $analysis_param_type.omit_depth_output_at_each_base ) == "": + && mv ${output_per_locus_coverage}.library_cumulative_coverage_counts ${output_cumulative_coverage_counts_library} + && mv ${output_per_locus_coverage}.library_cumulative_coverage_proportions ${output_cumulative_coverage_proportions_library} + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + 
+ + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and ( 'sample' in partition_type or not partition_type ) + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and ( 'sample' in partition_type or not partition_type ) + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or 
analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'sample' in partition_type or not partition_type + + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'readgroup' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + 
analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + 'readgroup' in partition_type + + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_per_sample_stats'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + gatk_param_type['gatk_param_type_selector'] == 
"advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + gatk_param_type['gatk_param_type_selector'] == "advanced" and len( gatk_param_type['input_interval_repeat'] ) + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_interval_statistics'] == False + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + input_calculate_coverage_over_genes is not None and 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['omit_depth_output_at_each_base'] == False + analysis_param_type['analysis_param_type_selector'] == "basic" or analysis_param_type['print_bin_endpoints_and_exit'] == False + 'library' in partition_type + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and aggregation. 
Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by mapping or base quality score. + +For more information on the GATK Depth of Coverage, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_annotator_DepthOfCoverage.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: DepthOfCoverage accepts aligned BAM input files. + + +**Outputs** + +The output is in various table formats. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + calculateCoverageOverGenes File NA Calculate the coverage statistics over this list of genes. Currently accepts RefSeq. + ignoreDeletionSites boolean false Ignore sites consisting only of deletions + includeDeletions boolean false Include information on deletions + maxBaseQuality byte 127 Maximum quality of bases to count towards depth. Defaults to 127 (Byte.MAX_VALUE). + maxMappingQuality int 2147483647 Maximum mapping quality of reads to count towards depth. Defaults to 2^31-1 (Integer.MAX_VALUE). + minBaseQuality byte -1 Minimum quality of bases to count towards depth. Defaults to -1. + minMappingQuality int -1 Minimum mapping quality of reads to count towards depth. Defaults to -1. 
+ nBins int 499 Number of bins to use for granular binning + omitDepthOutputAtEachBase boolean false Will omit the output of the depth of coverage at each base, which should result in speedup + omitIntervalStatistics boolean false Will omit the per-interval statistics section, which should result in speedup + omitLocusTable boolean false Will not calculate the per-sample per-depth counts of loci, which should result in speedup + omitPerSampleStats boolean false Omits the summary files per-sample. These statistics are still calculated, so this argument will not improve runtime. + outputFormat String rtable the format of the output file (e.g. csv, table, rtable); defaults to r-readable table + partitionType Set[Partition] [sample] Partition type for depth of coverage. Defaults to sample. Can be any combination of sample, readgroup, library. + printBaseCounts boolean false Will add base counts to per-locus output. + printBinEndpointsAndExit boolean false Prints the bin values and exits immediately. Use to calibrate what bins you want before running on data. + start int 1 Starting (left endpoint) for granular binning + stop int 500 Ending (right endpoint) for granular binning + summaryCoverageThreshold int[] [15] for summary file outputs, report the % of bases covered to &gt;= this number. Defaults to 15; can take multiple arguments. + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21478889&gt;`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). 
The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 gatk2_annotations.txt.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gatk2_annotations.txt.sample Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,30 @@ +#unique_id name gatk_value tools_valid_for +AlleleBalance AlleleBalance AlleleBalance UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +AlleleBalanceBySample AlleleBalanceBySample AlleleBalanceBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +BaseCounts BaseCounts BaseCounts UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +BaseQualityRankSumTest BaseQualityRankSumTest BaseQualityRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +ChromosomeCounts ChromosomeCounts ChromosomeCounts UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +DepthOfCoverage DepthOfCoverage DepthOfCoverage UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +DepthPerAlleleBySample DepthPerAlleleBySample DepthPerAlleleBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +FisherStrand FisherStrand FisherStrand UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +GCContent GCContent GCContent UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +HaplotypeScore HaplotypeScore HaplotypeScore UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +HardyWeinberg HardyWeinberg HardyWeinberg UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +HomopolymerRun HomopolymerRun HomopolymerRun UnifiedGenotyper,VariantAnnotator,VariantRecalibrator 
+InbreedingCoeff InbreedingCoeff InbreedingCoeff UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +IndelType IndelType IndelType UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +LowMQ LowMQ LowMQ UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +MVLikelihoodRatio MVLikelihoodRatio MVLikelihoodRatio UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +MappingQualityRankSumTest MappingQualityRankSumTest MappingQualityRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +MappingQualityZero MappingQualityZero MappingQualityZero UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +MappingQualityZeroBySample MappingQualityZeroBySample MappingQualityZeroBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +MappingQualityZeroFraction MappingQualityZeroFraction MappingQualityZeroFraction UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +NBaseCount NBaseCount NBaseCount UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +QualByDepth QualByDepth QualByDepth UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +RMSMappingQuality RMSMappingQuality RMSMappingQuality UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +ReadDepthAndAllelicFractionBySample ReadDepthAndAllelicFractionBySample ReadDepthAndAllelicFractionBySample UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +ReadPosRankSumTest ReadPosRankSumTest ReadPosRankSumTest UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +SampleList SampleList SampleList UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +SnpEff SnpEff SnpEff VariantAnnotator,VariantRecalibrator +SpanningDeletions SpanningDeletions SpanningDeletions UnifiedGenotyper,VariantAnnotator,VariantRecalibrator +TechnologyComposition TechnologyComposition TechnologyComposition UnifiedGenotyper,VariantAnnotator,VariantRecalibrator diff -r 000000000000 -r 74c05070a3f8 gatk2_sorted_picard_index.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gatk2_sorted_picard_index.loc.sample Thu Nov 01 
13:53:22 2012 -0400 @@ -0,0 +1,30 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Picard dict and associated files. You will need +#to create these data files and then create a picard_index.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The picard_index.loc +#file has this format (longer white space is the TAB character): +# +#<unique_build_id> <dbkey> <display_name> <fasta_file_path> +# +#So, for example, if you had hg18 indexed and stored in +#/depot/data2/galaxy/picard/hg18/, +#then the picard_index.loc entry would look like this: +# +#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa +# +#and your /depot/data2/galaxy/picard/hg18/ directory +#would contain the following three files: +#hg18.fa +#hg18.dict +#hg18.fa.fai +# +#The dictionary file for each reference (ex. hg18.dict) must be +#created via Picard (http://picard.sourceforge.net). Note that +#the dict file does not have the .fa extension although the +#path list in the loc file does include it. +# +hg18 hg18 hg18 /data/galaxy/ext-tool-data/picard/hg18.fa +hg19 hg19 hg19 /data/galaxy/ext-tool-data/picard/hg19.fa +mm8 mm8 mm8 /data/galaxy/ext-tool-data/picard/mm8.fa +mm9 mm9 mm9 /data/galaxy/ext-tool-data/picard/mm9.fa diff -r 000000000000 -r 74c05070a3f8 gatk2_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gatk2_wrapper.py Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,128 @@ +#!/usr/bin/env python +#David Hoover, based on gatk by Dan Blankenberg + +""" +A wrapper script for running the GenomeAnalysisTK.jar commands. 
+""" + +import sys, optparse, os, tempfile, subprocess, shutil +from binascii import unhexlify +from string import Template + +GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbSNP', 'picard_interval_list':'interval_list' } #items not listed here will use the galaxy extension as-is +GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed +DEFAULT_GATK_PREFIX = "gatk_file" +CHUNK_SIZE = 2**20 #1mb + + +def cleanup_before_exit( tmp_dir ): + if tmp_dir and os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +def gatk_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ): + suffix = GALAXY_EXT_TO_GATK_EXT.get( galaxy_ext, galaxy_ext ) + if prefix is None: + prefix = DEFAULT_GATK_PREFIX + if target_dir is None: + target_dir = os.getcwd() + gatk_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) ) + os.symlink( galaxy_filename, gatk_filename ) + return gatk_filename + +def gatk_filetype_argument_substitution( argument, galaxy_ext ): + return argument % dict( file_type = GALAXY_EXT_TO_GATK_FILE_TYPE.get( galaxy_ext, galaxy_ext ) ) + +def open_file_from_option( filename, mode = 'rb' ): + if filename: + return open( filename, mode = mode ) + return None + +def html_report_from_directory( html_out, dir ): + html_out.write( '\n\nGalaxy - GATK Output\n\n\n

\n

    \n' ) + for fname in sorted( os.listdir( dir ) ): + html_out.write( '
  • %s
  • \n' % ( fname, fname ) ) + html_out.write( '
\n\n\n' ) + +def index_bam_files( bam_filenames, tmp_dir ): + for bam_filename in bam_filenames: + bam_index_filename = "%s.bai" % bam_filename + if not os.path.exists( bam_index_filename ): + #need to index this bam file + stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name + command = 'samtools index %s %s' % ( bam_filename, bam_index_filename ) + proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) ) + return_code = proc.wait() + if return_code: + for line in open( stderr_name ): + print >> sys.stderr, line + os.unlink( stderr_name ) #clean up + cleanup_before_exit( tmp_dir ) + raise Exception( "Error indexing BAM file" ) + os.unlink( stderr_name ) #clean up + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' ) + parser.add_option( '-o', '--pass_through_options', dest='pass_through_options_encoded', action='append', type="string", help='These options are passed through directly to GATK, with decoding from binascii.unhexlify.' ) + parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' ) + parser.add_option( '', '--max_jvm_heap', dest='max_jvm_heap', action='store', type="string", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value.' ) + parser.add_option( '', '--max_jvm_heap_fraction', dest='max_jvm_heap_fraction', action='store', type="int", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value as a fraction of total physical memory.' 
) + parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) + parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) + parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"') + parser.add_option( '-e', '--phone_home', dest='phone_home', action='store', type="string", default='STANDARD', help='What kind of GATK run report should we generate(NO_ET|STANDARD|STDOUT)' ) + parser.add_option( '-K', '--gatk_key', dest='gatk_key', action='store', type="string", default=None, help='What kind of GATK run report should we generate(NO_ET|STANDARD|STDOUT)' ) + (options, args) = parser.parse_args() + + tmp_dir = tempfile.mkdtemp( prefix='tmp-gatk-' ) + if options.pass_through_options: + cmd = ' '.join( options.pass_through_options ) + else: + cmd = '' + if options.pass_through_options_encoded: + cmd = '%s %s' % ( cmd, ' '.join( map( unhexlify, options.pass_through_options_encoded ) ) ) + if options.max_jvm_heap is not None: + cmd = cmd.replace( 'java ', 'java -Xmx%s ' % ( options.max_jvm_heap ), 1 ) + elif options.max_jvm_heap_fraction is not None: + cmd = cmd.replace( 'java ', 'java -XX:DefaultMaxRAMFraction=%s -XX:+UseParallelGC ' % ( options.max_jvm_heap_fraction ), 1 ) + bam_filenames = [] + if options.datasets: + for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets: + gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix ) + if dataset_arg: + cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename ) + if galaxy_ext == "bam": + bam_filenames.append( gatk_filename ) + index_bam_files( bam_filenames, tmp_dir ) + #set up stdout 
and stderr output options + stdout = open_file_from_option( options.stdout, mode = 'wb' ) + stderr = open_file_from_option( options.stderr, mode = 'wb' ) + #if no stderr file is specified, we'll use our own + if stderr is None: + stderr = tempfile.NamedTemporaryFile( prefix="gatk-stderr-", dir=tmp_dir ) + + proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir ) + return_code = proc.wait() + + if return_code: + stderr_target = sys.stderr + else: + stderr_target = sys.stdout + stderr.flush() + stderr.seek(0) + while True: + chunk = stderr.read( CHUNK_SIZE ) + if chunk: + stderr_target.write( chunk ) + else: + break + stderr.close() + #generate html reports + if options.html_report_from_directory: + for ( html_filename, html_dir ) in options.html_report_from_directory: + html_report_from_directory( open( html_filename, 'wb' ), html_dir ) + + cleanup_before_exit( tmp_dir ) + +if __name__=="__main__": __main__() diff -r 000000000000 -r 74c05070a3f8 gatk_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gatk_wrapper.py Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,126 @@ +#!/usr/bin/env python +#Dan Blankenberg + +""" +A wrapper script for running the GenomeAnalysisTK.jar commands. 
+""" + +import sys, optparse, os, tempfile, subprocess, shutil +from binascii import unhexlify +from string import Template + +GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbSNP', 'picard_interval_list':'interval_list' } #items not listed here will use the galaxy extension as-is +GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed +DEFAULT_GATK_PREFIX = "gatk_file" +CHUNK_SIZE = 2**20 #1mb + + +def cleanup_before_exit( tmp_dir ): + if tmp_dir and os.path.exists( tmp_dir ): + shutil.rmtree( tmp_dir ) + +def gatk_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ): + suffix = GALAXY_EXT_TO_GATK_EXT.get( galaxy_ext, galaxy_ext ) + if prefix is None: + prefix = DEFAULT_GATK_PREFIX + if target_dir is None: + target_dir = os.getcwd() + gatk_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) ) + os.symlink( galaxy_filename, gatk_filename ) + return gatk_filename + +def gatk_filetype_argument_substitution( argument, galaxy_ext ): + return argument % dict( file_type = GALAXY_EXT_TO_GATK_FILE_TYPE.get( galaxy_ext, galaxy_ext ) ) + +def open_file_from_option( filename, mode = 'rb' ): + if filename: + return open( filename, mode = mode ) + return None + +def html_report_from_directory( html_out, dir ): + html_out.write( '\n\nGalaxy - GATK Output\n\n\n

\n

    \n' ) + for fname in sorted( os.listdir( dir ) ): + html_out.write( '
  • %s
  • \n' % ( fname, fname ) ) + html_out.write( '
\n\n\n' ) + +def index_bam_files( bam_filenames, tmp_dir ): + for bam_filename in bam_filenames: + bam_index_filename = "%s.bai" % bam_filename + if not os.path.exists( bam_index_filename ): + #need to index this bam file + stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name + command = 'samtools index %s %s' % ( bam_filename, bam_index_filename ) + proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) ) + return_code = proc.wait() + if return_code: + for line in open( stderr_name ): + print >> sys.stderr, line + os.unlink( stderr_name ) #clean up + cleanup_before_exit( tmp_dir ) + raise Exception( "Error indexing BAM file" ) + os.unlink( stderr_name ) #clean up + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' ) + parser.add_option( '-o', '--pass_through_options', dest='pass_through_options_encoded', action='append', type="string", help='These options are passed through directly to GATK, with decoding from binascii.unhexlify.' ) + parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' ) + parser.add_option( '', '--max_jvm_heap', dest='max_jvm_heap', action='store', type="string", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value.' ) + parser.add_option( '', '--max_jvm_heap_fraction', dest='max_jvm_heap_fraction', action='store', type="int", default=None, help='If specified, the maximum java virtual machine heap size will be set to the provide value as a fraction of total physical memory.' 
) + parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) + parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) + parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"') + (options, args) = parser.parse_args() + + tmp_dir = tempfile.mkdtemp( prefix='tmp-gatk-' ) + if options.pass_through_options: + cmd = ' '.join( options.pass_through_options ) + else: + cmd = '' + if options.pass_through_options_encoded: + cmd = '%s %s' % ( cmd, ' '.join( map( unhexlify, options.pass_through_options_encoded ) ) ) + if options.max_jvm_heap is not None: + cmd = cmd.replace( 'java ', 'java -Xmx%s ' % ( options.max_jvm_heap ), 1 ) + elif options.max_jvm_heap_fraction is not None: + cmd = cmd.replace( 'java ', 'java -XX:DefaultMaxRAMFraction=%s -XX:+UseParallelGC ' % ( options.max_jvm_heap_fraction ), 1 ) + bam_filenames = [] + if options.datasets: + for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets: + gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix ) + if dataset_arg: + cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename ) + if galaxy_ext == "bam": + bam_filenames.append( gatk_filename ) + index_bam_files( bam_filenames, tmp_dir ) + #set up stdout and stderr output options + stdout = open_file_from_option( options.stdout, mode = 'wb' ) + stderr = open_file_from_option( options.stderr, mode = 'wb' ) + #if no stderr file is specified, we'll use our own + if stderr is None: + stderr = tempfile.NamedTemporaryFile( prefix="gatk-stderr-", dir=tmp_dir ) + + proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, 
shell=True, cwd=tmp_dir ) + return_code = proc.wait() + + if return_code: + stderr_target = sys.stderr + else: + stderr_target = sys.stdout + stderr.flush() + stderr.seek(0) + while True: + chunk = stderr.read( CHUNK_SIZE ) + if chunk: + stderr_target.write( chunk ) + else: + break + stderr.close() + #generate html reports + if options.html_report_from_directory: + for ( html_filename, html_dir ) in options.html_report_from_directory: + html_report_from_directory( open( html_filename, 'wb' ), html_dir ) + + cleanup_before_exit( tmp_dir ) + +if __name__=="__main__": __main__() diff -r 000000000000 -r 74c05070a3f8 indel_realigner.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/indel_realigner.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,495 @@ + + - perform local realignment + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "IndelRealigner" + -o "${output_bam}" + \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + -LOD "${lod_threshold}" + ${knowns_only} + ' + + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = 
$rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "-known:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != 
"NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + -d "-targetIntervals" "${target_intervals}" "${target_intervals.ext}" "gatk_target_intervals" + -p ' + --disable_bam_indexing + ' + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --entropyThreshold "${analysis_param_type.entropy_threshold}" + ${analysis_param_type.simplify_bam} + --consensusDeterminationModel "${analysis_param_type.consensus_determination_model}" + --maxIsizeForMovement "${analysis_param_type.max_insert_size_for_movement}" + --maxPositionalMoveAllowed "${analysis_param_type.max_positional_move_allowed}" + --maxConsensuses "${analysis_param_type.max_consensuses}" + 
--maxReadsForConsensuses "${analysis_param_type.max_reads_for_consensuses}" + --maxReadsForRealignment "${analysis_param_type.max_reads_for_realignment}" + ${analysis_param_type.no_original_alignment_tags} + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Performs local realignment of reads based on misalignments due to the presence of indels. Unlike most mappers, this walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. + +For more information on local realignment around indels using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_indels_IndelRealigner.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: IndelRealigner accepts an aligned BAM and a list of intervals to realign as input files. + + +**Outputs** + +The output is in the BAM format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. 
+ +------- + +**Settings**:: + + targetIntervals intervals file output from RealignerTargetCreator + LODThresholdForCleaning LOD threshold above which the cleaner will clean + entropyThreshold percentage of mismatches at a locus to be considered having high entropy + out Output bam + bam_compression Compression level to use for writing BAM files + disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. + simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier + useOnlyKnownIndels Don't run 'Smith-Waterman' to generate alternate consensuses; use only known indels provided as RODs for constructing the alternate references. + maxReadsInMemory max reads allowed to be kept in memory at a time by the SAMFileWriter. Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory). + maxIsizeForMovement maximum insert size of read pairs that we attempt to realign + maxPositionalMoveAllowed maximum positional move in basepairs that a read can be adjusted during realignment + maxConsensuses max alternate consensuses to try (necessary to improve performance in deep coverage) + maxReadsForConsensuses max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage) + maxReadsForRealignment max reads allowed at an interval for realignment; if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is + noOriginalAlignmentTags Don't output the original cigar or alignment start tags for each realigned read in the output bam. 
+ +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21478889&gt;`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20644199&gt;`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 print_reads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/print_reads.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,519 @@ + + on BAM files + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "PrintReads" + -o "${output_bam}" + \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##not supported yet + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --BQSR "${input_recal}" + --disable_bam_indexing + ' + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end 
if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( 
$reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": + --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" + #end if + #if str( $analysis_param_type.default_platform ) != "default": + --default_platform "${analysis_param_type.default_platform}" + #end if + #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": + --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" + #end if + #if str( $analysis_param_type.force_platform ) != "default": + --force_platform "${analysis_param_type.force_platform}" + #end if + ${analysis_param_type.exception_if_no_tile} + #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": + #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": + --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" + #end if + #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": + --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" + #end if + #end if + ${analysis_param_type.simplify_bam} + --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}" + --smoothing "${analysis_param_type.smoothing}" + --max_quality_score "${analysis_param_type.max_quality_score}" + --window_size_nqs "${analysis_param_type.window_size_nqs}" + --homopolymer_nback "${analysis_param_type.homopolymer_nback}" + ${analysis_param_type.do_not_write_original_quals} + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads. Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + +For more information on base quality score recalibration using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_PrintReads.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: PrintReads accepts an aligned BAM and a recalibration CSV input files. + + +**Outputs** + +The output is in BAM format. 
+ + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + default_read_group If a read has no read group then default to the provided String. + default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. + force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. + force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. + window_size_nqs The window size used by MinimumNQSCovariate for its calculation + homopolymer_nback The number of previous bases to look at in HomopolymerCovariate + exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 + solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) + solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) + recal_file Filename for the input covariates table recalibration .csv file + out The output BAM file + bam_compression Compression level to use for writing BAM files + disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. 
+ simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier + preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases + smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1 + max_quality_score The integer value at which to cap the quality scores, default=50 + doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 realigner_target_creator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/realigner_target_creator.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,452 @@ + + for use in local realignment + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "RealignerTargetCreator" + -o "${output_interval}" + \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "-known:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for 
$pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + 
${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --minReadsAtLocus "${analysis_param_type.minReadsAtLocus}" + --windowSize "${analysis_param_type.windowSize}" + --mismatchFraction "${analysis_param_type.mismatchFraction}" + --maxIntervalSize "${analysis_param_type.maxIntervalSize}" + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. 
+ +For more information on local realignment around indels using the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_indels_RealignerTargetCreator.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file. + + +**Outputs** + +The output is in GATK Interval format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + windowSize window size for calculating entropy or SNP clusters + mismatchFraction fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1 + minReadsAtLocus minimum reads at a locus to enable using the entropy calculation + maxIntervalSize maximum interval size + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. 
<http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,12 @@ + + + + value, dbkey, name, path + +
+ + + value, name, gatk_value, tools_valid_for + +
+
diff -r 000000000000 -r 74c05070a3f8 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,66 @@ + + + + + + ftp://ftp.broadinstitute.org/pub/gsa/GenomeAnalysisTK/GenomeAnalysisTKLite-latest.tar.bz2 + + GenomeAnalysisTKLite.jar + $INSTALL_DIR/GenomeAnalysisTK.jar + + + $INSTALL_DIR + + + --num_threads 4 --num_cpu_threads_per_data_thread 3 --phone_home STANDARD + + + + +The following GATK2 analysis types require a License for commercial use and the full GATK2 build will need to be manually installed: + + haplotypecaller + HaplotypeCaller Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an + active region. + HaplotypeResolver Haplotype-based resolution of variants in 2 different eval files. + + reducereads + CompareBAM Given two BAMs with different read groups, it compares them based on ReduceReads + metrics. + ReduceReads Reduces the BAM file using read based compression that keeps only essential information + for variant calling + + +See: http://www.appistry.com/gatk/gatk-faqs-static for Licensing details. +The full GATK build can be acquired from: http://www.broadinstitute.org/gatk/download +and GenomeAnalysisTK.jar needs to be manually installed into the GATK2_PATH set for the GenomeAnalysisTKLite.jar + +The GATK2_SITE_OPTIONS environment variable can be modified in the env.sh in the tool_dependencies path +for the local installation. + + + + + + + http://sourceforge.net/projects/samtools/files/samtools/0.1.18/samtools-0.1.18.tar.bz2 + sed -i.bak -e 's/-lcurses/-lncurses/g' Makefile + make + + samtools + $INSTALL_DIR/bin + + + misc/maq2sam-long + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + + + + +Compiling SAMtools requires the ncurses and zlib development libraries. 
+ + + diff -r 000000000000 -r 74c05070a3f8 unified_genotyper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unified_genotyper.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,614 @@ + + SNP and indel caller + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $i, $input_bam in enumerate( $reference_source.input_bams ): + -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" + #if str( $input_bam.input_bam.metadata.bam_index ) != "None": + -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index + #end if + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "UnifiedGenotyper" + ##--num_threads 4 ##hard coded, for now + --out "${output_vcf}" + --metrics_file "${output_metrics}" + \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --genotype_likelihoods_model "${genotype_likelihoods_model}" + --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}" + --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}" + ' + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + -d "--dbsnp:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" 
"input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty 
"${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --p_nonref_model "${analysis_param_type.p_nonref_model}" + --heterozygosity "${analysis_param_type.heterozygosity}" + --pcr_error_rate "${analysis_param_type.pcr_error_rate}" + --genotyping_mode "${analysis_param_type.genotyping_mode_type.genotyping_mode}" + #if str( $analysis_param_type.genotyping_mode_type.genotyping_mode ) == 'GENOTYPE_GIVEN_ALLELES': + --alleles "${analysis_param_type.genotyping_mode_type.input_alleles_rod}" + #end if + --output_mode "${analysis_param_type.output_mode}" + ${analysis_param_type.compute_SLOD} + --min_base_quality_score "${analysis_param_type.min_base_quality_score}" + --max_deletion_fraction "${analysis_param_type.max_deletion_fraction}" + --max_alternate_alleles "${analysis_param_type.max_alternate_alleles}" + --min_indel_count_for_genotyping 
"${analysis_param_type.min_indel_count_for_genotyping}" + --indel_heterozygosity "${analysis_param_type.indel_heterozygosity}" + --indelGapContinuationPenalty "${analysis_param_type.indelGapContinuationPenalty}" + --indelGapOpenPenalty "${analysis_param_type.indelGapOpenPenalty}" + --indelHaplotypeSize "${analysis_param_type.indelHaplotypeSize}" + ${analysis_param_type.doContextDependentGapPenalties} + #if str( $analysis_param_type.annotation ) != "None": + #for $annotation in str( $analysis_param_type.annotation.fields.gatk_value ).split( ','): + --annotation "${annotation}" + #end for + #end if + #for $additional_annotation in $analysis_param_type.additional_annotations: + --annotation "${additional_annotation.additional_annotation_name}" + #end for + #if str( $analysis_param_type.group ) != "None": + #for $group in str( $analysis_param_type.group ).split( ','): + --group "${group}" + #end for + #end if + #if str( $analysis_param_type.exclude_annotations ) != "None": + #for $annotation in str( $analysis_param_type.exclude_annotations.fields.gatk_value ).split( ','): + --excludeAnnotation "${annotation}" + #end for + #end if + ${analysis_param_type.multiallelic} + ' +## #if str( $analysis_param_type.snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff': +## -p '--annotation "SnpEff"' +## -d "--snpEffFile:${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod}" "${analysis_param_type.snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${analysis_param_type.snpEff_rod_bind_type.snpEff_rod_name}" +## #else: +## -p '--excludeAnnotation "SnpEff"' +## #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +A variant caller which unifies the approaches of several disparate callers. Works for single-sample and multi-sample data. The user can choose from several different incorporated calculation models. + +For more information on the GATK Unified Genotyper, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_genotyper_UnifiedGenotyper.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + genotype_likelihoods_model Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH) + p_nonref_model Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available. 
(EXACT|GRID_SEARCH) + heterozygosity Heterozygosity value used to compute prior likelihoods for any locus + pcr_error_rate The PCR error rate to be used for computing fragment-based likelihoods + genotyping_mode Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES) + output_mode Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES) + standard_min_confidence_threshold_for_calling The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called + standard_min_confidence_threshold_for_emitting The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold) + noSLOD If provided, we will not calculate the SLOD + min_base_quality_score Minimum base quality required to consider a base for calling + max_deletion_fraction Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05] + min_indel_count_for_genotyping Minimum number of consensus indels required to trigger genotyping run + indel_heterozygosity Heterozygosity for indel calling + indelGapContinuationPenalty Indel gap continuation penalty + indelGapOpenPenalty Indel gap open penalty + indelHaplotypeSize Indel haplotype size + doContextDependentGapPenalties Vary gap penalties by context + indel_recal_file Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE + indelDebug Output indel debug info + out File to which variants should be written + annotation One or more specific annotations to apply to variant calls + group One or more classes/groups of annotations to apply to variant calls + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, 
Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also site `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_annotator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_annotator.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,543 @@ + + + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #if str( $reference_source.input_bam ) != "None": + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + #end if + -d "--variant" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + ##--list + -T "VariantAnnotator" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use 
this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + -o "${output_vcf}" + #if str( $annotations_type.annotations_type_selector ) == "use_all_annotations": + --useAllAnnotations + #else: + #if $annotations_type.annotations: + #for $annotation in str( $annotations_type.annotations.fields.gatk_value ).split( ',' ): + --annotation "${annotation}" + #end for + #end if + #end if + #if $exclude_annotations: + #for $annotation in str( $exclude_annotations.fields.gatk_value ).split( ',' ): + --excludeAnnotation "${annotation}" + #end for + #end if + #for $additional_annotation in $additional_annotations: + --annotation "${additional_annotation.additional_annotation_name}" + #end for + ' + #if $reference_source.input_variant_bti: + -d "--intervals" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant_bti" + #end if + + #for $rod_binding in $comp_rod_bind: + -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}" + #end for + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #end if + + + #for $rod_binding in $resource_rod_bind: + -d "--resource:${rod_binding.resource_rod_name},%(file_type)s" "${rod_binding.resource_input_rod}" "${rod_binding.resource_input_rod.ext}" "input_resource_${rod_binding.resource_rod_name}" + #end for + + #if str( $snpEff_rod_bind_type.snpEff_rod_bind_type_selector ) == 'set_snpEff': + -p '--annotation "SnpEff"' + -d "--snpEffFile:${snpEff_rod_bind_type.snpEff_rod_name},%(file_type)s" "${snpEff_rod_bind_type.snpEff_input_rod}" 
"${snpEff_rod_bind_type.snpEff_input_rod.ext}" "input_snpEff_${snpEff_rod_bind_type.snpEff_rod_name}" + #else: + -p '--excludeAnnotation "SnpEff"' + #end if + + #for $expression in $expressions: + -p '--expression "${expression.expression}"' + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} 
"${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + -p ' + #if str( $annotation_group ) != "None": + #for $group in str( $annotation_group ).split( ',' ): + --group "${group}" + #end for + #end if + #if str( $family_string ) != "": + --family_string "${family_string}" + #end if + --MendelViolationGenotypeQualityThreshold "${mendel_violation_genotype_quality_threshold}" + ' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Annotates variant calls with context information. Users can specify which of the available annotations to use. + +For more information on using the VariantAnnotator, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_annotator_VariantAnnotator.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + + +**Inputs** + +GenomeAnalysisTK: VariantAnnotator accepts a variant input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + sampleName The sample (NA-ID) corresponding to the variant input (for non-VCF input only) + annotation One or more specific annotations to apply to variant calls + group One or more classes/groups of annotations to apply to variant calls + expression One or more specific expressions to apply to variant calls; see documentation for more details + useAllAnnotations Use all possible annotations (not for the faint of heart) + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 
2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_apply_recalibration.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_apply_recalibration.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,416 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--input:input_${var_count},%(file_type)s" "${variant.input_variants}" "${variant.input_variants.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "ApplyRecalibration" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --recal_file "${reference_source.input_recal}" + --tranches_file "${reference_source.input_tranches}" + --out "${output_variants}" + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p 
'--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + 
${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + -p ' + --mode "${mode}" + + #for $ignore_filter in $ignore_filters: + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector ) + #if $ignore_filter_name == "custom": + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name ) + #end if + --ignore_filter "${ignore_filter_name}" + #end for + --ts_filter_level "${ts_filter_level}" + ' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + +For more information on using the ApplyRecalibration module, see this `tool specific page 
<http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantrecalibration_ApplyRecalibration.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: ApplyRecalibration accepts a variant input file, a recalibration file and a tranches file. + + +**Outputs** + +The output is in VCF format. + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + recal_file The input recal file used by ApplyRecalibration + tranches_file The input tranches file describing where to cut the data + out The output filtered, recalibrated VCF file + ts_filter_level The truth sensitivity level at which to start filtering + ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file + mode Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously. (SNP|INDEL|BOTH) + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). 
The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_combine.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_combine.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,458 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + + #set $priority_order = [] + #for $input_variant in $reference_source.input_variants: + -d "--variant:${input_variant.input_variant_name},%(file_type)s" "${input_variant.input_variant}" "${input_variant.input_variant.ext}" "input_variant_${input_variant.input_variant_name}" + #set $input_variant_name = str( $input_variant.input_variant_name ) + #assert $input_variant_name not in $priority_order, "Variant Names must be unique" ##this should be handled by a validator + #silent $priority_order.append( $input_variant_name ) + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "CombineVariants" + --out "${output_variants}" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --genotypemergeoption "${genotype_merge_option}" + --rod_priority_list "${ ','.join( $priority_order ) }" + ' + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in 
$gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + 
--interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --filteredrecordsmergetype "${analysis_param_type.filtered_records_merge_type}" + ${analysis_param_type.print_complex_merges} + ${analysis_param_type.filtered_are_uncalled} + ${analysis_param_type.minimal_vcf} + ${analysis_param_type.assume_identical_samples} + + #if str( $analysis_param_type.set_key ): + --setKey "${analysis_param_type.set_key}" + #end if + + --minimumN "${analysis_param_type.minimum_n}" + ' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Combines VCF records from different sources; supports both full merges and set unions. Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. Union: assumes each rod represents the same set of samples (although this is not enforced); using the priority list (if provided), emits a single record instance at every position represented in the rods. 
+ +For more information on using the CombineVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_CombineVariants.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: CombineVariants accepts variant files as input. + +------ + +**Outputs** + +The output is a combined vcf file. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + out File to which variants should be written + genotypemergeoption How should we merge genotype records for samples shared across the ROD files? (UNIQUIFY|PRIORITIZE|UNSORTED|REQUIRE_UNIQUE) + filteredrecordsmergetype How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered (KEEP_IF_ANY_UNFILTERED|KEEP_IF_ALL_UNFILTERED) + rod_priority_list When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided + printComplexMerges Print out interesting sites requiring complex compatibility merging + filteredAreUncalled If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF + minimalVCF If true, then the output VCF will contain no INFO or genotype INFO field + setKey Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted. 
+ assumeIdenticalSamples If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime. + minimumN Combine variants and output site only if variant is present in at least N input files. + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_eval.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_eval.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,574 @@ + + + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--eval:input_${var_count},%(file_type)s" "${variant.input_variant}" "${variant.input_variant.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "VariantEval" + --out "${output_report}" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + + #for $rod_binding in $comp_rod_bind: + -d "--comp:${rod_binding.comp_rod_name},%(file_type)s" "${rod_binding.comp_input_rod}" "${rod_binding.comp_input_rod.ext}" "input_comp_${rod_binding.comp_rod_name}" + #if str( $rod_binding.comp_known_names ): + -p '--known_names "${rod_binding.comp_rod_name}"' + #end if + #end for + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #if str( $dbsnp_rod_bind_type.dbsnp_known_names ): + -p '--known_names "${dbsnp_rod_bind_type.dbsnp_rod_name}"' + #end if + #end if + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in 
$gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + 
${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + #for $stratification in $analysis_param_type.stratifications: + #set $select_string = "--select_exps '%s' --select_names '%s'" % ( str( $stratification.select_exps ), str( $stratification.select_name ) ) + -o '${ hexlify( $select_string ) }' + #end for + -p ' + + #for $sample in $analysis_param_type.samples: + --sample "${sample.sample}" + #end for + + #if str( $analysis_param_type.stratification_modules ) != "None": + #for $stratification_module in str( $analysis_param_type.stratification_modules).split( ',' ): + --stratificationModule "${stratification_module}" + #end for + #end if + + ${analysis_param_type.do_not_use_all_standard_stratifications} + + #for $variant_type in $analysis_param_type.only_variants_of_type: + --onlyVariantsOfType "${variant_type.variant_type}" + #end for + + #if str( $analysis_param_type.eval_modules ) != "None": + #for $eval_module in str( $analysis_param_type.eval_modules).split( ',' ): + --evalModule "${eval_module}" + #end for + #end if + + ${analysis_param_type.do_not_use_all_standard_modules} + + #if str( 
$analysis_param_type.num_samples ) != "0": + --numSamples "${analysis_param_type.num_samples}" + #end if + + --minPhaseQuality "${analysis_param_type.min_phase_quality}" + + #if str( $analysis_param_type.family ): + --family_structure "${analysis_param_type.family}" + #end if + + --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}" + + #if str( $analysis_param_type.ancestral_alignments ) != "None": + --ancestralAlignments "${analysis_param_type.ancestral_alignments}" + #end if + ' + #if str( $analysis_param_type.known_cnvs ) != "None": + -d "--knownCNVs" "${analysis_param_type.known_cnvs}" "${analysis_param_type.known_cnvs.ext}" "input_known_cnvs" + #end if + + #if str( $analysis_param_type.strat_intervals ) != "None": + -d "--stratIntervals" "${analysis_param_type.strat_intervals}" "${analysis_param_type.strat_intervals.ext}" "input_strat_intervals" + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more) + +For more information on using the VariantEval module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_varianteval_VariantEval.html>`_. 
+ +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantEval accepts variant files as input. + + +**Outputs** + +The output is a table of variant evaluation. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + out An output file presented to the walker. Will overwrite contents if file exists. + list List the available eval modules and exit + select_exps One or more stratifications to use when evaluating the data + select_names Names to use for the list of stratifications (must be a 1-to-1 mapping) + sample Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context + known_names Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets + stratificationModule One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified) + doNotUseAllStandardStratifications Do not use the standard stratification modules by default (instead, only those that are specified with the -S option) + onlyVariantsOfType If provided, only variants of these types will be considered during the evaluation + evalModule One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified) + doNotUseAllStandardModules Do not use the standard modules by default (instead, only those that are specified with the -E option) + numSamples Number of samples (used if no samples are available in the VCF file) + minPhaseQuality Minimum phasing quality + family_structure If provided, 
genotypes will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined + mendelianViolationQualThreshold Minimum genotype QUAL score for each trio member required to accept a site as a violation + ancestralAlignments Fasta file with ancestral alleles + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_filtration.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_filtration.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,460 @@ + + on VCF files + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "VariantFiltration" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + -o "${output_vcf}" + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + #for $variant_filter in $variant_filters: + #set $variant_filter = "--%sExpression '%s' --%sName '%s'" % ( str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_expression ), str( $variant_filter.is_genotype_filter ), str( $variant_filter.filter_name ) ) + -o '${ hexlify( $variant_filter ) }' + #end for + + #if str( $mask_rod_bind_type.mask_rod_bind_type_selector ) == 'set_mask': + -d "--mask:${mask_rod_bind_type.mask_rod_name},%(file_type)s" "${mask_rod_bind_type.input_mask_rod}" "${mask_rod_bind_type.input_mask_rod.ext}" "input_mask_${mask_rod_bind_type.mask_rod_name}" + -p ' + --maskExtension "${mask_rod_bind_type.mask_extension}" + --maskName "${mask_rod_bind_type.mask_rod_name}" + ' + #end if + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString 
"${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + 
#for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $cluster_snp_type.cluster_snp_type_selector == "cluster_snp": + -p ' + --clusterSize "${cluster_snp_type.cluster_size}" + --clusterWindowSize "${cluster_snp_type.cluster_window_size}" + ' + #end if + -p '${missing_values_in_expressions_should_evaluate_as_failing}' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Filters variant calls using a number of user-selectable, parameterizable criteria. + +For more information on using the VariantFiltration module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_filters_VariantFiltration.html>`_. 
+ +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantFiltration accepts a VCF input file. + + +**Outputs** + +The output is in VCF format. + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + filterExpression One or more expression used with INFO fields to filter (see wiki docs for more info) + filterName Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered + genotypeFilterExpression One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info) + genotypeFilterName Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered + clusterSize The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3] + clusterWindowSize The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0] + maskName The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask'] + missingValuesInExpressionsShouldEvaluateAsFailing When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)? + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. 
A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + + diff -r 000000000000 -r 74c05070a3f8 variant_recalibrator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_recalibrator.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,718 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + #for $var_count, $variant in enumerate( $reference_source.variants ): + -d "--input:input_${var_count},%(file_type)s" "${variant.input_variants}" "${variant.input_variants.ext}" "input_variants_${var_count}" + #end for + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "VariantRecalibrator" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file"##ET no phone home + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + --recal_file "${output_recal}" + --tranches_file "${output_tranches}" + --rscript_file "${output_rscript}" + ' + + #set $rod_binding_names = dict() + #for $rod_binding in $rod_bind: + #if str( 
$rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': + #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name + #elif str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'comp': + #set $rod_bind_name = "comp" + $rod_binding.rod_bind_type.custom_rod_name + #else + #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector + #end if + #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 + #if $rod_binding.rod_bind_type.rod_training_type.rod_training_type_selector == "not_training_truth_known": + -d "--resource:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #else: + -d "--resource:${rod_bind_name},%(file_type)s,known=${rod_binding.rod_bind_type.rod_training_type.known},training=${rod_binding.rod_bind_type.rod_training_type.training},truth=${rod_binding.rod_bind_type.rod_training_type.truth},bad=${rod_binding.rod_bind_type.rod_training_type.bad},prior=${rod_binding.rod_bind_type.rod_training_type.prior}" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" + #end if + #end for + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ 
"__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list 
"${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + -p ' + #if str( $annotations ) != "None": + #for $annotation in str( $annotations.fields.gatk_value ).split( ',' ): + --use_annotation "${annotation}" + #end for + #end if + #for $additional_annotation in $additional_annotations: + --use_annotation "${additional_annotation.additional_annotation_name}" + #end for + --mode "${mode}" + ' + + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + --maxGaussians "${analysis_param_type.max_gaussians}" + --maxIterations "${analysis_param_type.max_iterations}" + --numKMeans "${analysis_param_type.num_k_means}" + --stdThreshold "${analysis_param_type.std_threshold}" + --qualThreshold "${analysis_param_type.qual_threshold}" + --shrinkage "${analysis_param_type.shrinkage}" + --dirichlet "${analysis_param_type.dirichlet}" + --priorCounts "${analysis_param_type.prior_counts}" + #if str( $analysis_param_type.bad_variant_selector.bad_variant_selector_type ) == 'percent': + --percentBadVariants "${analysis_param_type.bad_variant_selector.percent_bad_variants}" + #else: + --minNumBadVariants "${analysis_param_type.bad_variant_selector.min_num_bad_variants}" + #end if + --target_titv "${analysis_param_type.target_titv}" + #for $tranche in [ $tranche.strip() for $tranche in str( $analysis_param_type.ts_tranche ).split( ',' ) if $tranche.strip() ] + --TStranche "${tranche}" + #end for + #for $ignore_filter in $analysis_param_type.ignore_filters: + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.ignore_filter_type_selector ) + #if $ignore_filter_name == "custom": + #set $ignore_filter_name = str( $ignore_filter.ignore_filter_type.filter_name ) + #end if + 
--ignore_filter "${ignore_filter_name}" + #end for + --ts_filter_level "${analysis_param_type.ts_filter_level}" + ' + #end if + + + && + mv "${output_rscript}.pdf" "${output_tranches_pdf}" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score + +For more information on using the VariantRecalibrator module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantrecalibration_VariantRecalibrator.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: VariantRecalibrator accepts a variant input file. 
+ + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + tranches_file The output tranches file used by ApplyRecalibration + use_annotation The names of the annotations which should be used for calculations + mode Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both snps and indels simultaneously. (SNP|INDEL|BOTH) + maxGaussians The maximum number of Gaussians to try during variational Bayes algorithm + maxIterations The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected. + numKMeans The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model. + stdThreshold If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model. + qualThreshold If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model. + shrinkage The shrinkage parameter in variational Bayes algorithm. + dirichlet The dirichlet parameter in variational Bayes algorithm. + priorCounts The number of prior counts to use in variational Bayes algorithm. + percentBadVariants What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent. + minNumBadVariants The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary. + recal_file The output recal file used by ApplyRecalibration + target_titv The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. 
(approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES! + TStranche The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent) + ignore_filter If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file + path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript + rscript_file The output rscript file generated by the VQSR to aid in visualization of the input data and learned model + path_to_resources Path to resources folder holding the Sting R scripts. + ts_filter_level The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variant_select.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_select.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,577 @@ + + from VCF files + + gatk + + gatk2_wrapper.py + #from binascii import hexlify + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "SelectVariants" + \$GATK2_SITE_OPTIONS + ##--num_threads 4 ##hard coded, for now + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + -o "${output_vcf}" + + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ' + -p ' + #if $input_concordance: + --concordance "${input_concordance}" + #end if + #if $input_discordance: + --discordance "${input_discordance}" + #end if + + #for $exclude_sample_name in $exclude_sample_name_repeat: + --exclude_sample_name "${exclude_sample_name.exclude_sample_name}" + #end for + + ${exclude_filtered} + + #for $sample_name in $sample_name_repeat: + --sample_name "${sample_name.sample_name}" + #end for + + ' + + #for $select_expressions in $select_expressions_repeat: + #set $select_expression = "--select_expressions '%s'" % ( str( $select_expressions.select_expressions ) ) + -o '${ hexlify( $select_expression ) }' + #end for + + ##start tool specific options + #if str( $analysis_param_type.analysis_param_type_selector ) == 'advanced': + -p ' + #for $exclude_sample_file in $analysis_param_type.exclude_sample_file_repeat: + --exclude_sample_file "${exclude_sample_file.exclude_sample_file}" + #end for + + #for $sample_file in $analysis_param_type.sample_file_repeat: + --sample_file "${sample_file.sample_file}" + #end for + + #if $analysis_param_type.input_keep_ids: + --keepIDs 
"${analysis_param_type.input_keep_ids}" + #end if + + ${analysis_param_type.keep_original_AC} + + ${analysis_param_type.mendelian_violation} + + --mendelianViolationQualThreshold "${analysis_param_type.mendelian_violation_qual_threshold}" + + --remove_fraction_genotypes "${analysis_param_type.remove_fraction_genotypes}" + + --restrictAllelesTo "${analysis_param_type.restrict_alleles_to}" + + #if str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_fraction': + --select_random_fraction "${analysis_param_type.select_random_type.select_random_fraction}" + #elif str( $analysis_param_type.select_random_type.select_random_type_selector ) == 'select_random_number': + --select_random_number "${analysis_param_type.select_random_type.select_random_number}" + #end if + + #if $analysis_param_type.select_type_to_include: + #for $type_to_include in str( $analysis_param_type.select_type_to_include ).split( ',' ): + --selectTypeToInclude "${type_to_include}" + #end for + #end if + + ${analysis_param_type.exclude_non_variants} + ' + + #for $sample_expressions in $analysis_param_type.sample_expressions_repeat: + #set $sample_expression = "--sample_expressions '%s'" % ( str( $sample_expressions.sample_expressions ) ) + -o '${ hexlify( $sample_expression ) }' + #end for + + #end if + ##end tool specific options + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in 
$read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" 
"input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are documented in the `Using JEXL expressions section <http://gatkforums.broadinstitute.org/discussion/1255/what-are-jexl-expressions-and-how-can-i-use-them-with-the-gatk>`_. 
One can optionally include concordance or discordance tracks for use in selecting overlapping variants. + +For more information on using the SelectVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: SelectVariants accepts a VCF input file. + + +**Outputs** + +The output is in VCF format. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + + out VCFWriter stdout File to which variants should be written + variant RodBinding[VariantContext] NA Input VCF file + concordance RodBinding[VariantContext] none Output variants that were also called in this comparison track + discordance RodBinding[VariantContext] none Output variants that were not called in this comparison track + exclude_sample_file Set[File] [] File containing a list of samples (one per line) to exclude. Can be specified multiple times + exclude_sample_name Set[String] [] Exclude genotypes from this sample. 
Can be specified multiple times + excludeFiltered boolean false Don't include filtered loci in the analysis + excludeNonVariants boolean false Don't include loci found to be non-variant after the subsetting procedure + keepIDs File NA Only emit sites whose ID is found in this file (one ID per line) + keepOriginalAC boolean false Don't update the AC, AF, or AN values in the INFO field after selecting + mendelianViolation Boolean false output mendelian violation sites only + mvq double 0.0 Minimum genotype QUAL score for each trio member required to accept a site as a violation + remove_fraction_genotypes double 0.0 Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall + restrictAllelesTo NumberAlleleRestriction ALL Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC + sample_expressions Set[String] NA Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times + sample_file Set[File] NA File containing a list of samples (one per line) to include. Can be specified multiple times + sample_name Set[String] [] Include genotypes from this sample. Can be specified multiple times + select_expressions ArrayList[String] [] One or more criteria to use when selecting the data + select_random_fraction double 0.0 Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track + select_random_number int 0 Selects a number of variants at random from the variant track + selectTypeToInclude List[Type] [] Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. 
Can be specified multiple times + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. 
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r 000000000000 -r 74c05070a3f8 variants_validate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variants_validate.xml Thu Nov 01 13:53:22 2012 -0400 @@ -0,0 +1,401 @@ + + + + gatk + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "--variant:variant,%(file_type)s" "${reference_source.input_variant}" "${reference_source.input_variant.ext}" "input_variant" + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "ValidateVariants" + + \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "\$GATK2_BASE/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##hard coded, for now + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + ${warn_on_errors} + ${do_not_validate_filtered_records} + ' + + #if str( $dbsnp_rod_bind_type.dbsnp_rod_bind_type_selector ) == 'set_dbsnp': + -d "--dbsnp:${dbsnp_rod_bind_type.dbsnp_rod_name},%(file_type)s" "${dbsnp_rod_bind_type.dbsnp_input_rod}" "${dbsnp_rod_bind_type.dbsnp_input_rod.ext}" "input_dbsnp_${dbsnp_rod_bind_type.dbsnp_rod_name}" + #end if + + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + ###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( 
$param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" "${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' 
+ #end if + #end for + #end if + + #if $reference_source.reference_source_selector == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Validates a variants file. + +For more information on using the ValidateVariants module, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_ValidateVariants.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: ValidateVariants accepts variant files as input. + + +**Outputs** + +The output is a log of variant validation. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + doNotValidateFilteredRecords should we skip validation on filtered records? + warnOnErrors should we just emit warnings on errors instead of terminating the run? 
+ +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21. <http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + +