# HG changeset patch # User Jim Johnson # Date 1352392698 21600 # Node ID 5faf7ace8aeed4aca2c2ad859c976fdd794a3c51 # Parent a14e79e7ac758a39593344dd34c025ef6e9d409d Add HaplotypeCaller diff -r a14e79e7ac75 -r 5faf7ace8aee haplotype_caller.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/haplotype_caller.xml Thu Nov 08 10:38:18 2012 -0600 @@ -0,0 +1,639 @@ + + Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region + + gatk + samtools + + gatk2_wrapper.py + --max_jvm_heap_fraction "1" + --stdout "${output_log}" + -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" + #if str( $reference_source.input_bam.metadata.bam_index ) != "None": + -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index + #end if + -p 'java + -jar "\$GATK2_PATH/GenomeAnalysisTK.jar" + -T "HaplotypeCaller" + -o "${output_vcf}" + ## \$GATK2_SITE_OPTIONS + ##-et "NO_ET" -K "/data/galaxy/appList/GenomeAnalysisTK-2.0-36-gf5c1c1a/gatk2_key_file" ##ET no phone home + ##--num_threads 4 ##not supported yet + ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout + #if $reference_source.reference_source_selector != "history": + -R "${reference_source.ref_file.fields.path}" + #end if + #if str($input_recal) != 'None': + --BQSR "${input_recal}" + #end if + --disable_bam_indexing + ' + ##start standard gatk options + #if $gatk_param_type.gatk_param_type_selector == "advanced": + #for $pedigree in $gatk_param_type.pedigree: + -p '--pedigree "${pedigree.pedigree_file}"' + #end for + #for $pedigree_string in $gatk_param_type.pedigree_string_repeat: + -p '--pedigreeString "${pedigree_string.pedigree_string}"' + #end for + -p '--pedigreeValidationType "${gatk_param_type.pedigree_validation_type}"' + #for $read_filter in $gatk_param_type.read_filter: + -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" + 
###raise Exception( str( dir( $read_filter ) ) ) + #for $name, $param in $read_filter.read_filter_type.iteritems(): + #if $name not in [ "__current_case__", "read_filter_type_selector" ]: + #if hasattr( $param.input, 'truevalue' ): + ${param} + #else: + --${name} "${param}" + #end if + #end if + #end for + ' + #end for + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_interval_repeat ): + -d "--intervals" "${input_intervals.input_intervals}" "${input_intervals.input_intervals.ext}" "input_intervals_${interval_count}" + #end for + + #for $interval_count, $input_intervals in enumerate( $gatk_param_type.input_exclude_interval_repeat ): + -d "--excludeIntervals" "${input_intervals.input_exclude_intervals}" "${input_intervals.input_exclude_intervals.ext}" "input_exlude_intervals_${interval_count}" + #end for + + -p '--interval_set_rule "${gatk_param_type.interval_set_rule}"' + + -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' + #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": + -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' + #end if + -p ' + --baq "${gatk_param_type.baq}" + --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" + ${gatk_param_type.use_original_qualities} + --defaultBaseQualities "${gatk_param_type.default_base_qualities}" + --validation_strictness "${gatk_param_type.validation_strictness}" + --interval_merging "${gatk_param_type.interval_merging}" + ${gatk_param_type.disable_experimental_low_memory_sharding} + ${gatk_param_type.non_deterministic_random_seed} + ' + #for $rg_black_list_count, $rg_black_list in enumerate( $gatk_param_type.read_group_black_list_repeat ): + #if $rg_black_list.read_group_black_list_type.read_group_black_list_type_selector == "file": + -d "--read_group_black_list" 
"${rg_black_list.read_group_black_list_type.read_group_black_list}" "txt" "input_read_group_black_list_${rg_black_list_count}" + #else + -p '--read_group_black_list "${rg_black_list.read_group_black_list_type.read_group_black_list}"' + #end if + #end for + #end if + + #if str( $reference_source.reference_source_selector ) == "history": + -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" + #end if + ##end standard gatk options + + ##start analysis specific options + #if $analysis_param_type.analysis_param_type_selector == "advanced": + -p ' + ## files + #if str($analysis_param_type.activeRegionIn) != 'None': + --activeRegionIn "$analysis_param_type.activeRegionIn" + #end if + #if str($analysis_param_type.alleles) != 'None': + --alleles "$analysis_param_type.alleles" + #end if + #if str($analysis_param_type.comp) != 'None': + --comp "$analysis_param_type.comp" + #end if + #if str($analysis_param_type.dbsnp) != 'None': + --dbsnp "$analysis_param_type.dbsnp" + #end if + ## text + #if len($analysis_param_type.annotation.__str__) > 0: + --annotation $analysis_param_type.annotation + #end if + #if len($analysis_param_type.excludeAnnotation.__str__) > 0: + --excludeAnnotation $analysis_param_type.excludeAnnotation + #end if + #if len($analysis_param_type.group.__str__) > 0: + --group $analysis_param_type.group + #end if + ## value setings + #if $analysis_param_type.contamination_fraction_to_filter.__str__.strip() != '': + --contamination_fraction_to_filter $analysis_param_type.contamination_fraction_to_filter + #end if + #if $analysis_param_type.downsampleRegion.__str__.strip() != '': + --downsampleRegion $analysis_param_type.downsampleRegion + #end if + #if $analysis_param_type.heterozygosity.__str__.strip() != '': + --heterozygosity $analysis_param_type.heterozygosity + #end if + #if $analysis_param_type.minPruning.__str__.strip() != '': + --minPruning $analysis_param_type.minPruning + #end if + #if 
$analysis_param_type.standard_min_confidence_threshold_for_calling.__str__.strip() != '': + --standard_min_confidence_threshold_for_calling $analysis_param_type.standard_min_confidence_threshold_for_calling + #end if + #if $analysis_param_type.standard_min_confidence_threshold_for_emitting.__str__.strip() != '': + --standard_min_confidence_threshold_for_emitting $analysis_param_type.standard_min_confidence_threshold_for_emitting + #end if + #if $analysis_param_type.gcpHMM.__str__.strip() != '': + --gcpHMM $analysis_param_type.gcpHMM + #end if + #if $analysis_param_type.max_alternate_alleles.__str__.strip() != '': + --max_alternate_alleles $analysis_param_type.max_alternate_alleles + #end if + ## mode selections + #if $analysis_param_type.genotyping_mode.__str__ != "None" and len($analysis_param_type.genotyping_mode.__str__) > 0: + --genotyping_mode $analysis_param_type.genotyping_mode + #end if + #if $analysis_param_type.output_mode.__str__ != "None" and len($analysis_param_type.output_mode.__str__) > 0: + --output_mode $analysis_param_type.output_mode + #end if + #if $analysis_param_type.pair_hmm_implementation.__str__ != "None" and len($analysis_param_type.pair_hmm_implementation.__str__) > 0: + --pair_hmm_implementation $analysis_param_type.pair_hmm_implementation + #end if + #if $analysis_param_type.p_nonref_model.__str__ != "None" and len($analysis_param_type.p_nonref_model.__str__) > 0: + --p_nonref_model $analysis_param_type.p_nonref_model + #end if + ## optional outputs + #if $analysis_param_type.activeRegionOut: + --activeRegionOut $active_region_out + #end if + #if $analysis_param_type.graphOutput: + --graphOutput $graph_out + #end if + ## flags + $analysis_param_type.useAllelesTrigger + $analysis_param_type.fullHaplotype + $analysis_param_type.genotypeFullActiveRegion + $analysis_param_type.debug + ' + #end if + + + + The input covariates table file which enables on-the-fly base quality score recalibration. 
+ Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. + Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + analysis_param_type['graphOutput'] == True + + + analysis_param_type['activeRegionOut'] == True + + + + + + + + + + + + + + + + +**What it does** + +HaplotypeCaller +Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. +Haplotypes are evaluated using an affine gap penalty Pair HMM. + +For more information on using read based compression in the GATK, see this `tool specific page <http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_haplotypecaller_HaplotypeCaller.html>`_. + +To learn about best practices for variant detection using GATK, see this `overview <http://www.broadinstitute.org/gatk/guide/topic?name=best-practices>`_. + +If you encounter errors, please view the `GATK FAQ <http://www.broadinstitute.org/gatk/guide/topic?name=faqs>`_. + +------ + +**Inputs** + +GenomeAnalysisTK: PrintReads accepts an aligned BAM files. 
+ + +**Outputs** + +The output is a VCF file with raw, unrecalibrated SNP and indel calls. + + +Go `here <http://www.broadinstitute.org/gatk/guide/topic?name=intro>`_ for details on GATK file formats. + +------- + +**Settings**:: + + default_read_group If a read has no read group then default to the provided String. + default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. + force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. + force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. + window_size_nqs The window size used by MinimumNQSCovariate for its calculation + homopolymer_nback The number of previous bases to look at in HomopolymerCovariate + exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 + solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) + solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) + recal_file Filename for the input covariates table recalibration .csv file + out The output BAM file + bam_compression Compression level to use for writing BAM files + disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier + preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases + smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1 + max_quality_score The integer value at which to cap the quality scores, default=50 + doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read + +------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +Please also cite `McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA (2010). The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 20:1297-303. Epub 2010 Jul 19. <http://www.ncbi.nlm.nih.gov/pubmed/20644199>`_ + +If you use this tool in Galaxy, please cite `Blankenberg D, Von Kuster G, Coraor N, Ananda G, Lazarus R, Mangan M, Nekrutenko A, Taylor J. Galaxy: a web-based genome analysis tool for experimentalists. Curr Protoc Mol Biol. 2010 Jan;Chapter 19:Unit 19.10.1-21.
<http://www.ncbi.nlm.nih.gov/pubmed/20069535>`_ + + + diff -r a14e79e7ac75 -r 5faf7ace8aee reduce_reads.xml --- a/reduce_reads.xml Tue Nov 06 12:07:36 2012 -0600 +++ b/reduce_reads.xml Thu Nov 08 10:38:18 2012 -0600 @@ -123,7 +123,12 @@ #end if - + + The input covariates table file which enables on-the-fly base quality score recalibration. + Enables on-the-fly recalibration of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. + Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). + +