view hicBuildMatrix.xml @ 25:37530b4c6fe4 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hicexplorer commit 79133bc3739fbcc6c2055d589679aae312161d03
author iuc
date Mon, 04 Nov 2024 22:34:35 +0000
parents f75705bee191
children
line wrap: on
line source

<tool id="hicexplorer_hicbuildmatrix" name="@BINARY@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>create a contact matrix</description>
    <!-- @BINARY@ is defined here and reused for the tool name and the executable in the command section.
         Other tokens used in this file (e.g. @TOOL_VERSION@, @THREADS@) are not defined locally and are
         presumably provided by macros.xml (TODO confirm). -->
    <macros>
        <token name="@BINARY@">hicBuildMatrix</token>
        <import>macros.xml</import>
    </macros>
    <!-- samtools is added on top of the shared requirements macro because the command section
         runs `samtools sort` on the optional BAM output. -->
    <expand macro="requirements">
        <requirement type="package" version="1.19">samtools</requirement>
    </expand>
    <command detect_errors="exit_code"><![CDATA[
        ## The QC report is first written to ./QCfolder in the working directory
        ## and afterwards moved into the extra-files directory of the qc output.
        mkdir ./QCfolder &&
        mkdir '$qc.files_path' &&
        @BINARY@
            --samFiles
            #for $repeat in $samFiles:
                '${repeat.samFile}'
            #end for

            --restrictionCutFile '$restrictionCutFile'

            #if $restrictionSequence:
                --restrictionSequence $restrictionSequence
            #end if
            #if $danglingSequence:
                --danglingSequence $danglingSequence
            #end if
            ## Optional integers are compared against the empty string instead of
            ## being tested for truthiness, so an explicit 0 is still passed on.
            #if str($minDistance) != '':
                --minDistance $minDistance
            #end if
            #if str($maxLibraryInsertSize) != '':
                --maxLibraryInsertSize $maxLibraryInsertSize
            #end if

            ## The bin size parameter is optional: collect only the entries that
            ## were filled in so that no empty '' argument is emitted.
            #set $bin_size_values = []
            #for $repeat in $binSizes
                #if str($repeat.binSize) != '':
                    #silent $bin_size_values.append(str($repeat.binSize))
                #end if
            #end for
            #if $bin_size_values:
                --binSize
                #for $bin_size in $bin_size_values
                    '$bin_size'
                #end for
            #end if

            #if $chromosomeSizes:
                --chromosomeSizes '$chromosomeSizes'
            #end if
            ## Fall back to the dbkey stored on the first input when none is given.
            #if $dbKey:
                --genomeAssembly '$dbKey'
            #else
                --genomeAssembly '$samFiles[0].samFile.metadata.dbkey'
            #end if

            #if $region:
                --region '$region'
            #end if

            --outFileName 'matrix.$outputFormat'

            #if $outBam:
                $outBam ./unsorted.bam
            #end if

            $keepSelfCircles
            $keepSelfLigation
            $skipDuplicationCheck

            #if str($minMappingQuality) != '':
                --minMappingQuality $minMappingQuality
            #end if

            --threads @THREADS@

            --QCfolder ./QCfolder
        &&
        mv ./QCfolder/* '$qc.files_path/'
        &&
        mv '$qc.files_path/hicQC.html' '$qc'
        && mv "$qc.files_path"/*.log raw_qc
        && mv 'matrix.$outputFormat' matrix
        #if $outBam:
            && samtools sort -@ @THREADS@ -T "\${TMPDIR:-.}" ./unsorted.bam -o sorted.bam
        #end if
]]>
    </command>
    <inputs>
        <!-- Exactly two alignment files are required: one per mate of the paired-end reads. -->
        <!-- can we use multiple=True here with min="2" and max="2" ? -->
        <repeat max="2" min="2" name="samFiles" title="Sam/Bam files to process (forward/reverse)" help="Please use the special BAM datatype: qname_input_sorted.bam and use for 'bowtie2' the '--reorder' option to create a BAM file.">
            <param name="samFile" type="data" format="sam,qname_input_sorted.bam" label="Sam/Bam file" />
        </repeat>

        <expand macro="restrictionCutFile" />
        <expand macro="restrictionSequence" />
        <expand macro="danglingSequence" />

        <param argument="--minDistance" type="integer" optional="true" value="" label="Minimum distance between restriction sites" help="Restriction sites that are closer than this distance are merged into one.
                This option only applies if --restrictionCutFile is given." />
        <param argument="--maxLibraryInsertSize" type="integer" optional="true" value="" label="Maximum library insert size defines different cut offs based on the maximum expected library size" help="*This is not the average fragment size* but the higher end of the fragment size distribution (obtained using for example Fragment Analyzer)
                        which usually is between 800 to 1500 bp. If this value is not known use the default of 1000. The insert value is used to decide if two mates
                        belong to the same fragment (by checking if they are within this max insert size) and to decide if a mate
                        is too far away from the nearest restriction site." />

        <!-- One entry per resolution; several entries yield a multi-resolution (mcool) file as described in the help text. -->
        <repeat name="binSizes" title="Bin size in bp" min="1" help="If used, the restriction cut places (if given) are used to only consider reads that are in the vicinity of the restriction sites.
                Otherwise all reads in the interval are considered. Use multiple ones to create a mcool file.">
            <param argument="--binSize" type="integer" optional="true" value="" label="Bin size in bp" />
        </repeat>

        <expand macro="region" />
        <param argument="--keepSelfCircles" type="boolean" truevalue="--keepSelfCircles" falsevalue="" label="Keep self circles" help="If set, outward facing reads without any restriction fragment (self circles) are kept. They will be counted and shown in the QC plots." />
        <param argument="--keepSelfLigation" type="boolean" truevalue="--keepSelfLigation" falsevalue="" label="Keep self ligation" help="If set, inward facing reads less than 1000 bp apart and having a restriction site in between are kept. Although these reads do not contribute to any distant contact, they are useful to account for bias in the data." />
        <expand macro="minMappingQuality" />
        <param argument="--skipDuplicationCheck" type="boolean" truevalue="--skipDuplicationCheck" falsevalue="" label="Skip duplication check" help="Identification of duplicated read pairs is memory consuming. Thus, in case of memory errors this check can be skipped." />
        <param argument="--chromosomeSizes" type="data" format="tabular" optional="true" label="Chromosome sizes for your genome" help="File with the chromosome sizes for your genome. A tab-delimited two column layout 'chr_name size' is expected.
                    Usually the sizes can be determined from the SAM/BAM input files, however,
                    for cHi-C or scHi-C it can be that at the start or end no data is present.
                    Please consider that this option causes that only reads are considered which are on the listed chromosomes.
                    Use this option to guarantee fixed sizes. An example file is available via UCSC:
                    http://hgdownload.soe.ucsc.edu/goldenPath/dm3/bigZips/dm3.chrom.sizes" />
        <!-- When left empty, the command falls back to the dbkey metadata of the first input file. -->
        <param name="dbKey" type="text" optional="true" label="Use this dbkey for your history genome"
        help="You can set the reference genome in your history as metadata. In case you have not you can specify it here." />

        <param argument="--outBam" type="boolean" truevalue="--outBam" falsevalue="" checked="false" label="Save valid Hi-C reads in BAM file" help="A bam
                    file containing all valid Hi-C reads can be created
                    using this option. This bam file could be useful to
                    inspect the distribution of valid Hi-C read pairs or
                    for other downstream analyses, but is not used by any
                    HiCExplorer tool. Computation will be significantly
                    longer if this option is set." />

        <!-- The selected value is used as the file extension of the matrix and drives the output datatype. -->
        <param name="outputFormat" type="select" label="Output file format">
            <option value="h5">HiCExplorer format</option>
            <option value="cool">cool</option>
        </param>
    </inputs>
    <outputs>
        <!-- BAM written by `samtools sort` in the command section; only present when outBam is enabled. -->
        <data name="outfileBam" format="bam" from_work_dir="sorted.bam" label="${tool.name} BAM file on ${on_string}">
            <filter>outBam</filter>
        </data>
        <!-- Contact matrix; the datatype is h5 unless the cool output format was selected. -->
        <data name="outFileName" format="h5" from_work_dir="matrix" label="${tool.name} MATRIX on ${on_string}">
            <change_format>
                <when format="cool" input="outputFormat" value="cool" />
            </change_format>
        </data>
        <!-- HTML QC report; the command section moves hicQC.html onto this dataset. -->
        <data name="qc" label="${tool.name} QC on ${on_string}" format="html" />
        <!-- Log file(s) from the QC folder, moved to raw_qc by the command section. -->
        <data name="raw_qc" format="txt" from_work_dir="raw_qc" label="${tool.name} raw QC on ${on_string}" />
    </outputs>
    <tests>
        <!-- Test 1: h5 matrix plus BAM output (four outputs: BAM, matrix, QC html, raw QC). -->
        <test expect_num_outputs="4">
            <repeat name="samFiles">
                <param name="samFile" dbkey="hg38" value="small_test_R1_unsorted.sam" />
            </repeat>
            <repeat name="samFiles">
                <param name="samFile" dbkey="hg38" value="small_test_R2_unsorted.sam" />
            </repeat>
            <repeat name="binSizes">
                <param name="binSize" value="5000" />
            </repeat>
            <param name="restrictionCutFile" value="DpnII_10k.bed" />
            <param name="restrictionSequence" value="GATC" />
            <param name="danglingSequence" value="GATC" />
            <param name="outputFormat" value="h5" />
            <param name="outBam" value="True" />
            <output name="outfileBam" ftype="bam" file="small_test_matrix_result_sorted.bam" compare="diff" lines_diff="2" />
            <output name="outFileName" ftype="h5">
                <assert_contents>
                    <has_h5_keys keys="intervals,matrix" />
                </assert_contents>
            </output>
            <output name="raw_qc" file="raw_qc_report" compare="diff" lines_diff="2" />
        </test>
        <!-- Test 2: same inputs, but the matrix is written in cool format. -->
        <test expect_num_outputs="4">
            <repeat name="samFiles">
                <param name="samFile" dbkey="hg38" value="small_test_R1_unsorted.sam" />
            </repeat>
            <repeat name="samFiles">
                <param name="samFile" dbkey="hg38" value="small_test_R2_unsorted.sam" />
            </repeat>
            <repeat name="binSizes">
                <param name="binSize" value="5000" />
            </repeat>
            <param name="restrictionCutFile" value="DpnII_10k.bed" />
            <param name="restrictionSequence" value="GATC" />
            <param name="danglingSequence" value="GATC" />
            <param name="outputFormat" value="cool" />
            <param name="outBam" value="True" />
            <output name="outfileBam" ftype="bam" file="small_test_matrix_result_sorted.bam" compare="diff" lines_diff="2" />
            <output name="outFileName" ftype="cool">
                <assert_contents>
                    <has_h5_keys keys="bins,chroms,indexes,pixels" />
                </assert_contents>
            </output>
            <output name="raw_qc" file="raw_qc_report" compare="diff" lines_diff="2" />
        </test>
    </tests>
    <help><![CDATA[

Creation of the contact matrix
===============================


**hicBuildMatrix** creates a contact matrix based on Hi-C read pairs. It requires two sam or bam files
corresponding to the first and second mates of the paired-end Hi-C reads mapped on the reference genome.
The sam and bam files should not be sorted by position. There are two main options to create the Hi-C contact matrix,
either by fixed bin size (e.g. 10000 bp) or by bins of variable length following the locations of restriction enzyme sites in the genome (restriction enzyme resolution).
**hicBuildMatrix** generates a quality control output that can be used to analyze the quality of the Hi-C reads to assess if the experiment and sequencing were successful.

_________________


Usage
-----


This tool must be used on paired sam / bam files produced with a program that supports local alignment (e.g. Bowtie2) where both PE reads are mapped using the --local option.

_________________


Output
------

**hicBuildMatrix** creates multiple outputs:

    - The contact matrix used by HiCExplorer for all downstream analyses.
    - A bam file with the accepted alignments, which can be useful to inspect the distribution of valid Hi-C read pairs, notably around restriction enzyme sites or for other downstream analyses. This file is not used by any HiCExplorer tools.
    - A quality control report to assess if the Hi-C protocol and library construction were successful.

Example plot
++++++++++++

.. image:: $PATH_TO_IMAGES/hicPlotMatrix.png
   :width: 50%

Contact matrix of *Drosophila melanogaster* embryos built with **hicBuildMatrix** and visualized using ``hicPlotMatrix``. Hi-C matrix bins were merged to a 25 kb bin size before plotting using ``hicMergeMatrixBins``.




Quality report
++++++++++++++

A quality report is produced alongside the contact matrix.

.. image:: $PATH_TO_IMAGES/hicQC.png
   :width: 40%

Several plots, which are described in detail below, are included in this report.

.. image:: $PATH_TO_IMAGES/hicQC_pairs_sequenced.png
   :width: 40%

On the plot above, we can see how many reads were sequenced per sample (pairs considered), how many reads were mappable, unique and of high quality and how many reads passed all quality controls and are thus useful for further analysis (pairs used). All quality controls used for read filtering are explained below.

.. image:: $PATH_TO_IMAGES/hicQC_unmappable_and_non_unique.png
   :width: 40%

The figure above contains the fraction of reads with respect to the total number of reads that did not map, that have a low quality score or that didn't map uniquely to the genome. In our example we can see that Sample 3 has the highest fraction of pairs used. We explain the differences between the three samples on the plot below.

.. image:: $PATH_TO_IMAGES/hicQC_pairs_discarded.png
   :width: 40%

This figure contains the fraction of read pairs (with respect to mappable and unique reads) that were discarded when building the Hi-C matrix. You can find the description of each category below:

    - **Dangling ends:** reads that start with the restriction site and constitute reads that were digested but not ligated. Sample 1 in our example has a high fraction of dangling ends (and thus a low proportion of pairs used). Reasons for this can be inefficient ligation or insufficient removal of dangling ends during sample preparation.

    - **Duplicated pairs:** reads that have the same sequence due to PCR amplification. For example, Sample 2 was amplified too much and thus has a very high fraction of duplicated pairs.

    - **Same fragment:** read mates facing inward, separated by up to 800bp that do not have a restriction enzyme site in between. These read pairs are not valid Hi-C pairs and are thus discarded from further analyses.

    - **Self circle:** read pairs within 25kb with 'outward' read orientation.

    - **Self ligation:** read pairs with a restriction site in between that are within 800bp.

.. image:: $PATH_TO_IMAGES/hicQC_distance.png
   :width: 40%

The figure above contains the fraction of read pairs (with respect to mappable reads) that compose inter chromosomal, short range (< 20kb) or long range contacts. Inter chromosomal reads of a wild-type sample are expected to be low. Trans-chromosomal contacts can be primarily considered as random ligation events. These would be expected to contribute to technical noise that may obscure some of the finer features in the Hi-C datasets (Nagano *et al.* 2015, Comparison of Hi-C results using in-solution versus in-nucleus ligation, doi: https://doi.org/10.1186/s13059-015-0753-7). As such, a high fraction of inter chromosomal reads is an indicator of low sample quality, but it can also be associated to cell cycle changes (Nagano *et al.* 2018, Cell-cycle dynamics of chromosomal organisation at single-cell resolution, doi: https://doi.org/10.1038/nature23001).

Short range and long range contacts proportions can be associated to how the fixation is performed during Hi-C sample preparation. These two proportions also directly impact the Hi-C corrected counts versus genomic distance plots generated by hicPlotDistVsCounts.

.. image:: $PATH_TO_IMAGES/hicQC_read_orientation.png
   :width: 40%

The last figure shows the fractions of inward, outward, left or right read pairs (with respect to mappable reads). Deviations from an equal distribution indicate problems during sample preparation.

_________________

| For more information about HiCExplorer please consider our documentation on readthedocs.io_.

.. _readthedocs.io: http://hicexplorer.readthedocs.io/en/latest/index.html
]]>    </help>
    <expand macro="citations" />
</tool>