Mercurial > repos > yating-l > ucsc_blat
diff blat.xml @ 35:87644259e668 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ucsc_blat/ commit f38778f3a25020809c3f6cf17aafb8dbfc54b2e8
author | iuc |
---|---|
date | Sat, 28 Sep 2024 16:29:10 +0000 |
parents | 53cf9d25ef39 |
children | e564213a92b8 |
line wrap: on
line diff
--- a/blat.xml Mon Nov 21 11:11:06 2022 +0000 +++ b/blat.xml Sat Sep 28 16:29:10 2024 +0000 @@ -1,8 +1,25 @@ <tool id="ucsc_blat" name="UCSC BLAT Alignment Tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> <description>BLAST-like sequence alignment tool</description> <macros> - <token name="@TOOL_VERSION@">377</token> + <token name="@TOOL_VERSION@">469</token> <token name="@VERSION_SUFFIX@">0</token> + + <xml name="mask_cond" tokens="maskarg,label,help"> + <conditional name="@MASKARG@_type"> + <param argument="-@MASKARG@" type="select" label="@LABEL@" help="@HELP@"> + <option value="" selected="true">No masking</option> + <option value="lower">lower - mask out lower-cased sequence</option> + <option value="upper">upper - mask out upper-cased sequence</option> + <option value="file.out">out - mask database according to RepeatMasker out</option> + </param> + <when value="" /> + <when value="lower" /> + <when value="upper" /> + <when value="file.out"> + <param name="@MASKARG@_file" type="data" format="txt" label="RepeatMasker file.out" /> + </when> + </conditional> + </xml> </macros> <xrefs> <xref type="bio.tools">blat</xref> @@ -13,7 +30,9 @@ <command detect_errors="exit_code"><![CDATA[ #if str($reference_source.reference_source_selector) == "history": ## blat depends on file extension - #if $reference_source.database.is_of_type("fasta"): + #if $reference_source.database.is_of_type("fasta.gz"): + #set $reference_fasta_filename = "localref.fa.gz" + #elif $reference_source.database.is_of_type("fasta"): #set $reference_fasta_filename = "localref.fa" #elif $reference_source.database.is_of_type("twobit"): #set $reference_fasta_filename = "localref.2bit" @@ -25,36 +44,77 @@ #set $reference_fasta_filename = str($reference_source.database.fields.path) #end if + ## blat depends on file extension + #if $query.is_of_type("fasta.gz"): + #set $query_filename = "query.fa.gz" + #elif $query.is_of_type("fasta"): + #set $query_filename = "query.fa" + #elif $query.is_of_type("twobit"): + #set $query_filename = "query.2bit" + #else + #set $query_filename = "query" + #end if + ln -s '$query' '$query_filename' && + blat -q=$query_type -t=$database_type - $oneOff - #if str($minScore) - -minScore=$minScore + ## Basic alignment parameters + #if str($basic_align.minScore) + -minScore=$basic_align.minScore + #end if + #if str($basic_align.minIdentity) + -minIdentity=$basic_align.minIdentity + #end if + $basic_align.trimT + $basic_align.noTrimA + $basic_align.trimHardA + $basic_align.fastMap + $basic_align.fine + #if str($basic_align.maxIntron) + -maxIntron=$basic_align.maxIntron + #end if + $basic_align.extendThroughN + ## Advanced alignment parameters + #if str($adv_align.tileSize) + -tileSize=$adv_align.tileSize + #end if + #if str($adv_align.stepSize) + -stepSize=$adv_align.stepSize #end if - -maxGap=$maxGap - #if str($repMatch) - -repMatch=$repMatch + $adv_align.oneOff + #if str($adv_align.minMatch) + -minMatch=$adv_align.minMatch + #end if + -maxGap=$adv_align.maxGap + #if str($adv_align.repMatch) + -repMatch=$adv_align.repMatch + #end if + ## Repeat masking parameters + #if $repeat.mask_type.mask == "file.out": + -mask='$repeat.mask_type.mask_file' + #elif $repeat.mask_type.mask: + -mask=$repeat.mask_type.mask #end if - #if $mask_type.mask == "file.out": - -mask='$mask_type.mask_file' - #else: - -mask=$mask_type.mask + #if $repeat.qMask_type.qMask == "file.out": + -qMask='$repeat.qMask_type.qMask_file' + #elif $repeat.qMask_type.qMask: + -qmask=$repeat.qMask_type.qMask #end if + #if $repeat.repeats_type.repeats == "file.out": + -repeats='$repeat.repeats_type.repeats_file' + #elif $repeat.repeats_type.repeats: + -repeats=$repeat.repeats_type.repeats + #end if + #if str($repeat.minRepDivergence) + -minRepDivergence=$repeat.minRepDivergence + #end if + #if str($dots) -dots=$dots #end if - $trimT - $noTrimA - $trimHardA - $fastMap - $fine - #if str($maxIntron) - -maxIntron=$maxIntron - #end if - $extendThroughN '$reference_fasta_filename' - '$query' + '$query_filename' -out=$out '$output' ]]></command> @@ -67,57 +127,61 @@ <when value="cached"> <param name="database" type="select" label="Select database"> <options from_data_table="all_fasta"> + <!-- <column name="name" index="0"/> + <column name="value" index="2"/> --> <filter type="sort_by" column="2" /> </options> <validator type="no_options" message="A built-in database is not available" /> </param> </when> <when value="history"> - <param name="database" type="data" format="fasta, twobit" label="Using database file, either a .fa, .nib or .2bit file" /> + <param name="database" type="data" format="fasta,fasta.gz,twobit" label="Using database file, either a fasta, fasta.gz or twobit dataset" /> </when> </conditional> - <param name="query" type="data" format="fasta, twobit" label="Query data, either a .fa, .nib or .2bit file"/> + <param name="query" type="data" format="fasta,fasta.gz,twobit" label="Query data, either a fasta, fasta.gz or twobit dataset"/> <param argument="-t" name="database_type" type="select" format="txt" multiple="false" label="database type" help="Choose your database type, the default is dnax"> - <option value="dna">dna - DNA sequence</option> + <option value="dna" selected="true">dna - DNA sequence</option> <option value="prot">prot - protein sequence</option> - <option value="dnax" selected="true">dnax - DNA sequence translated in six frames to protein</option> + <option value="dnax">dnax - DNA sequence translated in six frames to protein</option> </param> <param argument="-q" name="query_type" type="select" format="txt" multiple="false" label="query type" help="Choose your query type, the default is rnax"> - <option value="dna">dna - DNA sequence </option> + <option value="dna" selected="true">dna - DNA sequence </option> <option value="rna">rna - RNA sequence</option> <option value="prot">prot - protein sequence</option> <option value="dnax">dnax - DNA sequence translated in six frames to protein</option> - <option value="rnax" selected="true">rnax - DNA sequence translated in three frames to protein</option> + <option value="rnax">rnax - DNA sequence translated in three frames to protein</option> </param> - <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" /> - <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" /> - <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" /> - <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" /> - <conditional name="mask_type"> - <param argument="-mask" type="select" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is lower"> - <option value="lower" selected="true">lower - mask out lower-cased sequence</option> - <option value="upper">upper - mask out upper-cased sequence</option> - <option value="out">out - mask according to database.out RepeatMasker .out file</option> - <option value="file.out">file.out - mask database according to RepeatMasker file.out</option> - </param> - <when value="lower" /> - <when value="upper" /> - <when value="out" /> - <when value="file.out"> - <param name="mask_file" type="data" format="txt" label="RepeatMasker file.out" /> - </when> - </conditional> + <section name="basic_align" title="Alignment parameters" expanded="true"> + <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" /> + <param argument="-minIdentity" type="integer" value="" optional="true" min="0" max="100" label="Minimum sequence identity (in percent)" help="Default is 90 for nucleotide searches, 25 for protein or translated protein searches" /> + <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" /> + <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" /> + <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" /> + <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" /> + <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" /> + <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" /> + <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" /> + </section> + <section name="adv_align" title="Advanced alignment parameters" expanded="false"> + <param argument="-tileSize" type="integer" value="" optional="true" min="1" label="Tile size" help="Sets the size of match that triggers an alignment. Usually between 8 and 12. Default is 11 for DNA and 5 for protein" /> + <param argument="-stepSize" type="integer" value="" optional="true" min="1" label="Spacing between tiles" help="Default is tileSize" /> + <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" /> + <param argument="-minMatch" type="integer" value="" optional="true" min="1" label="Minimum number of tile matches" help="Usually set from 2 to 4. Default is 2 for nucleotide, 1 for protein." /> + <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" /> + <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" /> + </section> + <section name="repeat" title="Repeat masking parameters" expanded="true"> + <expand macro="mask_cond" maskarg="mask" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is no masking"/> + <expand macro="mask_cond" maskarg="qMask" label="Mask out repeats in query sequence" help="Analoguous to -mask, but for the query sequence"/> + <expand macro="mask_cond" maskarg="repeats" label="Report matches in repeats separately" help="Repeat bases will not be masked in any way, but matches in repeat areas will be reported separately from matches in other areas in the output"/> + <param argument="-minRepDivergence" type="integer" value="" min="0" max="100" optional="true" label="Minimum divergence of repeats (percent)" help="to allow them to be unmasked. Default is 15. Only relevant for masking using RepeatMasker .out files" /> + </section> <param argument="-dots" type="integer" value="" optional="true" label="Output a dot every N sequences in log" help="Dots show program's progress" /> - <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" /> - <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" /> - <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" /> - <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" /> - <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" /> - <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" /> - <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" /> <param name="out" type="select" label="Select output file format (-out)"> <option value="psl">Tab-separated format, no sequence (psl)</option> <option value="psl -noHead">Tab-separated format, no sequence, no header (psl -noHead)</option> + <option value="pslx">Tab-separated format (pslx)</option> + <option value="pslx -noHead">Tab-separated format, no header (pslx -noHead)</option> <option value="axt">Blastz-associated axt format (axt)</option> <option value="maf">Multiz-associated maf format (maf)</option> <option value="sim4">Similar to sim4 format (sim4)</option> @@ -129,44 +193,96 @@ </inputs> <outputs> <data name="output" format="tabular" label="${tool.name} on ${on_string}"> - <change_format> + <change_format><!-- add test --> <when input="out" value="axt" format="axt" /> <when input="out" value="maf" format="maf" /> <when input="out" value="sim4" format="txt" /> - <when input="out" value="wublast" format="tabular" /> - <when input="out" value="blast" format="tabular" /> </change_format> </data> </outputs> <tests> <!-- test on query of GenBank RefSeq records for Gallus gallus and database of Amazona vittata --> <test> - <param name="reference_source_selector" value="history" /> - <param name="database" value="amaVit1_Gallus/amaVit1.fa" /> - <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" /> + <conditional name="reference_source"> + <param name="reference_source_selector" value="history" /> + <param name="database" value="amaVit1_Gallus/amaVit1.fa" ftype="fasta" /> + </conditional> + <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" ftype="fasta" /> + <param name="database_type" value="dnax" /> + <param name="query_type" value="rnax" /> + <conditional name="mask_type"> + <param name="mask" value="lower" /> + </conditional> + <param name="out" value="maf" /> + <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.maf" ftype="maf"/> + <assert_command> + <has_text text="-tileSize=" negate="true"/> + <has_text text="-stepSize=" negate="true"/> + <has_text text="-mask=lower"/> + </assert_command> + </test> + <!-- test on query of partial mRNA of Drosophila melanogaster and the + database of Drosophila biamipes dot chromosome + - also test cached reference --> + <test> + <conditional name="reference_source"> + <param name="reference_source_selector" value="cached"/> + <param name="database" value="dbdia display name"/> + </conditional> + <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta" /> <param name="database_type" value="dnax" /> <param name="query_type" value="rnax" /> - <param name="mask" value="lower" /> + <section name="basic_align"> + <param name="maxIntron" value="" /> + </section> + <section name="adv_align"> + <param name="tileSize" value="5"/><!--explicitly set default .. to check if it is on the CL--> + <param name="stepSize" value="5"/><!--explicitly set default .. to check if it is on the CL--> + </section> <param name="out" value="psl -noHead" /> - <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.psl" sort="true"/> + <output name="output" value="dbia3/dbia3.sorted.psl" ftype="tabular" sort="true"> + <assert_contents> + <has_n_columns n="21"/> + </assert_contents> + </output> + <assert_command> + <has_text text="-tileSize=5"/> + <has_text text="-mask" negate="true"/> + </assert_command> </test> - <!-- test on query of partial mRNA of Drosophila melanogaster and the database of Drosophila biamipes dot chromosome --> <test> - <param name="reference_source_selector" value="history" /> - <param name="database" value="dbia3/dbia3.fa" /> - <param name="query" value="dbia3/dmel-transcript.fa" /> + <conditional name="reference_source"> + <param name="reference_source_selector" value="cached"/> + <param name="database" value="dbdia display name"/> + </conditional> + <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta" /> <param name="database_type" value="dnax" /> <param name="query_type" value="rnax" /> - <param name="mask" value="lower" /> - <param name="out" value="psl -noHead" /> - <param name="maxIntron" value="" /> - <output name="output" value="dbia3/dbia3.sorted.psl" sort="true"/> + <section name="basic_align"> + <param name="maxIntron" value="" /> + </section> + <section name="adv_align"> + <param name="tileSize" value="5"/><!--explicitly set default .. to check if it is on the CL--> + <param name="stepSize" value="5"/><!--explicitly set default .. to check if it is on the CL--> + </section> + <param name="out" value="pslx -noHead" /> + <output name="output" value="dbia3/dbia3.sorted.psl" ftype="tabular" sort="true" compare="contains"> + <assert_contents> + <has_n_columns n="23"/> + </assert_contents> + </output> + <assert_command> + <has_text text="-tileSize=5"/> + <has_text text="-mask" negate="true"/> + </assert_command> </test> <!-- test on the database masked by repeat masker --> <test> - <param name="reference_source_selector" value="history" /> - <param name="database" value="dbia3/dbia3_masked.2bit" /> - <param name="query" value="dbia3/dmel-transcript.fa" /> + <conditional name="reference_source"> + <param name="reference_source_selector" value="history" /> + <param name="database" value="dbia3/dbia3_masked.2bit" ftype="twobit" /> + </conditional> + <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta"/> <param name="database_type" value="dnax" /> <param name="query_type" value="rnax" /> <param name="oneOff" value="false" /> @@ -177,35 +293,108 @@ <param name="fine" value="false" /> <param name="maxIntron" value="750000" /> <param name="extendThroughN" value="false" /> - <param name="mask" value="file.out" /> - <param name="mask_file" value="dbia3/dbia3_RM.out" /> - <param name="out" value="psl -noHead" /> + <conditional name="mask_type"> + <param name="mask" value="file.out" /> + <param name="mask_file" value="dbia3/dbia3_RM.out" /> + </conditional> + <param name="out" value="psl" ftype="tabular" /> <output name="output" value="dbia3/dbia3_masked.sorted.psl"/> + <assert_command> + <has_text text="-tileSize=" negate="true"/> + <has_text text="-stepSize=" negate="true"/> + <has_text text="-mask='/"/> + </assert_command> </test> - </tests> + <!-- tiny test data from https://davetang.org/muse/2012/05/15/using-blat/ --> + <test> + <conditional name="reference_source"> + <param name="reference_source_selector" value="history" /> + <param name="database" value="mini-db.fa.gz" ftype="fasta.gz" /> + </conditional> + <param name="query" value="mini-query.fa.gz" ftype="fasta.gz"/> + <param name="minScore" value="0" /> + <section name="adv_align"> + <param name="stepSize" value="1"/> + </section> + <param name="out" value="psl" ftype="tabular" /> + <output name="output"> + <assert_contents> + <has_n_lines n="7"/> + </assert_contents> + </output> + <assert_command> + <has_text text="-minScore=0"/> + <has_text text="-stepSize=1"/> + </assert_command> + </test> </tests> <help> <![CDATA[ BLAT ==== -BLAT is a bioinformatics software a tool which performs rapid mRNA/DNA and cross-species protein alignments. +BLAT is a bioinformatics software a tool which performs rapid sequence alignments (mRNA/DNA and cross-species protein). +It is designed to find sequences of high similarity and have a certain minimum length. With the default setting this is + +- >95% similarity and a minimum length of 25 bases for nucleotide sequences +- >80% similarity and a minimum lenth of 20 amino acids for proteins -blat (version: v36)- Standalone blat sequence search command line tool. -------------------------------------------------------------------------- +More divergent or shorter sequence alignments may be missed. +The algorithm works in two phases: + +1. Search phase: find regions of probable homology using an index of the reference sequence +2. Alignment phase: Detailed Alignment of the sequences in these regions + +Search phase +++++++++++++ -usage: -++++++ +Builds an index of the reference containing the nonoverlapping K-mers and their +positions (by default, can be changed using `-tileSize` and `-stepSize`). Hits, +i.e. exactly matching k-mers in query and reference, are then found by looking +up each overlapping K-mer of the query sequence. By enabling `-oneOff` the +algorithm allows for a single substitition. Note that this increases the run +time of this phase significantly. - $ blat database query [-ooc=11.ooc] output.psl +The hits are then split into buckets of 64k (based on the database position) +and sorted on the diagonal (database minus query positions). Hits within the +gap limit form so called proto-clumps. Those are then sorted by database position +and put into clumps if they are within the window limit (wrt database coordinate). + +Clumps with less than the minimum number of hits are discarded (-minMatch) and +those within 300 bases or 100 amino acids in the database are merged together. +The resulting clumps define regions of the database which are homologous to the +query sequence which are then aligned. -where: - database and query are each either a .fa, .nib or .2bit file, - or a list of these files with one file name per line. - -ooc=11.ooc tells the program to load over-occurring 11-mers from - an external file. This will increase the speed - by a factor of 40 in many cases, but is not required. - output.psl is the name of the output file. +Alignment phase ++++++++++++++++ + +The alignment is performed differently for nucleotide and +aminoacid sequences. + +**Alignment for nucleotide sequences**: A hit list (exactly matching k-mers) for +the query and the homologous region of the database is generated. If necessary +hits are mode unique by extending them until they are unique or have a maximum +size. The hits are then extended maximally allowing no mismatches, and overlapping +hits are merged. +Subsequent (wrt query and reference) extended hits are then linked in an +alignment. If there are gaps in query and reference, the algorithm recurses +using a smaller value for k until no additional hits are found or gaps are +smaller than 6 bases. -documentation: +**Protein Alignments**: The hits from the search stage are extended into maximally +scoring ungapped alignments (HSPs) (match cost 2 and mismatch cost 1). The HSPs +are organized in a directed graph where an edge connect HSPs A and B if A starts +before B wrt query and database coordinates. The weight of the edge is then +defined as the score of B minus a gap penalty based on the distance between A +and B (overlapping HSPs are treated differently, see Kent 2002). The maximal +scoring alignment is then determined as the maximum weight path through the +graph and the HSPs of this path are removed. This is repeated until no HSPs are +left. + +**Stitching and Filling In**: +In order to find also alignments of genes scattered across multiple homologous +regions that have been determined in the search phase a variation of the +alignment algorithm for proteins is employed. For details see Kent 2002. + +Documentation: ++++++++++++++ See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html)