changeset 5:70d7377d5e24 draft

planemo upload commit 7856c637db5bd4ea0b8b4db63e242618421a9cc6-dirty
author yating-l
date Wed, 01 Feb 2017 17:16:02 -0500
parents fbeff1df110b
children 6f06b6d68c0b
files README.rst blat.xml test-data/amaVit1_Gallus_gallus.psl test-data/amaVit1_Gallus_gallus_filtered.psl tool_dependencies.xml
diffstat 5 files changed, 146 insertions(+), 96 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Wed Feb 01 17:16:02 2017 -0500
@@ -0,0 +1,14 @@
+Galaxy wrapper for UCSC BLAT alignment tool
+===========================================
+BLAT (http://genome.ucsc.edu/goldenPath/help/blatSpec.html) is a bioinformatics tool that performs rapid mRNA/DNA and cross-species protein alignments.
+
+Features
+--------
+1. Perform rapid mRNA/DNA and cross-species protein alignments using blat 
+2. Sort the output file
+3. Filter cDNA alignments in psl format using pslCDnaFilter
+
+Licence
+-------
+Please note that commercial download and installation of the Blat and In-Silico PCR software may be licensed through Kent Informatics (http://www.kentinformatics.com).
+
--- a/blat.xml	Tue Jan 31 18:31:42 2017 -0500
+++ b/blat.xml	Wed Feb 01 17:16:02 2017 -0500
@@ -1,5 +1,6 @@
+<?xml version="1.0"?>
 <tool id="ucsc_blat" name="UCSC BLAT Alignment Tool" version="1.0">
-    <description>Rapidly align sequences to the genome</description>
+    <description>Standalone blat sequence search command line tool</description>
     <requirements>
       <requirement type="package" version="1.0">ucsc_tools_340_for_BLAT</requirement>
     </requirements>
@@ -13,8 +14,46 @@
         -mask=$mask
         '${database}'
         '${query}'
-        '${output}'
-
+        output
+    ## Sort the raw BLAT output on column 10, then numerically on column 12; this sorted file is the first tool output.
+    && sort -k 10,10 -k 12,12n output > '${output_sorted}'
+    ## pslReps and faPolyASizes only produce inputs for pslCDnaFilter (output.reps.psl, query.polyA), so run them only when filtering is enabled.
+    #if $filter_param.filter == "yes"
+      && pslReps -minAli=0.25 '${output_sorted}' output.reps.psl output.reps.psr
+      && faPolyASizes '${query}' query.polyA
+      && pslCDnaFilter
+            ## Thresholds are chosen from the cDNA origin (assembly_type) and the assembly quality (assembly_category).
+            #if $filter_param.assembly_type == "native"
+                  -localNearBest=0.001
+                  #if $filter_param.assembly_category == "finished"
+                        -minId=0.95 -minCover=0.25
+                  #else if $filter_param.assembly_category == "well-ordered"
+                        -minId=0.95 -minCover=0.15
+                  #else
+                        ## Remaining category: low-coverage.
+                        -minId=0.94 -minAlnSize=80
+                  #end if
+            #else
+                  ## Any assembly_type other than native, i.e. the xeno (across species) option.
+                  -localNearBest=0.010
+                  #if $filter_param.assembly_category == "finished"
+                        -minId=0.35 -minCover=0.25
+                  #else if $filter_param.assembly_category == "well-ordered"
+                        -minId=0.35 -minCover=0.15
+                  #else
+                        -minId=0.33 -minAlnSize=80
+                  #end if
+            #end if
+            ## Options passed for every assembly type and category.
+            -minQSize=20
+            -ignoreIntrons
+            -repsAsMatch
+            -ignoreNs
+            -bestOverlap
+            -polyASizes=query.polyA
+            output.reps.psl
+            '${output_filtered}'
+    #end if
 ]]></command>
       <inputs>
             <param type="data" name="database" format="fasta" />
@@ -40,93 +79,27 @@
                   <option value="out">out - mask according to database.out RepeatMasker .out file</option>
                   <option value="file.out">file.out - mask database according to RepeatMasker file.out</option>
             </param>
-            <!--<conditional name="database" format="fasta">
-                  <param type="select" name="database_type" format="text" multiple="false" label="database type" help="Choose your database type, the default is dna">
-                        <option value="dna">DNA sequence</option>
-                        <option value="prot">protein sequence</option>
-                        <option value="dnax">DNA sequence translated in six frames to protein</option>
+            <conditional name="filter_param">
+                  <param name="filter" type="select" label="Filter BLAT results with pslCDnaFilter">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
                   </param>
-        <when value="dna">
-             <param type="integer" name="tileSize" value="11" min="1" max="12" label="tileSize" help="Sets the size of match that triggers an alignment. Usually between 8 and 12">tileSize</param>
-             <param name="minMatch" type="integer" value="2" label="Sets the number of tile matches.  Usually set from 2 to 4.
-                  Default is 2 for nucleotide, 1 for protein.">-minMatch</param>
-            <param name="minIdentity" type="integer" value="90" label="Sets minimum sequence identity (in percent).  Default is
-                  90 for nucleotide searches, 25 for protein or translated
-                  protein searches.">-minIdentity</param>
-            
-        </when>
-        <when value="prot">  
-            <param type="integer" name="tileSize" value="5" min="1" max="12" label="tileSize" help="Sets the size of match that triggers an alignment. Usually between 8 and 12">tileSize</param>
-            <param name="minMatch" type="integer" value="1" label="Sets the number of tile matches.  Usually set from 2 to 4.
-                  Default is 2 for nucleotide, 1 for protein.">-minMatch</param>
-            <param name="minIdentity" type="integer" value="25" label="Sets minimum sequence identity (in percent).  Default is
-                  90 for nucleotide searches, 25 for protein or translated
-                  protein searches.">-minIdentity</param>
-        </when> 
-    </conditional>
-    <param type="select" name="query_type" format="text" multiple="false" label="query type" help="Choose your query type, the default is dna">
-        <option value="dna">DNA sequence</option>
-        <option value="rna">RNA sequence</option>
-        <option value="prot">protein sequence</option>
-        <option value="dnax">DNA sequence translated in six frames to protein</option>
-        <option value="rnax">DNA sequence translated in three frames to protein</option>
-    </param>
-    <conditional name="settings">
-        <param name="advanced" type="select" multiple="false" label="Specify advanced parameters">
-            <option value="simple" selected="true">No, use program defaults. </option>
-            <option value="advanced">Yes, see full parameter list.</option>
-        </param>
-        <when value="advanced">
-            <param name="mask" type="select" label="Mask out repeats" help="Alignments won't be started in masked region
-                  but may extend through it in nucleotide searches.  Masked areas
-                  are ignored entirely in protein or translated searches.">
-                  <option value="lower">lower - mask out lower-cased sequence</option>
-                  <option value="upper">upper - mask out upper-cased sequence</option>
-                  <option value="out">out - mask according to database.out RepeatMasker .out file</option>
-                  <option value="file.out">file.out - mask database according to RepeatMasker file.out</option>
-            </param>
-            <param name="qmask" type="select" label="Mask out repeats in query sequence" help="Similar to -mask above, but
-                  for query rather than target sequence.">
-                  <option value="lower">lower - mask out lower-cased sequence</option>
-                  <option value="upper">upper - mask out upper-cased sequence</option>
-                  <option value="out">out - mask according to database.out RepeatMasker .out file</option>
-                  <option value="file.out">file.out - mask database according to RepeatMasker file.out</option>
-            </param>
-            <param name="oneOff" type="integer" value="0" label="If set to 1, this allows one mismatch in tile and still triggers an alignment. Default is 0.">-oneOff</param>
-            <param name="minScore" type="integer" value="30" label="Sets minimum score.  This is the matches minus the
-                  mismatches minus some sort of gap penalty.  Default is 30.">-minScore</param>
-            <param name="maxGap" type="integer" value="2" label="Sets the size of maximum gap between tiles in a clump.  Usually
-                  set from 0 to 3.  Default is 2. Only relevent for minMatch > 1.">-maxGap</param>
-            <param name="minRepDivergence" type="integer" value="15" min="0" max="100" label="Minimum percent divergence of repeats to allow
-                  them to be unmasked.  Default is 15.  Only relevant for
-                  masking using RepeatMasker .out files.">-minRepDivergence</param>
-            <param name="noHead" type="boolean" value="false" label="Suppresses .psl header (so it's just a tab-separated file)." />
-            <param name="dots" type="integer" value="0" label="Output dot every N sequences to show program's progress." />
-            <param name="trimT" type="boolean" value="false" label="Trim leading poly-T." />
-            <param name="trimHardA" type="boolean" value="false" label="Remove poly-A tail from qSize as well as alignments in
-                  psl output." />
-            <param name="fastMap" type="boolean" value="false" label="Run for fast DNA/DNA remapping - not allowing introns,
-                  requiring high %ID. Query sizes must not exceed 5000." />
-            <param name="fine" type="boolean" value="false" label="For high-quality mRNAs, look harder for small initial and
-                  terminal exons.  Not recommended for ESTs." />
-            <param name="out" type="select" label="Output file format">
-                <option value="psl" selected="true">psl - Default.  Tab-separated format, no sequence</option>
-                <option value="pslx">pslx - Tab-separated format with sequence</option>
-                <option value="axt">axt - blastz-associated axt format</option>
-                <option value="maf">maf - multiz-associated maf format</option>
-                <option value="sim4">sim4 - similar to sim4 format</option>
-                <option value="wublast">wublast - similar to wublast format</option>
-                <option value="blast">blast - similar to NCBI blast format</option>
-                <option value="blast8">blast8- NCBI blast tabular format</option>
-                <option value="blast9">blast9 - NCBI blast tabular format with comments</option>
-           </param>
-           <param name="maxIntro" type="integer" value="750000" label="Sets maximum intron size. Default is 750000." />
-           <param name="extendThroughN" type="boolean" value="false" label="Allows extension of alignment through large blocks of Ns." />
-      </when>
-    </conditional>-->
+                  <when value="yes"><!-- these two selects choose the pslCDnaFilter thresholds used in the command template -->
+                        <param name="assembly_type" type="select" label="Choose your type of cDNA sequence">
+                              <option value="native">Same species</option>
+                              <option value="xeno">Across species</option>
+                        </param>
+                        <param name="assembly_category" type="select" label="Choose your genome assembly category">
+                              <option value="finished">finished assemblies (high quality)</option>
+                              <option value="well-ordered">well-ordered assemblies (well ordered, whole genome shotgun)</option>
+                              <option value="low-coverage">low-coverage assemblies (low coverage (&lt; 4x), lots of contigs, N50 scaffold size &lt; 1mb)</option>
+                        </param>
+                  </when>
+            </conditional>
       </inputs>
       <outputs>
-            <data format="psl" name="output"></data>
+            <data format="psl" name="output_sorted"></data>
+            <!-- written only when pslCDnaFilter runs, so hide it otherwise --><data format="psl" name="output_filtered"><filter>filter_param['filter'] == 'yes'</filter></data>
       </outputs>
   <tests>
       <test>
@@ -136,9 +109,64 @@
             <param name="query_type" value="rnax" />     
             <param name="noHead" value="true" />
             <param name="mask" value="lower" />
-            <output name="output" value="amaVit1_Gallus_gallus.psl" />
+            <param name="filter" value="yes" />
+            <param name="assembly_type" value="xeno" />
+            <param name="assembly_category" value="well-ordered" />
+            <output name="output_sorted" value="amaVit1_Gallus_gallus.psl" />
+            <output name="output_filtered" value="amaVit1_Gallus_gallus_filtered.psl" />
       </test>
-  </tests>           
+  </tests> 
+  <help>
+        <![CDATA[
+BLAT
+====
+BLAT is a bioinformatics tool that performs rapid mRNA/DNA and cross-species protein alignments.
+
+blat (version: v340) - Standalone blat sequence search command line tool.
+--------------------------------------------------------------------------------
+usage:
+++++++
+   blat database query [-ooc=11.ooc] output.psl
+where:
+   database and query are each either a .fa, .nib or .2bit file,
+      or a list of these files with one file name per line.
+   -ooc=11.ooc tells the program to load over-occurring 11-mers from
+      an external file.  This will increase the speed
+      by a factor of 40 in many cases, but is not required.
+   output.psl is the name of the output file.   
+documentation:
+++++++++++++++
+See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html)  
+Source code:
+++++++++++++
+http://hgdownload.cse.ucsc.edu/admin/exe/
+pslCDnaFilter (version: v340)
+--------------------------------
+Filter cDNA alignments in psl format. Filtering criteria are comparative, selecting near best in genome alignments for each given cDNA and non-comparative, based only on the quality of an individual alignment.
+usage:
+++++++
+      pslCDnaFilter [options] inPsl outPsl
+Source code:
+++++++++++++
+http://hgdownload.cse.ucsc.edu/admin/exe/
+
+Licence
+=======
+Please note that commercial download and installation of the Blat and In-Silico PCR software may be licensed through Kent Informatics (http://www.kentinformatics.com).
+]]>
+</help>  
+<!-- BibTeX entry for the BLAT paper (Kent, Genome research, 2002) --><citations>
+      <citation type="bibtex">@article{kent2002blat,
+  title={BLAT—the BLAST-like alignment tool},
+  author={Kent, W James},
+  journal={Genome research},
+  volume={12},
+  number={4},
+  pages={656--664},
+  year={2002},
+  publisher={Cold Spring Harbor Lab}
+      }</citation>
+</citations> 
 </tool>
              
             
--- a/test-data/amaVit1_Gallus_gallus.psl	Tue Jan 31 18:31:42 2017 -0500
+++ b/test-data/amaVit1_Gallus_gallus.psl	Wed Feb 01 17:16:02 2017 -0500
@@ -1,12 +1,12 @@
 2134	377	0	62	6	361	28	33978	++	NM_001006176	3248	296	3230	LMAW01000001.1	10443977	10027590	10064141	29	110,48,10,76,75,152,66,51,87,83,55,56,148,96,63,24,177,107,83,145,109,83,145,90,117,129,68,93,27,	296,406,454,502,578,653,844,910,961,1048,1131,1186,1242,1390,1486,1609,1741,1918,2025,2108,2253,2362,2445,2590,2680,2797,2926,3034,3203,	10027590,10031154,10031353,10031517,10033261,10033696,10034454,10035646,10043484,10043794,10059668,10059824,10059973,10060798,10060979,10061265,10061487,10061740,10061947,10062143,10062365,10062584,10062743,10063001,10063188,10063383,10063616,10063806,10064114,
+1247	370	0	44	6	280	17	96639	+-	NM_001012804	4592	121	2062	LMAW01000001.1	10443977	517465	615765	18	119,32,74,112,2,87,150,114,39,171,3,94,58,142,81,107,137,139,	121,240,272,355,467,469,598,757,1066,1129,1300,1303,1397,1455,1597,1679,1786,1923,	9828212,9828431,9869264,9881475,9881600,9882641,9893172,9893708,9899022,9904744,9904997,9908755,9909749,9911745,9915154,9915481,9920844,9926373,
 420	114	0	0	4	99	3	7093	++	NM_001031013	971	69	702	LMAW01000002.1	5883884	3610886	3618513	5	33,66,21,276,138,	69,138,232,285,564,	3610886,3610955,3617941,3618099,3618375,
-189	73	0	0	4	773	4	13674	++	NM_204766	5992	553	1588	LMAW01000003.1	4188689	2559457	2573393	5	84,66,33,13,66,	553,739,844,1457,1522,	2559457,2563244,2563349,2570767,2573327,
-143	58	0	19	3	659	4	9089	++	NM_204766	5992	574	1453	LMAW01000003.1	4188689	259524	268833	5	27,80,11,66,36,	574,700,780,847,1417,	259524,261255,261529,262379,268797,
-3724	1107	0	326	14	609	44	22663	++	NM_204766	5992	49	5815	LMAW01000002.1	5883884	103604	131424	45	51,78,132,3,152,2,29,75,90,63,1,95,81,3,136,119,150,2,169,180,123,48,90,117,123,134,241,261,177,146,90,72,267,129,81,2,196,348,116,193,106,123,177,99,87,	49,154,256,388,391,543,545,574,691,781,844,845,967,1048,1051,1187,1306,1456,1458,1627,1807,1960,2008,2098,2215,2338,2472,2713,2974,3151,3298,3388,3493,3778,3943,4024,4026,4222,4696,4812,5051,5212,5335,5608,5728,	103604,104407,107764,107909,108002,108233,108257,109261,112047,112967,113036,113148,114257,114391,114866,116183,116438,116593,116908,117767,118030,118682,120119,120672,121329,122185,122724,123518,123945,124449,124904,125793,125898,126790,127405,127543,127717,128055,129238,129442,129761,130020,130254,131103,131337,
-537	234	0	42	7	1113	7	34987	++	NM_204766	5992	262	2188	LMAW01000001.1	10443977	9729332	9765132	8	108,153,66,93,39,171,99,84,	262,388,574,688,1414,1459,1981,2104,	9729332,9734141,9751019,9755220,9762042,9762555,9764665,9765048,
-1247	370	0	44	6	280	17	96639	+-	NM_001012804	4592	121	2062	LMAW01000001.1	10443977	517465	615765	18	119,32,74,112,2,87,150,114,39,171,3,94,58,142,81,107,137,139,	121,240,272,355,467,469,598,757,1066,1129,1300,1303,1397,1455,1597,1679,1786,1923,	9828212,9828431,9869264,9881475,9881600,9882641,9893172,9893708,9899022,9904744,9904997,9908755,9909749,9911745,9915154,9915481,9920844,9926373,
 2710	404	0	198	24	2053	34	11593	+-	NM_001292086	6644	10	5375	LMAW01000003.1	4188689	2820233	2835138	37	64,727,107,5,90,146,87,93,135,137,171,93,72,90,69,90,23,34,9,72,125,70,106,21,23,161,26,29,93,15,45,18,11,12,54,162,27,	10,74,801,908,913,1003,1149,1236,1329,1464,1617,1792,1885,2049,2163,2350,2918,3036,3070,3366,3445,3576,3647,3794,3985,4008,4236,4263,4476,4603,4688,4742,4882,4941,5031,5113,5348,	1353551,1355131,1356116,1356798,1356971,1357445,1359020,1359444,1359816,1360523,1360971,1361443,1365082,1365245,1365351,1365525,1366096,1366220,1366255,1366474,1366549,1366674,1366749,1366889,1367081,1367105,1367332,1367358,1367558,1367688,1367786,1367837,1367979,1368029,1368113,1368194,1368429,
 200	50	0	0	0	0	1	24	+-	NM_001292086	6644	481	731	LMAW01000003.1	4188689	2832850	2833124	2	110,140,	481,591,	1355565,1355699,
 182	48	0	20	1	24	0	0	+-	NM_001292086	6644	508	782	LMAW01000003.1	4188689	2832901	2833151	2	102,148,	508,634,	1355538,1355640,
 92	25	0	0	0	0	1	183	+-	NM_001292086	6644	562	679	LMAW01000003.1	4188689	2832851	2833151	2	43,74,	562,605,	1355538,1355764,
 49	11	0	0	0	0	0	0	+-	NM_001292086	6644	619	679	LMAW01000003.1	4188689	2833085	2833145	1	60,	619,	1355544,
+3724	1107	0	326	14	609	44	22663	++	NM_204766	5992	49	5815	LMAW01000002.1	5883884	103604	131424	45	51,78,132,3,152,2,29,75,90,63,1,95,81,3,136,119,150,2,169,180,123,48,90,117,123,134,241,261,177,146,90,72,267,129,81,2,196,348,116,193,106,123,177,99,87,	49,154,256,388,391,543,545,574,691,781,844,845,967,1048,1051,1187,1306,1456,1458,1627,1807,1960,2008,2098,2215,2338,2472,2713,2974,3151,3298,3388,3493,3778,3943,4024,4026,4222,4696,4812,5051,5212,5335,5608,5728,	103604,104407,107764,107909,108002,108233,108257,109261,112047,112967,113036,113148,114257,114391,114866,116183,116438,116593,116908,117767,118030,118682,120119,120672,121329,122185,122724,123518,123945,124449,124904,125793,125898,126790,127405,127543,127717,128055,129238,129442,129761,130020,130254,131103,131337,
+537	234	0	42	7	1113	7	34987	++	NM_204766	5992	262	2188	LMAW01000001.1	10443977	9729332	9765132	8	108,153,66,93,39,171,99,84,	262,388,574,688,1414,1459,1981,2104,	9729332,9734141,9751019,9755220,9762042,9762555,9764665,9765048,
+189	73	0	0	4	773	4	13674	++	NM_204766	5992	553	1588	LMAW01000003.1	4188689	2559457	2573393	5	84,66,33,13,66,	553,739,844,1457,1522,	2559457,2563244,2563349,2570767,2573327,
+143	58	0	19	3	659	4	9089	++	NM_204766	5992	574	1453	LMAW01000003.1	4188689	259524	268833	5	27,80,11,66,36,	574,700,780,847,1417,	259524,261255,261529,262379,268797,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amaVit1_Gallus_gallus_filtered.psl	Wed Feb 01 17:16:02 2017 -0500
@@ -0,0 +1,5 @@
+2134	377	0	62	6	361	28	33978	++	NM_001006176	3248	296	3230	LMAW01000001.1	10443977	10027590	10064141	29	110,48,10,76,75,152,66,51,87,83,55,56,148,96,63,24,177,107,83,145,109,83,145,90,117,129,68,93,27,	296,406,454,502,578,653,844,910,961,1048,1131,1186,1242,1390,1486,1609,1741,1918,2025,2108,2253,2362,2445,2590,2680,2797,2926,3034,3203,	10027590,10031154,10031353,10031517,10033261,10033696,10034454,10035646,10043484,10043794,10059668,10059824,10059973,10060798,10060979,10061265,10061487,10061740,10061947,10062143,10062365,10062584,10062743,10063001,10063188,10063383,10063616,10063806,10064114,
+1247	370	0	44	6	280	17	96639	+-	NM_001012804	4592	121	2062	LMAW01000001.1	10443977	517465	615765	18	119,32,74,112,2,87,150,114,39,171,3,94,58,142,81,107,137,139,	121,240,272,355,467,469,598,757,1066,1129,1300,1303,1397,1455,1597,1679,1786,1923,	9828212,9828431,9869264,9881475,9881600,9882641,9893172,9893708,9899022,9904744,9904997,9908755,9909749,9911745,9915154,9915481,9920844,9926373,
+420	114	0	0	4	99	3	7093	++	NM_001031013	971	69	702	LMAW01000002.1	5883884	3610886	3618513	5	33,66,21,276,138,	69,138,232,285,564,	3610886,3610955,3617941,3618099,3618375,
+2710	404	0	198	24	2053	34	11593	+-	NM_001292086	6644	10	5375	LMAW01000003.1	4188689	2820233	2835138	37	64,727,107,5,90,146,87,93,135,137,171,93,72,90,69,90,23,34,9,72,125,70,106,21,23,161,26,29,93,15,45,18,11,12,54,162,27,	10,74,801,908,913,1003,1149,1236,1329,1464,1617,1792,1885,2049,2163,2350,2918,3036,3070,3366,3445,3576,3647,3794,3985,4008,4236,4263,4476,4603,4688,4742,4882,4941,5031,5113,5348,	1353551,1355131,1356116,1356798,1356971,1357445,1359020,1359444,1359816,1360523,1360971,1361443,1365082,1365245,1365351,1365525,1366096,1366220,1366255,1366474,1366549,1366674,1366749,1366889,1367081,1367105,1367332,1367358,1367558,1367688,1367786,1367837,1367979,1368029,1368113,1368194,1368429,
+3724	1107	0	326	14	609	44	22663	++	NM_204766	5992	49	5815	LMAW01000002.1	5883884	103604	131424	45	51,78,132,3,152,2,29,75,90,63,1,95,81,3,136,119,150,2,169,180,123,48,90,117,123,134,241,261,177,146,90,72,267,129,81,2,196,348,116,193,106,123,177,99,87,	49,154,256,388,391,543,545,574,691,781,844,845,967,1048,1051,1187,1306,1456,1458,1627,1807,1960,2008,2098,2215,2338,2472,2713,2974,3151,3298,3388,3493,3778,3943,4024,4026,4222,4696,4812,5051,5212,5335,5608,5728,	103604,104407,107764,107909,108002,108233,108257,109261,112047,112967,113036,113148,114257,114391,114866,116183,116438,116593,116908,117767,118030,118682,120119,120672,121329,122185,122724,123518,123945,124449,124904,125793,125898,126790,127405,127543,127717,128055,129238,129442,129761,130020,130254,131103,131337,
--- a/tool_dependencies.xml	Tue Jan 31 18:31:42 2017 -0500
+++ b/tool_dependencies.xml	Wed Feb 01 17:16:02 2017 -0500
@@ -1,8 +1,11 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <!-- UCSC tools:
+    <!-- required tools:
         - blat
-        - 
+        - sort
+        - pslReps
+        - faPolyASizes
+        - pslCDnaFilter
     -->
     <package name="ucsc_tools_340_for_BLAT" version="1.0">
         <install version="1.0">