Mercurial > repos > jjohnson > split_to_collection

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_tabular_to_collection.xml	Thu Oct 26 13:32:38 2017 -0400
@@ -0,0 +1,47 @@
+<tool id="split_tabular_to_collection" name="Split Tabular into Collection" version="0.1.0">
+    <description>by lines</description>
+    <command><![CDATA[
+        #set $width = len(str($input.dataset.metadata.data_lines))
+        #if $skip_comment_lines:
+            #set $skip = $input.dataset.metadata.comment_lines
+        #else
+            #set $skip = 0
+        #end if
+        #set $offset = $skip + 1
+        awk 'NR > $skip{ print \$0 > (sprintf("%s_%.${width}d","${input.name}",int((NR-${offset})/${lines})*${lines}) ) }' $input
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Tabular dataset to split"/>
+        <param name="lines" type="integer" value="1000" min="1" label="Number of lines per output dataset"/>
+        <param name="skip_comment_lines" type="boolean" truevalue="yes" falsevalue="no" checked="true"
+            label="Skip comment lines"/>
+    </inputs>
+    <outputs>
+        <collection name="output_set" type="list" label="${input.name} Split List">
+            <discover_datasets pattern="__name__" ext="tabular" visible="false"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+          <param name="input" value="input.tsv" ftype="tabular"/>
+          <param name="lines" value="20"/>
+          <output_collection name="output_set" type="list">
+              <element name="input.tsv_00">
+                  <assert_contents>
+                      <has_text_matching expression="20\tt\tT" />
+                  </assert_contents>
+              </element>
+              <element name="input.tsv_00">
+                  <assert_contents>
+                      <has_text_matching expression="21\tu\tU" />
+                  </assert_contents>
+              </element>
+          </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+    Splits a tabular dataset into multiple datsets in a dataset collection.
+    This can be used in a workflow to process datasets in the collection in parallel.
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.tsv	Thu Oct 26 13:32:38 2017 -0400
@@ -0,0 +1,28 @@
+#First comment line
+#col1	col2	col3
+1	a	A
+2	b	B
+3	c	C
+4	d	D
+5	e	E
+6	f	F
+7	g	G
+8	h	H
+9	i	I
+10	j	J
+11	k	K
+12	l	L
+13	m	M
+14	n	N
+15	o	O
+16	p	P
+17	q	Q
+18	r	R
+19	s	S
+20	t	T
+21	u	U
+22	v	V
+23	w	W
+24	x	X
+25	y	Y
+26	z	Z