Mercurial > repos > jjohnson > split_to_collection
changeset 0:f6254e4e155e draft default tip
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/split_to_collection commit b2ce04dd96d8b00103c23b58a4c6539a6b30809a-dirty
author | jjohnson |
---|---|
date | Thu, 26 Oct 2017 13:32:38 -0400 |
parents | |
children | |
files | split_tabular_to_collection.xml test-data/input.tsv |
diffstat | 2 files changed, 75 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/split_tabular_to_collection.xml Thu Oct 26 13:32:38 2017 -0400
@@ -0,0 +1,61 @@
+<tool id="split_tabular_to_collection" name="Split Tabular into Collection" version="0.1.0">
+    <!-- Writes each run of "lines" consecutive input lines to its own file;
+         the files are gathered into the "output_set" list collection. -->
+    <description>by lines</description>
+    <command><![CDATA[
+        ## Zero-pad the numeric file suffix to the number of digits in the
+        ## data_lines metadata value of the input.
+        #set $width = len(str($input.dataset.metadata.data_lines))
+        ## Skip as many leading lines as the comment_lines metadata reports,
+        ## unless the user turned that option off.
+        #if $skip_comment_lines:
+            #set $skip = $input.dataset.metadata.comment_lines
+        #else
+            #set $skip = 0
+        #end if
+        ## NR is 1-based, so NR - offset is 0 for the first line that is kept.
+        #set $offset = $skip + 1
+        ## Each kept line is written to a file named <input name>_<suffix>,
+        ## where the suffix is the 0-based index of the first line of its chunk.
+        ## NOTE(review): input.name is interpolated into the awk program text;
+        ## names containing quote characters may need escaping. Confirm.
+        awk 'NR > $skip{ print \$0 > (sprintf("%s_%.${width}d","${input.name}",int((NR-${offset})/${lines})*${lines}) ) }' '$input'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Tabular dataset to split"/>
+        <param name="lines" type="integer" value="1000" min="1" label="Number of lines per output dataset"/>
+        <param name="skip_comment_lines" type="boolean" truevalue="yes" falsevalue="no" checked="true"
+               label="Skip comment lines"/>
+    </inputs>
+    <outputs>
+        <collection name="output_set" type="list" label="${input.name} Split List">
+            <!-- The __name__ pattern uses each discovered file name as the element identifier. -->
+            <discover_datasets pattern="__name__" ext="tabular" visible="false"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="input.tsv" ftype="tabular"/>
+            <param name="lines" value="20"/>
+            <output_collection name="output_set" type="list">
+                <!-- With lines=20, rows 1 to 20 belong to the chunk with suffix 00. -->
+                <element name="input.tsv_00">
+                    <assert_contents>
+                        <has_text_matching expression="20\tt\tT" />
+                    </assert_contents>
+                </element>
+                <!-- Row 21 is the first line of the next chunk, whose suffix is 20. -->
+                <element name="input.tsv_20">
+                    <assert_contents>
+                        <has_text_matching expression="21\tu\tU" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+        Splits a tabular dataset into multiple datasets in a dataset collection.
+        This can be used in a workflow to process datasets in the collection in parallel.
+
+    ]]></help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.tsv Thu Oct 26 13:32:38 2017 -0400 @@ -0,0 +1,28 @@ +#First comment line +#col1 col2 col3 +1 a A +2 b B +3 c C +4 d D +5 e E +6 f F +7 g G +8 h H +9 i I +10 j J +11 k K +12 l L +13 m M +14 n N +15 o O +16 p P +17 q Q +18 r R +19 s S +20 t T +21 u U +22 v V +23 w W +24 x X +25 y Y +26 z Z