Mercurial > repos > bgruening > graphclust_preprocessing
changeset 0:7ba39ab6f48d draft default tip
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit f447414150c19865e904d3914a68e2479fadddce
author | bgruening |
---|---|
date | Thu, 15 Dec 2016 18:18:10 -0500 |
parents | |
children | |
files | preprocessing.xml test-data/FASTA/data.fasta test-data/FASTA/data.fasta.scan test-data/FASTA/data.map test-data/FASTA/data.names test-data/input.fa |
diffstat | 6 files changed, 165 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocessing.xml Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,85 @@ +<tool id="preproc" name="Preprocessing" version="0.1"> + <requirements> + <requirement type="package" version="0.1">graphclust-wrappers</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command> + <![CDATA[ + + 'preprocessing.pl' + '$fastaFile' $max_length $in_winShift $min_seq_length + +]]> + </command> + <inputs> + <param type="data" name="fastaFile" format="fasta" /> + <param name="max_length" type="integer" value="10000" size="5" label="window size"/> + <param name="in_winShift" type="integer" value="100" size="5" label="window shift in percent"/> + <param name="min_seq_length" type="integer" value="5" size="5" label="minimum sequence length"/> + </inputs> + + <outputs> + <data name="data.fasta" format="fasta" from_work_dir="FASTA/data.fasta" label="data.fasta"/> + <data name="data.map" format="txt" from_work_dir="FASTA/data.map" label="data.map"/> + <data name="data.names" format="txt" from_work_dir="FASTA/data.names" label="data.names"/> + <data name="data.fasta.scan" format="fasta" from_work_dir="FASTA/data.fasta.scan" label="data.fasta.scan"/> + <data name="FASTA" format="zip" from_work_dir="FASTA.zip" label="FASTA.ZIP"/> + </outputs> + + + <tests> + <test> + <param name="fastaFile" value="input.fa"/> + <param name="max_length" value="10000"/> + <param name="in_winShift" value="100"/> + <param name="min_seq_length" value="5"/> + <output name="data.fasta" file="FASTA/data.fasta"/> + <output name="data.map" file="FASTA/data.map" /> + <output name="data.names" file="FASTA/data.names"/> + <output name="data.fasta.scan" file="FASTA/data.fasta.scan" /> + </test> +</tests> + + <help> +<![CDATA[ + +**What it does** + +The tool takes as input a file of sequences in FASTA format and creates the final input for GraphClust based on the given parameters. 
+ +**Parameters** + ++ **window size** : All input sequences are split into fragments of this length. + The shift of the sliding window can be defined via option *window shift in percent*. + This parameter reflects the expected length of signals to be found. + Slightly larger windows are usually ok. Too small windows can disturb existing signals. + + + + ++ **window shift in percent** : Relative window size in % for window shift during input preprocessing. + Please note that a small shift results in many more fragments for clustering. The benefit is that RNA + motifs/structures are not destroyed by arbitrary split points. Smaller + shifts usually increase the cluster quality. Too small shifts (<20) are not + recommended as a dense center is "polluted" by overlapping fragments and + no other occurrences in the dataset can be found. + + + + + ++ **minimum sequence length** : Minimal length of input sequences. + Every input sequence below that length is ignored completely during clustering. + + + ]]></help> + + + <citations> + <citation type="doi">10.1093/bioinformatics/bts224</citation> + </citations> + + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/FASTA/data.fasta Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,20 @@ +>1 SEQ1#1#120#+ ORIGID RF00001_rep.0_AL096764.11/46123-46004_1 ORIGHEAD RF00001_rep.0 +GUCUAUGGCCAUACCACCCUGAAUGUGCUUGAUCUCAUCUGAUCUCGUGAAGCCAAGCAGGGUGGGGCCUAGUUAGUACUUGGAUGGGAGACUUCCUGGGAAUAUAAGCUGCUGUUGGCU +>2 SEQ2#1#118#+ ORIGID RF00001_rep.1_U89919.1/939-1056_2 ORIGHEAD RF00001_rep.1 +CUUUACGGCCACACCACCCUGAACGCACCGGAUCUCGACUGACCUUGAAAGCUAAGCAGGAUCGGGCCUGGUUAGUAUUGGGAUGGCAGACCCCCUGGAAAUACAGGGUGCUGAAGGU +>3 SEQ3#1#104#+ ORIGID RF00001_rep.2_AJ508600.1/161-58_3 ORIGHEAD RF00001_rep.2 +GUCUACAGCCAUACCAUCCUGAACAUGCCAGAUCUUGUCUGACCUCUGAAGCUAAGCAGGGUCAAGCCUGGUUAGUACUUGGGAGAAGCUGGUGUGGCUAGACC +>4 SEQ4#1#73#+ ORIGID RF00005_rep.0_M15347.1/1040-968_4 ORIGHEAD RF00005_rep.0 +GGCUCCAUAGCUCAGGGGUUAGAGCACUGGUCUUGUAAACCAGGGGUCGCGAGUUCAAUUCUCGCUGGGGCUU +>5 SEQ5#1#72#+ ORIGID RF00005_rep.10_X58792.1/174-245_5 ORIGHEAD RF00005_rep.10 +GGUCCCAUGGUGUAAUGGUUAGCACUCUGGACUUUGAAUCCAGCGAUCCGAGUUCAAAUCUCGGUGGGACCU +>6 SEQ6#1#66#+ ORIGID RF00005_rep.11_AF346992.1/15890-15955_6 ORIGHEAD RF00005_rep.11 +GUCCUUGUAGUAUAAACUAAUACACCAGUCUUGUAAACCGGAGAUGAAAACCUUUUUCCAAGGACA +>7 SEQ7#1#83#+ ORIGID RF00005_rep.12_AC108081.2/59868-59786_7 ORIGHEAD RF00005_rep.12 +GUCAGGAUGGCCGAGCGGUCUAAGGCGCUGCGUUCAGGUCGCAGUCUCCCCUGGAGGCGUGGGUUCGAAUCCCACUUCUGACA +>8 SEQ8#1#70#+ ORIGID RF00005_rep.13_AC067849.6/4771-4840_8 ORIGHEAD RF00005_rep.13 +CACUGUAAAGCUAACUUAGCAUUAACCUUUUAAGUUAAAGAUUAAGAGAACCAACACCUCUUUACAGUGA +>9 SEQ9#1#73#+ ORIGID RF00005_rep.14_AL021808.2/65570-65498_9 ORIGHEAD RF00005_rep.14 +GCUUCUGUAGUGUAGUGGUUAUCACGUUCGCCUCACACGCGAAAGGUCCCCGGUUCGAAACCGGGCAGAAGCA +>10 SEQ10#1#73#+ ORIGID RF00005_rep.15_AC008443.10/42590-42518_10 ORIGHEAD RF00005_rep.15 +GCCCGGCUAGCUCAGUCGGUAGAGCAUGAGACUCUUAAUCUCAGGGUCGUGGGUUCGAGCCCCACGUUGGGCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/FASTA/data.fasta.scan Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,20 @@ +>SEQ1 ORIGID RF00001_rep.0_AL096764.11/46123-46004_1 ORIGHEAD RF00001_rep.0 +GUCUAUGGCCAUACCACCCUGAAUGUGCUUGAUCUCAUCUGAUCUCGUGAAGCCAAGCAGGGUGGGGCCUAGUUAGUACUUGGAUGGGAGACUUCCUGGGAAUAUAAGCUGCUGUUGGCU +>SEQ2 ORIGID RF00001_rep.1_U89919.1/939-1056_2 ORIGHEAD RF00001_rep.1 +CUUUACGGCCACACCACCCUGAACGCACCGGAUCUCGACUGACCUUGAAAGCUAAGCAGGAUCGGGCCUGGUUAGUAUUGGGAUGGCAGACCCCCUGGAAAUACAGGGUGCUGAAGGU +>SEQ3 ORIGID RF00001_rep.2_AJ508600.1/161-58_3 ORIGHEAD RF00001_rep.2 +GUCUACAGCCAUACCAUCCUGAACAUGCCAGAUCUUGUCUGACCUCUGAAGCUAAGCAGGGUCAAGCCUGGUUAGUACUUGGGAGAAGCUGGUGUGGCUAGACC +>SEQ4 ORIGID RF00005_rep.0_M15347.1/1040-968_4 ORIGHEAD RF00005_rep.0 +GGCUCCAUAGCUCAGGGGUUAGAGCACUGGUCUUGUAAACCAGGGGUCGCGAGUUCAAUUCUCGCUGGGGCUU +>SEQ5 ORIGID RF00005_rep.10_X58792.1/174-245_5 ORIGHEAD RF00005_rep.10 +GGUCCCAUGGUGUAAUGGUUAGCACUCUGGACUUUGAAUCCAGCGAUCCGAGUUCAAAUCUCGGUGGGACCU +>SEQ6 ORIGID RF00005_rep.11_AF346992.1/15890-15955_6 ORIGHEAD RF00005_rep.11 +GUCCUUGUAGUAUAAACUAAUACACCAGUCUUGUAAACCGGAGAUGAAAACCUUUUUCCAAGGACA +>SEQ7 ORIGID RF00005_rep.12_AC108081.2/59868-59786_7 ORIGHEAD RF00005_rep.12 +GUCAGGAUGGCCGAGCGGUCUAAGGCGCUGCGUUCAGGUCGCAGUCUCCCCUGGAGGCGUGGGUUCGAAUCCCACUUCUGACA +>SEQ8 ORIGID RF00005_rep.13_AC067849.6/4771-4840_8 ORIGHEAD RF00005_rep.13 +CACUGUAAAGCUAACUUAGCAUUAACCUUUUAAGUUAAAGAUUAAGAGAACCAACACCUCUUUACAGUGA +>SEQ9 ORIGID RF00005_rep.14_AL021808.2/65570-65498_9 ORIGHEAD RF00005_rep.14 +GCUUCUGUAGUGUAGUGGUUAUCACGUUCGCCUCACACGCGAAAGGUCCCCGGUUCGAAACCGGGCAGAAGCA +>SEQ10 ORIGID RF00005_rep.15_AC008443.10/42590-42518_10 ORIGHEAD RF00005_rep.15 +GCCCGGCUAGCUCAGUCGGUAGAGCAUGAGACUCUUAAUCUCAGGGUCGUGGGUUCGAGCCCCACGUUGGGCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/FASTA/data.map Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,10 @@ +1 SEQ1#1#120#+ +2 SEQ2#1#118#+ +3 SEQ3#1#104#+ +4 SEQ4#1#73#+ +5 SEQ5#1#72#+ +6 SEQ6#1#66#+ +7 SEQ7#1#83#+ +8 SEQ8#1#70#+ +9 SEQ9#1#73#+ +10 SEQ10#1#73#+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/FASTA/data.names Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,10 @@ +1 SEQ1#1#120#+ ORIGID RF00001_rep.0_AL096764.11/46123-46004_1 ORIGHEAD RF00001_rep.0 +2 SEQ2#1#118#+ ORIGID RF00001_rep.1_U89919.1/939-1056_2 ORIGHEAD RF00001_rep.1 +3 SEQ3#1#104#+ ORIGID RF00001_rep.2_AJ508600.1/161-58_3 ORIGHEAD RF00001_rep.2 +4 SEQ4#1#73#+ ORIGID RF00005_rep.0_M15347.1/1040-968_4 ORIGHEAD RF00005_rep.0 +5 SEQ5#1#72#+ ORIGID RF00005_rep.10_X58792.1/174-245_5 ORIGHEAD RF00005_rep.10 +6 SEQ6#1#66#+ ORIGID RF00005_rep.11_AF346992.1/15890-15955_6 ORIGHEAD RF00005_rep.11 +7 SEQ7#1#83#+ ORIGID RF00005_rep.12_AC108081.2/59868-59786_7 ORIGHEAD RF00005_rep.12 +8 SEQ8#1#70#+ ORIGID RF00005_rep.13_AC067849.6/4771-4840_8 ORIGHEAD RF00005_rep.13 +9 SEQ9#1#73#+ ORIGID RF00005_rep.14_AL021808.2/65570-65498_9 ORIGHEAD RF00005_rep.14 +10 SEQ10#1#73#+ ORIGID RF00005_rep.15_AC008443.10/42590-42518_10 ORIGHEAD RF00005_rep.15
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fa Thu Dec 15 18:18:10 2016 -0500 @@ -0,0 +1,20 @@ +>RF00001_rep.0_AL096764.11/46123-46004 RF00001_rep.0 +GUCUAUGGCCAUACCACCCUGAAUGUGCUUGAUCUCAUCUGAUCUCGUGAAGCCAAGCAGGGUGGGGCCUAGUUAGUACUUGGAUGGGAGACUUCCUGGGAAUAUAAGCUGCUGUUGGCU +>RF00001_rep.1_U89919.1/939-1056 RF00001_rep.1 +CUUUACGGCCACACCACCCUGAACGCACCGGAUCUCGACUGACCUUGAAAGCUAAGCAGGAUCGGGCCUGGUUAGUAUUGGGAUGGCAGACCCCCUGGAAAUACAGGGUGCUGAAGGU +>RF00001_rep.2_AJ508600.1/161-58 RF00001_rep.2 +GUCUACAGCCAUACCAUCCUGAACAUGCCAGAUCUUGUCUGACCUCUGAAGCUAAGCAGGGUCAAGCCUGGUUAGUACUUGGGAGAAGCUGGUGUGGCUAGACC +>RF00005_rep.0_M15347.1/1040-968 RF00005_rep.0 +GGCUCCAUAGCUCAGGGGUUAGAGCACUGGUCUUGUAAACCAGGGGUCGCGAGUUCAAUUCUCGCUGGGGCUU +>RF00005_rep.10_X58792.1/174-245 RF00005_rep.10 +GGUCCCAUGGUGUAAUGGUUAGCACUCUGGACUUUGAAUCCAGCGAUCCGAGUUCAAAUCUCGGUGGGACCU +>RF00005_rep.11_AF346992.1/15890-15955 RF00005_rep.11 +GUCCUUGUAGUAUAAACUAAUACACCAGUCUUGUAAACCGGAGAUGAAAACCUUUUUCCAAGGACA +>RF00005_rep.12_AC108081.2/59868-59786 RF00005_rep.12 +GUCAGGAUGGCCGAGCGGUCUAAGGCGCUGCGUUCAGGUCGCAGUCUCCCCUGGAGGCGUGGGUUCGAAUCCCACUUCUGACA +>RF00005_rep.13_AC067849.6/4771-4840 RF00005_rep.13 +CACUGUAAAGCUAACUUAGCAUUAACCUUUUAAGUUAAAGAUUAAGAGAACCAACACCUCUUUACAGUGA +>RF00005_rep.14_AL021808.2/65570-65498 RF00005_rep.14 +GCUUCUGUAGUGUAGUGGUUAUCACGUUCGCCUCACACGCGAAAGGUCCCCGGUUCGAAACCGGGCAGAAGCA +>RF00005_rep.15_AC008443.10/42590-42518 RF00005_rep.15 +GCCCGGCUAGCUCAGUCGGUAGAGCAUGAGACUCUUAAUCUCAGGGUCGUGGGUUCGAGCCCCACGUUGGGCG