Repository 'sample_seqs'
hg clone https://testtoolshed.g2.bx.psu.edu/repos/peterjc/sample_seqs

Changeset 3:dc55e58fa890 (2014-11-21)
Previous changeset 2:219924bd7e3e (2014-03-27) Next changeset 4:09a4ee5d12fd (2015-03-06)
Commit message:
Uploaded v0.1.2, embeds citations, interleaved mode
modified:
tools/sample_seqs/README.rst
tools/sample_seqs/sample_seqs.py
tools/sample_seqs/sample_seqs.xml
tools/sample_seqs/tool_dependencies.xml
added:
test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
test-data/ecoli.pair_sample_N100.fastq
test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
b
diff -r 219924bd7e3e -r dc55e58fa890 test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
b
Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff has changed
b
diff -r 219924bd7e3e -r dc55e58fa890 test-data/ecoli.pair_sample_N100.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.pair_sample_N100.fastq Fri Nov 21 08:30:03 2014 -0500
b
@@ -0,0 +1,208 @@
+@frag_1
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC
++
+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1_a
+GAGACATATTGCCCGTTGCAGTCAGAATGAAAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##
+@frag_200
+TGGTAATGGTGATGGTGGTGGTAATGGTGGTGCTAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_201
+TAGCACCACCATTACCACCACCATCACCATTACCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_400
+TGGCCACCTGCCCCTGCCTGGCATTGCTTTCCAGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_401
+TCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_600
+TTGGGCAAATTCCTGATCGACGAAAGTTTTCAATTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_601
+AATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_800
+ATATCGACGGTAGATTCGAGGTAATGCCCCACTGCC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_801
+GCAGTGGGGCATTACCTCGAATCTACCGTCGATATT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1000
+TATAGACCCCGTCAACGTCCGTCCAAATCTCGCAAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1001
+TTGCGAGATTTGGACGGACGTTGACGGGGTCTATAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1200
+ATCACGGCTGGCACCAATGAGCGTACCTGGTGCTTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1201
+AAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1400
+CAGTCGCTTTGTGGAACGCAGAAACTGATGCTGTAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1401
+TACAGCATCAGTTTCTGCGTTCCACAAAGCGACTGT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1600
+ATCCCTGAGCAATGGCGACAATGTTGATATTGGCGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1601
+CGCCAATATCAACATTGTCGCCATTGCTCAGGGATC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1800
+GACACGTAAGTCGATATGTTTATTCTTCAGCCAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1801
+GCTGGCTGAAGAATAAACATATCGACTTACGTGTCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2000
+AAGTCGGCATATTGATCCGCCACTGCCTGGCTGGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2001
+TCCAGCCAGGCAGTGGCGGATCAATATGCCGACTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2200
+CGGAGAACTTCATCAATTCATCACCTGCATTGAGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2201
+GCTCAATGCAGGTGATGAATTGATGAAGTTCTCCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2400
+TTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2401
+AAACGGGACGTGAACTGGAGCTGGCGGATATTGAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2600
+GGATCATTACCATCCACTTCGGCAATCTTCACGCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2601
+CGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2800
+ATTGGCACTGGAAGCCGGGGCATAAACTTTAACCAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2801
+TGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3000
+TGCTTACCCAGTTCCTGGCAAAAACGCTCCCAGCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3001
+TGCTGGGAGCGTTTTTGCCAGGAACTGGGTAAGCAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3200
+ACGGTGCCACGTTGTCGTAATGAATGCTGCCGGAGA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3201
+CTCCGGCAGCATTCATTACGACAACGTGGCACCGTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3400
+GTGAATGAAGCCTGCCAGATGTCGCCCGTGCGCAAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3401
+TTGCGCACGGGCGACATCTGGCAGGCTTCATTCACG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3600
+ACGCGCTGGGCGGTTTCCGGCTTGTCACACAGAGCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3601
+GCTCTGTGTGACAAGCCGGAAACCGCCCAGCGCGTT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3800
+GGTCGTGCGGAAAAAACAGCCCCTGATTTTTGCCCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3801
+GGGCAAAAATCAGGGGCTGTTTTTTCCGCACGACCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4000
+CCCGTGGAACAATTCCAGACAACCGACATCGCTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4001
+AAAGCGATGTCGGTTGTCTGGAATTGTTCCACGGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4200
+TTTTCTTGCAGTGGACTGATTTTGCCTCGTGGATAG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4201
+TATCCACGAGGCAAAATCAGTCCACTGCAAGAAAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4400
+GCGGCAGCTGCGCAACAGCTTCAAAGTAGTAGCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4401
+TTGCTACTACTTTGAAGCTGTTGCGCAGCTGCCGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4600
+CATCGCGTTGGATAACGTCGCCTGAGTCGCTTTGGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4601
+CCAAAGCGACTCAGGCGACGTTATCCAACGCGATGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4800
+CCTGGATTCAACTGATCACGCAGCGCACGATAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4801
+GCTTATCGTGCGCTGCGTGATCAGTTGAATCCAGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5000
+AGATAATGAATAGATTTTACTGATGATTCATCATCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5001
+GATGATGAATCATCAGTAAAATCTATTCATTATCTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMK
b
diff -r 219924bd7e3e -r dc55e58fa890 test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta Fri Nov 21 08:30:03 2014 -0500
b
b'@@ -0,0 +1,214 @@\n+>Streptococcus_suis|ORF1 length 457 aa, 1374 bp, from 1..1374 of Streptococcus_suis\n+MNQEQLFWQRFIELAKVNFKPSIYDFYVADAKLLGINQQVANIFLNRPFKKDFWEKNFEE\n+LMIAASFESYGEPLTIQYQFTEDEQEIRNTTNTRSSIVHQVQTLEPATPQETFKPVHSDI\n+KSQYTFANFVQGDNNHWAKAAALAVSDNLGELYNPLFIFGGPGLGKTHILNAIGNKVLAD\n+NPQARIKYVSSETFINEFLEHLRLNDMESFKKTYRNLDLLLIDDIQSLRNKATTQEEFFH\n+TFNALHEKNKQIVLTSDRNPDHLDNLEERLVTRFKWGLTSEITPPDFETRIAILRNKCEN\n+LPYNFTNETLSYLAGQFDSNVRDLEGALKDIHLIATMRQLSEISVEVAAEAIRSRKQTNP\n+QNMVIPIEKIQTEVGNFYGVSLKELKGSKRVQHIVHARQVAMFLAREMTDNSLPKIGKEF\n+GNRDHTTVMHAYNKIKTLLLDDENLEIEITSIKNKLR\n+>Streptococcus_suis|ORF2 length 385 aa, 1158 bp, from 1507..2664 of Streptococcus_suis\n+IINKGESMIQFSINKNIFLQALSITKRAISTKNAIPILSTVKITVTSEGITLTGSNGQIS\n+IEHFISIQDENAGLLISSPGSILLEAGFFINVVSSMPDLVLDFNEIEQKQIVLTSGKSEI\n+TLKGKEAEQYPRLQEVPTSKPLVLETKVLKQTINETAFAASTQESRPILTGVHFVLTENK\n+NLKTVATDSHRMSQRKLVLDTSGDDFNVVIPSRSLREFTAVFTDDIETVEVFFSNNQILF\n+RSEHISFYTRLLEGTYPDTDRLIPTEFKTTAIFDTANLRHSMERARLLSNATQNGTVKLE\n+IANNVVSAHVNSPEVGRVNEELDTVEVSGEDLVISFNPTYLIEALKATTSEQVKISFISS\n+VRPFTLIPNNEGEDFIQLVTPVRTN\n+>Streptococcus_suis|ORF201 length 360 aa, 1083 bp, from complement(128035..129117) of Streptococcus_suis\n+SCHGGRRMTLFGKIKEVTELQSLPGFEGQVRNHIRQKITPHVDRIETDGLGGIFGIKDTA\n+VENAPRILVVAHMDEVGFMISQIKPDGTFRVVELGGWNPLVVSSQAFTLQLQDGRTIPAI\n+SGSVPPHLSRGANAPGMPAIADIIFDAGFANYDEAWAFGVRPGDVLVPKNETILTANGKN\n+VISKAWDNRFGVLMVTELLESLSGHALPNQLIAGANVQEEVGLRGAHASTTKFNPDIFLA\n+VDCSPAGDIYGDQGKIGDGTLLRFYDPGHIMLKNMKDFLLTTAEEAGVKFQYYCGKGGTD\n+AGAAHLKNHGVPSTTIGVCARYIHSHQTLYSMDDFLEAQAFLQTIVKKLDRSTVDLIKNY\n+>Streptococcus_suis|ORF202 length 106 aa, 321 bp, from 128792..129112 of Streptococcus_suis\n+RVKAWLETTRGFQPPSSTTRKVPSGLIWLIIKPTSSMWATTRIRGAFSTAVSLIPKIPPS\n+PSVSMRSTCGVIFWRMWLRTCPSNPGKLCNSVTSLIFPKRVILLPP\n+>Streptococcus_suis|ORF401 length 120 aa, 363 bp, from 265643..266005 of 
Streptococcus_suis\n+TTGTTSPIAPKWKASSKSLRVPTSEPTTLIPSSTVFTILRSMYSDGSPTATTYPPARTLS\n+IAWLKATLETAVTTVECTPPPVISLIYPGTSSTSSPLIVTSAPTSLASSNLSLLMSTAIT\n+>Streptococcus_suis|ORF402 length 201 aa, 606 bp, from 265741..266346 of Streptococcus_suis\n+HSLHDTEVHVFRWKSDSYYISTSTNTVNSLVEGYFGNSCYNSRVYTATSNFFNISRNIFY\n+FKSVDRHICTNFFGEFQFIIIDVYGDNMSVEDFFSVLYSKVSKSTSTIDSNPLTWFQVSF\n+FNRFVASNASTSDRTCLSWIKTFWNFYCIVRCYNTLLSHTTVNRVACIFYGTAESFATGC\n+TIFTHTTALEEPSNADTVTNF\n+>Streptococcus_suis|ORF601 length 665 aa, 1998 bp, from 409896..411893 of Streptococcus_suis\n+VMIQIGKIFAGRYRIVRQIGRGGMADVYLARDLILDGEEVAVKVLRTNYQTDQIAIQRFQ\n+REARAMAELDHPNIVRISDIGEEDGQQYLAMEYVNGLDLKRYIKENAPLSNDVAVRIMGQ\n+ILLAMRMAHTRGIVHRDLKPQNVLLTSNGVAKVTDFGIAVAFAETSLTQTNSMLGSVHYL\n+SPEQARGSKATIQSDIYAMGIILFEMLTGRIPYDGDSAVTIALQHFQKPLPSVREENANV\n+PQALENVVLKATAKKLNERYKSVAEMYADLASALSMDRQNEPRVELEGNKVDTKTLPKLS\n+QANVETKVPHTNSSAQVSATDKGSGKKEVAKSGNKPVSKPRPGIRTRYKVLIGAILLTVI\n+AAGLMFFNTPRTVTVPDVSGQTVEKATEMIEVAGLEVGNITEEATATVDEGLVIRTSPAA\n+KTTRRQGSKIDIVVATAALASIPDVVDKESDTARQELEALGFQVTIKEEYSEKVAQGLVI\n+KTDPGANSSAEKGAKITLYVSKGVAPQVVPNVVGKSQENATQILQTAGFSIGTITQEYSS\n+SVTAGQVISTDPVANTELAKGSIINLVISKGKELIMPDLTSGNYTYSQARSQLQALGVNA\n+ESIEKQEDRSYYSTTSDIVIGQYPAAGATIDGTVTLYVSVASTRTSSDSSAGSSTSTSTS\n+TGSGQ\n+>Streptococcus_suis|ORF602 length 120 aa, 363 bp, from complement(410593..410955) of Streptococcus_suis\n+LLSRLCSVYVCLALVLKQAYFPTSPLLSYQTPYPSQKLEQNCLYAVLSFQHWLGRVSARF\n+LYQPCSLLVQPWVHSDDPWIEPKLNLHTFLQPTYSARLIFLPLLLVQHFLRPEVRWHSLL\n+>Streptococcus_suis|ORF801 length 428 aa, 1287 bp, from 561960..563246 of 
Streptococcus_suis\n+KSSRDCESCLLLFVILKVMQADRRKTFGKMRIRINNLFFVAIAFMGIIISNSQVVLAIGK\n+ASVIQYLSYLVLILCIVNDLLKNNKHIVVYKLGYLFLIIFLFTIGICQQILPITTKIYLS\n+ISMMIISVLATLPISLIKDIDDFRRISNHLLFALFITSILGIMMGATMFTGAVEGIGFSQ\n+GFNGGLTHKNFFGITILMGFVLTYLAYKYGSYKRTDRFILGLELFLILISNTRSVYLILL\n+LFLFLVNLDKIKIEQRQWSTLKYISMLFCAIFLYYFFGFLITHSDSYAHRVNGLINFFEY\n+YRNDWFHLMFGAADLAYGDLTLDYAIRVRRVLGWNGTLEMPLLSIMLKNGFIGLVGYGIV\n+LYKLYRNVRILKTDNIKTIGKSVFIIVVLSATVENYIVNLSFVFMPICFCLLNSISTMES\n+TINKQLQT\n+>Streptococcus_suis|ORF802 length 333 aa, 1002 bp, from 563382..564383 of Streptococcus_suis\n+RMEKVSIIVPIFNTEKYLRECLDSIISQSYTNLEILLIDDGSSDSSTDICLEYAEQDGRI\n+KLFRLPNGGVSNARNYGIKNSTANYIMFV'..b'SQQIETYHSIRETIQFGQLYRLKKTSNTWAANYVSQ\n+DKNQVVFTFVKILAKPEAPLLHVRLKGLDPDALYECPQLGETFYGDELMNIGLTMPHVQK\n+DYFSVQYIFNKI\n+>Streptococcus_suis|ORF2201 length 272 aa, 819 bp, from complement(1531599..1532417) of Streptococcus_suis\n+DCSKIKIIDLAVGKLKLLSSKRKGAFMEIIRSKANHLVKQVKKLQQKKYRTSSYLIEGWH\n+LLEEAMEAGANIEHIFVVEEYFEKVAGLANVTVVSPEIMQELADSKTPQGVVAQLALPSQ\n+RLPETLDGKFLVLEDVQDPGNVGTMIRTADAAGFDGVFLSDKSADIYNMKVLRSMQGSHF\n+HLPVYRMPISSILTALKSNQIQILATTLSSQSVDYKEITPHSSFALVMGNEGQGISDLVA\n+DEADQLVHITMPGQAESLNVAIAAGILLFSFI\n+>Streptococcus_suis|ORF2202 length 101 aa, 306 bp, from 1532445..1532750 of Streptococcus_suis\n+MSCQKEKLMRKVKMIASGRVQGVGFRWSVQFLAVEIGDIYGRVWNNDDGTVTILAQSDNA\n+EKLSHFIHEIRKGPSRMAKVIYLDVTLANFEDYKDFQVSYR\n+>Streptococcus_suis|ORF2401 length 141 aa, 426 bp, from 1658030..1658455 of Streptococcus_suis\n+ASITVPIARTVGSAFSSWISATKRTVSNNSSMFWLNLAEISTNSDSPPQAVEITPCSANS\n+PMTRSGFAPGLSILLIATMIGTLAAFEWLIASIVCGMTPSSAATTRMVKSVTDAPRARIE\n+VKAACPGVSKKVIFLPASSIW\n+>Streptococcus_suis|ORF2402 length 266 aa, 801 bp, from 1658515..1659315 of 
Streptococcus_suis\n+GVQQGCFTMVNVSHDSHNRWAFCHLFFIEVALFYEETLNICVIDLYLFFRFNTIINHEEF\n+DSISIQRLVLSRHNSHKEEFFHNFSRFTFDSFCNFCDGHASSIFKFSWQFVELAFCDRFG\n+RLVSLAFFIFLVVIPVTCSLISHLILTISISLLFPWTIFFVTIKVTFFIWSSLFLTTGIY\n+SSFCNLLWYRCNKCRFHKWFAFHNRFFKLNFFWLLRLLFSFLSLTKTFFTGTSILRILFC\n+FQSSSTRFEVNFRSCWFCSLSLFKAS\n+>Streptococcus_suis|ORF2601 length 100 aa, 303 bp, from 1790150..1790452 of Streptococcus_suis\n+LKDGYQRLVVEGFADIAETFLQTETNLMTTVIFIARHDDDRPIAFPLGSLNQVNMTLVHG\n+SKGPKNNCYCLFHNLPFYCFLYFISYSFLKPKSRVFYIFL\n+>Streptococcus_suis|ORF2602 length 823 aa, 2472 bp, from complement(1790482..1792953) of Streptococcus_suis\n+ERGVVRMKISRGLQGVYEDAQLIAQRYSSDYLETWHLLLAFVINPDTVAGAILAEYPADV\n+LDYERAVYMVMGRRYHEELESFFFLPSSKRVKELQVFAEKIAEIVKSKGLGTEHIFMGML\n+LDKRSTASQILDQVGFHFEDSDDKVRFLDLRKNLEAKAGFTKEHLKAIRTMTKGGKPKQA\n+TVGNMMGMTQSQSGGLEDYTRDLTALARSGQLEPVIGRDEEISRMLQILSRKTKNNPVLV\n+GDAGVGKTALALGLAQRIANGEVPASLVNMRILELDLMNVIAGTRFRGDFEERMNNIIND\n+IEEDGRVILFIDELHTIMGSGSGIDSILDAANILKPALSRGTLRTVGATTQDEYQKHIEK\n+DAALVRRFAKVTIEEPSVADSVAILQGLKPAYEAHHKVTISDQAVVTAVAYAKRYLTSKN\n+LPDSAIDLLDEASATVQNRAKGQVEEGGLTALDQALMAGKYKTVTQLLLKAQEAENQATS\n+YSLEVTEEDILATLSRLSGIPVTKLSQTDAKKYLNLEQELHKRVIGQEEAISAVSRAIRR\n+NQSGIRTGHRPIGSFMFLGPTGVGKTELAKALAEILFDDESALIRFDMSEYMEKFAASRL\n+NGAPPGYVGYEEGGELTEKVRNKPYSVLLFDEVEKAHPDIFNVLLQVLDDGVLTDRKGRK\n+VDFSNTVIIMTSNLGATALRDDKTVGFGALDLSKSQEHVEKRIFEALKKAYRPEFINRID\n+EKVVFHSLTEADMQDVVKVMVKPLIAVAASKGITLKLQASALKLLAKEGYDPEMGARPLR\n+RLLQTKLEDPLAEMLLRGELPAGVTLKVGVKAEQLKFDSVKAG\n+>Streptococcus_suis|ORF2801 length 1006 aa, 3021 bp, from complement(1921434..1924454) of 
Streptococcus_suis\n+TQTKEYEMIEFRKKAVQLASLMSVFFLCTYSFTDAMYIMAESLSTDGASTIRRTYIEDKK\n+EDKDRLNIELVESLSSPKTIGQKITIDKQSLATQNFNEKGIVVITQKGLELKKDDLEKGW\n+KLDESYNEKDLAITKSETEKRSLSNELDVLSKTVEELPVYGENYHSYRLLPTTELDYSAD\n+NVSLTLSFTKVSEVIKGELVAVVDAEHIAYFKAEPSVFKEYSQVNEKPSSTEDVNVVSPS\n+QDPPVSETKENVPDNPESQGSSTVPESEQAVDALVEQRGVICIKLTKSSSEQEEGIEDTE\n+NEAIEGATFEVRNVESENLVYTGQTDKDGLLTISNLPLGNYAVIQKSTIDGYEISATKEV\n+VELTVAQSRQTVSISNSPKNPLEGLMLNSILDSSLIPRSARVARSLLDTSLLDNPTVTGN\n+ANATTTTTVFGNKTTTITREESNIKYIFKPITISIPGVYQSYSQDGVLKKKEVVVDSNTN\n+TTKIIWEYTTTVGGVNSNITSIRNAFSTTTDSGLGEPKITSIMKDGVAITPNTTYYGNFD\n+NFKSATDNLPVGNGTYVYTIETPVVIPSDNYSLDYRSEVTVDAPKGSKLTYNGTSVTLTQ\n+KETRTLSTADTITLPAKNDGGPLGDLKVDTVNTSNTNRTIGKYRDNDDKVIEWTSSQLND\n+TSTTQSFTFDVALDSSQAAHEYKVYIYEPSNGTYTETKAEKVATPGNQITVDNVPAGAVA\n+LVKTVTNVKDEKVNHTISGAQLEALKGDIKIQKNWEADSDKVDVTFTVNGGSLTNRKETL\n+SANNTQITIANVDKFSGMRSTATKKRIYYDVTEAVPSGYILSSAQTDWENLYYVFTNKKD\n+NTTTPVFPPDTCGNYGVSSIDLVSINYVMYKSGSKIWGGFDGSMKMNLKIPAFARAGDSF\n+TLELPPELKLSHVANPNVAWSTVSANGKVIAKVYHEKDNLIRFVLTTEAYSVQEYNGWFE\n+IGVPTSNVIKINNRETTELYKTGVLPNLPEWYTTTTRNQTLIKRSR\n+>Streptococcus_suis|ORF2802 length 252 aa, 759 bp, from complement(1925855..1926613) of Streptococcus_suis\n+LEARMQQYFVNGRAPQGMFQISDKDTAKHMFSVMRLQAGDQIVLVFDDGIKRLARVVDSQ\n+SQSVEIIEELTDNVELPISVTIAMGFPKGDKLEFVAQKATELGMSALWAFPADWSVVKWD\n+GKKLAKKAEKLEKIAQGAAEQSKRNRIPAVRLFEKKSDFLAQLAGFDQIILAYEEAAKEG\n+EQANLVKILSGLEIGQSVLVIVGPEGGVSPEEVAAFEGAGAVKTGLGPRILRAETAPLYA\n+LSTISYATELLR\n'
b
diff -r 219924bd7e3e -r dc55e58fa890 tools/sample_seqs/README.rst
--- a/tools/sample_seqs/README.rst Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/README.rst Fri Nov 21 08:30:03 2014 -0500
b
@@ -39,11 +39,12 @@
 
     <tool file="sample_seqs/sample_seqs.xml" />
 
-You will also need to install Biopython 1.62 or later. If you want to run
-the unit tests, include this line in ``tools_conf.xml.sample`` and the sample
-FASTA files under the ``test-data`` directory. Then::
+You will also need to install Biopython 1.62 or later.
 
-    ./run_functional_tests.sh -id sample_seqs
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+    ./run_tests.sh -id sample_seqs
 
 That's it.
 
@@ -55,6 +56,9 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version.
+v0.1.1  - Using optparse to provide a proper command line API.
+v0.1.2  - Interleaved mode for working with paired records.
+        - Tool definition now embeds citation information.
 ======= ======================================================================
 
 
@@ -67,7 +71,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::
 
-    $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+    $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/ecoli.pair_sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
 
 Check this worked::
 
@@ -78,10 +82,14 @@
     tools/sample_seqs/tool_dependencies.xml
     test-data/ecoli.fastq
     test-data/ecoli.sample_N100.fastq
+    test-data/ecoli.pair_sample_N100.fastq
     test-data/get_orf_input.Suis_ORF.prot.fasta
     test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta
+    test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
     test-data/MID4_GLZRM4E04_rnd30_frclip.sff
     test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+    test-data/MID4_GLZRM4E04_rnd30_pair_sample.sff
+    test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
 
 
 Licence (MIT)
b
diff -r 219924bd7e3e -r dc55e58fa890 tools/sample_seqs/sample_seqs.py
--- a/tools/sample_seqs/sample_seqs.py Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.py Fri Nov 21 08:30:03 2014 -0500
[
b'@@ -9,39 +9,80 @@\n molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.\n http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.\n \n-This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute\n+This script is copyright 2014 by Peter Cock, The James Hutton Institute\n (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.\n See accompanying text file for licence details (MIT license).\n \n-This is version 0.1.0 of the script, use -v or --version to get the version.\n+Use -v or --version to get the version, -h or --help for help.\n """\n import os\n import sys\n+from optparse import OptionParser\n+\n \n def stop_err(msg, err=1):\n     sys.stderr.write(msg.rstrip() + "\\n")\n     sys.exit(err)\n \n-if "-v" in sys.argv or "--version" in sys.argv:\n-    print("v0.1.0")\n+#Parse Command Line\n+usage = """Use as follows:\n+\n+$ python sample_seqs.py [options]\n+\n+e.g. Sample 20% of the reads:\n+\n+$ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq\n+"""\n+parser = OptionParser(usage=usage)\n+parser.add_option(\'-i\', \'--input\', dest=\'input\',\n+                  default=None, help=\'Input sequences filename\',\n+                  metavar="FILE")\n+parser.add_option(\'-f\', \'--format\', dest=\'format\',\n+                  default=None,\n+                  help=\'Input sequence format (e.g. 
fasta, fastq, sff)\')\n+parser.add_option(\'-o\', \'--output\', dest=\'output\',\n+                  default=None, help=\'Output sampled sequenced filename\',\n+                  metavar="FILE")\n+parser.add_option(\'-p\', \'--percent\', dest=\'percent\',\n+                  default=None,\n+                  help=\'Take this percent of the reads\')\n+parser.add_option(\'-n\', \'--everyn\', dest=\'everyn\',\n+                  default=None,\n+                  help=\'Take every N-th read\')\n+parser.add_option("--interleaved", dest="interleaved",\n+                  default=False, action="store_true",\n+                  help="Input is interleaved reads, preserve the pairings")\n+parser.add_option("-v", "--version", dest="version",\n+                  default=False, action="store_true",\n+                  help="Show version and quit")\n+options, args = parser.parse_args()\n+\n+if options.version:\n+    print("v0.1.2")\n     sys.exit(0)\n \n-#Parse Command Line\n-if len(sys.argv) < 5:\n-    stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")\n-seq_format, in_file, out_file, mode = sys.argv[1:5]\n+seq_format = options.format\n+in_file = options.input\n+out_file = options.output\n+interleaved = options.interleaved\n+\n+if not in_file:\n+    stop_err("Require an input filename")\n if in_file != "/dev/stdin" and not os.path.isfile(in_file):\n     stop_err("Missing input file %r" % in_file)\n+if not out_file:\n+    stop_err("Require an output filename")\n \n-if mode == "everyNth":\n-    if len(sys.argv) != 6:\n-        stop_err("If using everyNth, just need argument N (integer, at least 2)")\n+\n+if options.percent and options.everyn:\n+    stop_err("Cannot combine -p and -n options")\n+elif options.everyn:\n     try:\n-        N = int(sys.argv[5])\n+        N = int(options.everyn)\n     except:\n-        stop_err("Bad N argument %r" % sys.argv[5])\n+        stop_err("Bad N argument %r" % options.everyn)\n     if N < 2:\n-        
stop_err("Bad N argument %r" % sys.argv[5])\n+        stop_err("Bad N argument %r" % options.everyn)\n     if (N % 10) == 1:\n         sys.stderr.write("Sampling every %ist sequence\\n" % N)\n     elif (N % 10) == 2:\n@@ -57,15 +98,13 @@\n             count += 1\n             if count % N == 1:\n                 yield record\n-elif mode == "percentage":\n-    if len(sys.argv) != 6:\n-        stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")\n+elif options.percent:\n     try:\n-        percent = float(sys.argv[5]) / 100.0\n+        percent = float(options.percent) / 100.0\n     except:\n-        stop_err("Bad percent argument %r" % sys.argv[5])\n+        stop_err("Bad percent argument %r" % options.percent)\n    '..b'ndle))):\n+                    count += 1\n+                    pos_handle.write(r1)\n+                    pos_handle.write(r2)\n+            else:\n+                for record in iterator_filter(raw_fasta_iterator(in_handle)):\n+                    count += 1\n+                    pos_handle.write(record)\n     return count\n \n try:\n     from galaxy_utils.sequence.fastq import fastqReader, fastqWriter\n-    def fastq_filter(in_file, out_file, iterator_filter):\n+    def fastq_filter(in_file, out_file, iterator_filter, inter):\n         count = 0\n         #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter\n         reader = fastqReader(open(in_file, "rU"))\n         writer = fastqWriter(open(out_file, "w"))\n-        for record in iterator_filter(reader):\n-            count += 1\n-            writer.write(record)\n+        if inter:\n+            for r1, r2 in iterator_filter(pair(reader)):\n+                count += 1\n+                writer.write(r1)\n+                writer.write(r2)\n+        else:\n+            for record in iterator_filter(reader):\n+                count += 1\n+                writer.write(record)\n         writer.close()\n         reader.close()\n         return count\n 
except ImportError:\n     from Bio.SeqIO.QualityIO import FastqGeneralIterator\n-    def fastq_filter(in_file, out_file, iterator_filter):\n+    def fastq_filter(in_file, out_file, iterator_filter, inter):\n         count = 0\n         with open(in_file) as in_handle:\n             with open(out_file, "w") as pos_handle:\n-                for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):\n-                    count += 1\n-                    pos_handle.write("@%s\\n%s\\n+\\n%s\\n" % (title, seq, qual))\n+                if inter:\n+                    for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):\n+                        count += 1\n+                        pos_handle.write("@%s\\n%s\\n+\\n%s\\n" % r1)\n+                        pos_handle.write("@%s\\n%s\\n+\\n%s\\n" % r2)\n+                else:\n+                    for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):\n+                        count += 1\n+                        pos_handle.write("@%s\\n%s\\n+\\n%s\\n" % (title, seq, qual))\n         return count\n \n-def sff_filter(in_file, out_file, iterator_filter):\n+def sff_filter(in_file, out_file, iterator_filter, inter):\n     count = 0\n     try:\n         from Bio.SeqIO.SffIO import SffIterator, SffWriter\n@@ -167,17 +236,26 @@\n         with open(out_file, "wb") as out_handle:\n             writer = SffWriter(out_handle, xml=manifest)\n             in_handle.seek(0) #start again after getting manifest\n-            count = writer.write_file(iterator_filter(SffIterator(in_handle)))\n-            #count = writer.write_file(SffIterator(in_handle))\n+            if inter:\n+                from itertools import chain\n+                count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))\n+                assert count % 2 == 0, "Odd number of records? 
%i" % count\n+                count /= 2\n+            else:\n+                count = writer.write_file(iterator_filter(SffIterator(in_handle)))\n+                #count = writer.write_file(SffIterator(in_handle))\n     return count\n \n if seq_format.lower()=="sff":\n-    count = sff_filter(in_file, out_file, sampler)\n+    count = sff_filter(in_file, out_file, sampler, interleaved)\n elif seq_format.lower()=="fasta":\n-    count = fasta_filter(in_file, out_file, sampler)\n+    count = fasta_filter(in_file, out_file, sampler, interleaved)\n elif seq_format.lower().startswith("fastq"):\n-    count = fastq_filter(in_file, out_file, sampler)\n+    count = fastq_filter(in_file, out_file, sampler, interleaved)\n else:\n     stop_err("Unsupported file type %r" % seq_format)\n \n-sys.stderr.write("Sampled %i records\\n" % count)\n+if interleaved:\n+    sys.stderr.write("Selected %i pairs\\n" % count)\n+else:\n+    sys.stderr.write("Selected %i records\\n" % count)\n'
b
diff -r 219924bd7e3e -r dc55e58fa890 tools/sample_seqs/sample_seqs.xml
--- a/tools/sample_seqs/sample_seqs.xml Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.xml Fri Nov 21 08:30:03 2014 -0500
b
@@ -1,4 +1,4 @@
-<tool id="sample_seqs" name="Sub-sample sequences files" version="0.0.1">
+<tool id="sample_seqs" name="Sub-sample sequences files" version="0.1.2">
     <description>e.g. to reduce coverage</description>
     <requirements>
         <requirement type="package" version="1.63">biopython</requirement>
@@ -6,13 +6,15 @@
     </requirements>
     <version_command interpreter="python">sample_seqs.py --version</version_command>
     <command interpreter="python">
+sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file"
 #if str($sampling.type) == "everyNth":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
-#elif str($sampling.type) == "percentage":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
-#else:
-##Should give an error about invalid sampling type:
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
+-n "${sampling.every_n}"
+#else
+##elif str($sampling.type) == "percentage":
+-p "${sampling.percent}"
+#end if
+#if $interleaved
+--interleaved
 #end if
     </command>
     <stdio>
@@ -35,6 +37,7 @@
                 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
             </when>
         </conditional>
+        <param name="interleaved" type="boolean" label="Interleaved paired reads" help="Tick to preserve interleaved pairs on output" />
     </inputs>
     <outputs>
         <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/>
@@ -53,6 +56,13 @@
             <output name="output_file" file="ecoli.sample_N100.fastq" />
         </test>
         <test>
+            <param name="input_file" value="ecoli.fastq" />
+            <param name="type" value="everyNth" />
+            <param name="every_n" value="100" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="ecoli.pair_sample_N100.fastq" />
+        </test>
+        <test>
             <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
             <param name="type" value="everyNth" />
             <param name="every_n" value="5" />
@@ -65,6 +75,13 @@
             <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
         </test>
         <test>
+            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
+            <param name="type" value="everyNth" />
+            <param name="every_n" value="100" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" />
+        </test>
+        <test>
             <param name="input_file" value="ecoli.fastq" />
             <param name="type" value="percentage" />
             <param name="percent" value="1.0" />
@@ -76,6 +93,13 @@
             <param name="percent" value="20.0" />
             <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
         </test>
+        <test>
+            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
+            <param name="type" value="everyNth" />
+            <param name="every_n" value="5" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/>
+        </test>
     </tests>
     <help>
 **What it does**
@@ -94,6 +118,10 @@
 every 2nd sequence would sample half the file - while taking every 5th
 sequence would take 20% of the file.
 
+If you tick the interleaved option, the file is processed as pairs of
+records - for example, using 20% would take every 5th pair of records.
+This ensures your read pairs are preserved. Note that this does not
+actually check that your read names match a known pair naming scheme!
 
 **Example Usage**
 
@@ -103,6 +131,10 @@
 Taking every 3rd read would reduce the estimated coverage to about x66,
 and would preserve the pairing as well.
 
+Similarly, if you had some Illumina paired end data interleaved into one
+file with an estimated x200 coverage, you would run this tool in
+interleaved mode, taking every 3rd read pair. This would reduce the
+estimated coverage to about x66, while preserving the read pairing.
 
 **Citation**
 
@@ -116,4 +148,7 @@
 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
     </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
 </tool>
b
diff -r 219924bd7e3e -r dc55e58fa890 tools/sample_seqs/tool_dependencies.xml
--- a/tools/sample_seqs/tool_dependencies.xml Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/tool_dependencies.xml Fri Nov 21 08:30:03 2014 -0500
b
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="biopython" version="1.63">
-        <repository changeset_revision="d8b200f1f5a5" name="package_biopython_1_63" owner="biopython" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="d8b200f1f5a5" name="package_biopython_1_63" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>