changeset 3:dc55e58fa890 draft

Uploaded v0.1.2, embeds citations, interleaved mode
author peterjc
date Fri, 21 Nov 2014 08:30:03 -0500
parents 219924bd7e3e
children 09a4ee5d12fd
files test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff test-data/ecoli.pair_sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml
diffstat 7 files changed, 596 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
Binary file test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecoli.pair_sample_N100.fastq	Fri Nov 21 08:30:03 2014 -0500
@@ -0,0 +1,208 @@
+@frag_1
+AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC
++
+##%')+.024JMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1_a
+GAGACATATTGCCCGTTGCAGTCAGAATGAAAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMJ420.+)'%##
+@frag_200
+TGGTAATGGTGATGGTGGTGGTAATGGTGGTGCTAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_201
+TAGCACCACCATTACCACCACCATCACCATTACCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_400
+TGGCCACCTGCCCCTGCCTGGCATTGCTTTCCAGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_401
+TCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_600
+TTGGGCAAATTCCTGATCGACGAAAGTTTTCAATTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_601
+AATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_800
+ATATCGACGGTAGATTCGAGGTAATGCCCCACTGCC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_801
+GCAGTGGGGCATTACCTCGAATCTACCGTCGATATT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1000
+TATAGACCCCGTCAACGTCCGTCCAAATCTCGCAAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1001
+TTGCGAGATTTGGACGGACGTTGACGGGGTCTATAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1200
+ATCACGGCTGGCACCAATGAGCGTACCTGGTGCTTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1201
+AAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1400
+CAGTCGCTTTGTGGAACGCAGAAACTGATGCTGTAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1401
+TACAGCATCAGTTTCTGCGTTCCACAAAGCGACTGT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1600
+ATCCCTGAGCAATGGCGACAATGTTGATATTGGCGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1601
+CGCCAATATCAACATTGTCGCCATTGCTCAGGGATC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1800
+GACACGTAAGTCGATATGTTTATTCTTCAGCCAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_1801
+GCTGGCTGAAGAATAAACATATCGACTTACGTGTCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2000
+AAGTCGGCATATTGATCCGCCACTGCCTGGCTGGAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2001
+TCCAGCCAGGCAGTGGCGGATCAATATGCCGACTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2200
+CGGAGAACTTCATCAATTCATCACCTGCATTGAGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2201
+GCTCAATGCAGGTGATGAATTGATGAAGTTCTCCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2400
+TTCAATATCCGCCAGCTCCAGTTCACGTCCCGTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2401
+AAACGGGACGTGAACTGGAGCTGGCGGATATTGAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2600
+GGATCATTACCATCCACTTCGGCAATCTTCACGCGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2601
+CGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2800
+ATTGGCACTGGAAGCCGGGGCATAAACTTTAACCAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_2801
+TGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3000
+TGCTTACCCAGTTCCTGGCAAAAACGCTCCCAGCAC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3001
+TGCTGGGAGCGTTTTTGCCAGGAACTGGGTAAGCAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3200
+ACGGTGCCACGTTGTCGTAATGAATGCTGCCGGAGA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3201
+CTCCGGCAGCATTCATTACGACAACGTGGCACCGTG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3400
+GTGAATGAAGCCTGCCAGATGTCGCCCGTGCGCAAT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3401
+TTGCGCACGGGCGACATCTGGCAGGCTTCATTCACG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3600
+ACGCGCTGGGCGGTTTCCGGCTTGTCACACAGAGCG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3601
+GCTCTGTGTGACAAGCCGGAAACCGCCCAGCGCGTT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3800
+GGTCGTGCGGAAAAAACAGCCCCTGATTTTTGCCCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_3801
+GGGCAAAAATCAGGGGCTGTTTTTTCCGCACGACCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4000
+CCCGTGGAACAATTCCAGACAACCGACATCGCTTTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4001
+AAAGCGATGTCGGTTGTCTGGAATTGTTCCACGGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4200
+TTTTCTTGCAGTGGACTGATTTTGCCTCGTGGATAG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4201
+TATCCACGAGGCAAAATCAGTCCACTGCAAGAAAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4400
+GCGGCAGCTGCGCAACAGCTTCAAAGTAGTAGCAAA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4401
+TTGCTACTACTTTGAAGCTGTTGCGCAGCTGCCGCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4600
+CATCGCGTTGGATAACGTCGCCTGAGTCGCTTTGGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4601
+CCAAAGCGACTCAGGCGACGTTATCCAACGCGATGG
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4800
+CCTGGATTCAACTGATCACGCAGCGCACGATAAGCT
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_4801
+GCTTATCGTGCGCTGCGTGATCAGTTGAATCCAGGC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5000
+AGATAATGAATAGATTTTACTGATGATTCATCATCA
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+@frag_5001
+GATGATGAATCATCAGTAAAATCTATTCATTATCTC
++
+MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMK
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta	Fri Nov 21 08:30:03 2014 -0500
@@ -0,0 +1,214 @@
+>Streptococcus_suis|ORF1 length 457 aa, 1374 bp, from 1..1374 of Streptococcus_suis
+MNQEQLFWQRFIELAKVNFKPSIYDFYVADAKLLGINQQVANIFLNRPFKKDFWEKNFEE
+LMIAASFESYGEPLTIQYQFTEDEQEIRNTTNTRSSIVHQVQTLEPATPQETFKPVHSDI
+KSQYTFANFVQGDNNHWAKAAALAVSDNLGELYNPLFIFGGPGLGKTHILNAIGNKVLAD
+NPQARIKYVSSETFINEFLEHLRLNDMESFKKTYRNLDLLLIDDIQSLRNKATTQEEFFH
+TFNALHEKNKQIVLTSDRNPDHLDNLEERLVTRFKWGLTSEITPPDFETRIAILRNKCEN
+LPYNFTNETLSYLAGQFDSNVRDLEGALKDIHLIATMRQLSEISVEVAAEAIRSRKQTNP
+QNMVIPIEKIQTEVGNFYGVSLKELKGSKRVQHIVHARQVAMFLAREMTDNSLPKIGKEF
+GNRDHTTVMHAYNKIKTLLLDDENLEIEITSIKNKLR
+>Streptococcus_suis|ORF2 length 385 aa, 1158 bp, from 1507..2664 of Streptococcus_suis
+IINKGESMIQFSINKNIFLQALSITKRAISTKNAIPILSTVKITVTSEGITLTGSNGQIS
+IEHFISIQDENAGLLISSPGSILLEAGFFINVVSSMPDLVLDFNEIEQKQIVLTSGKSEI
+TLKGKEAEQYPRLQEVPTSKPLVLETKVLKQTINETAFAASTQESRPILTGVHFVLTENK
+NLKTVATDSHRMSQRKLVLDTSGDDFNVVIPSRSLREFTAVFTDDIETVEVFFSNNQILF
+RSEHISFYTRLLEGTYPDTDRLIPTEFKTTAIFDTANLRHSMERARLLSNATQNGTVKLE
+IANNVVSAHVNSPEVGRVNEELDTVEVSGEDLVISFNPTYLIEALKATTSEQVKISFISS
+VRPFTLIPNNEGEDFIQLVTPVRTN
+>Streptococcus_suis|ORF201 length 360 aa, 1083 bp, from complement(128035..129117) of Streptococcus_suis
+SCHGGRRMTLFGKIKEVTELQSLPGFEGQVRNHIRQKITPHVDRIETDGLGGIFGIKDTA
+VENAPRILVVAHMDEVGFMISQIKPDGTFRVVELGGWNPLVVSSQAFTLQLQDGRTIPAI
+SGSVPPHLSRGANAPGMPAIADIIFDAGFANYDEAWAFGVRPGDVLVPKNETILTANGKN
+VISKAWDNRFGVLMVTELLESLSGHALPNQLIAGANVQEEVGLRGAHASTTKFNPDIFLA
+VDCSPAGDIYGDQGKIGDGTLLRFYDPGHIMLKNMKDFLLTTAEEAGVKFQYYCGKGGTD
+AGAAHLKNHGVPSTTIGVCARYIHSHQTLYSMDDFLEAQAFLQTIVKKLDRSTVDLIKNY
+>Streptococcus_suis|ORF202 length 106 aa, 321 bp, from 128792..129112 of Streptococcus_suis
+RVKAWLETTRGFQPPSSTTRKVPSGLIWLIIKPTSSMWATTRIRGAFSTAVSLIPKIPPS
+PSVSMRSTCGVIFWRMWLRTCPSNPGKLCNSVTSLIFPKRVILLPP
+>Streptococcus_suis|ORF401 length 120 aa, 363 bp, from 265643..266005 of Streptococcus_suis
+TTGTTSPIAPKWKASSKSLRVPTSEPTTLIPSSTVFTILRSMYSDGSPTATTYPPARTLS
+IAWLKATLETAVTTVECTPPPVISLIYPGTSSTSSPLIVTSAPTSLASSNLSLLMSTAIT
+>Streptococcus_suis|ORF402 length 201 aa, 606 bp, from 265741..266346 of Streptococcus_suis
+HSLHDTEVHVFRWKSDSYYISTSTNTVNSLVEGYFGNSCYNSRVYTATSNFFNISRNIFY
+FKSVDRHICTNFFGEFQFIIIDVYGDNMSVEDFFSVLYSKVSKSTSTIDSNPLTWFQVSF
+FNRFVASNASTSDRTCLSWIKTFWNFYCIVRCYNTLLSHTTVNRVACIFYGTAESFATGC
+TIFTHTTALEEPSNADTVTNF
+>Streptococcus_suis|ORF601 length 665 aa, 1998 bp, from 409896..411893 of Streptococcus_suis
+VMIQIGKIFAGRYRIVRQIGRGGMADVYLARDLILDGEEVAVKVLRTNYQTDQIAIQRFQ
+REARAMAELDHPNIVRISDIGEEDGQQYLAMEYVNGLDLKRYIKENAPLSNDVAVRIMGQ
+ILLAMRMAHTRGIVHRDLKPQNVLLTSNGVAKVTDFGIAVAFAETSLTQTNSMLGSVHYL
+SPEQARGSKATIQSDIYAMGIILFEMLTGRIPYDGDSAVTIALQHFQKPLPSVREENANV
+PQALENVVLKATAKKLNERYKSVAEMYADLASALSMDRQNEPRVELEGNKVDTKTLPKLS
+QANVETKVPHTNSSAQVSATDKGSGKKEVAKSGNKPVSKPRPGIRTRYKVLIGAILLTVI
+AAGLMFFNTPRTVTVPDVSGQTVEKATEMIEVAGLEVGNITEEATATVDEGLVIRTSPAA
+KTTRRQGSKIDIVVATAALASIPDVVDKESDTARQELEALGFQVTIKEEYSEKVAQGLVI
+KTDPGANSSAEKGAKITLYVSKGVAPQVVPNVVGKSQENATQILQTAGFSIGTITQEYSS
+SVTAGQVISTDPVANTELAKGSIINLVISKGKELIMPDLTSGNYTYSQARSQLQALGVNA
+ESIEKQEDRSYYSTTSDIVIGQYPAAGATIDGTVTLYVSVASTRTSSDSSAGSSTSTSTS
+TGSGQ
+>Streptococcus_suis|ORF602 length 120 aa, 363 bp, from complement(410593..410955) of Streptococcus_suis
+LLSRLCSVYVCLALVLKQAYFPTSPLLSYQTPYPSQKLEQNCLYAVLSFQHWLGRVSARF
+LYQPCSLLVQPWVHSDDPWIEPKLNLHTFLQPTYSARLIFLPLLLVQHFLRPEVRWHSLL
+>Streptococcus_suis|ORF801 length 428 aa, 1287 bp, from 561960..563246 of Streptococcus_suis
+KSSRDCESCLLLFVILKVMQADRRKTFGKMRIRINNLFFVAIAFMGIIISNSQVVLAIGK
+ASVIQYLSYLVLILCIVNDLLKNNKHIVVYKLGYLFLIIFLFTIGICQQILPITTKIYLS
+ISMMIISVLATLPISLIKDIDDFRRISNHLLFALFITSILGIMMGATMFTGAVEGIGFSQ
+GFNGGLTHKNFFGITILMGFVLTYLAYKYGSYKRTDRFILGLELFLILISNTRSVYLILL
+LFLFLVNLDKIKIEQRQWSTLKYISMLFCAIFLYYFFGFLITHSDSYAHRVNGLINFFEY
+YRNDWFHLMFGAADLAYGDLTLDYAIRVRRVLGWNGTLEMPLLSIMLKNGFIGLVGYGIV
+LYKLYRNVRILKTDNIKTIGKSVFIIVVLSATVENYIVNLSFVFMPICFCLLNSISTMES
+TINKQLQT
+>Streptococcus_suis|ORF802 length 333 aa, 1002 bp, from 563382..564383 of Streptococcus_suis
+RMEKVSIIVPIFNTEKYLRECLDSIISQSYTNLEILLIDDGSSDSSTDICLEYAEQDGRI
+KLFRLPNGGVSNARNYGIKNSTANYIMFVDSDDIVDGNIVESLYTCLKENDSDLSGGLLA
+TFDGNYQESELQKCQIDLEEIKEVRDLGNENFPNHYMSGIFNSPCCKLYKNIYINKGFDT
+EQWLGEDLLFNLNYLKNIKKVSYVNRNLYFARRGIQSTTNTFKKDVFIQLENLEEKTFDL
+FVKIFGGQYEFSVFKETLQWHIIYYSLLMFKNGDESLPKKLHIFKYLYNRHSLDTLSIKR
+TSSVFKRICKLIVANNLFKIFLNTLIREEKNND
+>Streptococcus_suis|ORF1001 length 374 aa, 1125 bp, from 694014..695138 of Streptococcus_suis
+HYLLFQGGILMKVFASPSRYIQGKHVLFQGAEAIGKLGTKPLILCDDLVYGIIGEKFLSY
+LVEEGMQVHRVAFNGEASDKEIQRVVEIGKEQASDVVIGLGGGKTIDSAKAIADLLGVPV
+VIAPTIASTDAPTSALSVIYSEEGAFERYIFYKKNPDLVLVDTAIICQAPPRLLASGIAD
+GLATWVEARAILQSNGTTMAGGGQTLAGIAIAQTCEQTLFEYGLQAMASCEAKVVTAALE
+NIVEANTLLSGLGFESAGLAAAHAIHNGFTALEGDIHHLTHGEKVAYGTLTQLFLENRPK
+EELEKYIRFYQALNLPTTLEELHLADASYEELLKVGQQATIEGETIHGMPFAISAEDVAE
+ALMAVDYYVRSLDK
+>Streptococcus_suis|ORF1002 length 366 aa, 1101 bp, from 695283..696383 of Streptococcus_suis
+RIDLKEISMAYVVAVVGATGAVGAQMIKMLEESTLPIEKVRFLASARSAGKTLQFKGQDI
+VIEETTETAFEGVDIALFSAGGSTSAKYAPYAVKAGAVVVDNTSYFRQNPDVPLVVPEVN
+AHALDAHNGIIACPNCSTIQMMVALEPVRQKWGLERIIVSTYQAVSGAGMGAILETQAQL
+RSVLNDGVEPKAVEANILPSGGDKKHYPIGFNAIPQIDLFTENDYTYEEMKMTKETKKIM
+EDDSIAVSATCVRIPVLSAHSESVYIETKEIAPIDEVKAAIASFPGAVLEDDVANQIYPQ
+AINAVGSRDTFVGRIRKDLDKENGIHMWVVSDNLLKGAAWNSVQIAETLHERGLVRPTAE
+LKFELK
+>Streptococcus_suis|ORF1201 length 144 aa, 435 bp, from 842957..843391 of Streptococcus_suis
+FQTIKEKSRLMNIKKLILTLLTLTLTIVPCACGNQSNSNDSQLSGTYSYEKGGIDGSEMG
+FEDEELTLHYELKVSGDENILNINLLSERGNNVKYLYSEKVTIDTDKQIISDSNGTELEY
+SVSGDSVTIPDLAGDSGETVTLKR
+>Streptococcus_suis|ORF1202 length 343 aa, 1032 bp, from 843537..844568 of Streptococcus_suis
+VKVMYIFETTEQNNSKANDFETKSLLYLMSFKSDSTDIDTFFVDCFNDITGASSDLLKLW
+DVQAKNISSLRPKTIGKSLITLFQNFISSVDFYEYILFIPKLKENYLMDISLTEFKIDNF
+KDIAKIQEGLEEEYKRRKKLGALNLKQLSQLNTFLEQIHFVTGDSSKAIYIKNIIQFKSN
+IRDDNFFESVFNEVRSKQTELKNINIHNISINSIEEVLKLNKHLTKRQLETLVVNRIIGV
+ELFKQRIPNDFFDVINDKSSSDRKDIIQDCNANLSRLLFDKNSNKKKFWSLLEQILILVE
+EKDDIYQILNRIKQYQIPKIINDDYTLLYLISMVKEGMEENAC
+>Streptococcus_suis|ORF1401 length 409 aa, 1230 bp, from 991071..992300 of Streptococcus_suis
+GDNMKYPTLLDRFLVYVKENTRSDENSTTTPSTQNQVEFAQNILLPEMERIGLQNVHYLP
+NGFAVGTLPANDPSLTRKIGFIAHMDTADFNAEGVNPQIIENYDGNPIALGTSGYELHPK
+DFPQLANYHGQILITTDGTTLLGSDDKSGIAEIMTAIEFLIQNPDIKHCEIRVGFGPDEE
+IGVGADKFDVKDFDVDFAYTMDGGPLGELQYETFSAAGAKIDFLGRNVHPGSAKDQMINA
+FQMAIDFHNALPETDRPEKTEGYEGFFHLMNMEGSVDTASTTYIIRDFEEEDFQARKQLM
+LDIAEKMNANFDTPRVIVNLHDQYYNMKKIIEKDMTPINIAKDVMENLGIKPLIEPVRGG
+TDGSKISFMGIPTPNIFAGGENMHGRFEFVSLETMEKAVDVILGIVAYK
+>Streptococcus_suis|ORF1402 length 144 aa, 435 bp, from 992392..992826 of Streptococcus_suis
+YNRTIKKKWSFIMTEETLAQGILIGIWGTTLLFSFIWYILVAISNYILFKKAGYAGWKSL
+IPIYNLYIQQCITFGYEKRWFILFLLIPLAGPLYGIYLVYNFGRSFGLSAVQAIFYVLLT
+PIFNLYIAFNDGSRYQGPQEFFID
+>Streptococcus_suis|ORF1601 length 141 aa, 426 bp, from complement(1127307..1127732) of Streptococcus_suis
+VHPLHGRSLLIYFDCFAYEGGGIMTIQALAMFLASLGFLYFIFRNINKNKILFEHAFMWI
+VIGFGLIVFALFDVIPIKLAYLFGFGLTSNFLLSVAIFVLLVIGFLHSMALSQQKQQIKN
+LIQEVSMAKKRISELEEHHAE
+>Streptococcus_suis|ORF1602 length 241 aa, 726 bp, from complement(1127663..1128388) of Streptococcus_suis
+REKMKVLMIIPAYNEEESILQTVQGIIDYKNSVNFQLDYVVINDGSTDSTKEILIQNKLN
+AVHLVQNLGIGGAVQTGYKYALDNDYDVAVQFDGDGQHDIRSLNGLIQPILVGQADMVIG
+SRFVGDTLSEFQTSFMRRFGIGVISNMIKLTTGNRIWDTTSGYRLGNRKVIAQFAKRYPI
+KYPEPESTVHLLKQNFQVVEAPANMFERAGGVSSITPFKSIRYMVEVCSSILIASLMKEG
+E
+>Streptococcus_suis|ORF1801 length 128 aa, 387 bp, from 1263312..1263698 of Streptococcus_suis
+RLHDSCSICFLFIHGNIAGNRPCKEIGILQNNPHVTAQAFTRIITDVFPINQYTSLLWII
+ETIEEIHNRRLTRPSMPNQSNCFSFFCSNGNIFQNWSVFFIAKVHVFKHDLPLFNFQNTI
+TVVLQLFF
+>Streptococcus_suis|ORF1802 length 578 aa, 1737 bp, from complement(1264661..1266397) of Streptococcus_suis
+RRYMFRLIFDYIKRHKWLYLLVAVTLIIYDATLLLPTQIIQRMVDILTKNELTQAILVQE
+MTLLLLVTVLNYATAFIWHLKLFQASVNFKFDMQQRAFKKLVTMRTPFYEKFRSGDVMTR
+FSTDVDGLMEMVGYGLMIVVYAGGMLAFIIPTMFFIDWKISLVALLPMLFMTLCIFFIGR
+KQDKAIDANREAVAQLNNEVLEVIEGIRVTRAYSKKANQKAQFQARTKQLAQGGDRITSL
+QSLYNPLATVCLGLSTIFVLLMGAQAVKAGQLTLGQVIALQLYVGSLLEPFWTLADFILV
+YQTGKTSFEKLQELIETGDDLEADGSKEIAELSSISFKNYSFSYPQAERASLQDINWTLK
+AGQTVGIVGKTGSGKTTLVRQFLRQYPIGQGNFFINHQSILDFKRSSIEEKIGYVPQEHI
+LFSRSVGENIALGKVASSSEEIEQAIATAAFSQDLKRMSDGLDTMIGERGVSISGGQKQR
+ISIARAFLREPDLLILDDSLSAVDARTERQIIQNIQKERAGKTNVIVTHRLSAVNHADWV
+LVLDEGRIVEEGRPADLLAQRGWYYEQYQRQQSQEGGE
+>Streptococcus_suis|ORF2001 length 415 aa, 1248 bp, from complement(1398025..1399272) of Streptococcus_suis
+EDFIMKMKTFLKCASVCAFASFLVACGNASSSDKVEIEYFSQKPEMQATLQEIIDDFEKE
+NPTIDVKFSNVPDAGTVLKTRMANNEAPDVINIYPQNADFKAYAADGRFLEIGDDAGLNH
+LKDGAVTPYLVNEKNYTLPLTANAYGIYYNKDKFKELGLEVPTTYAEFVALVDKIKADGS
+AAPFALSLNDAWSLNGYHQLAWVTVAGGFDGAEDILIRSAKGAIQDDATTKAVLERLQLL
+KDNGQKGATGALYADAVAAFAAGDALMLPQGTWAATAVNQQEPEFEYGMFTFPGDKEGGD
+YTIGAADLALSISADTEHPEESKKFLEYLSRPEVIQKYYDVDGSPTSVEGVDTEGKFEET
+AGVTQYAFTDKHVVWLQSEWESEEEFWNITVEMVKNPNSAELVKKLNAFFDPMKK
+>Streptococcus_suis|ORF2002 length 732 aa, 2199 bp, from complement(1399273..1401471) of Streptococcus_suis
+DHKEEIGEMNVIEIYNEKQIFHLKTREFSYIIQVLETGDLVHRYFGKKIEKFSDGNKITY
+LDRSFSPSPITGDRTYSLDVLPLEYSSNGLGDFRTSALDVRNEFGVTLDLKYKEYRLYKG
+KKELRGLPASFGNQEEVESLEIDLYDQLTDITVTLQYSVFEEASYLARSATIQTGKYPCK
+LEKVLSATLDFPHQDFIVHSLAGRYAYEKEWTQTPLTKGQYSIGSIRGASSHSRTPFLAL
+VSPDASEDKGDVYAAHLVYSGNFTAFVETTAMETSRLGLGLESHYFSWQLDKDDRFQTPE
+VLLSYTDKGFTGMTQNSHHFITKHLIRSSFVNKPRPILINNWEATYFEFTEEKILQLAQV
+ASRAGIELFVLDDGWFGKRNNDESSLGDWKVNLDKLPNGLNGLAERINELGMKFGLWFEP
+EMISIDSDLYREHPDWAIRTEGRLPIYSREQLVLDLTKQEVCDYIIDSVSSILESANISY
+VKWDMNRNITNIPEGLANDQRFEFHHRYMLGLYRVLDHLTKRFPDILFESCAGGGGRNDL
+GIMYYMPQAWASDDTDAIERLSIQEGTSLIYPPSSIGAHVSAVPNHQVGRITPLATRGNV
+AMMGGAFGYELDLTKLSEKELDEISQQIETYHSIRETIQFGQLYRLKKTSNTWAANYVSQ
+DKNQVVFTFVKILAKPEAPLLHVRLKGLDPDALYECPQLGETFYGDELMNIGLTMPHVQK
+DYFSVQYIFNKI
+>Streptococcus_suis|ORF2201 length 272 aa, 819 bp, from complement(1531599..1532417) of Streptococcus_suis
+DCSKIKIIDLAVGKLKLLSSKRKGAFMEIIRSKANHLVKQVKKLQQKKYRTSSYLIEGWH
+LLEEAMEAGANIEHIFVVEEYFEKVAGLANVTVVSPEIMQELADSKTPQGVVAQLALPSQ
+RLPETLDGKFLVLEDVQDPGNVGTMIRTADAAGFDGVFLSDKSADIYNMKVLRSMQGSHF
+HLPVYRMPISSILTALKSNQIQILATTLSSQSVDYKEITPHSSFALVMGNEGQGISDLVA
+DEADQLVHITMPGQAESLNVAIAAGILLFSFI
+>Streptococcus_suis|ORF2202 length 101 aa, 306 bp, from 1532445..1532750 of Streptococcus_suis
+MSCQKEKLMRKVKMIASGRVQGVGFRWSVQFLAVEIGDIYGRVWNNDDGTVTILAQSDNA
+EKLSHFIHEIRKGPSRMAKVIYLDVTLANFEDYKDFQVSYR
+>Streptococcus_suis|ORF2401 length 141 aa, 426 bp, from 1658030..1658455 of Streptococcus_suis
+ASITVPIARTVGSAFSSWISATKRTVSNNSSMFWLNLAEISTNSDSPPQAVEITPCSANS
+PMTRSGFAPGLSILLIATMIGTLAAFEWLIASIVCGMTPSSAATTRMVKSVTDAPRARIE
+VKAACPGVSKKVIFLPASSIW
+>Streptococcus_suis|ORF2402 length 266 aa, 801 bp, from 1658515..1659315 of Streptococcus_suis
+GVQQGCFTMVNVSHDSHNRWAFCHLFFIEVALFYEETLNICVIDLYLFFRFNTIINHEEF
+DSISIQRLVLSRHNSHKEEFFHNFSRFTFDSFCNFCDGHASSIFKFSWQFVELAFCDRFG
+RLVSLAFFIFLVVIPVTCSLISHLILTISISLLFPWTIFFVTIKVTFFIWSSLFLTTGIY
+SSFCNLLWYRCNKCRFHKWFAFHNRFFKLNFFWLLRLLFSFLSLTKTFFTGTSILRILFC
+FQSSSTRFEVNFRSCWFCSLSLFKAS
+>Streptococcus_suis|ORF2601 length 100 aa, 303 bp, from 1790150..1790452 of Streptococcus_suis
+LKDGYQRLVVEGFADIAETFLQTETNLMTTVIFIARHDDDRPIAFPLGSLNQVNMTLVHG
+SKGPKNNCYCLFHNLPFYCFLYFISYSFLKPKSRVFYIFL
+>Streptococcus_suis|ORF2602 length 823 aa, 2472 bp, from complement(1790482..1792953) of Streptococcus_suis
+ERGVVRMKISRGLQGVYEDAQLIAQRYSSDYLETWHLLLAFVINPDTVAGAILAEYPADV
+LDYERAVYMVMGRRYHEELESFFFLPSSKRVKELQVFAEKIAEIVKSKGLGTEHIFMGML
+LDKRSTASQILDQVGFHFEDSDDKVRFLDLRKNLEAKAGFTKEHLKAIRTMTKGGKPKQA
+TVGNMMGMTQSQSGGLEDYTRDLTALARSGQLEPVIGRDEEISRMLQILSRKTKNNPVLV
+GDAGVGKTALALGLAQRIANGEVPASLVNMRILELDLMNVIAGTRFRGDFEERMNNIIND
+IEEDGRVILFIDELHTIMGSGSGIDSILDAANILKPALSRGTLRTVGATTQDEYQKHIEK
+DAALVRRFAKVTIEEPSVADSVAILQGLKPAYEAHHKVTISDQAVVTAVAYAKRYLTSKN
+LPDSAIDLLDEASATVQNRAKGQVEEGGLTALDQALMAGKYKTVTQLLLKAQEAENQATS
+YSLEVTEEDILATLSRLSGIPVTKLSQTDAKKYLNLEQELHKRVIGQEEAISAVSRAIRR
+NQSGIRTGHRPIGSFMFLGPTGVGKTELAKALAEILFDDESALIRFDMSEYMEKFAASRL
+NGAPPGYVGYEEGGELTEKVRNKPYSVLLFDEVEKAHPDIFNVLLQVLDDGVLTDRKGRK
+VDFSNTVIIMTSNLGATALRDDKTVGFGALDLSKSQEHVEKRIFEALKKAYRPEFINRID
+EKVVFHSLTEADMQDVVKVMVKPLIAVAASKGITLKLQASALKLLAKEGYDPEMGARPLR
+RLLQTKLEDPLAEMLLRGELPAGVTLKVGVKAEQLKFDSVKAG
+>Streptococcus_suis|ORF2801 length 1006 aa, 3021 bp, from complement(1921434..1924454) of Streptococcus_suis
+TQTKEYEMIEFRKKAVQLASLMSVFFLCTYSFTDAMYIMAESLSTDGASTIRRTYIEDKK
+EDKDRLNIELVESLSSPKTIGQKITIDKQSLATQNFNEKGIVVITQKGLELKKDDLEKGW
+KLDESYNEKDLAITKSETEKRSLSNELDVLSKTVEELPVYGENYHSYRLLPTTELDYSAD
+NVSLTLSFTKVSEVIKGELVAVVDAEHIAYFKAEPSVFKEYSQVNEKPSSTEDVNVVSPS
+QDPPVSETKENVPDNPESQGSSTVPESEQAVDALVEQRGVICIKLTKSSSEQEEGIEDTE
+NEAIEGATFEVRNVESENLVYTGQTDKDGLLTISNLPLGNYAVIQKSTIDGYEISATKEV
+VELTVAQSRQTVSISNSPKNPLEGLMLNSILDSSLIPRSARVARSLLDTSLLDNPTVTGN
+ANATTTTTVFGNKTTTITREESNIKYIFKPITISIPGVYQSYSQDGVLKKKEVVVDSNTN
+TTKIIWEYTTTVGGVNSNITSIRNAFSTTTDSGLGEPKITSIMKDGVAITPNTTYYGNFD
+NFKSATDNLPVGNGTYVYTIETPVVIPSDNYSLDYRSEVTVDAPKGSKLTYNGTSVTLTQ
+KETRTLSTADTITLPAKNDGGPLGDLKVDTVNTSNTNRTIGKYRDNDDKVIEWTSSQLND
+TSTTQSFTFDVALDSSQAAHEYKVYIYEPSNGTYTETKAEKVATPGNQITVDNVPAGAVA
+LVKTVTNVKDEKVNHTISGAQLEALKGDIKIQKNWEADSDKVDVTFTVNGGSLTNRKETL
+SANNTQITIANVDKFSGMRSTATKKRIYYDVTEAVPSGYILSSAQTDWENLYYVFTNKKD
+NTTTPVFPPDTCGNYGVSSIDLVSINYVMYKSGSKIWGGFDGSMKMNLKIPAFARAGDSF
+TLELPPELKLSHVANPNVAWSTVSANGKVIAKVYHEKDNLIRFVLTTEAYSVQEYNGWFE
+IGVPTSNVIKINNRETTELYKTGVLPNLPEWYTTTTRNQTLIKRSR
+>Streptococcus_suis|ORF2802 length 252 aa, 759 bp, from complement(1925855..1926613) of Streptococcus_suis
+LEARMQQYFVNGRAPQGMFQISDKDTAKHMFSVMRLQAGDQIVLVFDDGIKRLARVVDSQ
+SQSVEIIEELTDNVELPISVTIAMGFPKGDKLEFVAQKATELGMSALWAFPADWSVVKWD
+GKKLAKKAEKLEKIAQGAAEQSKRNRIPAVRLFEKKSDFLAQLAGFDQIILAYEEAAKEG
+EQANLVKILSGLEIGQSVLVIVGPEGGVSPEEVAAFEGAGAVKTGLGPRILRAETAPLYA
+LSTISYATELLR
--- a/tools/sample_seqs/README.rst	Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/README.rst	Fri Nov 21 08:30:03 2014 -0500
@@ -39,11 +39,12 @@
 
     <tool file="sample_seqs/sample_seqs.xml" />
 
-You will also need to install Biopython 1.62 or later. If you want to run
-the unit tests, include this line in ``tools_conf.xml.sample`` and the sample
-FASTA files under the ``test-data`` directory. Then::
+You will also need to install Biopython 1.62 or later.
 
-    ./run_functional_tests.sh -id sample_seqs
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
+
+    ./run_tests.sh -id sample_seqs
 
 That's it.
 
@@ -55,6 +56,9 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version.
+v0.1.1  - Using optparse to provide a proper command line API.
+v0.1.2  - Interleaved mode for working with paired records.
+        - Tool definition now embeds citation information.
 ======= ======================================================================
 
 
@@ -67,7 +71,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::
 
-    $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+    $ tar -czf sample_seqs.tar.gz tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml test-data/ecoli.fastq test-data/ecoli.sample_N100.fastq test-data/ecoli.pair_sample_N100.fastq test-data/get_orf_input.Suis_ORF.prot.fasta test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
 
 Check this worked::
 
@@ -78,10 +82,14 @@
     tools/sample_seqs/tool_dependencies.xml
     test-data/ecoli.fastq
     test-data/ecoli.sample_N100.fastq
+    test-data/ecoli.pair_sample_N100.fastq
     test-data/get_orf_input.Suis_ORF.prot.fasta
     test-data/get_orf_input.Suis_ORF.prot.sample_N100.fasta
+    test-data/get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta
     test-data/MID4_GLZRM4E04_rnd30_frclip.sff
     test-data/MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff
+    test-data/MID4_GLZRM4E04_rnd30_pair_sample.sff
+    test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
 
 
 Licence (MIT)
--- a/tools/sample_seqs/sample_seqs.py	Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.py	Fri Nov 21 08:30:03 2014 -0500
@@ -9,39 +9,80 @@
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 
-This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
+This script is copyright 2014 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 
-This is version 0.1.0 of the script, use -v or --version to get the version.
+Use -v or --version to get the version, -h or --help for help.
 """
 import os
 import sys
+from optparse import OptionParser
+
 
 def stop_err(msg, err=1):
     sys.stderr.write(msg.rstrip() + "\n")
     sys.exit(err)
 
-if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.1.0")
+#Parse Command Line
+usage = """Use as follows:
+
+$ python sample_seqs.py [options]
+
+e.g. Sample 20% of the reads:
+
+$ python sample_seqs.py -i my_seq.fastq -f fastq -p 20.0 -o sample.fastq
+"""
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+                  default=None, help='Input sequences filename',
+                  metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+                  default=None,
+                  help='Input sequence format (e.g. fasta, fastq, sff)')
+parser.add_option('-o', '--output', dest='output',
+                  default=None, help='Output sampled sequenced filename',
+                  metavar="FILE")
+parser.add_option('-p', '--percent', dest='percent',
+                  default=None,
+                  help='Take this percent of the reads')
+parser.add_option('-n', '--everyn', dest='everyn',
+                  default=None,
+                  help='Take every N-th read')
+parser.add_option("--interleaved", dest="interleaved",
+                  default=False, action="store_true",
+                  help="Input is interleaved reads, preserve the pairings")
+parser.add_option("-v", "--version", dest="version",
+                  default=False, action="store_true",
+                  help="Show version and quit")
+options, args = parser.parse_args()
+
+if options.version:
+    print("v0.1.2")
     sys.exit(0)
 
-#Parse Command Line
-if len(sys.argv) < 5:
-    stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
-seq_format, in_file, out_file, mode = sys.argv[1:5]
+seq_format = options.format
+in_file = options.input
+out_file = options.output
+interleaved = options.interleaved
+
+if not in_file:
+    stop_err("Require an input filename")
 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
     stop_err("Missing input file %r" % in_file)
+if not out_file:
+    stop_err("Require an output filename")
 
-if mode == "everyNth":
-    if len(sys.argv) != 6:
-        stop_err("If using everyNth, just need argument N (integer, at least 2)")
+
+if options.percent and options.everyn:
+    stop_err("Cannot combine -p and -n options")
+elif options.everyn:
     try:
-        N = int(sys.argv[5])
+        N = int(options.everyn)
     except:
-        stop_err("Bad N argument %r" % sys.argv[5])
+        stop_err("Bad N argument %r" % options.everyn)
     if N < 2:
-        stop_err("Bad N argument %r" % sys.argv[5])
+        stop_err("Bad N argument %r" % options.everyn)
     if (N % 10) == 1:
         sys.stderr.write("Sampling every %ist sequence\n" % N)
     elif (N % 10) == 2:
@@ -57,15 +98,13 @@
             count += 1
             if count % N == 1:
                 yield record
-elif mode == "percentage":
-    if len(sys.argv) != 6:
-        stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
+elif options.percent:
     try:
-        percent = float(sys.argv[5]) / 100.0
+        percent = float(options.percent) / 100.0
     except:
-        stop_err("Bad percent argument %r" % sys.argv[5])
+        stop_err("Bad percent argument %r" % options.percent)
     if percent <= 0.0 or 1.0 <= percent:
-        stop_err("Bad percent argument %r" % sys.argv[5])
+        stop_err("Bad percent argument %r" % options.percent)
     sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
     def sampler(iterator):
         global percent
@@ -77,7 +116,19 @@
                 taken += 1
                 yield record
 else:
-    stop_err("Unsupported mode %r" % mode)
+    stop_err("Must use either -n or -p")
+
+
+def pair(iterator):
+    """Batch an iterator into (a, b) tuples; abort on an odd record count."""
+    iterator = iter(iterator)  # one shared cursor for both reads below
+    for a in iterator:
+        try:
+            b = next(iterator)
+        except StopIteration:  # must not escape a generator (PEP 479)
+            stop_err("Odd number of records?")
+        yield (a, b)
+
 
 def raw_fasta_iterator(handle):
     """Yields raw FASTA records as multi-line strings."""
@@ -113,41 +164,59 @@
         if not line:
             return # StopIteration 
 
-def fasta_filter(in_file, out_file, iterator_filter):
+def fasta_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     #Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
         with open(out_file, "w") as pos_handle:
-            for record in iterator_filter(raw_fasta_iterator(in_handle)):
-                count += 1
-                pos_handle.write(record)
+            if inter:
+                for r1, r2 in iterator_filter(pair(raw_fasta_iterator(in_handle))):
+                    count += 1
+                    pos_handle.write(r1)
+                    pos_handle.write(r2)
+            else:
+                for record in iterator_filter(raw_fasta_iterator(in_handle)):
+                    count += 1
+                    pos_handle.write(record)
     return count
 
 try:
     from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-    def fastq_filter(in_file, out_file, iterator_filter):
+    def fastq_filter(in_file, out_file, iterator_filter, inter):
         count = 0
         #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
         reader = fastqReader(open(in_file, "rU"))
         writer = fastqWriter(open(out_file, "w"))
-        for record in iterator_filter(reader):
-            count += 1
-            writer.write(record)
+        if inter:
+            for r1, r2 in iterator_filter(pair(reader)):
+                count += 1
+                writer.write(r1)
+                writer.write(r2)
+        else:
+            for record in iterator_filter(reader):
+                count += 1
+                writer.write(record)
         writer.close()
         reader.close()
         return count
 except ImportError:
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
-    def fastq_filter(in_file, out_file, iterator_filter):
+    def fastq_filter(in_file, out_file, iterator_filter, inter):
         count = 0
         with open(in_file) as in_handle:
             with open(out_file, "w") as pos_handle:
-                for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
-                    count += 1
-                    pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
+                if inter:
+                    for r1, r2 in iterator_filter(pair(FastqGeneralIterator(in_handle))):
+                        count += 1
+                        pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
+                        pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
+                else:
+                    for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
+                        count += 1
+                        pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
         return count
 
-def sff_filter(in_file, out_file, iterator_filter):
+def sff_filter(in_file, out_file, iterator_filter, inter):
     count = 0
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
@@ -167,17 +236,26 @@
         with open(out_file, "wb") as out_handle:
             writer = SffWriter(out_handle, xml=manifest)
             in_handle.seek(0) #start again after getting manifest
-            count = writer.write_file(iterator_filter(SffIterator(in_handle)))
-            #count = writer.write_file(SffIterator(in_handle))
+            if inter:
+                from itertools import chain
+                count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
+                assert count % 2 == 0, "Odd number of records? %i" % count
+                count /= 2
+            else:
+                count = writer.write_file(iterator_filter(SffIterator(in_handle)))
+                #count = writer.write_file(SffIterator(in_handle))
     return count
 
 if seq_format.lower()=="sff":
-    count = sff_filter(in_file, out_file, sampler)
+    count = sff_filter(in_file, out_file, sampler, interleaved)
 elif seq_format.lower()=="fasta":
-    count = fasta_filter(in_file, out_file, sampler)
+    count = fasta_filter(in_file, out_file, sampler, interleaved)
 elif seq_format.lower().startswith("fastq"):
-    count = fastq_filter(in_file, out_file, sampler)
+    count = fastq_filter(in_file, out_file, sampler, interleaved)
 else:
     stop_err("Unsupported file type %r" % seq_format)
 
-sys.stderr.write("Sampled %i records\n" % count)
+if interleaved:
+    sys.stderr.write("Selected %i pairs\n" % count)
+else:
+    sys.stderr.write("Selected %i records\n" % count)
--- a/tools/sample_seqs/sample_seqs.xml	Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/sample_seqs.xml	Fri Nov 21 08:30:03 2014 -0500
@@ -1,4 +1,4 @@
-<tool id="sample_seqs" name="Sub-sample sequences files" version="0.0.1">
+<tool id="sample_seqs" name="Sub-sample sequences files" version="0.1.2">
     <description>e.g. to reduce coverage</description>
     <requirements>
         <requirement type="package" version="1.63">biopython</requirement>
@@ -6,13 +6,15 @@
     </requirements>
     <version_command interpreter="python">sample_seqs.py --version</version_command>
     <command interpreter="python">
+sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file"
 #if str($sampling.type) == "everyNth":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
-#elif str($sampling.type) == "percentage":
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
-#else:
-##Should give an error about invalid sampling type:
-sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
+-n "${sampling.every_n}"
+#else
+##elif str($sampling.type) == "percentage":
+-p "${sampling.percent}"
+#end if
+#if $interleaved
+--interleaved
 #end if
     </command>
     <stdio>
@@ -35,6 +37,7 @@
                 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
             </when>
         </conditional>
+        <param name="interleaved" type="boolean" label="Interleaved paired reads" help="Tick to preserve interleaved pairs on output" />
     </inputs>
     <outputs>
         <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/>
@@ -53,6 +56,13 @@
             <output name="output_file" file="ecoli.sample_N100.fastq" />
         </test>
         <test>
+            <param name="input_file" value="ecoli.fastq" />
+            <param name="type" value="everyNth" />
+            <param name="every_n" value="100" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="ecoli.pair_sample_N100.fastq" />
+        </test>
+        <test>
             <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
             <param name="type" value="everyNth" />
             <param name="every_n" value="5" />
@@ -65,6 +75,13 @@
             <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
         </test>
         <test>
+            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
+            <param name="type" value="everyNth" />
+            <param name="every_n" value="100" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" />
+        </test>
+        <test>
             <param name="input_file" value="ecoli.fastq" />
             <param name="type" value="percentage" />
             <param name="percent" value="1.0" />
@@ -76,6 +93,13 @@
             <param name="percent" value="20.0" />
             <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
         </test>
+        <test>
+            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
+            <param name="type" value="everyNth" />
+            <param name="percent" value="5" />
+            <param name="interleaved" value="true" />
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/>
+        </test>
     </tests>
     <help>
 **What it does**
@@ -94,6 +118,10 @@
 every 2nd sequence would sample half the file - while taking every 5th
 sequence would take 20% of the file.
 
+If you tick the interleaved option, the file is processed as pairs of
+records - for example, using 20% would take every 5th pair of
+records. This ensures your read pairs are preserved. Note this does not
+actually check your read names match a known pair naming scheme!
 
 **Example Usage**
 
@@ -103,6 +131,10 @@
 Taking every 3rd read would reduce the estimated coverage to about x66,
 and would preserve the pairing as well.
 
+Similarly, if you had some Illumina paired end data interleaved into one
+file with an estimated x200 coverage, you would run this tool in
+interleaved mode, taking every 3rd read pair. This would reduce the
+estimated coverage to about x66, while preserving the read pairing.
 
 **Citation**
 
@@ -116,4 +148,7 @@
 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
     </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
 </tool>
--- a/tools/sample_seqs/tool_dependencies.xml	Thu Mar 27 12:14:06 2014 -0400
+++ b/tools/sample_seqs/tool_dependencies.xml	Fri Nov 21 08:30:03 2014 -0500
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="biopython" version="1.63">
-        <repository changeset_revision="d8b200f1f5a5" name="package_biopython_1_63" owner="biopython" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="d8b200f1f5a5" name="package_biopython_1_63" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>