Repository 'ncbi_blast_plus'
hg clone https://testtoolshed.g2.bx.psu.edu/repos/peterjc/ncbi_blast_plus

Changeset 33:5402f9b0d508 (2013-11-28)
Previous changeset 32:b2795652d2b4 (2013-11-25) Next changeset 34:3952ec621ea9 (2013-11-28)
Commit message:
Uploaded v0.0.22b, using BLAST+ 2.2.28 now
modified:
test-data/blastp_four_human_vs_rhodopsin.xml
test-data/blastx_rhodopsin_vs_four_human.xml
test-data/tblastn_four_human_vs_rhodopsin.html
test-data/tblastn_four_human_vs_rhodopsin.xml
tools/ncbi_blast_plus/README.rst
tools/ncbi_blast_plus/ncbi_macros.xml
tools/ncbi_blast_plus/ncbi_makeblastdb.xml
added:
tools/ncbi_blast_plus/check_no_duplicates.py
b
diff -r b2795652d2b4 -r 5402f9b0d508 test-data/blastp_four_human_vs_rhodopsin.xml
--- a/test-data/blastp_four_human_vs_rhodopsin.xml Mon Nov 25 10:58:46 2013 -0500
+++ b/test-data/blastp_four_human_vs_rhodopsin.xml Thu Nov 28 08:50:15 2013 -0500
[
b'@@ -2,7 +2,7 @@\n <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n <BlastOutput>\n   <BlastOutput_program>blastp</BlastOutput_program>\n-  <BlastOutput_version>BLASTP 2.2.27+</BlastOutput_version>\n+  <BlastOutput_version>BLASTP 2.2.28+</BlastOutput_version>\n   <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n   <BlastOutput_db></BlastOutput_db>\n   <BlastOutput_query-ID>sp|Q9BS26|ERP44_HUMAN</BlastOutput_query-ID>\n@@ -17,630 +17,649 @@\n       <Parameters_filter>F</Parameters_filter>\n     </Parameters>\n   </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      
<Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119568</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>4</Iteration_iter-num>\n-      <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>\n-      <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 
SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteratio'..b'q>\n+      <Hsp_hseq>MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGID-YTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA</Hsp_hseq>\n+      <Hsp_midline>MNGTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGID YT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>29</Statistics_hsp-len>\n+      <Statistics_eff-space>101761</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+      <Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>24</Iteration_iter-num>\n+  <Iteration_query-ID>sp|P08100|OPSD_HUMAN</Iteration_query-ID>\n+  <Iteration_query-def>Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Iteration_query-def>\n+  <Iteration_query-len>348</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+  <Hit_num>1</Hit_num>\n+  <Hit_id>gi|12583665|dbj|BAB21486.1|</Hit_id>\n+  <Hit_def>fresh water form rod opsin [Conger myriaster]</Hit_def>\n+  <Hit_accession>BAB21486</Hit_accession>\n+  <Hit_len>354</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      
<Hsp_bit-score>599.356</Hsp_bit-score>\n+      <Hsp_score>1544</Hsp_score>\n+      <Hsp_evalue>0</Hsp_evalue>\n+      <Hsp_query-from>1</Hsp_query-from>\n+      <Hsp_query-to>341</Hsp_query-to>\n+      <Hsp_hit-from>1</Hsp_hit-from>\n+      <Hsp_hit-to>342</Hsp_hit-to>\n+      <Hsp_query-frame>0</Hsp_query-frame>\n+      <Hsp_hit-frame>0</Hsp_hit-frame>\n+      <Hsp_identity>281</Hsp_identity>\n+      <Hsp_positive>314</Hsp_positive>\n+      <Hsp_gaps>1</Hsp_gaps>\n+      <Hsp_align-len>342</Hsp_align-len>\n+      <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE</Hsp_qseq>\n+      <Hsp_hseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE</Hsp_hseq>\n+      <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE+HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA+YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR+CM+TT+CCGKNP   +D ASAT SKTE</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>29</Statistics_hsp-len>\n+      <Statistics_eff-space>101761</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+     
 <Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'
b
diff -r b2795652d2b4 -r 5402f9b0d508 test-data/blastx_rhodopsin_vs_four_human.xml
--- a/test-data/blastx_rhodopsin_vs_four_human.xml Mon Nov 25 10:58:46 2013 -0500
+++ b/test-data/blastx_rhodopsin_vs_four_human.xml Thu Nov 28 08:50:15 2013 -0500
b
b'@@ -2,7 +2,7 @@\n <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n <BlastOutput>\n   <BlastOutput_program>blastx</BlastOutput_program>\n-  <BlastOutput_version>BLASTX 2.2.27+</BlastOutput_version>\n+  <BlastOutput_version>BLASTX 2.2.28+</BlastOutput_version>\n   <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n   <BlastOutput_db></BlastOutput_db>\n   <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n@@ -17,706 +17,725 @@\n       <Parameters_filter>L;</Parameters_filter>\n     </Parameters>\n   </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119944</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis 
catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119944</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>30</Statistics_hsp-len>\n-          <Statistics_eff-space>119944</Statistics_eff-space>\n-          <Statistics_kappa>0.041</Statistics_kappa>\n-          <Statistics_lambda>0.267</Statistics_lambda>\n-          <Statistics_entropy>0.14</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>4</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n-      <Iteration_query-len>1047</Iteration_query-len>\n-      <Iteration_hits>\n-        <Hit>\n-          
<Hit_num>1</Hit_num>\n-          <Hit_id>Subject_4</Hit_id>\n-          <Hit_def>sp|P08100|OPS'..b'Statistics_db-len>\n+      <Statistics_hsp-len>32</Statistics_hsp-len>\n+      <Statistics_eff-space>155584</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+      <Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>23</Iteration_iter-num>\n+  <Iteration_query-ID>Query_6</Iteration_query-ID>\n+  <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+  <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>32</Statistics_hsp-len>\n+      <Statistics_eff-space>155584</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+      <Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>24</Iteration_iter-num>\n+  <Iteration_query-ID>Query_6</Iteration_query-ID>\n+  <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+  <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+  <Hit_num>1</Hit_num>\n+  <Hit_id>Subject_4</Hit_id>\n+  <Hit_def>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Hit_def>\n+  <Hit_accession>Subject_4</Hit_accession>\n+  <Hit_len>348</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      
<Hsp_bit-score>532.717</Hsp_bit-score>\n+      <Hsp_score>1371</Hsp_score>\n+      <Hsp_evalue>0</Hsp_evalue>\n+      <Hsp_query-from>23</Hsp_query-from>\n+      <Hsp_query-to>1021</Hsp_query-to>\n+      <Hsp_hit-from>1</Hsp_hit-from>\n+      <Hsp_hit-to>333</Hsp_hit-to>\n+      <Hsp_query-frame>2</Hsp_query-frame>\n+      <Hsp_hit-frame>0</Hsp_hit-frame>\n+      <Hsp_identity>272</Hsp_identity>\n+      <Hsp_positive>307</Hsp_positive>\n+      <Hsp_gaps>0</Hsp_gaps>\n+      <Hsp_align-len>333</Hsp_align-len>\n+      <Hsp_qseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKXXXXXXXXXXXXXXXXXXVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDG</Hsp_qseq>\n+      <Hsp_hseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEA</Hsp_hseq>\n+      <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE+HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA+YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR+CM+TT+CCGKNP  +++ </Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>32</Statistics_hsp-len>\n+      <Statistics_eff-space>155584</Statistics_eff-space>\n+      <Statistics_kappa>0.041</Statistics_kappa>\n+      
<Statistics_lambda>0.267</Statistics_lambda>\n+      <Statistics_entropy>0.14</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'
b
diff -r b2795652d2b4 -r 5402f9b0d508 test-data/tblastn_four_human_vs_rhodopsin.html
--- a/test-data/tblastn_four_human_vs_rhodopsin.html Mon Nov 25 10:58:46 2013 -0500
+++ b/test-data/tblastn_four_human_vs_rhodopsin.html Thu Nov 28 08:50:15 2013 -0500
b
@@ -3,7 +3,7 @@
 <BODY BGCOLOR="#FFFFFF" LINK="#0000FF" VLINK="#660099" ALINK="#660099">
 <PRE>
 
-<b>TBLASTN 2.2.27+</b>
+<b>TBLASTN 2.2.28+</b>
 
 
 <b>Query=</b> sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44
b
diff -r b2795652d2b4 -r 5402f9b0d508 test-data/tblastn_four_human_vs_rhodopsin.xml
--- a/test-data/tblastn_four_human_vs_rhodopsin.xml Mon Nov 25 10:58:46 2013 -0500
+++ b/test-data/tblastn_four_human_vs_rhodopsin.xml Thu Nov 28 08:50:15 2013 -0500
b
b'@@ -2,7 +2,7 @@\n <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n <BlastOutput>\n   <BlastOutput_program>tblastn</BlastOutput_program>\n-  <BlastOutput_version>TBLASTN 2.2.27+</BlastOutput_version>\n+  <BlastOutput_version>TBLASTN 2.2.28+</BlastOutput_version>\n   <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n   <BlastOutput_db></BlastOutput_db>\n   <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n@@ -17,706 +17,725 @@\n       <Parameters_filter>F</Parameters_filter>\n     </Parameters>\n   </BlastOutput_param>\n-  <BlastOutput_iterations>\n-    <Iteration>\n-      <Iteration_iter-num>1</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>2</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      
<Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>3</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iteration_hits></Iteration_hits>\n-      <Iteration_stat>\n-        <Statistics>\n-          <Statistics_db-num>0</Statistics_db-num>\n-          <Statistics_db-len>0</Statistics_db-len>\n-          <Statistics_hsp-len>19</Statistics_hsp-len>\n-          <Statistics_eff-space>127710</Statistics_eff-space>\n-          <Statistics_kappa>0.071</Statistics_kappa>\n-          <Statistics_lambda>0.299</Statistics_lambda>\n-          <Statistics_entropy>0.27</Statistics_entropy>\n-        </Statistics>\n-      </Iteration_stat>\n-      <Iteration_message>No hits found</Iteration_message>\n-    </Iteration>\n-    <Iteration>\n-      <Iteration_iter-num>4</Iteration_iter-num>\n-      <Iteration_query-ID>Query_1</Iteration_query-ID>\n-      <Iteration_query-def>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens 
GN=ERP44 PE=1 SV=1</Iteration_query-def>\n-      <Iteration_query-len>406</Iteration_query-len>\n-      <Iter'..b'YYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA</Hsp_hseq>\n+      <Hsp_midline>MNGTEGPNFYVPFSN TGVVRSPFE PQYYLAEPWQFSMLAAYMFLLI+LGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMV GGFT+TLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPL GWSRYIPEG+QCSCGIDYYT   E NNESFVIYMFVVHF IP+I+IFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICW+PYA VAFYIFTHQGS+FGPIFMTIPAFFAK++A+YNPVIYIMMNKQFRNCM+TT+CCGKNPLGDDEAS TVSKTETSQVAPA</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>18</Statistics_hsp-len>\n+      <Statistics_eff-space>109230</Statistics_eff-space>\n+      <Statistics_kappa>0.071</Statistics_kappa>\n+      <Statistics_lambda>0.299</Statistics_lambda>\n+      <Statistics_entropy>0.27</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>24</Iteration_iter-num>\n+  <Iteration_query-ID>Query_4</Iteration_query-ID>\n+  <Iteration_query-def>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1</Iteration_query-def>\n+  <Iteration_query-len>348</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+  <Hit_num>1</Hit_num>\n+  <Hit_id>Subject_6</Hit_id>\n+  <Hit_def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Hit_def>\n+  <Hit_accession>Subject_6</Hit_accession>\n+  <Hit_len>1344</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      
<Hsp_bit-score>626.708</Hsp_bit-score>\n+      <Hsp_score>1444</Hsp_score>\n+      <Hsp_evalue>0</Hsp_evalue>\n+      <Hsp_query-from>1</Hsp_query-from>\n+      <Hsp_query-to>341</Hsp_query-to>\n+      <Hsp_hit-from>23</Hsp_hit-from>\n+      <Hsp_hit-to>1048</Hsp_hit-to>\n+      <Hsp_query-frame>0</Hsp_query-frame>\n+      <Hsp_hit-frame>2</Hsp_hit-frame>\n+      <Hsp_identity>281</Hsp_identity>\n+      <Hsp_positive>311</Hsp_positive>\n+      <Hsp_gaps>1</Hsp_gaps>\n+      <Hsp_align-len>342</Hsp_align-len>\n+      <Hsp_qseq>MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPL-GDDEASATVSKTE</Hsp_qseq>\n+      <Hsp_hseq>MNGTEGPNFYIPMSNATGVVRSPFEYPQYYLAEPWAFSALSAYMFFLIIAGFPINFLTLYVTIEHKKLRTPLNYILLNLAVADLFMVFGGFTTTMYTSMHGYFVFGPTGCNIEGFFATLGGEIALWCLVVLAIERWMVVCKPVTNFRFGESHAIMGVMVTWTMALACALPPLFGWSRYIPEGLQCSCGIDYYTRAPGINNESFVIYMFTCHFSIPLAVISFCYGRLVCTVKEAAAQQQESETTQRAEREVTRMVVIMVISFLVCWVPYASVAWYIFTHQGSTFGPIFMTIPSFFAKSSALYNPMIYICMNKQFRHCMITTLCCGKNPFEEEDGASATSSKTE</Hsp_hseq>\n+      <Hsp_midline>MNGTEGPNFY+P SNATGVVRSPFEYPQYYLAEPW FS L+AYMF LI+ GFPINFLTLYVT++HKKLRTPLNYILLNLAVADLFMV GGFT+T+YTS+HGYFVFGPTGCN+EGFFATLGGEIALW LVVLAIER++VVCKP++NFRFGE HAIMGV  TW MALACA PPL GWSRYIPEGLQCSCGIDYYT  P +NNESFVIYMF  HF+IP+ +I FCYG+LV TVKEAAAQQQES TTQ+AE+EVTRMV+IMVI+FL+CWVPYASVA YIFTHQGS FGPIFMTIP+FFAKS+A+YNP+IYI MNKQFR CM+TT+CCGKNP   +D ASAT SKTE</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>18</Statistics_hsp-len>\n+      <Statistics_eff-space>109230</Statistics_eff-space>\n+      <Statistics_kappa>0.071</Statistics_kappa>\n+   
   <Statistics_lambda>0.299</Statistics_lambda>\n+      <Statistics_entropy>0.27</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'
b
diff -r b2795652d2b4 -r 5402f9b0d508 tools/ncbi_blast_plus/README.rst
--- a/tools/ncbi_blast_plus/README.rst Mon Nov 25 10:58:46 2013 -0500
+++ b/tools/ncbi_blast_plus/README.rst Thu Nov 28 08:50:15 2013 -0500
b
@@ -5,7 +5,7 @@
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.
 
-Currently tested with NCBI BLAST 2.2.27+ (i.e. version 2.2.27 of BLAST+),
+Currently tested with NCBI BLAST 2.2.28+ (i.e. version 2.2.28 of BLAST+),
 and does not work with the NCBI 'legacy' BLAST suite (e.g. blastall).
 
 Note that these wrappers (and the associated datatypes) were originally
@@ -71,7 +71,7 @@
 about any system level BLAST databases using the tool-data/blastdb*.loc files.
 
 You must install the NCBI BLAST+ standalone tools somewhere on the system
-path. Currently the unit tests are written using "BLAST 2.2.27+".
+path. Currently the unit tests are written using "BLAST 2.2.28+".
 
 Run the functional tests (adjusting the section identifier to match your
 tool_conf.xml.sample file)::
@@ -131,6 +131,9 @@
         - Set number of threads via $GALAXY_SLOTS environment variable
         - More descriptive default output names
         - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18)
+        - Pre-check for duplicate identifiers in makeblastdb wrapper.
+        - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27
+        - Now depends on package_blast_plus_2_2_28 in ToolShed
 ======= ======================================================================
 
 
b
diff -r b2795652d2b4 -r 5402f9b0d508 tools/ncbi_blast_plus/check_no_duplicates.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/check_no_duplicates.py Thu Nov 28 08:50:15 2013 -0500
[
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+"""Check for duplicate sequence identifiers in FASTA files.
+
+This is run as a pre-check before makeblastdb, in order to avoid
+a regression bug in BLAST+ 2.2.28 which fails to catch this. See:
+http://blastedbio.blogspot.co.uk/2012/10/my-ids-not-good-enough-for-ncbi-blast.html
+
+This script takes one or more FASTA filenames as input, and
+will exit with a non-zero status if any duplicate identifiers
+are found (1), a named file is missing (2), or no filenames
+were given at all (3).
+"""
+import sys
+import os
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print("v0.0.22")
+    sys.exit(0)
+
+def stop_err(msg, error=1):
+    """Write msg to stderr and exit with the given status code."""
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(error)
+
+
+#Identifiers are pooled across all the input files, so the same
+#identifier appearing in two different files counts as a duplicate.
+identifiers = set()
+files = 0
+for filename in sys.argv[1:]:
+    if not os.path.isfile(filename):
+        stop_err("Missing FASTA file %r" % filename, 2)
+    files += 1
+    #The with statement closes the file on every exit path,
+    #including the sys.exit raised inside stop_err.
+    with open(filename) as handle:
+        for line in handle:
+            if not line.startswith(">"):
+                continue
+            #Splitting on whitespace also drops the new line character,
+            #e.g. ">test\n" and ">test description here\n" both give "test"
+            fields = line[1:].split(None, 1)
+            if not fields:
+                #A bare ">" header has no identifier; report it cleanly
+                #rather than dying with an IndexError traceback.
+                stop_err("Missing identifier in FASTA file %r" % filename, 1)
+            seq_id = fields[0]
+            if seq_id in identifiers:
+                stop_err("Repeated identifiers, e.g. %r" % seq_id, 1)
+            identifiers.add(seq_id)
+if not files:
+    stop_err("No FASTA files given to check for duplicates", 3)
+elif files == 1:
+    print("%i sequences" % len(identifiers))
+else:
+    print("%i sequences in %i FASTA files" % (len(identifiers), files))
b
diff -r b2795652d2b4 -r 5402f9b0d508 tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Mon Nov 25 10:58:46 2013 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Thu Nov 28 08:50:15 2013 -0500
b
@@ -240,7 +240,7 @@
     <xml name="requirements">
         <requirements>
             <requirement type="binary">@BINARY@</requirement>
-            <requirement type="package" version="2.2.27">blast+</requirement>
+            <requirement type="package" version="2.2.28">blast+</requirement>
         </requirements>
         <version_command>@BINARY@ -version</version_command>
     </xml>
b
diff -r b2795652d2b4 -r 5402f9b0d508 tools/ncbi_blast_plus/ncbi_makeblastdb.xml
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Mon Nov 25 10:58:46 2013 -0500
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Nov 28 08:50:15 2013 -0500
b
@@ -5,7 +5,13 @@
         <import>ncbi_macros.xml</import>
     </macros>
     <expand macro="requirements" />
-    <command>
+    <command interpreter="python">check_no_duplicates.py
+##First check for duplicates (since BLAST+ 2.2.28 fails to do so)
+##and abort (via the ampersand ampersand trick) if any are found.
+#for $i in $in
+"${i.file}"
+#end for
+&amp;&amp;
 makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}"
 $parse_seqids
 $hash_index