# HG changeset patch
# User peterjc
# Date 1400668430 14400
# Node ID e47960bcdccbcad6f2070f79ea84e3e928d09fce
# Parent c84b6c21e3d40ebe223c40d9293a2ebe88229af4
Uploaded v0.1.0f, exclude tied best hits
diff -r c84b6c21e3d4 -r e47960bcdccb test-data/k12_edited_proteins.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_edited_proteins.fasta Wed May 21 06:33:50 2014 -0400
@@ -0,0 +1,69 @@
+>gi|16127995|ref|NP_414542.1| thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655]
+MKRISTTITTTITITTGNGAG
+>gi|16127996|ref|NP_414543.1| fused aspartokinase I and homoserine dehydrogenase I [Escherichia coli str. K-12 substr. MG1655]
+MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERI
+FAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEA
+RGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYS
+AAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPC
+LIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLIT
+QSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL
+ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSW
+LKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAV
+ADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELM
+KFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIE
+IEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFK
+VKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV
+>gi|16127997|ref|NP_414544.1| homoserine kinase [Escherichia coli str. K-12 substr. MG1655]
+MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE
+RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY
+DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF
+IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA
+DWLGKNYLQNQEGFVHICRLDTAGARVLEN
+>NP_414544_near_copy
+MKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE
+RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY
+DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF
+IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA
+DWLGKNYLQNQEGFVHICRLDTAGARVLEN
+>gi|16127998|ref|NP_414545.1| threonine synthase [Escherichia coli str. K-12 substr. MG1655]
+MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLDFVTRSAKILSAFIGDEIPQE
+ILEERVRAAFAFPAPVANVESDVGCLELFHGPTLAFKDFGGRFMAQMLTHIAGDKPVTILTATSGDTGAA
+VAHAFYGLPNVKVVILYPRGKISPLQEKLFCTLGGNIETVAIDGDFDACQALVKQAFDDEELKVALGLNS
+ANSINISRLLAQICYYFEAVAQLPQETRNQLVVSVPSGNFGDLTAGLLAKSLGLPVKRFIAATNVNDTVP
+RFLHDGQWSPKATQATLSNAMDVSQPNNWPRVEELFRRKIWQLKELGYAAVDDETTQQTMRELKELGYTS
+EPHAAVAYRALRDQLNPGEYGLFLGTAHPAKFKESVEAILGETLDLPKELAERADLPLLSHNLPADFAAL
+RKLMMNHQ
+>NP_414546_near_copy_1
+MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL
+HGPPPPPRHHKKAPHDHHGGHGPGKHHRV
+>NP_414546_near_copy_2
+MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL
+HGPPPPPRHHKKAPHDHHGGHGPGKHHRRI
+>gi|16128000|ref|NP_414547.1| peroxide resistance protein, lowers intracellular iron [Escherichia coli str. K-12 substr. MG1655]
+MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPD
+FTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARG
+KDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKK
+ARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR
+>gi|16128001|ref|NP_414548.1| putative transporter [Escherichia coli str. K-12 substr. MG1655]
+MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNSIHPQPGGLTSFQSLCTSLAA
+RVGSGNLAGVALAITAGGPGAVFWMWVAAFIGMATSFAECSLAQLYKERDVNGQFRGGPAWYMARGLGMR
+WMGVLFAVFLLIAYGIIFSGVQANAVARALSFSFDFPPLVTGIILAVFTLLAITRGLHGVARLMQGFVPL
+MAIIWVLTSLVICVMNIGQLPHVIWSIFESAFGWQEAAGGAAGYTLSQAITNGFQRSMFSNEAGMGSTPN
+AAAAAASWPPHPAAQGIVQMIGIFIDTLVICTASAMLILLAGNGTTYMPLEGIQLIQKAMRVLMGSWGAE
+FVTLVVILFAFSSIVANYIYAENNLFFLRLNNPKAIWCLRICTFATVIGGTLLSLPLMWQLADIIMACMA
+ITNLTAILLLSPVVHTIASDYLRQRKLGVRPVFDPLRYPDIGRQLSPDAWDDVSQE
+>gi|16128002|ref|NP_414549.1| transaldolase B [Escherichia coli str. K-12 substr. MG1655]
+MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRKLIDDAVAWAKQQSNDRAQQI
+VDATDKLAVNIGLEILKLVPGRISTEVDARLSYDTEASIAKAKRLIKLYNDAGISNDRILIKLASTWQGI
+RAAEQLEKEGINCNLTLLFSFAQARACAEAGVFLISPFVGRILDWYKANTDKKEYAPAEDPGVVSVSEIY
+QYYKEHGYETVVMGASFRNIGEILELAGCDRLTIAPALLKELAESEGAIERKLSYTGEVKARPARITESE
+FLWQHNQDPMAVDKLAEGIRKFAIDQEKLEKMIGDLL
+>gi|16128003|ref|NP_414550.1| molybdochelatase incorporating molybdenum into molybdopterin [Escherichia coli str. K-12 substr. MG1655]
+MNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDEQAIIEQTLCELVDEMSCHLV
+LTTGGTGPARRDVTPDATLAVADREMPGFGEQMRQISLHFVPTAILSRQVGVIRKQALILNLPGQPKSIK
+ETLEGVKDAEGNVVVHGIFASVPYCIQLLEGPYVETAPEVVAAFRPKSARRDVSE
+>gi|16128004|ref|NP_414551.1| inner membrane protein, Grp1_Fun34_YaaH family [Escherichia coli str. K-12 substr. MG1655]
+MGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQIFAGLLEYKKGNTFGLTAFT
+SYGSFWLTLVAILLMPKLGLTDAPNAQFLGVYLGLWGVFTLFMFFGTLKGARVLQFVFFSLTVLFALLAI
+GNIAGNAAIIHFAGWIGLICGASAIYLAMGEVLNEQFGRTVLPIGESH
+
diff -r c84b6c21e3d4 -r e47960bcdccb test-data/k12_ten_proteins.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_ten_proteins.fasta Wed May 21 06:33:50 2014 -0400
@@ -0,0 +1,60 @@
+>gi|16127995|ref|NP_414542.1| thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655]
+MKRISTTITTTITITTGNGAG
+>gi|16127996|ref|NP_414543.1| fused aspartokinase I and homoserine dehydrogenase I [Escherichia coli str. K-12 substr. MG1655]
+MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERI
+FAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEA
+RGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYS
+AAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPC
+LIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLIT
+QSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL
+ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSW
+LKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAV
+ADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELM
+KFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIE
+IEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFK
+VKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV
+>gi|16127997|ref|NP_414544.1| homoserine kinase [Escherichia coli str. K-12 substr. MG1655]
+MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE
+RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY
+DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF
+IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA
+DWLGKNYLQNQEGFVHICRLDTAGARVLEN
+>gi|16127998|ref|NP_414545.1| threonine synthase [Escherichia coli str. K-12 substr. MG1655]
+MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLDFVTRSAKILSAFIGDEIPQE
+ILEERVRAAFAFPAPVANVESDVGCLELFHGPTLAFKDFGGRFMAQMLTHIAGDKPVTILTATSGDTGAA
+VAHAFYGLPNVKVVILYPRGKISPLQEKLFCTLGGNIETVAIDGDFDACQALVKQAFDDEELKVALGLNS
+ANSINISRLLAQICYYFEAVAQLPQETRNQLVVSVPSGNFGDLTAGLLAKSLGLPVKRFIAATNVNDTVP
+RFLHDGQWSPKATQATLSNAMDVSQPNNWPRVEELFRRKIWQLKELGYAAVDDETTQQTMRELKELGYTS
+EPHAAVAYRALRDQLNPGEYGLFLGTAHPAKFKESVEAILGETLDLPKELAERADLPLLSHNLPADFAAL
+RKLMMNHQ
+>gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655]
+MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL
+HGPPPPPRHHKKAPHDHHGGHGPGKHHR
+>gi|16128000|ref|NP_414547.1| peroxide resistance protein, lowers intracellular iron [Escherichia coli str. K-12 substr. MG1655]
+MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPD
+FTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARG
+KDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKK
+ARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR
+>gi|16128001|ref|NP_414548.1| putative transporter [Escherichia coli str. K-12 substr. MG1655]
+MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNSIHPQPGGLTSFQSLCTSLAA
+RVGSGNLAGVALAITAGGPGAVFWMWVAAFIGMATSFAECSLAQLYKERDVNGQFRGGPAWYMARGLGMR
+WMGVLFAVFLLIAYGIIFSGVQANAVARALSFSFDFPPLVTGIILAVFTLLAITRGLHGVARLMQGFVPL
+MAIIWVLTSLVICVMNIGQLPHVIWSIFESAFGWQEAAGGAAGYTLSQAITNGFQRSMFSNEAGMGSTPN
+AAAAAASWPPHPAAQGIVQMIGIFIDTLVICTASAMLILLAGNGTTYMPLEGIQLIQKAMRVLMGSWGAE
+FVTLVVILFAFSSIVANYIYAENNLFFLRLNNPKAIWCLRICTFATVIGGTLLSLPLMWQLADIIMACMA
+ITNLTAILLLSPVVHTIASDYLRQRKLGVRPVFDPLRYPDIGRQLSPDAWDDVSQE
+>gi|16128002|ref|NP_414549.1| transaldolase B [Escherichia coli str. K-12 substr. MG1655]
+MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRKLIDDAVAWAKQQSNDRAQQI
+VDATDKLAVNIGLEILKLVPGRISTEVDARLSYDTEASIAKAKRLIKLYNDAGISNDRILIKLASTWQGI
+RAAEQLEKEGINCNLTLLFSFAQARACAEAGVFLISPFVGRILDWYKANTDKKEYAPAEDPGVVSVSEIY
+QYYKEHGYETVVMGASFRNIGEILELAGCDRLTIAPALLKELAESEGAIERKLSYTGEVKARPARITESE
+FLWQHNQDPMAVDKLAEGIRKFAIDQEKLEKMIGDLL
+>gi|16128003|ref|NP_414550.1| molybdochelatase incorporating molybdenum into molybdopterin [Escherichia coli str. K-12 substr. MG1655]
+MNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDEQAIIEQTLCELVDEMSCHLV
+LTTGGTGPARRDVTPDATLAVADREMPGFGEQMRQISLHFVPTAILSRQVGVIRKQALILNLPGQPKSIK
+ETLEGVKDAEGNVVVHGIFASVPYCIQLLEGPYVETAPEVVAAFRPKSARRDVSE
+>gi|16128004|ref|NP_414551.1| inner membrane protein, Grp1_Fun34_YaaH family [Escherichia coli str. K-12 substr. MG1655]
+MGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQIFAGLLEYKKGNTFGLTAFT
+SYGSFWLTLVAILLMPKLGLTDAPNAQFLGVYLGLWGVFTLFMFFGTLKGARVLQFVFFSLTVLFALLAI
+GNIAGNAAIIHFAGWIGLICGASAIYLAMGEVLNEQFGRTVLPIGESH
+
diff -r c84b6c21e3d4 -r e47960bcdccb test-data/rbh_blastp_k12.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rbh_blastp_k12.tabular Wed May 21 06:33:50 2014 -0400
@@ -0,0 +1,10 @@
+#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore
+gi|16127995|ref|NP_414542.1| gi|16127995|ref|NP_414542.1| 21 21 100 100 21 100.00 38.1
+gi|16127996|ref|NP_414543.1| gi|16127996|ref|NP_414543.1| 820 820 100 100 820 100.00 1687
+gi|16127997|ref|NP_414544.1| gi|16127997|ref|NP_414544.1| 310 310 100 100 310 100.00 642
+gi|16127998|ref|NP_414545.1| gi|16127998|ref|NP_414545.1| 428 428 100 100 428 100.00 882
+gi|16128000|ref|NP_414547.1| gi|16128000|ref|NP_414547.1| 258 258 100 100 258 100.00 531
+gi|16128001|ref|NP_414548.1| gi|16128001|ref|NP_414548.1| 476 476 100 100 476 100.00 959
+gi|16128002|ref|NP_414549.1| gi|16128002|ref|NP_414549.1| 317 317 100 100 317 100.00 648
+gi|16128003|ref|NP_414550.1| gi|16128003|ref|NP_414550.1| 195 195 100 100 195 100.00 397
+gi|16128004|ref|NP_414551.1| gi|16128004|ref|NP_414551.1| 188 188 100 100 188 100.00 365
diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/README.rst
--- a/tools/blast_rbh/README.rst Tue May 20 06:33:08 2014 -0400
+++ b/tools/blast_rbh/README.rst Wed May 21 06:33:50 2014 -0400
@@ -62,7 +62,7 @@
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use
the following command from the Galaxy root folder::
- $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
+ $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/k12_edited_proteins.fasta test-data/k12_ten_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastp_k12.tabular
Check this worked::
@@ -75,11 +75,14 @@
test-data/rhodopsin_proteins.fasta
test-data/three_human_mRNA.fasta
test-data/four_human_proteins.fasta
+ test-data/k12_edited_proteins.fasta
+ test-data/k12_ten_proteins.fasta
test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular
test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular
test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular
test-data/rbh_none.tabular
test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular
+ test-data/rbh_blastp_k12.tabular
Licence (MIT)
diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/blast_rbh.py
--- a/tools/blast_rbh/blast_rbh.py Tue May 20 06:33:08 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.py Wed May 21 06:33:50 2014 -0400
@@ -99,6 +99,75 @@
c_qlen = 5
c_length = 6
+tie_warning = 0
+
+def best_hits(blast_tabular):
+ """Iterate over BLAST tabular output, returns best hits as 2-tuples.
+
+ Each return value is (query name, tuple of value for the best hit).
+
+ Tied best hits to different sequences are NOT returned.
+
+ One hit is returned for tied best hits to the same sequence
+ (e.g. repeated domains).
+ """
+ global tie_warning
+ current = None
+ best_score = None
+ best = None
+ with open(blast_tabular) as h:
+ for line in h:
+ if line.startswith("#"):
+ continue
+ parts = line.rstrip("\n").split("\t")
+ if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
+ continue
+ a = parts[c_query]
+ b = parts[c_match]
+ score = float(parts[c_score])
+ qlen = int(parts[c_qlen])
+ length = int(parts[c_length])
+ #print("Considering hit for %s to %s with score %s..." % (a, b, score))
+ if current is None:
+ #First hit
+ assert best is None
+ assert best_score is None
+ best = dict()
+ #Now append this hit...
+ elif a != current:
+ #New hit
+ if len(best) == 1:
+ #Unambiguous (no tied matches)
+ yield current, list(best.values())[0]
+ else:
+ #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
+ tie_warning += 1
+ best = dict()
+ #Now append this hit...
+ elif score < best_score:
+ #print("No improvement for %s, %s < %s" % (a, score, best_score))
+ continue
+ elif score > best_score:
+ #This is better, discard old best
+ best = dict()
+ #Now append this hit...
+ else:
+ #print("Tied best hits for %s" % a)
+ assert best_score == score
+ #Now append this hit...
+ current = a
+ best_score = score
+ #This will collapse two equally good hits to the same target (e.g. duplicated domain)
+ best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
+ #Best hit for final query, if unambiguous:
+ if current is not None:
+ if len(best)==1:
+ yield current, list(best.values())[0]
+ else:
+ #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
+ tie_warning += 1
+
+
#print("Starting...")
#TODO - Report log in case of error?
run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
@@ -111,70 +180,40 @@
% (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
#print("BLAST species B vs species A done.")
-best_a_vs_b = dict()
-for line in open(a_vs_b):
- if line.startswith("#"): continue
- parts = line.rstrip("\n").split("\t")
- a = parts[c_query]
- b = parts[c_match]
- if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage:
- continue
- score = float(parts[c_score])
- qlen = int(parts[c_qlen])
- length = int(parts[c_length])
- if a not in best_a_vs_b or score > best_a_vs_b[a][1]:
- # Store bitscore as a float for sorting, original string for output
- best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
-b_short_list = set(v[0] for v in best_a_vs_b.values())
-best_b_vs_a = dict()
-for line in open(b_vs_a):
- if line.startswith("#"): continue
- parts = line.rstrip("\n").split("\t")
- b = parts[c_query]
- a = parts[c_match]
- if a not in best_a_vs_b:
- continue
- #stop_err("The A-vs-B file does not have A-ID %r found in B-vs-A file" % a)
- if b not in b_short_list: continue
- if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage:
- continue
- score = float(parts[c_score])
- qlen = int(parts[c_qlen])
- length = int(parts[c_length])
- if b not in best_b_vs_a or score > best_b_vs_a[b][1]:
- # Store bitscore as a float for sorting, original string for output
- best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length)
-#TODO - Preserve order from A vs B?
-a_short_list = sorted(set(v[0] for v in best_b_vs_a.values()))
+best_b_vs_a = dict(best_hits(b_vs_a))
+
count = 0
outfile = open(out_file, 'w')
outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n")
-for a in a_short_list:
- a_values = best_a_vs_b[a]
- b = a_values[0]
- b_values = best_b_vs_a[b]
- if b in best_b_vs_a and a == b_values[0]:
- #Start with IDs, lengths [5], coverage [4]
- values = [a, b, a_values[5], b_values[5], a_values[4], b_values[4]]
- #Length was an integer so don't care about original string
- values.append(min(a_values[6], b_values[6]))
- #Output the original string versions of the scores
- #[3] = identity as string
- if float(a_values[3]) < float(b_values[3]):
- values.append(a_values[3])
- else:
- values.append(b_values[3])
- #[1] = bitscore as float, [2] = bitscore as original string
- if a_values[1] < b_values[1]:
- values.append(a_values[2])
- else:
- values.append(b_values[2])
- outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
- count += 1
+for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b):
+ if b not in best_b_vs_a:
+ #Match b has no best hit
+ continue
+ a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b]
+ if a != a2:
+ #Not an RBH
+ continue
+ #Start with IDs, lengths, coverage
+ values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str]
+ #Alignment length was an integer so don't care about original string
+ values.append(min(a_length, b_length))
+ #Output the original string versions of the scores
+ if float(a_identity_str) < float(b_identity_str):
+ values.append(a_identity_str)
+ else:
+ values.append(b_identity_str)
+ if a_score_float < b_score_float:
+ values.append(a_score_str)
+ else:
+ values.append(b_score_str)
+ outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values))
+ count += 1
outfile.close()
print "Done, %i RBH found" % count
+if tie_warning:
+ sys.stderr.write("Warning: Sequencies with tied best hits found, you may have duplicates/clusters\n")
#Remove temp files...
shutil.rmtree(base_path)
diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/blast_rbh.xml
--- a/tools/blast_rbh/blast_rbh.xml Tue May 20 06:33:08 2014 -0400
+++ b/tools/blast_rbh/blast_rbh.xml Wed May 21 06:33:50 2014 -0400
@@ -125,6 +125,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
**What it does**
@@ -159,6 +178,11 @@
The coverage values are given by the HSP alignment length divided by the
sequence length (adjusted by a factor of three for TBLASTX).
+Note that if a sequence has equally scoring top BLAST matches to multiple
+sequence in the other file, it will not be considered for an RBH. This
+can happen following gene duplication, or for (near) identical gene
+duplicates.
+
.. class:: warningmark
**Note**