# HG changeset patch # User peterjc # Date 1400668430 14400 # Node ID e47960bcdccbcad6f2070f79ea84e3e928d09fce # Parent c84b6c21e3d40ebe223c40d9293a2ebe88229af4 Uploaded v0.1.0f, exclude tied best hits diff -r c84b6c21e3d4 -r e47960bcdccb test-data/k12_edited_proteins.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/k12_edited_proteins.fasta Wed May 21 06:33:50 2014 -0400 @@ -0,0 +1,69 @@ +>gi|16127995|ref|NP_414542.1| thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] +MKRISTTITTTITITTGNGAG +>gi|16127996|ref|NP_414543.1| fused aspartokinase I and homoserine dehydrogenase I [Escherichia coli str. K-12 substr. MG1655] +MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERI +FAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEA +RGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYS +AAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPC +LIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLIT +QSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL +ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSW +LKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAV +ADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELM +KFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIE +IEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFK +VKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV +>gi|16127997|ref|NP_414544.1| homoserine kinase [Escherichia coli str. K-12 substr. MG1655] +MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE +RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY +DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF +IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA +DWLGKNYLQNQEGFVHICRLDTAGARVLEN +>NP_414544_near_copy +MKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE +RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY +DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF +IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA +DWLGKNYLQNQEGFVHICRLDTAGARVLEN +>gi|16127998|ref|NP_414545.1| threonine synthase [Escherichia coli str. K-12 substr. MG1655] +MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLDFVTRSAKILSAFIGDEIPQE +ILEERVRAAFAFPAPVANVESDVGCLELFHGPTLAFKDFGGRFMAQMLTHIAGDKPVTILTATSGDTGAA +VAHAFYGLPNVKVVILYPRGKISPLQEKLFCTLGGNIETVAIDGDFDACQALVKQAFDDEELKVALGLNS +ANSINISRLLAQICYYFEAVAQLPQETRNQLVVSVPSGNFGDLTAGLLAKSLGLPVKRFIAATNVNDTVP +RFLHDGQWSPKATQATLSNAMDVSQPNNWPRVEELFRRKIWQLKELGYAAVDDETTQQTMRELKELGYTS +EPHAAVAYRALRDQLNPGEYGLFLGTAHPAKFKESVEAILGETLDLPKELAERADLPLLSHNLPADFAAL +RKLMMNHQ +>NP_414546_near_copy_1 +MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL +HGPPPPPRHHKKAPHDHHGGHGPGKHHRV +>NP_414546_near_copy_2 +MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL +HGPPPPPRHHKKAPHDHHGGHGPGKHHRRI +>gi|16128000|ref|NP_414547.1| peroxide resistance protein, lowers intracellular iron [Escherichia coli str. K-12 substr. MG1655] +MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPD +FTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARG +KDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKK +ARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR +>gi|16128001|ref|NP_414548.1| putative transporter [Escherichia coli str. K-12 substr. MG1655] +MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNSIHPQPGGLTSFQSLCTSLAA +RVGSGNLAGVALAITAGGPGAVFWMWVAAFIGMATSFAECSLAQLYKERDVNGQFRGGPAWYMARGLGMR +WMGVLFAVFLLIAYGIIFSGVQANAVARALSFSFDFPPLVTGIILAVFTLLAITRGLHGVARLMQGFVPL +MAIIWVLTSLVICVMNIGQLPHVIWSIFESAFGWQEAAGGAAGYTLSQAITNGFQRSMFSNEAGMGSTPN +AAAAAASWPPHPAAQGIVQMIGIFIDTLVICTASAMLILLAGNGTTYMPLEGIQLIQKAMRVLMGSWGAE +FVTLVVILFAFSSIVANYIYAENNLFFLRLNNPKAIWCLRICTFATVIGGTLLSLPLMWQLADIIMACMA +ITNLTAILLLSPVVHTIASDYLRQRKLGVRPVFDPLRYPDIGRQLSPDAWDDVSQE +>gi|16128002|ref|NP_414549.1| transaldolase B [Escherichia coli str. K-12 substr. MG1655] +MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRKLIDDAVAWAKQQSNDRAQQI +VDATDKLAVNIGLEILKLVPGRISTEVDARLSYDTEASIAKAKRLIKLYNDAGISNDRILIKLASTWQGI +RAAEQLEKEGINCNLTLLFSFAQARACAEAGVFLISPFVGRILDWYKANTDKKEYAPAEDPGVVSVSEIY +QYYKEHGYETVVMGASFRNIGEILELAGCDRLTIAPALLKELAESEGAIERKLSYTGEVKARPARITESE +FLWQHNQDPMAVDKLAEGIRKFAIDQEKLEKMIGDLL +>gi|16128003|ref|NP_414550.1| molybdochelatase incorporating molybdenum into molybdopterin [Escherichia coli str. K-12 substr. MG1655] +MNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDEQAIIEQTLCELVDEMSCHLV +LTTGGTGPARRDVTPDATLAVADREMPGFGEQMRQISLHFVPTAILSRQVGVIRKQALILNLPGQPKSIK +ETLEGVKDAEGNVVVHGIFASVPYCIQLLEGPYVETAPEVVAAFRPKSARRDVSE +>gi|16128004|ref|NP_414551.1| inner membrane protein, Grp1_Fun34_YaaH family [Escherichia coli str. K-12 substr. MG1655] +MGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQIFAGLLEYKKGNTFGLTAFT +SYGSFWLTLVAILLMPKLGLTDAPNAQFLGVYLGLWGVFTLFMFFGTLKGARVLQFVFFSLTVLFALLAI +GNIAGNAAIIHFAGWIGLICGASAIYLAMGEVLNEQFGRTVLPIGESH + diff -r c84b6c21e3d4 -r e47960bcdccb test-data/k12_ten_proteins.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/k12_ten_proteins.fasta Wed May 21 06:33:50 2014 -0400 @@ -0,0 +1,60 @@ +>gi|16127995|ref|NP_414542.1| thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] +MKRISTTITTTITITTGNGAG +>gi|16127996|ref|NP_414543.1| fused aspartokinase I and homoserine dehydrogenase I [Escherichia coli str. K-12 substr. MG1655] +MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERI +FAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEA +RGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYS +AAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPC +LIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLIT +QSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL +ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSW +LKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAV +ADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELM +KFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIE +IEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFK +VKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV +>gi|16127997|ref|NP_414544.1| homoserine kinase [Escherichia coli str. K-12 substr. MG1655] +MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWE +RFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHY +DNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGF +IHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVA +DWLGKNYLQNQEGFVHICRLDTAGARVLEN +>gi|16127998|ref|NP_414545.1| threonine synthase [Escherichia coli str. K-12 substr. MG1655] +MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLDFVTRSAKILSAFIGDEIPQE +ILEERVRAAFAFPAPVANVESDVGCLELFHGPTLAFKDFGGRFMAQMLTHIAGDKPVTILTATSGDTGAA +VAHAFYGLPNVKVVILYPRGKISPLQEKLFCTLGGNIETVAIDGDFDACQALVKQAFDDEELKVALGLNS +ANSINISRLLAQICYYFEAVAQLPQETRNQLVVSVPSGNFGDLTAGLLAKSLGLPVKRFIAATNVNDTVP +RFLHDGQWSPKATQATLSNAMDVSQPNNWPRVEELFRRKIWQLKELGYAAVDDETTQQTMRELKELGYTS +EPHAAVAYRALRDQLNPGEYGLFLGTAHPAKFKESVEAILGETLDLPKELAERADLPLLSHNLPADFAAL +RKLMMNHQ +>gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] +MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHL +HGPPPPPRHHKKAPHDHHGGHGPGKHHR +>gi|16128000|ref|NP_414547.1| peroxide resistance protein, lowers intracellular iron [Escherichia coli str. K-12 substr. MG1655] +MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPD +FTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARG +KDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKK +ARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR +>gi|16128001|ref|NP_414548.1| putative transporter [Escherichia coli str. K-12 substr. MG1655] +MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNSIHPQPGGLTSFQSLCTSLAA +RVGSGNLAGVALAITAGGPGAVFWMWVAAFIGMATSFAECSLAQLYKERDVNGQFRGGPAWYMARGLGMR +WMGVLFAVFLLIAYGIIFSGVQANAVARALSFSFDFPPLVTGIILAVFTLLAITRGLHGVARLMQGFVPL +MAIIWVLTSLVICVMNIGQLPHVIWSIFESAFGWQEAAGGAAGYTLSQAITNGFQRSMFSNEAGMGSTPN +AAAAAASWPPHPAAQGIVQMIGIFIDTLVICTASAMLILLAGNGTTYMPLEGIQLIQKAMRVLMGSWGAE +FVTLVVILFAFSSIVANYIYAENNLFFLRLNNPKAIWCLRICTFATVIGGTLLSLPLMWQLADIIMACMA +ITNLTAILLLSPVVHTIASDYLRQRKLGVRPVFDPLRYPDIGRQLSPDAWDDVSQE +>gi|16128002|ref|NP_414549.1| transaldolase B [Escherichia coli str. K-12 substr. MG1655] +MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRKLIDDAVAWAKQQSNDRAQQI +VDATDKLAVNIGLEILKLVPGRISTEVDARLSYDTEASIAKAKRLIKLYNDAGISNDRILIKLASTWQGI +RAAEQLEKEGINCNLTLLFSFAQARACAEAGVFLISPFVGRILDWYKANTDKKEYAPAEDPGVVSVSEIY +QYYKEHGYETVVMGASFRNIGEILELAGCDRLTIAPALLKELAESEGAIERKLSYTGEVKARPARITESE +FLWQHNQDPMAVDKLAEGIRKFAIDQEKLEKMIGDLL +>gi|16128003|ref|NP_414550.1| molybdochelatase incorporating molybdenum into molybdopterin [Escherichia coli str. K-12 substr. MG1655] +MNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDEQAIIEQTLCELVDEMSCHLV +LTTGGTGPARRDVTPDATLAVADREMPGFGEQMRQISLHFVPTAILSRQVGVIRKQALILNLPGQPKSIK +ETLEGVKDAEGNVVVHGIFASVPYCIQLLEGPYVETAPEVVAAFRPKSARRDVSE +>gi|16128004|ref|NP_414551.1| inner membrane protein, Grp1_Fun34_YaaH family [Escherichia coli str. K-12 substr. MG1655] +MGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQIFAGLLEYKKGNTFGLTAFT +SYGSFWLTLVAILLMPKLGLTDAPNAQFLGVYLGLWGVFTLFMFFGTLKGARVLQFVFFSLTVLFALLAI +GNIAGNAAIIHFAGWIGLICGASAIYLAMGEVLNEQFGRTVLPIGESH + diff -r c84b6c21e3d4 -r e47960bcdccb test-data/rbh_blastp_k12.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rbh_blastp_k12.tabular Wed May 21 06:33:50 2014 -0400 @@ -0,0 +1,10 @@ +#A_id B_id A_length B_length A_qcovhsp B_qcovhsp length pident bitscore +gi|16127995|ref|NP_414542.1| gi|16127995|ref|NP_414542.1| 21 21 100 100 21 100.00 38.1 +gi|16127996|ref|NP_414543.1| gi|16127996|ref|NP_414543.1| 820 820 100 100 820 100.00 1687 +gi|16127997|ref|NP_414544.1| gi|16127997|ref|NP_414544.1| 310 310 100 100 310 100.00 642 +gi|16127998|ref|NP_414545.1| gi|16127998|ref|NP_414545.1| 428 428 100 100 428 100.00 882 +gi|16128000|ref|NP_414547.1| gi|16128000|ref|NP_414547.1| 258 258 100 100 258 100.00 531 +gi|16128001|ref|NP_414548.1| gi|16128001|ref|NP_414548.1| 476 476 100 100 476 100.00 959 +gi|16128002|ref|NP_414549.1| gi|16128002|ref|NP_414549.1| 317 317 100 100 317 100.00 648 +gi|16128003|ref|NP_414550.1| gi|16128003|ref|NP_414550.1| 195 195 100 100 195 100.00 397 +gi|16128004|ref|NP_414551.1| gi|16128004|ref|NP_414551.1| 188 188 100 100 188 100.00 365 diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/README.rst --- a/tools/blast_rbh/README.rst Tue May 20 06:33:08 2014 -0400 +++ b/tools/blast_rbh/README.rst Wed May 21 06:33:50 2014 -0400 @@ -62,7 +62,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use the following command from the Galaxy root folder:: - $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular + $ tar -czf blast_rbh.tar.gz tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.xml tools/blast_rbh/blast_rbh.py tools/blast_rbh/repository_dependencies.xml test-data/rhodopsin_nucs.fasta test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta test-data/k12_edited_proteins.fasta test-data/k12_ten_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastp_k12.tabular Check this worked:: @@ -75,11 +75,14 @@ test-data/rhodopsin_proteins.fasta test-data/three_human_mRNA.fasta test-data/four_human_proteins.fasta + test-data/k12_edited_proteins.fasta + test-data/k12_ten_proteins.fasta test-data/rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular test-data/rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular test-data/rbh_blastp_four_human_vs_rhodopsin_proteins.tabular test-data/rbh_none.tabular test-data/rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular + test-data/rbh_blastp_k12.tabular Licence (MIT) diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/blast_rbh.py --- a/tools/blast_rbh/blast_rbh.py Tue May 20 06:33:08 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Wed May 21 06:33:50 2014 -0400 @@ -99,6 +99,75 @@ c_qlen = 5 c_length = 6 +tie_warning = 0 + +def best_hits(blast_tabular): + """Iterate over BLAST tabular output, returns best hits as 2-tuples. + + Each return value is (query name, tuple of value for the best hit). + + Tied best hits to different sequences are NOT returned. + + One hit is returned for tied best hits to the same sequence + (e.g. repeated domains). + """ + global tie_warning + current = None + best_score = None + best = None + with open(blast_tabular) as h: + for line in h: + if line.startswith("#"): + continue + parts = line.rstrip("\n").split("\t") + if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: + continue + a = parts[c_query] + b = parts[c_match] + score = float(parts[c_score]) + qlen = int(parts[c_qlen]) + length = int(parts[c_length]) + #print("Considering hit for %s to %s with score %s..." % (a, b, score)) + if current is None: + #First hit + assert best is None + assert best_score is None + best = dict() + #Now append this hit... + elif a != current: + #New hit + if len(best) == 1: + #Unambiguous (no tied matches) + yield current, list(best.values())[0] + else: + #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) + tie_warning += 1 + best = dict() + #Now append this hit... + elif score < best_score: + #print("No improvement for %s, %s < %s" % (a, score, best_score)) + continue + elif score > best_score: + #This is better, discard old best + best = dict() + #Now append this hit... + else: + #print("Tied best hits for %s" % a) + assert best_score == score + #Now append this hit... + current = a + best_score = score + #This will collapse two equally good hits to the same target (e.g. duplicated domain) + best[b] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) + #Best hit for final query, if unambiguous: + if current is not None: + if len(best)==1: + yield current, list(best.values())[0] + else: + #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) + tie_warning += 1 + + #print("Starting...") #TODO - Report log in case of error? run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) @@ -111,70 +180,40 @@ % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) #print("BLAST species B vs species A done.") -best_a_vs_b = dict() -for line in open(a_vs_b): - if line.startswith("#"): continue - parts = line.rstrip("\n").split("\t") - a = parts[c_query] - b = parts[c_match] - if float(parts[c_identity]) < min_identity or float(parts[c_coverage]) < min_coverage: - continue - score = float(parts[c_score]) - qlen = int(parts[c_qlen]) - length = int(parts[c_length]) - if a not in best_a_vs_b or score > best_a_vs_b[a][1]: - # Store bitscore as a float for sorting, original string for output - best_a_vs_b[a] = (b, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) -b_short_list = set(v[0] for v in best_a_vs_b.values()) -best_b_vs_a = dict() -for line in open(b_vs_a): - if line.startswith("#"): continue - parts = line.rstrip("\n").split("\t") - b = parts[c_query] - a = parts[c_match] - if a not in best_a_vs_b: - continue - #stop_err("The A-vs-B file does not have A-ID %r found in B-vs-A file" % a) - if b not in b_short_list: continue - if float(parts[c_identity])< min_identity or float(parts[c_coverage]) < min_coverage: - continue - score = float(parts[c_score]) - qlen = int(parts[c_qlen]) - length = int(parts[c_length]) - if b not in best_b_vs_a or score > best_b_vs_a[b][1]: - # Store bitscore as a float for sorting, original string for output - best_b_vs_a[b] = (a, score, parts[c_score], parts[c_identity], parts[c_coverage], qlen, length) -#TODO - Preserve order from A vs B? -a_short_list = sorted(set(v[0] for v in best_b_vs_a.values())) +best_b_vs_a = dict(best_hits(b_vs_a)) + count = 0 outfile = open(out_file, 'w') outfile.write("#A_id\tB_id\tA_length\tB_length\tA_qcovhsp\tB_qcovhsp\tlength\tpident\tbitscore\n") -for a in a_short_list: - a_values = best_a_vs_b[a] - b = a_values[0] - b_values = best_b_vs_a[b] - if b in best_b_vs_a and a == b_values[0]: - #Start with IDs, lengths [5], coverage [4] - values = [a, b, a_values[5], b_values[5], a_values[4], b_values[4]] - #Length was an integer so don't care about original string - values.append(min(a_values[6], b_values[6])) - #Output the original string versions of the scores - #[3] = identity as string - if float(a_values[3]) < float(b_values[3]): - values.append(a_values[3]) - else: - values.append(b_values[3]) - #[1] = bitscore as float, [2] = bitscore as original string - if a_values[1] < b_values[1]: - values.append(a_values[2]) - else: - values.append(b_values[2]) - outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) - count += 1 +for a, (b, a_score_float, a_score_str, a_identity_str, a_coverage_str, a_qlen, a_length) in best_hits(a_vs_b): + if b not in best_b_vs_a: + #Match b has no best hit + continue + a2, b_score_float, b_score_str, b_identity_str, b_coverage_str, b_qlen, b_length = best_b_vs_a[b] + if a != a2: + #Not an RBH + continue + #Start with IDs, lengths, coverage + values = [a, b, a_qlen, b_qlen, a_coverage_str, b_coverage_str] + #Alignment length was an integer so don't care about original string + values.append(min(a_length, b_length)) + #Output the original string versions of the scores + if float(a_identity_str) < float(b_identity_str): + values.append(a_identity_str) + else: + values.append(b_identity_str) + if a_score_float < b_score_float: + values.append(a_score_str) + else: + values.append(b_score_str) + outfile.write("%s\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\n" % tuple(values)) + count += 1 outfile.close() print "Done, %i RBH found" % count +if tie_warning: + sys.stderr.write("Warning: Sequencies with tied best hits found, you may have duplicates/clusters\n") #Remove temp files... shutil.rmtree(base_path) diff -r c84b6c21e3d4 -r e47960bcdccb tools/blast_rbh/blast_rbh.xml --- a/tools/blast_rbh/blast_rbh.xml Tue May 20 06:33:08 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Wed May 21 06:33:50 2014 -0400 @@ -125,6 +125,25 @@ + + + + + + + + + + + + + + + + + + + **What it does** @@ -159,6 +178,11 @@ The coverage values are given by the HSP alignment length divided by the sequence length (adjusted by a factor of three for TBLASTX). +Note that if a sequence has equally scoring top BLAST matches to multiple +sequence in the other file, it will not be considered for an RBH. This +can happen following gene duplication, or for (near) identical gene +duplicates. + .. class:: warningmark **Note**