# HG changeset patch
# User peterjc
# Date 1550847503 18000
# Node ID 3f4e7df708f2acb3d73a786d904c0682988cd154
# Parent 0f4669378764b8e9e53ab01b0cfc5234df147fdd
planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/ncbi_blast_plus commit 960f4708be7cdd486e4569e7b44eb856b2cad79d-dirty
diff -r 0f4669378764 -r 3f4e7df708f2 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Wed Nov 14 06:23:31 2018 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Fri Feb 22 09:58:23 2019 -0500
@@ -81,12 +81,14 @@
else:
from galaxy import eggs # noqa - ignore flake8 F401
import pkg_resources
+
pkg_resources.require("elementtree")
from elementtree import ElementTree
if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
# False positive if user really has a BLAST XML file called 'std' or 'ext'...
- sys.exit("""ERROR: The script API has changed, sorry.
+ sys.exit(
+ """ERROR: The script API has changed, sorry.
Instead of the old style:
@@ -99,7 +101,8 @@
For more information, use:
$ python blastxml_to_tabular.py -h
-""")
+"""
+ )
usage = """usage: %prog [options] blastxml[,...]
@@ -113,16 +116,29 @@
extended column names are supported.
"""
parser = OptionParser(usage=usage)
-parser.add_option('-o', '--output', dest='output', default=None,
- help='output filename (defaults to stdout)',
- metavar="FILE")
-parser.add_option("-c", "--columns", dest="columns", default='std',
- help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
+parser.add_option(
+ "-o",
+ "--output",
+ dest="output",
+ default=None,
+ help="output filename (defaults to stdout)",
+ metavar="FILE",
+)
+parser.add_option(
+ "-c",
+ "--columns",
+ dest="columns",
+ default="std",
+ help="[std|ext|col1,col2,...] standard 12 columns, "
+ "extended 25 columns, or list of column names",
+)
(options, args) = parser.parse_args()
-colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,'
- 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,'
- 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',')
+colnames = (
+ "qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,"
+ "sstart,send,evalue,bitscore,sallseqid,score,nident,positive,"
+ "gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles"
+).split(",")
if len(args) < 1:
sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
@@ -148,7 +164,9 @@
assert set(colnames).issuperset(cols), cols
if not cols:
sys.exit("No columns selected!")
- extended = max(colnames.index(c) for c in cols) >= 12 # Do we need any higher columns?
+ extended = (
+ max(colnames.index(c) for c in cols) >= 12
+ ) # Do we need any higher columns?
del out_fmt
for in_file in args:
@@ -213,7 +231,8 @@
# P56514
# or,
# Subject_1
- # gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]
+ # gi|57163783|ref|NP_001009242.1|
+ # rhodopsin [Felis catus]
# Subject_1
#
# apparently depending on the parse_deflines switch
@@ -225,11 +244,15 @@
# 2
sseqid = hit.findtext("Hit_id").split(None, 1)[0]
hit_def = sseqid + " " + hit.findtext("Hit_def")
- if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
+ if re_default_subject_id.match(sseqid) and sseqid == hit.findtext(
+ "Hit_accession"
+ ):
# Place holder ID, take the first word of the subject definition
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None, 1)[0]
- if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+ if sseqid.startswith(
+ "gnl|BL_ORD_ID|"
+ ) and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
# Alternative place holder ID, again take the first word of hit_def
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None, 1)[0]
@@ -244,27 +267,61 @@
h_seq = hsp.findtext("Hsp_hseq")
m_seq = hsp.findtext("Hsp_midline")
assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
- gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
- len(h_seq.replace('-', ' ').split()) - 1)
+ gapopen = str(
+ len(q_seq.replace("-", " ").split())
+ - 1
+ + len(h_seq.replace("-", " ").split())
+ - 1
+ )
- mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
+ mismatch = (
+ m_seq.count(" ")
+ + m_seq.count("+")
+ - q_seq.count("-")
+ - h_seq.count("-")
+ )
# TODO - Remove this alternative mismatch calculation and test
# once satisifed there are no problems
- expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
- if q == h or q == "-" or h == "-")
+ expected_mismatch = len(q_seq) - sum(
+ 1
+ for q, h in zip(q_seq, h_seq)
+ if q == h or q == "-" or h == "-"
+ )
xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
- if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
- sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
- % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
- int(mismatch), expected_mismatch))
+ if not (
+ expected_mismatch - q_seq.count("X")
+ <= int(mismatch)
+ <= expected_mismatch + xx
+ ):
+ sys.exit(
+ "%s vs %s mismatches, expected %i <= %i <= %i"
+ % (
+ qseqid,
+ sseqid,
+ expected_mismatch - q_seq.count("X"),
+ int(mismatch),
+ expected_mismatch,
+ )
+ )
# TODO - Remove this alternative identity calculation and test
# once satisifed there are no problems
expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
- if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
- sys.exit("%s vs %s identities, expected %i <= %i <= %i"
- % (qseqid, sseqid, expected_identity, int(nident),
- expected_identity + q_seq.count("X")))
+ if not (
+ expected_identity - xx
+ <= int(nident)
+ <= expected_identity + q_seq.count("X")
+ ):
+ sys.exit(
+ "%s vs %s identities, expected %i <= %i <= %i"
+ % (
+ qseqid,
+ sseqid,
+ expected_identity,
+ int(nident),
+ expected_identity + q_seq.count("X"),
+ )
+ )
evalue = hsp.findtext("Hsp_evalue")
if evalue == "0":
@@ -280,53 +337,66 @@
# Note BLAST does not round to nearest int, it truncates
bitscore = "%i" % bitscore
- values = [qseqid,
- sseqid,
- pident,
- length, # hsp.findtext("Hsp_align-len")
- str(mismatch),
- gapopen,
- hsp.findtext("Hsp_query-from"), # qstart,
- hsp.findtext("Hsp_query-to"), # qend,
- hsp.findtext("Hsp_hit-from"), # sstart,
- hsp.findtext("Hsp_hit-to"), # send,
- evalue, # hsp.findtext("Hsp_evalue") in scientific notation
- bitscore, # hsp.findtext("Hsp_bit-score") rounded
- ]
+ values = [
+ qseqid,
+ sseqid,
+ pident,
+ length, # hsp.findtext("Hsp_align-len")
+ str(mismatch),
+ gapopen,
+ hsp.findtext("Hsp_query-from"), # qstart,
+ hsp.findtext("Hsp_query-to"), # qend,
+ hsp.findtext("Hsp_hit-from"), # sstart,
+ hsp.findtext("Hsp_hit-to"), # send,
+ evalue, # hsp.findtext("Hsp_evalue") in scientific notation
+ bitscore, # hsp.findtext("Hsp_bit-score") rounded
+ ]
if extended:
try:
- sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
- salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
+ sallseqid = ";".join(
+ name.split(None, 1)[0] for name in hit_def.split(" >")
+ )
+ salltitles = "<>".join(
+ name.split(None, 1)[1] for name in hit_def.split(" >")
+ )
except IndexError as e:
- sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
+ sys.exit(
+ "Problem splitting multuple hits?\n%r\n--> %s"
+ % (hit_def, e)
+ )
# print(hit_def, "-->", sallseqid)
positive = hsp.findtext("Hsp_positive")
ppos = "%0.2f" % (100 * float(positive) / float(length))
qframe = hsp.findtext("Hsp_query-frame")
sframe = hsp.findtext("Hsp_hit-frame")
if blast_program == "blastp":
- # Probably a bug in BLASTP that they use 0 or 1 depending on format
+ # Probably a bug in BLASTP that they use 0 or 1
+ # depending on format
if qframe == "0":
qframe = "1"
if sframe == "0":
sframe = "1"
slen = int(hit.findtext("Hit_len"))
- values.extend([sallseqid,
- hsp.findtext("Hsp_score"), # score,
- nident,
- positive,
- hsp.findtext("Hsp_gaps"), # gaps,
- ppos,
- qframe,
- sframe,
- # NOTE - for blastp, XML shows original seq, tabular uses XXX masking
- q_seq,
- h_seq,
- str(qlen),
- str(slen),
- salltitles,
- ])
+ values.extend(
+ [
+ sallseqid,
+ hsp.findtext("Hsp_score"), # score,
+ nident,
+ positive,
+ hsp.findtext("Hsp_gaps"), # gaps,
+ ppos,
+ qframe,
+ sframe,
+ # NOTE - for blastp, XML shows original seq,
+ # tabular uses XXX masking
+ q_seq,
+ h_seq,
+ str(qlen),
+ str(slen),
+ salltitles,
+ ]
+ )
if cols:
# Only a subset of the columns are needed
values = [values[colnames.index(c)] for c in cols]
diff -r 0f4669378764 -r 3f4e7df708f2 tools/ncbi_blast_plus/check_no_duplicates.py
--- a/tools/ncbi_blast_plus/check_no_duplicates.py Wed Nov 14 06:23:31 2018 -0500
+++ b/tools/ncbi_blast_plus/check_no_duplicates.py Fri Feb 22 09:58:23 2019 -0500
@@ -31,7 +31,7 @@
if not magic:
# Empty file, special case
continue
- elif magic == b'\x1f\x8b':
+ elif magic == b"\x1f\x8b":
# Gzipped
handle = gzip.open(filename, "rt")
elif magic[0:1] == b">":
diff -r 0f4669378764 -r 3f4e7df708f2 tools/ncbi_blast_plus/repository_dependencies.xml
--- a/tools/ncbi_blast_plus/repository_dependencies.xml Wed Nov 14 06:23:31 2018 -0500
+++ b/tools/ncbi_blast_plus/repository_dependencies.xml Fri Feb 22 09:58:23 2019 -0500
@@ -1,4 +1,4 @@
-
+
-
-
+
+
\ No newline at end of file
diff -r 0f4669378764 -r 3f4e7df708f2 tools/ncbi_blast_plus/tool_dependencies.xml
--- a/tools/ncbi_blast_plus/tool_dependencies.xml Wed Nov 14 06:23:31 2018 -0500
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml Fri Feb 22 09:58:23 2019 -0500
@@ -1,6 +1,6 @@
-
+
-
+
-
+
\ No newline at end of file