blastxml_to_gapped_gff3: blastxml_to_gapped

comparison blastxml_to_gapped_gff3.py @ 2:2c71c9b0214c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 908f16ea4eb082227437dc93e06e8cb742f5a257

author	iuc
date	Wed, 15 Nov 2017 15:14:02 -0500
parents	f9812943a3c6
children

comparison

equal deleted inserted replaced

-:f9812943a3c6
+:2c71c9b0214c
 import logging
 import re
 import sys
 from BCBio import GFF
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(name='blastxml2gff3')
-__author__ = "Eric Rasche"
-__version__ = "0.4.0"
-__maintainer__ = "Eric Rasche"
-__email__ = "esr@tamu.edu"
 __doc__ = """
 BlastXML files, when transformed to GFF3, do not normally show gaps in the
 blast hits. This tool aims to fill that "gap".
 """
-def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
+def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
 from Bio.Blast import NCBIXML
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from Bio.SeqFeature import SeqFeature, FeatureLocation
 blast_records = NCBIXML.parse(blastxml)
-records = []
+for idx_record, record in enumerate(blast_records):
-for record in blast_records:
 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
 match_type = {  # Currently we can only handle BLASTN, BLASTP
 'BLASTN': 'nucleotide_match',
 'BLASTP': 'protein_match',
 }.get(record.application, 'match')
-rec = SeqRecord(Seq("ACTG"), id=record.query)
+recid = record.query
-for hit in record.alignments:
+if ' ' in recid:
-for hsp in hit.hsps:
+recid = recid[0:recid.index(' ')]
+rec = SeqRecord(Seq("ACTG"), id=recid)
+for idx_hit, hit in enumerate(record.alignments):
+for idx_hsp, hsp in enumerate(hit.hsps):
 qualifiers = {
+"ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
 "source": "blast",
 "score": hsp.expect,
 "accession": hit.accession,
 "hit_id": hit.hit_id,
 "length": hit.length,
-"hit_titles": hit.title.split(' >')
+"hit_titles": hit.title.split(' >'),
 }
+if include_seq:
+qualifiers.update({
+'blast_qseq': hsp.query,
+'blast_sseq': hsp.sbjct,
+'blast_mseq': hsp.match,
+})
+for prop in ('score', 'bits', 'identities', 'positives',
+'gaps', 'align_length', 'strand', 'frame',
+'query_start', 'query_end', 'sbjct_start',
+'sbjct_end'):
+qualifiers['blast_' + prop] = getattr(hsp, prop, None)
 desc = hit.title.split(' >')[0]
 qualifiers['description'] = desc[desc.index(' '):]
 # This required a fair bit of sketching out/match to figure out
 # the first time.
 # may be longer than the parent feature, so we use the supplied
 # subject/hit length to calculate the real ending of the target
 # protein.
 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
-# However, if the user requests that we trim the feature, then
+# If we trim the left end, we need to trim without losing information.
-# we need to cut the ``match`` start to 0 to match the parent feature.
+used_parent_match_start = parent_match_start
-# We'll also need to cut the end to match the query's end. It (maybe)
-# should be the feature end? But we don't have access to that data, so
-# We settle for this.
 if trim:
 if parent_match_start < 1:
-parent_match_start = 0
+used_parent_match_start = 0
 if trim or trim_end:
 if parent_match_end > hsp.query_end:
 parent_match_end = hsp.query_end + 1
 # The ``match`` feature will hold one or more ``match_part``s
 top_feature = SeqFeature(
-FeatureLocation(parent_match_start, parent_match_end),
+FeatureLocation(used_parent_match_start, parent_match_end),
 type=match_type, strand=0,
 qualifiers=qualifiers
 )
 # Unlike the parent feature, ``match_part``s have sources.
 part_qualifiers = {
 "source": "blast",
 }
 top_feature.sub_features = []
-for start, end, cigar in generate_parts(hsp.query, hsp.match,
+for idx_part, (start, end, cigar) in \
-hsp.sbjct,
+enumerate(generate_parts(hsp.query, hsp.match,
-ignore_under=min_gap):
+hsp.sbjct,
+ignore_under=min_gap)):
 part_qualifiers['Gap'] = cigar
-part_qualifiers['ID'] = hit.hit_id
+part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
-if trim:
+# Otherwise, we have to account for the subject start's location
-# If trimming, then we start relative to the
+match_part_start = parent_match_start + hsp.sbjct_start + start - 1
-# match's start
-match_part_start = parent_match_start + start
-else:
-# Otherwise, we have to account for the subject start's location
-match_part_start = parent_match_start + hsp.sbjct_start + start - 1
 # We used to use hsp.align_length here, but that includes
 # gaps in the parent sequence
 #
 # Furthermore align_length will give calculation errors in weird places
 qualifiers=copy.deepcopy(part_qualifiers))
 )
 rec.features.append(top_feature)
 rec.annotations = {}
-records.append(rec)
+yield rec
-return records
 def __remove_query_gaps(query, match, subject):
 """remove positions in all three based on gaps in query
 return ""
 if __name__ == '__main__':
 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
-parser.add_argument('blastxml', type=open, help='Blast XML Output')
+parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
+parser.add_argument('--include_seq', action='store_true', help='Include sequence')
 args = parser.parse_args()
-result = blastxml2gff3(**vars(args))
+for rec in blastxml2gff3(**vars(args)):
-GFF.write(result, sys.stdout)
+if len(rec.features):
+GFF.write([rec], sys.stdout)

Mercurial > repos > iuc > blastxml_to_gapped_gff3

comparison blastxml_to_gapped_gff3.py @ 2:2c71c9b0214c draft default tip