Mercurial > repos > cpt_testbed > functionalworkflow
comparison blast_to_gff3.py @ 0:f678e282b320 draft default tip
"planemo upload"
| author | cpt_testbed |
|---|---|
| date | Fri, 06 May 2022 07:07:23 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f678e282b320 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import argparse | |
| 3 import copy | |
| 4 import logging | |
| 5 import re | |
| 6 import sys | |
| 7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
| 8 from Bio.Blast import NCBIXML | |
| 9 from Bio.Seq import Seq | |
| 10 from Bio.SeqRecord import SeqRecord | |
| 11 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| 12 | |
# Emit INFO-level messages by default; ``log`` is the module-wide logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(name="blast2gff3")

__doc__ = """
Convert BlastXML or Blast 25 Column Table output into GFF3
"""

# NOTE: Biopython FeatureLocation objects are zero-indexed while BLAST
# reports one-indexed coordinates, so a BLAST location of (123, 500) is
# stored as FeatureLocation(122, 500).
def blast2gff3(blast, blastxml=False, blasttab=False, include_seq=False):
    """Dispatch *blast* to the XML or tabular converter.

    Exactly one of ``blastxml`` / ``blasttab`` must be True; an Exception
    is raised when both or neither flag is set.
    """
    if blastxml == blasttab:
        # Both flags set, or neither -- either way the caller was ambiguous.
        if blastxml:
            raise Exception("Cannot provide both blast xml and tabular flag")
        raise Exception("Must provide either blast xml or tabular flag")
    if blastxml:
        return blastxml2gff3(blast, include_seq)
    return blasttsv2gff3(blast, include_seq)
| 32 | |
| 33 | |
def check_bounds(ps, pe, qs, qe):
    """Grow the parent interval (ps, pe) so it envelopes the hsp (qs, qe).

    The start is clamped to 1 (GFF3 coordinates are one-based) and the
    endpoints are returned in (low, high) order.
    """
    ps = min(ps, qs)
    pe = max(pe, qe)
    if ps <= 0:
        ps = 1
    return (min(ps, pe), max(ps, pe))
| 43 | |
| 44 | |
def clean_string(s):
    """Sanitize *s* for safe use in GFF3 attribute values."""
    # Map pipes (GFF-significant) to underscores, then drop every
    # character outside [A-Za-z0-9_ .-].
    return re.sub(r"[^A-Za-z0-9_\ .-]", "", re.sub(r"\|", "_", s))
| 51 | |
| 52 | |
def clean_slist(l):
    """Return a new list with clean_string applied to every element of *l*."""
    return [clean_string(item) for item in l]
| 58 | |
| 59 | |
def blastxml2gff3(blastxml, include_seq=False):
    """Convert Blast XML output into GFF3 SeqRecords.

    Parses *blastxml* with Biopython's NCBIXML parser and yields one
    SeqRecord per hit: a top-level "match" feature covering all HSPs,
    with one "match_part" subfeature per HSP.  When *include_seq* is
    True, the aligned query/subject/match strings are accumulated onto
    the hit's qualifiers across all of its HSPs.
    """
    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        # A more specific SO term (nucleotide_match / protein_match) could be
        # derived from record.application; the generic "match" is used for now.
        match_type = "match"
        collected_records = []

        # Use only the first word of the query definition line as record id.
        recid = record.query
        if " " in recid:
            recid = clean_string(recid[0 : recid.index(" ")])

        for idx_hit, hit in enumerate(record.alignments):
            # Every HSP in a hit must be inspected to find the parent
            # match boundaries.
            rec = SeqRecord("", id=recid)
            parent_match_start = 0
            parent_match_end = 0
            hit_qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": clean_string(hit.hit_id),
                "score": None,
                "length": hit.length,
                "hit_titles": clean_slist(hit.title.split(" >")),
                "hsp_count": len(hit.hsps),
            }
            desc = hit.title.split(" >")[0]
            hit_qualifiers["Name"] = desc
            sub_features = []
            for idx_hsp, hsp in enumerate(hit.hsps):
                if idx_hsp == 0:
                    # Seed the parent boundaries and score from the first HSP.
                    parent_match_start = hsp.query_start
                    parent_match_end = hsp.query_end
                    hit_qualifiers["score"] = hsp.expect
                # Keep the best (smallest) e-value seen across the HSPs.
                hit_qualifiers["score"] = min(hit_qualifiers["score"], hsp.expect)
                hsp_qualifiers = {
                    "ID": "b2g.%s.%s.hsp%s" % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": clean_string(hit.hit_id),
                    "length": hit.length,
                    "hit_titles": clean_slist(hit.title.split(" >")),
                }
                if include_seq:
                    # BUGFIX: the original tested whether the tuple
                    # ("blast_qseq", "blast_sseq", "blast_mseq") was itself a
                    # dict key -- never true -- so the sequences were reset on
                    # every HSP instead of accumulated.  Test the individual
                    # keys instead.
                    if all(
                        k in hit_qualifiers
                        for k in ("blast_qseq", "blast_sseq", "blast_mseq")
                    ):
                        hit_qualifiers.update(
                            {
                                "blast_qseq": hit_qualifiers["blast_qseq"] + hsp.query,
                                "blast_sseq": hit_qualifiers["blast_sseq"] + hsp.sbjct,
                                "blast_mseq": hit_qualifiers["blast_mseq"] + hsp.match,
                            }
                        )
                    else:
                        hit_qualifiers.update(
                            {
                                "blast_qseq": hsp.query,
                                "blast_sseq": hsp.sbjct,
                                "blast_mseq": hsp.match,
                            }
                        )
                # Copy the remaining numeric/positional HSP attributes through
                # as blast_* qualifiers (None when Biopython omits one).
                for prop in (
                    "score",
                    "bits",
                    "identities",
                    "positives",
                    "gaps",
                    "align_length",
                    "strand",
                    "frame",
                    "query_start",
                    "query_end",
                    "sbjct_start",
                    "sbjct_end",
                ):
                    hsp_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                # Grow the parent boundary so it envelopes this HSP.
                parent_match_start, parent_match_end = check_bounds(
                    parent_match_start, parent_match_end, hsp.query_start, hsp.query_end
                )

                # Record the HSP as a "match_part" subfeature.  BLAST is
                # one-indexed, FeatureLocation is zero-indexed: shift start by 1.
                sub_features.append(
                    gffSeqFeature(
                        FeatureLocation(hsp.query_start - 1, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(hsp_qualifiers),
                    )
                )

            # Build the top level seq feature for the hit.
            hit_qualifiers["description"] = "Residue %s..%s hit to %s" % (
                parent_match_start,
                parent_match_end,
                desc,
            )
            top_feature = gffSeqFeature(
                FeatureLocation(parent_match_start - 1, parent_match_end),
                type=match_type,
                strand=0,
                qualifiers=hit_qualifiers,
            )
            # Attach the generated HSP match_parts, ordered by start coordinate.
            top_feature.sub_features = copy.deepcopy(
                sorted(sub_features, key=lambda x: int(x.location.start))
            )
            # Add the hit feature to the record.
            rec.features.append(top_feature)
            rec.annotations = {}
            collected_records.append(rec)

        # With no hits at all, still emit a minimal valid GFF3 header so
        # downstream tools see well-formed output.
        if not len(collected_records):
            print("##gff-version 3\n##sequence-region null 1 4")

        for rec in collected_records:
            yield rec
| 190 | |
| 191 | |
def combine_records(records):
    """Merge records that share the same query-ID/subject-ID combination.

    HSP lines from duplicate query/subject pairs become additional
    ``match_part`` subfeatures of a single parent match feature, whose
    boundaries, best score, description and HSP IDs are updated to cover
    the combined set.  Returns the merged records sorted by parent
    feature start.
    """
    cleaned_records = {}
    for rec in records:
        # Key on the (query, subject) pair.  A tuple key avoids the
        # collisions that plain string concatenation could produce
        # (e.g. "ab"+"c" vs "a"+"bc").
        combo_id = (
            rec.features[0].qualifiers["target"],
            rec.features[0].qualifiers["accession"],
        )
        if combo_id not in cleaned_records:
            # First instance of this query ID + subject ID combination:
            # re-ID it and keep it as the canonical record.
            newid = rec.features[0].qualifiers["ID"] + ".0"
            rec.features[0].qualifiers["ID"] = newid
            rec.features[0].sub_features[0].qualifiers["ID"] = newid + ".hsp0"
            cleaned_records[combo_id] = rec
        else:
            # Query ID + subject ID seen before: fold this record's
            # match_parts into the existing parent feature.
            sub_features = copy.deepcopy(
                cleaned_records[combo_id].features[0].sub_features
            )
            for feat in rec.features[0].sub_features:
                sub_features.append(feat)
            # Keep the best (smallest) e-value.  The scores are stored as
            # strings, so compare numerically (BUGFIX: the original used a
            # plain min(), i.e. lexicographic string comparison).
            cleaned_records[combo_id].features[0].qualifiers["score"] = min(
                cleaned_records[combo_id].features[0].qualifiers["score"],
                rec.features[0].qualifiers["score"],
                key=float,
            )
            # Renumber the HSP IDs in coordinate order and expand the parent
            # boundaries to span every subfeature.
            sub_features = sorted(sub_features, key=lambda x: int(x.location.start))
            new_parent_start = cleaned_records[combo_id].features[0].location.start + 1
            new_parent_end = cleaned_records[combo_id].features[0].location.end
            for idx, feat in enumerate(sub_features):
                feat.qualifiers["ID"] = "%s.hsp%s" % (
                    cleaned_records[combo_id].features[0].qualifiers["ID"],
                    idx,
                )
                new_parent_start, new_parent_end = check_bounds(
                    new_parent_start,
                    new_parent_end,
                    feat.location.start + 1,
                    feat.location.end,
                )
                # Track the best raw HSP score as well (numeric comparison,
                # same BUGFIX as above).
                cleaned_records[combo_id].features[0].qualifiers["score"] = min(
                    cleaned_records[combo_id].features[0].qualifiers["score"],
                    feat.qualifiers["blast_score"],
                    key=float,
                )
            cleaned_records[combo_id].features[0].location = FeatureLocation(
                new_parent_start - 1, new_parent_end
            )
            cleaned_records[combo_id].features[0].qualifiers[
                "description"
            ] = "Residue %s..%s hit to %s" % (
                new_parent_start,
                new_parent_end,
                cleaned_records[combo_id].features[0].qualifiers["Name"],
            )
            # Save the renamed and ordered feature list to the record.
            # BUGFIX: the original first wrote to a misspelled
            # ``subfeatures`` attribute, a silent no-op assignment.
            cleaned_records[combo_id].features[0].sub_features = copy.deepcopy(
                sub_features
            )
    return sorted(
        cleaned_records.values(), key=lambda x: int(x.features[0].location.start)
    )
| 260 | |
| 261 | |
def blasttsv2gff3(blasttsv, include_seq=False):
    """Convert Blast 25-column tabular output into GFF3 SeqRecords.

    Iterates the lines of *blasttsv*, builds one SeqRecord per non-comment
    line (one match feature with a single match_part), merges records that
    share a query/subject pair via combine_records(), and yields the
    merged records.  ``include_seq`` is accepted for interface symmetry
    with blastxml2gff3 but is not used here.
    """

    # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
    # match_type = { # Currently we can only handle BLASTN, BLASTP
    #     "BLASTN": "nucleotide_match",
    #     "BLASTP": "protein_match",
    # }.get(type, "match")
    match_type = "match"

    # Column layout of the expected 25-column blast tabular format.
    columns = [
        "qseqid",  # 01 Query Seq-id (ID of your sequence)
        "sseqid",  # 02 Subject Seq-id (ID of the database hit)
        "pident",  # 03 Percentage of identical matches
        "length",  # 04 Alignment length
        "mismatch",  # 05 Number of mismatches
        "gapopen",  # 06 Number of gap openings
        "qstart",  # 07 Start of alignment in query
        "qend",  # 08 End of alignment in query
        "sstart",  # 09 Start of alignment in subject (database hit)
        "send",  # 10 End of alignment in subject (database hit)
        "evalue",  # 11 Expectation value (E-value)
        "bitscore",  # 12 Bit score
        "sallseqid",  # 13 All subject Seq-id(s), separated by a ';'
        "score",  # 14 Raw score
        "nident",  # 15 Number of identical matches
        "positive",  # 16 Number of positive-scoring matches
        "gaps",  # 17 Total number of gaps
        "ppos",  # 18 Percentage of positive-scoring matches
        "qframe",  # 19 Query frame
        "sframe",  # 20 Subject frame
        "qseq",  # 21 Aligned part of query sequence
        "sseq",  # 22 Aligned part of subject sequence
        "qlen",  # 23 Query sequence length
        "slen",  # 24 Subject sequence length
        "salltitles",  # 25 All subject title(s), separated by a '<>'
    ]
    collected_records = []
    for record_idx, record in enumerate(blasttsv):
        # Skip comment lines (e.g. blast's "# ..." header lines).
        if record.startswith("#"):
            continue

        # Map column names to the raw (string) field values of this line.
        dc = {k: v for (k, v) in zip(columns, (x.strip() for x in record.split("\t")))}

        rec = SeqRecord("", id=dc["qseqid"])

        feature_id = "b2g.%s" % (record_idx)
        # NOTE: these qualifiers are built from the raw string values --
        # "score" holds the e-value string, and is compared numerically
        # later in combine_records.
        # NOTE(review): "length" is set from qlen (query length) here, but
        # from hit.length in the XML path -- confirm which is intended.
        hit_qualifiers = {
            "ID": feature_id,
            "Name": (dc["salltitles"].split("<>")[0]),
            "description": "Residue {sstart}..{send} hit to {x}".format(
                x=dc["salltitles"].split("<>")[0], **dc
            ),
            "source": "blast",
            "score": dc["evalue"],
            "accession": clean_string(dc["sseqid"]),
            "length": dc["qlen"],
            "hit_titles": clean_slist(dc["salltitles"].split("<>")),
            "target": clean_string(dc["qseqid"]),
        }
        hsp_qualifiers = {"source": "blast"}
        for key in dc.keys():
            # Add the remaining BLAST info to the GFF qualifiers
            if key in ("salltitles", "sallseqid", "sseqid", "qseqid", "qseq", "sseq",):
                continue
            hsp_qualifiers["blast_%s" % key] = clean_string(dc[key])

        # Below numbers stored as strings, convert to proper form
        # (this happens AFTER the qualifier dicts above were built, so the
        # qualifiers deliberately keep the string forms).
        for (
            integer_numerical_key
        ) in "gapopen gaps length mismatch nident positive qend qframe qlen qstart score send sframe slen sstart".split(
            " "
        ):
            dc[integer_numerical_key] = int(dc[integer_numerical_key])

        for float_numerical_key in "bitscore evalue pident ppos".split(" "):
            dc[float_numerical_key] = float(dc[float_numerical_key])

        parent_match_start = dc["qstart"]
        parent_match_end = dc["qend"]

        # Clamps the start to >= 1; with identical parent/query intervals
        # this is otherwise a no-op.
        parent_match_start, parent_match_end = check_bounds(
            parent_match_start, parent_match_end, dc["qstart"], dc["qend"]
        )

        # The ``match`` feature will hold one or more ``match_part``s.
        # BLAST coordinates are one-indexed; FeatureLocation is zero-indexed.
        top_feature = gffSeqFeature(
            FeatureLocation(
                min(parent_match_start, parent_match_end) - 1,
                max(parent_match_start, parent_match_end),
            ),
            type=match_type,
            strand=0,
            qualifiers=hit_qualifiers,
        )
        top_feature.sub_features = []
        # There is a possibility of multiple lines containing the HSPS
        # for the same hit.
        # Unlike the parent feature, ``match_part``s have sources.
        hsp_qualifiers["ID"] = clean_string(dc["sseqid"])
        match_part_start = dc["qstart"]
        match_part_end = dc["qend"]

        top_feature.sub_features.append(
            gffSeqFeature(
                FeatureLocation(
                    min(match_part_start, match_part_end) - 1,
                    max(match_part_start, match_part_end),
                ),
                type="match_part",
                strand=0,
                qualifiers=copy.deepcopy(hsp_qualifiers),
            )
        )
        top_feature.sub_features = sorted(
            top_feature.sub_features, key=lambda x: int(x.location.start)
        )
        rec.features = [top_feature]
        rec.annotations = {}
        collected_records.append(rec)

    # Merge duplicate query/subject pairs into single multi-HSP features.
    collected_records = combine_records(collected_records)
    # With no data lines at all, emit a minimal valid GFF3 header.
    if not len(collected_records):
        print("##gff-version 3\n##sequence-region null 1 4")
    for rec in collected_records:
        yield rec
| 387 | |
| 388 | |
if __name__ == "__main__":
    # Command-line entry point: parse arguments and stream GFF3 to stdout.
    arg_parser = argparse.ArgumentParser(
        description="Convert BlastP or BlastN output to GFF3, must provide XML or Tabular output",
        epilog="",
    )
    arg_parser.add_argument(
        "blast",
        type=argparse.FileType("r"),
        help="Blast XML or 25 Column Tabular Output file",
    )
    # Boolean switches selecting the input format and options.
    for flag, help_text in (
        ("--blastxml", "Process file as Blast XML Output"),
        ("--blasttab", "Process file as Blast 25 Column Tabular Output"),
        ("--include_seq", "Include sequence, only used for Blast XML"),
    ):
        arg_parser.add_argument(flag, action="store_true", help=help_text)
    options = arg_parser.parse_args()

    # Only write records that actually carry features.
    for record in blast2gff3(**vars(options)):
        if len(record.features):
            gffWrite([record], sys.stdout)
