Mercurial > repos > cpt_testbed > functionalworkflow
comparison intron_detection.py @ 0:f678e282b320 draft default tip
"planemo upload"
| author | cpt_testbed |
|---|---|
| date | Fri, 06 May 2022 07:07:23 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f678e282b320 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import re | |
| 4 import itertools | |
| 5 import argparse | |
| 6 import hashlib | |
| 7 import copy | |
| 8 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
| 9 from Bio.Blast import NCBIXML | |
| 10 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| 11 from gff3 import feature_lambda | |
| 12 from collections import OrderedDict | |
| 13 import logging | |
| 14 | |
| 15 logging.basicConfig(level=logging.DEBUG) | |
| 16 log = logging.getLogger() | |
| 17 | |
| 18 | |
| 19 def parse_xml(blastxml, thresh): | |
| 20 """ Parses xml file to get desired info (genes, hits, etc) """ | |
| 21 blast = [] | |
| 22 discarded_records = 0 | |
| 23 totLen = 0 | |
| 24 for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1): | |
| 25 blast_gene = [] | |
| 26 align_num = 0 | |
| 27 for alignment in blast_record.alignments: | |
| 28 align_num += 1 | |
| 29 # hit_gis = alignment.hit_id + alignment.hit_def | |
| 30 # gi_nos = [str(gi) for gi in re.findall('(?<=gi\|)\d{9,10}', hit_gis)] | |
| 31 gi_nos = str(alignment.accession) | |
| 32 | |
| 33 for hsp in alignment.hsps: | |
| 34 x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start) | |
| 35 if x < thresh: | |
| 36 discarded_records += 1 | |
| 37 continue | |
| 38 nice_name = blast_record.query | |
| 39 | |
| 40 if " " in nice_name: | |
| 41 nice_name = nice_name[0 : nice_name.index(" ")] | |
| 42 | |
| 43 blast_gene.append( | |
| 44 { | |
| 45 "gi_nos": gi_nos, | |
| 46 "sbjct_length": alignment.length, | |
| 47 "query_length": blast_record.query_length, | |
| 48 "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end), | |
| 49 "query_range": (hsp.query_start, hsp.query_end), | |
| 50 "name": nice_name, | |
| 51 "evalue": hsp.expect, | |
| 52 "identity": hsp.identities, | |
| 53 "identity_percent": x, | |
| 54 "hit_num": align_num, | |
| 55 "iter_num": iter_num, | |
| 56 "match_id": alignment.title.partition(">")[0], | |
| 57 } | |
| 58 ) | |
| 59 | |
| 60 blast.append(blast_gene) | |
| 61 totLen += len(blast_gene) | |
| 62 log.debug("parse_blastxml %s -> %s", totLen + discarded_records, totLen) | |
| 63 return blast | |
| 64 | |
| 65 | |
| 66 def filter_lone_clusters(clusters): | |
| 67 """ Removes all clusters with only one member and those with no hits """ | |
| 68 filtered_clusters = {} | |
| 69 for key in clusters: | |
| 70 if len(clusters[key]) > 1 and len(key) > 0: | |
| 71 filtered_clusters[key] = clusters[key] | |
| 72 log.debug("filter_lone_clusters %s -> %s", len(clusters), len(filtered_clusters)) | |
| 73 return filtered_clusters | |
| 74 | |
| 75 | |
| 76 def test_true(feature, **kwargs): | |
| 77 return True | |
| 78 | |
| 79 | |
| 80 def parse_gff(gff3): | |
| 81 """ Extracts strand and start location to be used in cluster filtering """ | |
| 82 log.debug("parse_gff3") | |
| 83 gff_info = {} | |
| 84 _rec = None | |
| 85 for rec in gffParse(gff3): | |
| 86 endBase = len(rec.seq) | |
| 87 | |
| 88 _rec = rec | |
| 89 _rec.annotations = {} | |
| 90 for feat in feature_lambda(rec.features, test_true, {}, subfeatures=False): | |
| 91 if feat.type == "CDS": | |
| 92 if "Name" in feat.qualifiers.keys(): | |
| 93 CDSname = feat.qualifiers["Name"] | |
| 94 else: | |
| 95 CDSname = feat.qualifiers["ID"] | |
| 96 gff_info[feat.id] = { | |
| 97 "strand": feat.strand, | |
| 98 "start": feat.location.start, | |
| 99 "end": feat.location.end, | |
| 100 "loc": feat.location, | |
| 101 "feat": feat, | |
| 102 "name": CDSname, | |
| 103 } | |
| 104 | |
| 105 gff_info = OrderedDict(sorted(gff_info.items(), key=lambda k: k[1]["start"])) | |
| 106 # endBase = 0 | |
| 107 for i, feat_id in enumerate(gff_info): | |
| 108 gff_info[feat_id].update({"index": i}) | |
| 109 if gff_info[feat_id]["loc"].end > endBase: | |
| 110 endBase = gff_info[feat_id]["loc"].end | |
| 111 | |
| 112 return dict(gff_info), _rec, endBase | |
| 113 | |
| 114 | |
| 115 def all_same(genes_list): | |
| 116 """ Returns True if all gene names in cluster are identical """ | |
| 117 return all(gene["name"] == genes_list[0]["name"] for gene in genes_list[1:]) | |
| 118 | |
| 119 | |
| 120 def remove_duplicates(clusters): | |
| 121 """ Removes clusters with multiple members but only one gene name """ | |
| 122 filtered_clusters = {} | |
| 123 for key in clusters: | |
| 124 if all_same(clusters[key]): | |
| 125 continue | |
| 126 else: | |
| 127 filtered_clusters[key] = clusters[key] | |
| 128 log.debug("remove_duplicates %s -> %s", len(clusters), len(filtered_clusters)) | |
| 129 return filtered_clusters | |
| 130 | |
| 131 | |
| 132 class IntronFinder(object): | |
| 133 """ IntronFinder objects are lists that contain a list of hits for every gene """ | |
| 134 | |
| 135 def __init__(self, gff3, blastp, thresh): | |
| 136 self.blast = [] | |
| 137 self.clusters = {} | |
| 138 self.gff_info = {} | |
| 139 self.length = 0 | |
| 140 | |
| 141 (self.gff_info, self.rec, self.length) = parse_gff(gff3) | |
| 142 self.blast = parse_xml(blastp, thresh) | |
| 143 | |
| 144 def create_clusters(self): | |
| 145 """ Finds 2 or more genes with matching hits """ | |
| 146 clusters = {} | |
| 147 for gene in self.blast: | |
| 148 for hit in gene: | |
| 149 if " " in hit["gi_nos"]: | |
| 150 hit["gi_nos"] = hit["gi_nos"][0 : hit["gi_nos"].index(" ")] | |
| 151 | |
| 152 nameCheck = hit["gi_nos"] | |
| 153 if nameCheck == "": | |
| 154 continue | |
| 155 name = hashlib.md5((nameCheck).encode()).hexdigest() | |
| 156 | |
| 157 if name in clusters: | |
| 158 if hit not in clusters[name]: | |
| 159 clusters[name].append(hit) | |
| 160 else: | |
| 161 clusters[name] = [hit] | |
| 162 log.debug("create_clusters %s -> %s", len(self.blast), len(clusters)) | |
| 163 self.clusters = filter_lone_clusters(clusters) | |
| 164 | |
| 165 def check_strand(self): | |
| 166 """ filters clusters for genes on the same strand """ | |
| 167 filtered_clusters = {} | |
| 168 for key in self.clusters: | |
| 169 pos_strand = [] | |
| 170 neg_strand = [] | |
| 171 for gene in self.clusters[key]: | |
| 172 if self.gff_info[gene["name"]]["strand"] == 1: | |
| 173 pos_strand.append(gene) | |
| 174 else: | |
| 175 neg_strand.append(gene) | |
| 176 if len(pos_strand) == 0 or len(neg_strand) == 0: | |
| 177 filtered_clusters[key] = self.clusters[key] | |
| 178 else: | |
| 179 if len(pos_strand) > 1: | |
| 180 filtered_clusters[key + "_+1"] = pos_strand | |
| 181 if len(neg_strand) > 1: | |
| 182 filtered_clusters[key + "_-1"] = neg_strand | |
| 183 | |
| 184 return filtered_clusters | |
| 185 | |
| 186 def check_gene_gap(self, maximum=10000): | |
| 187 filtered_clusters = {} | |
| 188 for key in self.clusters: | |
| 189 hits_lists = [] | |
| 190 gene_added = False | |
| 191 for gene in self.clusters[key]: | |
| 192 for hits in hits_lists: | |
| 193 for hit in hits: | |
| 194 lastStart = max( | |
| 195 self.gff_info[gene["name"]]["start"], | |
| 196 self.gff_info[hit["name"]]["start"], | |
| 197 ) | |
| 198 lastEnd = max( | |
| 199 self.gff_info[gene["name"]]["end"], | |
| 200 self.gff_info[hit["name"]]["end"], | |
| 201 ) | |
| 202 firstEnd = min( | |
| 203 self.gff_info[gene["name"]]["end"], | |
| 204 self.gff_info[hit["name"]]["end"], | |
| 205 ) | |
| 206 firstStart = min( | |
| 207 self.gff_info[gene["name"]]["start"], | |
| 208 self.gff_info[hit["name"]]["start"], | |
| 209 ) | |
| 210 if ( | |
| 211 lastStart - firstEnd <= maximum | |
| 212 or self.length - lastEnd + firstStart <= maximum | |
| 213 ): | |
| 214 hits.append(gene) | |
| 215 gene_added = True | |
| 216 break | |
| 217 if not gene_added: | |
| 218 hits_lists.append([gene]) | |
| 219 | |
| 220 for i, hits in enumerate(hits_lists): | |
| 221 if len(hits) >= 2: | |
| 222 filtered_clusters[key + "_" + str(i)] = hits | |
| 223 # for i in filtered_clusters: | |
| 224 # print(i) | |
| 225 # print(filtered_clusters[i]) | |
| 226 log.debug("check_gene_gap %s -> %s", len(self.clusters), len(filtered_clusters)) | |
| 227 | |
| 228 return remove_duplicates( | |
| 229 filtered_clusters | |
| 230 ) # call remove_duplicates somewhere else? | |
| 231 | |
| 232 # maybe figure out how to merge with check_gene_gap? | |
| 233 # def check_seq_gap(): | |
| 234 | |
| 235 # also need a check for gap in sequence coverage? | |
| 236 def check_seq_overlap(self, minimum=-1): | |
| 237 filtered_clusters = {} | |
| 238 for key in self.clusters: | |
| 239 add_cluster = True | |
| 240 sbjct_ranges = [] | |
| 241 query_ranges = [] | |
| 242 for gene in self.clusters[key]: | |
| 243 sbjct_ranges.append(gene["sbjct_range"]) | |
| 244 query_ranges.append(gene["query_range"]) | |
| 245 | |
| 246 combinations = list(itertools.combinations(sbjct_ranges, 2)) | |
| 247 | |
| 248 for pair in combinations: | |
| 249 overlap = len( | |
| 250 set(range(pair[0][0], pair[0][1])) | |
| 251 & set(range(pair[1][0], pair[1][1])) | |
| 252 ) | |
| 253 minPair = pair[0] | |
| 254 maxPair = pair[1] | |
| 255 | |
| 256 if minPair[0] > maxPair[0]: | |
| 257 minPair = pair[1] | |
| 258 maxPair = pair[0] | |
| 259 elif minPair[0] == maxPair[0] and minPair[1] > maxPair[1]: | |
| 260 minPair = pair[1] | |
| 261 maxPair = pair[0] | |
| 262 if overlap > 0: | |
| 263 dist1 = maxPair[0] - minPair[0] | |
| 264 else: | |
| 265 dist1 = abs(maxPair[0] - minPair[1]) | |
| 266 | |
| 267 if minimum < 0: | |
| 268 if overlap > (minimum * -1): | |
| 269 # print("Rejcting: Neg min but too much overlap: " + str(pair)) | |
| 270 add_cluster = False | |
| 271 elif minimum == 0: | |
| 272 if overlap > 0: | |
| 273 # print("Rejcting: 0 min and overlap: " + str(pair)) | |
| 274 add_cluster = False | |
| 275 elif overlap > 0: | |
| 276 # print("Rejcting: Pos min and overlap: " + str(pair)) | |
| 277 add_cluster = False | |
| 278 | |
| 279 if (dist1 < minimum) and (minimum >= 0): | |
| 280 # print("Rejcting: Dist failure: " + str(pair) + " D1: " + dist1) | |
| 281 add_cluster = False | |
| 282 # if add_cluster: | |
| 283 # print("Accepted: " + str(pair) + " D1: " + str(dist1) + " Ov: " + str(overlap)) | |
| 284 if add_cluster: | |
| 285 | |
| 286 filtered_clusters[key] = self.clusters[key] | |
| 287 | |
| 288 log.debug( | |
| 289 "check_seq_overlap %s -> %s", len(self.clusters), len(filtered_clusters) | |
| 290 ) | |
| 291 # print(self.clusters) | |
| 292 return filtered_clusters | |
| 293 | |
| 294 def cluster_report(self): | |
| 295 condensed_report = {} | |
| 296 for key in self.clusters: | |
| 297 for gene in self.clusters[key]: | |
| 298 if gene["name"] in condensed_report: | |
| 299 condensed_report[gene["name"]].append(gene["sbjct_range"]) | |
| 300 else: | |
| 301 condensed_report[gene["name"]] = [gene["sbjct_range"]] | |
| 302 return condensed_report | |
| 303 | |
| 304 def cluster_report_2(self): | |
| 305 condensed_report = {} | |
| 306 for key in self.clusters: | |
| 307 gene_names = [] | |
| 308 for gene in self.clusters[key]: | |
| 309 gene_names.append((gene["name"]).strip("CPT_phageK_")) | |
| 310 if ", ".join(gene_names) in condensed_report: | |
| 311 condensed_report[", ".join(gene_names)] += 1 | |
| 312 else: | |
| 313 condensed_report[", ".join(gene_names)] = 1 | |
| 314 return condensed_report | |
| 315 | |
| 316 def cluster_report_3(self): | |
| 317 condensed_report = {} | |
| 318 for key in self.clusters: | |
| 319 gene_names = [] | |
| 320 gi_nos = [] | |
| 321 for i, gene in enumerate(self.clusters[key]): | |
| 322 if i == 0: | |
| 323 gi_nos = gene["gi_nos"] | |
| 324 gene_names.append((gene["name"]).strip(".p01").strip("CPT_phageK_gp")) | |
| 325 if ", ".join(gene_names) in condensed_report: | |
| 326 condensed_report[", ".join(gene_names)].append(gi_nos) | |
| 327 else: | |
| 328 condensed_report[", ".join(gene_names)] = [gi_nos] | |
| 329 return condensed_report | |
| 330 | |
| 331 def output_gff3(self, clusters): | |
| 332 rec = copy.deepcopy(self.rec) | |
| 333 rec.features = [] | |
| 334 for cluster_idx, cluster_id in enumerate(clusters): | |
| 335 # Get the list of genes in this cluster | |
| 336 associated_genes = set([x["name"] for x in clusters[cluster_id]]) | |
| 337 # print(associated_genes) | |
| 338 # Get the gene locations | |
| 339 assoc_gene_info = {x: self.gff_info[x]["loc"] for x in associated_genes} | |
| 340 # Now we construct a gene from the children as a "standard gene model" gene. | |
| 341 # Get the minimum and maximum locations covered by all of the children genes | |
| 342 gene_min = min([min(x[1].start, x[1].end) for x in assoc_gene_info.items()]) | |
| 343 gene_max = max([max(x[1].start, x[1].end) for x in assoc_gene_info.items()]) | |
| 344 | |
| 345 evidence_notes = [] | |
| 346 for cluster_elem in clusters[cluster_id]: | |
| 347 note = "{name} had {ident}% identity to NCBI Protein ID {pretty_gi}".format( | |
| 348 pretty_gi=(cluster_elem["gi_nos"]), | |
| 349 ident=int( | |
| 350 100 | |
| 351 * float(cluster_elem["identity"] - 1.00) | |
| 352 / abs( | |
| 353 cluster_elem["query_range"][1] | |
| 354 - cluster_elem["query_range"][0] | |
| 355 ) | |
| 356 ), | |
| 357 **cluster_elem | |
| 358 ) | |
| 359 evidence_notes.append(note) | |
| 360 if gene_max - gene_min > 0.8 * float(self.length): | |
| 361 evidence_notes.append( | |
| 362 "Intron is over 80% of the total length of the genome, possible wraparound scenario" | |
| 363 ) | |
| 364 # With that we can create the top level gene | |
| 365 gene = gffSeqFeature( | |
| 366 location=FeatureLocation(gene_min, gene_max), | |
| 367 type="gene", | |
| 368 id=cluster_id, | |
| 369 qualifiers={ | |
| 370 "ID": ["gp_%s" % cluster_idx], | |
| 371 "Percent_Identities": evidence_notes, | |
| 372 "Note": clusters[cluster_id][0]["match_id"], | |
| 373 }, | |
| 374 ) | |
| 375 | |
| 376 # Below that we have an mRNA | |
| 377 mRNA = gffSeqFeature( | |
| 378 location=FeatureLocation(gene_min, gene_max), | |
| 379 type="mRNA", | |
| 380 id=cluster_id + ".mRNA", | |
| 381 qualifiers={"ID": ["gp_%s.mRNA" % cluster_idx], "note": evidence_notes}, | |
| 382 ) | |
| 383 | |
| 384 # Now come the CDSs. | |
| 385 cdss = [] | |
| 386 # We sort them just for kicks | |
| 387 for idx, gene_name in enumerate( | |
| 388 sorted(associated_genes, key=lambda x: int(self.gff_info[x]["start"])) | |
| 389 ): | |
| 390 # Copy the CDS so we don't muck up a good one | |
| 391 cds = copy.copy(self.gff_info[gene_name]["feat"]) | |
| 392 # Get the associated cluster element (used in the Notes above) | |
| 393 cluster_elem = [ | |
| 394 x for x in clusters[cluster_id] if x["name"] == gene_name | |
| 395 ][0] | |
| 396 | |
| 397 # Calculate %identity which we'll use to score | |
| 398 score = int( | |
| 399 1000 | |
| 400 * float(cluster_elem["identity"]) | |
| 401 / abs( | |
| 402 cluster_elem["query_range"][1] - cluster_elem["query_range"][0] | |
| 403 ) | |
| 404 ) | |
| 405 | |
| 406 tempLoc = FeatureLocation( | |
| 407 cds.location.start + (3 * (cluster_elem["query_range"][0] - 1)), | |
| 408 cds.location.start + (3 * (cluster_elem["query_range"][1])), | |
| 409 cds.location.strand, | |
| 410 ) | |
| 411 cds.location = tempLoc | |
| 412 # Set the qualifiers appropriately | |
| 413 cds.qualifiers = { | |
| 414 "ID": ["gp_%s.CDS.%s" % (cluster_idx, idx)], | |
| 415 "score": score, | |
| 416 "Name": self.gff_info[gene_name]["name"], | |
| 417 "evalue": cluster_elem["evalue"], | |
| 418 "Identity": cluster_elem["identity_percent"] * 100, | |
| 419 #'|'.join(cluster_elem['gi_nos']) + "| title goes here." | |
| 420 } | |
| 421 # cds.location.start = cds.location.start + | |
| 422 cdss.append(cds) | |
| 423 | |
| 424 # And we attach the things properly. | |
| 425 mRNA.sub_features = cdss | |
| 426 mRNA.location = FeatureLocation(mRNA.location.start, mRNA.location.end, cds.location.strand) | |
| 427 gene.sub_features = [mRNA] | |
| 428 gene.location = FeatureLocation(gene.location.start, gene.location.end, cds.location.strand) | |
| 429 | |
| 430 # And append to our record | |
| 431 rec.features.append(gene) | |
| 432 return rec | |
| 433 | |
| 434 def output_xml(self, clusters): | |
| 435 threeLevel = {} | |
| 436 # print((clusters.viewkeys())) | |
| 437 # print(type(enumerate(clusters))) | |
| 438 # print(type(clusters)) | |
| 439 for cluster_idx, cluster_id in enumerate(clusters): | |
| 440 # print(type(cluster_id)) | |
| 441 # print(type(cluster_idx)) | |
| 442 # print(type(clusters[cluster_id][0]['hit_num'])) | |
| 443 if not (clusters[cluster_id][0]["iter_num"] in threeLevel.keys): | |
| 444 threeLevel[clusters[cluster_id][0]["iter_num"]] = {} | |
| 445 # for cluster_idx, cluster_id in enumerate(clusters): | |
| 446 # print(type(clusters[cluster_id])) | |
| 447 # b = {clusters[cluster_id][i]: clusters[cluster_id][i+1] for i in range(0, len(clusters[cluster_id]), 2)} | |
| 448 # print(type(b))#['name'])) | |
| 449 # for hspList in clusters: | |
| 450 # for x, idx in (enumerate(clusters)):#for hsp in hspList: | |
| 451 # print("In X") | |
| 452 | |
| 453 | |
| 454 if __name__ == "__main__": | |
| 455 parser = argparse.ArgumentParser(description="Intron detection") | |
| 456 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 gene calls") | |
| 457 parser.add_argument( | |
| 458 "blastp", type=argparse.FileType("r"), help="blast XML protein results" | |
| 459 ) | |
| 460 parser.add_argument( | |
| 461 "--minimum", | |
| 462 help="Gap minimum (Default -1, set to a negative number to allow overlap)", | |
| 463 default=-1, | |
| 464 type=int, | |
| 465 ) | |
| 466 parser.add_argument( | |
| 467 "--maximum", | |
| 468 help="Gap maximum in genome (Default 10000)", | |
| 469 default=10000, | |
| 470 type=int, | |
| 471 ) | |
| 472 parser.add_argument( | |
| 473 "--idThresh", help="ID Percent Threshold", default=0.4, type=float | |
| 474 ) | |
| 475 | |
| 476 args = parser.parse_args() | |
| 477 | |
| 478 threshCap = args.idThresh | |
| 479 if threshCap > 1.00: | |
| 480 threshCap = 1.00 | |
| 481 if threshCap < 0: | |
| 482 threshCap = 0 | |
| 483 | |
| 484 # create new IntronFinder object based on user input | |
| 485 ifinder = IntronFinder(args.gff3, args.blastp, threshCap) | |
| 486 ifinder.create_clusters() | |
| 487 ifinder.clusters = ifinder.check_strand() | |
| 488 ifinder.clusters = ifinder.check_gene_gap(maximum=args.maximum) | |
| 489 ifinder.clusters = ifinder.check_seq_overlap(minimum=args.minimum) | |
| 490 # ifinder.output_xml(ifinder.clusters) | |
| 491 # for x, idx in (enumerate(ifinder.clusters)): | |
| 492 # print(ifinder.blast) | |
| 493 | |
| 494 condensed_report = ifinder.cluster_report() | |
| 495 rec = ifinder.output_gff3(ifinder.clusters) | |
| 496 gffWrite([rec], sys.stdout) | |
| 497 | |
| 498 # import pprint; pprint.pprint(ifinder.clusters) |
