Mercurial > repos > cpt_testbed > functionalworkflow
comparison lipory.py @ 0:f678e282b320 draft default tip
"planemo upload"
| author | cpt_testbed |
|---|---|
| date | Fri, 06 May 2022 07:07:23 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f678e282b320 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import re | |
| 3 import sys | |
| 4 import argparse | |
| 5 import logging | |
| 6 from Bio import SeqIO | |
| 7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
| 8 from gff3 import feature_lambda, feature_test_type, get_id | |
| 9 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| 10 | |
| 11 logging.basicConfig(level=logging.INFO) | |
| 12 log = logging.getLogger(__name__) | |
| 13 | |
| 14 | |
| 15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40): | |
| 16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta")) | |
| 17 | |
| 18 CASES = [ | |
| 19 re.compile( | |
| 20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist) | |
| 21 ), | |
| 22 re.compile( | |
| 23 "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist) | |
| 24 ), | |
| 25 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file | |
| 26 ] | |
| 27 | |
| 28 for record in gffParse(gff3_file, base_dict=seq_dict): | |
| 29 good_features = [] | |
| 30 | |
| 31 genes = list( | |
| 32 feature_lambda( | |
| 33 record.features, feature_test_type, {"type": "gene"}, subfeatures=True | |
| 34 ) | |
| 35 ) | |
| 36 for gene in genes: | |
| 37 cdss = list( | |
| 38 feature_lambda( | |
| 39 gene.sub_features, | |
| 40 feature_test_type, | |
| 41 {"type": "CDS"}, | |
| 42 subfeatures=False, | |
| 43 ) | |
| 44 ) | |
| 45 if len(cdss) == 0: | |
| 46 continue | |
| 47 | |
| 48 for cds in cdss: | |
| 49 try: | |
| 50 tmpseq = str( | |
| 51 cds.extract(record.seq).translate(table=11, cds=True) | |
| 52 ).replace("*", "") | |
| 53 except: | |
| 54 continue | |
| 55 | |
| 56 for case in CASES: | |
| 57 m = case.search(tmpseq) | |
| 58 if m: | |
| 59 if cds.location.strand > 0: | |
| 60 start = cds.location.start + (3 * (m.end() - 4)) | |
| 61 end = cds.location.start + (3 * m.end()) | |
| 62 else: | |
| 63 start = cds.location.end - (3 * (m.end() - 4)) | |
| 64 end = cds.location.end - (3 * m.end()) | |
| 65 | |
| 66 tmp = gffSeqFeature( | |
| 67 FeatureLocation( | |
| 68 min(start, end), | |
| 69 max(start, end), | |
| 70 strand=cds.location.strand, | |
| 71 ), | |
| 72 type="Lipobox", | |
| 73 qualifiers={ | |
| 74 "source": "CPT_LipoRy", | |
| 75 "ID": "%s.lipobox" % get_id(gene), | |
| 76 }, | |
| 77 ) | |
| 78 tmp.qualifiers["sequence"] = str( | |
| 79 tmp.extract(record).seq.translate() | |
| 80 ) | |
| 81 | |
| 82 gene.sub_features.append(tmp) | |
| 83 good_features.append(gene) | |
| 84 | |
| 85 record.features = good_features | |
| 86 yield [record] | |
| 87 | |
| 88 | |
| 89 if __name__ == "__main__": | |
| 90 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="") | |
| 91 parser.add_argument( | |
| 92 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls" | |
| 93 ) | |
| 94 parser.add_argument( | |
| 95 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence" | |
| 96 ) | |
| 97 | |
| 98 parser.add_argument( | |
| 99 "--lipobox_mindist", | |
| 100 type=int, | |
| 101 help="Minimum distance in codons to start of lipobox", | |
| 102 default=10, | |
| 103 ) | |
| 104 parser.add_argument( | |
| 105 "--lipobox_maxdist", | |
| 106 type=int, | |
| 107 help="Maximum distance in codons to start of lipobox", | |
| 108 default=40, | |
| 109 ) | |
| 110 | |
| 111 args = parser.parse_args() | |
| 112 | |
| 113 args = vars(parser.parse_args()) | |
| 114 for record in find_lipoprotein(**args): | |
| 115 record[0].annotations = {} | |
| 116 gffWrite(record, sys.stdout) |
