Mercurial > repos > dfornika > match_plasmid_to_reference
comparison match_plasmid_to_reference.py @ 4:826ddf832bef draft default tip
"planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/match_plasmid_to_reference commit dcdac86bce5c44043516fbd472ab7c19d7bf4d50-dirty"
| author | dfornika |
|---|---|
| date | Wed, 06 Nov 2019 13:52:40 -0500 |
| parents | 3616b6eda1da |
| children |
comparison
equal
deleted
inserted
replaced
| 3:d56b4f743779 | 4:826ddf832bef |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 from __future__ import print_function | 3 from __future__ import print_function, division |
| 4 | 4 |
| 5 import argparse | 5 import argparse |
| 6 import csv | 6 import csv |
| 7 import errno | 7 import errno |
| 8 import json | 8 import json |
| 54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | 54 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) |
| 55 for row in reader: | 55 for row in reader: |
| 56 mob_typer_report.append(row) | 56 mob_typer_report.append(row) |
| 57 return mob_typer_report | 57 return mob_typer_report |
| 58 | 58 |
| 59 def parse_genbank_accession(genbank_file_path): | 59 def parse_genbank_accession(genbank_path): |
| 60 with open(genbank_file_path, 'r') as f: | 60 with open(genbank_path, 'r') as f: |
| 61 while True: | 61 while True: |
| 62 line = f.readline() | 62 line = f.readline() |
| 63 # break while statement if it is not a comment line | |
| 64 # i.e. does not startwith # | |
| 65 if line.startswith('ACCESSION'): | 63 if line.startswith('ACCESSION'): |
| 66 return line.strip().split()[1] | 64 return line.strip().split()[1] |
| 67 | 65 |
| 66 def parse_fasta_accession(fasta_path): | |
| 67 with open(fasta_path, 'r') as f: | |
| 68 while True: | |
| 69 line = f.readline() | |
| 70 if line.startswith('>'): | |
| 71 return line.strip().split()[0][1:] | |
| 68 | 72 |
| 69 def count_contigs(plasmid_fasta_path): | 73 def count_fasta_contigs(fasta_path): |
| 70 contigs = 0 | 74 contigs = 0 |
| 71 with open(plasmid_fasta_path, 'r') as f: | 75 with open(fasta_path, 'r') as f: |
| 72 for line in f: | 76 for line in f: |
| 73 if line.startswith('>'): | 77 if line.startswith('>'): |
| 74 contigs += 1 | 78 contigs += 1 |
| 75 return contigs | 79 return contigs |
| 76 | 80 |
| 77 def count_bases(plasmid_fasta_path): | 81 def count_fasta_bases(fasta_path): |
| 78 bases = 0 | 82 bases = 0 |
| 79 with open(plasmid_fasta_path, 'r') as f: | 83 with open(fasta_path, 'r') as f: |
| 80 for line in f: | 84 for line in f: |
| 81 line = line.strip() | 85 line = line.strip() |
| 82 if not line.startswith('>'): | 86 if not line.startswith('>'): |
| 83 bases += len(line) | 87 bases += len(line) |
| 84 return bases | 88 return bases |
| 85 | 89 |
| 90 def compute_fasta_gc_percent(fasta_path): | |
| 91 gc_count = 0 | |
| 92 total_bases_count = 0 | |
| 93 with open(fasta_path, 'r') as f: | |
| 94 for line in f: | |
| 95 if not line.startswith('>'): | |
| 96 line = line.strip() | |
| 97 line_c_count = line.count('c') + line.count('C') | |
| 98 line_g_count = line.count('g') + line.count('G') | |
| 99 line_total_bases_count = len(line) | |
| 100 gc_count += line_c_count + line_g_count | |
| 101 total_bases_count += line_total_bases_count | |
| 102 return 100 * (gc_count / total_bases_count) | |
| 103 | |
| 86 def main(args): | 104 def main(args): |
| 105 | |
| 87 # create output directory | 106 # create output directory |
| 88 try: | 107 try: |
| 89 os.mkdir(args.outdir) | 108 os.mkdir(args.outdir) |
| 90 except OSError as exc: | 109 except OSError as exc: |
| 91 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): | 110 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): |
| 93 else: | 112 else: |
| 94 raise | 113 raise |
| 95 | 114 |
| 96 # parse mob_typer report | 115 # parse mob_typer report |
| 97 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) | 116 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) |
| 98 num_plasmid_contigs = count_contigs(args.plasmid) | 117 num_plasmid_contigs = count_fasta_contigs(args.plasmid) |
| 99 num_plasmid_bases = count_bases(args.plasmid) | 118 num_plasmid_bases = count_fasta_bases(args.plasmid) |
| 100 | 119 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid) |
| 120 | |
| 101 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: | 121 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: |
| 102 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | 122 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) |
| 103 mob_typer_record_writer.writeheader() | 123 mob_typer_record_writer.writeheader() |
| 104 for record in mob_typer_report: | 124 for record in mob_typer_report: |
| 105 if num_plasmid_contigs == int(record['num_contigs']) and num_plasmid_bases == int(record['total_length']): | 125 # match the plasmid against three properties in the MOB-Typer report: |
| 106 for reference_plasmid in args.reference_plasmids: | 126 # 1. number of contigs |
| 127 # 2. total length of all contigs | |
| 128 # 3. G/C percent (within +/-0.1%) | |
| 129 if num_plasmid_contigs == int(record['num_contigs']) and \ | |
| 130 num_plasmid_bases == int(record['total_length']) and \ | |
| 131 abs(plasmid_gc_percent - float(record['gc'])) < 0.1: | |
| 132 for reference_plasmid in args.reference_plasmids_genbank: | |
| 107 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: | 133 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: |
| 108 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) | 134 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) |
| 109 mob_typer_record_writer.writerow(record) | 135 |
| 136 for reference_plasmid in args.reference_plasmids_fasta: | |
| 137 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None: | |
| 138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta")) | |
| 139 mob_typer_record_writer.writerow(record) | |
| 110 | 140 |
| 111 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) | 141 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) |
| 112 | 142 |
| 113 | 143 |
| 114 if __name__ == '__main__': | 144 if __name__ == '__main__': |
| 115 parser = argparse.ArgumentParser() | 145 parser = argparse.ArgumentParser() |
| 116 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") | 146 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") |
| 117 parser.add_argument("--reference_plasmids", nargs='+', help="reference plasmids (genbank)") | 147 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)") |
| 148 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)") | |
| 118 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") | 149 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") |
| 119 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") | 150 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") |
| 120 args = parser.parse_args() | 151 args = parser.parse_args() |
| 121 main(args) | 152 main(args) |
