Mercurial > repos > dfornika > artic_margin_cons_medaka
changeset 0:7df876ad836a draft
"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_margin_cons_medaka commit c866a76470b72bf5abdc2a88e6bf584c2f2f9b02-dirty"
author | dfornika |
---|---|
date | Wed, 11 Mar 2020 01:58:44 +0000 |
parents | |
children | 9fcffca8f6ad |
files | LICENSE artic_margin_cons_medaka.xml margin_cons_medaka.py |
diffstat | 3 files changed, 243 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Wed Mar 11 01:58:44 2020 +0000 @@ -0,0 +1,19 @@ +Copyright (c) 2017-2018 Nick Loman & the ZiBRA Project & the ARTIC project + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/artic_margin_cons_medaka.xml Wed Mar 11 01:58:44 2020 +0000 @@ -0,0 +1,34 @@ +<tool id="artic_margin_cons_medaka" name="ARTIC margin_cons_medaka" version="0.1.0+galaxy0"> + <description>Generate consensus fasta file from a reference fasta, bam alignment and VCF file</description> + <requirements> + <requirement type="package" version="0.6.8">pyvcf</requirement> + <requirement type="package" version="1.76">biopython</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python '${__tool_directory__}/margin_cons_medaka.py' + --depth '${minimum_depth}' + --quality '${minimum_quality}' + '${ref_fasta}' + '${input_variants}' + '${input_alignment}' + > '${consensus_fasta}' + 2> '${report}' + ]]></command> + <inputs> + <param name="ref_fasta" type="data" format="fasta" label="Reference Sequence"/> + <param name="input_alignment" type="data" format="bam" label="Input Alignment"/> + <param name="input_variants" type="data" format="vcf" label="Input Variants"/> + <param name="minimum_depth" type="int" value="5" min="1" max="500" label="Minimum depth to call a variant"/> + <param name="minimum_quality" type="int" value="0" min="0" max="100" label="Minimum quality to call a variant"/> + </inputs> + <outputs> + <data name="consensus_fasta" format="fasta" label="${tool.name} on ${on_string}: Consensus (.fasta)" /> + <data name="report" format="tabular" label="${tool.name} on ${on_string}: Report" /> + </outputs> + <tests> + </tests> + <help><![CDATA[ + ]]></help> + <citations> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/margin_cons_medaka.py Wed Mar 11 01:58:44 2020 +0000 @@ -0,0 +1,190 @@ +#!/usr/bin/env python + +# Written by Nick Loman +# This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline: +# https://github.com/artic-network/fieldbioinformatics + +import argparse +import operator +import os.path +import subprocess +import sys + +from collections import defaultdict + +import vcf + +from Bio import SeqIO + + +def read_bed_file(fn): + bedfile = [] + with open(fn) as csvfile: + reader = csv.reader(csvfile, dialect='excel-tab') + for row in reader: + bedrow = {} + bedrow['Primer_ID'] = row[3] + + if len(row) >= 6: + # new style bed + bedrow['direction'] = row[5] + elif len(row) == 5: + # old style without directory + if 'LEFT' in row[3]: + bedrow['direction'] = '+' + elif 'RIGHT' in row[3]: + bedrow['direction'] = '-' + else: + print("Malformed BED file!", file=sys.stderr) + raise SystemExit + + if bedrow['direction'] == '+': + bedrow['end'] = int(row[2]) + bedrow['start'] = int(row[1]) + else: + bedrow['end'] = int(row[1]) + bedrow['start'] = int(row[2]) + bedfile.append(bedrow) + return bedfile + + +def collect_depths(bamfile): + if not os.path.exists(bamfile): + raise SystemExit("bamfile %s doesn't exist" % (bamfile,)) + + p = subprocess.Popen(['samtools', 'depth', bamfile], + stdout=subprocess.PIPE) + out, err = p.communicate() + depths = defaultdict(dict) + for ln in out.decode('utf-8').split("\n"): + if ln: + contig, pos, depth = ln.split("\t") + depths[contig][int(pos)] = int(depth) + return depths + + +class Reporter: + def __init__(self, vcffile, depths): + self.vcffile = vcffile + self.depths = depths + + def report(self, r, status, allele): + idfile = os.path.basename(self.vcffile).split(".")[0] + print("%s\t%s\tstatus\t%s" % (idfile, r.POS, status), file=sys.stderr) + print("%s\t%s\tallele\t%s" % (idfile, r.POS, allele), file=sys.stderr) + print("%s\t%s\tref\t%s" % (idfile, r.POS, r.REF), file=sys.stderr) + print("%s\t%s\tdepth\t%s" % (idfile, r.POS, self.depths[r.CHROM][r.POS]), file=sys.stderr) + + +def go(args): + MASKED_POSITIONS = defaultdict(set) + + depths = collect_depths(args.bamfile) + reporter = Reporter(args.vcffile, depths) + + seqs = dict([(rec.id, rec) for rec in SeqIO.parse(open(args.reference), "fasta")]) + cons = {} + for k in seqs.keys(): + cons[k] = list(seqs[k].seq) + + for n, c in enumerate(cons[k]): + try: + depth = depths[k][n+1] + except: + depth = 0 + + if depth < args.depth: + cons[k][n] = 'N' + + if args.masked: + for region in args.masked.split(","): + contig, positions = region.split(":") + start, end = positions.split("-") + start = int(start) + end = int(end) + for n in range(start, end): + cons[contig][n-1] = 'N' + MASKED_POSITIONS[contig].add(n-1) + + sett = set() + vcf_reader = vcf.Reader(open(args.vcffile, 'r')) + for record in vcf_reader: + if record.ALT[0] != '.': + # variant call + + if record.POS in MASKED_POSITIONS[record.CHROM]: + reporter.report(record, "masked_manual", "n") + cons[record.CHROM][record.POS-1] = 'N' + continue + + if record.num_het: + if depths[record.CHROM][record.POS] < args.depth: + reporter.report(record, "het_site_low_depth", "y") + continue + else: + reporter.report(record, "het_site", "y") + cons[record.CHROM][record.POS-1] = 'N' + continue + + if 'PRIMER' in record.INFO: + reporter.report(record, "primer_binding_site", "n") + cons[record.CHROM][record.POS-1] = 'N' + continue + + #support = float(record.INFO['SupportFraction']) + #total_reads = int(record.INFO['TotalReads']) + qual = record.QUAL + + REF = record.REF + ALT = str(record.ALT[0]) + + if len(ALT) > len(REF): + print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr) + continue + + if depths[record.CHROM][record.POS] >= args.depth and record.QUAL >= args.quality: + if len(REF) > len(ALT): + print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr) + for n in range(len(REF)): + cons[record.CHROM][record.POS-1+n] = 'N' + continue + + reporter.report(record, "variant", ALT) + sett.add(record.POS) + if len(REF) > len(ALT): + print("deletion", file=sys.stderr) + continue + + if len(ALT) > len(REF): + print("insertion", file=sys.stderr) + continue + cons[record.CHROM][record.POS-1] = str(ALT) + elif len(REF) > len(ALT): + continue + else: + if depths[record.CHROM][record.POS] < args.depth: + reporter.report(record, "low_depth_variant", "n") + else: + reporter.report(record, "low_qual_variant", "n") + #cons[record.CHROM][record.POS-1] = 'N' + continue + + #print >>sys.stderr, str(sett) + + for k in seqs.keys(): + print(">%s-%s" % (args.bamfile, k)) + print("".join(cons[k])) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--depth', type=int, default=5, help='minimum depth to call a variant') + parser.add_argument('--quality', type=int, default=0, help='minimum quality to call a variant') + parser.add_argument('--masked', help='Regions to mask (contig:start-end,contig:start-end)') + parser.add_argument('reference') + parser.add_argument('vcffile') + parser.add_argument('bamfile') + args = parser.parse_args() + go(args) + +if __name__ == "__main__": + main()