changeset 0:7df876ad836a draft

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_margin_cons_medaka commit c866a76470b72bf5abdc2a88e6bf584c2f2f9b02-dirty"
author dfornika
date Wed, 11 Mar 2020 01:58:44 +0000
parents
children 9fcffca8f6ad
files LICENSE artic_margin_cons_medaka.xml margin_cons_medaka.py
diffstat 3 files changed, 243 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Wed Mar 11 01:58:44 2020 +0000
@@ -0,0 +1,19 @@
+Copyright (c) 2017-2018 Nick Loman & the ZiBRA Project & the ARTIC project
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/artic_margin_cons_medaka.xml	Wed Mar 11 01:58:44 2020 +0000
@@ -0,0 +1,34 @@
+<tool id="artic_margin_cons_medaka" name="ARTIC margin_cons_medaka" version="0.1.0+galaxy0">
+    <description>Generate consensus fasta file from a reference fasta, bam alignment and VCF file</description>
+    <requirements>
+        <requirement type="package" version="0.6.8">pyvcf</requirement>
+        <requirement type="package" version="1.76">biopython</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '${__tool_directory__}/margin_cons_medaka.py'
+          --depth '${minimum_depth}'
+          --quality '${minimum_quality}'
+          '${ref_fasta}'
+          '${input_variants}'
+          '${input_alignment}'
+          > '${consensus_fasta}'
+          2> '${report}'
+    ]]></command>
+    <inputs>
+        <param name="ref_fasta" type="data" format="fasta" label="Reference Sequence"/>
+        <param name="input_alignment" type="data" format="bam" label="Input Alignment"/>
+        <param name="input_variants" type="data" format="vcf" label="Input Variants"/>
+        <param name="minimum_depth" type="int" value="5" min="1" max="500" label="Minimum depth to call a variant"/>
+        <param name="minimum_quality" type="int" value="0" min="0" max="100" label="Minimum quality to call a variant"/>
+    </inputs>
+    <outputs>
+        <data name="consensus_fasta" format="fasta" label="${tool.name} on ${on_string}: Consensus (.fasta)" />
+        <data name="report" format="tabular" label="${tool.name} on ${on_string}: Report" />
+    </outputs>
+    <tests>
+    </tests>
+    <help><![CDATA[
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/margin_cons_medaka.py	Wed Mar 11 01:58:44 2020 +0000
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+
+# Written by Nick Loman
+# This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline:
+# https://github.com/artic-network/fieldbioinformatics
+
+import argparse
+import operator
+import os.path
+import subprocess
+import sys
+
+from collections import defaultdict
+
+import vcf
+
+from Bio import SeqIO
+
+
+def read_bed_file(fn):
+    bedfile = []
+    with open(fn) as csvfile:
+	reader = csv.reader(csvfile, dialect='excel-tab')
+	for row in reader:
+	    bedrow = {}
+	    bedrow['Primer_ID'] = row[3]
+
+	    if len(row) >= 6:
+		# new style bed
+		bedrow['direction'] = row[5]
+	    elif len(row) == 5:
+		# old style without directory
+		if 'LEFT' in row[3]:
+		    bedrow['direction'] = '+'
+		elif 'RIGHT' in row[3]:
+		    bedrow['direction'] = '-'
+		else:
+		    print("Malformed BED file!", file=sys.stderr) 
+		    raise SystemExit
+
+	    if bedrow['direction'] == '+':
+		bedrow['end'] = int(row[2])
+		bedrow['start'] = int(row[1])
+	    else:
+		bedrow['end'] = int(row[1])
+		bedrow['start'] = int(row[2])
+	    bedfile.append(bedrow)
+    return bedfile
+
+
+def collect_depths(bamfile):
+    if not os.path.exists(bamfile):
+        raise SystemExit("bamfile %s doesn't exist" % (bamfile,))
+
+    p = subprocess.Popen(['samtools', 'depth', bamfile],
+                             stdout=subprocess.PIPE)
+    out, err = p.communicate()
+    depths = defaultdict(dict)
+    for ln in out.decode('utf-8').split("\n"):
+       if ln:
+          contig, pos, depth = ln.split("\t")
+          depths[contig][int(pos)] = int(depth)
+    return depths
+
+
+class Reporter:
+    def __init__(self, vcffile, depths):
+        self.vcffile = vcffile
+        self.depths = depths
+
+    def report(self, r, status, allele):
+        idfile = os.path.basename(self.vcffile).split(".")[0]
+        print("%s\t%s\tstatus\t%s" % (idfile, r.POS, status), file=sys.stderr)
+        print("%s\t%s\tallele\t%s" % (idfile, r.POS, allele), file=sys.stderr)
+        print("%s\t%s\tref\t%s" % (idfile, r.POS, r.REF), file=sys.stderr)
+        print("%s\t%s\tdepth\t%s" % (idfile, r.POS, self.depths[r.CHROM][r.POS]), file=sys.stderr)
+
+
+def go(args):
+    MASKED_POSITIONS = defaultdict(set)
+
+    depths = collect_depths(args.bamfile)
+    reporter = Reporter(args.vcffile, depths)
+
+    seqs = dict([(rec.id, rec) for rec in SeqIO.parse(open(args.reference), "fasta")])
+    cons = {}
+    for k in seqs.keys():
+        cons[k] = list(seqs[k].seq)
+
+        for n, c in enumerate(cons[k]):
+            try:
+                depth = depths[k][n+1]
+            except:
+                depth = 0
+
+            if depth < args.depth:
+                cons[k][n] = 'N'
+
+    if args.masked:
+        for region in args.masked.split(","):
+            contig, positions = region.split(":")
+            start, end = positions.split("-")
+            start = int(start)
+            end = int(end)
+            for n in range(start, end):
+                cons[contig][n-1] = 'N'
+                MASKED_POSITIONS[contig].add(n-1)
+
+    sett = set()
+    vcf_reader = vcf.Reader(open(args.vcffile, 'r'))
+    for record in vcf_reader:
+        if record.ALT[0] != '.':
+            # variant call
+
+            if record.POS in MASKED_POSITIONS[record.CHROM]:
+                reporter.report(record, "masked_manual", "n")
+                cons[record.CHROM][record.POS-1] = 'N'
+                continue
+
+            if record.num_het:
+                if depths[record.CHROM][record.POS] < args.depth:
+                    reporter.report(record, "het_site_low_depth", "y")
+                    continue
+                else:
+                    reporter.report(record, "het_site", "y")
+                    cons[record.CHROM][record.POS-1] = 'N'
+                    continue
+
+            if 'PRIMER' in record.INFO:
+                reporter.report(record, "primer_binding_site", "n")
+                cons[record.CHROM][record.POS-1] = 'N'
+                continue
+
+            #support = float(record.INFO['SupportFraction'])
+            #total_reads = int(record.INFO['TotalReads'])
+            qual = record.QUAL
+
+            REF = record.REF
+            ALT = str(record.ALT[0])
+
+            if len(ALT) > len(REF):
+                print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr)
+                continue
+
+            if depths[record.CHROM][record.POS] >= args.depth and record.QUAL >= args.quality:
+                if len(REF) > len(ALT):
+                    print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr)
+                    for n in range(len(REF)):
+                        cons[record.CHROM][record.POS-1+n] = 'N'
+                    continue
+
+                reporter.report(record, "variant", ALT)
+                sett.add(record.POS)
+                if len(REF) > len(ALT):
+                    print("deletion", file=sys.stderr)
+                    continue
+
+                if len(ALT) > len(REF):
+                    print("insertion", file=sys.stderr)
+                    continue
+                cons[record.CHROM][record.POS-1] = str(ALT)
+            elif len(REF) > len(ALT):
+                continue
+            else:
+                if depths[record.CHROM][record.POS] < args.depth:
+                    reporter.report(record, "low_depth_variant", "n")
+                else:
+                    reporter.report(record, "low_qual_variant", "n")
+                #cons[record.CHROM][record.POS-1] = 'N'
+                continue    
+
+    #print >>sys.stderr, str(sett)
+
+    for k in seqs.keys():
+       print(">%s-%s" % (args.bamfile, k))
+       print("".join(cons[k]))
+
+def main():
+   parser = argparse.ArgumentParser()
+   parser.add_argument('--depth', type=int, default=5, help='minimum depth to call a variant')
+   parser.add_argument('--quality', type=int, default=0, help='minimum quality to call a variant')
+   parser.add_argument('--masked', help='Regions to mask (contig:start-end,contig:start-end)')
+   parser.add_argument('reference')
+   parser.add_argument('vcffile')
+   parser.add_argument('bamfile')
+   args = parser.parse_args()
+   go(args)
+
+if __name__ == "__main__":
+    main()