Mercurial > repos > dpryan79 > bwameth

--- a/bwameth.py	Wed Sep 14 05:07:47 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,450 +0,0 @@
-#!/usr/bin/env python
-"""
-map bisulfite converted reads to an insilico converted genome using bwa mem.
-A command to this program like:
-
-    python bwameth.py --reference ref.fa A.fq B.fq
-
-Gets converted to:
-
-    bwa mem -pCMR ref.fa.bwameth.c2t '<python bwameth.py c2t A.fq B.fq'
-
-So that A.fq has C's converted to T's and B.fq has G's converted to A's
-and both are streamed directly to the aligner without a temporary file.
-The output is a corrected, sorted, indexed BAM.
-"""
-from __future__ import print_function
-import tempfile
-import sys
-import os
-import os.path as op
-import argparse
-from subprocess import check_call
-from operator import itemgetter
-from itertools import groupby, repeat, chain
-import re
-
-try:
-    from itertools import izip
-    import string
-    maketrans = string.maketrans
-except ImportError: # python3
-    izip = zip
-    maketrans = str.maketrans
-from toolshed import nopen, reader, is_newer_b
-
-__version__ = "0.2.0"
-
-def checkX(cmd):
-    for p in os.environ['PATH'].split(":"):
-        if os.access(os.path.join(p, cmd), os.X_OK):
-            break
-    else:
-        raise Exception("executable for '%s' not found" % cmd)
-
-checkX('samtools')
-checkX('bwa')
-
-class BWAMethException(Exception): pass
-
-def comp(s, _comp=maketrans('ATCG', 'TAGC')):
-    return s.translate(_comp)
-
-def wrap(text, width=100): # much faster than textwrap
-    try: xrange
-    except NameError: xrange = range
-    for s in xrange(0, len(text), width):
-        yield text[s:s+width]
-
-def run(cmd):
-    list(nopen("|%s" % cmd.lstrip("|")))
-
-def fasta_iter(fasta_name):
-    fh = nopen(fasta_name)
-    faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
-    for header in faiter:
-        header = next(header)[1:].strip()
-        yield header, "".join(s.strip() for s in next(faiter)).upper()
-
-def convert_reads(fq1s, fq2s, out=sys.stdout):
-
-    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
-        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
-        fq1 = nopen(fq1)
-        if fq2 != "NA":
-            fq2 = nopen(fq2)
-            q2_iter = izip(*[fq2] * 4)
-        else:
-            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
-            q2_iter = repeat((None, None, None, None))
-        q1_iter = izip(*[fq1] * 4)
-
-        lt80 = 0
-        for pair in izip(q1_iter, q2_iter):
-            for read_i, (name, seq, _, qual) in enumerate(pair):
-                if name is None: continue
-                name = name.rstrip("\r\n").split(" ")[0]
-                if name[0] != "@":
-                    sys.stderr.write("""ERROR!!!!
-ERROR!!! FASTQ conversion failed
-ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@"
-""" % name)
-                    sys.exit(1)
-                if name.endswith(("_R1", "_R2")):
-                    name = name[:-3]
-                elif name.endswith(("/1", "/2")):
-                    name = name[:-2]
-
-                seq = seq.upper().rstrip('\n')
-                if len(seq) < 80:
-                    lt80 += 1
-
-                char_a, char_b = ['CT', 'GA'][read_i]
-                # keep original sequence as name.
-                name = " ".join((name,
-                                "YS:Z:" + seq +
-                                "\tYC:Z:" + char_a + char_b + '\n'))
-                seq = seq.replace(char_a, char_b)
-                out.write("".join((name, seq, "\n+\n", qual)))
-
-    out.flush()
-    if lt80 > 50:
-        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
-        sys.stderr.write("       : this program is designed for long reads\n")
-    return 0
-
-def convert_fasta(ref_fasta, just_name=False):
-    out_fa = ref_fasta + ".bwameth.c2t"
-    if just_name:
-        return out_fa
-    msg = "c2t in %s to %s" % (ref_fasta, out_fa)
-    if is_newer_b(ref_fasta, out_fa):
-        sys.stderr.write("already converted: %s\n" % msg)
-        return out_fa
-    sys.stderr.write("converting %s\n" % msg)
-    try:
-        fh = open(out_fa, "w")
-        for header, seq in fasta_iter(ref_fasta):
-            ########### Reverse ######################
-            fh.write(">r%s\n" % header)
-
-            #if non_cpg_only:
-            #    for ctx in "TAG": # use "ATC" for fwd
-            #        seq = seq.replace('G' + ctx, "A" + ctx)
-            #    for line in wrap(seq):
-            #        print >>fh, line
-            #else:
-            for line in wrap(seq.replace("G", "A")):
-                fh.write(line + '\n')
-
-            ########### Forward ######################
-            fh.write(">f%s\n" % header)
-            for line in wrap(seq.replace("C", "T")):
-                fh.write(line + '\n')
-        fh.close()
-    except:
-        try:
-            fh.close()
-        except UnboundLocalError:
-            pass
-        os.unlink(out_fa)
-        raise
-    return out_fa
-
-
-def bwa_index(fa):
-    if is_newer_b(fa, (fa + '.amb', fa + '.sa')):
-        return
-    sys.stderr.write("indexing: %s\n" % fa)
-    try:
-        run("bwa index -a bwtsw %s" % fa)
-    except:
-        if op.exists(fa + ".amb"):
-            os.unlink(fa + ".amb")
-        raise
-
-class Bam(object):
-    __slots__ = 'read flag chrom pos mapq cigar chrom_mate pos_mate tlen \
-            seq qual other'.split()
-    def __init__(self, args):
-        for a, v in zip(self.__slots__[:11], args):
-            setattr(self, a, v)
-        self.other = args[11:]
-        self.flag = int(self.flag)
-        self.pos = int(self.pos)
-        self.tlen = int(float(self.tlen))
-
-    def __repr__(self):
-        return "Bam({chr}:{start}:{read}".format(chr=self.chrom,
-                                                 start=self.pos,
-                                                 read=self.read)
-
-    def __str__(self):
-        return "\t".join(str(getattr(self, s)) for s in self.__slots__[:11]) \
-                         + "\t" + "\t".join(self.other)
-
-    def is_first_read(self):
-        return bool(self.flag & 0x40)
-
-    def is_second_read(self):
-        return bool(self.flag & 0x80)
-
-    def is_plus_read(self):
-        return not (self.flag & 0x10)
-
-    def is_minus_read(self):
-        return bool(self.flag & 0x10)
-
-    def is_mapped(self):
-        return not (self.flag & 0x4)
-
-    def cigs(self):
-        if self.cigar == "*":
-            yield (0, None)
-            raise StopIteration
-        cig_iter = groupby(self.cigar, lambda c: c.isdigit())
-        for g, n in cig_iter:
-            yield int("".join(n)), "".join(next(cig_iter)[1])
-
-    def cig_len(self):
-        return sum(c[0] for c in self.cigs() if c[1] in
-                   ("M", "D", "N", "EQ", "X", "P"))
-
-    def left_shift(self):
-        left = 0
-        for n, cig in self.cigs():
-            if cig == "M": break
-            if cig == "H":
-                left += n
-        return left
-
-    def right_shift(self):
-        right = 0
-        for n, cig in reversed(list(self.cigs())):
-            if cig == "M": break
-            if cig == "H":
-                right += n
-        return -right or None
-
-    @property
-    def original_seq(self):
-        try:
-            return next(x for x in self.other if x.startswith("YS:Z:"))[5:]
-        except:
-            sys.stderr.write(repr(self.other) + "\n")
-            sys.stderr.write(self.read + "\n")
-            raise
-
-    @property
-    def ga_ct(self):
-        return [x for x in self.other if x.startswith("YC:Z:")]
-
-    def longest_match(self, patt=re.compile("\d+M")):
-        return max(int(x[:-1]) for x in patt.findall(self.cigar))
-
-
-def rname(fq1, fq2=""):
-    fq1, fq2 = fq1.split(",")[0], fq2.split(",")[0]
-    def name(f):
-        n = op.basename(op.splitext(f)[0])
-        if n.endswith('.fastq'): n = n[:-6]
-        if n.endswith(('.fq', '.r1', '.r2')): n = n[:-3]
-        return n
-    return "".join(a for a, b in zip(name(fq1), name(fq2)) if a == b) or 'bm'
-
-
-def bwa_mem(fa, mfq, extra_args, threads=1, rg=None,
-            paired=True, set_as_failed=None):
-    conv_fa = convert_fasta(fa, just_name=True)
-    if not is_newer_b(conv_fa, (conv_fa + '.amb', conv_fa + '.sa')):
-        raise BWAMethException("first run bwameth.py index %s" % fa)
-
-    if not rg is None and not rg.startswith('@RG'):
-        rg = '@RG\tID:{rg}\tSM:{rg}'.format(rg=rg)
-
-    # penalize clipping and unpaired. lower penalty on mismatches (-B)
-    cmd = "|bwa mem -T 40 -B 2 -L 10 -CM "
-
-    if paired:
-        cmd += ("-U 100 -p ")
-    cmd += "-R '{rg}' -t {threads} {extra_args} {conv_fa} {mfq}"
-    cmd = cmd.format(**locals())
-    sys.stderr.write("running: %s\n" % cmd.lstrip("|"))
-    as_bam(cmd, fa, set_as_failed)
-
-
-def as_bam(pfile, fa, set_as_failed=None):
-    """
-    pfile: either a file or a |process to generate sam output
-    fa: the reference fasta
-    set_as_failed: None, 'f', or 'r'. If 'f'. Reads mapping to that strand
-                      are given the sam flag of a failed QC alignment (0x200).
-    """
-    sam_iter = nopen(pfile)
-
-    for line in sam_iter:
-        if not line[0] == "@": break
-        handle_header(line)
-    else:
-        sys.stderr.flush()
-        raise Exception("bad or empty fastqs")
-    sam_iter2 = (x.rstrip().split("\t") for x in chain([line], sam_iter))
-    for read_name, pair_list in groupby(sam_iter2, itemgetter(0)):
-        pair_list = [Bam(toks) for toks in pair_list]
-
-        for aln in handle_reads(pair_list, set_as_failed):
-            sys.stdout.write(str(aln) + '\n')
-
-def handle_header(line, out=sys.stdout):
-    toks = line.rstrip().split("\t")
-    if toks[0].startswith("@SQ"):
-        sq, sn, ln = toks  # @SQ    SN:fchr11    LN:122082543
-        # we have f and r, only print out f
-        chrom = sn.split(":")[1]
-        if chrom.startswith('r'): return
-        chrom = chrom[1:]
-        toks = ["%s\tSN:%s\t%s" % (sq, chrom, ln)]
-    if toks[0].startswith("@PG"):
-        #out.write("\t".join(toks) + "\n")
-        toks = ["@PG\tID:bwa-meth\tPN:bwa-meth\tVN:%s\tCL:\"%s\"" % (
-                         __version__,
-                         " ".join(x.replace("\t", "\\t") for x in sys.argv))]
-    out.write("\t".join(toks) + "\n")
-
-
-def handle_reads(alns, set_as_failed):
-
-    for aln in alns:
-        orig_seq = aln.original_seq
-        assert len(aln.seq) == len(aln.qual), aln.read
-        # don't need this any more.
-        aln.other = [x for x in aln.other if not x.startswith('YS:Z')]
-
-        # first letter of chrom is 'f' or 'r'
-        direction = aln.chrom[0]
-        aln.chrom = aln.chrom.lstrip('fr')
-
-        if not aln.is_mapped():
-            aln.seq = orig_seq
-            continue
-
-        assert direction in 'fr', (direction, aln)
-        aln.other.append('YD:Z:' + direction)
-
-        if set_as_failed == direction:
-            aln.flag |= 0x200
-
-        # here we have a heuristic that if the longest match is not 44% of the
-        # sequence length, we mark it as failed QC and un-pair it. At the end
-        # of the loop we set all members of this pair to be unmapped
-        if aln.longest_match() < (len(orig_seq) * 0.44):
-            aln.flag |= 0x200  # fail qc
-            aln.flag &= (~0x2) # un-pair
-            aln.mapq = min(int(aln.mapq), 1)
-
-        mate_direction = aln.chrom_mate[0]
-        if mate_direction not in "*=":
-            aln.chrom_mate = aln.chrom_mate[1:]
-
-        # adjust the original seq to the cigar
-        l, r = aln.left_shift(), aln.right_shift()
-        if aln.is_plus_read():
-            aln.seq = orig_seq[l:r]
-        else:
-            aln.seq = comp(orig_seq[::-1][l:r])
-
-    if any(aln.flag & 0x200 for aln in alns):
-        for aln in alns:
-            aln.flag |= 0x200
-            aln.flag &= (~0x2)
-    return alns
-
-def cnvs_main(args):
-    __doc__ = """
-    calculate CNVs from BS-Seq bams or vcfs
-    """
-    p = argparse.ArgumentParser(__doc__)
-    p.add_argument("--regions", help="optional target regions", default='NA')
-    p.add_argument("bams", nargs="+")
-
-    a = p.parse_args(args)
-    r_script = """
-options(stringsAsFactors=FALSE)
-suppressPackageStartupMessages(library(cn.mops))
-suppressPackageStartupMessages(library(snow))
-args = commandArgs(TRUE)
-regions = args[1]
-bams = args[2:length(args)]
-n = length(bams)
-if(is.na(regions)){
-    bam_counts = getReadCountsFromBAM(bams, parallel=min(n, 4), mode="paired")
-    res = cn.mops(bam_counts, parallel=min(n, 4), priorImpact=20)
-} else {
-    segments = read.delim(regions, header=FALSE)
-    gr = GRanges(segments[,1], IRanges(segments[,2], segments[,3]))
-    bam_counts = getSegmentReadCountsFromBAM(bams, GR=gr, mode="paired", parallel=min(n, 4))
-    res = exomecn.mops(bam_counts, parallel=min(n, 4), priorImpact=20)
-}
-res = calcIntegerCopyNumbers(res)
-
-df = as.data.frame(cnvs(res))
-write.table(df, row.names=FALSE, quote=FALSE, sep="\t")
-"""
-    with tempfile.NamedTemporaryFile(delete=True) as rfh:
-        rfh.write(r_script + '\n')
-        rfh.flush()
-        for d in reader('|Rscript {rs_name} {regions} {bams}'.format(
-            rs_name=rfh.name, regions=a.regions, bams=" ".join(a.bams)),
-            header=False):
-            print("\t".join(d))
-
-
-def convert_fqs(fqs):
-    script = __file__
-    return "'<%s %s c2t %s %s'" % (sys.executable, script, fqs[0],
-               fqs[1] if len(fqs) > 1
-                      else ','.join(['NA'] * len(fqs[0].split(","))))
-
-def main(args=sys.argv[1:]):
-
-    if len(args) > 0 and args[0] == "index":
-        assert len(args) == 2, ("must specify fasta as 2nd argument")
-        sys.exit(bwa_index(convert_fasta(args[1])))
-
-    if len(args) > 0 and args[0] == "c2t":
-        sys.exit(convert_reads(args[1], args[2]))
-
-    if len(args) > 0 and args[0] == "cnvs":
-        sys.exit(cnvs_main(args[1:]))
-
-    p = argparse.ArgumentParser(__doc__)
-    p.add_argument("--reference", help="reference fasta", required=True)
-    p.add_argument("-t", "--threads", type=int, default=6)
-    p.add_argument("--read-group", help="read-group to add to bam in same"
-            " format as to bwa: '@RG\\tID:foo\\tSM:bar'")
-    p.add_argument('--set-as-failed', help="flag alignments to this strand"
-            " as not passing QC (0x200). Targetted BS-Seq libraries are often"
-            " to a single strand, so we can flag them as QC failures. Note"
-            " f == OT, r == OB. Likely, this will be 'f' as we will expect"
-            " reads to align to the original-bottom (OB) strand and will flag"
-            " as failed those aligning to the forward, or original top (OT).",
-        default=None, choices=('f', 'r'))
-    p.add_argument('--version', action='version', version='bwa-meth.py {}'.format(__version__))
-
-    p.add_argument("fastqs", nargs="+", help="bs-seq fastqs to align. Run"
-            "multiple sets separated by commas, e.g. ... a_R1.fastq,b_R1.fastq"
-            " a_R2.fastq,b_R2.fastq note that the order must be maintained.")
-
-    args, pass_through_args = p.parse_known_args(args)
-
-    # for the 2nd file. use G => A and bwa's support for streaming.
-    conv_fqs_cmd = convert_fqs(args.fastqs)
-
-    bwa_mem(args.reference, conv_fqs_cmd, ' '.join(map(str, pass_through_args)),
-             threads=args.threads, rg=args.read_group or
-             rname(*args.fastqs),
-             paired=len(args.fastqs) == 2,
-             set_as_failed=args.set_as_failed)
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
--- a/bwameth_wrapper.py	Wed Sep 14 05:07:47 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import subprocess
-import os
-import shlex
-
-def createIndex(fname):
-    """
-    Create an index for a file, return where it is
-    """
-    d = os.mkdir("index_dir")
-    os.link(fname, "index_dir/genome.fa")
-    cmd = "{}/bwameth.py index {}/index_dir/genome.fa".format(os.path.dirname(__file__), os.getcwd())
-    proc = subprocess.Popen(args=shlex.split(cmd))
-    returncode = proc.wait()
-    if returncode != 0:
-        raise Exception("Error during '%s'" % cmd)
-    return "{}/index_dir/genome.fa".format(os.getcwd())
-
-
-parser = argparse.ArgumentParser(description="A wrapper around bwameth for Galaxy. There's minimal argument checking done")
-parser.add_argument('-p', '--numThreads', type=int, default=4, help="number of threads")
-parser.add_argument('--makeIndex', help="Given a fasta file, index it")
-parser.add_argument('--premadeIndex', help="If an index already exists, this is the fasta file associated with it (the index files must be in the same directory")
-parser.add_argument('--readGroup', help="The read group text, if desired")
-parser.add_argument('files', nargs="+", help="Fasta files (possibly gzipped, if the file names end in .gz)")
-args = parser.parse_args()
-
-if args.makeIndex:
-    ifile = createIndex(args.makeIndex)
-else:
-    ifile = args.premadeIndex
-
-files = " ".join(['{}'.format(x) for x in args.files])
-if args.readGroup:
-    files = "{} --read-group '{}'".format(files, args.readGroup)
-
-cmd = "{}/bwameth.py -t {} --reference '{}' {} | samtools view -u - | samtools sort -@ {} - output".format(os.path.dirname(__file__), args.numThreads, ifile, files, args.numThreads)
-of = open("temp.sh", "w")
-of.write("{}\n".format(cmd))
-of.close()
-try:
-    subprocess.check_call(["bash", "./temp.sh"])
-except:
-    raise Exception("Error during '%s'" % cmd)
Binary file test-data/output.bam has changed
Binary file test-data/t_R1.fastq.gz has changed
Binary file test-data/t_R2.fastq.gz has changed