Mercurial > repos > dpryan79 > bwameth
changeset 16:9f1401078360 draft
Deleted selected files
author | dpryan79 |
---|---|
date | Wed, 14 Sep 2016 07:25:18 -0400 |
parents | 66a8223839c8 |
children | b55c12d26066 |
files | bwameth.py bwameth_wrapper.py test-data/output.bam test-data/t_R1.fastq.gz test-data/t_R2.fastq.gz |
diffstat | 5 files changed, 0 insertions(+), 495 deletions(-) [+] |
line wrap: on
line diff
--- a/bwameth.py Wed Sep 14 05:07:47 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,450 +0,0 @@ -#!/usr/bin/env python -""" -map bisulfite converted reads to an insilico converted genome using bwa mem. -A command to this program like: - - python bwameth.py --reference ref.fa A.fq B.fq - -Gets converted to: - - bwa mem -pCMR ref.fa.bwameth.c2t '<python bwameth.py c2t A.fq B.fq' - -So that A.fq has C's converted to T's and B.fq has G's converted to A's -and both are streamed directly to the aligner without a temporary file. -The output is a corrected, sorted, indexed BAM. -""" -from __future__ import print_function -import tempfile -import sys -import os -import os.path as op -import argparse -from subprocess import check_call -from operator import itemgetter -from itertools import groupby, repeat, chain -import re - -try: - from itertools import izip - import string - maketrans = string.maketrans -except ImportError: # python3 - izip = zip - maketrans = str.maketrans -from toolshed import nopen, reader, is_newer_b - -__version__ = "0.2.0" - -def checkX(cmd): - for p in os.environ['PATH'].split(":"): - if os.access(os.path.join(p, cmd), os.X_OK): - break - else: - raise Exception("executable for '%s' not found" % cmd) - -checkX('samtools') -checkX('bwa') - -class BWAMethException(Exception): pass - -def comp(s, _comp=maketrans('ATCG', 'TAGC')): - return s.translate(_comp) - -def wrap(text, width=100): # much faster than textwrap - try: xrange - except NameError: xrange = range - for s in xrange(0, len(text), width): - yield text[s:s+width] - -def run(cmd): - list(nopen("|%s" % cmd.lstrip("|"))) - -def fasta_iter(fasta_name): - fh = nopen(fasta_name) - faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">")) - for header in faiter: - header = next(header)[1:].strip() - yield header, "".join(s.strip() for s in next(faiter)).upper() - -def convert_reads(fq1s, fq2s, out=sys.stdout): - - for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")): - sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2)) - fq1 = nopen(fq1) - if fq2 != "NA": - fq2 = nopen(fq2) - q2_iter = izip(*[fq2] * 4) - else: - sys.stderr.write("WARNING: running bwameth in single-end mode\n") - q2_iter = repeat((None, None, None, None)) - q1_iter = izip(*[fq1] * 4) - - lt80 = 0 - for pair in izip(q1_iter, q2_iter): - for read_i, (name, seq, _, qual) in enumerate(pair): - if name is None: continue - name = name.rstrip("\r\n").split(" ")[0] - if name[0] != "@": - sys.stderr.write("""ERROR!!!! -ERROR!!! FASTQ conversion failed -ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@" -""" % name) - sys.exit(1) - if name.endswith(("_R1", "_R2")): - name = name[:-3] - elif name.endswith(("/1", "/2")): - name = name[:-2] - - seq = seq.upper().rstrip('\n') - if len(seq) < 80: - lt80 += 1 - - char_a, char_b = ['CT', 'GA'][read_i] - # keep original sequence as name. - name = " ".join((name, - "YS:Z:" + seq + - "\tYC:Z:" + char_a + char_b + '\n')) - seq = seq.replace(char_a, char_b) - out.write("".join((name, seq, "\n+\n", qual))) - - out.flush() - if lt80 > 50: - sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80) - sys.stderr.write(" : this program is designed for long reads\n") - return 0 - -def convert_fasta(ref_fasta, just_name=False): - out_fa = ref_fasta + ".bwameth.c2t" - if just_name: - return out_fa - msg = "c2t in %s to %s" % (ref_fasta, out_fa) - if is_newer_b(ref_fasta, out_fa): - sys.stderr.write("already converted: %s\n" % msg) - return out_fa - sys.stderr.write("converting %s\n" % msg) - try: - fh = open(out_fa, "w") - for header, seq in fasta_iter(ref_fasta): - ########### Reverse ###################### - fh.write(">r%s\n" % header) - - #if non_cpg_only: - # for ctx in "TAG": # use "ATC" for fwd - # seq = seq.replace('G' + ctx, "A" + ctx) - # for line in wrap(seq): - # print >>fh, line - #else: - for line in wrap(seq.replace("G", "A")): - fh.write(line + '\n') - - ########### Forward ###################### - fh.write(">f%s\n" % header) - for line in wrap(seq.replace("C", "T")): - fh.write(line + '\n') - fh.close() - except: - try: - fh.close() - except UnboundLocalError: - pass - os.unlink(out_fa) - raise - return out_fa - - -def bwa_index(fa): - if is_newer_b(fa, (fa + '.amb', fa + '.sa')): - return - sys.stderr.write("indexing: %s\n" % fa) - try: - run("bwa index -a bwtsw %s" % fa) - except: - if op.exists(fa + ".amb"): - os.unlink(fa + ".amb") - raise - -class Bam(object): - __slots__ = 'read flag chrom pos mapq cigar chrom_mate pos_mate tlen \ - seq qual other'.split() - def __init__(self, args): - for a, v in zip(self.__slots__[:11], args): - setattr(self, a, v) - self.other = args[11:] - self.flag = int(self.flag) - self.pos = int(self.pos) - self.tlen = int(float(self.tlen)) - - def __repr__(self): - return "Bam({chr}:{start}:{read}".format(chr=self.chrom, - start=self.pos, - read=self.read) - - def __str__(self): - return "\t".join(str(getattr(self, s)) for s in self.__slots__[:11]) \ - + "\t" + "\t".join(self.other) - - def is_first_read(self): - return bool(self.flag & 0x40) - - def is_second_read(self): - return bool(self.flag & 0x80) - - def is_plus_read(self): - return not (self.flag & 0x10) - - def is_minus_read(self): - return bool(self.flag & 0x10) - - def is_mapped(self): - return not (self.flag & 0x4) - - def cigs(self): - if self.cigar == "*": - yield (0, None) - raise StopIteration - cig_iter = groupby(self.cigar, lambda c: c.isdigit()) - for g, n in cig_iter: - yield int("".join(n)), "".join(next(cig_iter)[1]) - - def cig_len(self): - return sum(c[0] for c in self.cigs() if c[1] in - ("M", "D", "N", "EQ", "X", "P")) - - def left_shift(self): - left = 0 - for n, cig in self.cigs(): - if cig == "M": break - if cig == "H": - left += n - return left - - def right_shift(self): - right = 0 - for n, cig in reversed(list(self.cigs())): - if cig == "M": break - if cig == "H": - right += n - return -right or None - - @property - def original_seq(self): - try: - return next(x for x in self.other if x.startswith("YS:Z:"))[5:] - except: - sys.stderr.write(repr(self.other) + "\n") - sys.stderr.write(self.read + "\n") - raise - - @property - def ga_ct(self): - return [x for x in self.other if x.startswith("YC:Z:")] - - def longest_match(self, patt=re.compile("\d+M")): - return max(int(x[:-1]) for x in patt.findall(self.cigar)) - - -def rname(fq1, fq2=""): - fq1, fq2 = fq1.split(",")[0], fq2.split(",")[0] - def name(f): - n = op.basename(op.splitext(f)[0]) - if n.endswith('.fastq'): n = n[:-6] - if n.endswith(('.fq', '.r1', '.r2')): n = n[:-3] - return n - return "".join(a for a, b in zip(name(fq1), name(fq2)) if a == b) or 'bm' - - -def bwa_mem(fa, mfq, extra_args, threads=1, rg=None, - paired=True, set_as_failed=None): - conv_fa = convert_fasta(fa, just_name=True) - if not is_newer_b(conv_fa, (conv_fa + '.amb', conv_fa + '.sa')): - raise BWAMethException("first run bwameth.py index %s" % fa) - - if not rg is None and not rg.startswith('@RG'): - rg = '@RG\tID:{rg}\tSM:{rg}'.format(rg=rg) - - # penalize clipping and unpaired. lower penalty on mismatches (-B) - cmd = "|bwa mem -T 40 -B 2 -L 10 -CM " - - if paired: - cmd += ("-U 100 -p ") - cmd += "-R '{rg}' -t {threads} {extra_args} {conv_fa} {mfq}" - cmd = cmd.format(**locals()) - sys.stderr.write("running: %s\n" % cmd.lstrip("|")) - as_bam(cmd, fa, set_as_failed) - - -def as_bam(pfile, fa, set_as_failed=None): - """ - pfile: either a file or a |process to generate sam output - fa: the reference fasta - set_as_failed: None, 'f', or 'r'. If 'f'. Reads mapping to that strand - are given the sam flag of a failed QC alignment (0x200). - """ - sam_iter = nopen(pfile) - - for line in sam_iter: - if not line[0] == "@": break - handle_header(line) - else: - sys.stderr.flush() - raise Exception("bad or empty fastqs") - sam_iter2 = (x.rstrip().split("\t") for x in chain([line], sam_iter)) - for read_name, pair_list in groupby(sam_iter2, itemgetter(0)): - pair_list = [Bam(toks) for toks in pair_list] - - for aln in handle_reads(pair_list, set_as_failed): - sys.stdout.write(str(aln) + '\n') - -def handle_header(line, out=sys.stdout): - toks = line.rstrip().split("\t") - if toks[0].startswith("@SQ"): - sq, sn, ln = toks # @SQ SN:fchr11 LN:122082543 - # we have f and r, only print out f - chrom = sn.split(":")[1] - if chrom.startswith('r'): return - chrom = chrom[1:] - toks = ["%s\tSN:%s\t%s" % (sq, chrom, ln)] - if toks[0].startswith("@PG"): - #out.write("\t".join(toks) + "\n") - toks = ["@PG\tID:bwa-meth\tPN:bwa-meth\tVN:%s\tCL:\"%s\"" % ( - __version__, - " ".join(x.replace("\t", "\\t") for x in sys.argv))] - out.write("\t".join(toks) + "\n") - - -def handle_reads(alns, set_as_failed): - - for aln in alns: - orig_seq = aln.original_seq - assert len(aln.seq) == len(aln.qual), aln.read - # don't need this any more. - aln.other = [x for x in aln.other if not x.startswith('YS:Z')] - - # first letter of chrom is 'f' or 'r' - direction = aln.chrom[0] - aln.chrom = aln.chrom.lstrip('fr') - - if not aln.is_mapped(): - aln.seq = orig_seq - continue - - assert direction in 'fr', (direction, aln) - aln.other.append('YD:Z:' + direction) - - if set_as_failed == direction: - aln.flag |= 0x200 - - # here we have a heuristic that if the longest match is not 44% of the - # sequence length, we mark it as failed QC and un-pair it. At the end - # of the loop we set all members of this pair to be unmapped - if aln.longest_match() < (len(orig_seq) * 0.44): - aln.flag |= 0x200 # fail qc - aln.flag &= (~0x2) # un-pair - aln.mapq = min(int(aln.mapq), 1) - - mate_direction = aln.chrom_mate[0] - if mate_direction not in "*=": - aln.chrom_mate = aln.chrom_mate[1:] - - # adjust the original seq to the cigar - l, r = aln.left_shift(), aln.right_shift() - if aln.is_plus_read(): - aln.seq = orig_seq[l:r] - else: - aln.seq = comp(orig_seq[::-1][l:r]) - - if any(aln.flag & 0x200 for aln in alns): - for aln in alns: - aln.flag |= 0x200 - aln.flag &= (~0x2) - return alns - -def cnvs_main(args): - __doc__ = """ - calculate CNVs from BS-Seq bams or vcfs - """ - p = argparse.ArgumentParser(__doc__) - p.add_argument("--regions", help="optional target regions", default='NA') - p.add_argument("bams", nargs="+") - - a = p.parse_args(args) - r_script = """ -options(stringsAsFactors=FALSE) -suppressPackageStartupMessages(library(cn.mops)) -suppressPackageStartupMessages(library(snow)) -args = commandArgs(TRUE) -regions = args[1] -bams = args[2:length(args)] -n = length(bams) -if(is.na(regions)){ - bam_counts = getReadCountsFromBAM(bams, parallel=min(n, 4), mode="paired") - res = cn.mops(bam_counts, parallel=min(n, 4), priorImpact=20) -} else { - segments = read.delim(regions, header=FALSE) - gr = GRanges(segments[,1], IRanges(segments[,2], segments[,3])) - bam_counts = getSegmentReadCountsFromBAM(bams, GR=gr, mode="paired", parallel=min(n, 4)) - res = exomecn.mops(bam_counts, parallel=min(n, 4), priorImpact=20) -} -res = calcIntegerCopyNumbers(res) - -df = as.data.frame(cnvs(res)) -write.table(df, row.names=FALSE, quote=FALSE, sep="\t") -""" - with tempfile.NamedTemporaryFile(delete=True) as rfh: - rfh.write(r_script + '\n') - rfh.flush() - for d in reader('|Rscript {rs_name} {regions} {bams}'.format( - rs_name=rfh.name, regions=a.regions, bams=" ".join(a.bams)), - header=False): - print("\t".join(d)) - - -def convert_fqs(fqs): - script = __file__ - return "'<%s %s c2t %s %s'" % (sys.executable, script, fqs[0], - fqs[1] if len(fqs) > 1 - else ','.join(['NA'] * len(fqs[0].split(",")))) - -def main(args=sys.argv[1:]): - - if len(args) > 0 and args[0] == "index": - assert len(args) == 2, ("must specify fasta as 2nd argument") - sys.exit(bwa_index(convert_fasta(args[1]))) - - if len(args) > 0 and args[0] == "c2t": - sys.exit(convert_reads(args[1], args[2])) - - if len(args) > 0 and args[0] == "cnvs": - sys.exit(cnvs_main(args[1:])) - - p = argparse.ArgumentParser(__doc__) - p.add_argument("--reference", help="reference fasta", required=True) - p.add_argument("-t", "--threads", type=int, default=6) - p.add_argument("--read-group", help="read-group to add to bam in same" - " format as to bwa: '@RG\\tID:foo\\tSM:bar'") - p.add_argument('--set-as-failed', help="flag alignments to this strand" - " as not passing QC (0x200). Targetted BS-Seq libraries are often" - " to a single strand, so we can flag them as QC failures. Note" - " f == OT, r == OB. Likely, this will be 'f' as we will expect" - " reads to align to the original-bottom (OB) strand and will flag" - " as failed those aligning to the forward, or original top (OT).", - default=None, choices=('f', 'r')) - p.add_argument('--version', action='version', version='bwa-meth.py {}'.format(__version__)) - - p.add_argument("fastqs", nargs="+", help="bs-seq fastqs to align. Run" - "multiple sets separated by commas, e.g. ... a_R1.fastq,b_R1.fastq" - " a_R2.fastq,b_R2.fastq note that the order must be maintained.") - - args, pass_through_args = p.parse_known_args(args) - - # for the 2nd file. use G => A and bwa's support for streaming. - conv_fqs_cmd = convert_fqs(args.fastqs) - - bwa_mem(args.reference, conv_fqs_cmd, ' '.join(map(str, pass_through_args)), - threads=args.threads, rg=args.read_group or - rname(*args.fastqs), - paired=len(args.fastqs) == 2, - set_as_failed=args.set_as_failed) - -if __name__ == "__main__": - main(sys.argv[1:])
--- a/bwameth_wrapper.py Wed Sep 14 05:07:47 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -#!/usr/bin/env python -import argparse -import subprocess -import os -import shlex - -def createIndex(fname): - """ - Create an index for a file, return where it is - """ - d = os.mkdir("index_dir") - os.link(fname, "index_dir/genome.fa") - cmd = "{}/bwameth.py index {}/index_dir/genome.fa".format(os.path.dirname(__file__), os.getcwd()) - proc = subprocess.Popen(args=shlex.split(cmd)) - returncode = proc.wait() - if returncode != 0: - raise Exception("Error during '%s'" % cmd) - return "{}/index_dir/genome.fa".format(os.getcwd()) - - -parser = argparse.ArgumentParser(description="A wrapper around bwameth for Galaxy. There's minimal argument checking done") -parser.add_argument('-p', '--numThreads', type=int, default=4, help="number of threads") -parser.add_argument('--makeIndex', help="Given a fasta file, index it") -parser.add_argument('--premadeIndex', help="If an index already exists, this is the fasta file associated with it (the index files must be in the same directory") -parser.add_argument('--readGroup', help="The read group text, if desired") -parser.add_argument('files', nargs="+", help="Fasta files (possibly gzipped, if the file names end in .gz)") -args = parser.parse_args() - -if args.makeIndex: - ifile = createIndex(args.makeIndex) -else: - ifile = args.premadeIndex - -files = " ".join(['{}'.format(x) for x in args.files]) -if args.readGroup: - files = "{} --read-group '{}'".format(files, args.readGroup) - -cmd = "{}/bwameth.py -t {} --reference '{}' {} | samtools view -u - | samtools sort -@ {} - output".format(os.path.dirname(__file__), args.numThreads, ifile, files, args.numThreads) -of = open("temp.sh", "w") -of.write("{}\n".format(cmd)) -of.close() -try: - subprocess.check_call(["bash", "./temp.sh"]) -except: - raise Exception("Error during '%s'" % cmd)