bctools: merge_pcr_duplicates.py comparison

comparison merge_pcr_duplicates.py @ 55:4bedd35bcdff draft

Move deps from macros to individual tools

author	rnateam
date	Fri, 12 Feb 2016 08:37:33 -0500
parents	0b9aab6aaebf
children	bbbae1ee87e0

comparison

equal deleted inserted replaced

-:7a05b21c5629
+:4bedd35bcdff
 #!/usr/bin/env python
 import argparse
 import logging
 from sys import stdout
-import pandas as pd
 from subprocess import check_call
 from shutil import rmtree
 from tempfile import mkdtemp
 from os.path import isfile
 # avoid ugly python IOError when stdout output is piped into another program
 signal(SIGPIPE, SIG_DFL)
 tool_description = """
 Merge PCR duplicates according to random barcode library.
-Barcodes containing uncalled base 'N' are removed. By default output is written
+Barcodes containing uncalled base 'N' are removed.
-to stdout.
 Input:
 * bed6 file containing alignments with fastq read-id in name field
 * fastq library of random barcodes
 score, sorted by fields chrom, start, stop, strand, name
 Example usage:
 - read PCR duplicates from file duplicates.bed and write merged results to file
 merged.bed:
-merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
+merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed
 """
 epilog = """
 Author: Daniel Maticzka
 Copyright: 2015
 "bclib",
 help="Path to fastq barcode library.")
 # optional arguments
 parser.add_argument(
 "-o", "--outfile",
+required=True,
 help="Write results to this file.")
 # misc arguments
 parser.add_argument(
 "-v", "--verbose",
 help="Be verbose.",
 try:
 tmpdir = mkdtemp()
 logging.debug("tmpdir: " + tmpdir)
-# prepare barcode library
-syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
-check_call(syscall1, shell=True)
 # prepare alinments
-syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
+syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv"
 check_call(syscall2, shell=True)
 # join barcode library and alignments
-syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
+syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile
+# 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'
 check_call(syscall3, shell=True)
-# get alignments combined with barcodes
-bcalib = pd.read_csv(
-tmpdir + "/bcalib.csv",
-sep="\t",
-names=["chrom", "start", "stop", "bc", "score", "strand"])
 finally:
 logging.debug("removed tmpdir: " + tmpdir)
 rmtree(tmpdir)
-# fail if alignments given but combined library is empty
-if bcalib.empty:
-raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
-# warn if not all alignments could be assigned a barcode
-n_bcalib = len(bcalib.index)
-if n_bcalib < n_alns:
-logging.warning(
-"{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
-# remove entries with barcodes that has uncalled base N
-bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
-n_bcalib_cleaned = len(bcalib_cleaned)
-# if n_bcalib_cleaned < n_bcalib:
-#     msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
-#         n_bcalib - n_bcalib_cleaned, n_bcalib)
-#     if n_bcalib_cleaned < (0.8 * n_bcalib):
-#         logging.warning(msg)
-#     else:
-#         logging.info(msg)
-# count and merge pcr duplicates
-# grouping sorts by keys, so the ouput will be properly sorted
-merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
-merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
-# write coordinates of crosslinking event alignments
-eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
-merged.to_csv(
-eventalnout,
-columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
-sep="\t", index=False, header=False)
-eventalnout.close()

Mercurial > repos > rnateam > bctools

comparison merge_pcr_duplicates.py @ 55:4bedd35bcdff draft