comparison merge_pcr_duplicates.py @ 55:4bedd35bcdff draft

Move deps from macros to individual tools
author rnateam
date Fri, 12 Feb 2016 08:37:33 -0500
parents 0b9aab6aaebf
children bbbae1ee87e0
comparison
equal deleted inserted replaced
54:7a05b21c5629 55:4bedd35bcdff
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import logging 4 import logging
5 from sys import stdout 5 from sys import stdout
6 import pandas as pd
7 from subprocess import check_call 6 from subprocess import check_call
8 from shutil import rmtree 7 from shutil import rmtree
9 from tempfile import mkdtemp 8 from tempfile import mkdtemp
10 from os.path import isfile 9 from os.path import isfile
11 # avoid ugly python IOError when stdout output is piped into another program 10 # avoid ugly python IOError when stdout output is piped into another program
14 signal(SIGPIPE, SIG_DFL) 13 signal(SIGPIPE, SIG_DFL)
15 14
16 tool_description = """ 15 tool_description = """
17 Merge PCR duplicates according to random barcode library. 16 Merge PCR duplicates according to random barcode library.
18 17
19 Barcodes containing uncalled base 'N' are removed. By default output is written 18 Barcodes containing uncalled base 'N' are removed.
20 to stdout.
21 19
22 Input: 20 Input:
23 * bed6 file containing alignments with fastq read-id in name field 21 * bed6 file containing alignments with fastq read-id in name field
24 * fastq library of random barcodes 22 * fastq library of random barcodes
25 23
28 score, sorted by fields chrom, start, stop, strand, name 26 score, sorted by fields chrom, start, stop, strand, name
29 27
30 Example usage: 28 Example usage:
31 - read PCR duplicates from file duplicates.bed and write merged results to file 29 - read PCR duplicates from file duplicates.bed and write merged results to file
32 merged.bed: 30 merged.bed:
33 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed 31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed
34 """ 32 """
35 33
36 epilog = """ 34 epilog = """
37 Author: Daniel Maticzka 35 Author: Daniel Maticzka
38 Copyright: 2015 36 Copyright: 2015
53 "bclib", 51 "bclib",
54 help="Path to fastq barcode library.") 52 help="Path to fastq barcode library.")
55 # optional arguments 53 # optional arguments
56 parser.add_argument( 54 parser.add_argument(
57 "-o", "--outfile", 55 "-o", "--outfile",
56 required=True,
58 help="Write results to this file.") 57 help="Write results to this file.")
59 # misc arguments 58 # misc arguments
60 parser.add_argument( 59 parser.add_argument(
61 "-v", "--verbose", 60 "-v", "--verbose",
62 help="Be verbose.", 61 help="Be verbose.",
102 101
103 try: 102 try:
104 tmpdir = mkdtemp() 103 tmpdir = mkdtemp()
105 logging.debug("tmpdir: " + tmpdir) 104 logging.debug("tmpdir: " + tmpdir)
106 105
107 # prepare barcode library
108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
109 check_call(syscall1, shell=True)
110
111 # prepare alinments 106 # prepare alinments
112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv" 107 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv"
113 check_call(syscall2, shell=True) 108 check_call(syscall2, shell=True)
114 109
115 # join barcode library and alignments 110 # join barcode library and alignments
116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv" 111 syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile
112 # 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'
117 check_call(syscall3, shell=True) 113 check_call(syscall3, shell=True)
118
119 # get alignments combined with barcodes
120 bcalib = pd.read_csv(
121 tmpdir + "/bcalib.csv",
122 sep="\t",
123 names=["chrom", "start", "stop", "bc", "score", "strand"])
124 finally: 114 finally:
125 logging.debug("removed tmpdir: " + tmpdir) 115 logging.debug("removed tmpdir: " + tmpdir)
126 rmtree(tmpdir) 116 rmtree(tmpdir)
127
128 # fail if alignments given but combined library is empty
129 if bcalib.empty:
130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
131
132 # warn if not all alignments could be assigned a barcode
133 n_bcalib = len(bcalib.index)
134 if n_bcalib < n_alns:
135 logging.warning(
136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
137
138 # remove entries with barcodes that has uncalled base N
139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
140 n_bcalib_cleaned = len(bcalib_cleaned)
141 # if n_bcalib_cleaned < n_bcalib:
142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
143 # n_bcalib - n_bcalib_cleaned, n_bcalib)
144 # if n_bcalib_cleaned < (0.8 * n_bcalib):
145 # logging.warning(msg)
146 # else:
147 # logging.info(msg)
148
149 # count and merge pcr duplicates
150 # grouping sorts by keys, so the ouput will be properly sorted
151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
153
154 # write coordinates of crosslinking event alignments
155 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
156 merged.to_csv(
157 eventalnout,
158 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
159 sep="\t", index=False, header=False)
160 eventalnout.close()