Mercurial > repos > rnateam > bctools
comparison merge_pcr_duplicates.py @ 55:4bedd35bcdff draft
Move deps from macros to individual tools
author | rnateam |
---|---|
date | Fri, 12 Feb 2016 08:37:33 -0500 |
parents | 0b9aab6aaebf |
children | bbbae1ee87e0 |
comparison
equal
deleted
inserted
replaced
54:7a05b21c5629 | 55:4bedd35bcdff |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import logging | 4 import logging |
5 from sys import stdout | 5 from sys import stdout |
6 import pandas as pd | |
7 from subprocess import check_call | 6 from subprocess import check_call |
8 from shutil import rmtree | 7 from shutil import rmtree |
9 from tempfile import mkdtemp | 8 from tempfile import mkdtemp |
10 from os.path import isfile | 9 from os.path import isfile |
11 # avoid ugly python IOError when stdout output is piped into another program | 10 # avoid ugly python IOError when stdout output is piped into another program |
14 signal(SIGPIPE, SIG_DFL) | 13 signal(SIGPIPE, SIG_DFL) |
15 | 14 |
16 tool_description = """ | 15 tool_description = """ |
17 Merge PCR duplicates according to random barcode library. | 16 Merge PCR duplicates according to random barcode library. |
18 | 17 |
19 Barcodes containing uncalled base 'N' are removed. By default output is written | 18 Barcodes containing uncalled base 'N' are removed. |
20 to stdout. | |
21 | 19 |
22 Input: | 20 Input: |
23 * bed6 file containing alignments with fastq read-id in name field | 21 * bed6 file containing alignments with fastq read-id in name field |
24 * fastq library of random barcodes | 22 * fastq library of random barcodes |
25 | 23 |
28 score, sorted by fields chrom, start, stop, strand, name | 26 score, sorted by fields chrom, start, stop, strand, name |
29 | 27 |
30 Example usage: | 28 Example usage: |
31 - read PCR duplicates from file duplicates.bed and write merged results to file | 29 - read PCR duplicates from file duplicates.bed and write merged results to file |
32 merged.bed: | 30 merged.bed: |
33 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed | 31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed |
34 """ | 32 """ |
35 | 33 |
36 epilog = """ | 34 epilog = """ |
37 Author: Daniel Maticzka | 35 Author: Daniel Maticzka |
38 Copyright: 2015 | 36 Copyright: 2015 |
53 "bclib", | 51 "bclib", |
54 help="Path to fastq barcode library.") | 52 help="Path to fastq barcode library.") |
55 # optional arguments | 53 # optional arguments |
56 parser.add_argument( | 54 parser.add_argument( |
57 "-o", "--outfile", | 55 "-o", "--outfile", |
56 required=True, | |
58 help="Write results to this file.") | 57 help="Write results to this file.") |
59 # misc arguments | 58 # misc arguments |
60 parser.add_argument( | 59 parser.add_argument( |
61 "-v", "--verbose", | 60 "-v", "--verbose", |
62 help="Be verbose.", | 61 help="Be verbose.", |
102 | 101 |
103 try: | 102 try: |
104 tmpdir = mkdtemp() | 103 tmpdir = mkdtemp() |
105 logging.debug("tmpdir: " + tmpdir) | 104 logging.debug("tmpdir: " + tmpdir) |
106 | 105 |
107 # prepare barcode library | |
108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv" | |
109 check_call(syscall1, shell=True) | |
110 | |
111 # prepare alinments | 106 # prepare alinments |
112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv" | 107 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv" |
113 check_call(syscall2, shell=True) | 108 check_call(syscall2, shell=True) |
114 | 109 |
115 # join barcode library and alignments | 110 # join barcode library and alignments |
116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv" | 111 syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile |
112 # 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand' | |
117 check_call(syscall3, shell=True) | 113 check_call(syscall3, shell=True) |
118 | |
119 # get alignments combined with barcodes | |
120 bcalib = pd.read_csv( | |
121 tmpdir + "/bcalib.csv", | |
122 sep="\t", | |
123 names=["chrom", "start", "stop", "bc", "score", "strand"]) | |
124 finally: | 114 finally: |
125 logging.debug("removed tmpdir: " + tmpdir) | 115 logging.debug("removed tmpdir: " + tmpdir) |
126 rmtree(tmpdir) | 116 rmtree(tmpdir) |
127 | |
128 # fail if alignments given but combined library is empty | |
129 if bcalib.empty: | |
130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") | |
131 | |
132 # warn if not all alignments could be assigned a barcode | |
133 n_bcalib = len(bcalib.index) | |
134 if n_bcalib < n_alns: | |
135 logging.warning( | |
136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns)) | |
137 | |
138 # remove entries with barcodes that has uncalled base N | |
139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) | |
140 n_bcalib_cleaned = len(bcalib_cleaned) | |
141 # if n_bcalib_cleaned < n_bcalib: | |
142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( | |
143 # n_bcalib - n_bcalib_cleaned, n_bcalib) | |
144 # if n_bcalib_cleaned < (0.8 * n_bcalib): | |
145 # logging.warning(msg) | |
146 # else: | |
147 # logging.info(msg) | |
148 | |
149 # count and merge pcr duplicates | |
150 # grouping sorts by keys, so the ouput will be properly sorted | |
151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() | |
152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) | |
153 | |
154 # write coordinates of crosslinking event alignments | |
155 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) | |
156 merged.to_csv( | |
157 eventalnout, | |
158 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'], | |
159 sep="\t", index=False, header=False) | |
160 eventalnout.close() |