2
|
1 #!/usr/bin/env python
|
|
2
|
|
3 tool_description = """
|
|
4 Merge PCR duplicates according to random barcode library.
|
|
5
|
|
6 Barcodes containing uncalled base 'N' are removed. By default output is written
|
|
7 to stdout.
|
|
8
|
|
9 Input:
|
|
10 * bed6 file containing alignments with fastq read-id in name field
|
8
|
11 * fastq library of random barcodes
|
2
|
12
|
|
13 Output:
|
|
14 * bed6 file with random barcode in name field and number of PCR duplicates as
|
|
15 score, sorted by fields chrom, start, stop, strand, name
|
|
16
|
|
17 Example usage:
|
|
18 - read PCR duplicates from file duplicates.bed and write merged results to file
|
|
19 merged.bed:
|
|
20 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
|
|
21 """
|
|
22
|
|
23 epilog = """
|
|
24 Author: Daniel Maticzka
|
|
25 Copyright: 2015
|
|
26 License: Apache
|
|
27 Email: maticzkd@informatik.uni-freiburg.de
|
|
28 Status: Testing
|
|
29 """
|
|
30
|
|
31 import argparse
|
|
32 import logging
|
|
33 from sys import stdout
|
|
34 from Bio import SeqIO
|
|
35 import pandas as pd
|
|
36
|
|
37 # avoid ugly python IOError when stdout output is piped into another program
|
|
38 # and then truncated (such as piping to head)
|
|
39 from signal import signal, SIGPIPE, SIG_DFL
|
|
40 signal(SIGPIPE, SIG_DFL)
|
|
41
|
|
42
|
|
43 def fasta_tuple_generator(fasta_iterator):
|
|
44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
|
|
45 for record in input_seq_iterator:
|
|
46 yield (record.id, str(record.seq))
|
|
47
|
|
48
|
|
49 # parse command line arguments
|
|
50 parser = argparse.ArgumentParser(description=tool_description,
|
|
51 epilog=epilog,
|
|
52 formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
53 # positional arguments
|
|
54 parser.add_argument(
|
|
55 "alignments",
|
|
56 help="Path to bed6 file containing alignments.")
|
|
57 parser.add_argument(
|
|
58 "bclib",
|
|
59 help="Path to fasta barcode library.")
|
|
60 # optional arguments
|
|
61 parser.add_argument(
|
|
62 "-o", "--outfile",
|
|
63 help="Write results to this file.")
|
8
|
64 parser.add_argument(
|
|
65 "--fasta-library",
|
|
66 dest="fasta_library",
|
|
67 action="store_true",
|
|
68 help="Read random barcode library as fasta format.")
|
2
|
69 # misc arguments
|
|
70 parser.add_argument(
|
|
71 "-v", "--verbose",
|
|
72 help="Be verbose.",
|
|
73 action="store_true")
|
|
74 parser.add_argument(
|
|
75 "-d", "--debug",
|
|
76 help="Print lots of debugging information",
|
|
77 action="store_true")
|
|
78 parser.add_argument(
|
|
79 '--version',
|
|
80 action='version',
|
|
81 version='0.1.0')
|
|
82
|
|
83 args = parser.parse_args()
|
|
84
|
|
85 if args.debug:
|
|
86 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
87 elif args.verbose:
|
|
88 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
89 else:
|
|
90 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
91 logging.info("Parsed arguments:")
|
|
92 logging.info(" alignments: '{}'".format(args.alignments))
|
|
93 logging.info(" bclib: '{}'".format(args.bclib))
|
|
94 if args.outfile:
|
|
95 logging.info(" outfile: enabled writing to file")
|
|
96 logging.info(" outfile: '{}'".format(args.outfile))
|
|
97 logging.info("")
|
|
98
|
|
99 # load barcode library into dictionary
|
|
100 input_handle = open(args.bclib, "rU")
|
8
|
101 if args.fasta_library:
|
|
102 input_seq_iterator = SeqIO.parse(input_handle, "fasta")
|
|
103 else:
|
|
104 input_seq_iterator = SeqIO.parse(input_handle, "fastq")
|
2
|
105 bcs = pd.DataFrame.from_records(
|
|
106 data=fasta_tuple_generator(input_seq_iterator),
|
|
107 columns=["read_id", "bc"])
|
|
108
|
|
109 # load alignments
|
|
110 alns = pd.read_csv(
|
|
111 args.alignments,
|
|
112 sep="\t",
|
|
113 names=["chrom", "start", "stop", "read_id", "score", "strand"])
|
|
114
|
14
|
115 # keep id parts up to first whitespace
|
|
116 alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
|
|
117
|
2
|
118 # combine barcode library and alignments
|
|
119 bcalib = pd.merge(
|
|
120 bcs, alns,
|
|
121 on="read_id",
|
|
122 how="inner",
|
|
123 sort=False)
|
|
124 if bcalib.empty:
|
|
125 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
|
|
126 n_alns = len(alns.index)
|
|
127 n_bcalib = len(bcalib.index)
|
|
128 if n_bcalib < n_alns:
|
|
129 logging.warning(
|
|
130 "{} of {} alignments could not be associated with a random barcode.".format(
|
|
131 n_alns - n_bcalib, n_alns))
|
|
132
|
|
133 # remove entries with barcodes that has uncalled base N
|
|
134 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
|
|
135 n_bcalib_cleaned = len(bcalib_cleaned)
|
|
136 if n_bcalib_cleaned < n_bcalib:
|
|
137 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
|
|
138 n_bcalib - n_bcalib_cleaned, n_bcalib)
|
|
139 if n_bcalib_cleaned < (0.8 * n_bcalib):
|
|
140 logging.warning(msg)
|
|
141 else:
|
|
142 logging.info(msg)
|
|
143
|
|
144 # count and merge pcr duplicates
|
|
145 # grouping sorts by keys, so the ouput will be properly sorted
|
|
146 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
|
|
147 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
|
|
148
|
|
149 # write coordinates of crosslinking event alignments
|
|
150 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
|
|
151 merged.to_csv(
|
|
152 eventalnout,
|
|
153 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
|
|
154 sep="\t", index=False, header=False)
|
|
155 eventalnout.close()
|