comparison merge_pcr_duplicates.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 17ef0e0dae68
comparison
equal deleted inserted replaced
1:ae0f58d3318f 2:de4ea3aa1090
1 #!/usr/bin/env python
2
3 tool_description = """
4 Merge PCR duplicates according to random barcode library.
5
6 Barcodes containing uncalled base 'N' are removed. By default output is written
7 to stdout.
8
9 Input:
10 * bed6 file containing alignments with fastq read-id in name field
11 * fasta library with fastq read-id as sequence ids
12
13 Output:
14 * bed6 file with random barcode in name field and number of PCR duplicates as
15 score, sorted by fields chrom, start, stop, strand, name
16
17 Example usage:
18 - read PCR duplicates from file duplicates.bed and write merged results to file
19 merged.bed:
20 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
21 """
22
23 epilog = """
24 Author: Daniel Maticzka
25 Copyright: 2015
26 License: Apache
27 Email: maticzkd@informatik.uni-freiburg.de
28 Status: Testing
29 """
30
31 import argparse
32 import logging
33 from sys import stdout
34 from Bio import SeqIO
35 import pandas as pd
36
37 # avoid ugly python IOError when stdout output is piped into another program
38 # and then truncated (such as piping to head)
39 from signal import signal, SIGPIPE, SIG_DFL
40 signal(SIGPIPE, SIG_DFL)
41
42
43 def fasta_tuple_generator(fasta_iterator):
44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
45 for record in input_seq_iterator:
46 yield (record.id, str(record.seq))
47
48
49 # parse command line arguments
50 parser = argparse.ArgumentParser(description=tool_description,
51 epilog=epilog,
52 formatter_class=argparse.RawDescriptionHelpFormatter)
53 # positional arguments
54 parser.add_argument(
55 "alignments",
56 help="Path to bed6 file containing alignments.")
57 parser.add_argument(
58 "bclib",
59 help="Path to fasta barcode library.")
60 # optional arguments
61 parser.add_argument(
62 "-o", "--outfile",
63 help="Write results to this file.")
64 # misc arguments
65 parser.add_argument(
66 "-v", "--verbose",
67 help="Be verbose.",
68 action="store_true")
69 parser.add_argument(
70 "-d", "--debug",
71 help="Print lots of debugging information",
72 action="store_true")
73 parser.add_argument(
74 '--version',
75 action='version',
76 version='0.1.0')
77
78 args = parser.parse_args()
79
80 if args.debug:
81 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
82 elif args.verbose:
83 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
84 else:
85 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
86 logging.info("Parsed arguments:")
87 logging.info(" alignments: '{}'".format(args.alignments))
88 logging.info(" bclib: '{}'".format(args.bclib))
89 if args.outfile:
90 logging.info(" outfile: enabled writing to file")
91 logging.info(" outfile: '{}'".format(args.outfile))
92 logging.info("")
93
94 # load barcode library into dictionary
95 input_handle = open(args.bclib, "rU")
96 input_seq_iterator = SeqIO.parse(input_handle, "fasta")
97 bcs = pd.DataFrame.from_records(
98 data=fasta_tuple_generator(input_seq_iterator),
99 columns=["read_id", "bc"])
100
101 # load alignments
102 alns = pd.read_csv(
103 args.alignments,
104 sep="\t",
105 names=["chrom", "start", "stop", "read_id", "score", "strand"])
106
107 # combine barcode library and alignments
108 bcalib = pd.merge(
109 bcs, alns,
110 on="read_id",
111 how="inner",
112 sort=False)
113 if bcalib.empty:
114 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
115 n_alns = len(alns.index)
116 n_bcalib = len(bcalib.index)
117 if n_bcalib < n_alns:
118 logging.warning(
119 "{} of {} alignments could not be associated with a random barcode.".format(
120 n_alns - n_bcalib, n_alns))
121
122 # remove entries with barcodes that has uncalled base N
123 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
124 n_bcalib_cleaned = len(bcalib_cleaned)
125 if n_bcalib_cleaned < n_bcalib:
126 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
127 n_bcalib - n_bcalib_cleaned, n_bcalib)
128 if n_bcalib_cleaned < (0.8 * n_bcalib):
129 logging.warning(msg)
130 else:
131 logging.info(msg)
132
133 # count and merge pcr duplicates
134 # grouping sorts by keys, so the ouput will be properly sorted
135 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
136 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
137
138 # write coordinates of crosslinking event alignments
139 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
140 merged.to_csv(
141 eventalnout,
142 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
143 sep="\t", index=False, header=False)
144 eventalnout.close()