annotate merge_pcr_duplicates.py @ 34:5c46d375b837 draft

pybedtools 0.7.4
author rnateam
date Fri, 18 Dec 2015 03:04:46 -0500
parents 570a7de9f151
children 0b9aab6aaebf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
3 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
4 Merge PCR duplicates according to random barcode library.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
5
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
6 Barcodes containing uncalled base 'N' are removed. By default output is written
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
7 to stdout.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
8
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
9 Input:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
10 * bed6 file containing alignments with fastq read-id in name field
8
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
11 * fastq library of random barcodes
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
12
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
13 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
14 * bed6 file with random barcode in name field and number of PCR duplicates as
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15 score, sorted by fields chrom, start, stop, strand, name
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18 - read PCR duplicates from file duplicates.bed and write merged results to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 merged.bed:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 import argparse
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 import logging
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 from sys import stdout
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 from Bio import SeqIO
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 import pandas as pd
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 # avoid ugly python IOError when stdout output is piped into another program
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 # and then truncated (such as piping to head)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 from signal import signal, SIGPIPE, SIG_DFL
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 signal(SIGPIPE, SIG_DFL)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43 def fasta_tuple_generator(fasta_iterator):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 for record in input_seq_iterator:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 yield (record.id, str(record.seq))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
52 formatter_class=argparse.RawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
54 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 "alignments",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 help="Path to bed6 file containing alignments.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 "bclib",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59 help="Path to fasta barcode library.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
60 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
61 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 help="Write results to this file.")
8
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
64 parser.add_argument(
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
65 "--fasta-library",
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
66 dest="fasta_library",
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
67 action="store_true",
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
68 help="Read random barcode library as fasta format.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 # misc arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 action='version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 version='0.1.0')
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
88 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
89 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
90 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
91 logging.info("Parsed arguments:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92 logging.info(" alignments: '{}'".format(args.alignments))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
93 logging.info(" bclib: '{}'".format(args.bclib))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
94 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
95 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
96 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
97 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
98
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
99 # load barcode library into dictionary
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
100 input_handle = open(args.bclib, "rU")
8
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
101 if args.fasta_library:
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
102 input_seq_iterator = SeqIO.parse(input_handle, "fasta")
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
103 else:
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
104 input_seq_iterator = SeqIO.parse(input_handle, "fastq")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
105 bcs = pd.DataFrame.from_records(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
106 data=fasta_tuple_generator(input_seq_iterator),
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
107 columns=["read_id", "bc"])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
108
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
109 # load alignments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
110 alns = pd.read_csv(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
111 args.alignments,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
112 sep="\t",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
113 names=["chrom", "start", "stop", "read_id", "score", "strand"])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
114
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 8
diff changeset
115 # keep id parts up to first whitespace
570a7de9f151 read from bam; fix header issue
rnateam
parents: 8
diff changeset
116 alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
570a7de9f151 read from bam; fix header issue
rnateam
parents: 8
diff changeset
117
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
118 # combine barcode library and alignments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
119 bcalib = pd.merge(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
120 bcs, alns,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
121 on="read_id",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
122 how="inner",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
123 sort=False)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
124 if bcalib.empty:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
125 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
126 n_alns = len(alns.index)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
127 n_bcalib = len(bcalib.index)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
128 if n_bcalib < n_alns:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
129 logging.warning(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
130 "{} of {} alignments could not be associated with a random barcode.".format(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
131 n_alns - n_bcalib, n_alns))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
132
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
133 # remove entries with barcodes that has uncalled base N
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
134 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
135 n_bcalib_cleaned = len(bcalib_cleaned)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
136 if n_bcalib_cleaned < n_bcalib:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
137 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
138 n_bcalib - n_bcalib_cleaned, n_bcalib)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
139 if n_bcalib_cleaned < (0.8 * n_bcalib):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
140 logging.warning(msg)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
141 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
142 logging.info(msg)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
143
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
144 # count and merge pcr duplicates
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
145 # grouping sorts by keys, so the ouput will be properly sorted
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
146 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
147 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
148
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
149 # write coordinates of crosslinking event alignments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
150 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
151 merged.to_csv(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
152 eventalnout,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
153 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
154 sep="\t", index=False, header=False)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
155 eventalnout.close()