Mercurial > repos > rnateam > bctools
comparison merge_pcr_duplicates.py @ 2:de4ea3aa1090 draft
Uploaded
| author | rnateam |
|---|---|
| date | Thu, 22 Oct 2015 10:26:45 -0400 |
| parents | |
| children | 17ef0e0dae68 |
comparison
equal
deleted
inserted
replaced
| 1:ae0f58d3318f | 2:de4ea3aa1090 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 tool_description = """ | |
| 4 Merge PCR duplicates according to random barcode library. | |
| 5 | |
| 6 Barcodes containing uncalled base 'N' are removed. By default output is written | |
| 7 to stdout. | |
| 8 | |
| 9 Input: | |
| 10 * bed6 file containing alignments with fastq read-id in name field | |
| 11 * fasta library with fastq read-id as sequence ids | |
| 12 | |
| 13 Output: | |
| 14 * bed6 file with random barcode in name field and number of PCR duplicates as | |
| 15 score, sorted by fields chrom, start, stop, strand, name | |
| 16 | |
| 17 Example usage: | |
| 18 - read PCR duplicates from file duplicates.bed and write merged results to file | |
| 19 merged.bed: | |
| 20 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed | |
| 21 """ | |
| 22 | |
| 23 epilog = """ | |
| 24 Author: Daniel Maticzka | |
| 25 Copyright: 2015 | |
| 26 License: Apache | |
| 27 Email: maticzkd@informatik.uni-freiburg.de | |
| 28 Status: Testing | |
| 29 """ | |
| 30 | |
| 31 import argparse | |
| 32 import logging | |
| 33 from sys import stdout | |
| 34 from Bio import SeqIO | |
| 35 import pandas as pd | |
| 36 | |
| 37 # avoid ugly python IOError when stdout output is piped into another program | |
| 38 # and then truncated (such as piping to head) | |
| 39 from signal import signal, SIGPIPE, SIG_DFL | |
| 40 signal(SIGPIPE, SIG_DFL) | |
| 41 | |
| 42 | |
| 43 def fasta_tuple_generator(fasta_iterator): | |
| 44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects." | |
| 45 for record in input_seq_iterator: | |
| 46 yield (record.id, str(record.seq)) | |
| 47 | |
| 48 | |
| 49 # parse command line arguments | |
| 50 parser = argparse.ArgumentParser(description=tool_description, | |
| 51 epilog=epilog, | |
| 52 formatter_class=argparse.RawDescriptionHelpFormatter) | |
| 53 # positional arguments | |
| 54 parser.add_argument( | |
| 55 "alignments", | |
| 56 help="Path to bed6 file containing alignments.") | |
| 57 parser.add_argument( | |
| 58 "bclib", | |
| 59 help="Path to fasta barcode library.") | |
| 60 # optional arguments | |
| 61 parser.add_argument( | |
| 62 "-o", "--outfile", | |
| 63 help="Write results to this file.") | |
| 64 # misc arguments | |
| 65 parser.add_argument( | |
| 66 "-v", "--verbose", | |
| 67 help="Be verbose.", | |
| 68 action="store_true") | |
| 69 parser.add_argument( | |
| 70 "-d", "--debug", | |
| 71 help="Print lots of debugging information", | |
| 72 action="store_true") | |
| 73 parser.add_argument( | |
| 74 '--version', | |
| 75 action='version', | |
| 76 version='0.1.0') | |
| 77 | |
| 78 args = parser.parse_args() | |
| 79 | |
| 80 if args.debug: | |
| 81 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | |
| 82 elif args.verbose: | |
| 83 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | |
| 84 else: | |
| 85 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | |
| 86 logging.info("Parsed arguments:") | |
| 87 logging.info(" alignments: '{}'".format(args.alignments)) | |
| 88 logging.info(" bclib: '{}'".format(args.bclib)) | |
| 89 if args.outfile: | |
| 90 logging.info(" outfile: enabled writing to file") | |
| 91 logging.info(" outfile: '{}'".format(args.outfile)) | |
| 92 logging.info("") | |
| 93 | |
| 94 # load barcode library into dictionary | |
| 95 input_handle = open(args.bclib, "rU") | |
| 96 input_seq_iterator = SeqIO.parse(input_handle, "fasta") | |
| 97 bcs = pd.DataFrame.from_records( | |
| 98 data=fasta_tuple_generator(input_seq_iterator), | |
| 99 columns=["read_id", "bc"]) | |
| 100 | |
| 101 # load alignments | |
| 102 alns = pd.read_csv( | |
| 103 args.alignments, | |
| 104 sep="\t", | |
| 105 names=["chrom", "start", "stop", "read_id", "score", "strand"]) | |
| 106 | |
| 107 # combine barcode library and alignments | |
| 108 bcalib = pd.merge( | |
| 109 bcs, alns, | |
| 110 on="read_id", | |
| 111 how="inner", | |
| 112 sort=False) | |
| 113 if bcalib.empty: | |
| 114 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") | |
| 115 n_alns = len(alns.index) | |
| 116 n_bcalib = len(bcalib.index) | |
| 117 if n_bcalib < n_alns: | |
| 118 logging.warning( | |
| 119 "{} of {} alignments could not be associated with a random barcode.".format( | |
| 120 n_alns - n_bcalib, n_alns)) | |
| 121 | |
| 122 # remove entries with barcodes that has uncalled base N | |
| 123 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) | |
| 124 n_bcalib_cleaned = len(bcalib_cleaned) | |
| 125 if n_bcalib_cleaned < n_bcalib: | |
| 126 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( | |
| 127 n_bcalib - n_bcalib_cleaned, n_bcalib) | |
| 128 if n_bcalib_cleaned < (0.8 * n_bcalib): | |
| 129 logging.warning(msg) | |
| 130 else: | |
| 131 logging.info(msg) | |
| 132 | |
| 133 # count and merge pcr duplicates | |
| 134 # grouping sorts by keys, so the ouput will be properly sorted | |
| 135 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() | |
| 136 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) | |
| 137 | |
| 138 # write coordinates of crosslinking event alignments | |
| 139 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) | |
| 140 merged.to_csv( | |
| 141 eventalnout, | |
| 142 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'], | |
| 143 sep="\t", index=False, header=False) | |
| 144 eventalnout.close() |
