comparison rm_spurious_events.py @ 50:0b9aab6aaebf draft
Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
| author | rnateam |
|---|---|
| date | Tue, 26 Jan 2016 04:38:27 -0500 |
| parents | de4ea3aa1090 |
| children | |
| 49:303f6402a035 | 50:0b9aab6aaebf |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | |
| 3 import argparse | |
| 4 import logging | |
| 5 from subprocess import check_call | |
| 6 import os | |
| 2 | 7 |
| 3 tool_description = """ | 8 tool_description = """ |
| 4 Remove spurious events originating from errors in random sequence tags. | 9 Remove spurious events originating from errors in random sequence tags. |
| 5 | 10 |
| 6 This script compares all events sharing the same coordinates. Among each group | 11 This script compares all events sharing the same coordinates. Among each group |
| 7 of events the maximum number of PCR duplicates is determined. All events that | 12 of events the maximum number of PCR duplicates is determined. All events that |
| 8 are supported by less than 10 percent of this maximum count are removed. | 13 are supported by less than 10 percent of this maximum count are removed. |
| 9 | |
| 10 By default output is written to stdout. | |
| 11 | 14 |
| 12 Input: | 15 Input: |
| 13 * bed6 file containing crosslinking events with score field set to number of PCR | 16 * bed6 file containing crosslinking events with score field set to number of PCR |
| 14 duplicates | 17 duplicates |
| 15 | 18 |
| 16 Output: | 19 Output: |
| 17 * bed6 file with spurious crosslinking events removed, sorted by fields chrom, | 20 * bed6 file with spurious crosslinking events removed, sorted by fields chrom, |
| 18 start, stop, strand | 21 start, stop, strand |
| 19 | 22 |
| 20 Example usage: | 23 Example usage: |
| 21 - remove spurious events from spurious.bed and write results to file cleaned.bed | 24 - remove spurious events from spurious.bed and write results to file cleaned.bed |
| 22 rm_spurious_events.py spurious.bed --out cleaned.bed | 25 rm_spurious_events.py spurious.bed --outfile cleaned.bed |
| 23 """ | 26 """ |
| 24 | 27 |
| 25 epilog = """ | 28 epilog = """ |
| 26 Author: Daniel Maticzka | 29 Author: Daniel Maticzka |
| 27 Copyright: 2015 | 30 Copyright: 2015 |
| 28 License: Apache | 31 License: Apache |
| 29 Email: maticzkd@informatik.uni-freiburg.de | 32 Email: maticzkd@informatik.uni-freiburg.de |
| 30 Status: Testing | 33 Status: Testing |
| 31 """ | 34 """ |
| 32 | 35 |
| 33 import argparse | |
| 34 import logging | |
| 35 from sys import stdout | |
| 36 import pandas as pd | |
| 37 | |
| 38 | 36 |
| 39 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, | 37 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, |
| 40 argparse.RawDescriptionHelpFormatter): | 38 argparse.RawDescriptionHelpFormatter): |
| 41 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter | 39 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter |
| 42 pass | 40 pass |
| 43 | 41 |
| 44 # avoid ugly python IOError when stdout output is piped into another program | |
| 45 # and then truncated (such as piping to head) | |
| 46 from signal import signal, SIGPIPE, SIG_DFL | |
| 47 signal(SIGPIPE, SIG_DFL) | |
| 48 | 42 |
| 49 # parse command line arguments | 43 def main(): |
| 50 parser = argparse.ArgumentParser(description=tool_description, | 44 # parse command line arguments |
| 51 epilog=epilog, | 45 parser = argparse.ArgumentParser(description=tool_description, |
| 52 formatter_class=DefaultsRawDescriptionHelpFormatter) | 46 epilog=epilog, |
| 53 # positional arguments | 47 formatter_class=DefaultsRawDescriptionHelpFormatter) |
| 54 parser.add_argument( | 48 # positional arguments |
| 55 "events", | 49 parser.add_argument( |
| 56 help="Path to bed6 file containing alignments.") | 50 "events", |
| 57 # optional arguments | 51 help="Path to bed6 file containing alignments.") |
| 58 parser.add_argument( | 52 # optional arguments |
| 59 "-o", "--outfile", | 53 parser.add_argument( |
| 60 help="Write results to this file.") | 54 "-o", "--outfile", |
| 61 parser.add_argument( | 55 required=True, |
| 62 "-t", "--threshold", | 56 help="Write results to this file.") |
| 63 type=float, | 57 parser.add_argument( |
| 64 default=0.1, | 58 "-t", "--threshold", |
| 65 help="Threshold for spurious event removal." | 59 type=float, |
| 66 ) | 60 default=0.1, |
| 67 # misc arguments | 61 help="Threshold for spurious event removal." |
| 68 parser.add_argument( | 62 ) |
| 69 "-v", "--verbose", | 63 # misc arguments |
| 70 help="Be verbose.", | 64 parser.add_argument( |
| 71 action="store_true") | 65 "-v", "--verbose", |
| 72 parser.add_argument( | 66 help="Be verbose.", |
| 73 "-d", "--debug", | 67 action="store_true") |
| 74 help="Print lots of debugging information", | 68 parser.add_argument( |
| 75 action="store_true") | 69 "-d", "--debug", |
| 76 parser.add_argument( | 70 help="Print lots of debugging information", |
| 77 '--version', | 71 action="store_true") |
| 78 action='version', | 72 parser.add_argument( |
| 79 version='0.1.0') | 73 '--version', |
| 74 action='version', | |
| 75 version='0.1.0') | |
| 80 | 76 |
| 81 args = parser.parse_args() | 77 args = parser.parse_args() |
| 82 | 78 |
| 83 if args.debug: | 79 if args.debug: |
| 84 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | 80 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") |
| 85 elif args.verbose: | 81 elif args.verbose: |
| 86 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | 82 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") |
| 87 else: | 83 else: |
| 88 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | 84 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") |
| 89 logging.info("Parsed arguments:") | 85 logging.info("Parsed arguments:") |
| 90 logging.info(" alignments: '{}'".format(args.events)) | 86 logging.info(" alignments: '{}'".format(args.events)) |
| 91 logging.info(" threshold: '{}'".format(args.threshold)) | 87 logging.info(" threshold: '{}'".format(args.threshold)) |
| 92 if args.outfile: | 88 if args.outfile: |
| 93 logging.info(" outfile: enabled writing to file") | 89 logging.info(" outfile: enabled writing to file") |
| 94 logging.info(" outfile: '{}'".format(args.outfile)) | 90 logging.info(" outfile: '{}'".format(args.outfile)) |
| 95 logging.info("") | 91 logging.info("") |
| 96 | 92 |
| 97 # check threshold parameter value | 93 # check threshold parameter value |
| 98 if args.threshold < 0 or args.threshold > 1: | 94 if args.threshold < 0 or args.threshold > 1: |
| 99 raise ValueError("Threshold must be in [0,1].") | 95 raise ValueError("Threshold must be in [0,1].") |
| 100 | 96 |
| 101 # load alignments | 97 if not os.path.isfile(args.events): |
| 102 alns = pd.read_csv( | 98 raise Exception("ERROR: file '{}' not found.".format(args.events)) |
| 103 args.events, | |
| 104 sep="\t", | |
| 105 names=["chrom", "start", "stop", "read_id", "score", "strand"]) | |
| 106 | 99 |
| 107 # remove all alignments that do not have enough PCR duplicates with respect to | 100 # sort events, remove spurious ones via rm_spurious_events.pl, sort the result |
| 108 # the group maximum | 101 syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile |
| 109 grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False) | 102 check_call(syscall, shell=True) |
| 110 alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()]) | |
| 111 | 103 |
| 112 # write coordinates of crosslinking event alignments | 104 |
| 113 alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout) | 105 if __name__ == "__main__": |
| 114 alns_cleaned.to_csv( | 106 main() |
| 115 alns_cleaned_out, | |
| 116 columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'], | |
| 117 sep="\t", index=False, header=False) | |
| 118 alns_cleaned_out.close() | |
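
For reference, the rule both revisions implement is the fraction-of-maximum filter described in the docstring: among all events sharing the same chrom, start, stop, and strand, the maximum PCR duplicate count is determined, and events scoring below the threshold fraction of that maximum are dropped. The sketch below reproduces that rule with pandas, mirroring the in-process logic removed in this revision; the file names `spurious.bed` and `cleaned.bed` and the 0.1 threshold are illustrative assumptions, not part of the changeset.

```python
# Minimal sketch of the spurious-event filter described in the docstring,
# mirroring the pandas logic removed in revision 50. File names and the
# threshold value are illustrative assumptions.
import pandas as pd

THRESHOLD = 0.1  # keep events scoring >= 10% of the group's maximum duplicate count

# bed6 input; the score column holds the number of PCR duplicates per event
events = pd.read_csv(
    "spurious.bed", sep="\t",
    names=["chrom", "start", "stop", "read_id", "score", "strand"])

# group events sharing the same coordinates and strand, then keep only those
# supported by at least THRESHOLD * (maximum score within the group)
grouped = events.groupby(["chrom", "start", "stop", "strand"], group_keys=False)
cleaned = grouped.apply(lambda g: g[g["score"] >= THRESHOLD * g["score"].max()])

# write the surviving events back out as bed6
cleaned.to_csv(
    "cleaned.bed", sep="\t", index=False, header=False,
    columns=["chrom", "start", "stop", "read_id", "score", "strand"])
```

Revision 50 replaces this in-process approach with a shell pipeline that sorts the input bed file, delegates the same filtering to `rm_spurious_events.pl --frac_max`, and sorts the result before writing it to the now-required `--outfile`.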
