Mercurial > repos > rnateam > bctools
comparison merge_pcr_duplicates.py @ 55:4bedd35bcdff draft
Move deps from macros to individual tools
| author | rnateam |
|---|---|
| date | Fri, 12 Feb 2016 08:37:33 -0500 |
| parents | 0b9aab6aaebf |
| children | bbbae1ee87e0 |
comparison
equal
deleted
inserted
replaced
| 54:7a05b21c5629 | 55:4bedd35bcdff |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import logging | 4 import logging |
| 5 from sys import stdout | 5 from sys import stdout |
| 6 import pandas as pd | |
| 7 from subprocess import check_call | 6 from subprocess import check_call |
| 8 from shutil import rmtree | 7 from shutil import rmtree |
| 9 from tempfile import mkdtemp | 8 from tempfile import mkdtemp |
| 10 from os.path import isfile | 9 from os.path import isfile |
| 11 # avoid ugly python IOError when stdout output is piped into another program | 10 # avoid ugly python IOError when stdout output is piped into another program |
| 14 signal(SIGPIPE, SIG_DFL) | 13 signal(SIGPIPE, SIG_DFL) |
| 15 | 14 |
| 16 tool_description = """ | 15 tool_description = """ |
| 17 Merge PCR duplicates according to random barcode library. | 16 Merge PCR duplicates according to random barcode library. |
| 18 | 17 |
| 19 Barcodes containing uncalled base 'N' are removed. By default output is written | 18 Barcodes containing uncalled base 'N' are removed. |
| 20 to stdout. | |
| 21 | 19 |
| 22 Input: | 20 Input: |
| 23 * bed6 file containing alignments with fastq read-id in name field | 21 * bed6 file containing alignments with fastq read-id in name field |
| 24 * fastq library of random barcodes | 22 * fastq library of random barcodes |
| 25 | 23 |
| 28 score, sorted by fields chrom, start, stop, strand, name | 26 score, sorted by fields chrom, start, stop, strand, name |
| 29 | 27 |
| 30 Example usage: | 28 Example usage: |
| 31 - read PCR duplicates from file duplicates.bed and write merged results to file | 29 - read PCR duplicates from file duplicates.bed and write merged results to file |
| 32 merged.bed: | 30 merged.bed: |
| 33 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed | 31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed |
| 34 """ | 32 """ |
| 35 | 33 |
| 36 epilog = """ | 34 epilog = """ |
| 37 Author: Daniel Maticzka | 35 Author: Daniel Maticzka |
| 38 Copyright: 2015 | 36 Copyright: 2015 |
| 53 "bclib", | 51 "bclib", |
| 54 help="Path to fastq barcode library.") | 52 help="Path to fastq barcode library.") |
| 55 # optional arguments | 53 # optional arguments |
| 56 parser.add_argument( | 54 parser.add_argument( |
| 57 "-o", "--outfile", | 55 "-o", "--outfile", |
| 56 required=True, | |
| 58 help="Write results to this file.") | 57 help="Write results to this file.") |
| 59 # misc arguments | 58 # misc arguments |
| 60 parser.add_argument( | 59 parser.add_argument( |
| 61 "-v", "--verbose", | 60 "-v", "--verbose", |
| 62 help="Be verbose.", | 61 help="Be verbose.", |
| 102 | 101 |
| 103 try: | 102 try: |
| 104 tmpdir = mkdtemp() | 103 tmpdir = mkdtemp() |
| 105 logging.debug("tmpdir: " + tmpdir) | 104 logging.debug("tmpdir: " + tmpdir) |
| 106 | 105 |
| 107 # prepare barcode library | |
| 108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv" | |
| 109 check_call(syscall1, shell=True) | |
| 110 | |
| 111 # prepare alinments | 106 # prepare alinments |
| 112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv" | 107 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv" |
| 113 check_call(syscall2, shell=True) | 108 check_call(syscall2, shell=True) |
| 114 | 109 |
| 115 # join barcode library and alignments | 110 # join barcode library and alignments |
| 116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv" | 111 syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile |
| 112 # 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand' | |
| 117 check_call(syscall3, shell=True) | 113 check_call(syscall3, shell=True) |
| 118 | |
| 119 # get alignments combined with barcodes | |
| 120 bcalib = pd.read_csv( | |
| 121 tmpdir + "/bcalib.csv", | |
| 122 sep="\t", | |
| 123 names=["chrom", "start", "stop", "bc", "score", "strand"]) | |
| 124 finally: | 114 finally: |
| 125 logging.debug("removed tmpdir: " + tmpdir) | 115 logging.debug("removed tmpdir: " + tmpdir) |
| 126 rmtree(tmpdir) | 116 rmtree(tmpdir) |
| 127 | |
| 128 # fail if alignments given but combined library is empty | |
| 129 if bcalib.empty: | |
| 130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") | |
| 131 | |
| 132 # warn if not all alignments could be assigned a barcode | |
| 133 n_bcalib = len(bcalib.index) | |
| 134 if n_bcalib < n_alns: | |
| 135 logging.warning( | |
| 136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns)) | |
| 137 | |
| 138 # remove entries with barcodes that has uncalled base N | |
| 139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) | |
| 140 n_bcalib_cleaned = len(bcalib_cleaned) | |
| 141 # if n_bcalib_cleaned < n_bcalib: | |
| 142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( | |
| 143 # n_bcalib - n_bcalib_cleaned, n_bcalib) | |
| 144 # if n_bcalib_cleaned < (0.8 * n_bcalib): | |
| 145 # logging.warning(msg) | |
| 146 # else: | |
| 147 # logging.info(msg) | |
| 148 | |
| 149 # count and merge pcr duplicates | |
| 150 # grouping sorts by keys, so the ouput will be properly sorted | |
| 151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() | |
| 152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) | |
| 153 | |
| 154 # write coordinates of crosslinking event alignments | |
| 155 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) | |
| 156 merged.to_csv( | |
| 157 eventalnout, | |
| 158 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'], | |
| 159 sep="\t", index=False, header=False) | |
| 160 eventalnout.close() |
