Mercurial > repos > brenninc > umicount
comparison dedup_barcode_fingerprint.py @ 0:6d2ac5ec56c8 draft default tip
Uploaded first file
| author | brenninc |
|---|---|
| date | Thu, 24 Mar 2016 09:41:40 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6d2ac5ec56c8 |
|---|---|
| 1 """ | |
| 2 .. module:: fingerprint | |
| 3 :platform: Unix | |
| 4 :synopsis: Use UMI to count transcripts | |
| 5 :version: 1.0 | |
| 6 | |
| 7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> | |
| 8 | |
| 9 .. source: https://github.com/mmendez12/umicount | |
| 10 | |
| 11 """ | |
| 12 | |
| 13 import csv | |
| 14 import itertools | |
| 15 import subprocess | |
| 16 import argparse | |
| 17 import tempfile | |
| 18 import os | |
| 19 import shutil | |
| 20 from collections import defaultdict | |
| 21 | |
| 22 import bed12 | |
| 23 | |
| 24 | |
| 25 def get_fingerprint(read): | |
| 26 """Get fingerprint id from the read's name. It assumes that the read's name | |
| 27 contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id. | |
| 28 | |
| 29 Args: | |
| 30 read: A list of twelve elements where each element refers to a field in the BED format. | |
| 31 | |
| 32 Returns: | |
| 33 A string containing the fingerprint id | |
| 34 | |
| 35 >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] | |
| 36 >>> get_fingerprint(read) | |
| 37 '0012' | |
| 38 """ | |
| 39 return read[3].split('FP:')[1].split(';')[0] | |
| 40 | |
| 41 | |
| 42 def get_barcode(read): | |
| 43 """Get barcode from the read's name. It assumes that the read's name | |
| 44 contains the following pattern *BC:XXX;* where *XXX* is the actual barcode. | |
| 45 | |
| 46 Args: | |
| 47 read: A list of twelve elements where each element refers to a field in the BED format. | |
| 48 | |
| 49 Returns: | |
| 50 A string containing the barcode | |
| 51 | |
| 52 >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] | |
| 53 >>> get_barcode(read) | |
| 54 'ATGC' | |
| 55 """ | |
| 56 return read[3].split('BC:')[1].split(';')[0] | |
| 57 | |
| 58 | |
| 59 def print_read_to_bed12(key, reads): | |
| 60 """ Merge the reads by blocks and print a single read in the BED12 format on stdout. | |
| 61 It assumes that the reads are on the same TSS and contains | |
| 62 barcode and fingerprint information in the read's name. | |
| 63 | |
| 64 Args: | |
| 65 key: A tuple that contain the chromosome, barcode and fingerprint information. | |
| 66 | |
| 67 reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint. | |
| 68 | |
| 69 >>> reads = [] | |
| 70 >>> reads.append(['chrX', '100', '200', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75']) | |
| 71 >>> reads.append(['chrX', '100', '300', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175']) | |
| 72 >>> print_read_to_bed12(('chrX', 'AAA', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE | |
| 73 chrX 100 300 BC:AAA;FP:0012 2 + 100 120 255,0,0 3 20,25,25 0,75,175 | |
| 74 """ | |
| 75 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads) | |
| 76 | |
| 77 #bed12 | |
| 78 first_read = sorted(reads, key = bed12.get_start)[0] | |
| 79 chrom, barcode, fingerprint = key | |
| 80 start = bed12.get_start(first_read) | |
| 81 end = start + block_starts[-1] + block_sizes[-1] | |
| 82 name = "BC:{0};FP:{1}".format(barcode, fingerprint) | |
| 83 score = len(reads) | |
| 84 | |
| 85 strand = bed12.get_strand(first_read) | |
| 86 | |
| 87 if strand == '+': | |
| 88 thick_start = start | |
| 89 thick_end = start + block_sizes[0] | |
| 90 else: | |
| 91 thick_start = end - block_sizes[-1] | |
| 92 thick_end = end | |
| 93 | |
| 94 color = "255,0,0" | |
| 95 block_count = len(block_sizes) | |
| 96 block_sizes = ','.join(map(str, block_sizes)) | |
| 97 block_starts = ','.join(map(str, block_starts)) | |
| 98 | |
| 99 output = [chrom, start, end, name, score, strand, thick_start, thick_end, | |
| 100 color, block_count, block_sizes, block_starts] | |
| 101 | |
| 102 output_str = map(str, output) | |
| 103 print '\t'.join(output_str) | |
| 104 | |
| 105 | |
| 106 def main(): | |
| 107 | |
| 108 #PARSER TODO: move this code somewhere else | |
| 109 parser = argparse.ArgumentParser() | |
| 110 group = parser.add_mutually_exclusive_group(required=True) | |
| 111 group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files") | |
| 112 group.add_argument("-f", "--file", help="a bed file") | |
| 113 parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \ | |
| 114 ignored otherwise.') | |
| 115 | |
| 116 args = parser.parse_args() | |
| 117 | |
| 118 if args.directory: | |
| 119 path, folder, files = os.walk(args.directory).next() | |
| 120 elif args.file: | |
| 121 path = '' | |
| 122 files = [args.file] | |
| 123 #ENDPARSER | |
| 124 | |
| 125 #create a temporary directory | |
| 126 tmp_dir = tempfile.mkdtemp() | |
| 127 | |
| 128 plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w') | |
| 129 minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w') | |
| 130 plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s') | |
| 131 | |
| 132 #creates two temporary bed files containing either reads on the plus or minus strand | |
| 133 for bed_file in files: | |
| 134 | |
| 135 with open(os.path.join(path, bed_file)) as bed_file: | |
| 136 reader = csv.reader(bed_file, delimiter='\t') | |
| 137 | |
| 138 for read in reader: | |
| 139 strand = bed12.get_strand(read) | |
| 140 if strand == '+': | |
| 141 plus_strand_tmp_file.write('\t'.join(read) + '\n') | |
| 142 elif strand == '-': | |
| 143 minus_strand_tmp_file.write('\t'.join(read) + '\n') | |
| 144 | |
| 145 | |
| 146 #close the files | |
| 147 plus_strand_tmp_file.close() | |
| 148 minus_strand_tmp_file.close() | |
| 149 | |
| 150 #call unix sort on the file containing reads on the plus strand by tss | |
| 151 with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile: | |
| 152 subprocess.call(["sort", '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile) | |
| 153 | |
| 154 #call unix sort on the file containing reads on the minus strand by tss | |
| 155 with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile: | |
| 156 subprocess.call(["sort", '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile) | |
| 157 | |
| 158 #concatenate the files sorted by tss | |
| 159 with open(plus_and_minus_sorted_path, "w") as outfile: | |
| 160 subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile) | |
| 161 | |
| 162 with open(plus_and_minus_sorted_path) as bedfile: | |
| 163 reader = csv.reader(bedfile, delimiter='\t') | |
| 164 reads = (line for line in reader) | |
| 165 | |
| 166 #for each reads on the same tss | |
| 167 for tss, reads in itertools.groupby(reads, bed12.get_tss): | |
| 168 d = defaultdict(list) | |
| 169 | |
| 170 #group the reads by chr, barcode and fingerprint | |
| 171 for read in reads: | |
| 172 key = (bed12.get_chrom(read), get_barcode(read), get_fingerprint(read)) | |
| 173 d[key].append(read) | |
| 174 | |
| 175 #merge and print the reads that have similar tss, barcode and fingerprint | |
| 176 for key, reads in d.iteritems(): | |
| 177 print_read_to_bed12(key, reads) | |
| 178 | |
| 179 shutil.rmtree(tmp_dir) | |
| 180 | |
| 181 | |
| 182 if __name__ == '__main__': | |
| 183 main() | |
| 184 | |
| 185 #TODO: combine this script with dedup_fingerprint |
