| 0 | 1 """ | 
|  | 2 .. module:: fingerprint | 
|  | 3    :platform: Unix | 
|  | 4    :synopsis: Use UMI to count transcripts | 
|  | 5    :version: 1.0 | 
|  | 6 | 
|  | 7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> | 
|  | 8 | 
|  | 9 .. source: https://github.com/mmendez12/umicount | 
|  | 10 | 
|  | 11 """ | 
|  | 12 | 
|  | 13 import csv | 
|  | 14 import itertools | 
|  | 15 import subprocess | 
|  | 16 import argparse | 
|  | 17 import tempfile | 
|  | 18 import os | 
|  | 19 import shutil | 
|  | 20 from collections import defaultdict | 
|  | 21 | 
|  | 22 import bed12 | 
|  | 23 | 
|  | 24 | 
|  | 25 def get_fingerprint(read): | 
|  | 26     """Get fingerprint id from the read's name. It assumes that the read's name | 
|  | 27     contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id. | 
|  | 28 | 
|  | 29     Args: | 
|  | 30         read: A list of twelve elements where each element refers to a field in the BED format. | 
|  | 31 | 
|  | 32     Returns: | 
|  | 33         A string containing the fingerprint id | 
|  | 34 | 
|  | 35     >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] | 
|  | 36     >>> get_fingerprint(read) | 
|  | 37     '0012' | 
|  | 38     """ | 
|  | 39     return read[3].split('FP:')[1].split(';')[0] | 
|  | 40 | 
|  | 41 | 
|  | 42 def get_barcode(read): | 
|  | 43     """Get barcode from the read's name. It assumes that the read's name | 
|  | 44     contains the following pattern *BC:XXX;* where *XXX* is the actual barcode. | 
|  | 45 | 
|  | 46     Args: | 
|  | 47         read: A list of twelve elements where each element refers to a field in the BED format. | 
|  | 48 | 
|  | 49     Returns: | 
|  | 50         A string containing the barcode | 
|  | 51 | 
|  | 52     >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] | 
|  | 53     >>> get_barcode(read) | 
|  | 54     'ATGC' | 
|  | 55     """ | 
|  | 56     return read[3].split('BC:')[1].split(';')[0] | 
|  | 57 | 
|  | 58 | 
|  | 59 def print_read_to_bed12(key, reads): | 
|  | 60     """ Merge the reads by blocks and print a single read in the BED12 format on stdout. | 
|  | 61     It assumes that the reads are on the same TSS and contains | 
|  | 62     barcode and fingerprint information in the read's name. | 
|  | 63 | 
|  | 64     Args: | 
|  | 65         key: A tuple that contain the chromosome, barcode and fingerprint information. | 
|  | 66 | 
|  | 67         reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint. | 
|  | 68 | 
|  | 69     >>> reads = [] | 
|  | 70     >>> reads.append(['chrX', '100', '200', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75']) | 
|  | 71     >>> reads.append(['chrX', '100', '300', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175']) | 
|  | 72     >>> print_read_to_bed12(('chrX', 'AAA', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE | 
|  | 73     chrX    100	300	BC:AAA;FP:0012	2	+	100	120	255,0,0	3	20,25,25	0,75,175 | 
|  | 74     """ | 
|  | 75     block_sizes, block_starts = bed12.merge_overlapping_blocks(reads) | 
|  | 76 | 
|  | 77     #bed12 | 
|  | 78     first_read = sorted(reads, key = bed12.get_start)[0] | 
|  | 79     chrom, barcode, fingerprint = key | 
|  | 80     start = bed12.get_start(first_read) | 
|  | 81     end = start + block_starts[-1] + block_sizes[-1] | 
|  | 82     name = "BC:{0};FP:{1}".format(barcode, fingerprint) | 
|  | 83     score = len(reads) | 
|  | 84 | 
|  | 85     strand = bed12.get_strand(first_read) | 
|  | 86 | 
|  | 87     if strand == '+': | 
|  | 88         thick_start = start | 
|  | 89         thick_end = start + block_sizes[0] | 
|  | 90     else: | 
|  | 91         thick_start = end - block_sizes[-1] | 
|  | 92         thick_end = end | 
|  | 93 | 
|  | 94     color = "255,0,0" | 
|  | 95     block_count = len(block_sizes) | 
|  | 96     block_sizes = ','.join(map(str, block_sizes)) | 
|  | 97     block_starts = ','.join(map(str, block_starts)) | 
|  | 98 | 
|  | 99     output = [chrom, start, end, name, score, strand, thick_start, thick_end, | 
|  | 100               color, block_count, block_sizes, block_starts] | 
|  | 101 | 
|  | 102     output_str = map(str, output) | 
|  | 103     print '\t'.join(output_str) | 
|  | 104 | 
|  | 105 | 
|  | 106 def main(): | 
|  | 107 | 
|  | 108     #PARSER TODO: move this code somewhere else | 
|  | 109     parser = argparse.ArgumentParser() | 
|  | 110     group = parser.add_mutually_exclusive_group(required=True) | 
|  | 111     group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files") | 
|  | 112     group.add_argument("-f", "--file", help="a bed file") | 
|  | 113     parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \ | 
|  | 114                                     ignored otherwise.') | 
|  | 115 | 
|  | 116     args = parser.parse_args() | 
|  | 117 | 
|  | 118     if args.directory: | 
|  | 119         path, folder, files = os.walk(args.directory).next() | 
|  | 120     elif args.file: | 
|  | 121         path = '' | 
|  | 122         files = [args.file] | 
|  | 123     #ENDPARSER | 
|  | 124 | 
|  | 125     #create a temporary directory | 
|  | 126     tmp_dir = tempfile.mkdtemp() | 
|  | 127 | 
|  | 128     plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w') | 
|  | 129     minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w') | 
|  | 130     plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s') | 
|  | 131 | 
|  | 132     #creates two temporary bed files containing either reads on the plus or minus strand | 
|  | 133     for bed_file in files: | 
|  | 134 | 
|  | 135         with open(os.path.join(path, bed_file)) as bed_file: | 
|  | 136             reader = csv.reader(bed_file, delimiter='\t') | 
|  | 137 | 
|  | 138             for read in reader: | 
|  | 139                 strand = bed12.get_strand(read) | 
|  | 140                 if strand == '+': | 
|  | 141                     plus_strand_tmp_file.write('\t'.join(read) + '\n') | 
|  | 142                 elif strand == '-': | 
|  | 143                     minus_strand_tmp_file.write('\t'.join(read) + '\n') | 
|  | 144 | 
|  | 145 | 
|  | 146     #close the files | 
|  | 147     plus_strand_tmp_file.close() | 
|  | 148     minus_strand_tmp_file.close() | 
|  | 149 | 
|  | 150     #call unix sort on the file containing reads on the plus strand by tss | 
|  | 151     with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile: | 
|  | 152         subprocess.call(["sort", '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile) | 
|  | 153 | 
|  | 154     #call unix sort on the file containing reads on the minus strand by tss | 
|  | 155     with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile: | 
|  | 156         subprocess.call(["sort", '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile) | 
|  | 157 | 
|  | 158     #concatenate the files sorted by tss | 
|  | 159     with open(plus_and_minus_sorted_path, "w") as outfile: | 
|  | 160         subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile) | 
|  | 161 | 
|  | 162     with open(plus_and_minus_sorted_path) as bedfile: | 
|  | 163         reader = csv.reader(bedfile, delimiter='\t') | 
|  | 164         reads = (line for line in reader) | 
|  | 165 | 
|  | 166         #for each reads on the same tss | 
|  | 167         for tss, reads in itertools.groupby(reads, bed12.get_tss): | 
|  | 168             d = defaultdict(list) | 
|  | 169 | 
|  | 170             #group the reads by chr, barcode and fingerprint | 
|  | 171             for read in reads: | 
|  | 172                 key = (bed12.get_chrom(read), get_barcode(read), get_fingerprint(read)) | 
|  | 173                 d[key].append(read) | 
|  | 174 | 
|  | 175             #merge and print the reads that have similar tss, barcode and fingerprint | 
|  | 176             for key, reads in d.iteritems(): | 
|  | 177                 print_read_to_bed12(key, reads) | 
|  | 178 | 
|  | 179     shutil.rmtree(tmp_dir) | 
|  | 180 | 
|  | 181 | 
|  | 182 if __name__ == '__main__': | 
|  | 183     main() | 
|  | 184 | 
|  | 185 #TODO: combine this script with dedup_fingerprint |