Mercurial > repos > artbio > small_rna_maps
comparison small_rna_maps.py @ 0:0a06985c0894 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 7b2ceb05489c27ddb769c38fdec56274108a6fa1
author | artbio |
---|---|
date | Tue, 22 Aug 2017 12:06:58 -0400 |
parents | |
children | 615fa2171a34 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0a06985c0894 |
---|---|
1 import argparse | |
2 from collections import defaultdict | |
3 | |
4 import numpy | |
5 | |
6 import pysam | |
7 | |
8 | |
9 def Parser(): | |
10 the_parser = argparse.ArgumentParser() | |
11 the_parser.add_argument('--inputs', dest='inputs', required=True, | |
12 nargs='+', help='list of input BAM files') | |
13 the_parser.add_argument('--sample_names', dest='sample_names', | |
14 required=True, nargs='+', | |
15 help='list of sample names') | |
16 the_parser.add_argument('--outputs', nargs='+', action='store', | |
17 help='list of two output paths (only two)') | |
18 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store', | |
19 help='list of 2 plot methods (only two) among:\ | |
20 Counts, Max, Mean, Median, Coverage and Size') | |
21 args = the_parser.parse_args() | |
22 return args | |
23 | |
24 | |
25 class Map: | |
26 | |
27 def __init__(self, bam_file, sample): | |
28 self.sample_name = sample | |
29 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | |
30 self.chromosomes = dict(zip(self.bam_object.references, | |
31 self.bam_object.lengths)) | |
32 self.map_dict = self.create_map(self.bam_object) | |
33 | |
34 def create_map(self, bam_object): | |
35 ''' | |
36 Returns a map_dictionary {(chromosome,read_position,polarity): | |
37 [read_length, ...]} | |
38 ''' | |
39 map_dictionary = defaultdict(list) | |
40 # get empty value for start and end of each chromosome | |
41 for chrom in self.chromosomes: | |
42 map_dictionary[(chrom, 1, 'F')] = [] | |
43 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | |
44 for chrom in self.chromosomes: | |
45 for read in bam_object.fetch(chrom): | |
46 positions = read.positions # a list of covered positions | |
47 for pos in positions: | |
48 if not map_dictionary[(chrom, pos+1, 'F')]: | |
49 map_dictionary[(chrom, pos+1, 'F')] = [] | |
50 if read.is_reverse: | |
51 map_dictionary[(chrom, positions[-1]+1, | |
52 'R')].append(read.query_alignment_length) | |
53 else: | |
54 map_dictionary[(chrom, positions[0]+1, | |
55 'F')].append(read.query_alignment_length) | |
56 return map_dictionary | |
57 | |
58 def compute_readcount(self, map_dictionary, out): | |
59 ''' | |
60 takes a map_dictionary as input and writes | |
61 a readmap_dictionary {(chromosome,read_position,polarity): | |
62 number_of_reads} | |
63 in an open file handler out | |
64 ''' | |
65 readmap_dictionary = dict() | |
66 for key in map_dictionary: | |
67 readmap_dictionary[key] = len(map_dictionary[key]) | |
68 self.write_table(readmap_dictionary, out) | |
69 | |
70 def compute_max(self, map_dictionary, out): | |
71 ''' | |
72 takes a map_dictionary as input and writes | |
73 a max_dictionary {(chromosome,read_position,polarity): | |
74 max_of_number_of_read_at_any_position} | |
75 Not clear this function is still required | |
76 ''' | |
77 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()] | |
78 max_dictionary = dict(merge_keylist) | |
79 for key in map_dictionary: | |
80 if len(map_dictionary[key]) > max_dictionary[key[0]]: | |
81 max_dictionary[key[0]] = len(map_dictionary[key]) | |
82 self.write_table(max_dictionary, out) | |
83 | |
84 def compute_mean(self, map_dictionary, out): | |
85 ''' | |
86 takes a map_dictionary as input and returns | |
87 a mean_dictionary {(chromosome,read_position,polarity): | |
88 mean_value_of_reads} | |
89 ''' | |
90 mean_dictionary = dict() | |
91 for key in map_dictionary: | |
92 if len(map_dictionary[key]) == 0: | |
93 mean_dictionary[key] = 0 | |
94 else: | |
95 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]), | |
96 1) | |
97 self.write_table(mean_dictionary, out) | |
98 | |
99 def compute_median(self, map_dictionary, out): | |
100 ''' | |
101 takes a map_dictionary as input and returns | |
102 a mean_dictionary {(chromosome,read_position,polarity): | |
103 mean_value_of_reads} | |
104 ''' | |
105 median_dictionary = dict() | |
106 for key in map_dictionary: | |
107 if len(map_dictionary[key]) == 0: | |
108 median_dictionary[key] = 0 | |
109 else: | |
110 median_dictionary[key] = numpy.median(map_dictionary[key]) | |
111 self.write_table(median_dictionary, out) | |
112 | |
113 def compute_coverage(self, map_dictionary, out, quality=10): | |
114 ''' | |
115 takes a map_dictionary as input and returns | |
116 a coverage_dictionary {(chromosome,read_position,polarity): | |
117 coverage} | |
118 ''' | |
119 coverage_dictionary = dict() | |
120 for chrom in self.chromosomes: | |
121 coverage_dictionary[(chrom, 1, 'F')] = 0 | |
122 coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0 | |
123 for key in map_dictionary: | |
124 coverage = self.bam_object.count_coverage( | |
125 reference=key[0], | |
126 start=key[1]-1, | |
127 end=key[1], | |
128 quality_threshold=quality) | |
129 """ Add the 4 coverage values """ | |
130 coverage = [sum(x) for x in zip(*coverage)] | |
131 coverage_dictionary[key] = coverage[0] | |
132 self.write_table(coverage_dictionary, out) | |
133 | |
134 def compute_size(self, map_dictionary, out): | |
135 ''' | |
136 Takes a map_dictionary and returns a dictionary of sizes: | |
137 {chrom: {polarity: {size: nbre of reads}}} | |
138 ''' | |
139 size_dictionary = defaultdict(lambda: defaultdict( | |
140 lambda: defaultdict(int))) | |
141 # to track empty chromosomes | |
142 for chrom in self.chromosomes: | |
143 if self.bam_object.count(chrom) == 0: | |
144 size_dictionary[chrom]['F'][10] = 0 | |
145 for key in map_dictionary: | |
146 for size in map_dictionary[key]: | |
147 size_dictionary[key[0]][key[2]][size] += 1 | |
148 self.write_size_table(size_dictionary, out) | |
149 | |
150 def write_table(self, mapdict, out): | |
151 ''' | |
152 Generic writer | |
153 Dataset, Chromosome, Chrom_length, Coordinate, Polarity, | |
154 <some mapped value> | |
155 out is an *open* file handler | |
156 ''' | |
157 for key in sorted(mapdict): | |
158 line = [self.sample_name, key[0], self.chromosomes[key[0]], | |
159 key[1], key[2], mapdict[key]] | |
160 line = [str(i) for i in line] | |
161 out.write('\t'.join(line) + '\n') | |
162 | |
163 def write_size_table(self, sizedic, out): | |
164 ''' | |
165 Generic writer of summary values | |
166 Dataset, Chromosome, Chrom_length, <some category>, <some value> | |
167 out is an *open* file handler | |
168 ''' | |
169 for chrom in sorted(sizedic): | |
170 sizes = sizedic[chrom]['F'].keys() | |
171 sizes.extend(sizedic[chrom]['R'].keys()) | |
172 for polarity in sorted(sizedic[chrom]): | |
173 for size in range(min(sizes), max(sizes)+1): | |
174 try: | |
175 line = [self.sample_name, chrom, polarity, size, | |
176 sizedic[chrom][polarity][size]] | |
177 except KeyError: | |
178 line = [self.sample_name, chrom, polarity, size, 0] | |
179 line = [str(i) for i in line] | |
180 out.write('\t'.join(line) + '\n') | |
181 | |
182 | |
183 def main(inputs, samples, methods, outputs): | |
184 for method, output in zip(methods, outputs): | |
185 F = open(output, 'w') | |
186 if method == 'Size': | |
187 header = ["Dataset", "Chromosome", "Polarity", method, "Count"] | |
188 else: | |
189 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", | |
190 "Polarity", method] | |
191 F.write('\t'.join(header) + '\n') | |
192 for input, sample in zip(inputs, samples): | |
193 mapobj = Map(input, sample) | |
194 token = {"Counts": mapobj.compute_readcount, | |
195 "Max": mapobj.compute_max, | |
196 "Mean": mapobj.compute_mean, | |
197 "Median": mapobj.compute_median, | |
198 "Coverage": mapobj.compute_coverage, | |
199 "Size": mapobj.compute_size} | |
200 token[method](mapobj.map_dict, F) | |
201 F.close() | |
202 | |
203 | |
204 if __name__ == "__main__": | |
205 args = Parser() | |
206 # if identical sample names | |
207 if len(set(args.sample_names)) != len(args.sample_names): | |
208 args.sample_names = [name + '_' + str(i) for | |
209 i, name in enumerate(args.sample_names)] | |
210 main(args.inputs, args.sample_names, args.plot_methods, args.outputs) |