comparison small_rna_maps.py @ 1:615fa2171a34 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit c3d728b98db4987821feae40952d9797c97eaf5a"
author artbio
date Fri, 04 Oct 2019 04:33:08 -0400
parents 0a06985c0894
children 59d93aa7cc20
comparison
equal deleted inserted replaced
0:0a06985c0894 1:615fa2171a34
8 8
9 def Parser(): 9 def Parser():
10 the_parser = argparse.ArgumentParser() 10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--inputs', dest='inputs', required=True, 11 the_parser.add_argument('--inputs', dest='inputs', required=True,
12 nargs='+', help='list of input BAM files') 12 nargs='+', help='list of input BAM files')
13 the_parser.add_argument('--minsize', dest='minsize', type=int,
14 default=19, help='minimal size of reads')
15 the_parser.add_argument('--maxsize', dest='maxsize', type=int,
16 default=29, help='maximal size of reads')
17 the_parser.add_argument('--cluster', dest='cluster', type=int,
18 default=0, help='clustering distance')
13 the_parser.add_argument('--sample_names', dest='sample_names', 19 the_parser.add_argument('--sample_names', dest='sample_names',
14 required=True, nargs='+', 20 required=True, nargs='+',
15 help='list of sample names') 21 help='list of sample names')
22 the_parser.add_argument('--bed', dest='bed', required=False,
23 help='Name of bed output must be specified\
24 if --cluster option used')
25 the_parser.add_argument('--bed_skipsize', dest='bed_skipsize',
26 required=False, type=int, default=1,
27 help='Skip clusters of size equal or less than\
28 specified integer in the bed output. \
29 Default = 0, not skipping')
30 the_parser.add_argument('--bed_skipdensity', dest='bed_skipdensity',
31 required=False, type=float, default=0,
32 help='Skip clusters of density equal or less than\
33 specified float number in the bed output. \
34 Default = 0, not skipping')
35 the_parser.add_argument('--bed_skipcounts', dest='bed_skipcounts',
36 required=False, type=int, default=1,
37 help='Skip clusters of size equal or less than\
38 specified integer in the bed output. \
39 Default = 0, not skipping')
16 the_parser.add_argument('--outputs', nargs='+', action='store', 40 the_parser.add_argument('--outputs', nargs='+', action='store',
17 help='list of two output paths (only two)') 41 help='list of two output paths (only two)')
18 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store', 42 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store',
19 help='list of 2 plot methods (only two) among:\ 43 help='list of 2 plot methods (only two) among:\
20 Counts, Max, Mean, Median, Coverage and Size') 44 Counts, Max, Mean, Median, Coverage and Size')
45 the_parser.add_argument('--nostrand', action='store_true',
46 help='Consider reads regardless their polarity')
47
21 args = the_parser.parse_args() 48 args = the_parser.parse_args()
22 return args 49 return args
23 50
24 51
25 class Map: 52 class Map:
26 53
27 def __init__(self, bam_file, sample): 54 def __init__(self, bam_file, sample, minsize, maxsize, cluster, nostrand):
28 self.sample_name = sample 55 self.sample_name = sample
56 self.minsize = minsize
57 self.maxsize = maxsize
58 self.cluster = cluster
59 if not nostrand:
60 self.nostrand = False
61 else:
62 self.nostrand = True
29 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 63 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
30 self.chromosomes = dict(zip(self.bam_object.references, 64 self.chromosomes = dict(zip(self.bam_object.references,
31 self.bam_object.lengths)) 65 self.bam_object.lengths))
32 self.map_dict = self.create_map(self.bam_object) 66 self.map_dict = self.create_map(self.bam_object, self.nostrand)
33 67 if self.cluster:
34 def create_map(self, bam_object): 68 self.map_dict = self.tile_map(self.map_dict, self.cluster)
69
70 def create_map(self, bam_object, nostrand=False):
35 ''' 71 '''
36 Returns a map_dictionary {(chromosome,read_position,polarity): 72 Returns a map_dictionary {(chromosome,read_position,polarity):
37 [read_length, ...]} 73 [read_length, ...]}
38 ''' 74 '''
39 map_dictionary = defaultdict(list) 75 map_dictionary = defaultdict(list)
40 # get empty value for start and end of each chromosome
41 for chrom in self.chromosomes: 76 for chrom in self.chromosomes:
77 # get empty value for start and end of each chromosome
42 map_dictionary[(chrom, 1, 'F')] = [] 78 map_dictionary[(chrom, 1, 'F')] = []
43 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] 79 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
80 if not nostrand:
81 for read in bam_object.fetch(chrom):
82 positions = read.positions # a list of covered positions
83 if read.is_reverse:
84 map_dictionary[(chrom, positions[-1]+1, 'R')].append(
85 read.query_alignment_length)
86 else:
87 map_dictionary[(chrom, positions[0]+1, 'F')].append(
88 read.query_alignment_length)
89 else:
90 for read in bam_object.fetch(chrom):
91 positions = read.positions # a list of covered positions
92 map_dictionary[(chrom, positions[0]+1, 'F')].append(
93 read.query_alignment_length)
94 return map_dictionary
95
96 def grouper(self, iterable, clust_distance):
97 prev = None
98 group = []
99 for item in iterable:
100 if not prev or item - prev <= clust_distance:
101 group.append(item)
102 else:
103 yield group
104 group = [item]
105 prev = item
106 if group:
107 yield group
108
109 def tile_map(self, map_dic, clust_distance):
110 '''
111 takes a map_dictionary {(chromosome,read_position,polarity):
112 [read_length, ...]}
113 and returns a map_dictionary with structure:
114 {(chromosome,read_position,polarity):
115 [*counts*, [start_clust, end_clust]]}
116 '''
117 clustered_dic = defaultdict(list)
44 for chrom in self.chromosomes: 118 for chrom in self.chromosomes:
45 for read in bam_object.fetch(chrom): 119 F_chrom_coord = []
120 R_chrom_coord = []
121 for key in map_dic:
122 if key[0] == chrom and key[2] == 'F':
123 F_chrom_coord.append(key[1])
124 elif key[0] == chrom and key[2] == 'R':
125 R_chrom_coord.append(key[1])
126 F_chrom_coord = list(set(F_chrom_coord))
127 R_chrom_coord = list(set(R_chrom_coord))
128 F_chrom_coord.sort()
129 R_chrom_coord.sort()
130 F_clust_values = [i for i in self.grouper(F_chrom_coord,
131 clust_distance)]
132 F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values]
133 R_clust_values = [i for i in self.grouper(R_chrom_coord,
134 clust_distance)]
135 R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values]
136 # now 2 dictionnaries (F and R) with structure:
137 # {centered_coordinate: [coord1, coord2, coord3, ..]}
138 F_clust_dic = dict(zip(F_clust_keys, F_clust_values))
139 R_clust_dic = dict(zip(R_clust_keys, R_clust_values))
140 for centcoor in F_clust_dic:
141 accumulator = []
142 for coor in F_clust_dic[centcoor]:
143 accumulator.extend(map_dic[(chrom, coor, 'F')])
144 '''
145 compute the offset of the cluster due to
146 size of reads
147 '''
148 last = sorted(F_clust_dic[centcoor])[-1]
149 try:
150 margin = max(map_dic[(chrom, last, 'F')]) - 1
151 except ValueError:
152 margin = 0
153 clustered_dic[(chrom, centcoor, 'F')] = [len(accumulator), [
154 F_clust_dic[centcoor][0],
155 F_clust_dic[centcoor][-1] + margin]]
156 for centcoor in R_clust_dic:
157 accumulator = []
158 for coor in R_clust_dic[centcoor]:
159 accumulator.extend(map_dic[(chrom, coor, 'R')])
160 '''
161 compute the offset of the cluster due to
162 size of reads
163 '''
164 first = sorted(R_clust_dic[centcoor])[0]
165 try:
166 margin = max(map_dic[(chrom, first, 'R')]) - 1
167 except ValueError:
168 margin = 0
169 clustered_dic[(chrom, centcoor, 'R')] = [len(accumulator), [
170 R_clust_dic[centcoor][0] - margin,
171 R_clust_dic[centcoor][-1]]]
172 return clustered_dic
173
174 def compute_readcount(self, map_dictionary, out):
175 '''
176 takes a map_dictionary as input and writes
177 a readmap_dictionary {(chromosome,read_position,polarity):
178 number_of_reads}
179 in an open file handler out
180 '''
181 readmap_dictionary = dict()
182 for key in map_dictionary:
183 readmap_dictionary[key] = len(map_dictionary[key])
184 self.write_table(readmap_dictionary, out)
185
186 def compute_max(self, map_dictionary, out):
187 '''
188 takes a map_dictionary as input and writes
189 a max_dictionary {(chromosome,read_position,polarity):
190 max_of_number_of_read_at_any_position}
191 Not clear this function is still required
192 '''
193 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
194 max_dictionary = dict(merge_keylist)
195 for key in map_dictionary:
196 if len(map_dictionary[key]) > max_dictionary[key[0]]:
197 max_dictionary[key[0]] = len(map_dictionary[key])
198 self.write_table(max_dictionary, out)
199
200 def compute_mean(self, map_dictionary, out):
201 '''
202 takes a map_dictionary as input and returns
203 a mean_dictionary {(chromosome,read_position,polarity):
204 mean_value_of_reads}
205 '''
206 mean_dictionary = dict()
207 for key in map_dictionary:
208 if len(map_dictionary[key]) == 0:
209 mean_dictionary[key] = 0
210 else:
211 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
212 1)
213 self.write_table(mean_dictionary, out)
214
215 def compute_median(self, map_dictionary, out):
216 '''
217 takes a map_dictionary as input and returns
218 a mean_dictionary {(chromosome,read_position,polarity):
219 mean_value_of_reads}
220 '''
221 median_dictionary = dict()
222 for key in map_dictionary:
223 if len(map_dictionary[key]) == 0:
224 median_dictionary[key] = 0
225 else:
226 median_dictionary[key] = numpy.median(map_dictionary[key])
227 self.write_table(median_dictionary, out)
228
229 def compute_coverage(self, map_dictionary, out, quality=15):
230 '''
231 takes a map_dictionary as input and returns
232 a coverage_dictionary {(chromosome,read_position,polarity):
233 coverage}
234 '''
235 coverage_dictionary = dict()
236 for chrom in self.chromosomes:
237 coverage_dictionary[(chrom, 1, 'F')] = 0
238 coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0
239 for read in self.bam_object.fetch(chrom):
46 positions = read.positions # a list of covered positions 240 positions = read.positions # a list of covered positions
47 for pos in positions: 241 for pos in positions:
48 if not map_dictionary[(chrom, pos+1, 'F')]: 242 if not map_dictionary[(chrom, pos+1, 'F')]:
49 map_dictionary[(chrom, pos+1, 'F')] = [] 243 map_dictionary[(chrom, pos+1, 'F')] = []
50 if read.is_reverse:
51 map_dictionary[(chrom, positions[-1]+1,
52 'R')].append(read.query_alignment_length)
53 else:
54 map_dictionary[(chrom, positions[0]+1,
55 'F')].append(read.query_alignment_length)
56 return map_dictionary
57
58 def compute_readcount(self, map_dictionary, out):
59 '''
60 takes a map_dictionary as input and writes
61 a readmap_dictionary {(chromosome,read_position,polarity):
62 number_of_reads}
63 in an open file handler out
64 '''
65 readmap_dictionary = dict()
66 for key in map_dictionary:
67 readmap_dictionary[key] = len(map_dictionary[key])
68 self.write_table(readmap_dictionary, out)
69
70 def compute_max(self, map_dictionary, out):
71 '''
72 takes a map_dictionary as input and writes
73 a max_dictionary {(chromosome,read_position,polarity):
74 max_of_number_of_read_at_any_position}
75 Not clear this function is still required
76 '''
77 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
78 max_dictionary = dict(merge_keylist)
79 for key in map_dictionary:
80 if len(map_dictionary[key]) > max_dictionary[key[0]]:
81 max_dictionary[key[0]] = len(map_dictionary[key])
82 self.write_table(max_dictionary, out)
83
84 def compute_mean(self, map_dictionary, out):
85 '''
86 takes a map_dictionary as input and returns
87 a mean_dictionary {(chromosome,read_position,polarity):
88 mean_value_of_reads}
89 '''
90 mean_dictionary = dict()
91 for key in map_dictionary:
92 if len(map_dictionary[key]) == 0:
93 mean_dictionary[key] = 0
94 else:
95 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
96 1)
97 self.write_table(mean_dictionary, out)
98
99 def compute_median(self, map_dictionary, out):
100 '''
101 takes a map_dictionary as input and returns
102 a mean_dictionary {(chromosome,read_position,polarity):
103 mean_value_of_reads}
104 '''
105 median_dictionary = dict()
106 for key in map_dictionary:
107 if len(map_dictionary[key]) == 0:
108 median_dictionary[key] = 0
109 else:
110 median_dictionary[key] = numpy.median(map_dictionary[key])
111 self.write_table(median_dictionary, out)
112
113 def compute_coverage(self, map_dictionary, out, quality=10):
114 '''
115 takes a map_dictionary as input and returns
116 a coverage_dictionary {(chromosome,read_position,polarity):
117 coverage}
118 '''
119 coverage_dictionary = dict()
120 for chrom in self.chromosomes:
121 coverage_dictionary[(chrom, 1, 'F')] = 0
122 coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0
123 for key in map_dictionary: 244 for key in map_dictionary:
124 coverage = self.bam_object.count_coverage( 245 coverage = self.bam_object.count_coverage(
125 reference=key[0], 246 contig=key[0],
126 start=key[1]-1, 247 start=key[1]-1,
127 end=key[1], 248 stop=key[1],
128 quality_threshold=quality) 249 quality_threshold=quality)
129 """ Add the 4 coverage values """ 250 """ Add the 4 coverage values """
130 coverage = [sum(x) for x in zip(*coverage)] 251 coverage = [sum(x) for x in zip(*coverage)]
131 coverage_dictionary[key] = coverage[0] 252 coverage_dictionary[key] = coverage[0]
132 self.write_table(coverage_dictionary, out) 253 self.write_table(coverage_dictionary, out)
147 size_dictionary[key[0]][key[2]][size] += 1 268 size_dictionary[key[0]][key[2]][size] += 1
148 self.write_size_table(size_dictionary, out) 269 self.write_size_table(size_dictionary, out)
149 270
150 def write_table(self, mapdict, out): 271 def write_table(self, mapdict, out):
151 ''' 272 '''
152 Generic writer 273 Writer of a tabular file
153 Dataset, Chromosome, Chrom_length, Coordinate, Polarity, 274 Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
154 <some mapped value> 275 <some mapped value>
155 out is an *open* file handler 276 out is an *open* file handler
156 ''' 277 '''
157 for key in sorted(mapdict): 278 for key in sorted(mapdict):
160 line = [str(i) for i in line] 281 line = [str(i) for i in line]
161 out.write('\t'.join(line) + '\n') 282 out.write('\t'.join(line) + '\n')
162 283
163 def write_size_table(self, sizedic, out): 284 def write_size_table(self, sizedic, out):
164 ''' 285 '''
165 Generic writer of summary values 286 Writer of a tabular file
166 Dataset, Chromosome, Chrom_length, <some category>, <some value> 287 Dataset, Chromosome, Chrom_length, <category (size)>, <some value>
288 from a dictionary of sizes: {chrom: {polarity: {size: nbre of reads}}}
167 out is an *open* file handler 289 out is an *open* file handler
168 ''' 290 '''
169 for chrom in sorted(sizedic): 291 for chrom in sorted(sizedic):
170 sizes = sizedic[chrom]['F'].keys() 292 sizes = range(self.minsize, self.maxsize+1)
171 sizes.extend(sizedic[chrom]['R'].keys()) 293 strandness = defaultdict(int)
294 sizeness = defaultdict(int)
295 for polarity in sizedic[chrom]:
296 for size in sizes:
297 strandness[polarity] += sizedic[chrom][polarity][size]
298 sizeness[size] += sizedic[chrom][polarity][size]
299 Strandbias = strandness['F'] + strandness['R']
300 if Strandbias:
301 Strandbias = round(strandness['F'] / float(Strandbias), 2)
302 else:
303 Strandbias = 2
304 Mean = numpy.mean(sizeness.values())
305 StDev = numpy.std(sizeness.values())
306 for size in sizeness:
307 if StDev:
308 sizeness[size] = (sizeness[size] - Mean) / StDev
309 else:
310 sizeness[size] = 0
172 for polarity in sorted(sizedic[chrom]): 311 for polarity in sorted(sizedic[chrom]):
173 for size in range(min(sizes), max(sizes)+1): 312 for size in sizes:
174 try: 313 try:
175 line = [self.sample_name, chrom, polarity, size, 314 line = [self.sample_name, chrom, polarity, size,
176 sizedic[chrom][polarity][size]] 315 sizedic[chrom][polarity][size],
316 Strandbias, round(sizeness[size], 3)]
177 except KeyError: 317 except KeyError:
178 line = [self.sample_name, chrom, polarity, size, 0] 318 line = [self.sample_name, chrom, polarity, size, 0,
319 Strandbias, round(sizeness[size], 3)]
179 line = [str(i) for i in line] 320 line = [str(i) for i in line]
180 out.write('\t'.join(line) + '\n') 321 out.write('\t'.join(line) + '\n')
181 322
182 323 def write_cluster_table(self, clustered_dic, out, bedpath):
183 def main(inputs, samples, methods, outputs): 324 '''
325 Writer of a tabular file
326 Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
327 <some mapped value>
328 out is an *open* file handler
329 bed is an a file handler internal to the function
330 '''
331 def filterCluster(size, count, density):
332 if size < args.bed_skipsize:
333 return False
334 if count < args.bed_skipcounts:
335 return False
336 if density <= args.bed_skipdensity:
337 return False
338 return True
339 bed = open(bedpath, 'w')
340 clusterid = 0
341 for key in sorted(clustered_dic):
342 start = clustered_dic[key][1][0]
343 end = clustered_dic[key][1][1]
344 size = end - start + 1
345 read_count = clustered_dic[key][0]
346 if self.nostrand:
347 polarity = '.'
348 elif key[2] == 'F':
349 polarity = '+'
350 else:
351 polarity = '-'
352 density = float(read_count) / size
353 line = [self.sample_name, key[0], self.chromosomes[key[0]],
354 key[1], key[2], read_count,
355 str(start) + "-" + str(end), str(size), str(density)]
356 line = [str(i) for i in line]
357 out.write('\t'.join(line) + '\n')
358 if filterCluster(size, read_count, density):
359 clusterid += 1
360 name = 'cluster_' + str(clusterid)
361 bedline = [key[0], str(start-1), str(end), name,
362 str(read_count), polarity, str(density)]
363 bed.write('\t'.join(bedline) + '\n')
364 print("number of reported clusters:", clusterid)
365 bed.close()
366
367
368 def main(inputs, samples, methods, outputs, minsize, maxsize, cluster,
369 nostrand, bedfile=None, bed_skipsize=0):
184 for method, output in zip(methods, outputs): 370 for method, output in zip(methods, outputs):
185 F = open(output, 'w') 371 out = open(output, 'w')
186 if method == 'Size': 372 if method == 'Size':
187 header = ["Dataset", "Chromosome", "Polarity", method, "Count"] 373 header = ["# Dataset", "Chromosome", "Polarity", method, "Counts",
374 "Strandness", "z-score"]
375 elif cluster:
376 header = ["# Dataset", "Chromosome", "Chrom_length", "Coordinate",
377 "Polarity", method, "Start-End", "Cluster Size",
378 "density"]
188 else: 379 else:
189 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", 380 header = ["# Dataset", "Chromosome", "Chrom_length", "Coordinate",
190 "Polarity", method] 381 "Polarity", method]
191 F.write('\t'.join(header) + '\n') 382 out.write('\t'.join(header) + '\n')
192 for input, sample in zip(inputs, samples): 383 for input, sample in zip(inputs, samples):
193 mapobj = Map(input, sample) 384 mapobj = Map(input, sample, minsize, maxsize, cluster, nostrand)
194 token = {"Counts": mapobj.compute_readcount, 385 token = {"Counts": mapobj.compute_readcount,
195 "Max": mapobj.compute_max, 386 "Max": mapobj.compute_max,
196 "Mean": mapobj.compute_mean, 387 "Mean": mapobj.compute_mean,
197 "Median": mapobj.compute_median, 388 "Median": mapobj.compute_median,
198 "Coverage": mapobj.compute_coverage, 389 "Coverage": mapobj.compute_coverage,
199 "Size": mapobj.compute_size} 390 "Size": mapobj.compute_size,
200 token[method](mapobj.map_dict, F) 391 "cluster": mapobj.write_cluster_table}
201 F.close() 392 if cluster:
393 token["cluster"](mapobj.map_dict, out, bedfile)
394 else:
395 token[method](mapobj.map_dict, out)
396 out.close()
202 397
203 398
204 if __name__ == "__main__": 399 if __name__ == "__main__":
205 args = Parser() 400 args = Parser()
206 # if identical sample names 401 # if identical sample names
207 if len(set(args.sample_names)) != len(args.sample_names): 402 if len(set(args.sample_names)) != len(args.sample_names):
208 args.sample_names = [name + '_' + str(i) for 403 args.sample_names = [name + '_' + str(i) for
209 i, name in enumerate(args.sample_names)] 404 i, name in enumerate(args.sample_names)]
210 main(args.inputs, args.sample_names, args.plot_methods, args.outputs) 405 main(args.inputs, args.sample_names, args.plot_methods, args.outputs,
406 args.minsize, args.maxsize, args.cluster, args.nostrand, args.bed)