comparison signature.py @ 0:3e0ea204d09e draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 4e84386b619a7941f23d175d7fc86aba7990ac36"
author artbio
date Tue, 07 Jan 2020 11:59:34 +0000
parents
children ca2b04cdbf4d
comparison
equal deleted inserted replaced
-1:000000000000 0:3e0ea204d09e
1 import argparse
2 from collections import defaultdict
3
4 import numpy
5
6 import pysam
7
8
9 def Parser():
10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument(
12 '--input', action="store", type=str, help="bam alignment file")
13 the_parser.add_argument(
14 '--minquery', type=int,
15 help="Minimum readsize of query reads (nt) - must be an integer")
16 the_parser.add_argument(
17 '--maxquery', type=int,
18 help="Maximum readsize of query reads (nt) - must be an integer")
19 the_parser.add_argument(
20 '--mintarget', type=int,
21 help="Minimum readsize of target reads (nt) - must be an integer")
22 the_parser.add_argument(
23 '--maxtarget', type=int,
24 help="Maximum readsize of target reads (nt) - must be an integer")
25 the_parser.add_argument(
26 '--minscope', type=int,
27 help="Minimum overlap analyzed (nt) - must be an integer")
28 the_parser.add_argument(
29 '--maxscope', type=int,
30 help="Maximum overlap analyzed (nt) - must be an integer")
31 the_parser.add_argument(
32 '--output_h', action="store", type=str,
33 help="h-signature dataframe")
34 the_parser.add_argument(
35 '--output_z', action="store", type=str,
36 help="z-signature dataframe")
37 args = the_parser.parse_args()
38 return args
39
40
41 class Map:
42
43 def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23,
44 maxtarget=29, minscope=1, maxscope=19, output_h='',
45 output_z=''):
46 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
47 self.query_range = range(minquery, maxquery + 1)
48 self.target_range = range(mintarget, maxtarget + 1)
49 self.scope = range(minscope, maxscope + 1)
50 self.H = open(output_h, 'w')
51 self.Z = open(output_z, 'w')
52 self.chromosomes = dict(zip(self.bam_object.references,
53 self.bam_object.lengths))
54 self.map_dict = self.create_map(self.bam_object)
55 self.query_positions = self.compute_query_positions()
56 self.Z.write(self.compute_signature_pairs())
57 self.H.write(self.compute_signature_h())
58 self.H.close()
59 self.Z.close()
60
61 def create_map(self, bam_object):
62 '''
63 Returns a map_dictionary {(chromosome,read_position,polarity):
64 [read_length, ...]}
65 '''
66 map_dictionary = defaultdict(list)
67 # get empty value for start and end of each chromosome
68 for chrom in self.chromosomes:
69 map_dictionary[(chrom, 1, 'F')] = []
70 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
71 for chrom in self.chromosomes:
72 for read in bam_object.fetch(chrom):
73 if read.is_reverse:
74 map_dictionary[(chrom, read.reference_end,
75 'R')].append(read.query_alignment_length)
76 else:
77 map_dictionary[(chrom, read.reference_start+1,
78 'F')].append(read.query_alignment_length)
79 return map_dictionary
80
81 def compute_query_positions(self):
82 ''' this method does not filter on read size, just forward reads
83 that overlap reverse reads in the overlap range'''
84 all_query_positions = defaultdict(list)
85 for genomicKey in self.map_dict.keys():
86 chrom, coord, pol = genomicKey
87 for i in self.scope:
88 if pol == 'F' and len(self.map_dict[chrom,
89 coord+i-1,
90 'R']) > 0:
91 all_query_positions[chrom].append(coord)
92 break
93 for chrom in all_query_positions:
94 all_query_positions[chrom] = sorted(
95 list(set(all_query_positions[chrom])))
96 return all_query_positions
97
98 def countpairs(self, uppers, lowers):
99 query_range = self.query_range
100 target_range = self.target_range
101 uppers = [size for size in uppers if size in query_range or size in
102 target_range]
103 lowers = [size for size in lowers if size in query_range or size in
104 target_range]
105 paired = []
106 for upread in uppers:
107 for downread in lowers:
108 if (upread in query_range and downread in target_range) or (
109 upread in target_range and downread in query_range):
110 paired.append(upread)
111 lowers.remove(downread)
112 break
113 return len(paired)
114
115 def compute_signature_pairs(self):
116 frequency_table = defaultdict(dict)
117 scope = self.scope
118 for chrom in self.chromosomes:
119 for overlap in scope:
120 frequency_table[chrom][overlap] = 0
121 for chrom in self.query_positions:
122 for coord in self.query_positions[chrom]:
123 for overlap in scope:
124 uppers = self.map_dict[chrom, coord, 'F']
125 lowers = self.map_dict[chrom, coord+overlap-1, 'R']
126 frequency_table[chrom][overlap] += self.countpairs(uppers,
127 lowers)
128 # compute overlaps for all chromosomes merged
129 for overlap in scope:
130 accumulator = []
131 for chrom in frequency_table:
132 if chrom != 'all_chromosomes':
133 accumulator.append(frequency_table[chrom][overlap])
134 frequency_table['all_chromosomes'][overlap] = sum(accumulator)
135 return self.stringify_table(frequency_table)
136
137 def signature_tables(self):
138 query_range = self.query_range
139 target_range = self.target_range
140 Query_table = defaultdict(dict)
141 Target_table = defaultdict(dict)
142 for key in self.map_dict:
143 for size in self.map_dict[key]:
144 if size in query_range or size in target_range:
145 if key[2] == 'F':
146 coordinate = key[1]
147 else:
148 coordinate = -key[1]
149 if size in query_range:
150 Query_table[key[0]][coordinate] = Query_table[key[0]].get(
151 coordinate, 0) + 1
152 if size in target_range:
153 Target_table[key[0]][coordinate] = \
154 Target_table[key[0]].get(coordinate, 0) + 1
155 return Query_table, Target_table
156
157 def compute_signature_h(self):
158 scope = self.scope
159 Query_table, Target_table = self.signature_tables()
160 frequency_table = defaultdict(dict)
161 for chrom in self.chromosomes:
162 for overlap in scope:
163 frequency_table[chrom][overlap] = 0
164 for chrom in Query_table:
165 Total_Query_Numb = 0
166 for coord in Query_table[chrom]:
167 Total_Query_Numb += Query_table[chrom][coord]
168 for coord in Query_table[chrom]:
169 local_table = dict([(overlap, 0) for overlap in scope])
170 number_of_targets = 0
171 for overlap in scope:
172 local_table[overlap] += Query_table[chrom][coord] * \
173 Target_table[chrom].get(-coord - overlap + 1, 0)
174 number_of_targets += Target_table[chrom].get(
175 -coord - overlap + 1, 0)
176 for overlap in scope:
177 try:
178 frequency_table[chrom][overlap] += \
179 local_table[overlap] / number_of_targets \
180 / float(Total_Query_Numb)
181 except ZeroDivisionError:
182 continue
183 # compute overlap probabilities for all chromosomes merged
184 general_frequency_table = dict([(overlap, 0) for overlap in scope])
185 total_aligned_reads = 0
186 for chrom in frequency_table:
187 for overlap in frequency_table[chrom]:
188 total_aligned_reads += self.bam_object.count(chrom)
189 for chrom in frequency_table:
190 for overlap in frequency_table[chrom]:
191 try:
192 general_frequency_table[overlap] += \
193 frequency_table[chrom][overlap] / total_aligned_reads \
194 * self.bam_object.count(chrom)
195 except ZeroDivisionError:
196 continue
197 for overlap in general_frequency_table:
198 frequency_table['all_chromosomes'][overlap] = \
199 general_frequency_table[overlap]
200 return self.stringify_table(frequency_table)
201
202 def stringify_table(self, frequency_table):
203 '''
204 method both to compute z-score and to return a writable string
205 '''
206 tablestring = []
207 for chrom in sorted(frequency_table):
208 accumulator = []
209 for overlap in frequency_table[chrom]:
210 accumulator.append(frequency_table[chrom][overlap])
211 z_mean = numpy.mean(accumulator)
212 z_std = numpy.std(accumulator)
213 if z_std == 0:
214 for overlap in sorted(frequency_table[chrom]):
215 tablestring.append('%s\t%s\t%s\t%s\n' % (
216 chrom, str(overlap),
217 str(frequency_table[chrom][overlap]), str(0)))
218 else:
219 for overlap in sorted(frequency_table[chrom]):
220 tablestring.append('%s\t%s\t%s\t%s\n' % (
221 chrom, str(overlap),
222 str(frequency_table[chrom][overlap]),
223 str((frequency_table[chrom][overlap] - z_mean)/z_std)))
224 return ''.join(tablestring)
225
226
227 if __name__ == "__main__":
228 args = Parser()
229 mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget,
230 args.maxtarget, args.minscope, args.maxscope, args.output_h,
231 args.output_z)