Mercurial > repos > artbio > small_rna_signatures
comparison signature.py @ 0:3e0ea204d09e draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 4e84386b619a7941f23d175d7fc86aba7990ac36"
| author | artbio |
|---|---|
| date | Tue, 07 Jan 2020 11:59:34 +0000 |
| parents | |
| children | ca2b04cdbf4d |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3e0ea204d09e |
|---|---|
| 1 import argparse | |
| 2 from collections import defaultdict | |
| 3 | |
| 4 import numpy | |
| 5 | |
| 6 import pysam | |
| 7 | |
| 8 | |
| 9 def Parser(): | |
| 10 the_parser = argparse.ArgumentParser() | |
| 11 the_parser.add_argument( | |
| 12 '--input', action="store", type=str, help="bam alignment file") | |
| 13 the_parser.add_argument( | |
| 14 '--minquery', type=int, | |
| 15 help="Minimum readsize of query reads (nt) - must be an integer") | |
| 16 the_parser.add_argument( | |
| 17 '--maxquery', type=int, | |
| 18 help="Maximum readsize of query reads (nt) - must be an integer") | |
| 19 the_parser.add_argument( | |
| 20 '--mintarget', type=int, | |
| 21 help="Minimum readsize of target reads (nt) - must be an integer") | |
| 22 the_parser.add_argument( | |
| 23 '--maxtarget', type=int, | |
| 24 help="Maximum readsize of target reads (nt) - must be an integer") | |
| 25 the_parser.add_argument( | |
| 26 '--minscope', type=int, | |
| 27 help="Minimum overlap analyzed (nt) - must be an integer") | |
| 28 the_parser.add_argument( | |
| 29 '--maxscope', type=int, | |
| 30 help="Maximum overlap analyzed (nt) - must be an integer") | |
| 31 the_parser.add_argument( | |
| 32 '--output_h', action="store", type=str, | |
| 33 help="h-signature dataframe") | |
| 34 the_parser.add_argument( | |
| 35 '--output_z', action="store", type=str, | |
| 36 help="z-signature dataframe") | |
| 37 args = the_parser.parse_args() | |
| 38 return args | |
| 39 | |
| 40 | |
| 41 class Map: | |
| 42 | |
| 43 def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23, | |
| 44 maxtarget=29, minscope=1, maxscope=19, output_h='', | |
| 45 output_z=''): | |
| 46 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | |
| 47 self.query_range = range(minquery, maxquery + 1) | |
| 48 self.target_range = range(mintarget, maxtarget + 1) | |
| 49 self.scope = range(minscope, maxscope + 1) | |
| 50 self.H = open(output_h, 'w') | |
| 51 self.Z = open(output_z, 'w') | |
| 52 self.chromosomes = dict(zip(self.bam_object.references, | |
| 53 self.bam_object.lengths)) | |
| 54 self.map_dict = self.create_map(self.bam_object) | |
| 55 self.query_positions = self.compute_query_positions() | |
| 56 self.Z.write(self.compute_signature_pairs()) | |
| 57 self.H.write(self.compute_signature_h()) | |
| 58 self.H.close() | |
| 59 self.Z.close() | |
| 60 | |
| 61 def create_map(self, bam_object): | |
| 62 ''' | |
| 63 Returns a map_dictionary {(chromosome,read_position,polarity): | |
| 64 [read_length, ...]} | |
| 65 ''' | |
| 66 map_dictionary = defaultdict(list) | |
| 67 # get empty value for start and end of each chromosome | |
| 68 for chrom in self.chromosomes: | |
| 69 map_dictionary[(chrom, 1, 'F')] = [] | |
| 70 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] | |
| 71 for chrom in self.chromosomes: | |
| 72 for read in bam_object.fetch(chrom): | |
| 73 if read.is_reverse: | |
| 74 map_dictionary[(chrom, read.reference_end, | |
| 75 'R')].append(read.query_alignment_length) | |
| 76 else: | |
| 77 map_dictionary[(chrom, read.reference_start+1, | |
| 78 'F')].append(read.query_alignment_length) | |
| 79 return map_dictionary | |
| 80 | |
| 81 def compute_query_positions(self): | |
| 82 ''' this method does not filter on read size, just forward reads | |
| 83 that overlap reverse reads in the overlap range''' | |
| 84 all_query_positions = defaultdict(list) | |
| 85 for genomicKey in self.map_dict.keys(): | |
| 86 chrom, coord, pol = genomicKey | |
| 87 for i in self.scope: | |
| 88 if pol == 'F' and len(self.map_dict[chrom, | |
| 89 coord+i-1, | |
| 90 'R']) > 0: | |
| 91 all_query_positions[chrom].append(coord) | |
| 92 break | |
| 93 for chrom in all_query_positions: | |
| 94 all_query_positions[chrom] = sorted( | |
| 95 list(set(all_query_positions[chrom]))) | |
| 96 return all_query_positions | |
| 97 | |
| 98 def countpairs(self, uppers, lowers): | |
| 99 query_range = self.query_range | |
| 100 target_range = self.target_range | |
| 101 uppers = [size for size in uppers if size in query_range or size in | |
| 102 target_range] | |
| 103 lowers = [size for size in lowers if size in query_range or size in | |
| 104 target_range] | |
| 105 paired = [] | |
| 106 for upread in uppers: | |
| 107 for downread in lowers: | |
| 108 if (upread in query_range and downread in target_range) or ( | |
| 109 upread in target_range and downread in query_range): | |
| 110 paired.append(upread) | |
| 111 lowers.remove(downread) | |
| 112 break | |
| 113 return len(paired) | |
| 114 | |
| 115 def compute_signature_pairs(self): | |
| 116 frequency_table = defaultdict(dict) | |
| 117 scope = self.scope | |
| 118 for chrom in self.chromosomes: | |
| 119 for overlap in scope: | |
| 120 frequency_table[chrom][overlap] = 0 | |
| 121 for chrom in self.query_positions: | |
| 122 for coord in self.query_positions[chrom]: | |
| 123 for overlap in scope: | |
| 124 uppers = self.map_dict[chrom, coord, 'F'] | |
| 125 lowers = self.map_dict[chrom, coord+overlap-1, 'R'] | |
| 126 frequency_table[chrom][overlap] += self.countpairs(uppers, | |
| 127 lowers) | |
| 128 # compute overlaps for all chromosomes merged | |
| 129 for overlap in scope: | |
| 130 accumulator = [] | |
| 131 for chrom in frequency_table: | |
| 132 if chrom != 'all_chromosomes': | |
| 133 accumulator.append(frequency_table[chrom][overlap]) | |
| 134 frequency_table['all_chromosomes'][overlap] = sum(accumulator) | |
| 135 return self.stringify_table(frequency_table) | |
| 136 | |
| 137 def signature_tables(self): | |
| 138 query_range = self.query_range | |
| 139 target_range = self.target_range | |
| 140 Query_table = defaultdict(dict) | |
| 141 Target_table = defaultdict(dict) | |
| 142 for key in self.map_dict: | |
| 143 for size in self.map_dict[key]: | |
| 144 if size in query_range or size in target_range: | |
| 145 if key[2] == 'F': | |
| 146 coordinate = key[1] | |
| 147 else: | |
| 148 coordinate = -key[1] | |
| 149 if size in query_range: | |
| 150 Query_table[key[0]][coordinate] = Query_table[key[0]].get( | |
| 151 coordinate, 0) + 1 | |
| 152 if size in target_range: | |
| 153 Target_table[key[0]][coordinate] = \ | |
| 154 Target_table[key[0]].get(coordinate, 0) + 1 | |
| 155 return Query_table, Target_table | |
| 156 | |
| 157 def compute_signature_h(self): | |
| 158 scope = self.scope | |
| 159 Query_table, Target_table = self.signature_tables() | |
| 160 frequency_table = defaultdict(dict) | |
| 161 for chrom in self.chromosomes: | |
| 162 for overlap in scope: | |
| 163 frequency_table[chrom][overlap] = 0 | |
| 164 for chrom in Query_table: | |
| 165 Total_Query_Numb = 0 | |
| 166 for coord in Query_table[chrom]: | |
| 167 Total_Query_Numb += Query_table[chrom][coord] | |
| 168 for coord in Query_table[chrom]: | |
| 169 local_table = dict([(overlap, 0) for overlap in scope]) | |
| 170 number_of_targets = 0 | |
| 171 for overlap in scope: | |
| 172 local_table[overlap] += Query_table[chrom][coord] * \ | |
| 173 Target_table[chrom].get(-coord - overlap + 1, 0) | |
| 174 number_of_targets += Target_table[chrom].get( | |
| 175 -coord - overlap + 1, 0) | |
| 176 for overlap in scope: | |
| 177 try: | |
| 178 frequency_table[chrom][overlap] += \ | |
| 179 local_table[overlap] / number_of_targets \ | |
| 180 / float(Total_Query_Numb) | |
| 181 except ZeroDivisionError: | |
| 182 continue | |
| 183 # compute overlap probabilities for all chromosomes merged | |
| 184 general_frequency_table = dict([(overlap, 0) for overlap in scope]) | |
| 185 total_aligned_reads = 0 | |
| 186 for chrom in frequency_table: | |
| 187 for overlap in frequency_table[chrom]: | |
| 188 total_aligned_reads += self.bam_object.count(chrom) | |
| 189 for chrom in frequency_table: | |
| 190 for overlap in frequency_table[chrom]: | |
| 191 try: | |
| 192 general_frequency_table[overlap] += \ | |
| 193 frequency_table[chrom][overlap] / total_aligned_reads \ | |
| 194 * self.bam_object.count(chrom) | |
| 195 except ZeroDivisionError: | |
| 196 continue | |
| 197 for overlap in general_frequency_table: | |
| 198 frequency_table['all_chromosomes'][overlap] = \ | |
| 199 general_frequency_table[overlap] | |
| 200 return self.stringify_table(frequency_table) | |
| 201 | |
| 202 def stringify_table(self, frequency_table): | |
| 203 ''' | |
| 204 method both to compute z-score and to return a writable string | |
| 205 ''' | |
| 206 tablestring = [] | |
| 207 for chrom in sorted(frequency_table): | |
| 208 accumulator = [] | |
| 209 for overlap in frequency_table[chrom]: | |
| 210 accumulator.append(frequency_table[chrom][overlap]) | |
| 211 z_mean = numpy.mean(accumulator) | |
| 212 z_std = numpy.std(accumulator) | |
| 213 if z_std == 0: | |
| 214 for overlap in sorted(frequency_table[chrom]): | |
| 215 tablestring.append('%s\t%s\t%s\t%s\n' % ( | |
| 216 chrom, str(overlap), | |
| 217 str(frequency_table[chrom][overlap]), str(0))) | |
| 218 else: | |
| 219 for overlap in sorted(frequency_table[chrom]): | |
| 220 tablestring.append('%s\t%s\t%s\t%s\n' % ( | |
| 221 chrom, str(overlap), | |
| 222 str(frequency_table[chrom][overlap]), | |
| 223 str((frequency_table[chrom][overlap] - z_mean)/z_std))) | |
| 224 return ''.join(tablestring) | |
| 225 | |
| 226 | |
| 227 if __name__ == "__main__": | |
| 228 args = Parser() | |
| 229 mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget, | |
| 230 args.maxtarget, args.minscope, args.maxscope, args.output_h, | |
| 231 args.output_z) |
