comparison overlapping_reads.py @ 0:3e0ea204d09e draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 4e84386b619a7941f23d175d7fc86aba7990ac36"
author artbio
date Tue, 07 Jan 2020 11:59:34 +0000
parents
children ca2b04cdbf4d
comparison
equal deleted inserted replaced
-1:000000000000 0:3e0ea204d09e
1 import argparse
2 from collections import defaultdict
3
4 import pysam
5
6
7 def Parser():
8 the_parser = argparse.ArgumentParser()
9 the_parser.add_argument(
10 '--input', action="store", type=str, help="bam alignment file")
11 the_parser.add_argument(
12 '--minquery', type=int,
13 help="Minimum readsize of query reads (nt) - must be an integer")
14 the_parser.add_argument(
15 '--maxquery', type=int,
16 help="Maximum readsize of query reads (nt) - must be an integer")
17 the_parser.add_argument(
18 '--mintarget', type=int,
19 help="Minimum readsize of target reads (nt) - must be an integer")
20 the_parser.add_argument(
21 '--maxtarget', type=int,
22 help="Maximum readsize of target reads (nt) - must be an integer")
23 the_parser.add_argument(
24 '--overlap', type=int,
25 help="Overlap analyzed (nt) - must be an integer")
26 the_parser.add_argument(
27 '--output', action="store", type=str,
28 help="Pairable sequences")
29 args = the_parser.parse_args()
30 return args
31
32
33 class Map:
34
35 def __init__(self, bam_file, output, minquery=23, maxquery=29,
36 mintarget=23, maxtarget=29, overlap=10):
37 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
38 self.output = output
39 self.query_range = range(minquery, maxquery + 1)
40 self.target_range = range(mintarget, maxtarget + 1)
41 self.overlap = overlap
42 self.chromosomes = dict(zip(self.bam_object.references,
43 self.bam_object.lengths))
44 self.alignement_dic = self.index_alignments(self.bam_object)
45 self.all_query_positions = self.query_positions(self.bam_object,
46 overlap=self.overlap)
47 self.readdic = self.make_readdic(self.bam_object)
48 self.pairing()
49
50 def make_readdic(self, bam_object):
51 readdic = defaultdict(int)
52 for read in bam_object.fetch():
53 readdic[read.query_sequence] += 1
54 return readdic
55
56 def index_alignments(self, bam_object):
57 '''
58 dic[(chrom, pos, polarity)]: [readseq1, readseq2, ...]
59 the list value is further converted in set
60 '''
61 dic = defaultdict(list)
62 for chrom in self.chromosomes:
63 for read in bam_object.fetch(chrom):
64 if read.is_reverse:
65 coord = read.reference_end-1
66 pol = 'R'
67 else:
68 coord = read.reference_start
69 pol = 'F'
70 dic[(chrom, coord, pol)].append(read.query_sequence)
71 for key in dic:
72 dic[key] = set(dic[key])
73 return dic
74
75 def query_positions(self, bam_object, overlap):
76 all_query_positions = defaultdict(list)
77 for genomicKey in self.alignement_dic.keys():
78 chrom, coord, pol = genomicKey
79 if pol == 'F' and len(self.alignement_dic[(chrom,
80 coord+overlap-1,
81 'R')]) > 0:
82 all_query_positions[chrom].append(coord)
83 for chrom in all_query_positions:
84 all_query_positions[chrom] = sorted(
85 list(set(all_query_positions[chrom])))
86 return all_query_positions
87
88 def countpairs(self, uppers, lowers):
89 query_range = self.query_range
90 target_range = self.target_range
91 uppers = [seq for seq in uppers if (len(seq) in query_range or len(seq)
92 in target_range)]
93 print(uppers)
94 uppers_expanded = []
95 for seq in uppers:
96 expand = [seq for i in range(self.readdic[seq])]
97 uppers_expanded.extend(expand)
98 print(uppers_expanded)
99 uppers = uppers_expanded
100 lowers = [seq for seq in lowers if (len(seq) in query_range or len(seq)
101 in target_range)]
102 lowers_expanded = []
103 for seq in lowers:
104 expand = [seq for i in range(self.readdic[seq])]
105 lowers_expanded.extend(expand)
106 lowers = lowers_expanded
107 paired = []
108 for upread in uppers:
109 for downread in lowers:
110 if (len(upread) in query_range and len(downread) in
111 target_range) or (len(upread) in target_range and
112 len(downread) in query_range):
113 paired.append(upread)
114 lowers.remove(downread)
115 break
116 return len(paired)
117
118 def pairing(self):
119 F = open(self.output, 'w')
120 query_range = self.query_range
121 target_range = self.target_range
122 overlap = self.overlap
123 stringresult = []
124 header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n'
125 total_pairs = 0
126 print('Chromosome\tNbre of pairs')
127 for chrom in sorted(self.chromosomes):
128 number_pairs = 0
129 for pos in self.all_query_positions[chrom]:
130 stringbuffer = []
131 uppers = self.alignement_dic[chrom, pos, 'F']
132 lowers = self.alignement_dic[chrom, pos+overlap-1, 'R']
133 number_pairs += self.countpairs(uppers, lowers)
134 total_pairs += number_pairs
135 if uppers and lowers:
136 for upread in uppers:
137 for downread in lowers:
138 if (len(upread) in query_range and len(downread) in
139 target_range) or (len(upread) in target_range
140 and len(downread) in
141 query_range):
142 stringbuffer.append(
143 header_template %
144 (chrom, pos+1, '+', len(upread),
145 self.readdic[upread], upread))
146 stringbuffer.append(
147 header_template %
148 (chrom, pos+overlap-len(downread)+1, '-',
149 len(downread), self.readdic[downread],
150 self.revcomp(downread)))
151 stringresult.extend(sorted(set(stringbuffer)))
152 print('%s\t%s' % (chrom, number_pairs))
153 print('Total nbre of pairs that can be simultaneously formed\t%s'
154 % total_pairs)
155 F.write(''.join(stringresult))
156
157 def revcomp(self, sequence):
158 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
159 revseq = sequence[::-1]
160 return "".join([antidict[i] for i in revseq])
161
162
163 if __name__ == "__main__":
164 args = Parser()
165 mapobj = Map(args.input, args.output, args.minquery, args.maxquery,
166 args.mintarget, args.maxtarget, args.overlap)