Mercurial > repos > artbio > small_rna_signatures
comparison overlapping_reads.py @ 0:3e0ea204d09e draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 4e84386b619a7941f23d175d7fc86aba7990ac36"
| author | artbio |
|---|---|
| date | Tue, 07 Jan 2020 11:59:34 +0000 |
| parents | |
| children | ca2b04cdbf4d |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3e0ea204d09e |
|---|---|
| 1 import argparse | |
| 2 from collections import defaultdict | |
| 3 | |
| 4 import pysam | |
| 5 | |
| 6 | |
| 7 def Parser(): | |
| 8 the_parser = argparse.ArgumentParser() | |
| 9 the_parser.add_argument( | |
| 10 '--input', action="store", type=str, help="bam alignment file") | |
| 11 the_parser.add_argument( | |
| 12 '--minquery', type=int, | |
| 13 help="Minimum readsize of query reads (nt) - must be an integer") | |
| 14 the_parser.add_argument( | |
| 15 '--maxquery', type=int, | |
| 16 help="Maximum readsize of query reads (nt) - must be an integer") | |
| 17 the_parser.add_argument( | |
| 18 '--mintarget', type=int, | |
| 19 help="Minimum readsize of target reads (nt) - must be an integer") | |
| 20 the_parser.add_argument( | |
| 21 '--maxtarget', type=int, | |
| 22 help="Maximum readsize of target reads (nt) - must be an integer") | |
| 23 the_parser.add_argument( | |
| 24 '--overlap', type=int, | |
| 25 help="Overlap analyzed (nt) - must be an integer") | |
| 26 the_parser.add_argument( | |
| 27 '--output', action="store", type=str, | |
| 28 help="Pairable sequences") | |
| 29 args = the_parser.parse_args() | |
| 30 return args | |
| 31 | |
| 32 | |
| 33 class Map: | |
| 34 | |
| 35 def __init__(self, bam_file, output, minquery=23, maxquery=29, | |
| 36 mintarget=23, maxtarget=29, overlap=10): | |
| 37 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | |
| 38 self.output = output | |
| 39 self.query_range = range(minquery, maxquery + 1) | |
| 40 self.target_range = range(mintarget, maxtarget + 1) | |
| 41 self.overlap = overlap | |
| 42 self.chromosomes = dict(zip(self.bam_object.references, | |
| 43 self.bam_object.lengths)) | |
| 44 self.alignement_dic = self.index_alignments(self.bam_object) | |
| 45 self.all_query_positions = self.query_positions(self.bam_object, | |
| 46 overlap=self.overlap) | |
| 47 self.readdic = self.make_readdic(self.bam_object) | |
| 48 self.pairing() | |
| 49 | |
| 50 def make_readdic(self, bam_object): | |
| 51 readdic = defaultdict(int) | |
| 52 for read in bam_object.fetch(): | |
| 53 readdic[read.query_sequence] += 1 | |
| 54 return readdic | |
| 55 | |
| 56 def index_alignments(self, bam_object): | |
| 57 ''' | |
| 58 dic[(chrom, pos, polarity)]: [readseq1, readseq2, ...] | |
| 59 the list value is further converted in set | |
| 60 ''' | |
| 61 dic = defaultdict(list) | |
| 62 for chrom in self.chromosomes: | |
| 63 for read in bam_object.fetch(chrom): | |
| 64 if read.is_reverse: | |
| 65 coord = read.reference_end-1 | |
| 66 pol = 'R' | |
| 67 else: | |
| 68 coord = read.reference_start | |
| 69 pol = 'F' | |
| 70 dic[(chrom, coord, pol)].append(read.query_sequence) | |
| 71 for key in dic: | |
| 72 dic[key] = set(dic[key]) | |
| 73 return dic | |
| 74 | |
| 75 def query_positions(self, bam_object, overlap): | |
| 76 all_query_positions = defaultdict(list) | |
| 77 for genomicKey in self.alignement_dic.keys(): | |
| 78 chrom, coord, pol = genomicKey | |
| 79 if pol == 'F' and len(self.alignement_dic[(chrom, | |
| 80 coord+overlap-1, | |
| 81 'R')]) > 0: | |
| 82 all_query_positions[chrom].append(coord) | |
| 83 for chrom in all_query_positions: | |
| 84 all_query_positions[chrom] = sorted( | |
| 85 list(set(all_query_positions[chrom]))) | |
| 86 return all_query_positions | |
| 87 | |
| 88 def countpairs(self, uppers, lowers): | |
| 89 query_range = self.query_range | |
| 90 target_range = self.target_range | |
| 91 uppers = [seq for seq in uppers if (len(seq) in query_range or len(seq) | |
| 92 in target_range)] | |
| 93 print(uppers) | |
| 94 uppers_expanded = [] | |
| 95 for seq in uppers: | |
| 96 expand = [seq for i in range(self.readdic[seq])] | |
| 97 uppers_expanded.extend(expand) | |
| 98 print(uppers_expanded) | |
| 99 uppers = uppers_expanded | |
| 100 lowers = [seq for seq in lowers if (len(seq) in query_range or len(seq) | |
| 101 in target_range)] | |
| 102 lowers_expanded = [] | |
| 103 for seq in lowers: | |
| 104 expand = [seq for i in range(self.readdic[seq])] | |
| 105 lowers_expanded.extend(expand) | |
| 106 lowers = lowers_expanded | |
| 107 paired = [] | |
| 108 for upread in uppers: | |
| 109 for downread in lowers: | |
| 110 if (len(upread) in query_range and len(downread) in | |
| 111 target_range) or (len(upread) in target_range and | |
| 112 len(downread) in query_range): | |
| 113 paired.append(upread) | |
| 114 lowers.remove(downread) | |
| 115 break | |
| 116 return len(paired) | |
| 117 | |
| 118 def pairing(self): | |
| 119 F = open(self.output, 'w') | |
| 120 query_range = self.query_range | |
| 121 target_range = self.target_range | |
| 122 overlap = self.overlap | |
| 123 stringresult = [] | |
| 124 header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n' | |
| 125 total_pairs = 0 | |
| 126 print('Chromosome\tNbre of pairs') | |
| 127 for chrom in sorted(self.chromosomes): | |
| 128 number_pairs = 0 | |
| 129 for pos in self.all_query_positions[chrom]: | |
| 130 stringbuffer = [] | |
| 131 uppers = self.alignement_dic[chrom, pos, 'F'] | |
| 132 lowers = self.alignement_dic[chrom, pos+overlap-1, 'R'] | |
| 133 number_pairs += self.countpairs(uppers, lowers) | |
| 134 total_pairs += number_pairs | |
| 135 if uppers and lowers: | |
| 136 for upread in uppers: | |
| 137 for downread in lowers: | |
| 138 if (len(upread) in query_range and len(downread) in | |
| 139 target_range) or (len(upread) in target_range | |
| 140 and len(downread) in | |
| 141 query_range): | |
| 142 stringbuffer.append( | |
| 143 header_template % | |
| 144 (chrom, pos+1, '+', len(upread), | |
| 145 self.readdic[upread], upread)) | |
| 146 stringbuffer.append( | |
| 147 header_template % | |
| 148 (chrom, pos+overlap-len(downread)+1, '-', | |
| 149 len(downread), self.readdic[downread], | |
| 150 self.revcomp(downread))) | |
| 151 stringresult.extend(sorted(set(stringbuffer))) | |
| 152 print('%s\t%s' % (chrom, number_pairs)) | |
| 153 print('Total nbre of pairs that can be simultaneously formed\t%s' | |
| 154 % total_pairs) | |
| 155 F.write(''.join(stringresult)) | |
| 156 | |
| 157 def revcomp(self, sequence): | |
| 158 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | |
| 159 revseq = sequence[::-1] | |
| 160 return "".join([antidict[i] for i in revseq]) | |
| 161 | |
| 162 | |
| 163 if __name__ == "__main__": | |
| 164 args = Parser() | |
| 165 mapobj = Map(args.input, args.output, args.minquery, args.maxquery, | |
| 166 args.mintarget, args.maxtarget, args.overlap) |
