Mercurial > repos > drosofff > msp_sr_size_histograms
annotate smRtools.py @ 7:fed53c74fb92 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_size_histograms commit 89caea4594db1ae6d6bb9c651bc6019bb6dd3ce6
| author | drosofff |
|---|---|
| date | Thu, 10 Mar 2016 10:59:45 -0500 |
| parents | 9f75d887904d |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/python |
| 2 # version 1 7-5-2012 unification of the SmRNAwindow class | |
| 3 | |
| 4 import sys, subprocess | |
| 5 from collections import defaultdict | |
| 6 from numpy import mean, median, std | |
##Disable scipy import temporarily, as no working scipy on toolshed.
##from scipy import stats
| 0 | 9 |
def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
    '''Return a dictionary mapping fasta identifiers to their sequence,
    extracted from a bowtie index. `index` is the path prefix of a bowtie
    index; bowtie-inspect is used to dump the sequences (one per line).
    Identifiers are truncated at the first whitespace, as bowtie does.'''
    p = subprocess.Popen(args=["bowtie-inspect", "-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)  # bowtie-inspect outputs sequences on single lines
    outputlines = p.stdout.readlines()
    p.wait()
    item_dic = {}
    current_item = None
    stringlist = []
    for line in outputlines:
        if line[0] == ">":
            # dump the sequence of the previous item, if any (replaces the
            # former bare try/except around the first header)
            if current_item is not None:
                item_dic[current_item] = "".join(stringlist)
            current_item = line[1:].rstrip().split()[0]  # take the first word before space because bowtie splits headers !
            stringlist = []
        else:
            stringlist.append(line.rstrip())
    if current_item is not None:  # for the last item; also avoids a NameError on empty output
        item_dic[current_item] = "".join(stringlist)
    return item_dic
| 29 | |
def get_fasta_headers (index):
    '''Return a dictionary whose keys are the fasta identifiers stored in the
    bowtie index `index` (every value is 1). Identifiers are truncated at the
    first whitespace, as bowtie does.'''
    p = subprocess.Popen(args=["bowtie-inspect", "-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)  # one header per output line
    header_lines = p.stdout.readlines()
    p.wait()
    return dict((line.rstrip().split()[0], 1) for line in header_lines)
| 39 | |
| 40 | |
def get_file_sample (file, numberoflines):
    '''Return `numberoflines` randomly sampled lines from `file`, or the
    string "sample size exceeds file size" when the file holds fewer lines
    than requested.
    Fix: the module-level code never imported random (the old docstring even
    said "import random to use this function"), so the call crashed with a
    NameError; the import is now done locally.'''
    import random  # local import: only this helper needs it
    with open(file) as F:
        fullfile = F.read().splitlines()
    if len(fullfile) < numberoflines:
        return "sample size exceeds file size"
    return random.sample(fullfile, numberoflines)
| 49 | |
def get_fasta_from_history (file):
    '''Parse a multi-fasta file (e.g. from the galaxy history) and return a
    {identifier: sequence} dictionary. Identifiers are truncated at the first
    whitespace to mimic bowtie behaviour.
    Fixes: the bare except around the first header is gone, an empty file no
    longer raises a NameError, and sequence lines are rstripped instead of
    `line[:-1]` so a missing final newline does not eat the last base.'''
    item_dic = {}
    current_item = None
    stringlist = []
    F = open(file, "r")
    for line in F:
        if line[0] == ">":
            if current_item is not None:
                item_dic[current_item] = "".join(stringlist)  # dump the sequence of the previous item
            current_item = line[1:].rstrip().split()[0]  # take the first word before space because bowtie splits headers !
            stringlist = []
        else:
            stringlist.append(line.rstrip())
    F.close()
    if current_item is not None:  # for the last item
        item_dic[current_item] = "".join(stringlist)
    return item_dic
| 65 | |
def antipara (sequence):
    '''Return the reverse-complement of a DNA `sequence` (ATGCN alphabet;
    any other character raises a KeyError).'''
    complement = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
    return "".join(complement[base] for base in reversed(sequence))
| 70 | |
def RNAtranslate (sequence):
    '''Return `sequence` with every character that is not A, G, C or N
    replaced by U (DNA -> RNA; in practice this turns T into U).'''
    translated = []
    for base in sequence:
        translated.append(base if base in "AGCN" else "U")
    return "".join(translated)
def DNAtranslate (sequence):
    '''Return `sequence` with every character that is not A, G, C or N
    replaced by T (RNA -> DNA; in practice this turns U into T).'''
    return "".join("T" if base not in "AGCN" else base for base in sequence)
| 75 | |
def RNAfold (sequence_list):
    '''Run RNAfold (ViennaRNA) on a list of sequences and return a dictionary
    mapping each DNA-translated sequence to its folding energy, as a string
    (the energy printed by RNAfold at the end of the structure line).'''
    rnafold_input = "\n".join(sequence_list)
    p = subprocess.Popen(args=["RNAfold", "--noPS"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = p.communicate(rnafold_input)[0]
    p.wait()
    lines = output.split("\n")
    if not lines[-1]:
        lines = lines[:-1]  # nasty patch to remove last empty line
    parsed = []
    for line in lines:
        if line[0] in ["N", "A", "T", "U", "G", "C"]:
            parsed.append(DNAtranslate(line))  # sequence line
        if line[0] in ["(", ".", ")"]:
            # structure line, e.g. "...((..)) (-1.20)": the energy is the last "(" field
            energy_field = line.split("(")[-1]
            energy_field = energy_field[:-1]  # remove the ) parenthesis
            parsed.append(str(float(energy_field)))
    # parsed alternates sequence, energy, sequence, energy...
    return dict(zip(parsed[::2], parsed[1::2]))
| 94 | |
def extractsubinstance (start, end, instance):
    '''Return a new SmRNAwindow covering the [start, end] 1-based inclusive
    slice of `instance`, with the reads mapping inside that window copied over
    (external function rather than a method, to save memory).
    Fix: dict.has_key was removed in python 3; the equivalent `in` test is
    used instead (identical behaviour under python 2).'''
    subinstance = SmRNAwindow(instance.gene, instance.sequence[start-1:end], start)
    subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
    for coordinate in range(start, end + 1):
        if coordinate in instance.readDict:        # forward reads
            subinstance.readDict[coordinate] = instance.readDict[coordinate]
        if -coordinate in instance.readDict:       # reverse reads
            subinstance.readDict[-coordinate] = instance.readDict[-coordinate]
    return subinstance
| 106 | |
class HandleSmRNAwindows:
    '''Parse an alignment file (tabular bowtie output, sam or bam) against a
    reference (bowtie index or fasta file) and dispatch each aligned read to
    one SmRNAwindow instance per reference item.'''

    def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
        self.biosample = biosample
        self.alignmentFile = alignmentFile
        self.alignmentFileFormat = alignmentFileFormat  # can be "tabular", "sam" or "bam"
        self.genomeRefFile = genomeRefFile
        self.genomeRefFormat = genomeRefFormat  # can be "bowtieIndex" or "fastaSource"
        self.alignedReads = 0
        self.instanceDict = {}
        self.size_inf = size_inf  # when truthy, only reads with size_inf <= size <= size_sup are registered
        self.size_sup = size_sup
        self.norm = norm
        if genomeRefFormat == "bowtieIndex":
            self.itemDict = get_fasta(genomeRefFile)
        elif genomeRefFormat == "fastaSource":
            self.itemDict = get_fasta_from_history(genomeRefFile)
        for item in self.itemDict:
            # create as many instances as there are items
            self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm)
        self.readfile()

    def _addread(self, gene, polarity, offset, size):
        '''Register one read in the relevant SmRNAwindow, applying the optional
        size filter. Factors the logic previously duplicated between the
        tabular and sam/bam branches of readfile. `offset` must be 1-based.'''
        if self.size_inf and not (self.size_inf <= size <= self.size_sup):
            return
        self.instanceDict[gene].addread(polarity, offset, size)
        self.alignedReads += 1

    def readfile (self):
        '''Parse self.alignmentFile according to self.alignmentFileFormat,
        populate self.instanceDict and return it.'''
        if self.alignmentFileFormat == "tabular":
            F = open(self.alignmentFile, "r")
            for line in F:
                fields = line.split()
                # bowtie tabular output: [1]=polarity, [2]=item, [3]=0-based offset, [4]=read sequence
                self._addread(fields[2], fields[1], int(fields[3]) + 1, len(fields[4]))  # +1 to correct to 1-based coordinates of SmRNAwindow
            F.close()
            return self.instanceDict
        elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
            import pysam  # deferred import: pysam is only required for sam/bam input
            samfile = pysam.Samfile(self.alignmentFile)
            for read in samfile:
                if read.tid == -1:
                    continue  # filter out unaligned reads
                polarity = "-" if read.is_reverse else "+"
                self._addread(samfile.getrname(read.tid), polarity, read.pos + 1, read.qlen)  # +1 to correct to 1-based coordinates of SmRNAwindow
            return self.instanceDict

    def size_histogram (self):  # in HandleSmRNAwindows
        '''refactored on 7-9-2014 to debug size_histogram tool.
        Aggregate the per-item size histograms and return
        {'F': {size: count}, 'R': {size: count}, 'both': {size: count}},
        where 'R' counts are stored negative and 'both' is F - R.'''
        size_dict = {}
        size_dict['F'] = defaultdict(float)
        size_dict['R'] = defaultdict(float)
        size_dict['both'] = defaultdict(float)
        for item in self.instanceDict:
            buffer_dict = self.instanceDict[item].size_histogram()
            for polarity in ["F", "R"]:
                for size in buffer_dict[polarity]:
                    size_dict[polarity][size] += buffer_dict[polarity][size]
            for size in buffer_dict["both"]:
                size_dict["both"][size] += buffer_dict["F"][size] - buffer_dict["R"][size]
        return size_dict

    def CountFeatures (self, GFF3="path/to/file"):
        '''Count reads per GFF3 feature type. Uses the "destructive" readcount
        method so that each read is counted for at most one feature.'''
        featureDict = defaultdict(int)
        F = open(GFF3, "r")
        for line in F:
            if line[0] == "#":
                continue
            fields = line[:-1].split()
            chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
            featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
        F.close()
        return featureDict
| 228 | |
| 229 class SmRNAwindow: | |
| 230 | |
| 231 def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0): | |
| 232 self.biosample = biosample | |
| 233 self.sequence = sequence | |
| 234 self.gene = gene | |
| 235 self.windowoffset = windowoffset | |
| 236 self.size = len(sequence) | |
| 237 self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...} | |
| 238 self.matchedreadsUp = 0 | |
| 239 self.matchedreadsDown = 0 | |
| 240 self.norm=norm | |
| 241 | |
| 242 def addread (self, polarity, offset, size): | |
| 243 '''ATTENTION ATTENTION ATTENTION''' | |
| 244 ''' We removed the conversion from 0 to 1 based offset, as we do this now during readparsing.''' | |
| 245 if polarity == "+": | |
| 246 self.readDict[offset].append(size) | |
| 247 self.matchedreadsUp += 1 | |
| 248 else: | |
| 249 self.readDict[-(offset + size -1)].append(size) | |
| 250 self.matchedreadsDown += 1 | |
| 251 return | |
| 252 | |
| 253 def barycenter (self, upstream_coord=None, downstream_coord=None): | |
| 254 '''refactored 24-12-2013 to save memory and introduce offset filtering see readcount method for further discussion on that | |
| 255 In this version, attempt to replace the dictionary structure by a list of tupple to save memory too''' | |
| 256 upstream_coord = upstream_coord or self.windowoffset | |
| 257 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 258 window_size = downstream_coord - upstream_coord +1 | |
| 259 def weigthAverage (TuppleList): | |
| 260 weightSum = 0 | |
| 261 PonderWeightSum = 0 | |
| 262 for tuple in TuppleList: | |
| 263 PonderWeightSum += tuple[0] * tuple[1] | |
| 264 weightSum += tuple[1] | |
| 265 if weightSum > 0: | |
| 266 return PonderWeightSum / float(weightSum) | |
| 267 else: | |
| 268 return 0 | |
| 269 forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window | |
| 270 reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window | |
| 271 Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size | |
| 272 Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size | |
| 273 return Fbarycenter, Rbarycenter | |
| 274 | |
| 275 def correlation_mapper (self, reference, window_size): | |
| 276 '''to map correlation with a sliding window 26-2-2013''' | |
|
2
9f75d887904d
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
0
diff
changeset
|
277 from scipy import stats |
|
9f75d887904d
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
mvdbeek
parents:
0
diff
changeset
|
278 |
| 0 | 279 if window_size > self.size: |
| 280 return [] | |
| 281 F=open(reference, "r") | |
| 282 reference_forward = [] | |
| 283 reference_reverse = [] | |
| 284 for line in F: | |
| 285 fields=line.split() | |
| 286 reference_forward.append(int(float(fields[1]))) | |
| 287 reference_reverse.append(int(float(fields[2]))) | |
| 288 F.close() | |
| 289 local_object_forward=[] | |
| 290 local_object_reverse=[] | |
| 291 ## Dict to list for the local object | |
| 292 for i in range(1, self.size+1): | |
| 293 local_object_forward.append(len(self.readDict[i])) | |
| 294 local_object_reverse.append(len(self.readDict[-i])) | |
| 295 ## start compiling results by slides | |
| 296 results=[] | |
| 297 for coordinate in range(self.size - window_size): | |
| 298 local_forward=local_object_forward[coordinate:coordinate + window_size] | |
| 299 local_reverse=local_object_reverse[coordinate:coordinate + window_size] | |
| 300 if sum(local_forward) == 0 or sum(local_reverse) == 0: | |
| 301 continue | |
| 302 try: | |
| 303 reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward) | |
| 304 reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse) | |
| 305 if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2): | |
| 306 results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]]) | |
| 307 except: | |
| 308 pass | |
| 309 return results | |
| 310 | |
| 311 def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"): | |
| 312 '''refactored 24-12-2013 to save memory and introduce offset filtering | |
| 313 take a look at the defaut parameters that cannot be defined relatively to the instance are they are defined before instanciation | |
| 314 the trick is to pass None and then test | |
| 315 polarity parameter can take "both", "forward" or "reverse" as value''' | |
| 316 upstream_coord = upstream_coord or self.windowoffset | |
| 317 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 318 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both": | |
| 319 return self.matchedreadsUp + self.matchedreadsDown | |
| 320 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward": | |
| 321 return self.matchedreadsUp | |
| 322 if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse": | |
| 323 return self.matchedreadsDown | |
| 324 n=0 | |
| 325 if polarity == "both": | |
| 326 for offset in xrange(upstream_coord, downstream_coord+1): | |
| 327 if self.readDict.has_key(offset): | |
| 328 for read in self.readDict[offset]: | |
| 329 if (read>=size_inf and read<= size_sup): | |
| 330 n += 1 | |
| 331 if method != "conservative": | |
| 332 del self.readDict[offset] ## Carefull ! precludes re-use on the self.readDict dictionary !!!!!! TEST | |
| 333 if self.readDict.has_key(-offset): | |
| 334 for read in self.readDict[-offset]: | |
| 335 if (read>=size_inf and read<= size_sup): | |
| 336 n += 1 | |
| 337 if method != "conservative": | |
| 338 del self.readDict[-offset] | |
| 339 return n | |
| 340 elif polarity == "forward": | |
| 341 for offset in xrange(upstream_coord, downstream_coord+1): | |
| 342 if self.readDict.has_key(offset): | |
| 343 for read in self.readDict[offset]: | |
| 344 if (read>=size_inf and read<= size_sup): | |
| 345 n += 1 | |
| 346 return n | |
| 347 elif polarity == "reverse": | |
| 348 for offset in xrange(upstream_coord, downstream_coord+1): | |
| 349 if self.readDict.has_key(-offset): | |
| 350 for read in self.readDict[-offset]: | |
| 351 if (read>=size_inf and read<= size_sup): | |
| 352 n += 1 | |
| 353 return n | |
| 354 | |
| 355 def readsizes (self): | |
| 356 '''return a dictionary of number of reads by size (the keys)''' | |
| 357 dicsize = {} | |
| 358 for offset in self.readDict: | |
| 359 for size in self.readDict[offset]: | |
| 360 dicsize[size] = dicsize.get(size, 0) + 1 | |
| 361 for offset in range (min(dicsize.keys()), max(dicsize.keys())+1): | |
| 362 dicsize[size] = dicsize.get(size, 0) # to fill offsets with null values | |
| 363 return dicsize | |
| 364 | |
| 365 # def size_histogram(self): | |
| 366 # norm=self.norm | |
| 367 # hist_dict={} | |
| 368 # hist_dict['F']={} | |
| 369 # hist_dict['R']={} | |
| 370 # for offset in self.readDict: | |
| 371 # for size in self.readDict[offset]: | |
| 372 # if offset < 0: | |
| 373 # hist_dict['R'][size] = hist_dict['R'].get(size, 0) - 1*norm | |
| 374 # else: | |
| 375 # hist_dict['F'][size] = hist_dict['F'].get(size, 0) + 1*norm | |
| 376 # ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate ! | |
| 377 # if not (hist_dict['F']) and (not hist_dict['R']): | |
| 378 # hist_dict['F'][21] = 0 | |
| 379 # hist_dict['R'][21] = 0 | |
| 380 # ## | |
| 381 # return hist_dict | |
| 382 | |
| 383 def size_histogram(self, minquery=None, maxquery=None): # in SmRNAwindow | |
| 384 '''refactored on 7-9-2014 to debug size_histogram tool''' | |
| 385 norm=self.norm | |
| 386 size_dict={} | |
| 387 size_dict['F']= defaultdict (float) | |
| 388 size_dict['R']= defaultdict (float) | |
| 389 size_dict['both']= defaultdict (float) | |
| 390 for offset in self.readDict: | |
| 391 for size in self.readDict[offset]: | |
| 392 if offset < 0: | |
| 393 size_dict['R'][size] = size_dict['R'][size] - 1*norm | |
| 394 else: | |
| 395 size_dict['F'][size] = size_dict['F'][size] + 1*norm | |
| 396 ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate ! | |
| 397 if not (size_dict['F']) and (not size_dict['R']): | |
| 398 size_dict['F'][21] = 0 | |
| 399 size_dict['R'][21] = 0 | |
| 400 ## | |
| 401 allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) ) | |
| 402 for size in allSizeKeys: | |
| 403 size_dict['both'][size] = size_dict['F'][size] - size_dict['R'][size] | |
| 404 if minquery: | |
| 405 for polarity in size_dict.keys(): | |
| 406 for size in xrange(minquery, maxquery+1): | |
| 407 if not size in size_dict[polarity].keys(): | |
| 408 size_dict[polarity][size]=0 | |
| 409 return size_dict | |
| 410 | |
| 411 def statsizes (self, upstream_coord=None, downstream_coord=None): | |
| 412 ''' migration to memory saving by specifying possible subcoordinates | |
| 413 see the readcount method for further discussion''' | |
| 414 upstream_coord = upstream_coord or self.windowoffset | |
| 415 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 416 L = [] | |
| 417 for offset in self.readDict: | |
| 418 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
| 419 for size in self.readDict[offset]: | |
| 420 L.append(size) | |
| 421 meansize = mean(L) | |
| 422 stdv = std(L) | |
| 423 mediansize = median(L) | |
| 424 return meansize, mediansize, stdv | |
| 425 | |
| 426 def foldEnergy (self, upstream_coord=None, downstream_coord=None): | |
| 427 ''' migration to memory saving by specifying possible subcoordinates | |
| 428 see the readcount method for further discussion''' | |
| 429 upstream_coord = upstream_coord or self.windowoffset | |
| 430 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 431 Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ]) | |
| 432 return float(Energy[self.sequence[upstream_coord-1:downstream_coord]]) | |
| 433 | |
| 434 def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None): | |
| 435 ''' migration to memory saving by specifying possible subcoordinates | |
| 436 see the readcount method for further discussion. size_scope must be an interable''' | |
| 437 upstream_coord = upstream_coord or self.windowoffset | |
| 438 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 439 freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0} | |
| 440 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
| 441 for offset in self.readDict: | |
| 442 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
| 443 for size in self.readDict[offset]: | |
| 444 if size in size_scope: | |
| 445 startbase = self.sequence[abs(offset)-self.windowoffset] | |
| 446 if offset < 0: | |
| 447 startbase = convertDic[startbase] | |
| 448 freqDic[startbase] += 1 | |
| 449 base_sum = float ( sum( freqDic.values()) ) | |
| 450 if base_sum == 0: | |
| 451 return "." | |
| 452 else: | |
| 453 return freqDic["T"] / base_sum * 100 | |
| 454 | |
| 455 def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None): | |
| 456 ''' migration to memory saving by specifying possible subcoordinates | |
| 457 see the readcount method for further discussion. size_scope must be an interable | |
| 458 This method is similar to the Ufreq method but take strandness into account''' | |
| 459 upstream_coord = upstream_coord or self.windowoffset | |
| 460 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 461 freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0} | |
| 462 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
| 463 for offset in self.readDict: | |
| 464 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
| 465 for size in self.readDict[offset]: | |
| 466 if size in size_scope: | |
| 467 startbase = self.sequence[abs(offset)-self.windowoffset] | |
| 468 if offset < 0: | |
| 469 startbase = convertDic[startbase] | |
| 470 freqDic[startbase+"rev"] += 1 | |
| 471 else: | |
| 472 freqDic[startbase+"for"] += 1 | |
| 473 forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"]) | |
| 474 reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"]) | |
| 475 if forward_sum == 0 and reverse_sum == 0: | |
| 476 return ". | ." | |
| 477 elif reverse_sum == 0: | |
| 478 return "%s | ." % (freqDic["Tfor"] / forward_sum * 100) | |
| 479 elif forward_sum == 0: | |
| 480 return ". | %s" % (freqDic["Trev"] / reverse_sum * 100) | |
| 481 else: | |
| 482 return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100) | |
| 483 | |
| 484 | |
| 485 def readplot (self): | |
| 486 norm=self.norm | |
| 487 readmap = {} | |
| 488 for offset in self.readDict.keys(): | |
| 489 readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm ) | |
| 490 mylist = [] | |
| 491 for offset in sorted(readmap): | |
| 492 if readmap[offset][1] != 0: | |
| 493 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") ) | |
| 494 if readmap[offset][0] != 0: | |
| 495 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") ) | |
| 496 ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate ! | |
| 497 if not mylist: | |
| 498 mylist.append("%s\t%s\t%s\t%s" % (self.gene, 1, 0, "F") ) | |
| 499 ### | |
| 500 return mylist | |
| 501 | |
| 502 def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None): | |
| 503 '''Use by MirParser tool''' | |
| 504 upstream_coord = upstream_coord or 1 | |
| 505 downstream_coord = downstream_coord or self.size | |
| 506 windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord) | |
| 507 forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)]) | |
| 508 totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward") | |
| 509 totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse") | |
| 510 if totalforward > totalreverse: | |
| 511 majorcoverage = "forward" | |
| 512 for offset in self.readDict.keys(): | |
| 513 if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ): | |
| 514 for read in self.readDict[offset]: | |
| 515 for i in xrange(read): | |
| 516 try: | |
| 517 forORrev_coverage[offset-upstream_coord+1+i] += 1 | |
| 518 except KeyError: | |
| 519 continue # a sense read may span over the downstream limit | |
| 520 else: | |
| 521 majorcoverage = "reverse" | |
| 522 for offset in self.readDict.keys(): | |
| 523 if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ): | |
| 524 for read in self.readDict[offset]: | |
| 525 for i in xrange(read): | |
| 526 try: | |
| 527 forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage | |
| 528 except KeyError: | |
| 529 continue # an antisense read may span over the upstream limit | |
| 530 output_list = [] | |
| 531 maximum = max (forORrev_coverage.values()) or 1 | |
| 532 for n in sorted (forORrev_coverage): | |
| 533 output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage)) | |
| 534 return "\n".join(output_list) | |
| 535 | |
| 536 | |
| 537 def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None): | |
| 538 ''' migration to memory saving by specifying possible subcoordinates | |
| 539 see the readcount method for further discussion | |
| 540 scope must be a python iterable; scope define the *relative* offset range to be computed''' | |
| 541 upstream_coord = upstream_coord or self.windowoffset | |
| 542 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 543 query_range = range (minquery, maxquery+1) | |
| 544 target_range = range (mintarget, maxtarget+1) | |
| 545 Query_table = {} | |
| 546 Target_table = {} | |
| 547 frequency_table = dict ([(i, 0) for i in scope]) | |
| 548 for offset in self.readDict: | |
| 549 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
| 550 for size in self.readDict[offset]: | |
| 551 if size in query_range: | |
| 552 Query_table[offset] = Query_table.get(offset, 0) + 1 | |
| 553 if size in target_range: | |
| 554 Target_table[offset] = Target_table.get(offset, 0) + 1 | |
| 555 for offset in Query_table: | |
| 556 for i in scope: | |
| 557 frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0)) | |
| 558 if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py | |
| 559 frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table]) | |
| 560 if zscore == "yes": | |
| 561 z_mean = mean(frequency_table.values() ) | |
| 562 z_std = std(frequency_table.values() ) | |
| 563 if z_std == 0: | |
| 564 frequency_table = dict([(i,0) for i in frequency_table] ) | |
| 565 else: | |
| 566 frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] ) | |
| 567 return frequency_table | |
| 568 | |
| 569 def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None): | |
| 570 ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion | |
| 571 note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed''' | |
| 572 upstream_coord = upstream_coord or self.windowoffset | |
| 573 downstream_coord = downstream_coord or self.windowoffset+self.size-1 | |
| 574 query_range = range (minquery, maxquery+1) | |
| 575 target_range = range (mintarget, maxtarget+1) | |
| 576 Query_table = {} | |
| 577 Target_table = {} | |
| 578 Total_Query_Numb = 0 | |
| 579 general_frequency_table = dict ([(i,0) for i in scope]) | |
| 580 ## filtering the appropriate reads for the study | |
| 581 for offset in self.readDict: | |
| 582 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue | |
| 583 for size in self.readDict[offset]: | |
| 584 if size in query_range: | |
| 585 Query_table[offset] = Query_table.get(offset, 0) + 1 | |
| 586 Total_Query_Numb += 1 | |
| 587 if size in target_range: | |
| 588 Target_table[offset] = Target_table.get(offset, 0) + 1 | |
| 589 for offset in Query_table: | |
| 590 frequency_table = dict ([(i,0) for i in scope]) | |
| 591 number_of_targets = 0 | |
| 592 for i in scope: | |
| 593 frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0) | |
| 594 number_of_targets += Target_table.get(-offset -i +1, 0) | |
| 595 for i in scope: | |
| 596 try: | |
| 597 general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i] | |
| 598 except ZeroDivisionError : | |
| 599 continue | |
| 600 return general_frequency_table | |
| 601 | |
def phasing (self, size_range, scope):
    '''Autocorrelation-like phasing signal of the reads whose size falls in size_range.

    scope must be a python iterable of relative distances; returns a dict
    mapping each distance in scope to its averaged phasing frequency.
    '''
    counts = {}        # offset -> number of reads of the selected sizes
    grand_total = 0
    summed = dict((d, 0) for d in scope)
    ## keep only the reads whose size is in the requested range
    for pos, sizes in self.readDict.items():
        for length in sizes:
            if length in size_range:
                counts[pos] = counts.get(pos, 0) + 1
                grand_total += 1
    ## accumulate, per anchor read, the distance profile to its phased partners
    for pos, weight in counts.items():
        # distances are walked away from the read 5' end: rightwards for
        # plus-strand offsets (pos > 0), leftwards otherwise
        step = 1 if pos > 0 else -1
        local = {}
        partners = 0
        for d in scope:
            hits = counts.get(pos + step * d, 0)
            local[d] = weight * hits
            partners += hits
        if partners == 0:
            continue  # no partner at any distance: this anchor contributes nothing
        ## fold this anchor's profile into the global average
        norm = 1. / partners / grand_total
        for d in scope:
            summed[d] += norm * local[d]
    return summed
| 631 | |
| 632 | |
| 633 | |
def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
    '''Z-score transform of the overlap signature over the offsets in scope.

    Requires numpy mean/std (imported at module level); scope must be a
    python iterable defining the relative offsets to compute.
    '''
    freqs = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
    keys = sorted(freqs)
    values = [freqs[k] for k in keys]
    sigma = std(values)
    if not sigma:
        # flat signal: population std is zero, so every z-score is reported as 0
        return dict(zip(keys, [0] * len(values)))
    mu = mean(values)
    return dict(zip(keys, [(v - mu) / sigma for v in values]))
| 646 | |
def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
    '''Overlap signature expressed as a percentage of the total number of
    query plus target reads.

    Size bounds are inclusive, consistent with signature(): the previous
    implementation used exclusive upper bounds (range(min, max)) in the
    denominator while signature() selects reads with range(min, max+1),
    silently dropping reads of size maxquery/maxtarget from the total.
    Returns a dict mapping each offset in scope to a percentage
    (all zeros when no read of the relevant sizes exists).
    '''
    frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
    # union of both inclusive size ranges, each size counted once
    sizes = set(range(minquery, maxquery + 1)) | set(range(mintarget, maxtarget + 1))
    size_counts = self.readsizes()
    total = float(sum(size_counts.get(i, 0) for i in sizes))
    if total == 0:
        return dict((i, 0) for i in scope)
    return dict((i, frequency_table[i] / total * 100) for i in scope)
| 653 | |
def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
    '''Return the sequences of query/target read pairs overlapping by `overlap` nt.

    Query and target reads are selected by their inclusive size ranges; each
    matched pair contributes one "+"-prefixed and one "-"-prefixed RNA string
    (query first), with as many pairs as the smaller of the two read stacks.
    '''
    queryhash = defaultdict(list)   # offset -> sizes of candidate query reads
    targethash = defaultdict(list)  # offset -> sizes of candidate target reads
    lo_q, hi_q = int(minquery), int(maxquery)
    lo_t, hi_t = int(mintarget), int(maxtarget)
    paired_sequences = []
    ## split the reads of this window into query and target candidates
    for offset, sizes in self.readDict.items():
        for size in sizes:
            if lo_q <= size <= hi_q:
                queryhash[offset].append(size)
            if lo_t <= size <= hi_t:
                targethash[offset].append(size)
    for offset in queryhash:
        # signed offset of a read on the other strand overlapping by
        # `overlap` nt; the formula is the same for both strands
        matched_offset = -offset - overlap + 1
        if not targethash[matched_offset]:
            continue
        paired = min(len(queryhash[offset]), len(targethash[matched_offset]))
        if offset >= 0:
            for i in range(paired):
                paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
                paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
        else:
            for i in range(paired):
                paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
                paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
    return paired_sequences
| 680 | |
def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
    '''Return ALL query and target read sequences at offsets where at least
    one query/target pair overlaps by `overlap` nt.

    Unlike pairer(), reads are not matched one-to-one: every query read at a
    qualifying offset and every target read at the matched offset is emitted
    ("+" prefix for plus strand, "-" for minus strand).
    '''
    queryhash = defaultdict(list)   # offset -> sizes of candidate query reads
    targethash = defaultdict(list)  # offset -> sizes of candidate target reads
    lo_q, hi_q = int(minquery), int(maxquery)
    lo_t, hi_t = int(mintarget), int(maxtarget)
    paired_sequences = []

    ## split the reads of this window into query and target candidates
    for offset, sizes in self.readDict.items():
        for size in sizes:
            if lo_q <= size <= hi_q:
                queryhash[offset].append(size)
            if lo_t <= size <= hi_t:
                targethash[offset].append(size)

    for offset in queryhash:
        # signed offset of an overlapping read on the other strand
        matched_offset = -offset - overlap + 1
        if not targethash[matched_offset]:
            continue
        if offset >= 0:
            for size in queryhash[offset]:
                paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+size]) )
            for size in targethash[matched_offset]:
                paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-size+1:-matched_offset+1]) ) )
        else:
            for size in queryhash[offset]:
                paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-size+1:-offset+1]) ) )
            for size in targethash[matched_offset]:
                paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+size] ) )
    return paired_sequences
| 709 | |
def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
    '''Return pseudo bowtie-output lines for every query read that has at
    least one target read overlapping by `overlap` nt.

    Revision of pairable (3-12-2012) focused on the offset shift problem:
    bowtie uses 1-based coordinates whereas python string slices are 0-based.
    Each line is "counter<TAB>strand<TAB>gene<TAB>0-based offset<TAB>sequence".
    '''
    queryhash = defaultdict(list)   # offset -> sizes of candidate query reads
    targethash = defaultdict(list)  # offset -> sizes of candidate target reads
    lo_q, hi_q = int(minquery), int(maxquery)
    lo_t, hi_t = int(mintarget), int(maxtarget)
    bowtie_output = []

    ## split the reads of this window into query and target candidates
    for offset, sizes in self.readDict.items():
        for size in sizes:
            if lo_q <= size <= hi_q:
                queryhash[offset].append(size)
            if lo_t <= size <= hi_t:
                targethash[offset].append(size)

    counter = 0
    for offset in queryhash:
        # signed offset of an overlapping read on the other strand
        matched_offset = -offset - overlap + 1
        if not targethash[matched_offset]:
            continue
        for size in queryhash[offset]:
            counter += 1
            if offset >= 0:
                # shift the 1-based bowtie offset down to a 0-based slice start
                bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+size]) )
            else:
                bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-size, self.sequence[-offset-size:-offset]))
    return bowtie_output
| 737 | |
| 738 | |
def __main__(bowtie_index_path, bowtie_output_path):
    '''Build one SmRNAwindow per reference gene from a bowtie alignment file
    (sequences pulled from the bowtie index via get_fasta) and print, for each
    gene, the pairs of 19-23 nt reads overlapping by 19 nt.

    bowtie_output_path must be a standard bowtie output file whose columns
    are: read name, polarity, gene, 1-based offset, read sequence.
    '''
    sequenceDic = get_fasta(bowtie_index_path)
    objDic = {}
    # `with` guarantees the input handle is closed even if a line is malformed
    with open(bowtie_output_path, "r") as F:  # F is the bowtie output taken as input
        for line in F:
            fields = line.split()
            polarity = fields[1]
            gene = fields[2]
            offset = int(fields[3])
            size = len(fields[4])
            try:
                objDic[gene].addread(polarity, offset, size)
            except KeyError:
                # first read seen for this gene: create its window, then add
                objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
                objDic[gene].addread(polarity, offset, size)
    for gene in objDic:
        # %-formatting keeps the output identical under python2's print
        # statement and python3's print function (was: `print gene, pairs`)
        print("%s %s" % (gene, objDic[gene].pairer(19, 19, 23, 19, 23)))
| 757 | |
# command-line entry point: argv[1] = bowtie index path, argv[2] = bowtie output file
if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
