#!/usr/bin/python
# version 1 7-5-2012 unification of the SmRNAwindow class

import sys, subprocess
import random
from collections import defaultdict
from numpy import mean, median, std
from scipy import stats
def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
  '''Return a dictionary containing fasta identifiers as keys and the
  sequences as values. index must be the path to a bowtie index.'''
  p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # with "-a 0", bowtie-inspect outputs each sequence on a single line
  outputlines = p.stdout.readlines()
  p.wait()
  item_dic = {}
  for line in outputlines:
    if (line[0] == ">"):
      try:
        item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
      except NameError: pass # current_item is not yet defined when the first header is read
      current_item = line[1:].rstrip().split()[0] # take the first word before any space because bowtie splits headers !
      item_dic[current_item] = ""
      stringlist=[]
    else:
      stringlist.append(line.rstrip() )
  item_dic[current_item] = "".join(stringlist) # for the last item
  return item_dic
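
# A minimal usage sketch for get_fasta (hedged: the index prefix below is hypothetical
# and bowtie-inspect must be available on the PATH; "2L" is just an example reference name):
#   genome = get_fasta("/path/to/bowtie/index/prefix")
#   genome["2L"]       # full sequence of the reference named "2L", as one string
#   len(genome["2L"])  # its length in nucleotides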

def get_fasta_headers (index):
  '''Return a dictionary whose keys are the reference names of a bowtie index.'''
  p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # with "-n", bowtie-inspect outputs only the reference names, one per line
  outputlines = p.stdout.readlines()
  p.wait()
  item_dic = {}
  for line in outputlines:
    header = line.rstrip().split()[0] # take the first word before any space because bowtie splits headers !
    item_dic[header] = 1
  return item_dic


def get_file_sample (file, numberoflines):
  '''Return a random sample of numberoflines lines from file (uses the random module imported above).'''
  F=open(file)
  fullfile = F.read().splitlines()
  F.close()
  if len(fullfile) < numberoflines:
    return "sample size exceeds file size"
  return random.sample(fullfile, numberoflines)

def get_fasta_from_history (file):
  '''Return a dictionary of fasta identifiers to sequences, parsed from a fasta file (e.g. a Galaxy history dataset).'''
  F = open (file, "r")
  item_dic = {}
  for line in F:
    if (line[0] == ">"):
      try:
        item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item
      except NameError: pass # current_item is not yet defined when the first header is read
      current_item = line[1:-1].split()[0] # take the first word before any space because bowtie splits headers !
      item_dic[current_item] = ""
      stringlist=[]
    else:
      stringlist.append(line[:-1])
  item_dic[current_item] = "".join(stringlist) # for the last item
  F.close()
  return item_dic

def antipara (sequence):
    '''Return the reverse complement of a DNA sequence.'''
    antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
    revseq = sequence[::-1]
    return "".join([antidict[i] for i in revseq])

def RNAtranslate (sequence):
    '''Convert a DNA sequence to RNA (T replaced by U).'''
    return "".join([i if i in "AGCN" else "U" for i in sequence])

def DNAtranslate (sequence):
    '''Convert an RNA sequence to DNA (U replaced by T).'''
    return "".join([i if i in "AGCN" else "T" for i in sequence])
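
# Quick illustration of the helpers above (a sketch; the values follow directly from the code):
#   antipara("ATGC")      -> "GCAT"   (reverse complement)
#   RNAtranslate("ATGC")  -> "AUGC"   (T replaced by U)
#   DNAtranslate("AUGC")  -> "ATGC"   (U replaced by T)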

def RNAfold (sequence_list):
  '''Fold each sequence with the Vienna RNAfold program and return a {DNA sequence: free energy string} dictionary.'''
  thestring = "\n".join(sequence_list)
  p = subprocess.Popen(args=["RNAfold","--noPS"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  output = p.communicate(thestring)[0]
  p.wait()
  output = output.split("\n")
  if not output[-1]: output = output[:-1] # patch to remove the last empty line
  buffer = []
  for line in output:
    if line[0] in ["N","A","T","U","G","C"]: # sequence line echoed by RNAfold
      buffer.append(DNAtranslate(line))
    if line[0] in ["(",".",")"]: # structure line, ending with the free energy in parentheses
      fields = line.split("(")
      energy = fields[-1]
      energy = energy[:-1] # remove the closing parenthesis
      energy = float(energy)
      buffer.append(str(energy))
  return dict(zip(buffer[::2], buffer[1::2]))
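
# Hedged example of what RNAfold() yields (requires the Vienna RNAfold executable on
# the PATH; the sequence and the exact energy value below are illustrative only):
#   RNAfold(["GCGCAAAAGCGC"])  ->  {"GCGCAAAAGCGC": "-1.2"}   # energy kept as a string
# The foldEnergy method of SmRNAwindow below converts that string back to a float.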

def extractsubinstance (start, end, instance):
  '''Testing whether this can be a function external to the class to save memory'''
  subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
  subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
  upcoordinate = [i for i in range(start,end+1) if instance.readDict.has_key(i) ]
  downcoordinate = [-i for i in range(start,end+1) if instance.readDict.has_key(-i) ]
  for i in upcoordinate:
    subinstance.readDict[i]=instance.readDict[i]
  for i in downcoordinate:
    subinstance.readDict[i]=instance.readDict[i]
  return subinstance

class HandleSmRNAwindows:
  def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
    self.biosample = biosample
    self.alignmentFile = alignmentFile
    self.alignmentFileFormat = alignmentFileFormat # can be "tabular", "sam" or "bam"
    self.genomeRefFile = genomeRefFile
    self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
    self.alignedReads = 0
    self.instanceDict = {}
    self.size_inf=size_inf
    self.size_sup=size_sup
    self.norm=norm
    if genomeRefFormat == "bowtieIndex":
      self.itemDict = get_fasta (genomeRefFile)
    elif genomeRefFormat == "fastaSource":
      self.itemDict = get_fasta_from_history (genomeRefFile)
    for item in self.itemDict:
      self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create as many instances as there are items
    self.readfile()

  def readfile (self):
    if self.alignmentFileFormat == "tabular":
      F = open (self.alignmentFile, "r")
      for line in F:
        fields = line.split()
        polarity = fields[1]
        gene = fields[2]
        offset = int(fields[3])
        size = len (fields[4])
        if self.size_inf:
          if (size>=self.size_inf and size<= self.size_sup):
            self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
            self.alignedReads += 1
        else:
          self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
          self.alignedReads += 1
      F.close()
      return self.instanceDict
#    elif self.alignmentFileFormat == "sam":
#      F = open (self.alignmentFile, "r")
#      dict = {"0":"+", "16":"-"}
#      for line in F:
#        if line[0]=='@':
#            continue
#        fields = line.split()
#        if fields[2] == "*": continue
#        polarity = dict[fields[1]]
#        gene = fields[2]
#        offset = int(fields[3])
#        size = len (fields[9])
#        if self.size_inf:
#          if (size>=self.size_inf and size<= self.size_sup):
#            self.instanceDict[gene].addread (polarity, offset, size)
#            self.alignedReads += 1
#        else:
#          self.instanceDict[gene].addread (polarity, offset, size)
#          self.alignedReads += 1
#      F.close()
    elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
      import pysam
      samfile = pysam.Samfile(self.alignmentFile)
      for read in samfile:
        if read.tid == -1:
          continue # filter out unaligned reads
        if read.is_reverse:
          polarity="-"
        else:
          polarity="+"
        gene = samfile.getrname(read.tid)
        offset = read.pos
        size = read.qlen
        if self.size_inf:
          if (size>=self.size_inf and size<= self.size_sup):
            self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
            self.alignedReads += 1
        else:
          self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
          self.alignedReads += 1
      return self.instanceDict

#  def size_histogram (self):
#    size_dict={}
#    size_dict['F']= defaultdict (int)
#    size_dict['R']= defaultdict (int)
#    size_dict['both'] = defaultdict (int)
#    for item in self.instanceDict:
#      buffer_dict_F = self.instanceDict[item].size_histogram()['F']
#      buffer_dict_R = self.instanceDict[item].size_histogram()['R']
#      for size in buffer_dict_F:
#        size_dict['F'][size] += buffer_dict_F[size]
#      for size in buffer_dict_R:
#        size_dict['R'][size] -= buffer_dict_R[size]
#    allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
#    for size in allSizeKeys:
#      size_dict['both'][size] = size_dict['F'][size] + size_dict['R'][size]
#    return size_dict
  def size_histogram (self): # in HandleSmRNAwindows
    '''refactored on 7-9-2014 to debug the size_histogram tool'''
    size_dict={}
    size_dict['F']= defaultdict (float)
    size_dict['R']= defaultdict (float)
    size_dict['both'] = defaultdict (float)
    for item in self.instanceDict:
      buffer_dict = self.instanceDict[item].size_histogram()
      for polarity in ["F", "R"]:
        for size in buffer_dict[polarity]:
          size_dict[polarity][size] += buffer_dict[polarity][size]
      for size in buffer_dict["both"]:
        size_dict["both"][size] += buffer_dict["F"][size] - buffer_dict["R"][size]
    return size_dict

  def CountFeatures (self, GFF3="path/to/file"):
    featureDict = defaultdict(int)
    F = open (GFF3, "r")
    for line in F:
      if line[0] == "#": continue
      fields = line[:-1].split()
      chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
      featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
    F.close()
    return featureDict
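
  # A minimal sketch of how HandleSmRNAwindows is typically driven (file paths are
  # hypothetical; a bowtie tabular alignment and the matching bowtie index are assumed):
  #   handler = HandleSmRNAwindows(alignmentFile="aligned_reads.tabular",
  #                                alignmentFileFormat="tabular",
  #                                genomeRefFile="/path/to/bowtie/index/prefix",
  #                                genomeRefFormat="bowtieIndex",
  #                                biosample="sample_1")
  #   handler.alignedReads              # number of reads loaded
  #   handler.instanceDict["2L"]        # the SmRNAwindow instance for reference "2L"
  #   handler.size_histogram()["both"]  # global size distribution, both strands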

class SmRNAwindow:

  def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
    self.biosample = biosample
    self.sequence = sequence
    self.gene = gene
    self.windowoffset = windowoffset
    self.size = len(sequence)
    self.readDict = defaultdict(list) # {+/-offset: [size1, size2, ...], ...} maps signed read offsets to lists of read sizes
    self.matchedreadsUp = 0
    self.matchedreadsDown = 0
    self.norm=norm

  def addread (self, polarity, offset, size):
    '''ATTENTION: the conversion from 0-based to 1-based offsets was removed here, as it is now done during read parsing.'''
    if polarity == "+":
      self.readDict[offset].append(size)
      self.matchedreadsUp += 1
    else:
      self.readDict[-(offset + size -1)].append(size)
      self.matchedreadsDown += 1
    return
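
  # Illustration of the readDict coordinate convention used by addread (the values
  # follow directly from the code above):
  #   addread("+", 100, 21)  -> appends 21 to self.readDict[100]   (key = 5' end of the forward read)
  #   addread("-", 100, 21)  -> appends 21 to self.readDict[-120]  (key = -(100+21-1), the 5' end of the reverse read)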

  def barycenter (self, upstream_coord=None, downstream_coord=None):
    '''refactored 24-12-2013 to save memory and introduce offset filtering; see the readcount method for further discussion.
    In this version, the dictionary structure is replaced by a list of tuples to save memory too'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    window_size = downstream_coord - upstream_coord +1
    def weightAverage (TupleList):
      weightSum = 0
      PonderWeightSum = 0
      for pair in TupleList:
        PonderWeightSum += pair[0] * pair[1]
        weightSum += pair[1]
      if weightSum > 0:
        return PonderWeightSum / float(weightSum)
      else:
        return 0
    forwardTupleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # forward reads within the offset window
    reverseTupleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # reverse reads within the offset window
    Fbarycenter = (weightAverage (forwardTupleList) - upstream_coord) / window_size
    Rbarycenter = (weightAverage (reverseTupleList) - upstream_coord) / window_size
    return Fbarycenter, Rbarycenter

  def correlation_mapper (self, reference, window_size):
    '''to map correlation with a sliding window 26-2-2013'''
    if window_size > self.size:
      return []
    F=open(reference, "r")
    reference_forward = []
    reference_reverse = []
    for line in F:
      fields=line.split()
      reference_forward.append(int(float(fields[1])))
      reference_reverse.append(int(float(fields[2])))
    F.close()
    local_object_forward=[]
    local_object_reverse=[]
    ## Dict to list for the local object
    for i in range(1, self.size+1):
      local_object_forward.append(len(self.readDict[i]))
      local_object_reverse.append(len(self.readDict[-i]))
    ## start compiling results by sliding windows
    results=[]
    for coordinate in range(self.size - window_size):
      local_forward=local_object_forward[coordinate:coordinate + window_size]
      local_reverse=local_object_reverse[coordinate:coordinate + window_size]
      if sum(local_forward) == 0 or sum(local_reverse) == 0:
        continue
      try:
        reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
        reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
        if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
          results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
      except:
        pass
    return results

  def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
    '''refactored 24-12-2013 to save memory and introduce offset filtering
    note that the default parameters cannot be defined relative to the instance, as they are evaluated before instantiation;
    the trick is to pass None and then test
    the polarity parameter can take "both", "forward" or "reverse" as value'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
      return self.matchedreadsUp + self.matchedreadsDown
    if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
      return self.matchedreadsUp
    if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
      return self.matchedreadsDown
    n=0
    if polarity == "both":
      for offset in xrange(upstream_coord, downstream_coord+1):
        if self.readDict.has_key(offset):
          for read in self.readDict[offset]:
            if (read>=size_inf and read<= size_sup):
              n += 1
          if method != "conservative":
            del self.readDict[offset] ## Careful ! this precludes re-use of the self.readDict dictionary (destructive counting)
        if self.readDict.has_key(-offset):
          for read in self.readDict[-offset]:
            if (read>=size_inf and read<= size_sup):
              n += 1
          if method != "conservative":
            del self.readDict[-offset]
      return n
    elif polarity == "forward":
      for offset in xrange(upstream_coord, downstream_coord+1):
        if self.readDict.has_key(offset):
          for read in self.readDict[offset]:
            if (read>=size_inf and read<= size_sup):
              n += 1
      return n
    elif polarity == "reverse":
      for offset in xrange(upstream_coord, downstream_coord+1):
        if self.readDict.has_key(-offset):
          for read in self.readDict[-offset]:
            if (read>=size_inf and read<= size_sup):
              n += 1
      return n

  def readsizes (self):
    '''return a dictionary of read counts keyed by read size'''
    dicsize = {}
    for offset in self.readDict:
      for size in self.readDict[offset]:
        dicsize[size] = dicsize.get(size, 0) + 1
    for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
      dicsize[size] = dicsize.get(size, 0) # fill missing sizes with null values
    return dicsize

#  def size_histogram(self):
#    norm=self.norm
#    hist_dict={}
#    hist_dict['F']={}
#    hist_dict['R']={}
#    for offset in self.readDict:
#      for size in self.readDict[offset]:
#        if offset < 0:
#          hist_dict['R'][size] = hist_dict['R'].get(size, 0) - 1*norm
#        else:
#          hist_dict['F'][size] = hist_dict['F'].get(size, 0) + 1*norm
#    ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
#    if not (hist_dict['F']) and (not hist_dict['R']):
#      hist_dict['F'][21] = 0
#      hist_dict['R'][21] = 0
#    ##
#    return hist_dict

  def size_histogram(self, minquery=None, maxquery=None): # in SmRNAwindow
    '''refactored on 7-9-2014 to debug the size_histogram tool'''
    norm=self.norm
    size_dict={}
    size_dict['F']= defaultdict (float)
    size_dict['R']= defaultdict (float)
    size_dict['both']= defaultdict (float)
    for offset in self.readDict:
      for size in self.readDict[offset]:
        if offset < 0:
          size_dict['R'][size] = size_dict['R'][size] - 1*norm
        else:
          size_dict['F'][size] = size_dict['F'][size] + 1*norm
    ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
    if not (size_dict['F']) and (not size_dict['R']):
      size_dict['F'][21] = 0
      size_dict['R'][21] = 0
    ##
    allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
    for size in allSizeKeys:
      size_dict['both'][size] = size_dict['F'][size] - size_dict['R'][size]
    if minquery:
      for polarity in size_dict.keys():
        for size in xrange(minquery, maxquery+1):
          if not size in size_dict[polarity].keys():
            size_dict[polarity][size]=0
    return size_dict
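
  # Sketch of the structure returned by size_histogram: with norm=1.0, a window holding
  # two 21-nt forward reads and one 22-nt reverse read yields
  #   result['F'][21] == 2.0 ; result['R'][22] == -1.0 ; result['both'] == {21: 2.0, 22: 1.0}
  # Reverse counts are stored as negative values; 'both' is F minus R, i.e. the total.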

  def statsizes (self, upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates;
    see the readcount method for further discussion'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    L = []
    for offset in self.readDict:
      if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
      for size in self.readDict[offset]:
        L.append(size)
    meansize = mean(L)
    stdv = std(L)
    mediansize = median(L)
    return meansize, mediansize, stdv

  def foldEnergy (self, upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates;
    see the readcount method for further discussion'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
    return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])

  def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates;
    see the readcount method for further discussion. size_scope must be an iterable'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
    convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
    for offset in self.readDict:
      if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
      for size in self.readDict[offset]:
        if size in size_scope:
          startbase = self.sequence[abs(offset)-self.windowoffset]
          if offset < 0:
            startbase = convertDic[startbase]
          freqDic[startbase] += 1
    base_sum = float ( sum( freqDic.values()) )
    if base_sum == 0:
      return "."
    else:
      return freqDic["T"] / base_sum * 100

  def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates;
    see the readcount method for further discussion. size_scope must be an iterable.
    This method is similar to the Ufreq method but takes strandedness into account'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
    convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
    for offset in self.readDict:
      if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
      for size in self.readDict[offset]:
        if size in size_scope:
          startbase = self.sequence[abs(offset)-self.windowoffset]
          if offset < 0:
            startbase = convertDic[startbase]
            freqDic[startbase+"rev"] += 1
          else:
            freqDic[startbase+"for"] += 1
    forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
    reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
    if forward_sum == 0 and reverse_sum == 0:
      return ". | ."
    elif reverse_sum == 0:
      return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
    elif forward_sum == 0:
      return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
    else:
      return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)


  def readplot (self):
    norm=self.norm
    readmap = {}
    for offset in self.readDict.keys():
      readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm )
    mylist = []
    for offset in sorted(readmap):
      if readmap[offset][1] != 0:
        mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
      if readmap[offset][0] != 0:
        mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
    ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
    if not mylist:
      mylist.append("%s\t%s\t%s\t%s" % (self.gene, 1, 0, "F") )
    ###
    return mylist

  def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
    '''Used by the MirParser tool'''
    upstream_coord = upstream_coord or 1
    downstream_coord = downstream_coord or self.size
    windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
    forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)])
    totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
    totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
    if totalforward > totalreverse:
      majorcoverage = "forward"
      for offset in self.readDict.keys():
        if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
          for read in self.readDict[offset]:
            for i in xrange(read):
              try:
                forORrev_coverage[offset-upstream_coord+1+i] += 1
              except KeyError:
                continue # a sense read may span over the downstream limit
    else:
      majorcoverage = "reverse"
      for offset in self.readDict.keys():
        if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
          for read in self.readDict[offset]:
            for i in xrange(read):
              try:
                forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
              except KeyError:
                continue # an antisense read may span over the upstream limit
    output_list = []
    maximum = max (forORrev_coverage.values()) or 1
    for n in sorted (forORrev_coverage):
      output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
    return "\n".join(output_list)
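
  # The string returned by readcoverage is a multi-line, tab-separated table; each line
  # contains: biosample, windowName, position, relative position, coverage depth,
  # coverage normalised to the window maximum, and the dominant strand ("forward" or "reverse").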


  def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates;
    see the readcount method for further discussion.
    scope must be a python iterable; it defines the *relative* offset range to be computed'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    query_range = range (minquery, maxquery+1)
    target_range = range (mintarget, maxtarget+1)
    Query_table = {}
    Target_table = {}
    frequency_table = dict ([(i, 0) for i in scope])
    for offset in self.readDict:
      if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
      for size in self.readDict[offset]:
        if size in query_range:
          Query_table[offset] = Query_table.get(offset, 0) + 1
        if size in target_range:
          Target_table[offset] = Target_table.get(offset, 0) + 1
    for offset in Query_table:
      for i in scope:
        frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
    if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
      frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
    if zscore == "yes":
      z_mean = mean(frequency_table.values() )
      z_std = std(frequency_table.values() )
      if z_std == 0:
        frequency_table = dict([(i,0) for i in frequency_table] )
      else:
        frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
    return frequency_table
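
  # Hedged usage sketch for signature(): a typical ping-pong analysis would pair
  # 23-29 nt reads against 23-29 nt reads over 1-26 nt 5' overlaps (these size and
  # overlap ranges are illustrative choices, not prescribed by this code):
  #   window.signature(23, 29, 23, 29, range(1, 27), zscore="no")
  #   -> {1: ..., 2: ..., ..., 26: ...}  # pair counts per overlap; a peak at 10 suggests ping-pong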

  def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
    '''memory-saving variant accepting optional sub-coordinates; see the readcount method for further discussion.
    note that scope must be an iterable (a list or a tuple) which specifies the relative offsets that will be computed'''
    upstream_coord = upstream_coord or self.windowoffset
    downstream_coord = downstream_coord or self.windowoffset+self.size-1
    query_range = range (minquery, maxquery+1)
    target_range = range (mintarget, maxtarget+1)
    Query_table = {}
    Target_table = {}
    Total_Query_Numb = 0
    general_frequency_table = dict ([(i,0) for i in scope])
    ## filtering the appropriate reads for the study
    for offset in self.readDict:
      if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
      for size in self.readDict[offset]:
        if size in query_range:
          Query_table[offset] = Query_table.get(offset, 0) + 1
          Total_Query_Numb += 1
        if size in target_range:
          Target_table[offset] = Target_table.get(offset, 0) + 1
    for offset in Query_table:
      frequency_table = dict ([(i,0) for i in scope])
      number_of_targets = 0
      for i in scope:
        frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
        number_of_targets += Target_table.get(-offset -i +1, 0)
      for i in scope:
        try:
          general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
        except ZeroDivisionError :
          continue
    return general_frequency_table

  def phasing (self, size_range, scope):
    '''calculate an autocorrelation-like signal - scope must be a python iterable'''
    read_table = {}
    total_read_number = 0
    general_frequency_table = dict ([(i, 0) for i in scope])
    ## read input filtering
    for offset in self.readDict:
      for size in self.readDict[offset]:
        if size in size_range:
          read_table[offset] = read_table.get(offset, 0) + 1
          total_read_number += 1
    ## per-offset read phasing computation
    for offset in read_table:
      frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
      number_of_targets = 0
      for i in scope:
        if offset > 0:
          frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
          number_of_targets += read_table.get(offset + i, 0)
        else:
          frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
          number_of_targets += read_table.get(offset - i, 0)
      ## inclusion of the local frequency table in the general frequency table (averaged over all offsets)
      for i in scope:
        try:
          general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
        except ZeroDivisionError :
          continue
    return general_frequency_table



  def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
    '''z-score normalised signature; relies on numpy mean and std (imported at module level). scope must be a python iterable and defines the relative offsets to compute'''
    frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
    z_table = {}
    frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
    if std(frequency_list):
      meanlist = mean(frequency_list)
      stdlist = std(frequency_list)
      z_list = [(i-meanlist)/stdlist for i in frequency_list]
      return dict (zip (sorted(frequency_table), z_list) )
    else:
      return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )

  def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
    frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
    total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) ) # note: these range upper bounds are exclusive
    if total == 0:
      return dict( [(i,0) for i in scope])
    return dict( [(i, frequency_table[i]/total*100) for i in scope])

  def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
    queryhash = defaultdict(list)
    targethash = defaultdict(list)
    query_range = range (int(minquery), int(maxquery)+1)
    target_range = range (int(mintarget), int(maxtarget)+1)
    paired_sequences = []
    for offset in self.readDict: # selection of data
      for size in self.readDict[offset]:
        if size in query_range:
          queryhash[offset].append(size)
        if size in target_range:
          targethash[offset].append(size)
    for offset in queryhash:
      matched_offset = -offset - overlap + 1 # same formula whatever the sign of offset
      if targethash[matched_offset]:
        paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
        if offset >= 0:
          for i in range (paired):
            paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
            paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
        if offset < 0:
          for i in range (paired):
            paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
            paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
    return paired_sequences

  def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
    queryhash = defaultdict(list)
    targethash = defaultdict(list)
    query_range = range (int(minquery), int(maxquery)+1)
    target_range = range (int(mintarget), int(maxtarget)+1)
    paired_sequences = []

    for offset in self.readDict: # selection of data
      for size in self.readDict[offset]:
        if size in query_range:
          queryhash[offset].append(size)
        if size in target_range:
          targethash[offset].append(size)

    for offset in queryhash:
      matched_offset = -offset - overlap + 1
      if targethash[matched_offset]:
        if offset >= 0:
          for i in queryhash[offset]:
            paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
          for i in targethash[matched_offset]:
            paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
        if offset < 0:
          for i in queryhash[offset]:
            paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
          for i in targethash[matched_offset]:
            paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
    return paired_sequences

  def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
    '''revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie uses 1-based coordinates whereas python strings are 0-based)'''
    queryhash = defaultdict(list)
    targethash = defaultdict(list)
    query_range = range (int(minquery), int(maxquery)+1)
    target_range = range (int(mintarget), int(maxtarget)+1)
    bowtie_output = []

    for offset in self.readDict: # selection of data
      for size in self.readDict[offset]:
        if size in query_range:
          queryhash[offset].append(size)
        if size in target_range:
          targethash[offset].append(size)
    counter = 0
    for offset in queryhash:
      matched_offset = -offset - overlap + 1
      if targethash[matched_offset]:
        if offset >= 0:
          for i in queryhash[offset]:
            counter += 1
            bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # mind the 1-based/0-based offset shift
        if offset < 0:
          for i in queryhash[offset]:
            counter += 1
            bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # mind the 1-based/0-based offset shift
    return bowtie_output
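
  # Sketch of the bowtie-like records produced by newpairable_bowtie (the gene name,
  # offset and sequence below are illustrative only): each element is
  # "counter<TAB>strand<TAB>gene<TAB>0-based offset<TAB>sequence", e.g.
  #   "1\t+\tsome_gene\t99\tATGACCGT..."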


def __main__(bowtie_index_path, bowtie_output_path):
  sequenceDic = get_fasta (bowtie_index_path)
  objDic = {}
  F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
  for line in F:
    fields = line.split()
    polarity = fields[1]
    gene = fields[2]
    offset = int(fields[3])
    size = len (fields[4])
    try:
      objDic[gene].addread (polarity, offset, size)
    except KeyError:
      objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
      objDic[gene].addread (polarity, offset, size)
  F.close()
  for gene in objDic:
    print gene, objDic[gene].pairer(19,19,23,19,23)

if __name__ == "__main__": __main__(sys.argv[1], sys.argv[2])