| 
38
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 import random
 | 
| 
 | 
     5 import math
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 
 | 
| 
80
 | 
     8 __version_info__ = ('1', '0', '0')
 | 
| 
 | 
     9 __version__ = '.'.join(__version_info__)
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 
 | 
| 
38
 | 
    12 class Region:
 | 
| 
 | 
    13 	def __init__(self,start,stop,sequence):
 | 
| 
 | 
    14 		self.start = start
 | 
| 
 | 
    15 		self.stop = stop
 | 
| 
 | 
    16 		self.sequence = sequence.strip().replace("\n","").replace(" ","")
 | 
| 
 | 
    17 		if(len(self.sequence) != self.getSpanningLength()):
 | 
| 
 | 
    18 			print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
 | 
| 
 | 
    19 			import sys
 | 
| 
 | 
    20 			sys.exit()
 | 
| 
 | 
    21 	
 | 
| 
 | 
    22 	def getSpanningLength(self):
 | 
| 
 | 
    23 		return abs(self.stop-self.start+1)
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 class ReadSynthesizer:
 | 
| 
 | 
    26 	def __init__(self,chromosome):
 | 
| 
 | 
    27 		self.regions = []
 | 
| 
 | 
    28 		self.chromosome = chromosome
 | 
| 
 | 
    29 	
 | 
| 
 | 
    30 	def addRegion(self,region):
 | 
| 
 | 
    31 		self.regions.append(region)
 | 
| 
 | 
    32 	
 | 
| 
 | 
    33 	def produceReads(self,readDensity = 1,read_length = 50):
 | 
| 
 | 
    34 		"""
 | 
| 
 | 
    35 		Produces uniform reads by walking iteratively over self.regions
 | 
| 
 | 
    36 		"""
 | 
| 
 | 
    37 		
 | 
| 
 | 
    38 		mRNA = self.getTotalmRNA()
 | 
| 
 | 
    39 		spanning_length = self.getRegionSpanningLength()
 | 
| 
 | 
    40 		n = spanning_length['total'] - read_length + 1
 | 
| 
 | 
    41 		
 | 
| 
 | 
    42 		j = 0
 | 
| 
 | 
    43 		k = 0
 | 
| 
 | 
    44 		
 | 
| 
 | 
    45 		for i in range(n):
 | 
| 
 | 
    46 			#  "alpha is playing the role of k and beta is playing the role of theta"
 | 
| 
 | 
    47 			dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
 | 
| 
 | 
    48 			
 | 
| 
 | 
    49 			for d in range(dd):
 | 
| 
 | 
    50 				sequence = mRNA[i:i+read_length]
 | 
| 
 | 
    51 				
 | 
| 
 | 
    52 				if(random.randint(0,1) == 0):
 | 
| 
 | 
    53 					strand = 0
 | 
| 
 | 
    54 				else:
 | 
| 
 | 
    55 					strand = 16
 | 
| 
 | 
    56 				flag = strand + 0
 | 
| 
 | 
    57 				
 | 
| 
 | 
    58 				print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
 | 
| 
 | 
    59 			
 | 
| 
 | 
    60 			spanning_length['iter'][j] -= 1
 | 
| 
 | 
    61 			if(k >= self.regions[j].getSpanningLength()-1):
 | 
| 
 | 
    62 				j += 1
 | 
| 
 | 
    63 				k = 0
 | 
| 
 | 
    64 			else:
 | 
| 
 | 
    65 				k += 1
 | 
| 
 | 
    66 	
 | 
| 
 | 
    67 	def getMappingString(self,length,j,offset):
 | 
| 
 | 
    68 		m = 0
 | 
| 
 | 
    69 		
 | 
| 
 | 
    70 		out = ""
 | 
| 
 | 
    71 		
 | 
| 
 | 
    72 		for i in range(length):
 | 
| 
 | 
    73 			k = i + offset
 | 
| 
 | 
    74 			
 | 
| 
 | 
    75 			if(k >= self.regions[j].getSpanningLength()):
 | 
| 
 | 
    76 				j += 1
 | 
| 
 | 
    77 				
 | 
| 
 | 
    78 				out += str(m)+"M"
 | 
| 
 | 
    79 				out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
 | 
| 
 | 
    80 				m = 1
 | 
| 
 | 
    81 				
 | 
| 
 | 
    82 				offset = -k
 | 
| 
 | 
    83 			else:
 | 
| 
 | 
    84 				m += 1
 | 
| 
 | 
    85 		
 | 
| 
 | 
    86 		out += str(m) + "M"
 | 
| 
 | 
    87 		
 | 
| 
 | 
    88 		
 | 
| 
 | 
    89 		return out
 | 
| 
 | 
    90 	
 | 
| 
 | 
    91 	def getRegionSpanningLength(self):
 | 
| 
 | 
    92 		length = {'total':0,'iter':[]}
 | 
| 
 | 
    93 		for r in self.regions:
 | 
| 
 | 
    94 			l = r.getSpanningLength()
 | 
| 
 | 
    95 			length['iter'].append(l)
 | 
| 
 | 
    96 			length['total'] += l
 | 
| 
 | 
    97 		return length
 | 
| 
 | 
    98 	
 | 
| 
 | 
    99 	def getTotalmRNA(self):
 | 
| 
 | 
   100 		mRNA = ""
 | 
| 
 | 
   101 		for r in self.regions:
 | 
| 
 | 
   102 			mRNA += r.sequence
 | 
| 
 | 
   103 		return mRNA
 | 
| 
 | 
   104 
 | 
| 
80
 | 
   105 
 | 
| 
 | 
   106 
 | 
| 
38
 | 
   107 #rs = ReadSynthesizer('chr6')
 | 
| 
 | 
   108 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag'))
 | 
| 
 | 
   109 #rs.addRegion(Region(151,152,'at'))
 | 
| 
 | 
   110 #rs.produceReads(3,50)
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 
 | 
| 
80
 | 
   114 if __name__ == "__main__":
 | 
| 
 | 
   115 	rs = ReadSynthesizer('chr6')
 | 
| 
 | 
   116 	rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag'))
 | 
| 
 | 
   117 	rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag'))
 | 
| 
 | 
   118 	rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag'))
 | 
| 
 | 
   119 	rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc'))
 | 
| 
 | 
   120 	rs.produceReads(3,50)
 |