| 38 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 | 
|  | 4 import random | 
|  | 5 import math | 
|  | 6 | 
|  | 7 | 
|  | 8 class Region: | 
|  | 9 	def __init__(self,start,stop,sequence): | 
|  | 10 		self.start = start | 
|  | 11 		self.stop = stop | 
|  | 12 		self.sequence = sequence.strip().replace("\n","").replace(" ","") | 
|  | 13 		if(len(self.sequence) != self.getSpanningLength()): | 
|  | 14 			print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) | 
|  | 15 			import sys | 
|  | 16 			sys.exit() | 
|  | 17 | 
|  | 18 	def getSpanningLength(self): | 
|  | 19 		return abs(self.stop-self.start+1) | 
|  | 20 | 
|  | 21 class ReadSynthesizer: | 
|  | 22 	def __init__(self,chromosome): | 
|  | 23 		self.regions = [] | 
|  | 24 		self.chromosome = chromosome | 
|  | 25 | 
|  | 26 	def addRegion(self,region): | 
|  | 27 		self.regions.append(region) | 
|  | 28 | 
|  | 29 	def produceReads(self,readDensity = 1,read_length = 50): | 
|  | 30 		""" | 
|  | 31 		Produces uniform reads by walking iteratively over self.regions | 
|  | 32 		""" | 
|  | 33 | 
|  | 34 		mRNA = self.getTotalmRNA() | 
|  | 35 		spanning_length = self.getRegionSpanningLength() | 
|  | 36 		n = spanning_length['total'] - read_length + 1 | 
|  | 37 | 
|  | 38 		j = 0 | 
|  | 39 		k = 0 | 
|  | 40 | 
|  | 41 		for i in range(n): | 
|  | 42 			#  "alpha is playing the role of k and beta is playing the role of theta" | 
|  | 43 			dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! | 
|  | 44 | 
|  | 45 			for d in range(dd): | 
|  | 46 				sequence = mRNA[i:i+read_length] | 
|  | 47 | 
|  | 48 				if(random.randint(0,1) == 0): | 
|  | 49 					strand = 0 | 
|  | 50 				else: | 
|  | 51 					strand = 16 | 
|  | 52 				flag = strand + 0 | 
|  | 53 | 
|  | 54 				print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" | 
|  | 55 | 
|  | 56 			spanning_length['iter'][j] -= 1 | 
|  | 57 			if(k >= self.regions[j].getSpanningLength()-1): | 
|  | 58 				j += 1 | 
|  | 59 				k = 0 | 
|  | 60 			else: | 
|  | 61 				k += 1 | 
|  | 62 | 
|  | 63 	def getMappingString(self,length,j,offset): | 
|  | 64 		m = 0 | 
|  | 65 | 
|  | 66 		out = "" | 
|  | 67 | 
|  | 68 		for i in range(length): | 
|  | 69 			k = i + offset | 
|  | 70 | 
|  | 71 			if(k >= self.regions[j].getSpanningLength()): | 
|  | 72 				j += 1 | 
|  | 73 | 
|  | 74 				out += str(m)+"M" | 
|  | 75 				out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" | 
|  | 76 				m = 1 | 
|  | 77 | 
|  | 78 				offset = -k | 
|  | 79 			else: | 
|  | 80 				m += 1 | 
|  | 81 | 
|  | 82 		out += str(m) + "M" | 
|  | 83 | 
|  | 84 | 
|  | 85 		return out | 
|  | 86 | 
|  | 87 	def getRegionSpanningLength(self): | 
|  | 88 		length = {'total':0,'iter':[]} | 
|  | 89 		for r in self.regions: | 
|  | 90 			l = r.getSpanningLength() | 
|  | 91 			length['iter'].append(l) | 
|  | 92 			length['total'] += l | 
|  | 93 		return length | 
|  | 94 | 
|  | 95 	def getTotalmRNA(self): | 
|  | 96 		mRNA = "" | 
|  | 97 		for r in self.regions: | 
|  | 98 			mRNA += r.sequence | 
|  | 99 		return mRNA | 
|  | 100 | 
|  | 101 #rs = ReadSynthesizer('chr6') | 
|  | 102 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) | 
|  | 103 #rs.addRegion(Region(151,152,'at')) | 
|  | 104 #rs.produceReads(3,50) | 
|  | 105 | 
|  | 106 | 
|  | 107 rs = ReadSynthesizer('chr6') | 
|  | 108 rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) | 
|  | 109 rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) | 
|  | 110 rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) | 
|  | 111 rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) | 
|  | 112 rs.produceReads(3,50) | 
|  | 113 | 
|  | 114 | 
|  | 115 | 
|  | 116 | 
|  | 117 | 
|  | 118 |