view test-data/generate_reads.py @ 80:5272ffe31456 draft

Uploaded
author yhoogstrate
date Wed, 11 Jun 2014 05:25:06 -0400
parents 48c78adade03
children aaf98cdc5916
line wrap: on
line source

#!/usr/bin/env python


import random
import math


__version_info__ = ('1', '0', '0')
__version__ = '.'.join(__version_info__)


class Region:
	def __init__(self,start,stop,sequence):
		self.start = start
		self.stop = stop
		self.sequence = sequence.strip().replace("\n","").replace(" ","")
		if(len(self.sequence) != self.getSpanningLength()):
			print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
			import sys
			sys.exit()
	
	def getSpanningLength(self):
		return abs(self.stop-self.start+1)

class ReadSynthesizer:
	def __init__(self,chromosome):
		self.regions = []
		self.chromosome = chromosome
	
	def addRegion(self,region):
		self.regions.append(region)
	
	def produceReads(self,readDensity = 1,read_length = 50):
		"""
		Produces uniform reads by walking iteratively over self.regions
		"""
		
		mRNA = self.getTotalmRNA()
		spanning_length = self.getRegionSpanningLength()
		n = spanning_length['total'] - read_length + 1
		
		j = 0
		k = 0
		
		for i in range(n):
			#  "alpha is playing the role of k and beta is playing the role of theta"
			dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
			
			for d in range(dd):
				sequence = mRNA[i:i+read_length]
				
				if(random.randint(0,1) == 0):
					strand = 0
				else:
					strand = 16
				flag = strand + 0
				
				print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
			
			spanning_length['iter'][j] -= 1
			if(k >= self.regions[j].getSpanningLength()-1):
				j += 1
				k = 0
			else:
				k += 1
	
	def getMappingString(self,length,j,offset):
		m = 0
		
		out = ""
		
		for i in range(length):
			k = i + offset
			
			if(k >= self.regions[j].getSpanningLength()):
				j += 1
				
				out += str(m)+"M"
				out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
				m = 1
				
				offset = -k
			else:
				m += 1
		
		out += str(m) + "M"
		
		
		return out
	
	def getRegionSpanningLength(self):
		length = {'total':0,'iter':[]}
		for r in self.regions:
			l = r.getSpanningLength()
			length['iter'].append(l)
			length['total'] += l
		return length
	
	def getTotalmRNA(self):
		mRNA = ""
		for r in self.regions:
			mRNA += r.sequence
		return mRNA



#rs = ReadSynthesizer('chr6')
#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag'))
#rs.addRegion(Region(151,152,'at'))
#rs.produceReads(3,50)



if __name__ == "__main__":
	rs = ReadSynthesizer('chr6')
	rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag'))
	rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag'))
	rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag'))
	rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc'))
	rs.produceReads(3,50)