38
|
1 #!/usr/bin/env python
|
|
2
|
|
3
|
|
4 import random
|
|
5 import math
|
|
6
|
|
7
|
80
|
8 __version_info__ = ('1', '0', '0')
|
|
9 __version__ = '.'.join(__version_info__)
|
|
10
|
|
11
|
38
|
12 class Region:
|
|
13 def __init__(self,start,stop,sequence):
|
|
14 self.start = start
|
|
15 self.stop = stop
|
|
16 self.sequence = sequence.strip().replace("\n","").replace(" ","")
|
|
17 if(len(self.sequence) != self.getSpanningLength()):
|
|
18 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength())
|
|
19 import sys
|
|
20 sys.exit()
|
|
21
|
|
22 def getSpanningLength(self):
|
|
23 return abs(self.stop-self.start+1)
|
|
24
|
|
25 class ReadSynthesizer:
|
|
26 def __init__(self,chromosome):
|
|
27 self.regions = []
|
|
28 self.chromosome = chromosome
|
|
29
|
|
30 def addRegion(self,region):
|
|
31 self.regions.append(region)
|
|
32
|
|
33 def produceReads(self,readDensity = 1,read_length = 50):
|
|
34 """
|
|
35 Produces uniform reads by walking iteratively over self.regions
|
|
36 """
|
|
37
|
|
38 mRNA = self.getTotalmRNA()
|
|
39 spanning_length = self.getRegionSpanningLength()
|
|
40 n = spanning_length['total'] - read_length + 1
|
|
41
|
|
42 j = 0
|
|
43 k = 0
|
|
44
|
|
45 for i in range(n):
|
|
46 # "alpha is playing the role of k and beta is playing the role of theta"
|
|
47 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!!
|
|
48
|
|
49 for d in range(dd):
|
|
50 sequence = mRNA[i:i+read_length]
|
|
51
|
|
52 if(random.randint(0,1) == 0):
|
|
53 strand = 0
|
|
54 else:
|
|
55 strand = 16
|
|
56 flag = strand + 0
|
|
57
|
|
58 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*"
|
|
59
|
|
60 spanning_length['iter'][j] -= 1
|
|
61 if(k >= self.regions[j].getSpanningLength()-1):
|
|
62 j += 1
|
|
63 k = 0
|
|
64 else:
|
|
65 k += 1
|
|
66
|
|
67 def getMappingString(self,length,j,offset):
|
|
68 m = 0
|
|
69
|
|
70 out = ""
|
|
71
|
|
72 for i in range(length):
|
|
73 k = i + offset
|
|
74
|
|
75 if(k >= self.regions[j].getSpanningLength()):
|
|
76 j += 1
|
|
77
|
|
78 out += str(m)+"M"
|
|
79 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N"
|
|
80 m = 1
|
|
81
|
|
82 offset = -k
|
|
83 else:
|
|
84 m += 1
|
|
85
|
|
86 out += str(m) + "M"
|
|
87
|
|
88
|
|
89 return out
|
|
90
|
|
91 def getRegionSpanningLength(self):
|
|
92 length = {'total':0,'iter':[]}
|
|
93 for r in self.regions:
|
|
94 l = r.getSpanningLength()
|
|
95 length['iter'].append(l)
|
|
96 length['total'] += l
|
|
97 return length
|
|
98
|
|
99 def getTotalmRNA(self):
|
|
100 mRNA = ""
|
|
101 for r in self.regions:
|
|
102 mRNA += r.sequence
|
|
103 return mRNA
|
|
104
|
80
|
105
|
|
106
|
38
|
107 #rs = ReadSynthesizer('chr6')
|
|
108 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag'))
|
|
109 #rs.addRegion(Region(151,152,'at'))
|
|
110 #rs.produceReads(3,50)
|
|
111
|
|
112
|
|
113
|
80
|
114 if __name__ == "__main__":
|
|
115 rs = ReadSynthesizer('chr6')
|
|
116 rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag'))
|
|
117 rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag'))
|
|
118 rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag'))
|
|
119 rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc'))
|
|
120 rs.produceReads(3,50)
|