Mercurial > repos > yhoogstrate > varscan_mpileup2snp_from_bam
annotate test-data/generate_reads.py @ 91:94fb905e8d4f draft default tip
Uploaded
author | yhoogstrate |
---|---|
date | Thu, 05 Nov 2015 08:29:47 -0500 |
parents | aaf98cdc5916 |
children |
rev | line source |
---|---|
38 | 1 #!/usr/bin/env python |
2 | |
3 | |
4 import random | |
5 import math | |
6 | |
7 | |
80 | 8 __version_info__ = ('1', '0', '0') |
9 __version__ = '.'.join(__version_info__) | |
10 | |
11 | |
38 | 12 class Region: |
13 def __init__(self,start,stop,sequence): | |
14 self.start = start | |
15 self.stop = stop | |
16 self.sequence = sequence.strip().replace("\n","").replace(" ","") | |
17 if(len(self.sequence) != self.getSpanningLength()): | |
18 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) | |
19 import sys | |
20 sys.exit() | |
21 | |
22 def getSpanningLength(self): | |
23 return abs(self.stop-self.start+1) | |
24 | |
25 class ReadSynthesizer: | |
26 def __init__(self,chromosome): | |
27 self.regions = [] | |
28 self.chromosome = chromosome | |
29 | |
30 def addRegion(self,region): | |
31 self.regions.append(region) | |
32 | |
33 def produceReads(self,readDensity = 1,read_length = 50): | |
34 """ | |
35 Produces uniform reads by walking iteratively over self.regions | |
36 """ | |
37 | |
38 mRNA = self.getTotalmRNA() | |
39 spanning_length = self.getRegionSpanningLength() | |
40 n = spanning_length['total'] - read_length + 1 | |
41 | |
42 j = 0 | |
43 k = 0 | |
44 | |
45 for i in range(n): | |
46 # "alpha is playing the role of k and beta is playing the role of theta" | |
47 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! | |
48 | |
49 for d in range(dd): | |
50 sequence = mRNA[i:i+read_length] | |
51 | |
52 if(random.randint(0,1) == 0): | |
53 strand = 0 | |
54 else: | |
55 strand = 16 | |
56 flag = strand + 0 | |
57 | |
58 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" | |
59 | |
60 spanning_length['iter'][j] -= 1 | |
61 if(k >= self.regions[j].getSpanningLength()-1): | |
62 j += 1 | |
63 k = 0 | |
64 else: | |
65 k += 1 | |
66 | |
67 def getMappingString(self,length,j,offset): | |
68 m = 0 | |
69 | |
70 out = "" | |
71 | |
72 for i in range(length): | |
73 k = i + offset | |
74 | |
75 if(k >= self.regions[j].getSpanningLength()): | |
76 j += 1 | |
77 | |
78 out += str(m)+"M" | |
79 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" | |
80 m = 1 | |
81 | |
82 offset = -k | |
83 else: | |
84 m += 1 | |
85 | |
86 out += str(m) + "M" | |
87 | |
88 | |
89 return out | |
90 | |
91 def getRegionSpanningLength(self): | |
92 length = {'total':0,'iter':[]} | |
93 for r in self.regions: | |
94 l = r.getSpanningLength() | |
95 length['iter'].append(l) | |
96 length['total'] += l | |
97 return length | |
98 | |
99 def getTotalmRNA(self): | |
100 mRNA = "" | |
101 for r in self.regions: | |
102 mRNA += r.sequence | |
103 return mRNA | |
104 | |
80 | 105 |
106 | |
107 if __name__ == "__main__": | |
88
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
108 # Real world example snp |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
109 |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
110 #rs = ReadSynthesizer('chr6') |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
111 #rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
112 #rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
113 #rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
114 #rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
115 #rs.produceReads(3,50) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
116 |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
117 # Artificial SNP |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
118 rs = ReadSynthesizer('chr1') |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
119 rs.addRegion(Region( 0+1, 59+1,'aaataggtcccaaacgttacgca'+'G'+'tctatgcctgacaaagttgcgaccacttcctctgcc'))#c -> G |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
120 rs.addRegion(Region( 60+1,119+1,'ttgtgtgacacgccggagatagg'+'A'+'catcagcaagtacgttaagtacactgaacgaactgg'))#g -> A |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
121 rs.addRegion(Region(120+1,179+1,'aggtttctacatcgtgcgtgatggc'+'C'+'ctaggagaagtgggtgtatctgcacagcataagt'))#t -> C |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
122 rs.addRegion(Region(180+1,239+1,'tataagacggaagtaaagcgtcttc'+'G'+'ccgttcagcaccccacgctcatagtcaatgctgg'))#a -> G |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
123 #rs.addRegion(Region(240+1,299+1,'ttcagcatagtcaagcgccggtggcctccaaaaagacgcactgagtagcttagctacttt')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
124 #rs.addRegion(Region(300+1,359+1,'gctccgcttgcggaagcactaagaggagattgaatttccaaatcccccccgatacctgtg')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
125 #rs.addRegion(Region(360+1,419+1,'cggtcgctacgtaagtgcgaagttctgttagatacgctccccttagtatatgggcgttaa')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
126 #rs.addRegion(Region(420+1,479+1,'tcggaccgtcggtactcactgcattccaggtctcatatagttcgccctagaagcctggga')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
127 rs.addRegion(Region(480+1,539+1,'tgaacgttgaacta'+'GCC'+'ctgatgtaaaccccgcgtgccaattccaggcgtcatgggggca'))#tag -> gcc |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
128 #rs.addRegion(Region(540+1,599+1,'acccctcgcagcctccctcttgctgttggtgcctagtatttcatgatttcgagccgacat')) |
aaf98cdc5916
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 1477c39d48b290394b7247b9c7b1e4a62a85f2de-dirty
yhoogstrate
parents:
80
diff
changeset
|
129 rs.produceReads(2,35) |