Mercurial > repos > yhoogstrate > varscan_mpileup2snp_from_bam
comparison test-data/generate_reads.py @ 38:48c78adade03
Deleted selected files
| author | yhoogstrate |
|---|---|
| date | Wed, 05 Mar 2014 05:42:06 -0500 |
| parents | |
| children | 5272ffe31456 |
comparison
equal
deleted
inserted
replaced
| 37:17f10d28dab3 | 38:48c78adade03 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 | |
| 4 import random | |
| 5 import math | |
| 6 | |
| 7 | |
| 8 class Region: | |
| 9 def __init__(self,start,stop,sequence): | |
| 10 self.start = start | |
| 11 self.stop = stop | |
| 12 self.sequence = sequence.strip().replace("\n","").replace(" ","") | |
| 13 if(len(self.sequence) != self.getSpanningLength()): | |
| 14 print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) | |
| 15 import sys | |
| 16 sys.exit() | |
| 17 | |
| 18 def getSpanningLength(self): | |
| 19 return abs(self.stop-self.start+1) | |
| 20 | |
| 21 class ReadSynthesizer: | |
| 22 def __init__(self,chromosome): | |
| 23 self.regions = [] | |
| 24 self.chromosome = chromosome | |
| 25 | |
| 26 def addRegion(self,region): | |
| 27 self.regions.append(region) | |
| 28 | |
| 29 def produceReads(self,readDensity = 1,read_length = 50): | |
| 30 """ | |
| 31 Produces uniform reads by walking iteratively over self.regions | |
| 32 """ | |
| 33 | |
| 34 mRNA = self.getTotalmRNA() | |
| 35 spanning_length = self.getRegionSpanningLength() | |
| 36 n = spanning_length['total'] - read_length + 1 | |
| 37 | |
| 38 j = 0 | |
| 39 k = 0 | |
| 40 | |
| 41 for i in range(n): | |
| 42 # "alpha is playing the role of k and beta is playing the role of theta" | |
| 43 dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! | |
| 44 | |
| 45 for d in range(dd): | |
| 46 sequence = mRNA[i:i+read_length] | |
| 47 | |
| 48 if(random.randint(0,1) == 0): | |
| 49 strand = 0 | |
| 50 else: | |
| 51 strand = 16 | |
| 52 flag = strand + 0 | |
| 53 | |
| 54 print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" | |
| 55 | |
| 56 spanning_length['iter'][j] -= 1 | |
| 57 if(k >= self.regions[j].getSpanningLength()-1): | |
| 58 j += 1 | |
| 59 k = 0 | |
| 60 else: | |
| 61 k += 1 | |
| 62 | |
| 63 def getMappingString(self,length,j,offset): | |
| 64 m = 0 | |
| 65 | |
| 66 out = "" | |
| 67 | |
| 68 for i in range(length): | |
| 69 k = i + offset | |
| 70 | |
| 71 if(k >= self.regions[j].getSpanningLength()): | |
| 72 j += 1 | |
| 73 | |
| 74 out += str(m)+"M" | |
| 75 out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" | |
| 76 m = 1 | |
| 77 | |
| 78 offset = -k | |
| 79 else: | |
| 80 m += 1 | |
| 81 | |
| 82 out += str(m) + "M" | |
| 83 | |
| 84 | |
| 85 return out | |
| 86 | |
| 87 def getRegionSpanningLength(self): | |
| 88 length = {'total':0,'iter':[]} | |
| 89 for r in self.regions: | |
| 90 l = r.getSpanningLength() | |
| 91 length['iter'].append(l) | |
| 92 length['total'] += l | |
| 93 return length | |
| 94 | |
| 95 def getTotalmRNA(self): | |
| 96 mRNA = "" | |
| 97 for r in self.regions: | |
| 98 mRNA += r.sequence | |
| 99 return mRNA | |
| 100 | |
| 101 #rs = ReadSynthesizer('chr6') | |
| 102 #rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) | |
| 103 #rs.addRegion(Region(151,152,'at')) | |
| 104 #rs.produceReads(3,50) | |
| 105 | |
| 106 | |
| 107 rs = ReadSynthesizer('chr6') | |
| 108 rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) | |
| 109 rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) | |
| 110 rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) | |
| 111 rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) | |
| 112 rs.produceReads(3,50) | |
| 113 | |
| 114 | |
| 115 | |
| 116 | |
| 117 | |
| 118 |
