Mercurial > repos > yhoogstrate > varscan_mpileup2snp_from_bam
changeset 39:9a2a88d1dd4a
Uploaded
author | yhoogstrate |
---|---|
date | Wed, 05 Mar 2014 05:42:55 -0500 |
parents | 48c78adade03 |
children | ff87ad6669eb |
files | test-data/generate_reads.py tool_data_table_conf.xml.sample varscan_mpileup2snp_from_bam.xml x.tar.bz2 |
diffstat | 3 files changed, 7 insertions(+), 125 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/generate_reads.py Wed Mar 05 05:42:06 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -#!/usr/bin/env python - - -import random -import math - - -class Region: - def __init__(self,start,stop,sequence): - self.start = start - self.stop = stop - self.sequence = sequence.strip().replace("\n","").replace(" ","") - if(len(self.sequence) != self.getSpanningLength()): - print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) - import sys - sys.exit() - - def getSpanningLength(self): - return abs(self.stop-self.start+1) - -class ReadSynthesizer: - def __init__(self,chromosome): - self.regions = [] - self.chromosome = chromosome - - def addRegion(self,region): - self.regions.append(region) - - def produceReads(self,readDensity = 1,read_length = 50): - """ - Produces uniform reads by walking iteratively over self.regions - """ - - mRNA = self.getTotalmRNA() - spanning_length = self.getRegionSpanningLength() - n = spanning_length['total'] - read_length + 1 - - j = 0 - k = 0 - - for i in range(n): - # "alpha is playing the role of k and beta is playing the role of theta" - dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! - - for d in range(dd): - sequence = mRNA[i:i+read_length] - - if(random.randint(0,1) == 0): - strand = 0 - else: - strand = 16 - flag = strand + 0 - - print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" - - spanning_length['iter'][j] -= 1 - if(k >= self.regions[j].getSpanningLength()-1): - j += 1 - k = 0 - else: - k += 1 - - def getMappingString(self,length,j,offset): - m = 0 - - out = "" - - for i in range(length): - k = i + offset - - if(k >= self.regions[j].getSpanningLength()): - j += 1 - - out += str(m)+"M" - out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" - m = 1 - - offset = -k - else: - m += 1 - - out += str(m) + "M" - - - return out - - def getRegionSpanningLength(self): - length = {'total':0,'iter':[]} - for r in self.regions: - l = r.getSpanningLength() - length['iter'].append(l) - length['total'] += l - return length - - def getTotalmRNA(self): - mRNA = "" - for r in self.regions: - mRNA += r.sequence - return mRNA - -#rs = ReadSynthesizer('chr6') -#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) -#rs.addRegion(Region(151,152,'at')) -#rs.produceReads(3,50) - - -rs = ReadSynthesizer('chr6') -rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) -rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) -rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) -rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) -rs.produceReads(3,50) - - - - - -
--- a/tool_data_table_conf.xml.sample Wed Mar 05 05:42:06 2014 -0500 +++ b/tool_data_table_conf.xml.sample Wed Mar 05 05:42:55 2014 -0500 @@ -2,7 +2,7 @@ <tables> <!-- Locations of all fasta files under genome directory --> <table name="all_fasta" comment_char="#"> - <columns>name, dbkey, display_name, value</columns> + <columns>value, dbkey, name, path</columns> <file path="tool-data/all_fasta.loc" /> </table> </tables> \ No newline at end of file
--- a/varscan_mpileup2snp_from_bam.xml Wed Mar 05 05:42:06 2014 -0500 +++ b/varscan_mpileup2snp_from_bam.xml Wed Mar 05 05:42:55 2014 -0500 @@ -119,10 +119,10 @@ </when> <when value="indexed_filtered"> <param name="reference_genome" type="select" label="Reference Genome used during alignment (fasta)" > - <options from_file="all_fasta.loc"> - <column name="name" index="0"/> + <options from_data_table="all_fasta"> + <column name="name" index="2"/> <column name="dbkey" index="1"/> - <column name="value" index="3"/> + <column name="value" index="3"/><!-- Value is the path of the fasta file --> <filter type="data_meta" ref="alignments" multiple="false" key="dbkey" column="1" /> <validator type="no_options" message="No indexes are available for the selected input dataset" /> </options> @@ -130,10 +130,10 @@ </when> <when value="indexed_all"> <param name="reference_genome" type="select" label="Reference Genome used during alignment (fasta)" > - <options from_file="all_fasta.loc"> - <column name="name" index="0"/> + <options from_data_table="all_fasta"> + <column name="name" index="2"/> <column name="dbkey" index="1"/> - <column name="value" index="3"/> + <column name="value" index="3"/><!-- Value is the path of the fasta file --> <validator type="no_options" message="No indexes are available for the selected input dataset" /> </options> </param>