# HG changeset patch # User yhoogstrate # Date 1394016126 18000 # Node ID 48c78adade0340f55b482bf50a598cc025715a92 # Parent 17f10d28dab3f7d03896d428ee49edf4f42f693c Deleted selected files diff -r 17f10d28dab3 -r 48c78adade03 samtools-parallel-mpileup.xml --- a/samtools-parallel-mpileup.xml Tue Mar 04 07:50:19 2014 -0500 +++ b/samtools-parallel-mpileup.xml Wed Mar 05 05:42:06 2014 -0500 @@ -1,6 +1,6 @@ - Samtools mpileup (classical or supporting parallelization). + Samtools mpileup (supporting parallelization) samtools-parallel-mpileup samtools @@ -66,6 +66,9 @@ #end for 2> stderr_1.txt + #if $sort_mpileup == "true" + | sort -k 1,1 -k 2,2 + #end if > $output ; cat stderr_1.txt #end if @@ -84,10 +87,10 @@ - - + + - + @@ -98,10 +101,10 @@ - - + + - + @@ -125,7 +128,7 @@ - + @@ -140,74 +143,107 @@ + + - - + - - + + - + - - + + - - - + + + - - - + + + - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -VarScan2.3.6:: - -*VarScan2 Overview* - -VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. -http://dx.doi.org/10.1101/gr.129684.111 -http://www.ncbi.nlm.nih.gov/pubmed/19542151 +**Samtools mpileup (supporting parallelization)** -*VarScan* requires mpileup formatted input files, which are generally derived from BAM files. Since mpileup files can become humongous, the interim step of storing it is bypassed. Thus, in this wrapper one or multiple BAM/SAM files go in, get processed into a mpileup file and get directly linked to VarScan. -The samtools package is not able to parallelize the mpileup generation which make it a very slow process. -Other people were aware of this and have written a version that can do parallelization: -https://github.com/mydatascience/parallel-mpileup +SAM (Sequence Alignment/Map) format is a generic format for storing large nucleotide sequence alignments. SAM aims to be a format that: + +Is flexible enough to store all the alignment information generated by various alignment programs; +Is simple enough to be easily generated by alignment programs or converted from existing alignment formats; +Is compact in file size; +Allows most of operations on the alignment to work on a stream without loading the whole alignment into memory; +Allows the file to be indexed by genomic position to efficiently retrieve all reads aligning to a locus. +SAM Tools provide various utilities for manipulating alignments in the SAM format, including sorting, merging, indexing and generating alignments in a per-position format. -Consequently, when a BAM files gets processed by this wrapper, it's processed by *parallel-mpileup* before its send to VarScan. +SAMtools is hosted by SourceForge.net. The project page is http://samtools.sourceforge.net/. The source code releases are available from the download page. You can check out the most recent source code from the github project page with: +git clone git://github.com/samtools/samtools.git +https://github.com/mydatascience/parallel-mpileup/ -.. _VarScan: http://varscan.sourceforge.net/ +Because samtools does not support parallization of the mpileup command, the project was forked to include paralellization support: + + +However, since the project seems to lack support and contains fatal bugs this project was continued at: +https://github.com/yhoogstrate/parallel-mpileup/ + **Input formats** -VarScan2 accepts sequencing alignments in the same, either SAM or BAM format (http://samtools.sourceforge.net/). The alignment files have to be linked to a reference genome by galaxy. This is indicated under every history item with e.g.: *"database: hg19"* for a link to hg19, or *"database: ?"* if the link is missing. +Satmools accepts sequencing alignments in the same, either SAM or BAM format (http://samtools.sourceforge.net/). The alignment files have to be linked to a reference genome by galaxy. This is indicated under every history item with e.g.: *"database: hg19"* for a link to hg19, or *"database: ?"* if the link is missing. **Installation** -Make sure your reference genomes are properly annotated in "tool-data/all_fasta.loc", and linked to the names of the reference used for alignment. +The installation is fully automatic. **License** -* VarScan2.3.6: Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0) -* parallel-mpileup: MIT License (https://github.com/mydatascience/parallel-mpileup/blob/master/samtools-0.1.19/COPYING) +* parallel-mpileup: MIT License (https://github.com/yhoogstrate/parallel-mpileup/blob/master/samtools-0.1.19/COPYING) * samtool: MIT License @@ -219,4 +255,4 @@ More tools by the Translational Research IT (TraIT) project can be found in the following repository: http://toolshed.dtls.nl/ - + \ No newline at end of file diff -r 17f10d28dab3 -r 48c78adade03 test-data/generate_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generate_reads.py Wed Mar 05 05:42:06 2014 -0500 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + + +import random +import math + + +class Region: + def __init__(self,start,stop,sequence): + self.start = start + self.stop = stop + self.sequence = sequence.strip().replace("\n","").replace(" ","") + if(len(self.sequence) != self.getSpanningLength()): + print "ERROR: sequence length: "+str(len(self.sequence))+", while spanning region is: "+str(self.getSpanningLength()) + import sys + sys.exit() + + def getSpanningLength(self): + return abs(self.stop-self.start+1) + +class ReadSynthesizer: + def __init__(self,chromosome): + self.regions = [] + self.chromosome = chromosome + + def addRegion(self,region): + self.regions.append(region) + + def produceReads(self,readDensity = 1,read_length = 50): + """ + Produces uniform reads by walking iteratively over self.regions + """ + + mRNA = self.getTotalmRNA() + spanning_length = self.getRegionSpanningLength() + n = spanning_length['total'] - read_length + 1 + + j = 0 + k = 0 + + for i in range(n): + # "alpha is playing the role of k and beta is playing the role of theta" + dd = max(0,int(round(random.lognormvariate(math.log(readDensity),0.5))))# Notice this is NOT a binomial distribution!! + + for d in range(dd): + sequence = mRNA[i:i+read_length] + + if(random.randint(0,1) == 0): + strand = 0 + else: + strand = 16 + flag = strand + 0 + + print "read_"+str(j)+"."+str(i)+"."+str(d)+"\t"+str(flag)+"\t"+self.chromosome+"\t"+str(self.regions[j].start + k)+"\t60\t"+self.getMappingString(read_length,j,k)+"\t*\t0\t0\t"+str(sequence.upper())+"\t*" + + spanning_length['iter'][j] -= 1 + if(k >= self.regions[j].getSpanningLength()-1): + j += 1 + k = 0 + else: + k += 1 + + def getMappingString(self,length,j,offset): + m = 0 + + out = "" + + for i in range(length): + k = i + offset + + if(k >= self.regions[j].getSpanningLength()): + j += 1 + + out += str(m)+"M" + out += (str(self.regions[j].start - self.regions[j-1].stop-1))+"N" + m = 1 + + offset = -k + else: + m += 1 + + out += str(m) + "M" + + + return out + + def getRegionSpanningLength(self): + length = {'total':0,'iter':[]} + for r in self.regions: + l = r.getSpanningLength() + length['iter'].append(l) + length['total'] += l + return length + + def getTotalmRNA(self): + mRNA = "" + for r in self.regions: + mRNA += r.sequence + return mRNA + +#rs = ReadSynthesizer('chr6') +#rs.addRegion(Region(100,149,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaag')) +#rs.addRegion(Region(151,152,'at')) +#rs.produceReads(3,50) + + +rs = ReadSynthesizer('chr6') +rs.addRegion(Region(154360546,154360969,'ccaggactggtttctgtaagaaacagcaggagctgtggcagcggcgaaaggaagcggctgaggcgcttggaacccgaaaagtctcggtgctcctggctacctcgcacagcggtgcccgcccggccgtcagtaccatggacagcagcgctgcccccacgaacgccagcaattgcactgatgccttggcgtactcaagttgctccccagcacccagccccggttcctgggtcaacttgtcccacttagatggcGacctgtccgacccatgcggtccgaaccgcaccgacctgggcgggagagacagcctgtgccctccgaccggcagtccctccatgatcacggccatcacgatcatggccctctactccatcgtgtgcgtggtggggctcttcggaaacttcctggtcatgtatgtgattgtcag')) +rs.addRegion(Region(154410961,154411313,'atacaccaagatgaagactgccaccaacatctacattttcaaccttgctctggcagatgccttagccaccagtaccctgcccttccagagtgtgaattacctaatgggaacatggccatttggaaccatcctttgcaagatagtgatctccatagattactataacatgttcaccagcatattcaccctctgcaccatgagtgttgatcgatacattgcagtctgccaccctgtcaaggccttagatttccgtactccccgaaatgccaaaattatcaatgtctgcaactggatcctctcttcagccattggtcttcctgtaatgttcatggctacaacaaaatacaggcaag')) +rs.addRegion(Region(154412087,154412607,'gttccatagattgtacactaacattctctcatccaacctggtactgggaaaacctgctgaagatctgtgttttcatcttcgccttcattatgccagtgctcatcattaccgtgtgctatggactgatgatcttgcgcctcaagagtgtccgcatgctctctggctccaaagaaaaggacaggaatcttcgaaggatcaccaggatggtgctggtggtggtggctgtgttcatcgtctgctggactcccattcacatttacgtcatcattaaagccttggttacaatcccagaaactacgttccagactgtttcttggcacttctgcattgctctaggttacacaaacagctgcctcaacccagtcctttatgcatttctggatgaaaacttcaaacgatgcttcagagagttctgtatcccaacctcttccaacattgagcaacaaaactccactcgaattcgtcagaacactagagaccacccctccacggccaatacagtggatagaactaatcatcag')) +rs.addRegion(Region(154428600,154428787,'gtggaattgaacctggactgtcactgtgaaaatgcaaagccttggccactgagctacaatgcagggcagtctccatttcccttcccaggaagagtctagagcattaattttgagtttgcaaaggcttgtaactatttcatatgatttttagagctgactatgacatgaaccctaaaattcctgttccc')) +rs.produceReads(3,50) + + + + + + diff -r 17f10d28dab3 -r 48c78adade03 varscan_mpileup2snp.xml --- a/varscan_mpileup2snp.xml Tue Mar 04 07:50:19 2014 -0500 +++ b/varscan_mpileup2snp.xml Wed Mar 05 05:42:06 2014 -0500 @@ -38,18 +38,18 @@ - - - - - - - - + + + + + + + + - + @@ -60,10 +60,18 @@ + + + + + + + + + + -VarScan2.3.6:: - -*VarScan2 Overview* +**VarScan 2.3.6** VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. http://dx.doi.org/10.1101/gr.129684.111 @@ -100,4 +108,4 @@ More tools by the Translational Research IT (TraIT) project can be found in the following repository: http://toolshed.dtls.nl/ - + \ No newline at end of file diff -r 17f10d28dab3 -r 48c78adade03 varscan_mpileup2snp_from_bam.xml --- a/varscan_mpileup2snp_from_bam.xml Tue Mar 04 07:50:19 2014 -0500 +++ b/varscan_mpileup2snp_from_bam.xml Wed Mar 05 05:42:06 2014 -0500 @@ -84,7 +84,9 @@ #end if --output-vcf $varscan_output_vcf - > $snv_output + #if $sort_mpileup == "true" + | sort -k 1,1 -k 2,2 + #end if 2> stderr_2.txt ; echo "-------------------------[ mpileup generation ]-------------------------" ; @@ -136,9 +138,7 @@ - - - + @@ -148,9 +148,7 @@ - - - + @@ -173,6 +171,8 @@ + + @@ -183,36 +183,36 @@ - - + + - + - - + + - - - + + + - - - + + + - + - - - - - - - - + + + + + + + + - + @@ -223,10 +223,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -VarScan2.3.6:: - -*VarScan2 Overview* +**VarScan 2.3.6** VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. http://dx.doi.org/10.1101/gr.129684.111 @@ -263,4 +293,4 @@ More tools by the Translational Research IT (TraIT) project can be found in the following repository: http://toolshed.dtls.nl/ - + \ No newline at end of file