Mercurial > repos > jjohnson > gmap
diff gmap_build.xml @ 2:f6ba0f12cca2 draft
Untested work-in-progress GMAP wrappers v3.0.0, from JJ back in June 2013
author | peterjc |
---|---|
date | Wed, 28 Sep 2016 10:43:44 -0400 |
parents | 74391fc6e3f2 |
children | 488e9d642566 |
line wrap: on
line diff
--- a/gmap_build.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/gmap_build.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,10 +1,10 @@ -<tool id="gmap_build" name="GMAP Build" version="2.0.0"> +<tool id="gmap_build" name="GMAP Build" version="3.0.0"> <description>a database genome index for GMAP and GSNAP</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>gmap --version</version_string> - <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command> + <command interpreter="command"> /bin/bash $shscript > $output </command> <inputs> <!-- Name for this gmapdb --> <param name="refname" type="text" label="Name you want to give this gmap database" help=""> @@ -15,12 +15,17 @@ <param name="input" type="data" format="fasta" label="reference sequence fasta" /> </repeat> - <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help=""> - <option value="12">12</option> - <option value="13">13</option> - <option value="14">14</option> - <option value="15" selected="true">15</option> + <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" + help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome"> + </param> + + <param name="sort" type="select" label="Sort chromosomes" help=""> + <option value="none">none - use chromosomes as found in FASTA file(s)</option> + <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option> + <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option> + <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option> </param> + <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from 
bisulfite-treated DNA"/> <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/> <conditional name="splicesite"> @@ -51,6 +56,7 @@ <option value="none"></option> <option value="snpTable">UCSC SNP Table</option> <option value="snpFile">GMAP SNP File</option> + <option value="vcfFile">VCF File</option> </param> <when value="none"/> <when value="snpTable"> @@ -77,8 +83,33 @@ sequence, that SNP will be ignored in subsequent processing as a probable error. The N stands for any other allele." /> </when> + <when value="vcfFile"> + <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" + help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz + The VCF file contains multiple versions of dbSNP, so if you want a + particular version, such as 135. The vcf_iit program tries to pick + a subset of SNPs that somewhat parallel + the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs + that have been validated (marked in the VCF file as "VLD") or have a + submitter link-out ("SLO"). Otherwise, it excludes SNPs that are + individual genotypes ("GNO"). If none of these conditions hold, then + the SNP is allowed. 
"/> + <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" + help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/> + </when> </conditional> + + <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM"> + <option value="12">12 (64MB RAM)</option> + <option value="13">13 (256MB RAM)</option> + <option value="14">14 (1GB RAM)</option> + <option value="15" selected="true">15 (4GB RAM)</option> + </param> + </inputs> + <stdio> + <exit_code range="1" level="fatal" description="Error running gmap_build" /> + </stdio> <outputs> <!-- <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> @@ -97,15 +128,23 @@ ## #set $ref_files = $ref_files $i.input ## #end for ## echo $ref_files +#set circular = "" +#if $circular_chroms.__str__.strip() != '': +#set circular = ('').join([' -c ','"', $circular_chroms.__str__,'"']) +#end if #import os.path #set $gmapdb = $output.extra_files_path #set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps') mkdir -p $gmapdb ## export GMAPDB required for cmetindex and atoiindex export GMAPDB=$gmapdb +#if $kmer: #for $k in $kmer.__str__.split(','): -gmap_build -D $gmapdb -d $refname -s numeric-alpha -k $k #for i in $inputs# ${i.input}#end for# +gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for# #end for +#else: +gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for# +#end if get-genome -D $gmapdb -d '?' 
| sed 's/^Available .*/gmap db: /' echo "kmers: " $kmer #if $splicesite.splice_source == 'refGeneTable': @@ -125,17 +164,23 @@ #end if #end if #if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None': -#if $dbsnp.snp_source == 'snpTable': -#if $dbsnp.snpsex.__str__ != 'None': -cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') -#else: -cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') -#end if -#else: -cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps') -#end if -snpindex -d $refname -v snps -echo "snpindex" -d $refname -v snps + #if $dbsnp.snp_source == 'snpTable': + #if $dbsnp.snpsex.__str__ != 'None': + cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') + #else: + cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') + #end if + #elif $dbsnp.snp_source == 'vcfFile': + #if $dbsnp.vcf_version and len($dbsnp.vcf_version.__str__.strip()) > 0: + cat $dbsnp.snps | vcf_iit -v $dbsnp.vcf_version.__str__.strip() | iit_store -o $os.path.join($mapsdir,'snps') + #else: + cat $dbsnp.snps | vcf_iit | iit_store -o $os.path.join($mapsdir,'snps') + #end if + #else: + cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps') + #end if + snpindex -d $refname -v snps + echo "snpindex" -d $refname -v snps #end if #if $cmetindex.__str__ == 'yes': cmetindex -d $refname @@ -154,10 +199,9 @@ <help> - **GMAP Build** -GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMSP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) 
+GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) You will want to read the README_ @@ -169,6 +213,129 @@ .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 +**circular chromosomes** + +Finally, you can provide information to gmap_build that certain +chromosomes are circular, with the -c or -\-circular flag. The value +for these flags is a list of chromosomes, separated by commas. The +gmap_build program will then allow GSNAP and GMAP to align reads +across the ends of the chromosome. For example, the mitochondrial +genome in human beings is circular. + + +**Detecting known and novel splice sites in GSNAP** + +GSNAP can detect splice junctions in individual reads. +GSNAP allows for known splicing at two levels: at the level of known +splice sites and at the level of known introns. At the site level, +GSNAP finds splicing between arbitrary combinations of donor and +acceptor splice sites, meaning that it can find alternative splicing +events. At the intron level, GSNAP finds splicing only between the +set of given donor-acceptor pairs, so it is constrained not to find +alternative splicing events, only introns included in the given list. +For most purposes, I would recommend using known splice sites, rather +than known introns, unless you are certain that all alternative +splicing events are known and are represented in your file. + +Splice site files can be generated from a GTF file +or from refGenes table from UCSC. 
+ + +**SNP-tolerant alignment in GSNAP** + +GSNAP has the ability to align to a reference space of all possible +major and minor alleles in a set of known SNPs provided by the user. + + +Process known SNP data, either from older dbSNP files or from newer +files in VCF format. The older dbSNP files can be obtained from UCSC, +either from the Galaxy UCSC table browser or downloaded: + + ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz + +For versions before snp132, you may also want to exclude exceptions, +which will require this file: + + ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz + +The option "-w weight" makes use of the dbSNP item weight, a value +from 1 to 3, where lower weight means higher confidence. Items will +be included if the item weight is the given value weight or less. +The default value of -w is 1, which is the criterion UCSC uses to +build its ambiguous version of the genome. To allow all item weights, +specify "-w 3". + +The more recent SNP data are provided in VCF format, and can be +retrieved like this: + + ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz + +The VCF file contains multiple versions of dbSNP, so if you want a +particular version, such as 135, you would use the flag "-v 135". The +vcf_iit program tries to pick a subset of SNPs that somewhat parallel +the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs +that have been validated (marked in the VCF file as "VLD") or have a +submitter link-out ("SLO"). Otherwise, it excludes SNPs that are +individual genotypes ("GNO"). If none of these conditions hold, then +the SNP is allowed. These rules might not be the best ones; I made +them up by trying to compare version 135 of the VCF data with +version 135 of the UCSC dbSNP data. 
+ +**Alignment of reads from bisulfite-treated DNA in GSNAP** + +GSNAP has the ability to align reads from bisulfite-treated DNA, which +converts unmethylated cytosines to uracils that appear as thymines in +reads. GSNAP is able to identify genomic-C to read-T mismatches, +if a cmetindex is generated. + +**RNA-editing tolerance in GSNAP** + +Just as GSNAP has a program cmetindex and a mode called "cmet" for +tolerance to C-to-T changes, it can be tolerant to A-to-G changes +using the program atoiindex and a mode called "atoi". This mode is +designed to facilitate alignments that are tolerant to RNA editing +where A's are converted to I's, which appear as G's to a sequencer. + +To process reads under RNA-editing tolerance, you will first need to +create the atoi index. + + + +**K-mer size** + +You can control the k-mer size +for the genomic index with the -k flag, which can range from 12 to 15. +The default value for -k is 15, but this requires your machine to have +4 GB of RAM to build the indices. If you do not have 4 GB of RAM, +then you will need to reduce the value of -k or find another machine. +Here are the RAM requirements for building various indices:: + + k-mer of 12: 64 MB + k-mer of 13: 256 MB + k-mer of 14: 1 GB + k-mer of 15: 4 GB + +These are the RAM requirements for building indices, but not to run +the GMAP/GSNAP programs once the indices are built, because the +genomic indices are compressed. For example, the genomic index for a +k-mer of 15 gives a gammaptrs file of 64 MB and an offsetscomp file of +about 350 MB, much smaller than the 4 GB that would otherwise be +required. Therefore, you may want to build your genomic index on a +computer with sufficient RAM, and distribute that index to be used by +computers with less RAM. + +The amount of compression can be controlled using the -b or -\-basesize +parameter to gmap_build. By default, the value for k-mer size is 15, +and the value for basesize is 12. 
If you select a different value for +k-mer size, then basesize is made by default to be equal to that k-mer +size. + +If you want to build your genomic databases with more than one k-mer +size, you can re-run gmap_build with different values of -k. This +will overwrite only the identical files from the previous runs. You +can then choose the k-mer size at run-time by using the -k flag for +either GMAP or GSNAP. + </help> </tool>