view gmap_build.xml @ 2:f6ba0f12cca2 draft

Untested work-in-progress GMAP wrappers v3.0.0, from JJ back in June 2013
author peterjc
date Wed, 28 Sep 2016 10:43:44 -0400
parents 74391fc6e3f2
children 488e9d642566
line wrap: on
line source

<tool id="gmap_build" name="GMAP Build" version="3.0.0">
  <description>a database genome index for GMAP and GSNAP</description>
  <!-- Runtime dependency: the "gmap" package, pinned to version 2013-05-09 -->
  <requirements>
      <requirement type="package" version="2013-05-09">gmap</requirement>
  </requirements>
  <!-- Command used to report the version of the underlying gmap installation -->
  <version_string>gmap --version</version_string>
  <!-- Run the generated "shscript" configfile with bash; the script's standard
       output is redirected into the output dataset.  Both paths are quoted so
       that unusual characters in them cannot split the command line. -->
  <command>/bin/bash '$shscript' &gt; '$output'</command>
  <inputs>
    <!-- Name for this gmapdb.  The build script uses the name both as a
         directory component and as a command line argument, so it is limited
         to characters that are safe in a path and in a shell word. -->
    <param name="refname" type="text" label="Name you want to give this gmap database" help="">
      <validator type="empty_field" message="A database name is required."/>
      <validator type="regex" message="The database name must start with a letter, digit or underscore and may only contain letters, digits, '.', '_' and '-'.">^[A-Za-z0-9_][A-Za-z0-9._-]*$</validator>
    </param>
    <!-- Input data: one or more FASTA datasets (at least one is required).
         Every dataset in this repeat is appended to each gmap_build call. -->
    <repeat name="inputs" title="Reference Sequence" min="1">
      <param name="input" type="data" format="fasta" label="reference sequence fasta" />
    </repeat>

    <!-- Optional, comma-separated chromosome names; a non-blank value is
         handed to gmap_build through its -c option by the build script -->
    <param name="circular_chroms"
           type="text"
           value=""
           optional="true"
           label="Names of circular chromosomes"
           help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome"/>

    <!-- Chromosome ordering; the selected value is passed to gmap_build as the
         argument of its -s option.  The first option (none) is the default. -->
    <param name="sort" type="select" label="Sort chromosomes" help="">
      <option value="none">none - use chromosomes as found in FASTA file(s)</option>
      <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
      <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
      <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
    </param>  

    <!-- Optional extra indexes, both enabled by default.  The build script runs
         the cmetindex / atoiindex program only when the value is "yes". -->
    <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
    <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
    <!-- Optional gene annotation from which the build script creates the
         "splicesites" and "introns" map files.  Each source type has its own
         dataset parameter; nothing is built when no dataset is selected. -->
    <conditional name="splicesite">
      <param name="splice_source" type="select" label="Add splice and intron info from">
        <option value="none">None</option>
        <option value="refGeneTable">refGenes table from UCSC table browser</option>
        <option value="gtf">GTF</option>
        <option value="gff3">GFF3</option>
      </param>
      <when value="none"/>
      <when value="refGeneTable">
        <param name="refGenes" type="data" format="tabular" optional="true" label="UCSC refGenes table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/refGene.txt.gz" />
        <!-- Passed to psl_splicesites and psl_introns as the -s argument -->
        <param name="col_skip" type="integer" value="1" label="Columns to skip before the id/name column (default 1)"
               help="Note that alignment tracks in UCSC sometimes have an extra column on the left.">
          <validator type="in_range" message="The number of columns to skip must be &gt;= 0." min="0"/>
        </param>
      </when>
      <when value="gtf">
        <param name="gtfGenes" type="data" format="gtf" optional="true" label="Genes as GTF" help="" />
      </when>
      <when value="gff3">
        <param name="gff3Genes" type="data" format="gff3" optional="true" label="Genes in GFF3 format" help="" />
      </when>
    </conditional> 
    <!-- Optional known-SNP data from which the build script creates the "snps"
         map file and then runs snpindex.  Every branch names its dataset
         parameter "snps" so the script can refer to it uniformly. -->
    <conditional name="dbsnp">
      <param name="snp_source" type="select" label="Add SNP info from">
        <option value="none">None</option>
        <option value="snpTable">UCSC SNP Table</option>
        <option value="snpFile">GMAP SNP File</option>
        <option value="vcfFile">VCF File</option>
      </param>
      <when value="none"/>
      <when value="snpTable">
        <param name="snps" type="data" format="tabular" optional="true" label="UCSC SNPs table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz" />
        <param name="snpsex" type="data" format="tabular" optional="true" label="UCSC SNP Exceptions table" help="Example: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz" />
        <!-- Passed to dbsnp_iit as the -w argument -->
        <param name="weight" type="select" label="Include SNPs with at least Confidence Level" help="">
          <option value="1" selected="true">1 (High)</option>
          <option value="2">2 (Medium)</option>
          <option value="3">3 (All)</option>
        </param>
      </when>
      <when value="snpFile">
        <param name="snps" type="data" format="gmap_snps" optional="true" label="GMAP SNPs file" 
           help="Format (3 columns):
                &lt;br&gt;&gt;rs62211261 21:14379270 CG
                &lt;br&gt;&gt;rs62211262 21:14379281 CG
                &lt;br&gt;Each line must start with a &gt; character, then be followed by an
                identifier (which may have duplicates).  Then there should be the
                chromosomal coordinate of the SNP.  (Coordinates are all 1-based, so
                the first character of a chromosome is number 1.)  Finally, there
                should be the two possible alleles: ( AC AG AT CG CT GT or AN CN GN TN)
                &lt;br&gt;These alleles must correspond to the possible nucleotides on the plus strand of the genome.  
                If the one of these two letters does not match the allele in the reference
                sequence, that SNP will be ignored in subsequent processing as a probable error.
                The N stands for any other allele." />
      </when>
      <when value="vcfFile">
        <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" 
               help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
                     The VCF file contains multiple versions of dbSNP, so if you want a
                     particular version, such as 135, enter it in the dbSNP version field below.
                     The vcf_iit program tries to pick 
                     a subset of SNPs that somewhat parallel
                     the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
                     that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
                     submitter link-out (&#34;SLO&#34;).  Otherwise, it excludes SNPs that are
                     individual genotypes (&#34;GNO&#34;).  If none of these conditions hold, then
                     the SNP is allowed.  "/>
        <!-- A non-blank value is passed to vcf_iit as the -v argument -->
        <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" 
           help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135, enter it here.  Leave blank to run vcf_iit without the -v option."/>
      </when>
    </conditional> 

    <!-- K-mer size(s) for the genomic index.  Several sizes may be selected;
         the build script runs gmap_build once per selected value (-k). -->
    <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
      <option value="12">12 (64MB RAM)</option>
      <option value="13">13 (256MB RAM)</option>
      <option value="14">14 (1GB RAM)</option>
      <option value="15" selected="true">15 (4GB RAM)</option>
    </param>

  </inputs>
  <stdio>
    <!-- Any non-zero exit status is a failure; the open-ended range "1:"
         covers exit code 1 and everything above it. -->
    <exit_code range="1:"  level="fatal"   description="Error running gmap_build" />
  </stdio>
  <outputs>
    <!-- Disabled: a separate plain-text log dataset
    <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
    -->
    <!-- The dataset file itself receives the build script's standard output;
         the index files are written below the dataset's extra_files_path. -->
    <data format="gmapdb" name="output" label="${tool.name} on ${on_string} gmapdb ${refname}" />
  </outputs>
  <configfiles>
    <!-- Cheetah template that is rendered into the bash script executed by the
         tool's command line.  Steps: build the genome index (once per k-mer
         size), optionally store splice site / intron / SNP maps, optionally
         build the snp, cmet and atoi indexes, and print a short summary.
         Lines starting with "##" are Cheetah comments and do not appear in
         the rendered script. -->
    <configfile name="shscript">
#!/bin/bash
## Stop with exit code 1 as soon as a build step, or any stage of a pipeline,
## fails.  Without this the script's status would be that of the last summary
## command and a broken index would be reported as a successful job.
set -o pipefail
trap 'exit 1' ERR
#import os.path
## Optional "-c" argument naming the circular chromosomes
#set $circular = ''
#set $circular_names = $circular_chroms.__str__.strip()
#if $circular_names != '':
#set $circular = ' -c "' + $circular_names + '"'
#end if
## The index is written below the output dataset's extra files directory;
## map files go to (gmapdb)/(refname)/(refname).maps
#set $gmapdb = $output.extra_files_path
#set $mapsdir = $os.path.join($gmapdb, str($refname), str($refname) + '.maps')
#set $splicesites_iit = $os.path.join($mapsdir, 'splicesites')
#set $introns_iit = $os.path.join($mapsdir, 'introns')
#set $snps_iit = $os.path.join($mapsdir, 'snps')
mkdir -p '$gmapdb'
## export GMAPDB required for cmetindex  and atoiindex
## (snpindex, cmetindex and atoiindex are invoked without -D below)
export GMAPDB='$gmapdb'
## One gmap_build run per selected k-mer size; all FASTA inputs are passed to each run
#if $kmer:
#for $k in $kmer.__str__.split(','):
gmap_build -D '$gmapdb' -d '$refname' -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for#
#end for
#else:
gmap_build -D '$gmapdb' -d '$refname' -s $sort $circular #for i in $inputs# ${i.input}#end for#
#end if
## Summary output only: never let it fail the job
get-genome -D '$gmapdb' -d '?' | sed 's/^Available .*/gmap db: /' || true
echo "kmers: " $kmer 
## Splice site and intron maps from the selected annotation source
#if $splicesite.splice_source == 'refGeneTable':
#if $splicesite.refGenes.__str__ != 'None':
cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o '$splicesites_iit'
cat $splicesite.refGenes | psl_introns -s $splicesite.col_skip | iit_store -o '$introns_iit'
#end if
#elif $splicesite.splice_source == 'gtf':
#if $splicesite.gtfGenes.__str__ != 'None':
cat $splicesite.gtfGenes | gtf_splicesites | iit_store -o '$splicesites_iit'
cat $splicesite.gtfGenes | gtf_introns | iit_store -o '$introns_iit'
#end if
#elif $splicesite.splice_source == 'gff3':
#if $splicesite.gff3Genes.__str__ != 'None':
cat $splicesite.gff3Genes | gff3_splicesites | iit_store -o '$splicesites_iit'
cat $splicesite.gff3Genes | gff3_introns | iit_store -o '$introns_iit'
#end if
#end if
## SNP map from the selected source, followed by snpindex
#if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None':
 #if $dbsnp.snp_source == 'snpTable':
  #if $dbsnp.snpsex.__str__ != 'None':
   cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o '$snps_iit'
  #else:
   cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o '$snps_iit'
  #end if
 #elif $dbsnp.snp_source == 'vcfFile':
  #if $dbsnp.vcf_version  and len($dbsnp.vcf_version.__str__.strip()) > 0:
   cat $dbsnp.snps | vcf_iit -v '$dbsnp.vcf_version.__str__.strip()' | iit_store -o '$snps_iit'
  #else:
   cat $dbsnp.snps | vcf_iit | iit_store -o '$snps_iit'
  #end if
 #else:
  ## GMAP SNP file: already in the format iit_store reads
  cat $dbsnp.snps | iit_store -o '$snps_iit'
 #end if
 snpindex -d '$refname' -v snps
 echo "snpindex" -d '$refname' -v snps
#end if
#if $cmetindex.__str__ == 'yes':
cmetindex -d '$refname'
echo "cmetindex" -d '$refname'
#end if
#if $atoiindex.__str__ == 'yes':
atoiindex -d '$refname'
echo "atoiindex" -d '$refname'
#end if
## Summary output only: never let it fail the job
get-genome -D '$gmapdb' -d '$refname' -m '?' | sed 's/^Available maps .*/maps: /' || true
    </configfile>
  </configfiles>

  <!-- No functional tests are defined for this wrapper yet -->
  <tests>
  </tests> 

  <help>

**GMAP Build**

GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program).  (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)

You will want to read the README_.

Publication_ citation: Thomas D. Wu, Colin K. Watanabe  Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310

.. _GMAP: http://research-pub.gene.com/gmap/
.. _GSNAP: http://research-pub.gene.com/gmap/
.. _README: http://research-pub.gene.com/gmap/src/README
.. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859


**circular chromosomes**

You can tell gmap_build that certain chromosomes are circular
with the -c or -\-circular flag.  The value
for these flags is a list of chromosomes, separated by commas.  The
gmap_build program will then allow GSNAP and GMAP to align reads
across the ends of the chromosome.  For example, the mitochondrial
genome in human beings is circular.


**Detecting known and novel splice sites in GSNAP**

GSNAP can detect splice junctions in individual reads.  
GSNAP allows for known splicing at two levels: at the level of known
splice sites and at the level of known introns.  At the site level,
GSNAP finds splicing between arbitrary combinations of donor and
acceptor splice sites, meaning that it can find alternative splicing
events.  At the intron level, GSNAP finds splicing only between the
set of given donor-acceptor pairs, so it is constrained not to find
alternative splicing events, only introns included in the given list.
For most purposes, I would recommend using known splice sites, rather
than known introns, unless you are certain that all alternative
splicing events are known and represented in your file.

Splice site files can be generated from a GTF file, a GFF3 file,
or a refGenes table from UCSC.


**SNP-tolerant alignment in GSNAP**

GSNAP has the ability to align to a reference space of all possible
major and minor alleles in a set of known SNPs provided by the user.


Process known SNP data, either from older dbSNP files or from newer
files in VCF format.  The older dbSNP files can be obtained from UCSC,
either from the Galaxy UCSC table browser or downloaded:

    ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz

For versions before snp132, you may also want to exclude exceptions,
which will require this file:

    ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz

The option "-w weight" makes use of the dbSNP item weight, a value
from 1 to 3, where lower weight means higher confidence.  Items will
be included if the item weight is the given value weight or less.
The default value of -w is 1, which is the criterion UCSC uses to
build its ambiguous version of the genome.  To allow all item weights,
specify "-w 3".

The more recent SNP data are provided in VCF format, and can be
retrieved like this:

    ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz

The VCF file contains multiple versions of dbSNP, so if you want a
particular version, such as 135, you would use the flag "-v 135".  The
vcf_iit program tries to pick a subset of SNPs that somewhat parallel
the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
that have been validated (marked in the VCF file as "VLD") or have a
submitter link-out ("SLO").  Otherwise, it excludes SNPs that are
individual genotypes ("GNO").  If none of these conditions hold, then
the SNP is allowed.  These rules might not be the best ones; I made
them up by trying to compare version 135 of the VCF data with
version 135 of the UCSC dbSNP data.

**Alignment of reads from bisulfite-treated DNA in GSNAP**

GSNAP has the ability to align reads from bisulfite-treated DNA, which
converts unmethylated cytosines to uracils that appear as thymines in
reads.  GSNAP is able to identify genomic-T to read-C mismatches, 
if a cmetindex is generated.

**RNA-editing tolerance in GSNAP**

Just as GSNAP has a program cmetindex and a mode called "cmet" for
tolerance to C-to-T changes, it can be tolerant to A-to-G changes
using the program atoiindex and a mode called "atoi".  This mode is
designed to facilitate alignments that are tolerant to RNA editing
where A's are converted to I's, which appear as G's to a sequencer.

To process reads under RNA-editing tolerance, you will first need to
create the atoi index.



**K-mer size**

You can control the k-mer size
for the genomic index with the -k flag, which can range from 12 to 15.
The default value for -k is 15, but this requires your machine to have
4 GB of RAM to build the indices.  If you do not have 4 GB of RAM,
then you will need to reduce the value of -k or find another machine.
Here are the RAM requirements for building various indices::

    k-mer of 12: 64 MB
    k-mer of 13: 256 MB
    k-mer of 14: 1 GB
    k-mer of 15: 4 GB

These are the RAM requirements for building indices, but not to run
the GMAP/GSNAP programs once the indices are built, because the
genomic indices are compressed.  For example, the genomic index for a
k-mer of 15 gives a gammaptrs file of 64 MB and an offsetscomp file of
about 350 MB, much smaller than the 4 GB that would otherwise be
required.  Therefore, you may want to build your genomic index on a
computer with sufficient RAM, and distribute that index to be used by
computers with less RAM.

The amount of compression can be controlled using the -b or -\-basesize
parameter to gmap_build.  By default, the value for k-mer size is 15,
and the value for basesize is 12.  If you select a different value for
k-mer size, then basesize is made by default to be equal to that k-mer
size.

If you want to build your genomic databases with more than one k-mer
size, you can re-run gmap_build with different values of -k.  This
will overwrite only the identical files from the previous runs.  You
can then choose the k-mer size at run-time by using the -k flag for
either GMAP or GSNAP.

  </help>
</tool>