Mercurial > repos > devteam > table_annovar
changeset 0:525e6995fe44 draft default tip
planemo upload for repository Nonehttps://github.com/galaxyproject/tools-devteam/tree/master/tools/table_annovar commit 5a4e0ca9992af3a6e5ed2b533f04bb82ce761e0b
author | devteam |
---|---|
date | Mon, 09 Nov 2015 11:58:50 -0500 |
parents | |
children | |
files | README replace_NA.py table_annovar.xml tool-data/annovar_index.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 6 files changed, 272 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Mon Nov 09 11:58:50 2015 -0500 @@ -0,0 +1,24 @@ +ANNOVAR needs to be installed manually in the following way: + +1a) If you already have ANNOVAR installed on your system, simply edit the tool-data/annovar_index.loc file to reflect the locations of + the perl scripts (annotate_variation.pl and convert2annovar.pl) and the humandb directory (the directory containing the annovar database files) + +1b) If you do not have ANNOVAR installed, request the annovar download and sign the license here: + http://www.openbioinformatics.org/annovar/annovar_download_form.php + + i) Once downloaded, install annovar per the installation instructions and note the installation path. + + ii) Then download all desired databases for all desired builds as follows: + annotate_variation.pl -downdb -buildver <build> [-webfrom annovar] <database> <humandb> + + where <humandb> is the location where all database files should be stored + and <database> is the database file to download, e.g. refGene (see bottom of document for all available database files at the time of writing this tool) + and <build> can be hg18 or hg19 for humans; other organisms are also available. + + A list of all available databases can be found here: http://www.openbioinformatics.org/annovar/annovar_db.html + + iii) Edit the tool-data/annovar_index.loc file to reflect the location of the humandb folder + +2) Add the annovar scripts convert2annovar.pl and table_annovar.pl to your Galaxy user's path + +3) Restart the Galaxy instance for changes in the .loc file to take effect \ No newline at end of file
#!/usr/bin/env python
# replace_NA.py -- reformatted from the "+++ b/replace_NA.py" hunk of this
# changeset (Mon Nov 09 11:58:50 2015 -0500).
"""Replace 'NA' with a number in the numerical columns of a tabular file.

Reads a tab-separated table from stdin and writes it to stdout, replacing the
target string (currently 'NA') with REPLACEMENT in every column that is
considered numerical.

Limitations: (a) can only take input from stdin when run as a script and
(b) cannot specify target or replacement on the command line.

Runs unchanged on Python 2.7 and Python 3.
"""

import sys
import tempfile

# Constants.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1
# List of known numerical columns (matched against the header line).
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Positions inside each per-column [strings, numbers] counter.
_STRING = 0
_NUMBER = 1


def _split_fields(line):
    """Split one input line into fields.

    Only the line terminator is removed. Stripping all whitespace would also
    eat leading/trailing tabs, silently dropping empty first/last fields and
    shifting every remaining column.
    """
    return line.rstrip('\r\n').split(SEPARATOR)


def process_line_fields(fields, col_type_counts):
    """Update the per-column string/number tallies with one line's fields.

    fields          -- list of field strings for a single line.
    col_type_counts -- list of [n_strings, n_numbers] pairs, one per column;
                       updated in place and grown when a line has more fields
                       than any line seen before (e.g. rows wider than the
                       header), instead of raising IndexError.
    """
    missing = len(fields) - len(col_type_counts)
    if missing > 0:
        col_type_counts.extend([0, 0] for _ in range(missing))

    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == TARGET:
            continue
        try:
            float(f)
        except ValueError:
            # Not a number.
            col_type_counts[i][_STRING] += 1
        else:
            col_type_counts[i][_NUMBER] += 1


def _column_types(col_type_counts, forced_numeric):
    """Return 'number' or 'string' for each column.

    A column is a number if its header label is a known numerical column, or
    if it does not hold more strings than numbers (ties count as number).
    """
    col_types = []
    for i, counts in enumerate(col_type_counts):
        if i in forced_numeric or counts[_STRING] <= counts[_NUMBER]:
            col_types.append('number')
        else:
            col_types.append('string')
    return col_types


def replace_targets(in_stream, out_stream):
    """Copy a table from in_stream to out_stream, replacing TARGET in number columns.

    in_stream  -- iterable text stream with readline(); first line is the header.
    out_stream -- text stream with write().

    The input is consumed once, so it is spooled to an anonymous temporary
    file while column types are tallied, then re-read to emit the output.
    Empty input produces no output.
    """
    first_line = in_stream.readline()
    if not first_line:
        return

    # Use first line to set up data structure and identify numerical columns.
    header = _split_fields(first_line)
    forced_numeric = set(
        i for i, label in enumerate(header) if label in NUMERICAL_COLUMNS
    )
    col_type_counts = [[0, 0] for _ in header]

    # TemporaryFile is removed automatically on close, even on error.
    with tempfile.TemporaryFile(mode='w+') as spool:
        # The header is tallied like any other line (kept from the original
        # behavior: each header label counts as one string sample).
        process_line_fields(header, col_type_counts)
        spool.write(first_line)

        # Process N-1 lines.
        for line in in_stream:
            process_line_fields(_split_fields(line), col_type_counts)
            spool.write(line)

        # Get column type based on label or consensus.
        col_types = _column_types(col_type_counts, forced_numeric)

        # Replace target in number columns.
        spool.seek(0)
        replacement = str(REPLACEMENT)
        for line in spool:
            fields = _split_fields(line)
            for i, f in enumerate(fields):
                if f == TARGET and col_types[i] == 'number':
                    fields[i] = replacement
            out_stream.write(SEPARATOR.join(fields) + '\n')


def main():
    """Script entry point: filter stdin to stdout."""
    replace_targets(sys.stdin, sys.stdout)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table_annovar.xml Mon Nov 09 11:58:50 2015 -0500 @@ -0,0 +1,126 @@ +<tool id="table_annovar" name="ANNOVAR Annotate VCF" version="0.2"> + <description>with functional information using ANNOVAR</description> + + <requirements> + <requirement type="package">annovar</requirement> + <requirement type="set_environment">SCRIPT_PATH</requirement> + </requirements> + + <command> + ## Convert VCF to AV input format. + #if str($out_format) == "tabular": + convert2annovar.pl -format vcf4 -includeinfo ${input} > input.avinput ; + #set tab_anno_input = "input.avinput" + #else: + #set tab_anno_input = $input + #end if + + ## Variant annotation; make sure to include entry in indexes table for build database. + + #set protocol = [] + #set operation = [] + + ## Add gene annotations. + #if $gene_anns: + #silent protocol.append( str( $gene_anns ) ) + #silent operation.append( ','.join( ['g' for t in range( str($gene_anns).count(',') + 1 )] ) ) + #end if + + ## Add regions. + #if $regions: + #silent protocol.append( str( $regions ) ) + #silent operation.append( ','.join( ['r' for t in range( str($regions).count(',') + 1 )] ) ) + #end if + + ## Add filters. + #if $filters: + #silent protocol.append( str( $filters ) ) + #silent operation.append( ','.join( ['f' for t in range( str($filters).count(',') + 1 )] ) ) + #end if + + #set protocol = ','.join( $protocol ) + #set operation = ','.join( $operation ) + + ## Annotate variants. + table_annovar.pl ${tab_anno_input} ${__get_data_table_entry__('annovar_indexes', 'dbkey', $input.dbkey, 'path')} -protocol ${protocol} -operation ${operation} -nastring '.' -buildver ${input.dbkey} --outfile output + + ## Add option to consume/produce VCF. + #if str($out_format) == "vcf": + --vcfinput + #end if + + ## Post-processing: process annotated table to remove "NA" strings from numerical columns if + ## tabular. Copy to output. 
+ #if str($out_format) == "tabular": + ; cat output.${input.dbkey}_multianno.txt | python \${SCRIPT_PATH}/replace_NA.py > ${output} + #else: + ; cp output.${input.dbkey}_multianno.vcf ${output} + #end if + </command> + + <inputs> + <param name="input" type="data" format="vcf" metadata_name="dbkey" label="Variants" help=""> + <validator type="unspecified_build" /> + </param> + + <param name="gene_anns" type="select" multiple="True" optional="True" label="Gene Annotations" help="" > + <options from_data_table="annovar_indexes"> + <filter type="data_meta" key="dbkey" ref="input" column="1"/> + <filter type="static_value" name="type" value="gene_ann" column="2"/> + </options> + </param> + + <param name="regions" type="select" multiple="True" optional="True" label="Annotation Regions" help="" > + <options from_data_table="annovar_indexes"> + <filter type="data_meta" key="dbkey" ref="input" column="1"/> + <filter type="static_value" name="type" value="region" column="2"/> + </options> + </param> + + <param name="filters" type="select" multiple="True" label="Annotation Databases" help="" > + <options from_data_table="annovar_indexes"> + <filter type="data_meta" key="dbkey" ref="input" column="1"/> + <filter type="static_value" name="type" value="filter" column="2"/> + </options> + </param> + + <param name="out_format" type="select" label="Output data type"> + <option value="vcf">VCF</option> + <option value="tabular">Tabular</option> + </param> + + </inputs> + + <stdio> + <regex match=".*" source="both" level="log" description="tool progress"/> + </stdio> + + <outputs> + <data name="output" format="tabular"> + <change_format> + <when input="out_format" value="vcf" format="vcf"/> + </change_format> + </data> + </outputs> + + <tests> + </tests> + + <help> +**What it does** + +This tool will annotate variants using specified gene annotations, regions, and filtering databases. 
Input is a VCF dataset, and output is a table of annotations for each variant in the +VCF dataset or a VCF dataset with the annotations in INFO fields. + +**ANNOVAR Website and Documentation** + +Website: http://www.openbioinformatics.org/annovar/ + +Paper: http://nar.oxfordjournals.org/content/38/16/e164 + +**Important Usage Note** + +ANNOVAR is open-source and free for non-profit use. If you use it for commercial purposes, please contact BIOBASE (info@biobase-international.com) directly for license related issues. Also see http://www.openbioinformatics.org/annovar/annovar_faq.html#license + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/annovar_index.loc.sample Mon Nov 09 11:58:50 2015 -0500 @@ -0,0 +1,20 @@ +# +# Database name (value), dbkey, type, and path. +# +# Sample entries for gene-based annotations: +#refGene hg19 gene_ann /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#wgEncodeGencodeCompV14 hg19 gene_ann /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +# +# Sample entries for region-based annotations: +# +#genomicSuperDups hg19 region /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#phastConsElements46way hg19 region /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +# +# Sample entries for filter-based annotations: +# +#1000g2012apr_all hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#avsift hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#snp137NonFlagged hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#esp6500si_all hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#snp137 hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ +#cosmic64 hg19 filter /aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/ \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Nov 09 11:58:50 2015 -0500 @@ -0,0 +1,7 @@ +<!-- ANNOVAR files --> +<tables> + <table name="annovar_indexes" comment_char="#"> + <columns>value, dbkey, type, path</columns> + <file path="tool-data/annovar_index.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Nov 09 11:58:50 2015 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <set_environment version="1.0"> + <environment_variable name="SCRIPT_PATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> +</tool_dependency> \ No newline at end of file