Mercurial > repos > devteam > table_annovar

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,24 @@
+ANNOVAR needs to be installed manually in the following way:
+
+1a)	If you already have ANNOVAR installed on your system, simply edit the tool-data/annovar_index.loc file to reflect the locations of
+	the perl scripts (annotate_variation.pl and convert2annovar.pl) and the humandb directory (the directory containing the ANNOVAR database files).
+
+1b)	If you do not have ANNOVAR installed, request the ANNOVAR download and sign the license here:
+		http://www.openbioinformatics.org/annovar/annovar_download_form.php
+
+	i)	 Once downloaded, install annovar per the installation instructions and note the installation path.
+
+	ii)	 Then download all desired databases for all desired builds as follows:
+			annotate_variation.pl -downdb -buildver <build> [-webfrom annovar] <database> <humandb>
+
+		 where <humandb> is the location where all database files should be stored,
+		 <database> is the database file to download, e.g. refGene (see the link below for all available database files at the time of writing this tool),
+		 and <build> can be hg18 or hg19 for humans; builds for other organisms are also available.
+
+		 A list of all available databases can be found here: http://www.openbioinformatics.org/annovar/annovar_db.html
+
+	iii) Edit the tool-data/annovar_index.loc file to reflect the location of the humandb folder.
+
+2) Add the ANNOVAR scripts convert2annovar.pl and table_annovar.pl to your Galaxy user's PATH.
+
+3) Restart the Galaxy instance for changes in the .loc file to take effect.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/replace_NA.py	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+# Reads a tabular file and replaces a target sequence (currently 'NA') with a number in columns that have numerical values.
+# Limitations: (a) can only take input from stdin and (b) cannot specify target or replacement.
+
+import sys
+import os
+import tempfile
+
+# Constants.
+SEPARATOR = '\t'
+TARGET = 'NA'
+REPLACEMENT = -1
+# List of known numerical columns.
+NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']
+
+# Use tempfile to store data; delete=False so it can be reopened by name after closing.
+temp_out = tempfile.NamedTemporaryFile(delete=False)
+
+# Use first line to set up data structure and identify numerical columns.
+# Only the line ending is removed: a bare strip() would also eat leading/trailing
+# tabs, dropping empty edge columns and shifting every column index.
+first_line = sys.stdin.readline()
+fields = first_line.rstrip('\r\n').split(SEPARATOR)
+numerical_cols = [i for i, f in enumerate(fields) if f in NUMERICAL_COLUMNS]
+
+# Data structure is a 2-element list for each field; first element is # of
+# string elements and second element is # of number elements.
+col_type_counts = [[0, 0] for _ in fields]
+
+# Set up function to process lines.
+def process_line_fields(fields):
+    '''
+    Tally string vs. numeric values per column into col_type_counts.
+    fields: list of column values from one line; TARGET values are skipped.
+    '''
+    for i, f in enumerate(fields):
+        # Targets carry no type information, so leave them out of the tally.
+        if f == TARGET:
+            continue
+        # Index 1 counts numbers, index 0 counts strings; assume a number first.
+        type_index = 1
+        try:
+            float(f)
+        except ValueError:
+            # float() rejects non-numeric text: count the value as a string.
+            type_index = 0
+        col_type_counts[i][type_index] += 1
+
+
+# Process first line.
+process_line_fields(fields)
+temp_out.write(first_line)
+
+# Process N-1 lines. Strip only the line ending: strip() would also eat edge tabs.
+for line in sys.stdin:
+    process_line_fields(line.rstrip('\r\n').split(SEPARATOR))
+    temp_out.write(line)
+
+# Close temp file so that it can be read.
+temp_name = temp_out.name
+temp_out.close()
+
+# Get column type based on label or consensus: known numerical columns are numbers;
+# otherwise string only if string values outnumber numeric ones (ties are numbers).
+col_types = []
+for i, counts in enumerate(col_type_counts):
+    if i in numerical_cols or counts[0] <= counts[1]:
+        col_types.append('number')
+    else:
+        col_types.append('string')
+
+# Replace target in number columns.
+temp_in = open(temp_name, 'r')
+try:
+    for line in temp_in:
+        fields = line.rstrip('\r\n').split(SEPARATOR)
+        for i, f in enumerate(fields):
+            if f == TARGET and col_types[i] == 'number':
+                fields[i] = str(REPLACEMENT)
+        sys.stdout.write(SEPARATOR.join(fields) + '\n')
+finally:
+    # Close the reader and clean up the temp file even if processing fails.
+    temp_in.close()
+    os.unlink(temp_name)
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/table_annovar.xml	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,126 @@
+<tool id="table_annovar" name="ANNOVAR Annotate VCF" version="0.2">
+    <description>with functional information using ANNOVAR</description>
+
+    <requirements>
+        <requirement type="package">annovar</requirement>
+        <requirement type="set_environment">SCRIPT_PATH</requirement>
+    </requirements>
+
+    <command>
+        ## Convert VCF to AV input format.
+        #if str($out_format) == "tabular":
+            convert2annovar.pl -format vcf4 -includeinfo ${input} > input.avinput ;
+            #set tab_anno_input = "input.avinput"
+        #else:
+            #set tab_anno_input = $input
+        #end if
+
+        ## Variant annotation; make sure to include entry in indexes table for build database.
+
+        #set protocol = []
+        #set operation = []
+
+        ## Add gene annotations.
+        #if $gene_anns:
+            #silent protocol.append( str( $gene_anns )  )
+            #silent operation.append( ','.join( ['g' for t in range( str($gene_anns).count(',') + 1 )] ) )
+        #end if
+
+        ## Add regions.
+        #if $regions:
+            #silent protocol.append( str( $regions ) )
+            #silent operation.append( ','.join( ['r' for t in range( str($regions).count(',') + 1 )] ) )
+        #end if
+
+        ## Add filters.
+        #if $filters:
+            #silent protocol.append( str( $filters ) )
+            #silent operation.append( ','.join( ['f' for t in range( str($filters).count(',') + 1 )] ) )
+        #end if
+
+        #set protocol = ','.join( $protocol )
+        #set operation = ','.join( $operation )
+
+        ## Annotate variants.
+        table_annovar.pl ${tab_anno_input} ${__get_data_table_entry__('annovar_indexes', 'dbkey', $input.dbkey, 'path')} -protocol ${protocol} -operation ${operation} -nastring '.' -buildver ${input.dbkey} --outfile output
+
+        ## Add option to consume/produce VCF.
+        #if str($out_format) == "vcf":
+            --vcfinput
+        #end if
+
+        ## Post-processing: process annotated table to remove "NA" strings from numerical columns if
+        ## tabular. Copy to output.
+        #if str($out_format) == "tabular":
+            ; cat output.${input.dbkey}_multianno.txt | python \${SCRIPT_PATH}/replace_NA.py > ${output}
+        #else:
+            ; cp output.${input.dbkey}_multianno.vcf ${output}
+        #end if
+    </command>
+
+    <inputs>
+        <param name="input" type="data" format="vcf" metadata_name="dbkey" label="Variants" help="">
+        <validator type="unspecified_build" />
+    </param>
+
+    <param name="gene_anns" type="select" multiple="True" optional="True" label="Gene Annotations" help="" >
+        <options from_data_table="annovar_indexes">
+            <filter type="data_meta" key="dbkey" ref="input" column="1"/>
+            <filter type="static_value" name="type" value="gene_ann" column="2"/>
+        </options>
+    </param>
+
+    <param name="regions" type="select" multiple="True" optional="True" label="Annotation Regions" help="" >
+        <options from_data_table="annovar_indexes">
+            <filter type="data_meta" key="dbkey" ref="input" column="1"/>
+            <filter type="static_value" name="type" value="region" column="2"/>
+        </options>
+    </param>
+
+    <param name="filters" type="select" multiple="True" label="Annotation Databases" help="" >
+        <options from_data_table="annovar_indexes">
+            <filter type="data_meta" key="dbkey" ref="input" column="1"/>
+            <filter type="static_value" name="type" value="filter" column="2"/>
+        </options>
+    </param>
+
+    <param name="out_format" type="select" label="Output data type">
+        <option value="vcf">VCF</option>
+        <option value="tabular">Tabular</option>
+    </param>
+
+    </inputs>
+
+    <stdio>
+        <regex match=".*" source="both" level="log" description="tool progress"/>
+    </stdio>
+
+    <outputs>
+        <data name="output" format="tabular">
+            <change_format>
+                <when input="out_format" value="vcf" format="vcf"/>
+            </change_format>
+        </data>
+    </outputs>
+
+    <tests>
+    </tests>
+
+    <help>
+**What it does**
+
+This tool will annotate variants using specified gene annotations, regions, and filtering databases. Input is a VCF dataset, and output is a table of annotations for each variant in the
+VCF dataset or a VCF dataset with the annotations in INFO fields.
+
+**ANNOVAR Website and Documentation**
+
+Website: http://www.openbioinformatics.org/annovar/
+
+Paper: http://nar.oxfordjournals.org/content/38/16/e164
+
+**Important Usage Note**
+
+ANNOVAR is open-source and free for non-profit use. If you use it for commercial purposes, please contact BIOBASE (info@biobase-international.com) directly for license related issues. Also see http://www.openbioinformatics.org/annovar/annovar_faq.html#license
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/annovar_index.loc.sample	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,20 @@
+#
+# Database name (value), dbkey, type, and path.
+#
+# Sample entries for gene-based annotations:
+#refGene	hg19	gene_ann	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#wgEncodeGencodeCompV14	hg19	gene_ann	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#
+# Sample entries for region-based annotations:
+#
+#genomicSuperDups	hg19	region	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#phastConsElements46way	hg19	region	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#
+# Sample entries for filter-based annotations:
+#
+#1000g2012apr_all	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#avsift	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#snp137NonFlagged	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#esp6500si_all	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#snp137	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
+#cosmic64	hg19	filter	/aut/bx/jgoecks/galaxy/data/Homo_sapiens/UCSC/hg19/AnnovarIndices/
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,7 @@
+<!-- ANNOVAR files -->
+<tables>
+	<table name="annovar_indexes" comment_char="#">
+		<columns>value, dbkey, type, path</columns>
+		<file path="tool-data/annovar_index.loc" />
+	</table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Nov 09 11:58:50 2015 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <set_environment version="1.0">
+        <environment_variable name="SCRIPT_PATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment>
+</tool_dependency>
\ No newline at end of file