Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/varloc.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * VarLoc schema
 *  declares the variation-location table, the hit-map table and the
 *  database that holds one of each
 */
version 1;

include 'vdb/vdb.vschema';
include 'insdc/insdc.vschema';
include 'ncbi/ncbi.vschema';


/*--------------------------------------------------------------------------
 * types
 *  variation instance codes; see
 *  http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/Variation-inst.html
 */
typedef U8 NCBI:var:inst:type;
const NCBI:var:inst:type NCBI:var:inst:value:unknown         = 0;
const NCBI:var:inst:type NCBI:var:inst:value:identity        = 1;
const NCBI:var:inst:type NCBI:var:inst:value:inv             = 2;
const NCBI:var:inst:type NCBI:var:inst:value:snv             = 3;
const NCBI:var:inst:type NCBI:var:inst:value:mnp             = 4;
const NCBI:var:inst:type NCBI:var:inst:value:delins          = 5;
const NCBI:var:inst:type NCBI:var:inst:value:del             = 6;
const NCBI:var:inst:type NCBI:var:inst:value:ins             = 7;
const NCBI:var:inst:type NCBI:var:inst:value:microsatellite  = 8;
const NCBI:var:inst:type NCBI:var:inst:value:transposon      = 9;
const NCBI:var:inst:type NCBI:var:inst:value:cnv             = 10;
const NCBI:var:inst:type NCBI:var:inst:value:direct_copy     = 11;
const NCBI:var:inst:type NCBI:var:inst:value:rev_direct_copy = 12;
const NCBI:var:inst:type NCBI:var:inst:value:inverted_copy   = 13;
const NCBI:var:inst:type NCBI:var:inst:value:everted_copy    = 14;
const NCBI:var:inst:type NCBI:var:inst:value:translocation   = 15;
const NCBI:var:inst:type NCBI:var:inst:value:prot_missense   = 16;
const NCBI:var:inst:type NCBI:var:inst:value:prot_nonsense   = 17;
const NCBI:var:inst:type NCBI:var:inst:value:prot_neutral    = 18;
const NCBI:var:inst:type NCBI:var:inst:value:prot_silent     = 19;
const NCBI:var:inst:type NCBI:var:inst:value:prot_other      = 20;
const NCBI:var:inst:type NCBI:var:inst:value:other           = 255;

/* codes naming the source a variation record came from */
typedef U8 NCBI:var:source:type;
const NCBI:var:source:type NCBI:var:source:value:dbSNP   = 1;
const NCBI:var:source:type NCBI:var:source:value:dbVar   = 2;
const NCBI:var:source:type NCBI:var:source:value:ClinVar = 3;
const NCBI:var:source:type NCBI:var:source:value:other   = 10;


/*--------------------------------------------------------------------------
 * functions
 */

/* tokenize_var_id
 *  produces 2 tokens for a variation id
 *
 *  token 0 - prefix
 *  token 1 - suffix
 *
 *  "var_id" [ DATA ] - id text to be split
 */
extern function text:token NCBI:var:tokenize_var_id #1 ( ascii var_id );


/*--------------------------------------------------------------------------
 * varloc
 *  one row per variation location
 *
 *  NB - the original author marked this table name as questionable
 */
table NCBI:var:tbl:varloc #1
{
    /* corresponding SQL schema:
        var_id varchar(50),
        parent_var_id varchar(50) NULL OKAY,
        var_type int,
        var_source int,
        gi int,
        pos_from int,
        pos_to int,
        entrez_id int,
        score int
     */

    /* VAR_ID
     *  variation id text, e.g. "rs5852452"
     *  kept physically as prefix text + suffix text length + suffix number
     */
    extern column ascii VAR_ID = out_var_id;

    /* write side: tokenize the id, then pull out the prefix text,
       the suffix text and the suffix converted to a number */
    ascii in_var_id = VAR_ID;
    text:token in_var_id_tok = NCBI:var:tokenize_var_id ( in_var_id );
    ascii in_var_id_prefix = extract_token < 0 > ( in_var_id, in_var_id_tok );
    ascii in_var_id_suffix_text = extract_token < 1 > ( in_var_id, in_var_id_tok );
    U32 in_var_id_suffix = strtonum ( in_var_id_suffix_text );

    /* physical storage: prefix, length of the suffix text, suffix value */
    physical column < ascii > zip_encoding .VAR_ID_PREFIX = in_var_id_prefix;
    physical column < U32 > izip_encoding .VAR_ID_SUFFIX_LEN = row_len ( in_var_id_suffix_text );
    physical column < U32 > izip_encoding .VAR_ID_SUFFIX = in_var_id_suffix;

    /* read side: rebuild the id text from the three physical columns,
       passing the stored suffix length as the '*' field width */
    U32 out_var_id_suffix = .VAR_ID_SUFFIX;
    U32 out_var_id_suffix_len = .VAR_ID_SUFFIX_LEN;
    ascii out_var_id_prefix = .VAR_ID_PREFIX;
    ascii out_var_id = sprintf < "%s%0*u" > ( out_var_id_prefix, out_var_id_suffix_len, out_var_id_suffix );

    /* PARENT_VAR_ID
     *  id text of the parent variation, e.g. "rs5852452"
     *  may be EMPTY
     */
    extern column ascii PARENT_VAR_ID = out_parent_var_id;

    /* write side: split exactly as for VAR_ID */
    ascii in_parent_var_id = PARENT_VAR_ID;
    text:token in_parent_var_id_tok = NCBI:var:tokenize_var_id ( in_parent_var_id );
    ascii in_parent_var_id_prefix = extract_token < 0 > ( in_parent_var_id, in_parent_var_id_tok );
    ascii in_parent_var_id_suffix_text = extract_token < 1 > ( in_parent_var_id, in_parent_var_id_tok );
    U32 in_parent_var_id_suffix = strtonum ( in_parent_var_id_suffix_text );

    /* physical storage: prefix, length of the suffix text, suffix value */
    physical column < ascii > zip_encoding .PARENT_VAR_ID_PREFIX = in_parent_var_id_prefix;
    physical column < U32 > izip_encoding .PARENT_VAR_ID_SUFFIX_LEN = row_len ( in_parent_var_id_suffix_text );
    physical column < U32 > izip_encoding .PARENT_VAR_ID_SUFFIX = in_parent_var_id_suffix;

    /* read side: rebuild the id text
     *  NOTE(review): the format here is "%s%.*u" ( stored length used as a
     *  precision ) whereas VAR_ID uses "%s%0*u" ( stored length used as a
     *  zero-padded width ). presumably this lets an EMPTY parent id come
     *  back as empty text rather than "0" - confirm against the VDB
     *  sprintf implementation before changing either format.
     */
    U32 out_parent_var_id_suffix = .PARENT_VAR_ID_SUFFIX;
    U32 out_parent_var_id_suffix_len = .PARENT_VAR_ID_SUFFIX_LEN;
    ascii out_parent_var_id_prefix = .PARENT_VAR_ID_PREFIX;
    ascii out_parent_var_id = sprintf < "%s%.*u" > ( out_parent_var_id_prefix, out_parent_var_id_suffix_len, out_parent_var_id_suffix );

    /* VAR_TYPE
     *  one of the NCBI:var:inst:value constants
     */
    extern column < NCBI:var:inst:type > zip_encoding VAR_TYPE;

    /* VAR_SOURCE
     *  one of the NCBI:var:source:value constants
     */
    extern column < NCBI:var:source:type > zip_encoding VAR_SOURCE;

    /* GI
     */
    extern column < NCBI:gi > izip_encoding GI;

    /* POS_FROM
     *  starting position, stored directly
     */
    extern column < INSDC:coord:zero > izip_encoding POS_FROM;

    /* expose the start position to the POS_TO productions below */
    INSDC:coord:zero in_pos_from = POS_FROM;
    INSDC:coord:zero out_pos_from = .POS_FROM;

    /* POS_TO
     *  ending position
     *  not stored directly: the physical column .POS_LEN holds a length
     *  derived from POS_TO and POS_FROM
     */
    extern column INSDC:coord:zero POS_TO = out_pos_to;

    /* write side: derive the length from the two positions
     *  NOTE(review): the < -1 > constant appears on both the diff here and
     *  the sum below; see vdb.vschema for how the constant enters each
     *  expression before altering either one.
     */
    INSDC:coord:zero in_pos_to = POS_TO;
    INSDC:coord:len in_pos_len = ( INSDC:coord:len ) < I32 > diff < -1 > ( in_pos_to, in_pos_from );

    physical column < INSDC:coord:len > izip_encoding .POS_LEN = in_pos_len;

    /* read side: cast the stored length to the coordinate type so it can
       be summed with the start position */
    INSDC:coord:zero out_pos_len = ( INSDC:coord:zero ) .POS_LEN;
    INSDC:coord:zero out_pos_to = < INSDC:coord:zero > sum < -1 > ( out_pos_from, out_pos_len );

    /* ENTREZ_ID
     *  open question from the original author: is this column needed?
     */
    extern column < I32 > izip_encoding ENTREZ_ID;

    /* SCORE
     */
    extern column < I32 > izip_encoding SCORE;
};


/*--------------------------------------------------------------------------
 * hitmap
 *  marks the places on the reference that carry variations
 */
table NCBI:var:tbl:hitmap #1
{
    /* MAX_SEQ_LEN
     *  must be static
     */
    extern column U32 MAX_SEQ_LEN;

    /* HITS
     *  places on the reference with variations
     */
    extern column bool_encoding HITS;
};


/*--------------------------------------------------------------------------
 * varloc
 *  database made of the varloc table ( VARLOC ) and the hit table ( HITMAP )
 */
database NCBI:var:db:varloc #1
{
    table NCBI:var:tbl:varloc VARLOC;
    table NCBI:var:tbl:hitmap HITMAP;
};