view libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/varloc.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line source

/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * VarLoc table
 */
version 1;

include 'vdb/vdb.vschema';
include 'insdc/insdc.vschema';
include 'ncbi/ncbi.vschema';


/*--------------------------------------------------------------------------
 * types
 *  http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/Variation-inst.html
 */
/* NCBI:var:inst:type
 *  one-byte ( U8 ) code identifying the kind of variation instance.
 *  codes 0..20 are assigned consecutively, 21..254 are not assigned here,
 *  and 255 is the catch-all "other".
 *  NOTE(review): the names appear to follow the Variation-inst spec linked
 *  in the section header - confirm the numeric values against that spec.
 */
typedef U8 NCBI:var:inst:type;
const NCBI:var:inst:type NCBI:var:inst:value:unknown          = 0;
const NCBI:var:inst:type NCBI:var:inst:value:identity         = 1;
const NCBI:var:inst:type NCBI:var:inst:value:inv              = 2;
const NCBI:var:inst:type NCBI:var:inst:value:snv              = 3;
const NCBI:var:inst:type NCBI:var:inst:value:mnp              = 4;
const NCBI:var:inst:type NCBI:var:inst:value:delins           = 5;
const NCBI:var:inst:type NCBI:var:inst:value:del              = 6;
const NCBI:var:inst:type NCBI:var:inst:value:ins              = 7;
const NCBI:var:inst:type NCBI:var:inst:value:microsatellite   = 8;
const NCBI:var:inst:type NCBI:var:inst:value:transposon       = 9;
const NCBI:var:inst:type NCBI:var:inst:value:cnv              = 10;
const NCBI:var:inst:type NCBI:var:inst:value:direct_copy      = 11;
const NCBI:var:inst:type NCBI:var:inst:value:rev_direct_copy  = 12;
const NCBI:var:inst:type NCBI:var:inst:value:inverted_copy    = 13;
const NCBI:var:inst:type NCBI:var:inst:value:everted_copy     = 14;
const NCBI:var:inst:type NCBI:var:inst:value:translocation    = 15;
const NCBI:var:inst:type NCBI:var:inst:value:prot_missense    = 16;
const NCBI:var:inst:type NCBI:var:inst:value:prot_nonsense    = 17;
const NCBI:var:inst:type NCBI:var:inst:value:prot_neutral     = 18;
const NCBI:var:inst:type NCBI:var:inst:value:prot_silent      = 19;
const NCBI:var:inst:type NCBI:var:inst:value:prot_other       = 20;
const NCBI:var:inst:type NCBI:var:inst:value:other            = 255;

/* NCBI:var:source:type
 *  one-byte ( U8 ) code identifying where a variation record came from.
 *  codes 1..3 name specific sources and 10 is "other";
 *  0 and 4..9 are not assigned here.
 */
typedef U8 NCBI:var:source:type;
const NCBI:var:source:type NCBI:var:source:value:dbSNP        = 1;
const NCBI:var:source:type NCBI:var:source:value:dbVar        = 2;
const NCBI:var:source:type NCBI:var:source:value:ClinVar      = 3;
const NCBI:var:source:type NCBI:var:source:value:other        = 10;


/*--------------------------------------------------------------------------
 * functions
 */

/* tokenize_var_id
 *   splits a variation id into 2 tokens
 *   0 - prefix
 *   1 - suffix
 *
 *  "var_id" [ IN ] - ascii variation id text to be tokenized
 *
 *  returns token descriptors that are consumed with
 *  extract_token < 0 > ( prefix ) and extract_token < 1 > ( suffix ).
 *  the varloc table converts the suffix text to a U32 via strtonum.
 *
 *  NOTE(review): presumably the prefix is the leading non-numeric part
 *  ( e.g. "rs" ) and the suffix is the trailing digits - the external
 *  implementation is not visible from this schema, so confirm there.
 */
extern function
text:token NCBI:var:tokenize_var_id #1 ( ascii var_id );


/*--------------------------------------------------------------------------
 * varloc
 *  this name is questionable
 */
table NCBI:var:tbl:varloc #1
{
    /* one row per variation location.
     *
     * VAR_ID and PARENT_VAR_ID are not stored as text: on write each is
     * split into a text prefix, a numeric suffix and the character length
     * of the suffix text, and on read the id text is rebuilt from those
     * three physical columns.
     *
     * POS_TO is not stored either: a length derived from POS_FROM and
     * POS_TO is stored in .POS_LEN and POS_TO is recomputed on read.
     *
     * naming convention used below: "in_*" productions feed the write
     * side, "out_*" productions feed the read side, and names starting
     * with '.' are physical columns.
     */

    /* SQL schema:
       var_id             varchar(50),
       parent_var_id      varchar(50) NULL OKAY,
       var_type           int,
       var_source         int,
       gi                 int,
       pos_from           int,
       pos_to             int,
       entrez_id          int,
       score              int
    */

    /* VAR_ID
     *  example: "rs5852452"
     *  read side is synthesized from out_var_id ( see below )
     */
    extern column ascii VAR_ID = out_var_id;

    // on input, tokenize the id into prefix text and suffix text,
    // then convert the suffix text to a U32
    ascii in_var_id = VAR_ID;
    text:token in_var_id_tok = NCBI:var:tokenize_var_id ( in_var_id );
    ascii in_var_id_prefix = extract_token < 0 > ( in_var_id, in_var_id_tok );
    ascii in_var_id_suffix_text = extract_token < 1 > ( in_var_id, in_var_id_tok );
    U32 in_var_id_suffix = strtonum ( in_var_id_suffix_text );

    // physical storage: prefix text, length of the suffix text, numeric suffix
    // the suffix length is kept because the numeric value alone does not
    // record how many characters the original suffix text had
    physical column < ascii > zip_encoding .VAR_ID_PREFIX = in_var_id_prefix;
    physical column < U32 > izip_encoding .VAR_ID_SUFFIX_LEN = row_len ( in_var_id_suffix_text );
    physical column < U32 > izip_encoding .VAR_ID_SUFFIX = in_var_id_suffix;

    // on output, restore original id
    // assuming printf-style semantics, "%0*u" zero-pads the suffix to the
    // stored length, which restores any leading zeros
    U32 out_var_id_suffix = .VAR_ID_SUFFIX;
    U32 out_var_id_suffix_len = .VAR_ID_SUFFIX_LEN;
    ascii out_var_id_prefix = .VAR_ID_PREFIX;
    ascii out_var_id = sprintf < "%s%0*u" > ( out_var_id_prefix, out_var_id_suffix_len, out_var_id_suffix );

    /* PARENT_VAR_ID
     *  example: "rs5852452"
     *  may be EMPTY
     *  read side is synthesized from out_parent_var_id ( see below )
     */
    extern column ascii PARENT_VAR_ID = out_parent_var_id;

    // same treatment as VAR_ID: tokenize, convert suffix, store three
    // physical columns, rebuild the text on output
    ascii in_parent_var_id = PARENT_VAR_ID;
    text:token in_parent_var_id_tok = NCBI:var:tokenize_var_id ( in_parent_var_id );
    ascii in_parent_var_id_prefix = extract_token < 0 > ( in_parent_var_id, in_parent_var_id_tok );
    ascii in_parent_var_id_suffix_text = extract_token < 1 > ( in_parent_var_id, in_parent_var_id_tok );
    U32 in_parent_var_id_suffix = strtonum ( in_parent_var_id_suffix_text );
    physical column < ascii > zip_encoding .PARENT_VAR_ID_PREFIX = in_parent_var_id_prefix;
    physical column < U32 > izip_encoding .PARENT_VAR_ID_SUFFIX_LEN = row_len ( in_parent_var_id_suffix_text );
    physical column < U32 > izip_encoding .PARENT_VAR_ID_SUFFIX = in_parent_var_id_suffix;
    U32 out_parent_var_id_suffix = .PARENT_VAR_ID_SUFFIX;
    U32 out_parent_var_id_suffix_len = .PARENT_VAR_ID_SUFFIX_LEN;
    ascii out_parent_var_id_prefix = .PARENT_VAR_ID_PREFIX;
    // NOTE(review): this format uses a precision ( "%.*u" ) where VAR_ID uses
    // a zero-padded width ( "%0*u" ). under printf rules a precision of 0
    // with a value of 0 prints no digits, so this looks deliberate: it would
    // let an EMPTY parent id round-trip as empty text instead of "0".
    // confirm before "unifying" the two format strings.
    ascii out_parent_var_id = sprintf < "%s%.*u" > ( out_parent_var_id_prefix, out_parent_var_id_suffix_len, out_parent_var_id_suffix );

    /* VAR_TYPE
     *  one of the NCBI:var:inst:value:* codes
     */
    extern column < NCBI:var:inst:type > zip_encoding VAR_TYPE;

    /* VAR_SOURCE
     *  one of the NCBI:var:source:value:* codes
     */
    extern column < NCBI:var:source:type > zip_encoding VAR_SOURCE;

    /* GI
     */
    extern column < NCBI:gi > izip_encoding GI;

    /* POS_FROM
     *  starting position
     *  stored directly; typed as a zero-based coordinate
     */
    extern column < INSDC:coord:zero > izip_encoding POS_FROM;

    // write-side and read-side views of POS_FROM, used by the POS_TO rules
    INSDC:coord:zero in_pos_from = POS_FROM;
    INSDC:coord:zero out_pos_from = .POS_FROM;

    /* POS_TO
     *  ending position
     *  not stored; recomputed on read from .POS_FROM and .POS_LEN
     */
    extern column INSDC:coord:zero POS_TO = out_pos_to;

    // on input, convert the ending position into a length
    // NOTE(review): diff < -1 > presumably yields pos_to - pos_from + 1,
    // i.e. an inclusive length - confirm against the definition of "diff"
    INSDC:coord:zero in_pos_to = POS_TO;
    INSDC:coord:len in_pos_len = ( INSDC:coord:len ) < I32 > diff < -1 > ( in_pos_to, in_pos_from );

    physical column < INSDC:coord:len > izip_encoding .POS_LEN = in_pos_len;

    // on output, the length is cast to a coordinate type so it can be
    // summed with the starting position
    // NOTE(review): sum < -1 > presumably yields pos_from + len - 1, the
    // inverse of the write-side rule - confirm against the definition of "sum"
    INSDC:coord:zero out_pos_len = ( INSDC:coord:zero ) .POS_LEN;
    INSDC:coord:zero out_pos_to = < INSDC:coord:zero > sum < -1 > ( out_pos_from, out_pos_len );

    /* ENTREZ_ID
     *  do we need this?
     */
    extern column < I32 > izip_encoding ENTREZ_ID;

    /* SCORE
     */
    extern column < I32 > izip_encoding SCORE;
};

/* hitmap
 *  companion table to varloc holding two columns:
 *   MAX_SEQ_LEN - U32 value
 *   HITS        - column declared with bool_encoding
 *  NOTE(review): how HITS rows map onto reference positions is not
 *  defined in this schema - confirm with the loader that writes it.
 */
table NCBI:var:tbl:hitmap #1
{
    extern column U32 MAX_SEQ_LEN;    /* must be static                          */
    extern column bool_encoding HITS; /* places on the reference with variations */
};


/*--------------------------------------------------------------------------
 * varloc database
 *  contains the varloc table and the hitmap table
 */
database NCBI:var:db:varloc #1
{
    // member VARLOC: variation location rows ( NCBI:var:tbl:varloc )
    table NCBI:var:tbl:varloc VARLOC;
    // member HITMAP: hit table ( NCBI:var:tbl:hitmap )
    table NCBI:var:tbl:hitmap HITMAP;
};