Mercurial > repos > charles_s_test > seqsero2

/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * NCBI Sequence Read Archive schema
 */
version 1;

include 'vdb/vdb.vschema';
include 'insdc/sra.vschema';


/*--------------------------------------------------------------------------
 * types
 */

/* spot_name_token
 *  a vector describing tokens recognized within a spot name
 *
 *  declared as an alias of "text:token". values of this type are the
 *  "tok" input of the NCBI:SRA:extract_* functions in this file.
 *
 * COMPONENTS:
 *  0 - token id ( one of the NCBI:SRA:name_token:* constants )
 *  1 - token starting coordinate
 *  2 - token length
 */
alias text:token NCBI:SRA:spot_name_token;


/* token values
 *
 *  tokens are produced by a schema-specific tokenizer function
 *  this function is purposely abstract because it may rely upon
 *  whatever information it needs to perform its task. the only
 *  requirement is that it produce these tokens as its output.
 *
 *  an empty name input must produce no tokens. in this case,
 *  there is no name to tokenize or data to produce.
 *
 *  a non-empty name must produce 1 or more tokens of output.
 *  all tokens must be ordered by starting character position.
 *
 *  if a name does not conform to any pattern recognized by the
 *  tokenizer, then the tokenizer emits a single token of "unrecognized"
 *
 *  if a name conforms to some pattern but does not have any
 *  substitution tokens, the tokenizer emits a single token of "recognized"
 *
 *  if a name may be tokenized, then the resulting tokens should
 *  describe only the portions of the string that should be removed
 *  from the name, e.g. "X" or "Y".
 *
 *  the standard coordinates "X".."L" are given in unsigned decimal.
 *  alternate representations are contained within their respective
 *  namespaces: "signed", "hex" and "octal".
 *
 *  the special coordinate "Q" represents the 454-specific encoding
 *  of X and Y into base-36, where the formula for Q is:
 *    Q = 4096 * X + Y
 *  and ASCII encoding:
 *    0..25 => "A-Z", 26..35 => "0-9"
 */
// status tokens - each is emitted as the single token for a name
// that has nothing to substitute
const U16 NCBI:SRA:name_token:unrecognized = 1;
const U16 NCBI:SRA:name_token:recognized = 2;

// combined 454-specific coordinate ( base-36 encoding of X and Y )
const U16 NCBI:SRA:name_token:Q = 3;

// standard coordinates, unsigned decimal
const U16 NCBI:SRA:name_token:X = 4;
const U16 NCBI:SRA:name_token:Y = 5;
const U16 NCBI:SRA:name_token:T = 6;
const U16 NCBI:SRA:name_token:L = 7;

// "signed" namespace
const U16 NCBI:SRA:name_token:signed:X = 8;
const U16 NCBI:SRA:name_token:signed:Y = 9;
const U16 NCBI:SRA:name_token:signed:T = 10;
const U16 NCBI:SRA:name_token:signed:L = 11;

// "octal" namespace
const U16 NCBI:SRA:name_token:octal:X = 12;
const U16 NCBI:SRA:name_token:octal:Y = 13;
const U16 NCBI:SRA:name_token:octal:T = 14;
const U16 NCBI:SRA:name_token:octal:L = 15;

// "hex" namespace, upper-case variant
const U16 NCBI:SRA:name_token:hex:upper:X = 16;
const U16 NCBI:SRA:name_token:hex:upper:Y = 17;
const U16 NCBI:SRA:name_token:hex:upper:T = 18;
const U16 NCBI:SRA:name_token:hex:upper:L = 19;

// "hex" namespace, lower-case variant
const U16 NCBI:SRA:name_token:hex:lower:X = 20;
const U16 NCBI:SRA:name_token:hex:lower:Y = 21;
const U16 NCBI:SRA:name_token:hex:lower:T = 22;
const U16 NCBI:SRA:name_token:hex:lower:L = 23;


/* token symbols
 *  when a name matches some pattern and tokens are recognized,
 *  the tokens are extracted from the name and sent to individual
 *  columns, and replaced with the symbols below to create a
 *  formatted name.
 */
// combined 454-specific coordinate
const ascii NCBI:SRA:name_symbol:Q = '$Q';

// standard ( unsigned decimal ) coordinates
const ascii NCBI:SRA:name_symbol:X = '$X';
const ascii NCBI:SRA:name_symbol:Y = '$Y';
const ascii NCBI:SRA:name_symbol:T = '$T';
const ascii NCBI:SRA:name_symbol:L = '$L';

// octal coordinates use '$a'..'$d'
const ascii NCBI:SRA:name_symbol:octal:X = '$a';
const ascii NCBI:SRA:name_symbol:octal:Y = '$b';
const ascii NCBI:SRA:name_symbol:octal:T = '$c';
const ascii NCBI:SRA:name_symbol:octal:L = '$d';

// upper-case hex coordinates use '$e'..'$h'
const ascii NCBI:SRA:name_symbol:hex:upper:X = '$e';
const ascii NCBI:SRA:name_symbol:hex:upper:Y = '$f';
const ascii NCBI:SRA:name_symbol:hex:upper:T = '$g';
const ascii NCBI:SRA:name_symbol:hex:upper:L = '$h';

// lower-case hex coordinates use the lower-case coordinate letter
const ascii NCBI:SRA:name_symbol:hex:lower:X = '$x';
const ascii NCBI:SRA:name_symbol:hex:lower:Y = '$y';
const ascii NCBI:SRA:name_symbol:hex:lower:T = '$t';
const ascii NCBI:SRA:name_symbol:hex:lower:L = '$l';

// NOTE(review): no symbols are declared here for the
// NCBI:SRA:name_token:signed:* token ids - confirm this is intentional.


/*--------------------------------------------------------------------------
 * functions
 */

/* extract_spot_name
 *  supplies the data written to the .SPOT_NAME column
 *
 *  output per row:
 *   - token is NCBI:SRA:name_token:unrecognized : the entire spot name
 *   - any other token                           : an empty row
 *
 *  "name" [ DATA ] - raw spot names from NAME column
 *
 *  "tok" [ DATA ] - delimiting tokens produced by sub-table
 */
function ascii NCBI:SRA:extract_spot_name #1
    ( ascii name, NCBI:SRA:spot_name_token tok );


/* extract_name_fmt
 *  supplies the data written to the .NAME_FMT column and/or
 *  updates the text index named by "idx"
 *
 *  per-row behavior:
 *
 *   1. on NCBI:SRA:name_token:unrecognized the output row is empty
 *      and nothing further is done.
 *
 *   2. otherwise a temporary "name_fmt" string is created from the
 *      name row.
 *
 *   3. an insert of "name_fmt" into the indicated text index
 *      ( normally 'skey' ) is attempted. if it succeeds, i.e. "name_fmt"
 *      becomes associated with a row_id, the output row is empty.
 *
 *   4. if the insert fails because the key already exists, an attempt
 *      is made to extend the id range of the rows associated with it.
 *      whether this works depends upon the type of index:
 *       - if the existing row range for "name_fmt" is n..m where
 *         m = row_id - 1, the range can be extended to n..row_id
 *       - if the index supports discontiguous id ranges, the update
 *         also succeeds
 *      upon any successful index update the output row is empty.
 *
 *   5. if "name_fmt" could neither be inserted nor have its existing
 *      id range updated, the output row is "name_fmt" itself.
 *
 *  "idx" [ CONST ] - name of the text index to insert into
 *
 *  "name" [ DATA ] - raw spot names from NAME column
 *
 *  "tok" [ DATA ] - delimiting tokens produced by sub-table
 */
function ascii NCBI:SRA:extract_name_fmt #1 < ascii idx >
    ( ascii name, NCBI:SRA:spot_name_token tok );


/* extract_name_coord
 *  supplies the data for .X and .Y and possibly other columns
 *
 *  output per row:
 *   - no token matches the "coord" constant : an empty row
 *   - otherwise                             : the binary coordinate value
 *
 *  only a single value is output per row, so if multiple tokens match
 *  the criteria, all of their values must be equivalent.
 *
 *  "coord" [ CONST ] - either NCBI:SRA:name_token:X or NCBI:SRA:name_token:Y
 *  both of these values also match the token NCBI:SRA:name_token:Q and extract
 *  contents appropriately.
 *  NOTE(review): NCBI:SRA:tbl:skeyname #3.0.1 in this file also passes
 *  NCBI:SRA:name_token:T and NCBI:SRA:name_token:L - confirm that these
 *  are supported values and extend the description above if so.
 *
 *  "name" [ DATA ] - raw spot names from NAME column
 *
 *  "tok" [ DATA ] - delimiting tokens produced by sub-table
 */
function INSDC:coord:val NCBI:SRA:extract_name_coord #1 < U16 coord >
    ( ascii name, NCBI:SRA:spot_name_token tok );


/* lookup
 *  produces INSDC:SRA:spot_ids_found
 *
 *  "index_name" [ CONST ] - tables in this file always pass 'skey'
 *
 *  "query_by_name" [ CONST ] - tables in this file always pass
 *  'QUERY_BY_NAME'
 *
 *  "name_fmt_version" [ CONST ] - tables in this file pass 0, 1 or 2
 *  NOTE(review): the meaning of each value is not described here - confirm
 *  against the implementation.
 *
 *  "name_prefix" [ DATA, OPTIONAL ] - tables in this file call the
 *  function both with "out_slx_prefix" and with no argument at all
 */
function INSDC:SRA:spot_ids_found
    NCBI:SRA:lookup #1.0 < ascii index_name, ascii query_by_name, U8 name_fmt_version >
        ( * ascii name_prefix );


/*--------------------------------------------------------------------------
 * spotcoord
 *  spot coordinate table implementation
 */
table NCBI:SRA:tbl:spotcoord #1 = INSDC:SRA:tbl:spotcoord #1
{
    // read side: every out_*_coord production is taken directly from the
    // physical column of the same letter

    // X and Y stored as I32
    INSDC:coord:val out_x_coord = .X;
    INSDC:coord:val out_y_coord = .Y;

    // T and L are usually present but optional
    INSDC:coord:val out_t_coord = .T;
    INSDC:coord:val out_l_coord = .L;

    // write side: each physical column lists two alternative sources,
    //  in_<c>_coord      - not defined in this table
    //                      ( presumably inherited from the parent - TODO confirm )
    //  in_name_<c>_coord - not defined in this table; within this file it is
    //                      defined by NCBI:SRA:tbl:skeyname #3.0.1 from the
    //                      tokenized spot name
    // .X, .Y, .T and .L get either empty coordinate or proper coordinate
    physical column < INSDC:coord:val > izip_encoding .X
        = in_x_coord
        | in_name_x_coord;
    physical column < INSDC:coord:val > izip_encoding .Y
        = in_y_coord
        | in_name_y_coord;
    physical column < INSDC:coord:val > izip_encoding .T
        = in_t_coord
        | in_name_t_coord;
    physical column < INSDC:coord:val > izip_encoding .L
        = in_l_coord
        | in_name_l_coord;
};


/*--------------------------------------------------------------------------
 * skeyname
 *  spot name table implementation built upon prefix-tree skey index
 *
 * v1 - maintains a 1->1 key=>spot_id relationship
 *      with unique constraint on key. it does NOT
 *      implement name_fmt or x_coord or y_coord.
 *
 * v2 - maintains a 1->1 key=>spot_id-range relationship
 *      with unique constraint on key. it does NOT
 *      implement spot_name. X and Y are stored using
 *      16-bit unsigned quantities.
 *
 * v3 - maintains a flexible naming approach
 *      retrieves name directly from column if so stored
 *      synthesizes name from name_fmt, X and Y otherwise
 *      name_fmt is either retrieved directly from column
 *      or from skey index. X and Y are stored as 32-bit
 *      signed quantities.
 *
 * history:
 *  1.0.1 - explicitly account for spotname #1.0.1 ancestry
 *  2.0.1 - explicitly account for spotname #1.0.1 ancestry
 *  3.0.1 - moved .X and .Y to spotcoord table
 */
table NCBI:SRA:tbl:skeyname #1.0.1 = INSDC:SRA:tbl:spotname #1.0.1
{
    // read the skey entry
    //  projects the key text out of the 'skey' index
    ascii out_skey = ( ascii ) idx:text:project #1.0 < 'skey' > ();

    // spot_name
    //  two alternatives: "rewritten_spot_name" ( not defined in this
    //  table, see the list at the bottom ) or the raw skey entry
    ascii out_spot_name
        = rewritten_spot_name
        | out_skey;

    // search skey entry
    //  two alternatives: a lookup given "out_slx_prefix" with
    //  name_fmt_version 1, or a lookup given no prefix with
    //  name_fmt_version 0
    INSDC:SRA:spot_ids_found spot_ids_found
        = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 1 > ( out_slx_prefix )
        | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 0 > ();


	/* INSDC:SRA:tbl:spotname inherited productions
	 *  out_x_coord
	 *  out_y_coord
	 *  out_name_fmt
	 */

	/* NCBI:SRA:tbl:skeyname productions
	 *  ( referenced above but not defined in this table )
	 *  out_slx_prefix
	 *  rewritten_spot_name
	 */
};

table NCBI:SRA:tbl:skeyname_nocol #2.0.1 = INSDC:SRA:tbl:spotname #1.0.1
{
    // name_fmt
    //  perform reverse lookup through index to get key
    ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ();

    // search skey entry
    //  two alternatives, both with name_fmt_version 2: a lookup given
    //  "out_slx_prefix", or a lookup given no prefix
    INSDC:SRA:spot_ids_found spot_ids_found
        = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ( out_slx_prefix )
        | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();

    // X and Y stored as U16
    //  the physical columns .X and .Y are not declared in this table;
    //  whatever is stored there is cast to INSDC:coord:val on read.
    //  NOTE(review): NCBI:SRA:tbl:skeyname #2.0.1 in this file declares
    //  .X and .Y with type INSDC:coord:val rather than U16 - confirm
    //  which stored type this comment refers to.
    INSDC:coord:val out_x_coord = cast ( .X );
    INSDC:coord:val out_y_coord = cast ( .Y );


	/* NCBI:SRA:tbl:skeyname_nocol virtual productions
	 *  out_slx_prefix
	 */
};

table NCBI:SRA:tbl:skeyname #2.0.1 = NCBI:SRA:tbl:skeyname_nocol #2.0.1
{
    // adds the physical .X and .Y columns that the parent table reads,
    // populating each from the tokenized spot name.
    //
    // in_spot_name_tok comes from a platform-specific tokenizer
    // and must be of type 'NCBI:SRA:spot_name_token'
    //
    // NOTE(review): .X names its encoding as "izip_encoding #1" while .Y
    // uses "izip_encoding" without a version - confirm the difference is
    // intentional.
    physical column < INSDC:coord:val > izip_encoding #1 .X
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
    physical column < INSDC:coord:val > izip_encoding .Y
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );

	/* NCBI:SRA:tbl:skeyname_nocol inherited virtual productions
	 *  out_slx_prefix
	 */

	/* NCBI:SRA:tbl:skeyname virtual productions
	 *  in_spot_name_tok
	 */
};

table NCBI:SRA:tbl:skeyname #3.0.1 = INSDC:SRA:tbl:spotname #1.0.1, NCBI:SRA:tbl:spotcoord #1
{
    // spot_name
    //  retrieve from hard column
    ascii out_spot_name = .SPOT_NAME;

    // name_fmt
    //  retrieve from hard column or reverse lookup through index
    ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ( .NAME_FMT );

    // search skey entry
    //  single form only: no name prefix argument, name_fmt_version 2
    INSDC:SRA:spot_ids_found  spot_ids_found
        = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();


    /* encoding rules
     *  the sub-table will provide a platform-specific parser that
     *  produces as its output a series of NCBI:SRA:spot_name_token
     *  for each input row in the virtual production "in_spot_name_tok"
     *
     *  the tokenizer will look for X, Y or Q (combined) coordinates
     *  within the spot name and issue tokens when found, or in the
     *  case that none are found, an "unrecognized" token is issued.
     *
     *  the tokens are then processed here by common rules
     */

    // .SPOT_NAME gets either empty strings or unrecognized strings
    physical column < ascii > zip_encoding .SPOT_NAME
        = NCBI:SRA:extract_spot_name ( NAME, in_spot_name_tok );

    // .NAME_FMT gets either empty strings or unindexed but recognized strings
    physical column < ascii > zip_encoding .NAME_FMT
        = NCBI:SRA:extract_name_fmt < 'skey' > ( NAME, in_spot_name_tok );

    // .X, .Y, .T and .L get either empty coordinate or proper coordinate
    //  these in_name_*_coord productions are the second alternative
    //  listed for the physical columns of NCBI:SRA:tbl:spotcoord #1
    INSDC:coord:val in_name_x_coord
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
    INSDC:coord:val in_name_y_coord
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );
    INSDC:coord:val in_name_t_coord
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( NAME, in_spot_name_tok );
    INSDC:coord:val in_name_l_coord
        = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( NAME, in_spot_name_tok );


	/* NCBI:SRA:tbl:skeyname virtual productions
	 *  in_spot_name_tok
	 */
};
author	charles_s_test
date	Mon, 27 Nov 2017 11:21:07 -0500
parents
children