seqsero2: libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema comparison

comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty

author	charles_s_test
date	Mon, 27 Nov 2017 11:21:07 -0500
parents
children

comparison

equal deleted inserted replaced

-:0d65b71ff8df
+:38ad1130d077
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE
+*               National Center for Biotechnology Information
+*
+*  This software/database is a "United States Government Work" under the
+*  terms of the United States Copyright Act.  It was written as part of
+*  the author's official duties as a United States Government employee and
+*  thus cannot be copyrighted.  This software/database is freely available
+*  to the public for use. The National Library of Medicine and the U.S.
+*  Government have not placed any restriction on its use or reproduction.
+*
+*  Although all reasonable efforts have been taken to ensure the accuracy
+*  and reliability of the software and data, the NLM and the U.S.
+*  Government do not and cannot warrant the performance or results that
+*  may be obtained by using this software or data. The NLM and the U.S.
+*  Government disclaim all warranties, express or implied, including
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.
+*
+*  Please cite the author in any work or product based on this material.
+*
+* ===========================================================================
+*
+*/
+/*==========================================================================
+* NCBI Sequence Read Archive schema
+*/
+version 1;
+include 'vdb/vdb.vschema';
+include 'insdc/sra.vschema';
+/*--------------------------------------------------------------------------
+* types
+*/
+/* spot_name_token
+*  a vector describing tokens recognized within a spot name.
+*  declared as an alias of text:token.
+*
+* COMPONENTS:
+*  0 - token id ( presumably one of the NCBI:SRA:name_token:*
+*      constants declared in this file - confirm )
+*  1 - token starting coordinate
+*  2 - token length
+*/
+alias text:token NCBI:SRA:spot_name_token;
+/* token values
+*
+*  tokens are produced by a schema-specific tokenizer function
+*  this function is purposely abstract because it may rely upon
+*  whatever information it needs to perform its task. the only
+*  requirement is that it produce these tokens as its output.
+*
+*  an empty name input must produce no tokens. in this case,
+*  there is no name to tokenize or data to produce.
+*
+*  a non-empty name must produce 1 or more tokens of output.
+*  all tokens must be ordered by starting character position.
+*
+*  if a name does not conform to any pattern recognized by the
+*  tokenizer, then the tokenizer emits a single token of "unrecognized".
+*
+*  if a name conforms to some pattern but does not have any
+*  substitution tokens, the tokenizer emits a single token of "recognized".
+*
+*  if a name may be tokenized, then the resulting tokens should
+*  describe only the portions of the string that should be removed
+*  from the name, e.g. "X" or "Y".
+*
+*  the standard coordinates "X", "Y", "T" and "L" are given in unsigned
+*  decimal. alternate representations are contained within their
+*  respective namespaces: "signed", "octal", "hex:upper" and "hex:lower".
+*
+*  the special coordinate "Q" represents the 454-specific encoding
+*  of X and Y into base-36, where the formula for Q is:
+*    Q = 4096 * X + Y
+*  and ASCII encoding:
+*    0..25 => "A-Z", 26..35 => "0-9"
+*
+*  token id layout ( all ids are U16 ):
+*     1..2   - status tokens: unrecognized, recognized
+*     3      - Q
+*     4..7   - X, Y, T, L
+*     8..11  - signed:X, signed:Y, signed:T, signed:L
+*    12..15  - octal:X, octal:Y, octal:T, octal:L
+*    16..19  - hex:upper:X, hex:upper:Y, hex:upper:T, hex:upper:L
+*    20..23  - hex:lower:X, hex:lower:Y, hex:lower:T, hex:lower:L
+*/
+const U16 NCBI:SRA:name_token:unrecognized =  1;
+const U16 NCBI:SRA:name_token:recognized   =  2;
+const U16 NCBI:SRA:name_token:Q            =  3;
+const U16 NCBI:SRA:name_token:X            =  4;
+const U16 NCBI:SRA:name_token:Y            =  5;
+const U16 NCBI:SRA:name_token:T            =  6;
+const U16 NCBI:SRA:name_token:L            =  7;
+const U16 NCBI:SRA:name_token:signed:X     =  8;
+const U16 NCBI:SRA:name_token:signed:Y     =  9;
+const U16 NCBI:SRA:name_token:signed:T     = 10;
+const U16 NCBI:SRA:name_token:signed:L     = 11;
+const U16 NCBI:SRA:name_token:octal:X      = 12;
+const U16 NCBI:SRA:name_token:octal:Y      = 13;
+const U16 NCBI:SRA:name_token:octal:T      = 14;
+const U16 NCBI:SRA:name_token:octal:L      = 15;
+const U16 NCBI:SRA:name_token:hex:upper:X  = 16;
+const U16 NCBI:SRA:name_token:hex:upper:Y  = 17;
+const U16 NCBI:SRA:name_token:hex:upper:T  = 18;
+const U16 NCBI:SRA:name_token:hex:upper:L  = 19;
+const U16 NCBI:SRA:name_token:hex:lower:X  = 20;
+const U16 NCBI:SRA:name_token:hex:lower:Y  = 21;
+const U16 NCBI:SRA:name_token:hex:lower:T  = 22;
+const U16 NCBI:SRA:name_token:hex:lower:L  = 23;
+/* token symbols
+*  when a name matches some pattern and tokens are recognized,
+*  the tokens are extracted from the name and sent to individual
+*  columns, and replaced with the symbols below to create a
+*  formatted name.
+*
+*  every symbol is a two-character ascii string: '$' followed by a
+*  single letter. the plain coordinates use the upper-case letters
+*  Q, X, Y, T, L; octal uses a..d; hex:upper uses e..h; hex:lower
+*  uses x, y, t, l.
+*
+*  NOTE(review): name_token ids are declared for the "signed"
+*  namespace, but no matching name_symbol constants appear here -
+*  confirm whether that omission is intentional.
+*/
+const ascii NCBI:SRA:name_symbol:Q           = '$Q';
+const ascii NCBI:SRA:name_symbol:X           = '$X';
+const ascii NCBI:SRA:name_symbol:Y           = '$Y';
+const ascii NCBI:SRA:name_symbol:T           = '$T';
+const ascii NCBI:SRA:name_symbol:L           = '$L';
+const ascii NCBI:SRA:name_symbol:octal:X     = '$a';
+const ascii NCBI:SRA:name_symbol:octal:Y     = '$b';
+const ascii NCBI:SRA:name_symbol:octal:T     = '$c';
+const ascii NCBI:SRA:name_symbol:octal:L     = '$d';
+const ascii NCBI:SRA:name_symbol:hex:upper:X = '$e';
+const ascii NCBI:SRA:name_symbol:hex:upper:Y = '$f';
+const ascii NCBI:SRA:name_symbol:hex:upper:T = '$g';
+const ascii NCBI:SRA:name_symbol:hex:upper:L = '$h';
+const ascii NCBI:SRA:name_symbol:hex:lower:X = '$x';
+const ascii NCBI:SRA:name_symbol:hex:lower:Y = '$y';
+const ascii NCBI:SRA:name_symbol:hex:lower:T = '$t';
+const ascii NCBI:SRA:name_symbol:hex:lower:L = '$l';
+/*--------------------------------------------------------------------------
+* functions
+*/
+/* extract_spot_name
+*  generates input to .SPOT_NAME column
+*
+*  on NCBI:SRA:name_token:unrecognized, produces the entire spot name row
+*  otherwise, produces an empty row
+*
+*  "name" [ DATA ] - raw spot names from NAME column
+*
+*  "tok" [ DATA ] - delimiting tokens produced by sub-table,
+*  of type NCBI:SRA:spot_name_token
+*
+*  returns an ascii row
+*/
+function ascii
+NCBI:SRA:extract_spot_name #1 ( ascii name, NCBI:SRA:spot_name_token tok );
+/* extract_name_fmt
+*  generates input to .NAME_FMT column and/or updates skey index
+*
+*  on NCBI:SRA:name_token:unrecognized, produces an empty row
+*  otherwise, it creates a temporary "name_fmt" string from name row
+*
+*  an attempt is made to insert name_fmt into indicated text index
+*  ( normally 'skey' ). if the insert succeeds, i.e. associates "name_fmt"
+*  with a row_id, then the output for the row is empty.
+*
+*  if the insert fails due to key duplication, an attempt is made to
+*  extend the id range of associated rows. depending upon the type of index,
+*  this may succeed or fail, e.g. if the existing row range for "name_fmt" is
+*  n..m where m = row_id - 1, the range can be extended to n..row_id and
+*  the update succeeds. if the index supports discontiguous id ranges, the
+*  update will also succeed. upon any success updating the index, the output
+*  row will be empty.
+*
+*  finally, if the temporary "name_fmt" cannot be inserted into the index
+*  nor the existing id range updated, the output for the row will be "name_fmt".
+*
+*  "idx" [ CONST ] - name of the text index to insert into or update
+*  ( the only call in this file passes 'skey' )
+*
+*  "name" [ DATA ] - raw spot names from NAME column
+*
+*  "tok" [ DATA ] - delimiting tokens produced by sub-table
+*/
+function ascii
+NCBI:SRA:extract_name_fmt #1 < ascii idx > ( ascii name, NCBI:SRA:spot_name_token tok );
+/* extract_name_coord
+*  generates inputs to .X and .Y and possibly other columns
+*
+*  if no tokens match the "coord" constant, produces an empty row
+*  otherwise, produces binary coordinate value
+*  if multiple tokens match criteria, all values must be equivalent
+*  because only a single value will be output per row
+*
+*  "coord" [ CONST ] - a token id such as NCBI:SRA:name_token:X or
+*  NCBI:SRA:name_token:Y. both of these values also match the token
+*  NCBI:SRA:name_token:Q and extract contents appropriately.
+*  NCBI:SRA:tbl:skeyname #3.0.1 in this file also passes
+*  NCBI:SRA:name_token:T and NCBI:SRA:name_token:L.
+*
+*  "name" [ DATA ] - raw spot names from NAME column
+*
+*  "tok" [ DATA ] - delimiting tokens produced by sub-table
+*/
+function INSDC:coord:val
+NCBI:SRA:extract_name_coord #1 < U16 coord > ( ascii name, NCBI:SRA:spot_name_token tok );
+/* lookup
+*  produces INSDC:SRA:spot_ids_found; the skeyname tables in this file
+*  use it for their "spot_ids_found" production.
+*
+*  "index_name" [ CONST ] - ascii; every call in this file passes 'skey'
+*
+*  "query_by_name" [ CONST ] - ascii; every call in this file passes
+*  'QUERY_BY_NAME'
+*
+*  "name_fmt_version" [ CONST ] - U8; calls in this file pass 0, 1 or 2
+*
+*  "name_prefix" [ DATA ] - ascii; declared after "*" and omitted by
+*  several callers in this file, so presumably optional
+*
+*  NOTE(review): the meaning of each argument is not described in this
+*  file - confirm against the implementation before relying on it.
+*/
+function INSDC:SRA:spot_ids_found NCBI:SRA:lookup #1.0
+< ascii index_name, ascii query_by_name, U8 name_fmt_version > ( * ascii name_prefix );
+/*--------------------------------------------------------------------------
+* spotcoord
+*  spot coordinate table implementation
+*
+*  reading: out_x_coord, out_y_coord, out_t_coord and out_l_coord are
+*  taken directly from the physical columns .X, .Y, .T and .L.
+*
+*  writing: each physical column is assigned two alternatives,
+*  "in_?_coord | in_name_?_coord". the in_name_?_coord productions are
+*  defined by NCBI:SRA:tbl:skeyname #3.0.1 in this file; the in_?_coord
+*  productions are not defined in this file.
+*
+*  NOTE(review): presumably "|" selects the first alternative that
+*  resolves - confirm against the schema language documentation.
+*/
+table NCBI:SRA:tbl:spotcoord #1 = INSDC:SRA:tbl:spotcoord #1
+{
+// X and Y stored as I32
+INSDC:coord:val out_x_coord = .X;
+INSDC:coord:val out_y_coord = .Y;
+// T and L are usually present but optional
+INSDC:coord:val out_t_coord = .T;
+INSDC:coord:val out_l_coord = .L;
+// .X, .Y, .T and .L get either empty coordinate or proper coordinate
+physical column < INSDC:coord:val > izip_encoding .X
+= in_x_coord
+| in_name_x_coord;
+physical column < INSDC:coord:val > izip_encoding .Y
+= in_y_coord
+| in_name_y_coord;
+physical column < INSDC:coord:val > izip_encoding .T
+= in_t_coord
+| in_name_t_coord;
+physical column < INSDC:coord:val > izip_encoding .L
+= in_l_coord
+| in_name_l_coord;
+};
+/*--------------------------------------------------------------------------
+* skeyname
+*  spot name table implementation built upon prefix-tree skey index
+*
+* v1 - maintains a 1->1 key=>spot_id relationship
+*      with unique constraint on key. it does NOT
+*      implement name_fmt or x_coord or y_coord.
+*
+* v2 - maintains a 1->1 key=>spot_id-range relationship
+*      with unique constraint on key. it does NOT
+*      implement spot_name. X and Y are stored using
+*      16-bit unsigned quantities.
+*
+* v3 - maintains a flexible naming approach
+*      retrieves name directly from column if so stored
+*      synthesizes name from name_fmt, X and Y otherwise
+*      name_fmt is either retrieved directly from column
+*      or from skey index. X and Y are stored as 32-bit
+*      signed quantities.
+*
+* history:
+*  1.0.1 - explicitly account for spotname #1.0.1 ancestry
+*  2.0.1 - explicitly account for spotname #1.0.1 ancestry
+*  3.0.1 - moved .X and .Y to spotcoord table
+*/
+table NCBI:SRA:tbl:skeyname #1.0.1 = INSDC:SRA:tbl:spotname #1.0.1
+{
+// read the skey entry through the 'skey' text index
+ascii out_skey = ( ascii ) idx:text:project #1.0 < 'skey' > ();
+// spot_name: rewritten_spot_name first, out_skey as the alternative
+ascii out_spot_name
+= rewritten_spot_name
+| out_skey;
+// search skey entry: name_fmt_version 1 with out_slx_prefix first, then version 0 with no prefix argument
+INSDC:SRA:spot_ids_found spot_ids_found
+= ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 1 > ( out_slx_prefix )
+| ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 0 > ();
+	/* INSDC:SRA:tbl:spotname inherited productions
+	 *  out_x_coord
+	 *  out_y_coord
+	 *  out_name_fmt
+	 */
+	/* NCBI:SRA:tbl:skeyname productions
+	 *  out_slx_prefix
+	 *  rewritten_spot_name
+	 */
+};
+table NCBI:SRA:tbl:skeyname_nocol #2.0.1 = INSDC:SRA:tbl:spotname #1.0.1
+{
+// name_fmt
+//  perform reverse lookup through index to get key
+ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ();
+// search skey entry: both alternatives pass name_fmt_version 2 and differ only in whether out_slx_prefix is supplied
+INSDC:SRA:spot_ids_found spot_ids_found
+= ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ( out_slx_prefix )
+| ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();
+// X and Y stored as U16, cast to the INSDC:coord:val production type on read
+INSDC:coord:val out_x_coord = cast ( .X );
+INSDC:coord:val out_y_coord = cast ( .Y );
+	/* NCBI:SRA:tbl:skeyname_nocol virtual productions
+	 *  out_slx_prefix
+	 */
+};
+table NCBI:SRA:tbl:skeyname #2.0.1 = NCBI:SRA:tbl:skeyname_nocol #2.0.1
+{
+// in_spot_name_tok comes from a platform-specific tokenizer
+// and must be of type 'NCBI:SRA:spot_name_token'
+// NOTE(review): .X names "izip_encoding #1" while .Y names "izip_encoding" with no version - confirm the difference is intended
+// NOTE(review): both columns are declared < INSDC:coord:val > although the parent table comments X and Y as stored as U16 - confirm
+physical column < INSDC:coord:val > izip_encoding #1 .X
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
+physical column < INSDC:coord:val > izip_encoding .Y
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );
+	/* NCBI:SRA:tbl:skeyname_nocol inherited virtual productions
+	 *  out_slx_prefix
+	 */
+	/* NCBI:SRA:tbl:skeyname virtual productions
+	 *  in_spot_name_tok
+	 */
+};
+table NCBI:SRA:tbl:skeyname #3.0.1 = INSDC:SRA:tbl:spotname #1.0.1, NCBI:SRA:tbl:spotcoord #1
+{
+// spot_name
+//  retrieve from hard column
+ascii out_spot_name = .SPOT_NAME;
+// name_fmt
+//  retrieve from hard column or reverse lookup through index
+//  ( .NAME_FMT is passed as the argument to the 'skey' index projection )
+ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ( .NAME_FMT );
+// search skey entry: a single lookup with name_fmt_version 2 and no prefix argument
+INSDC:SRA:spot_ids_found  spot_ids_found
+= ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();
+/* encoding rules
+*  the sub-table will provide a platform-specific parser that
+*  produces as its output a series of NCBI:SRA:spot_name_token
+*  for each input row in the virtual production "in_spot_name_tok"
+*
+*  the tokenizer will look for X, Y or Q (combined) coordinates
+*  within the spot name and issue tokens when found, or in the
+*  case that none are found, an "unrecognized" token is issued.
+*
+*  the tokens are then processed here by common rules
+*/
+// .SPOT_NAME gets either empty strings or unrecognized strings
+physical column < ascii > zip_encoding .SPOT_NAME
+= NCBI:SRA:extract_spot_name ( NAME, in_spot_name_tok );
+// .NAME_FMT gets either empty strings or unindexed but recognized strings
+physical column < ascii > zip_encoding .NAME_FMT
+= NCBI:SRA:extract_name_fmt < 'skey' > ( NAME, in_spot_name_tok );
+// .X, .Y, .T and .L get either empty coordinate or proper coordinate
+//  these in_name_?_coord productions are the second alternative of the
+//  physical columns declared in NCBI:SRA:tbl:spotcoord #1
+INSDC:coord:val in_name_x_coord
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
+INSDC:coord:val in_name_y_coord
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );
+INSDC:coord:val in_name_t_coord
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( NAME, in_spot_name_tok );
+INSDC:coord:val in_name_l_coord
+= NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( NAME, in_spot_name_tok );
+	/* NCBI:SRA:tbl:skeyname virtual productions
+	 *  in_spot_name_tok
+	 */
+};

Mercurial > repos > charles_s_test > seqsero2

comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema @ 3:38ad1130d077 draft