diff libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/wgs-contig.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/wgs-contig.vschema	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,695 @@
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE
+*               National Center for Biotechnology Information
+*
+*  This software/database is a "United States Government Work" under the
+*  terms of the United States Copyright Act.  It was written as part of
+*  the author's official duties as a United States Government employee and
+*  thus cannot be copyrighted.  This software/database is freely available
+*  to the public for use. The National Library of Medicine and the U.S.
+*  Government have not placed any restriction on its use or reproduction.
+*
+*  Although all reasonable efforts have been taken to ensure the accuracy
+*  and reliability of the software and data, the NLM and the U.S.
+*  Government do not and cannot warrant the performance or results that
+*  may be obtained by using this software or data. The NLM and the U.S.
+*  Government disclaim all warranties, express or implied, including
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.
+*
+*  Please cite the author in any work or product based on this material.
+*
+* ===========================================================================
+*
+*/
+
+/*==========================================================================
+ * WGS Contig
+ */
+version 1;
+
+include 'vdb/vdb.vschema';
+include 'ncbi/ncbi.vschema';
+include 'ncbi/seq.vschema';
+include 'ncbi/spotname.vschema';
+include 'ncbi/stats.vschema';
+
+
+/*--------------------------------------------------------------------------
+ * types
+ * constants
+ */
+
+/* component_props
+ *  a signed 16-bit value describing contig or gap components of
+ *  scaffolds, or gaps in contig sequences.
+ *  Non-negative values refer to contigs ( NCBI:WGS:component:WGS is 0 )
+ *  and negative values describe gaps
+ */
+typedef I16 NCBI:WGS:component_props;
+
+/* component description
+ *  the sequencing status of the component
+ *
+ *  These typically correspond to keywords in the INSDC submission.
+ *  Current acceptable values are:
+ *    A           Active Finishing
+ *    D           Draft HTG (often phase1 and phase2 are called Draft,
+ *                whether or not they have the draft keyword).
+ *    F           Finished HTG (phase3)
+ *    G           Whole Genome Finishing
+ *    O           Other sequence (typically means no HTG keyword)
+ *    P           Pre Draft
+ *    W           WGS contig
+ *
+ *  The constants below give each keyword a numeric value:
+ *    W = 0, A = 1, D = 2, F = 3, G = 4, O = 5, P = 6
+ */
+const NCBI:WGS:component_props NCBI:WGS:component:WGS                  = 0;
+const NCBI:WGS:component_props NCBI:WGS:component:ActiveFinishing      = 1;
+const NCBI:WGS:component_props NCBI:WGS:component:DraftHTG             = 2;
+const NCBI:WGS:component_props NCBI:WGS:component:FinishedHTG          = 3;
+const NCBI:WGS:component_props NCBI:WGS:component:WholeGenomeFinishing = 4;
+const NCBI:WGS:component_props NCBI:WGS:component:OtherSequence        = 5;
+const NCBI:WGS:component_props NCBI:WGS:component:PreDraft             = 6;
+
+/* strand
+ *  specifies the orientation of the component relative to scaffold
+ *  values given allow strand to be determined as "prop / 16"
+ *  ( integer division ) yielding:
+ *    0           unknown orientation
+ *    1           plus strand
+ *    2           minus strand
+ *
+ *  NOTE(review): presumably a strand value is added to one of the
+ *  component description values ( 0..6 ) to form a single property;
+ *  confirm against the loader that writes COMPONENT_PROPS.
+ */
+const NCBI:WGS:component_props NCBI:WGS:strand:plus                    = 16;
+const NCBI:WGS:component_props NCBI:WGS:strand:minus                   = 32;
+
+
+/* gap description
+ *  These typically correspond to keywords in the INSDC submission.
+ *  Current acceptable values are:
+ *    N           gap with specified size      ( gap:known,   -1 )
+ *    U           gap of unknown size, defaulting to 100 bases.
+ *                                             ( gap:unknown, -2 )
+ */
+const NCBI:WGS:component_props NCBI:WGS:gap:known                      = -1;
+const NCBI:WGS:component_props NCBI:WGS:gap:unknown                    = -2;
+
+/* gap type
+ *  scaffold          a gap between two sequence contigs in a scaffold
+ *  contig            an unspanned gap between two sequence contigs
+ *  centromere        a gap inserted for the centromere
+ *  short_arm         a gap inserted at the start of an acrocentric chromosome
+ *  heterochromatin   a gap inserted for an especially large region of heterochromatic sequence
+ *  telomere          a gap inserted for the telomere
+ *  repeat            an unresolvable repeat
+ *
+ *  values are negative multiples of 4 ( -4 .. -28 ), leaving room for
+ *  the gap description values -1 and -2.
+ *  NOTE(review): presumably a gap type is added to a gap description
+ *  to form a single property value; confirm against the loader.
+ */
+const NCBI:WGS:component_props NCBI:WGS:gap:scaffold                   = -4;
+const NCBI:WGS:component_props NCBI:WGS:gap:contig                     = -8;
+const NCBI:WGS:component_props NCBI:WGS:gap:centromere                 = -12;
+const NCBI:WGS:component_props NCBI:WGS:gap:short_arm                  = -16;
+const NCBI:WGS:component_props NCBI:WGS:gap:heterochromatin            = -20;
+const NCBI:WGS:component_props NCBI:WGS:gap:telomere                   = -24;
+const NCBI:WGS:component_props NCBI:WGS:gap:repeat                     = -28;
+
+/* gap_linkage
+ *  a signed 32-bit value holding the linkage flag and the linkage
+ *  evidence of a gap
+ */
+typedef I32 NCBI:WGS:gap_linkage;
+
+/* gap linkage and linkage evidence
+ * There can be multiple linkage evidences or linkage with no evidence
+ *
+ *  paired-ends       paired sequences from the two ends of a DNA fragment
+ *  align_genus       alignment to a reference genome within the same genus
+ *  align_xgenus      alignment to a reference genome within another genus
+ *  align_trnscpt     alignment to a transcript from the same species
+ *  within_clone      sequence on both sides of the gap is derived from
+ *                    the same clone, but the gap is not spanned by paired-ends
+ *  clone_contig      linkage is provided by a clone contig in the tiling path
+ *  map               linkage asserted using a non-sequence based map
+ *                    such as RH, linkage, fingerprint or optical
+ *  strobe            strobe sequencing (PacBio)
+ *  unspecified
+ *  pcr               PCR
+ *
+ * each constant below is a distinct power of two, "linked" being 1;
+ * presumably they are combined as bit flags so that several evidences
+ * can be recorded in one value - confirm against the loader.
+ */
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage:linked                 = 1;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:paired_ends   = 2;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:align_genus   = 4;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:align_xgenus  = 8;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:align_trnscpt = 16;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:within_clone  = 32;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:clone_contig  = 64;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:map           = 128;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:strobe        = 256;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:unspecified   = 512;
+const NCBI:WGS:gap_linkage NCBI:WGS:gap:linkage_evidence:pcr           = 1024;
+
+/*--------------------------------------------------------------------------
+ * functions
+ */
+
+/* tokenize_nuc_accession
+ * tokenize_prot_accession
+ *  scans accession on input
+ *  tokenizes into parts, returned as text:token
+ *
+ *  "acc" [ DATA ] - accession text to be tokenized
+ *
+ *  the tables in this file extract token 0 as the accession prefix
+ *  and token 1 as the contig number text
+ */
+extern function text:token
+    NCBI:WGS:tokenize_nuc_accession #1 ( ascii acc );
+extern function text:token
+    NCBI:WGS:tokenize_prot_accession #1 ( ascii acc );
+
+// token ids for accession parts
+// NOTE(review): presumably the ids reported by the tokenizers above - confirm
+const U16 NCBI:WGS:acc_token:unrecognized =  1;
+const U16 NCBI:WGS:acc_token:prefix       =  2;
+const U16 NCBI:WGS:acc_token:contig       =  3;
+
+
+/* build_scaffold_read
+ *  assembles contigs and gaps into a single row
+ *  transcribes + strand contigs as they are,
+ *  performs reverse complement of - strand contigs,
+ *  fills gaps with stated number of N
+ *  returns INSDC:4na:bin
+ *
+ * build_scaffold_qual
+ *  assembles contig and gap qualities into a single row
+ *  contig qualities are taken as they are,
+ *  gap qualities are assigned a constant
+ *  returns INSDC:quality:phred
+ *
+ *  both functions take the same four inputs:
+ *
+ *  "component_start" [ DATA ] - starting locations on each
+ *   component or 0 for gaps. normal starting point is 0,
+ *   but offsets are supported.
+ *  NB - ONE-BASED COORDINATES
+ *  NOTE(review): "normal starting point is 0" reads oddly next to
+ *   one-based coordinates ( the parameter type is INSDC:coord:one );
+ *   confirm whether 1 is meant for contigs.
+ *
+ *  "component_len" [ DATA ] - length of contig sequence
+ *   from component_start, or length of gap, projected onto
+ *   scaffold at scaffold_start.
+ *
+ *  "component_props" [ DATA ] - see discussion of type
+ *   distinguish between contigs and gaps, indicate strand
+ *
+ *  "component_id" [ DATA ] - foreign keys into SEQUENCE table
+ *   row_len ( component_id ) == count-of-contigs ( component_props )
+ */
+extern function INSDC:4na:bin NCBI:WGS:build_scaffold_read #1
+    ( INSDC:coord:one component_start, INSDC:coord:len component_len,
+      NCBI:WGS:component_props component_props, I64 component_id );
+
+extern function INSDC:quality:phred NCBI:WGS:build_scaffold_qual #1
+    ( INSDC:coord:one component_start, INSDC:coord:len component_len,
+      NCBI:WGS:component_props component_props, I64 component_id );
+
+
+/* build_read_type
+ *  generate standard SRA read type from component properties
+ *  contigs are biological, gaps are technical
+ *  returns INSDC:SRA:xread_type
+ *
+ *  "component_props" [ DATA ] - see discussion of type
+ *   distinguish between contigs and gaps, indicate strand
+ */
+extern function INSDC:SRA:xread_type
+    NCBI:WGS:build_read_type #1 ( NCBI:WGS:component_props component_props );
+
+
+/*--------------------------------------------------------------------------
+ * nucleotide
+ *  contig sequence table
+ *  inherits from the base_space, phred_quality and stats tables listed
+ *  below and adds accession, naming, annotation and gap columns
+ */
+table NCBI:WGS:tbl:nucleotide #1.1
+    = NCBI:tbl:base_space #2.0.3
+    , NCBI:tbl:phred_quality #2.0.4
+    , NCBI:SRA:tbl:stats #1.2.0
+{
+    /* ACCESSION
+     *  [<opt-prefix>]<4-letter-prefix><2-digit-version><6-or-7-digit-contig>
+     *
+     * ACC_VERSION
+     *  reads the stored .ACC_VERSION, with the constant 1 as alternative
+     */
+    extern column ascii ACCESSION = out_accession;
+    extern column U32   ACC_VERSION = .ACC_VERSION | <U32> echo <1> ();
+
+    // input
+    ascii in_accession = ACCESSION;
+
+    // parsed input
+    text:token in_acc_token
+        = NCBI:WGS:tokenize_nuc_accession ( in_accession );
+    //  [<opt-prefix>]<4-letter-prefix><2-digit-version>
+    ascii in_acc_prefix
+        = extract_token < 0 > ( in_accession, in_acc_token );
+    // <6-or-7-digit-contig>
+    ascii in_contig_text
+        = extract_token < 1 > ( in_accession, in_acc_token );
+    // number of characters in the contig text, kept so that output
+    // can be padded back to the same width
+    U32 in_contig_len
+        = row_len ( in_contig_text );
+    // contig number converted from text
+    U64 in_contig_bin
+        = strtonum ( in_contig_text );
+
+    // physical storage
+    // the accession is stored as its parts: prefix, contig text length
+    // and contig number
+    physical column < ascii > zip_encoding .ACC_PREFIX = in_acc_prefix;
+    physical column < U32 > izip_encoding .ACC_CONTIG_LEN = in_contig_len;
+    physical column < U64 > izip_encoding .ACC_CONTIG = in_contig_bin;
+    physical column < U32 > izip_encoding .ACC_VERSION = ACC_VERSION; //needed to back-fill WGS data from ID where version may be > 1
+
+    // output
+    // each part reads its physical column first; the alternatives are
+    // the 'ACC_PREFIX' / 'ACC_CONTIG_LEN' metadata nodes and, for the
+    // contig number, the row id
+    ascii out_acc_prefix
+        = .ACC_PREFIX
+        | < ascii > meta:read < 'ACC_PREFIX', true > ()
+        ;
+    U32 out_acc_contig_len
+        = .ACC_CONTIG_LEN
+        | < U32 > meta:value < 'ACC_CONTIG_LEN', true > ()
+        ;
+    U64 out_acc_contig
+        = .ACC_CONTIG
+        | ( U64 ) row_id ()
+        ;
+    // accession = prefix followed by the contig number, zero-padded
+    // to out_acc_contig_len digits
+    ascii out_accession
+        = sprintf < "%s%0*u" > ( out_acc_prefix, out_acc_contig_len, out_acc_contig );
+
+    // read-only access to the accession parts, written with the same
+    // expressions as out_acc_prefix and out_acc_contig_len above
+    readonly column ascii ACC_PREFIX
+        = .ACC_PREFIX
+        | < ascii > meta:read < 'ACC_PREFIX', true > ()
+        ;
+    readonly column U32 ACC_CONTIG_LEN
+        = .ACC_CONTIG_LEN
+        | < U32 > meta:value < 'ACC_CONTIG_LEN', true > ()
+        ;
+
+    /* CONTIG_NAME
+     *  principal name
+     *  stored through the 'contig_name' text index
+     */
+    extern column utf8 CONTIG_NAME
+        = idx:text:project #1.0 < 'contig_name' > ( .CONTIG_NAME );
+
+    physical column < utf8 > zip_encoding .CONTIG_NAME
+        = idx:text:insert #1.0 < 'contig_name' > ( CONTIG_NAME );
+
+    // ascii form of the contig name, used in SEQ_ID_GNL
+    ascii out_contig_name = cast ( CONTIG_NAME );
+
+    // NB - this is only useful if CONTIG_NAME is unique
+    // or if clustered by CONTIG_NAME
+    readonly column vdb:row_id_range CONTIG_NAME_ROW_RANGE
+        = idx:text:lookup #1.0 < 'contig_name', 'NAME_QUERY' > ();
+
+    /* EXTRA_SEQIDS
+     *  pipe-separated list of additional names
+     */
+    extern column < ascii > zip_encoding EXTRA_SEQIDS;
+
+    /* TITLE
+     *  used as the second part of the spot name
+     */
+    extern column < ascii > zip_encoding TITLE;
+
+    /* GI
+     *  gi is indexed in a parallel table
+     */
+    extern column < NCBI:gi > izip_encoding GI;
+
+    /* TAXID
+     *  taxonomy id
+     */
+    extern column < NCBI:taxid > izip_encoding TAXID;
+
+    /* GB_STATE
+     *  genbank state
+     */
+    extern column < NCBI:gb_state > izip_encoding GB_STATE;
+
+    /* DESCR
+     *  ASN.1 description
+     */
+    extern column < NCBI:asn:binary > zip_encoding DESCR;
+
+    /* ANNOT
+     *  ASN.1 annotation
+     */
+    extern column < NCBI:asn:binary > zip_encoding ANNOT;
+
+    /* GAP_START
+     *  Starting position of a gap ( zero-based coordinate type )
+     */
+    extern column < INSDC:coord:zero > izip_encoding GAP_START;
+
+    /* GAP_LEN
+     *  Length of a gap
+     */
+    extern column < INSDC:coord:len > izip_encoding GAP_LEN;
+
+    /* GAP_PROPS
+     *  See description of type
+     */
+    extern column < NCBI:WGS:component_props > zip_encoding GAP_PROPS;
+
+    /* GAP_LINKAGE
+     *  See description of type
+     */
+    extern column < NCBI:WGS:gap_linkage> zip_encoding GAP_LINKAGE;
+
+
+    // seq-id fragments
+    //  gi  : "gi|<gi>"
+    //  gb  : "gb|<accession>.<version>|", with version 1 as alternative
+    //  gnl : "gnl|WGS:<prefix>|<contig name>", with empty text as alternative
+    // NOTE(review): out_seqid_gnl reads .ACC_PREFIX directly rather than
+    // out_acc_prefix, so the metadata alternative is not consulted here;
+    // confirm this is intended.
+    ascii out_seqid_gi = sprintf < "gi|%u" > ( .GI );
+    ascii out_seqid_gb = sprintf < "gb|%s.%u|" > ( out_accession, ACC_VERSION )
+                       | sprintf < "gb|%s.1|" > ( out_accession );
+    ascii out_seqid_gnl = sprintf < "gnl|WGS:%s|%s" > (.ACC_PREFIX, out_contig_name ) | <ascii> echo < '' > ();
+
+
+    /* outputs to spotname */
+    // "<gi part>|<gb part>", with the gb part alone as alternative
+    ascii out_seqid_name
+        = sprintf < "%s|%s" > ( out_seqid_gi , out_seqid_gb )
+		| sprintf < "%s" > ( out_seqid_gb );
+
+    // spot name is the seq-id followed by a space and the title
+    ascii out_spot_name = sprintf < "%s %s" > ( out_seqid_name, .TITLE );
+
+    readonly column ascii SEQ_ID = out_seqid_name;
+    readonly column ascii SEQ_ID_GNL = out_seqid_gnl;
+
+    /* outputs to spotdesc */
+    // out_2na_bin and in_2na_bin are not defined in this table;
+    // presumably they come from the parent tables - confirm.
+    // read length and trim length are both the full row length,
+    // read start and trim start are both 0
+    // INSDC:coord:len in_read_len  = (INSDC:coord:len) row_len ( in_2na_bin );
+    INSDC:coord:len  out_read_len = (INSDC:coord:len) row_len ( out_2na_bin );
+    INSDC:coord:len  trim_len = (INSDC:coord:len) row_len ( out_2na_bin );
+    INSDC:coord:zero out_read_start = <INSDC:coord:zero> echo < 0 > ();
+    INSDC:coord:zero trim_start = <INSDC:coord:zero> echo < 0 > ();
+    INSDC:SRA:read_filter out_rd_filter = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ();
+    INSDC:SRA:xread_type  out_read_type = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ();
+    // help trigger statistics
+    INSDC:SRA:xread_type  _alt_in_read_type = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ();
+    INSDC:coord:len       _alt_in_read_len  = (INSDC:coord:len) row_len ( in_2na_bin );
+
+    // constant label "contig"; label length 6 is the length of that text
+    ascii out_label = < ascii > echo < "contig" > ();
+    INSDC:coord:len out_label_len = < INSDC:coord:len > echo < 6 > ();
+    INSDC:coord:zero out_label_start = < INSDC:coord:zero > echo < 0 > ();
+
+    // platform is always reported as undefined
+    INSDC:SRA:platform_id out_platform = < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > ();
+};
+
+
+/*--------------------------------------------------------------------------
+ * protein
+ *  contig
+ *  protein table: inherits from NCBI:tbl:protein and adds accession,
+ *  title, gi, genbank state and ASN.1 columns
+ */
+table NCBI:WGS:tbl:protein #1
+    = NCBI:tbl:protein #1.0.0
+{
+    /* ACCESSION
+     *  [<opt-prefix>]<4-letter-prefix><2-digit-version><6-or-7-digit-contig>
+     */
+    extern column ascii ACCESSION = out_accession;
+
+    // input
+    ascii in_accession = ACCESSION;
+
+    // parsed input
+    text:token in_acc_token
+        = NCBI:WGS:tokenize_prot_accession ( in_accession );
+    //  [<opt-prefix>]<4-letter-prefix><2-digit-version>
+    ascii in_acc_prefix
+        = extract_token < 0 > ( in_accession, in_acc_token );
+    // <6-or-7-digit-contig>
+    ascii in_contig_text
+        = extract_token < 1 > ( in_accession, in_acc_token );
+    // number of characters in the contig text, kept so that output
+    // can be padded back to the same width
+    U32 in_contig_len
+        = row_len ( in_contig_text );
+    // contig number converted from text
+    // declared U64 to match the physical column .ACC_CONTIG below and
+    // the same production in NCBI:WGS:tbl:nucleotide
+    U64 in_contig_bin
+        = strtonum ( in_contig_text );
+
+    // physical storage
+    // the accession is stored as its parts: prefix, contig text length
+    // and contig number
+    physical column < ascii > zip_encoding .ACC_PREFIX = in_acc_prefix;
+    physical column < U32 > izip_encoding .ACC_CONTIG_LEN = in_contig_len;
+    physical column < U64 > izip_encoding .ACC_CONTIG = in_contig_bin;
+
+    // output
+    // each part reads its physical column first; the alternatives are
+    // the 'ACC_PREFIX' / 'ACC_CONTIG_LEN' metadata nodes and, for the
+    // contig number, the row id
+    ascii out_acc_prefix
+        = .ACC_PREFIX
+        | < ascii > meta:read < 'ACC_PREFIX', true > ()
+        ;
+    U32 out_acc_contig_len
+        = .ACC_CONTIG_LEN
+        | < U32 > meta:value < 'ACC_CONTIG_LEN', true > ()
+        ;
+    U64 out_acc_contig
+        = .ACC_CONTIG
+        | ( U64 ) row_id ()
+        ;
+    // accession = prefix followed by the contig number, zero-padded
+    // to out_acc_contig_len digits
+    ascii out_accession
+        = sprintf < "%s%0*u" > ( out_acc_prefix, out_acc_contig_len, out_acc_contig );
+
+    /* TITLE
+     *  used as the second part of the spot name
+     */
+    extern column < ascii > zip_encoding TITLE;
+
+    /* GI
+     *  gi is indexed in a parallel table
+     */
+    extern column < NCBI:gi > izip_encoding GI;
+
+    /* GB_STATE
+     *  genbank state
+     */
+    extern column < NCBI:gb_state > izip_encoding GB_STATE;
+
+    /* DESCR
+     *  ASN.1 description
+     */
+    extern column < NCBI:asn:binary > zip_encoding DESCR;
+
+    /* ANNOT
+     *  ASN.1 annotation
+     */
+    extern column < NCBI:asn:binary > zip_encoding ANNOT;
+
+    /* outputs to spotname */
+    // the seq-id format is still the placeholder text "TBD"; it has no
+    // conversion specifiers, so the two arguments are not rendered
+    ascii out_seqid_name = sprintf < "TBD" > ( .ACC_PREFIX, out_accession );
+    // spot name is the seq-id followed by a space and the title
+    ascii out_spot_name = sprintf < "%s %s" > ( out_seqid_name, .TITLE );
+
+    /* TBD
+     *  need to create an extension to NCBI:tbl:protein
+     *  that satisfies fastq-dump requirements for READ and QUALITY
+     */
+};
+
+
+/*--------------------------------------------------------------------------
+ * gi_idx
+ *  gi is row-id
+ *  each row holds the row-ids of the matching nucleotide and
+ *  protein table entries
+ */
+table NCBI:WGS:tbl:gi_idx #1
+{
+    /* NUC_ROW_ID
+     *  row-id in nucleotide table
+     */
+    extern column < I64 > izip_encoding NUC_ROW_ID;
+
+    /* PROT_ROW_ID
+     *  row-id in protein table
+     */
+    extern column < I64 > izip_encoding PROT_ROW_ID;
+};
+
+
+/*--------------------------------------------------------------------------
+ * scaffold
+ *  records AGP data
+ *  one row per scaffold; the COMPONENT_* columns describe its
+ *  contig and gap components
+ */
+table NCBI:WGS:tbl:scaffold #1
+{
+    /* SCAFFOLD_NAME
+     *  This is the identifier for the object being assembled.
+     *  This can be a chromosome, scaffold or contig.
+     *  If an accession.version identifier is not used to describe
+     *  the object, the naming convention is to precede chromosome
+     *  numbers with 'chr' (e.g. chr1) and linkage group numbers
+     *  with 'LG' (e.g. LG3).
+     *  Contigs or scaffolds may have any identifier that is unique
+     *  within the assembly
+     *
+     *  offered as both utf8 and ascii; stored through the
+     *  'scaffold_name' text index
+     */
+    extern column utf8 SCAFFOLD_NAME
+        = out_scaffold_name;
+    extern column ascii SCAFFOLD_NAME = cast (out_scaffold_name);
+    utf8 out_scaffold_name
+        = idx:text:project #1.0 < 'scaffold_name' > ( .SCAFFOLD_NAME );
+    physical column < utf8 > zip_encoding .SCAFFOLD_NAME
+        = idx:text:insert #1.0 < 'scaffold_name' > ( SCAFFOLD_NAME );
+
+    /* COMPONENT_START
+     *  starting position within the component sequence
+     *  ( one-based coordinate type )
+     */
+    extern column < INSDC:coord:one > izip_encoding COMPONENT_START;
+
+    /* COMPONENT_LEN
+     *  length of the component/gap projected onto the scaffold
+     */
+    extern column < INSDC:coord:len > izip_encoding COMPONENT_LEN;
+
+    /* COMPONENT_PROPS
+     *  see description of type
+     */
+    extern column < NCBI:WGS:component_props > zip_encoding COMPONENT_PROPS;
+
+    /* COMPONENT_ID
+     *  one row-id for each non-gap component
+     */
+    extern column < I64 > izip_encoding COMPONENT_ID;
+
+    /* COMPONENT_LINKAGE
+     *  see description of type
+     *  one linkage value for each gap component
+     */
+    extern column < NCBI:WGS:gap_linkage > zip_encoding COMPONENT_LINKAGE;
+}
+
+/* view:scaffold
+ *  extends NCBI:WGS:tbl:scaffold with read-only columns that present
+ *  each scaffold row as a spot: accession, READ, CSREAD, QUALITY and
+ *  the spot / read descriptors, all derived from the COMPONENT_* columns
+ */
+table NCBI:WGS:view:scaffold #1 = NCBI:WGS:tbl:scaffold #1
+{
+    /* ACCESSION
+     *  scaffold accession
+     *  built as <prefix> 'S' <row id zero-padded to the contig width>;
+     *  prefix and width are selected from 'SEQUENCE' using row id 1
+     */
+    readonly column ascii ACCESSION
+        = out_accession;
+    I64 scaffold_row_id
+        = row_id ();
+    I64 acc_row_id
+        = < I64 > echo < 1 > ();
+    ascii acc_prefix
+        = < ascii > simple_sub_select < 'SEQUENCE', 'ACC_PREFIX' > ( acc_row_id );
+    U32 acc_contig_len
+        = < U32 > simple_sub_select < 'SEQUENCE', 'ACC_CONTIG_LEN' > ( acc_row_id );
+    ascii out_accession
+        = sprintf < "%sS%0*d" > ( acc_prefix, acc_contig_len, scaffold_row_id );
+
+    /* READ
+     *  base space construction of entire scaffold
+     */
+
+    // construct the read from contigs and gaps
+    INSDC:4na:bin out_4na_bin = NCBI:WGS:build_scaffold_read
+        ( .COMPONENT_START, .COMPONENT_LEN, .COMPONENT_PROPS, .COMPONENT_ID );
+
+    // various READ columns
+    // all representations derive from out_4na_bin; text is the default
+    default readonly column INSDC:dna:text READ
+        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin );
+    readonly column INSDC:4na:bin READ
+        = out_4na_bin;
+    readonly column INSDC:4na:packed READ
+        = pack ( out_4na_bin );
+    readonly column INSDC:x2na:bin READ
+        = out_x2na_bin;
+    // 4na to x2na: positions 1, 2, 4 and 8 of the table give 0, 1, 2
+    // and 3; every other position gives 4
+    INSDC:x2na:bin out_x2na_bin
+        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin );
+    readonly column INSDC:2na:bin READ
+        = out_2na_bin;
+    // x2na to 2na: 0..3 are kept, the fifth entry maps to 0
+    INSDC:2na:bin out_2na_bin
+        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin );
+    readonly column INSDC:2na:packed READ
+        = pack ( out_2na_bin );
+
+
+    /* CSREAD
+     *  base space converted to color space
+     */
+    default readonly column INSDC:color:text CSREAD
+        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );
+    readonly column INSDC:x2cs:bin CSREAD
+        = out_x2cs_bin;
+    // conversion takes the x2na read, the read starts and lengths,
+    // the color-space key and the color matrix
+    INSDC:x2cs:bin out_x2cs_bin
+        = NCBI:color_from_dna ( out_x2na_bin, out_read_start, .COMPONENT_LEN, out_cs_key, out_color_matrix );
+    readonly column INSDC:2cs:bin CSREAD
+        = out_2cs_bin;
+    // x2cs to 2cs: 0..3 are kept, the fifth entry maps to 0
+    INSDC:2cs:bin out_2cs_bin
+        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin );
+    readonly column INSDC:2cs:packed CSREAD
+        = pack ( out_2cs_bin );
+
+    /* CS_NATIVE
+     *  is color-space the native sequence space
+     *  always false in this view
+     */
+    readonly column bool CS_NATIVE
+        = < bool > echo < false > ();
+
+    /* CS_KEY
+     *  leading call given in base-space
+     *  the constant 'T', echoed against .COMPONENT_LEN
+     *  ( presumably one key per component - confirm )
+     */
+    readonly column INSDC:dna:text CS_KEY
+        = out_cs_key;
+    INSDC:dna:text out_cs_key
+        = < INSDC:dna:text > echo < 'T' > ( .COMPONENT_LEN );
+
+    /* COLOR_MATRIX
+     *  matrix used for color-space conversions
+     *  always INSDC:color:default_matrix
+     */
+    readonly column U8 COLOR_MATRIX
+        = out_color_matrix;
+    U8 out_color_matrix
+        = < U8 > echo < INSDC:color:default_matrix > ();
+
+
+    /* QUALITY
+     *  base or color call qualities
+     *  built from the same four component columns as READ
+     */
+    INSDC:quality:phred out_qual_phred = NCBI:WGS:build_scaffold_qual
+        ( .COMPONENT_START, .COMPONENT_LEN, .COMPONENT_PROPS, .COMPONENT_ID );
+
+    // PHRED is default
+    default readonly column INSDC:quality:phred QUALITY
+        = out_qual_phred;
+
+    // textual encodings
+    // phred values offset by 33 and by 64
+    readonly column INSDC:quality:text:phred_33 QUALITY
+        = ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( out_qual_phred );
+    readonly column INSDC:quality:text:phred_64 QUALITY
+        = ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( out_qual_phred );
+
+
+    /* PLATFORM
+     *  sequencing platform, if known
+     *  always reported as undefined in this view
+     */
+    INSDC:SRA:platform_id out_platform
+        = < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > ();
+    readonly column INSDC:SRA:platform_id PLATFORM
+        = out_platform;
+
+    /* SPOT_ID
+     *  support for libsra
+     *  the scaffold row id cast to INSDC:SRA:spotid_t
+     */
+    INSDC:SRA:spotid_t out_spot_id
+        = cast ( scaffold_row_id );
+    readonly column INSDC:SRA:spotid_t SPOT_ID
+        = out_spot_id;
+
+    /* NAME
+     *  spot name
+     *  NOTE(review): out_scaffold_name is declared utf8 in the parent
+     *  table and is assigned here to an ascii column without the
+     *  cast used for the ascii SCAFFOLD_NAME column; confirm this
+     *  resolves as intended.
+     */
+    readonly column ascii NAME
+        = out_scaffold_name;
+
+    /* SPOT_LEN
+     * TRIM_START
+     * TRIM_LEN
+     *  spot descriptor
+     *  spot length is the sum of all component lengths; trim covers
+     *  the whole spot ( start 0, length equal to spot length )
+     */
+    readonly column INSDC:coord:len SPOT_LEN
+        = out_spot_len;
+    INSDC:coord:len out_spot_len
+        = < INSDC:coord:len > vec_sum ( .COMPONENT_LEN );
+    readonly column INSDC:coord:zero TRIM_START
+        = < INSDC:coord:zero > echo < 0 > ();
+    readonly column INSDC:coord:len TRIM_LEN
+        = out_spot_len;
+
+
+    /* READ_START
+     * READ_LEN
+     * READ_TYPE
+     *  read descriptor portion
+     *  each component is presented as one read: length is the
+     *  component length, type comes from the component properties
+     *  NOTE(review): read starts are computed with "integral" over
+     *  the component lengths; confirm that this yields 0 for the
+     *  first component, or whether a zero-based integral is intended.
+     */
+    readonly column INSDC:coord:zero READ_START
+        = out_read_start;
+    INSDC:coord:zero out_read_start
+        = ( INSDC:coord:zero ) < U32 > integral ( .COMPONENT_LEN );
+    readonly column INSDC:coord:len READ_LEN
+        = .COMPONENT_LEN;
+    readonly column INSDC:SRA:xread_type READ_TYPE
+        = out_read_type;
+    INSDC:SRA:xread_type out_read_type
+        = NCBI:WGS:build_read_type ( .COMPONENT_PROPS );
+}
+
+
+/*--------------------------------------------------------------------------
+ * contig
+ *  WGS contig database
+ *  groups the nucleotide, protein, gi index and scaffold tables under
+ *  the member names SEQUENCE, PROTEIN, GI_IDX and SCAFFOLD
+ */
+database NCBI:WGS:db:contig #1.1
+{
+    // SEQUENCE is the name the scaffold view selects ACC_PREFIX and
+    // ACC_CONTIG_LEN from
+    table NCBI:WGS:tbl:nucleotide SEQUENCE;
+    table NCBI:WGS:tbl:protein PROTEIN;
+    table NCBI:WGS:tbl:gi_idx GI_IDX;
+    table NCBI:WGS:view:scaffold SCAFFOLD;
+};