Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,307 @@ +/*=========================================================================== + * + * PUBLIC DOMAIN NOTICE + * National Center for Biotechnology Information + * + * This software/database is a "United States Government Work" under the + * terms of the United States Copyright Act. It was written as part of + * the author's official duties as a United States Government employee and + * thus cannot be copyrighted. This software/database is freely available + * to the public for use. The National Library of Medicine and the U.S. + * Government have not placed any restriction on its use or reproduction. + * + * Although all reasonable efforts have been taken to ensure the accuracy + * and reliability of the software and data, the NLM and the U.S. + * Government do not and cannot warrant the performance or results that + * may be obtained by using this software or data. The NLM and the U.S. + * Government disclaim all warranties, express or implied, including + * warranties of performance, merchantability or fitness for any particular + * purpose. + * + * Please cite the author in any work or product based on this material. 
+ * + * =========================================================================== + * + */ + +/*========================================================================== + * NCBI PacBio Fastq Sequence Read Archive schema + */ +version 1; + +include 'insdc/sra.vschema'; +include 'ncbi/sra.vschema'; + + +/*-------------------------------------------------------------------------- + * NCBI:SRA:PacBio + * Pacific Biosciences SRA Platform + * + * history: + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + */ +table NCBI:SRA:PacBio:common #1.0.3 = NCBI:SRA:tbl:sra #2.1.3 +{ +} + +/* history: + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + */ +table NCBI:SRA:PacBio:smrt:fastq #1.0.3 + = NCBI:SRA:PacBio:common #1.0.3 + , NCBI:tbl:base_space #2.0.3 + , NCBI:tbl:phred_quality #2.0.3 +{ + /* PLATFORM + * platform name is always "PACBIO_SMRT" + */ + ascii platform_name + = < ascii > echo < "PACBIO_SMRT" > (); + + /* TRIMMED SEQUENCE + * need to find the 0-based trim_start and trim_len + */ + INSDC:coord:zero bio_start + = NCBI:SRA:bio_start ( out_read_start, out_read_type ); + + INSDC:coord:zero trim_start = bio_start; + + U32 trim_left = ( U32 ) trim_start; + INSDC:coord:len trim_len = ( INSDC:coord:len ) + < U32 > diff ( spot_len, trim_left ); +} + +/*-------------------------------------------------------------------------- + * NCBI:SRA:PacBio:smrt:db (supporting tables come first; the database itself is declared last) + * Pacific Biosciences SRA Platform + */ +table NCBI:SRA:PacBio:smrt:indelsubst #1 +{ + // probability that the current base is an insertion + column < U8 > zip_encoding INSERTION_QV; + + // probability of a deletion error following current base + // and identity of deleted base, if it exists + column < U8 > zip_encoding DELETION_QV; + column < INSDC:dna:text > zip_encoding DELETION_TAG; + + // probability of a substitution error + // and most likely alternative base call + column < U8 > zip_encoding SUBSTITUTION_QV; + 
column < INSDC:dna:text > zip_encoding SUBSTITUTION_TAG; +}; + +typedef U8 
PacBio:hole:status; +const PacBio:hole:status PacBio:hole:SEQUENCING = 0; +const PacBio:hole:status PacBio:hole:ANTIHOLE = 1; +const PacBio:hole:status PacBio:hole:FIDUCIAL = 2; +const PacBio:hole:status PacBio:hole:SUSPECT = 3; +const PacBio:hole:status PacBio:hole:ANTIMIRROR = 4; +const PacBio:hole:status PacBio:hole:FDZMW = 5; +const PacBio:hole:status PacBio:hole:FBZMW = 6; +const PacBio:hole:status PacBio:hole:ANTIBEAMLET = 7; +const PacBio:hole:status PacBio:hole:OUTSIDEFOV = 8; + +/* history: + * 1.0.1 - updated ancestry + * 1.0.2 - updated ancestry + */ +table NCBI:SRA:PacBio:smrt:basecalls #1.0.2 + = INSDC:SRA:tbl:spotcoord #1 + , NCBI:tbl:base_space #2.0.3 + , NCBI:tbl:phred_quality #2.0.3 + , NCBI:SRA:PacBio:smrt:indelsubst #1 +{ + /* PLATFORM + * platform name is always "PACBIO_SMRT" + */ + ascii platform_name + = < ascii > echo < "PACBIO_SMRT" > (); + + // basecalls will be routed to READ column + readonly column INSDC:dna:text BASECALL + = out_dna_text; + + // quality value for each base + readonly column INSDC:quality:phred QUALITY_VALUE + = out_qual_phred; + + // zero-based hole number + column < U32 > izip_encoding HOLE_NUMBER; + + // hole status + column < PacBio:hole:status > zip_encoding HOLE_STATUS; + + // optional column pair to describe hole status + // when/if it does not line up with our constants above + column < ascii > zip_encoding HOLE_STATUS_VALUE; + column < INSDC:coord:len > izip_encoding HOLE_STATUS_VALUE_LEN; + + // hole ( X,Y ) pair will be split and sent to X and Y columns + column I16 [ 2 ] HOLE_XY + = < I16 > paste ( x_clip_I16, y_clip_I16 ); + I16 x_clip_I16 = cast ( out_x_coord ); + I16 y_clip_I16 = cast ( out_y_coord ); + + I16 in_x16_coord = < I16 > cut < 0 > ( HOLE_XY ); + I16 in_y16_coord = < I16 > cut < 1 > ( HOLE_XY ); + + INSDC:coord:val in_x_coord = cast ( in_x16_coord ); + INSDC:coord:val in_y_coord = cast ( in_y16_coord ); + + // the number of bases in ZMW + readonly column INSDC:coord:len NUM_EVENT + = 
base_space_spot_len; +}; + +/* history: + * 1.0.1 - updated ancestry + * 1.0.2 - updated ancestry + */ +table NCBI:SRA:PacBio:smrt:sequence #1.0.2 + = NCBI:SRA:PacBio:smrt:basecalls #1.0.2 + , NCBI:SRA:tbl:sra_nopos #2.1.3 +{ + // pulse information + column < U16 > izip_encoding PRE_BASE_FRAMES; + column < U16 > izip_encoding WIDTH_IN_FRAMES; + + // spot to pulse map + default column INSDC:position:zero PULSE_INDEX + = .PULSE_INDEX; + readonly column INSDC:position:one PULSE_INDEX + = out_position; + INSDC:position:one out_position + = ( INSDC:position:one ) < INSDC:position:zero > sum < 1 > ( .PULSE_INDEX ); + + column NCBI:SRA:pos16 PULSE_INDEX + = cast ( .PULSE_INDEX ); + NCBI:SRA:pos16 in_pulse_index16 + = PULSE_INDEX; + + INSDC:position:zero in_pulse_index32 + = PULSE_INDEX + | cast ( in_pulse_index16 ); + + physical column < INSDC:position:zero > izip_encoding .PULSE_INDEX + = in_pulse_index32; + + /* clip quality */ + extern column < INSDC:coord:zero > izip_encoding CLIP_QUALITY_LEFT; + extern column < INSDC:coord:one > izip_encoding CLIP_QUALITY_RIGHT; + + /* TRIMMED SEQUENCE + * need to find the 0-based trim_start and trim_len + */ + INSDC:coord:zero trim_start + = .CLIP_QUALITY_LEFT + | NCBI:SRA:bio_start ( out_read_start, out_read_type ); + + U32 trim_right + = ( U32 ) .CLIP_QUALITY_RIGHT + | spot_len; + + U32 trim_left = ( U32 ) trim_start; + INSDC:coord:len trim_len = ( INSDC:coord:len ) + < U32 > diff ( trim_right, trim_left ); +}; + +/* history: + * 1.0.1 - updated ancestry + * 1.0.2 - updated ancestry + */ +table NCBI:SRA:PacBio:smrt:cons #1.0.2 + = NCBI:SRA:PacBio:smrt:basecalls #1.0.2 + , NCBI:SRA:tbl:sra #2.1.3 +{ + // documented in both hdf5 and xsd as signed... 
+ column < I32 > izip_encoding NUM_PASSES; + + /* TRIMMED SEQUENCE + * need to find the 0-based trim_start and trim_len + */ + INSDC:coord:zero trim_start + = NCBI:SRA:bio_start ( out_read_start, out_read_type ); + + U32 trim_left = ( U32 ) trim_start; + INSDC:coord:len trim_len = ( INSDC:coord:len ) + < U32 > diff ( spot_len, trim_left ); +}; + +/* these encoding rules attempt to compress the channels individually, + although they may compress fine interleaved as they are... */ +physical +F32 [ 4 ] NCBI:SRA:PacBio:smrt:F32_4ch_encoding #1.0 < U32 mantissa > +{ + decode + { + fzip_fmt cmp0 = split < 0 > ( @ ); + fzip_fmt cmp1 = split < 1 > ( @ ); + fzip_fmt cmp2 = split < 2 > ( @ ); + fzip_fmt cmp3 = split < 3 > ( @ ); + + F32 ch0 = funzip ( cmp0 ); + F32 ch1 = funzip ( cmp1 ); + F32 ch2 = funzip ( cmp2 ); + F32 ch3 = funzip ( cmp3 ); + + return < F32 > paste ( ch0, ch1, ch2, ch3 ); + } + + encode + { + F32 ch0 = < F32 > cut < 0 > ( @ ); + F32 ch1 = < F32 > cut < 1 > ( @ ); + F32 ch2 = < F32 > cut < 2 > ( @ ); + F32 ch3 = < F32 > cut < 3 > ( @ ); + + fzip_fmt cmp0 = fzip < mantissa > ( ch0 ); + fzip_fmt cmp1 = fzip < mantissa > ( ch1 ); + fzip_fmt cmp2 = fzip < mantissa > ( ch2 ); + fzip_fmt cmp3 = fzip < mantissa > ( ch3 ); + + return merge ( cmp0, cmp1, cmp2, cmp3 ); + } +} + +table NCBI:SRA:PacBio:smrt:zmw_metrics #1 /* NOTE(review): for F32_4ch_encoding columns the < 24 > argument binds to its `mantissa` parameter, which is handed to fzip; presumably fzip_encoding < 24 > means the same -- confirm */ +{ + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > BASE_FRACTION; + column < F32 > fzip_encoding < 24 > BASE_IPD; + column < F32 > fzip_encoding < 24 > BASE_RATE; + column < F32 > fzip_encoding < 24 > BASE_WIDTH; + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_BASE_QV; + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_DEL_QV; + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_INS_QV; + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > 
CHAN_SUB_QV; + column < F32 > fzip_encoding < 24 > LOCAL_BASE_RATE; + column < F32 > fzip_encoding < 24 > DARK_BASE_RATE; + column < F32 > fzip_encoding < 24 > HQ_RGN_START_TIME; + 
column < F32 > fzip_encoding < 24 > HQ_RGN_END_TIME; + column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > HQ_RGN_SNR; + column < I8 > zip_encoding PRODUCTIVITY; + column < F32 > fzip_encoding < 24 > READ_SCORE; + column < F32 > fzip_encoding < 24 > READ_BASE_QV; + column < F32 > fzip_encoding < 24 > READ_DEL_QV; + column < F32 > fzip_encoding < 24 > READ_INS_QV; + column < F32 > fzip_encoding < 24 > READ_SUB_QV; +}; + +table NCBI:SRA:PacBio:smrt:passes #1 +{ + column < U8 > zip_encoding ADAPTER_HIT_BEFORE; + column < U8 > zip_encoding ADAPTER_HIT_AFTER; + column < U8 > zip_encoding PASS_DIRECTION; + column < I32 > izip_encoding PASS_NUM_BASES; + column < I32 > izip_encoding PASS_START_BASE; +}; + +database NCBI:SRA:PacBio:smrt:db #1.0.1 /* NOTE(review): member tables are referenced as #1.0 while sequence and cons are declared #1.0.2 above; presumably the reference resolves by major.minor -- confirm */ +{ + table NCBI:SRA:PacBio:smrt:sequence #1.0 SEQUENCE; + table NCBI:SRA:PacBio:smrt:cons #1.0 CONSENSUS; + table NCBI:SRA:PacBio:smrt:passes #1.0 PASSES; + table NCBI:SRA:PacBio:smrt:zmw_metrics #1.0 ZMW_METRICS; +};