Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/vdb/vdb.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * VDB external functions, formats and types | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 // built-in functions should be known to all | |
| 33 include 'vdb/built-in.vschema'; | |
| 34 | |
| 35 | |
| 36 /*-------------------------------------------------------------------------- | |
| 37 * types | |
| 38 */ | |
| 39 | |
| 40 /* text_token | |
| 41 * a vector describing tokens recognized within a text string | |
| 42 * | |
| 43 * COMPONENTS: | |
| 44 * 0 - token id | |
| 45 * 1 - token starting coordinate | |
| 46 * 2 - token length | |
| 47 */ | |
| 48 typedef U16 text:token [ 3 ]; | |
| 49 | |
| 50 | |
| 51 /*-------------------------------------------------------------------------- | |
| 52 * typesets | |
| 53 */ | |
| 54 typeset pack_set { B8, B16, B32, B64, integer_set }; | |
| 55 typeset izip_set { integer_set }; | |
| 56 typeset fzip_set { F32 }; | |
| 57 | |
| 58 | |
| 59 /*-------------------------------------------------------------------------- | |
| 60 * formats | |
| 61 */ | |
| 62 fmtdef izip_fmt; | |
| 63 fmtdef fzip_fmt; | |
| 64 fmtdef rle_fmt; | |
| 65 fmtdef zlib_fmt; | |
| 66 fmtdef bzip2_fmt; | |
| 67 | |
| 68 | |
| 69 /*-------------------------------------------------------------------------- | |
| 70 * functions | |
| 71 */ | |
| 72 | |
| 73 /* echo | |
| 74 * returns single or repeated constant value | |
| 75 * | |
| 76 * "T" [ TYPE ] - type of constant data to return | |
| 77 * | |
| 78 * "val" [ CONST ] - a data constant | |
| 79 * | |
| 80 * "row_len" [ DATA, OPTIONAL ] - if omitted, "val" will be | |
| 81 * issued once and the resultant row-length will be the length | |
| 82 * of "val". otherwise, "val" will be repeated and/or truncated | |
| 83 * as necessary to produce a row-length equal to that of input. | |
| 84 * | |
| 85 * USAGE: | |
| 86 * to echo a single constant value | |
| 87 * U16 len = row_len ( col ) | < U16 > echo < 0 > (); | |
| 88 * | |
| 89 * to create a row of repeated values | |
| 90 * ascii allN = < ascii > echo < 'N' > ( col ); | |
| 91 */ | |
| 92 function < type T > | |
| 93 T echo #1.0 < T val > ( * any row_len ) | |
| 94 = vdb:echo; | |
| 95 | |
| 96 | |
| 97 /* exists | |
| 98 * returns constant or dynamic value if predicate input exists | |
| 99 * | |
| 100 * "T" [ TYPE ] - type of data to return | |
| 101 * | |
| 102 * "cval" [ CONST, OPTIONAL ] - a data constant. when present, | |
| 103 * the function will behave like "echo" ( see below ) | |
| 104 * | |
| 105 * "predicate" [ DATA ] - an input whose existence determines | |
| 106 * whether the function will operate or not. | |
| 107 * | |
| 108 * "dval" [ DATA, OPTIONAL ] - data value, either passed through | |
| 109 * or used to determine a repeat count of "cval" ( see below ) | |
| 110 * | |
| 111 * USAGE: | |
| 112 * when "cval" is omitted, "dval" must be present and will be | |
| 113 * passed through depending upon the existence of "predicate" | |
| 114 * U8 count = < U8 > exists ( col, count2 ); | |
| 115 * | |
| 116 * when "cval" is present, "dval" may be omitted, and "cval" will | |
| 117 * be passed through just like echo depending upon "predicate" | |
| 118 * U8 count = < U8 > exists < 2 > ( col2 ) | < U8 > echo < 1 > (); | |
| 119 * | |
| 120 * when "cval" and "dval" are both present, the behavior is | |
| 121 * like echo, but gated with "predicate" | |
| 122 * ascii poly = < ascii > exists < 'a' > ( col, repeat ); | |
| 123 */ | |
| 124 function < type T > | |
| 125 T exists #1.0 < * T cval > ( any predicate, * T dval ) | |
| 126 = vdb:exists; | |
| 127 | |
| 128 | |
| 129 /* map | |
| 130 * translate input elements | |
| 131 * behaves much like the Unix "tr" command | |
| 132 * except that charsets are not [currently] supported | |
| 133 * | |
| 134 * "A" [ TYPE ] - input data type, e.g. "ascii" | |
| 135 * | |
| 136 * "B" [ TYPE ] - output data type, e.g. "ascii" or "U8" | |
| 137 * | |
| 138 * "from" [ CONST ] - set of key values. | |
| 139 * | |
| 140 * "to" [ CONST ] - set of mapped values, | |
| 141 * where length ( from ) === length ( to ) | |
| 142 * | |
| 143 * "in" [ DATA ] - input data to be matched against keys | |
| 144 * in "from". also serves as source data when "src" is omitted | |
| 145 * | |
| 146 * "src" [ DATA, OPTIONAL ] - source data to be edited by | |
| 147 * substituting "to" values when corresponding "in" value | |
| 148 * matches key in "from". if omitted, "in" is used. | |
| 149 * | |
| 150 * USAGE: | |
| 151 * to upper case letters from a given alphabet | |
| 152 * ascii upper = < ascii, ascii > map < 'acgtn', 'ACGTN' > ( in ); | |
| 153 * | |
| 154 * to translate from ascii to binary | |
| 155 * U8 bin = < ascii, U8 > map < 'ACGTN', [ 0, 1, 2, 3, 0 ] > ( in ); | |
| 156 * | |
| 157 * to alter certain values of a column based upon values in another | |
| 158 * U8 n_encoded = < ascii, U8 > map < 'N', 0 > ( read, quality ); | |
| 159 * | |
| 160 * CAVEATS: | |
| 161 * the full canonical mode of operation uses separate inputs | |
| 162 * for key matching and output source. | |
| 163 * | |
| 164 * when a single input is specified: | |
| 165 * - sizeof ( A ) must equal sizeof ( B ) | |
| 166 * - A must be a proper subset of B -OR- | |
| 167 * - "from" keys must match every possible "in" value ( total substitution ) | |
| 168 */ | |
| 169 function < type A, type B > | |
| 170 B map #1.0 < A from, B to > ( A in, * B src ) | |
| 171 = vdb:map; | |
| 172 | |
| 173 | |
| 174 /* clip | |
| 175 * limit data values to given bounds | |
| 176 * | |
| 177 * "T" [ TYPE ] - input and output data type | |
| 178 * | |
| 179 * "dim" [ CONST >= 1 ] - fixed dimension on | |
| 180 * input and output vectors | |
| 181 * | |
| 182 * "lower" [ CONST ] - lower bound, inclusive | |
| 183 * | |
| 184 * "upper" [ CONST ] - upper bound, inclusive | |
| 185 * | |
| 186 * "in" [ DATA ] - data to be clipped | |
| 187 */ | |
| 188 function < type T > | |
| 189 T clip #1.0 < T lower, T upper > ( T in ) | |
| 190 = vdb:clip; | |
| 191 | |
| 192 function < type T, U32 dim > | |
| 193 T [ dim ] vclip #1.0 < T lower, T upper > ( T [ dim ] in ) | |
| 194 = vdb:clip; | |
| 195 | |
| 196 | |
| 197 /* ceil | |
| 198 * round up to the nearest integer | |
| 199 * | |
| 200 * "in" [ DATA ] - data to be processed | |
| 201 */ | |
| 202 function | |
| 203 numeric_set ceil #1.0 ( float_set in ) | |
| 204 = vdb:ceil; | |
| 205 | |
| 206 /* floor | |
| 207 * round down to the nearest integer | |
| 208 * | |
| 209 * "in" [ DATA ] - data to be processed | |
| 210 */ | |
| 211 function | |
| 212 numeric_set floor #1.0 ( float_set in ) | |
| 213 = vdb:floor; | |
| 214 | |
| 215 /* round | |
| 216 * round to nearest integer away from zero | |
| 217 * | |
| 218 * "T" [ TYPE = { F32, F64 } ] - input and output data type ( implicit: the declaration takes no type parameter ) | |
| 219 * | |
| 220 * "in" [ DATA ] - data to be processed | |
| 221 */ | |
| 222 function | |
| 223 numeric_set round #1.0 ( float_set in ) | |
| 224 = vdb:round; | |
| 225 | |
| 226 /* trunc | |
| 227 * round to the nearest integer not larger in absolute value | |
| 228 * | |
| 229 * "T" [ TYPE = { F32, F64 } ] - input and output data type ( implicit: the declaration takes no type parameter ) | |
| 230 * | |
| 231 * "in" [ DATA ] - data to be processed | |
| 232 */ | |
| 233 function | |
| 234 numeric_set trunc #1.0 ( float_set in ) | |
| 235 = vdb:trunc; | |
| 236 | |
| 237 | |
| 238 /* min | |
| 239 * return the minimum value of each element | |
| 240 * max | |
| 241 * return the maximum value of each element | |
| 242 * | |
| 243 * "T" [ TYPE ] - input and output data type | |
| 244 * | |
| 245 * "a" [ DATA ] - first operand | |
| 246 * | |
| 247 * "b" [ DATA ] - second operand | |
| 248 * | |
| 249 * SYNOPSIS: | |
| 250 * compares two inputs element by element | |
| 251 * returns min or max element of each | |
| 252 * | |
| 253 * USAGE: | |
| 254 * intersections | |
| 255 * U32 left = < U32 > max ( left_a, left_b ); | |
| 256 * U32 right = < U32 > min ( right_a, right_b ); | |
| 257 */ | |
| 258 function < type T > | |
| 259 T min #1.0 ( T a, T b ) | |
| 260 = vdb:min; | |
| 261 | |
| 262 function < type T > | |
| 263 T max #1.0 ( T a, T b ) | |
| 264 = vdb:max; | |
| 265 | |
| 266 | |
| 267 /* sum | |
| 268 * return the sum of inputs | |
| 269 * diff | |
| 270 * return the difference of inputs | |
| 271 * | |
| 272 * "T" [ TYPE ] - input and output data type | |
| 273 * must be member of numeric_set | |
| 274 * | |
| 275 * "k" [ CONST, DEFAULT 0 ] - optional constant | |
| 276 * to be added or subtracted | |
| 277 * | |
| 278 * "a" [ DATA ] - left-most operand | |
| 279 * | |
| 280 * "b" [ DATA ] - optional subtrahend | |
| 281 * | |
| 282 * SYNOPSIS: | |
| 283 * incorporates "k" into expression for every row | |
| 284 * returns sum or difference of inputs for all rows | |
| 285 * | |
| 286 * USAGE: | |
| 287 * length of half-closed interval | |
| 288 * U32 len = < U32 > diff ( stop, start ); | |
| 289 * convert one-based coordinate to zero based | |
| 290 * U32 zero_based = < U32 > diff < 1 > ( one_based ); | |
| 291 */ | |
| 292 function < type T > | |
| 293 T sum #1.0 < * T k > ( T a, ... ) | |
| 294 = vdb:sum; | |
| 295 | |
| 296 function < type T > | |
| 297 T diff #1.0 < * T k > ( T a * T b ) | |
| 298 = vdb:diff; | |
| 299 | |
| 300 /* deriv | |
| 301 * return the 1st derivative of an input row | |
| 302 * integral | |
| 303 * return the "integral" of an input row | |
| 304 * integral -> starts with 1st value | |
| 305 * integral_0 -> starts with 0 | |
| 306 * | |
| 307 * "T" [ TYPE ] - input and output data type | |
| 308 * must be signed integer of any size | |
| 309 * | |
| 310 * "in" [ DATA ] - input to be modified | |
| 311 * | |
| 312 * SYNOPSIS: | |
| 313 * derivative function is ( in [ i ] - in [ i - 1 ] ) | |
| 314 * for i = 0 .. length ( in ) - 1, | |
| 315 * assuming in [ 0 - 1 ] = 0 ( i.e. leaves in [ 0 ] intact ). | |
| 316 * | |
| 317 * integral function is sum ( in [ 0 ] .. in [ i ] ) | |
| 318 * for i = 0 .. length ( in ) - 1. | |
| 319 * | |
| 320 * integral_0 function is sum ( in [ 0 ] .. in [ i - 1 ] ) | |
| 321 * for i = 1 .. length ( in ) - 1, | |
| 322 * setting output [ 0 ] = 0. | |
| 323 * | |
| 324 * USAGE: | |
| 325 * "deriv" and "integral" are reciprocal functions. | |
| 326 * the oddity is that "deriv" creates an output series | |
| 327 * with the same length as the input series, causing the | |
| 328 * first element of input to be copied to first element | |
| 329 * of output. | |
| 330 * | |
| 331 * "integral_0" always creates an output with the first | |
| 332 * element being 0. the oddity here is again that the output | |
| 333 * series is the same length as the input, dropping the effect | |
| 334 * from the last element of input. its utility is primarily in | |
| 335 * operations such as creating absolute offsets from a series of | |
| 336 * lengths. | |
| 337 * | |
| 338 * EXAMPLES: | |
| 339 * given an input series ( 15, 17, 12, 315 ): | |
| 340 * "deriv" produces ( 15, 2, -5, 303 ) [ NOTICE first element ] | |
| 341 * integrating ( 15, 2, -5, 303 ): | |
| 342 * "integral" produces ( 15, 17, 12, 315 ), while | |
| 343 * "integral_0" produces ( 0, 15, 17, 12 ). | |
| 344 * | |
| 345 * generating starting offsets from a series of lengths ( 15, 17, 12, 315 ): | |
| 346 * "integral_0" produces ( 0, 15, 32, 44 ) which can be used | |
| 347 * to accompany the input series for starts and lengths. | |
| 348 */ | |
| 349 | |
| 350 function < type T > | |
| 351 T deriv #1.0 ( T in ) | |
| 352 = vdb:deriv; | |
| 353 function < type T > | |
| 354 T integral #1.0 ( T in ) | |
| 355 = vdb:integral; | |
| 356 function < type T > | |
| 357 T integral_0 #1.1 ( T in ) | |
| 358 = vdb:integral_0; | |
| 359 | |
| 360 /* delta | |
| 361 * return the 1st derivative of a whole blob | |
| 362 * undelta | |
| 363 * return the integral of a whole blob | |
| 364 * | |
| 365 * "T" [ TYPE ] - input and output data type | |
| 366 * must be signed integer of any size | |
| 367 * | |
| 368 * "in" [ DATA ] - input to be modified | |
| 369 * | |
| 370 * SYNOPSIS: | |
| 371 * similar to deriv/integral but operates on full blob | |
| 372 */ | |
| 373 | |
| 374 function < type T > T delta #1.0 ( T in ) = vdb:delta; | |
| 375 function < type T > T undelta #1.0 ( T in ) = vdb:undelta; | |
| 376 | |
| 377 | |
| 378 /* outlier_encode | |
| 379 * removes a given outlier from a data series | |
| 380 * outlier_decode | |
| 381 * removes the effect of outlier_encode | |
| 382 * | |
| 383 * "T" [ TYPE ] - input and output data type | |
| 384 * must be an integer of any size | |
| 385 * | |
| 386 * "in" [ DATA ] - input to be modified | |
| 387 * | |
| 388 * SYNOPSIS: | |
| 389 * The encode replaces every element that is equal to the | |
| 390 * outlier with (the value of the previous element) * 2 + 1 | |
| 391 * and the remaining elements are replaced with their value * 2. | |
| 392 */ | |
| 393 | |
| 394 function < type T > T outlier_encode #1.0 < T outlier > ( T in ) = vdb:outlier_encode; | |
| 395 function < type T > T outlier_decode #1.0 < T outlier > ( T in ) = vdb:outlier_decode; | |
| 396 | |
| 397 /* add_row_id | |
| 398 * return the sum of an input and its row-id | |
| 399 * sub_row_id | |
| 400 * return the difference of an input and its row-id | |
| 401 * | |
| 402 * "T" [ TYPE ] - input and output data type | |
| 403 * must be member of numeric_set | |
| 404 * | |
| 405 * "in" [ DATA ] - input to be modified | |
| 406 * | |
| 407 * SYNOPSIS: | |
| 408 * adjusts for relationship between input and row-id | |
| 409 * used primarily to reduce serial ids to constants | |
| 410 */ | |
| 411 function < type T > | |
| 412 T add_row_id #1.0 ( T in ) | |
| 413 = vdb:add_row_id; | |
| 414 | |
| 415 function < type T > | |
| 416 T sub_row_id #1.0 ( T in ) | |
| 417 = vdb:sub_row_id; | |
| 418 | |
| 419 | |
| 420 /* cut | |
| 421 * extract one or more elements from input vector | |
| 422 * to form an output vector of equal or less dimension | |
| 423 * | |
| 424 * "T" [ TYPE ] - base element type to be processed | |
| 425 * | |
| 426 * "idx" [ CONST ] - mandatory initial element index | |
| 427 * count of parameters must equal dimension of output type | |
| 428 * | |
| 429 * "in" [ DATA ] - source of input vectors where the vector | |
| 430 * element type is known, but any dimension is accepted. | |
| 431 * | |
| 432 * USAGE: | |
| 433 * extracting a single channel from a 4 channel vector | |
| 434 * F32 [ 4 ] vect ... | |
| 435 * F32 chan = < F32 > cut < 0 > ( vect ); | |
| 436 * | |
| 437 * extracting multiple channels | |
| 438 * U8 [ 16 ] in ... | |
| 439 * U8 [ 3 ] out = < U8 > cut < 5, 1, 3 > ( in ); | |
| 440 * | |
| 441 * reversing channels | |
| 442 * I16 [ 2 ] norm ... | |
| 443 * I16 [ 2 ] rev = < I16 > cut < 1, 0 > ( norm ); | |
| 444 */ | |
| 445 function < type T > | |
| 446 T [ * ] cut #1.0 < U32 idx, ... > ( T [ * ] in ) | |
| 447 = vdb:cut; | |
| 448 | |
| 449 | |
| 450 /* paste | |
| 451 * combine all elements of all inputs into a single vector | |
| 452 * output dimension is sum of all input dimensions after type normalization | |
| 453 * | |
| 454 * "T" [ TYPE ] - base element type to be processed | |
| 455 * | |
| 456 * "in" [ DATA ] - first of an arbitrary number of columns | |
| 457 * the total of input elements produces an output of "T [ total ]" | |
| 458 */ | |
| 459 function < type T > | |
| 460 T [ * ] paste #1.0 ( T [ * ] in, ... ) | |
| 461 = vdb:paste; | |
| 462 | |
| 463 | |
| 464 /* vec_sum | |
| 465 * compute the sum of all the elements of the row | |
| 466 * | |
| 467 * "T" [ TYPE ] - base element type to be processed | |
| 468 * | |
| 469 * "in" [ DATA ] - the input | |
| 470 */ | |
| 471 function < type T > | |
| 472 T vec_sum #1.0 ( T [ * ] in ) | |
| 473 = vdb:vec_sum; | |
| 474 | |
| 475 /* fixed_vec_sum | |
| 476 * compute the sum of all the elements of the input vector | |
| 477 * | |
| 478 * "T" [ TYPE ] - base element type to be processed | |
| 479 * | |
| 480 * "in" [ DATA ] - the input | |
| 481 */ | |
| 482 function < type T > | |
| 483 T fixed_vec_sum #1.0 ( T [ * ] in ) | |
| 484 = vdb:fixed_vec_sum; | |
| 485 | |
| 486 | |
| 487 /* checksum | |
| 488 * compute a checksum ( hash ) of all of the input bytes | |
| 489 * to be used in a trigger production | |
| 490 * | |
| 491 * "node" [ CONST ] - path to metadata node where checksum | |
| 492 * will be stored. | |
| 493 * | |
| 494 * "algorithm" [ CONST ] - type of checksum to perform: | |
| 495 * 'crc-32' # match against POSIX cksum | |
| 496 * 'md5' # " " md5sum | |
| 497 * 'sha-1' # " " sha1sum | |
| 498 * 'sha-256' # " " sha256sum | |
| 499 * 'sha-384' # " " sha384sum | |
| 500 * 'sha-512' # " " sha512sum | |
| 501 * | |
| 502 * "in" [ DATA ] - the octet-stream to be checksummed | |
| 503 */ | |
| 504 function | |
| 505 bool checksum #1.0 < ascii node, ascii algorithm > ( B8 in ) | |
| 506 = vdb:checksum; | |
| 507 | |
| 508 /* md5sum | |
| 509 * compute an md5 checksum of all of the input bytes | |
| 510 */ | |
| 511 function | |
| 512 bool md5sum #1.0 < ascii node > ( B8 in ) | |
| 513 { | |
| 514 return checksum < node, 'md5' > ( in ); | |
| 515 } | |
| 516 | |
| 517 | |
| 518 /* pack | |
| 519 * packs words into bit-aligned units | |
| 520 * words are expected in architecture native byte-order | |
| 521 * and returned in "big-bit-endian" order | |
| 522 * | |
| 523 * the packed size is determined by the dimension of the | |
| 524 * left-hand assignment value. | |
| 525 * | |
| 526 * "in" [ DATA ] - B8, B16, B32 or B64 data | |
| 527 */ | |
| 528 function | |
| 529 B1 [ * ] pack #1.0 ( pack_set in ) | |
| 530 = vdb:pack; | |
| 531 | |
| 532 | |
| 533 /* unpack | |
| 534 * unpacks bit-aligned units into words | |
| 535 * input is expected in "big-bit-endian" order | |
| 536 * and returned in architecture native byte-order | |
| 537 * | |
| 538 * the unpacked type is determined from the left-hand | |
| 539 * assignment value. | |
| 540 * | |
| 541 * "in" [ DATA ] - B[1]..B[64] | |
| 542 */ | |
| 543 function | |
| 544 pack_set unpack #1.0 ( B1 [ * ] in ) | |
| 545 = vdb:unpack; | |
| 546 | |
| 547 | |
| 548 /* izip | |
| 549 * iunzip | |
| 550 * integer compression | |
| 551 */ | |
| 552 function | |
| 553 izip_fmt izip #2.1 ( izip_set in ) | |
| 554 = vdb:izip; | |
| 555 | |
| 556 function | |
| 557 izip_set iunzip #2.1 ( izip_fmt in ) | |
| 558 = vdb:iunzip; | |
| 559 | |
| 560 physical < type T > | |
| 561 T izip_encoding #1.0 | |
| 562 { | |
| 563 decode { return ( T ) iunzip ( @ ); } | |
| 564 encode { return izip ( @ ); } | |
| 565 }; | |
| 566 | |
| 567 | |
| 568 /* fzip | |
| 569 * funzip | |
| 570 * floating point compression | |
| 571 * | |
| 572 * "mantissa" [ CONST ] - the number of mantissa bits | |
| 573 * to preserve | |
| 574 */ | |
| 575 function | |
| 576 fzip_fmt fzip #1.0 < U32 mantissa > ( fzip_set in ) | |
| 577 = vdb:fzip; | |
| 578 | |
| 579 function | |
| 580 fzip_set funzip #1.0 ( fzip_fmt in ) | |
| 581 = vdb:funzip; | |
| 582 | |
| 583 physical < type T > | |
| 584 T fzip_encoding #1.0 < U32 mantissa > | |
| 585 { | |
| 586 decode { return funzip ( @ ); } | |
| 587 encode { return fzip < mantissa > ( @ ); } | |
| 588 }; | |
| 589 | |
| 590 | |
| 591 /* rlencode | |
| 592 * rldecode | |
| 593 * run-length encoding | |
| 594 */ | |
| 595 function | |
| 596 rle_fmt rlencode #1.0 ( any in ) | |
| 597 = vdb:rlencode; | |
| 598 | |
| 599 function | |
| 600 any rldecode #1.0 ( rle_fmt in ) | |
| 601 = vdb:rldecode; | |
| 602 | |
| 603 | |
| 604 /* zip | |
| 605 * unzip | |
| 606 * run things through zlib | |
| 607 * | |
| 608 * "strategy" [ CONST, OPTIONAL ] - set the compression strategy | |
| 609 * | |
| 610 * "level" [ CONST, OPTIONAL ] - set the amount of compression | |
| 611 * from 0..9 ( none to best compression ), or use -1 for zlib | |
| 612 * default behavior. | |
| 613 */ | |
| 614 | |
| 615 // zlib strategy | |
| 616 const I32 Z_FILTERED = 1; | |
| 617 const I32 Z_HUFFMAN_ONLY = 2; | |
| 618 const I32 Z_RLE = 3; | |
| 619 const I32 Z_DEFAULT_STRATEGY = 0; | |
| 620 | |
| 621 // zlib level | |
| 622 const I32 Z_NO_COMPRESSION = 0; | |
| 623 const I32 Z_BEST_SPEED = 1; | |
| 624 const I32 Z_BEST_COMPRESSION = 9; | |
| 625 const I32 Z_DEFAULT_COMPRESSION = -1; | |
| 626 | |
| 627 function | |
| 628 zlib_fmt zip #1.0 < * I32 strategy, I32 level > ( any in ) | |
| 629 = vdb:zip; | |
| 630 | |
| 631 function | |
| 632 any unzip #1.0 ( zlib_fmt in ) | |
| 633 = vdb:unzip; | |
| 634 | |
| 635 physical < type T > | |
| 636 T zip_encoding #1.0 < * I32 strategy, I32 level > | |
| 637 { | |
| 638 decode { return unzip ( @ ); } | |
| 639 encode { return zip < strategy, level > ( @ ); } | |
| 640 }; | |
| 641 | |
| 642 physical | |
| 643 bool bool_encoding #1.0 | |
| 644 { | |
| 645 decode | |
| 646 { | |
| 647 B1 bit = unzip ( @ ); | |
| 648 return ( bool ) unpack ( bit ); | |
| 649 } | |
| 650 | |
| 651 encode | |
| 652 { | |
| 653 U8 lim = < U8 > clip < 0, 1 > ( @ ); | |
| 654 B1 bit = pack ( lim ); | |
| 655 return zip < Z_RLE, Z_BEST_SPEED > ( bit ); | |
| 656 } | |
| 657 } | |
| 658 | |
| 659 physical < type T > | |
| 660 T delta_izip_encoding #1.0 | |
| 661 { | |
| 662 decode | |
| 663 { | |
| 664 T dlt = iunzip ( @ ); | |
| 665 return < T > undelta ( dlt ); | |
| 666 } | |
| 667 | |
| 668 encode | |
| 669 { | |
| 670 T dlt = <T> delta ( @ ); | |
| 671 return izip ( dlt ); | |
| 672 } | |
| 673 } | |
| 674 physical < type T > | |
| 675 T delta_zip_encoding #1.0 | |
| 676 { | |
| 677 decode | |
| 678 { | |
| 679 T dlt = unzip ( @ ); | |
| 680 return < T > undelta ( dlt ); | |
| 681 } | |
| 682 | |
| 683 encode | |
| 684 { | |
| 685 T dlt = <T> delta ( @ ); | |
| 686 return zip < Z_RLE, Z_BEST_SPEED > ( dlt ); | |
| 687 } | |
| 688 } | |
| 689 physical < type T > | |
| 690 T delta_average_zip_encoding #1.0 | |
| 691 { | |
| 692 decode | |
| 693 { | |
| 694 delta_averaged_fmt t = unzip ( @ ); | |
| 695 return undelta_average ( t ); | |
| 696 } | |
| 697 | |
| 698 encode | |
| 699 { | |
| 700 delta_averaged_fmt t = delta_average ( @ ); | |
| 701 return zip < Z_RLE, Z_BEST_SPEED > ( t ); | |
| 702 } | |
| 703 } | |
| 704 | |
| 705 /* bzip | |
| 706 * bunzip | |
| 707 * run things through bzip2 | |
| 708 * | |
| 709 * "blockSize100k" [ CONST, OPTIONAL ] - set the compression workspace size | |
| 710 * from 1..9 inclusive, produces a workspace of blockSize100k * 100000 bytes | |
| 711 * default is 5 | |
| 712 * | |
| 713 * "workFactor" [ CONST, OPTIONAL ] - set compression level | |
| 714 * from 0..250 inclusive, where 0 means bzip2 default, currently 30 | |
| 715 */ | |
| 716 | |
| 717 function | |
| 718 bzip2_fmt bzip #1.0 < * U32 blockSize100k, U32 workFactor > ( any in ) | |
| 719 = vdb:bzip; | |
| 720 | |
| 721 function | |
| 722 any bunzip #1.0 ( bzip2_fmt in ) | |
| 723 = vdb:bunzip; | |
| 724 | |
| 725 physical < type T > | |
| 726 T bzip_encoding #1.0 < * U32 blockSize100k, U32 workFactor > | |
| 727 { | |
| 728 decode { return bunzip ( @ ); } | |
| 729 encode { return bzip < blockSize100k, workFactor > ( @ ); } | |
| 730 }; | |
| 731 | |
| 732 | |
| 733 /* simple_sub_select | |
| 734 * project a column from another table within database | |
| 735 * | |
| 736 * "T" [ TYPE ] - data type of column | |
| 737 * must be compatible with source column | |
| 738 * | |
| 739 * "tbl" [ CONST ] - name of table within parent | |
| 740 * | |
| 741 * "col" [ CONST ] - column spec, i.e. simple name or | |
| 742 * typed name spec | |
| 743 * | |
| 744 * "row" [ DATA ] - row to select | |
| 745 * | |
| 746 * "idx" [ DATA ] - one-based indexing of what element to pick, defaults to all if not given | |
| 747 */ | |
| 748 function < type T > | |
| 749 T simple_sub_select #1.0 < ascii tbl, ascii col > ( I64 row * I32 idx ) | |
| 750 = vdb:simple_sub_select_1; | |
| 751 | |
| 752 | |
| 753 /* extract_token | |
| 754 * extract a textual token from an input string | |
| 755 * | |
| 756 * "idx" [ CONST ] - a zero-based index of the token | |
| 757 * if value < row_len ( tok ), then the substring of | |
| 758 * indexed token is returned. otherwise, returns empty. | |
| 759 * | |
| 760 * "str" [ DATA ] - input text. type must be compatible with | |
| 761 * output production, meaning types must be same, or ascii input | |
| 762 * with utf8 output. | |
| 763 * | |
| 764 * "tok" [ DATA ] - results of tokenizing "str" | |
| 765 */ | |
| 766 function | |
| 767 text_set extract_token #1.0 < U32 idx > ( text_set str, text:token tok ) | |
| 768 = vdb:extract_token; | |
| 769 | |
| 770 | |
| 771 /* strtonum | |
| 772 * convert string to number | |
| 773 * | |
| 774 * "radix" [ CONST, DEFAULT 10 ] | |
| 775 * if not specified, or if given as 0, the default will be 10 | |
| 776 * unless the string begins with "0x" or "0X", in which case radix will be 16 | |
| 777 * octal is NOT inferred ( i.e. leading "0" does not imply octal ) | |
| 778 * | |
| 779 * "str" [ DATA ] - text to be converted | |
| 780 */ | |
| 781 function | |
| 782 numeric_set strtonum #1.0 < * U32 radix > ( text_set str ) | |
| 783 = vdb:strtonum; | |
| 784 | |
| 785 | |
| 786 /* sprintf | |
| 787 * formatted print to a string | |
| 788 * | |
| 789 * formatting rules differ somewhat from C sprintf: | |
| 790 * | |
| 791 * '%' [ <flags> ] [ <field-width> ] [ '.' <precision> ] [ ':' <index> ] <storage-class> | |
| 792 * | |
| 793 * where: | |
| 794 * | |
| 795 * flags | |
| 796 * = ' ' : prepend space to a numeral if it does not have a sign | |
| 797 * | '+' : always produce a sign on numeric conversion | |
| 798 * | '-' : left-align parameter within field | |
| 799 * | '0' : left-pad with zeroes rather than spaces | |
| 800 * | '#' : use "alternate" representation | |
| 801 * | ',' : produce comma-separated triples | |
| 802 * ; | |
| 803 * | |
| 804 * field-width | |
| 805 * = DECIMAL : a base-10 numeral | |
| 806 * | '*' : take field width from args | |
| 807 * ; | |
| 808 * | |
| 809 * precision | |
| 810 * = DECIMAL : a base-10 numeral | |
| 811 * | '*' : take precision from args | |
| 812 * | : an empty precision means 0 | |
| 813 * ; | |
| 814 * | |
| 815 * index | |
| 816 * = idx : a single, zero-based vector element | |
| 817 * | idx '-' idx : a fully-closed, zero-based interval | |
| 818 * | idx '/' len : a start index plus length | |
| 819 * ; | |
| 820 * | |
| 821 * idx | |
| 822 * = DECIMAL : an unsigned base-10 numeral | |
| 823 * | '*' : take index from args | |
| 824 * | '$' : last element in cell | |
| 825 * | : an empty index means 0 or $ | |
| 826 * ; | |
| 827 * | |
| 828 * len | |
| 829 * = DECIMAL : a base-10 numeral | |
| 830 * | '*' : take length from args | |
| 831 * | '$' : row-length of ( cell ) | |
| 832 * | : an empty length means $ | |
| 833 * ; | |
| 834 * | |
| 835 * | |
| 836 * storage-class | |
| 837 * = 'd' | 'i' : general decimal integer | |
| 838 * | 'u' : decimal unsigned integer | |
| 839 * | 'x' : lower-case hex | |
| 840 * | 'X' : upper-case hex | |
| 841 * | 'o' : octal | |
| 842 * | 'b' : binary | |
| 843 * | 'f' : floating point | |
| 844 * | 'e' : scientific notation | |
| 845 * | 'g' : general floating point | |
| 846 * | 'c' | 's' : character | |
| 847 * ; | |
| 848 * | |
| 849 * | |
| 850 * "fmt" [ CONST ] - constant format string, adhering to | |
| 851 * the description above | |
| 852 * | |
| 853 * "p1" [ DATA ] - first param | |
| 854 * this and any subsequent params must correspond to format | |
| 855 * in type/position/number. | |
| 856 */ | |
| 857 function | |
| 858 text_set sprintf #1.0 < ascii fmt > ( any p1, ... ) | |
| 859 = vdb:sprintf; |
