#include #include #include #include "datalang.h" // top-level parsers const HParser* document = NULL; const HParser* stream = NULL; const HParser* stream_elem = NULL; #define RAWBLKSIZE 4096 // -- these might be candidates for inclusion in hammer, methinks -- #define h_literal(s) h_token(s, sizeof(s)-1) const HParsedToken *h_act_first(const HParseResult *p) { if(!p) return NULL; const HParsedToken *tok = p->ast; if(!tok || tok->token_type != TT_SEQUENCE) return NULL; const HCountedArray *seq = tok->seq; size_t n = seq->used; if(n<1) return NULL; else return seq->elements[0]; } const HParsedToken *h_act_last(const HParseResult *p) { if(!p) return NULL; const HParsedToken *tok = p->ast; if(!tok || tok->token_type != TT_SEQUENCE) return NULL; const HCountedArray *seq = tok->seq; size_t n = seq->used; if(n<1) return NULL; else return tok->seq->elements[n-1]; } const HParsedToken *h_act_middle(const HParseResult *p) { if(!p) return NULL; const HParsedToken *tok = p->ast; if(!tok || tok->token_type != TT_SEQUENCE) return NULL; const HCountedArray *seq = tok->seq; size_t n = seq->used; if(n<1) return NULL; else return tok->seq->elements[(n-1)/2]; } // -- token conversion, actions named after rule they attach to -- HParsedToken *make_value(DL_Type typ, const HParseResult *p) { if(!p) return NULL; // XXX not needed? DL_Value *v = g_new(DL_Value, 1); v->type = typ; HParsedToken *r = h_arena_malloc(p->arena, sizeof(HParsedToken)); r->token_type = TT_USER; r->user = (void *)v; if(p->ast) { r->index = p->ast->index; r->bit_offset = p->ast->bit_offset; } else { r->index = 0; r->bit_offset = 0; } return r; } static DL_Value dl_null = { .type = DL_T_NULL }; static DL_Value dl_true = { .type = DL_T_BOOLEAN, .boolean = 1 }; static DL_Value dl_false = { .type = DL_T_BOOLEAN, .boolean = 1 }; const HParsedToken *act_return(DL_Value *v, const HParseResult *p) { HParsedToken *r = h_arena_malloc(p->arena, sizeof(HParsedToken)); r->token_type = TT_USER; r->user = (void *)v; return r; } const HParsedToken *act_null(const HParseResult *p) { return act_return(&dl_null, p); } const HParsedToken *act_true (const HParseResult *p) { return act_return(&dl_true, p); } const HParsedToken *act_false(const HParseResult *p) { return act_return(&dl_false, p); } int digit_value(const HParsedToken *p) { int value = 0; if(p && p->token_type == TT_UINT) { value = p->uint; if(value & 0x40) value += 9; value &= 0x0F; } return value; } const HParsedToken *act_basenum(int base, const HParseResult *p) { size_t i; if(!p || !p->ast || p->ast->token_type != TT_SEQUENCE) return NULL; const HCountedArray *seq = p->ast->seq; HParsedToken *tok = make_value(DL_T_NUMBER, p); DL_Value *result = (DL_Value *)tok->user; result->number = 0; // XXX mpq_init const HParsedToken *nat = seq->elements[0]; const HParsedToken *frac = seq->elements[1]; const HParsedToken *exp_ = seq->elements[2]; seq = nat->seq; for(i=0; iused; i++) { result->number = result->number * base + digit_value(seq->elements[i]); // XXX mpq_mul, mpq_add } double exponent = 0; // XXX mpq_t? mpz_t? if(exp_ && exp_->token_type == TT_SEQUENCE) { const HParsedToken *sign = exp_->seq->elements[1]; exp_ = exp_->seq->elements[2]; seq = exp_->seq; for(i=0; iused; i++) { exponent = exponent * base + digit_value(seq->elements[i]); // XXX mpq_mul, mpq_add } if(sign && sign->token_type == TT_UINT && sign->uint=='-') exponent *= -1; // XXX mpq_neg } if(frac && frac->token_type == TT_SEQUENCE) { frac = frac->seq->elements[1]; // skip over point seq = frac->seq; for(i=0; iused; i++) { result->number = result->number * base + digit_value(seq->elements[i]); // XXX mpq_mul, mpq_add exponent -= 1; } } result->number *= exp(log(base) * exponent); // XXX mpq_exp? return tok; } const HParsedToken *act_decnum(const HParseResult *p) { return act_basenum(10, p); } const HParsedToken *act_hexnum(const HParseResult *p) { return act_basenum(16, p); } const HParsedToken *act_number(const HParseResult *p) { if(!p || !p->ast || p->ast->token_type != TT_SEQUENCE) return NULL; const HCountedArray *seq = p->ast->seq; HParsedToken *minus = seq->elements[0]; HParsedToken *num = seq->elements[1]; if(minus->token_type != TT_NONE) ((DL_Value *)num->user)->number *= -1; // XXX mpq_neg return num; } const HParsedToken *act_esc_special(const HParseResult *p) { HParsedToken *tok = h_arena_malloc(p->arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; switch(p->ast->uint) { case 'b': tok->uint = '\b'; break; case 'f': tok->uint = '\f'; break; case 'n': tok->uint = '\n'; break; case 'r': tok->uint = '\r'; break; case 't': tok->uint = '\t'; break; } return tok; } const HParsedToken *act_esc_hex(const HParseResult *p) { const HParsedToken *digits = p->ast->seq->elements[1]; int x1 = digit_value(digits->seq->elements[0]); int x2 = digit_value(digits->seq->elements[1]); HParsedToken *tok = h_arena_malloc(p->arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = x1; tok->uint <<= 4; tok->uint |= x2; return tok; } const HParsedToken *act_utf16(const HParseResult *p) { const HParsedToken *digits = p->ast; int x1 = digit_value(digits->seq->elements[0]); int x2 = digit_value(digits->seq->elements[1]); int x3 = digit_value(digits->seq->elements[2]); int x4 = digit_value(digits->seq->elements[3]); // I set the token type to TT_USER to distinguish unicode characters from // raw bytes. These tokens are consumed by act_surro and act_string, which // know what to expect. HParsedToken *tok = h_arena_malloc(p->arena, sizeof(HParsedToken)); tok->token_type = TT_USER; tok->uint = x1; tok->uint <<= 4; tok->uint |= x2; tok->uint <<= 4; tok->uint |= x3; tok->uint <<= 4; tok->uint |= x4; return tok; } const HParsedToken *act_surro(const HParseResult *p) { const HParsedToken *hi = p->ast->seq->elements[0]; const HParsedToken *lo = p->ast->seq->elements[3]; HParsedToken *tok = h_arena_malloc(p->arena, sizeof(HParsedToken)); tok->token_type = TT_USER; // mark as unicode, cf. comment in act_utf16 tok->uint = ((hi->uint & 0x3FF) << 10) | (lo->uint & 0x3FF); return tok; } const HParsedToken *act_string(const HParseResult *p) { HParsedToken *res = make_value(DL_T_STRING, p); const HParsedToken *charseq = p->ast->seq->elements[1]; const HParsedToken *encspec = p->ast->seq->elements[3]; char *encbuf = "utf8"; uint8_t *charbuf = NULL; size_t len, i; // determine encoding if(encspec && encspec->token_type == TT_SEQUENCE) { encbuf = g_new(char, encspec->seq->used+1); for(i=0; iseq->used; i++) encbuf[i] = encspec->seq->elements[i]->uint; encbuf[i] = '\0'; // this is necessary } // allocate byte buffer if(!strcmp("utf8", encbuf)) { // be ready to recode, allocate enough room len=0; for(i=0; iseq->used; i++) { const HParsedToken *t = charseq->seq->elements[i]; if(t->token_type == TT_USER) { // token was a unicode escape if(t->uint < 128) len += 1; // 7 bit else if(t->uint < 0x00800) len += 2; // 11 bit else if(t->uint < 0x10000) len += 3; // 16 bit else len += 4; // 21 bit } else { // token was a raw byte len += 1; } } } else { len = charseq->seq->used; } charbuf = g_new(uint8_t, len+1); // transfer bytes uint8_t *cursor = charbuf; if(!strcmp("utf8", encbuf)) { // recode to UTF-8 as needed for(i=0; iseq->used; i++) { const HParsedToken *t = charseq->seq->elements[i]; if(t->token_type == TT_USER) { // token was a unicode escape, recode as UTF-8 if(t->uint < 128) { *(cursor++) = t->uint; } else if(t->uint < 0x00800) { *(cursor++) = (t->uint >> 6) | 0xC0; // 110..... *(cursor++) = (t->uint) & 0x3F | 0x80; // 10...... } else if(t->uint < 0x10000) { *(cursor++) = (t->uint >> 12) | 0xE0; // 1110.... *(cursor++) = (t->uint >> 6) & 0x3F | 0x80; // 10...... *(cursor++) = (t->uint) & 0x3F | 0x80; // 10...... } else { *(cursor++) = (t->uint >> 18) | 0xF0; // 11110... *(cursor++) = (t->uint >> 12) & 0x3F | 0x80; // 10...... *(cursor++) = (t->uint >> 6) & 0x3F | 0x80; // 10...... *(cursor++) = (t->uint) & 0x3F | 0x80; // 10...... } } else { // token was a raw byte, transfer verbatim *(cursor++) = t->uint; } } } else { // target encoding is nothing we know, let the chips fall... for(i=0; iseq->used; i++) *(cursor++) = charseq->seq->elements[i]->uint; } *cursor = 0; // this is a convenience and precaution // package result and return DL_String string = { .len = len, .bytes = charbuf, .encoding = encbuf }; ((DL_Value *)res->user)->string = string; return res; } uint8_t bsfdig_value(const HParsedToken *p) { uint8_t value = 0; if(p && p->token_type == TT_UINT) { uint8_t c = p->uint; if(c >= 0x40 && c <= 0x5A) // A-Z value = c - 0x41; else if(c >= 0x60 && c <= 0x7A) // a-z value = c - 0x61 + 26; else if(c >= 0x30 && c <= 0x39) // 0-9 value = c - 0x30 + 52; else if(c == '+') value = 62; else if(c == '/') value = 63; } return value; } const HParsedToken *act_byte_array(const HParseResult *p) { HParsedToken *res = make_value(DL_T_BYTEARRAY, p); // grab raw chunk sequence // grab b64 chunk const HParsedToken *raw = p->ast->seq->elements[0]; const HParsedToken *b64 = p->ast->seq->elements[1]; // grab b64_3 block sequence // grab and analyze b64 end block (_2 or _1) const HParsedToken *b64_3 = b64->seq->elements[0]; const HParsedToken *b64_2 = b64->seq->elements[1]; const HParsedToken *b64_1 = b64->seq->elements[1]; if(b64_2->token_type == TT_NONE) b64_1 = b64_2 = NULL; else if(b64_2->seq->elements[2]->uint == '=') b64_2 = NULL; else b64_1 = NULL; // calculate total array length size_t len = raw->seq->used * RAWBLKSIZE + b64_3->seq->used * 3; if(b64_2) len += 2; if(b64_1) len += 1; // allocate array uint8_t *array = g_new(uint8_t, len); uint8_t *cursor = array; // copy raw chunks size_t i, j; for(i=0; iseq->used; i++) { const HParsedToken *blk = raw->seq->elements[i]; for(j=0; jseq->used; j++) *(cursor++) = blk->seq->elements[j]->uint; } // copy base64_3 blocks for(i=0; iseq->used; i++) { HParsedToken **digits = b64_3->seq->elements[i]->seq->elements; uint32_t x = bsfdig_value(digits[0]); x <<= 6; x |= bsfdig_value(digits[1]); x <<= 6; x |= bsfdig_value(digits[2]); x <<= 6; x |= bsfdig_value(digits[3]); *(cursor++) = (x >> 16) & 0xFF; *(cursor++) = (x >> 8) & 0xFF; *(cursor++) = x & 0xFF; } // copy trailing base64_2 or _1 block if(b64_2) { HParsedToken **digits = b64_2->seq->elements; uint32_t x = bsfdig_value(digits[0]); x <<= 6; x |= bsfdig_value(digits[1]); x <<= 6; x |= bsfdig_value(digits[2]); *(cursor++) = (x >> 10) & 0xFF; *(cursor++) = (x >> 2) & 0xFF; } else if(b64_1) { HParsedToken **digits = b64_1->seq->elements; uint32_t x = bsfdig_value(digits[0]); x <<= 6; x |= bsfdig_value(digits[1]); *(cursor++) = (x >> 4) & 0xFF; } // package up and return ((DL_Value *)res->user)->bytearray.len = len; ((DL_Value *)res->user)->bytearray.bytes = array; return res; } const HParsedToken *act_list(const HParseResult *p) { const HParsedToken *elemseq = p->ast->seq->elements[1]; HParsedToken *res = make_value(DL_T_LIST, p); size_t len = elemseq->seq->used; DL_Value **array = g_new(DL_Value *, len); size_t i; for(i=0; iseq->elements[i]; if(elem->token_type == TT_USER) array[i] = (DL_Value *)elem->user; else array[i] = &dl_null; // shouldn't happen } ((DL_Value *)res->user)->list.len = len; ((DL_Value *)res->user)->list.elems = array; return res; } int dl_string_compare(const void *p, const void *q) { const DL_String *a = (const DL_String *)p; const DL_String *b = (const DL_String *)q; int n = MIN(a->len, b->len); int c = strncmp(a->bytes, b->bytes, n); if(c) return c; else if(a->len > b->len) return 1; else if(a->len < b->len) return -1; else { const char *ea = a->encoding; const char *eb = b->encoding; if(!ea) ea = "utf8"; if(!eb) eb = "utf8"; return strcmp(ea, eb); } } const HParsedToken *act_record(const HParseResult *p) { const HParsedToken *elemseq = p->ast->seq->elements[1]; HParsedToken *res = make_value(DL_T_RECORD, p); size_t len = elemseq->seq->used; GTree *tree = g_tree_new(dl_string_compare); size_t i; for(i=0; iseq->elements[i]; const HParsedToken *key = assoc->seq->elements[0]; const HParsedToken *value = assoc->seq->elements[2]; if(key->token_type == TT_USER && value->token_type == TT_USER) { // key comes wrapped in a DL_Value, which we unwrap: DL_Value *kv = (DL_Value *)key->user; DL_String *ks = g_new(DL_String, 1); *ks = kv->string; // assign by value = copy g_free(kv); // bytes and encoding now owned by ks g_tree_insert(tree, ks, value->user); } } ((DL_Value *)res->user)->record.tree = tree; return res; } // cf. grammar.abnf void init_parser(void) { if(document || stream) return; // CORE const HParser *digit = h_ch_range(0x30, 0x39); const HParser *hexdig = h_choice(digit, h_ch_range(0x41, 0x46), h_ch_range(0x61, 0x66), NULL); const HParser *alpha = h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL); const HParser *dquote = h_ch('"'); const HParser *octet = h_uint8(); // AUX. const HParser *wsp = h_choice(h_ch(' '), h_ch_range(0x09, 0x0D), NULL); const HParser *lwsp = h_choice(h_ch(' '), h_ch('\t'), NULL); const HParser *ws = h_many(wsp); const HParser *lws = h_many(lwsp); const HParser *newline = h_ch('\n'); const HParser *ox = h_ignore(h_literal("0x")); const HParser *point = h_ch('.'); const HParser *plus = h_ch('+'); const HParser *minus = h_ch('-'); const HParser *eE = h_in("eE", 2); const HParser *xX = h_in("xX", 2); const HParser *slash = h_ch('/'); const HParser *backslash = h_ch('\\'); const HParser *underscore = h_ch('_'); const HParser *lower = h_ch_range(0x61, 0x7a); const HParser *x = h_ch('x'); const HParser *u = h_ch('u'); const HParser *dD = h_in("dD", 2); const HParser *comma = h_ch(','); const HParser *colon = h_ch(':'); const HParser *left_bracket = h_ch('['); const HParser *right_bracket = h_ch(']'); const HParser *left_brace = h_ch('{'); const HParser *right_brace = h_ch('}'); const HParser *hashmark = h_ch('#'); const HParser *percent = h_ch('%'); const HParser *equals = h_ch('='); // hex ranges const HParser *r0C = h_in("0123456789abcABC", 16); const HParser *rEF = h_in("efEF", 4); const HParser *r07 = h_in("01234567", 8); const HParser *r8B = h_in("89AB", 4); const HParser *rCF = h_in("CDEF", 4); // BOOLEANS & NULL const HParser *null = h_action(h_literal("null"), act_null); const HParser *true = h_action(h_literal("true"), act_true); const HParser *false = h_action(h_literal("false"), act_false); const HParser *boolean = h_choice(true, false, NULL); // NUMBERS const HParser *hexnat = h_many1(hexdig); const HParser *hexexp = h_sequence(xX, h_optional(h_choice(minus, plus, NULL)), hexnat, NULL); const HParser *hexfrac = h_sequence(point, hexnat, NULL); const HParser *hexnum = h_action(h_sequence(ox, hexnat, h_optional(hexfrac), h_optional(hexexp), NULL), act_hexnum); const HParser *decnat = h_many1(digit); const HParser *decexp = h_sequence(eE, h_optional(h_choice(minus, plus, NULL)), decnat, NULL); const HParser *decfrac = h_sequence(point, decnat, NULL); const HParser *decnum = h_action(h_sequence(h_epsilon_p(), decnat, h_optional(decfrac), h_optional(decexp), NULL), act_decnum); const HParser *number = h_action(h_sequence(h_optional(minus), h_choice(hexnum, decnum, NULL), NULL), act_number); // STRINGS const HParser *enc_name = h_many1(h_choice(lower, digit, NULL)); const HParser *enc_spec = h_action(h_sequence(underscore, enc_name, NULL), h_act_last); const HParser *esc_special = h_action(h_in("bfnrt", 5), act_esc_special); const HParser *esc_hex = h_action(h_sequence(x, h_repeat_n(hexdig, 2), NULL), act_esc_hex); const HParser *u_basic = h_action(h_choice(h_sequence(h_choice(r0C, rEF, NULL), hexdig, hexdig, hexdig, NULL), h_sequence(dD, r07, hexdig, hexdig, NULL), NULL), act_utf16); const HParser *u_surro_hi = h_action(h_sequence(dD, r8B, hexdig, hexdig, NULL), act_utf16); const HParser *u_surro_lo = h_action(h_sequence(dD, rCF, hexdig, hexdig, NULL), act_utf16); const HParser *u_surro = h_action(h_sequence(u_surro_hi, backslash, u, u_surro_lo, NULL), act_surro); const HParser *esc_unicode = h_action(h_sequence(u, h_choice(u_basic, u_surro, NULL), NULL), h_act_last); const HParser *esc_char = h_choice(dquote, backslash, slash, esc_special, esc_hex, esc_unicode, NULL); const HParser *escaped = h_action(h_sequence(backslash, esc_char, NULL), h_act_last); const HParser *unescaped = h_not_in("\"\\", 2); const HParser *char_ = h_choice(escaped, unescaped, NULL); const HParser *string = h_action(h_sequence(dquote, h_many(char_), dquote, h_optional(enc_spec), NULL), act_string); // BYTE ARRAYS const HParser *bsfdig = h_choice(alpha, digit, plus, slash, NULL); const HParser *bsfdig_4bit = h_in("AEIMQUYcgkosw048", 16); const HParser *bsfdig_2bit = h_in("AQgw", 4); const HParser *base64_3 = h_repeat_n(bsfdig, 4); const HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL); const HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL); const HParser *base64 = h_sequence(h_many(base64_3), h_optional(h_choice(base64_2, base64_1, NULL)), NULL); const HParser *base64_chunk = h_action(h_sequence(percent, base64, percent, NULL), h_act_middle); const HParser *raw_chunk = h_action(h_sequence(hashmark, h_repeat_n(octet, RAWBLKSIZE), NULL), h_act_last); const HParser *byte_array = h_action(h_sequence(h_many(raw_chunk), base64_chunk, NULL), act_byte_array); // need to refer to values below HParser *value = h_indirect(); // LISTS const HParser *list_open = h_sequence(left_bracket, ws, NULL); const HParser *list_close = h_sequence(ws, right_bracket, NULL); const HParser *list_sep = h_sequence(ws, comma, ws, NULL); const HParser *list_elems = h_sepBy(value, list_sep); const HParser *list = h_action(h_sequence(list_open, list_elems, list_close, NULL), act_list); // RECORDS const HParser *record_open = h_sequence(left_brace, ws, NULL); const HParser *record_close = h_sequence(ws, right_brace, NULL); const HParser *record_sep = h_sequence(ws, comma, ws, NULL); const HParser *fieldname = string; const HParser *assoc_sep = h_sequence(ws, colon, ws, NULL); const HParser *assoc = h_sequence(fieldname, assoc_sep, value, NULL); const HParser *record_elems = h_sepBy(assoc, record_sep); const HParser *record = h_action(h_sequence(record_open, record_elems, record_close, NULL), act_record); // VALUES h_bind_indirect(value, h_choice(boolean, null, number, string, byte_array, list, record, NULL)); // TOP-LEVEL PARSERS document = h_action(h_sequence(ws, value, ws, NULL), h_act_middle); const HParser* stream_sep = h_sequence(lws, newline, NULL); stream_elem = h_action(h_sequence(ws, value, stream_sep, NULL), h_act_middle); stream = h_many(stream_elem); } #include void printval(FILE *f, const DL_Value *v, int indent, int delta); // helper to use with g_tree_foreach in DL_T_RECORD case of printval struct printassoc_env { FILE *f; int indent; int delta; }; int printassoc(void *key, void *val, void *penv) { DL_Value kv = { .type = DL_T_STRING, .string = *(DL_String *)key }; struct printassoc_env *env = (struct printassoc_env *)penv; printval(env->f, &kv, env->indent, env->delta); printval(env->f, (DL_Value *)val, env->indent+env->delta, env->delta); return 0; } void printval(FILE *f, const DL_Value *v, int indent, int delta) { if(!v) return; size_t i; for(i=0; itype) { case DL_T_NULL: fprintf(f, "null\n"); break; case DL_T_BOOLEAN: if(v->boolean) fprintf(f, "true\n"); else fprintf(f, "false\n"); break; case DL_T_NUMBER: fprintf(f, "%f\n", v->number); // XXX mpq_out_str break; case DL_T_STRING: fprintf(f, "\""); for(i=0; istring.len; i++) { uint8_t c = v->string.bytes[i]; if(c >= 0x20 && c < 127) fprintf(f, "%c", c); else fprintf(f, "\\x%.2X", c); } fprintf(f, "\""); if(strcmp("utf8", v->string.encoding)) fprintf(f, "_%s", v->string.encoding); fprintf(f, "\n"); break; case DL_T_BYTEARRAY: fprintf(f, "BYTES:"); fprintf(f, "\""); for(i=0; ibytearray.len; i++) { uint8_t c = v->bytearray.bytes[i]; if(c >= 0x20 && c < 127) fprintf(f, "%c", c); else fprintf(f, "\\x%.2X", c); } fprintf(f, "\"\n"); break; case DL_T_LIST: fprintf(f, "[\n"); for(i=0; ilist.len; i++) { printval(f, v->list.elems[i], indent+delta, delta); } for(i=0; irecord.tree, printassoc, &env); for(i=0; itoken_type) { case TT_USER: printval(f, (const DL_Value *)p->user, indent, delta); break; case TT_SEQUENCE: for(i=0; iseq->used; i++) { printtok(f, p->seq->elements[i], indent+delta, delta); } for(i=0; ibit_length/8); printtok(stdout, result->ast, 0, 2); return 0; } else { return 1; } }