From e14b0993b101839d2d40b5c4f184e6b0c2083b65 Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Tue, 12 Apr 2022 23:03:32 +0200 Subject: [PATCH] syntax: implement support for ES6 template literals Implement support for ECMAScript 6 template literals which allow simple interpolation of variable values into strings without resorting to `sprintf()` or manual string concatenation. Signed-off-by: Jo-Philipp Wich --- compiler.c | 23 +++++ include/ucode/lexer.h | 8 ++ include/ucode/util.h | 5 + lexer.c | 81 ++++++++++++++-- tests/custom/00_syntax/27_template_literals | 102 ++++++++++++++++++++ 5 files changed, 209 insertions(+), 10 deletions(-) create mode 100644 tests/custom/00_syntax/27_template_literals diff --git a/compiler.c b/compiler.c index 80b873d4..d4725b1f 100644 --- a/compiler.c +++ b/compiler.c @@ -33,6 +33,7 @@ static void uc_compiler_compile_paren(uc_compiler_t *compiler); static void uc_compiler_compile_call(uc_compiler_t *compiler); static void uc_compiler_compile_post_inc(uc_compiler_t *compiler); static void uc_compiler_compile_constant(uc_compiler_t *compiler); +static void uc_compiler_compile_template(uc_compiler_t *compiler); static void uc_compiler_compile_comma(uc_compiler_t *compiler); static void uc_compiler_compile_labelexpr(uc_compiler_t *compiler); static void uc_compiler_compile_function(uc_compiler_t *compiler); @@ -72,6 +73,7 @@ uc_compiler_parse_rules[TK_ERROR + 1] = { [TK_NULL] = { uc_compiler_compile_constant, NULL, P_NONE }, [TK_THIS] = { uc_compiler_compile_constant, NULL, P_NONE }, [TK_REGEXP] = { uc_compiler_compile_constant, NULL, P_NONE }, + [TK_TEMPLATE] = { uc_compiler_compile_template, NULL, P_NONE }, [TK_COMMA] = { NULL, uc_compiler_compile_comma, P_COMMA }, [TK_LABEL] = { uc_compiler_compile_labelexpr, NULL, P_NONE }, [TK_FUNC] = { uc_compiler_compile_function, NULL, P_NONE }, @@ -1483,6 +1485,27 @@ uc_compiler_compile_constant(uc_compiler_t *compiler) } } +static void +uc_compiler_compile_template(uc_compiler_t *compiler) 
+{ + uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv); + + while (true) { + if (uc_compiler_parse_match(compiler, TK_TEMPLATE)) { + uc_compiler_emit_constant(compiler, compiler->parser->prev.pos, compiler->parser->prev.uv); + uc_compiler_emit_insn(compiler, 0, I_ADD); + } + else if (uc_compiler_parse_match(compiler, TK_PLACEH)) { + uc_compiler_compile_expression(compiler); + uc_compiler_emit_insn(compiler, 0, I_ADD); + uc_compiler_parse_consume(compiler, TK_RBRACE); + } + else { + break; + } + } +} + static void uc_compiler_compile_comma(uc_compiler_t *compiler) { diff --git a/include/ucode/lexer.h b/include/ucode/lexer.h index 134f5ef8..835bc2b6 100644 --- a/include/ucode/lexer.h +++ b/include/ucode/lexer.h @@ -115,6 +115,8 @@ typedef enum { TK_ASOR, TK_ASNULLISH, TK_NULLISH, + TK_PLACEH, + TK_TEMPLATE, TK_EOF, TK_ERROR @@ -129,6 +131,7 @@ typedef enum { UC_LEX_BLOCK_COMMENT, UC_LEX_IDENTIFY_TOKEN, UC_LEX_PARSE_TOKEN, + UC_LEX_PLACEHOLDER, UC_LEX_EOF } uc_lex_state_t; @@ -144,6 +147,7 @@ typedef struct { uc_source_t *source; uint8_t eof:1; uint8_t is_escape:1; + uint8_t is_placeholder:1; uint8_t no_regexp:1; uint8_t no_keyword:1; size_t buflen; @@ -168,6 +172,10 @@ typedef struct { STATEMENTS = '%', COMMENT = '#' } block; + struct { + size_t count; + size_t *entries; + } templates; } uc_lexer_t; diff --git a/include/ucode/util.h b/include/ucode/util.h index 3203499e..093951e2 100644 --- a/include/ucode/util.h +++ b/include/ucode/util.h @@ -68,6 +68,11 @@ #define uc_vector_last(vec) \ (&((vec)->entries[(vec)->count - 1])) +#define uc_vector_push(vec, val) do { \ + uc_vector_grow(vec); \ + (vec)->entries[(vec)->count++] = (val); \ +} while(0) + /* "failsafe" utility functions */ diff --git a/lexer.c b/lexer.c index 5fe7f6bc..9ccc3aeb 100644 --- a/lexer.c +++ b/lexer.c @@ -107,6 +107,7 @@ static const struct token tokens[] = { { TK_ARROW, { .pat = "=>" }, 2, NULL }, { TK_NULLISH, { .pat = "??" 
}, 2, NULL }, { TK_QDOT, { .pat = "?." }, 2, NULL }, + { TK_PLACEH, { .pat = "${" }, 2, NULL }, { TK_ADD, { .pat = "+" }, 1, NULL }, { TK_ASSIGN, { .pat = "=" }, 1, NULL }, { TK_BAND, { .pat = "&" }, 1, NULL }, @@ -138,6 +139,9 @@ static const struct token tokens[] = { { TK_LABEL, { .pat = "az" }, 0, parse_label }, { TK_LABEL, { .pat = "AZ" }, 0, parse_label }, { TK_NUMBER, { .pat = "09" }, 0, parse_number }, + + /* NB: this must be last for simple retrieval */ + { TK_TEMPLATE, { .pat = "`" }, 1, parse_string } }; static const struct keyword reserved_words[] = { @@ -313,6 +317,22 @@ parse_string(uc_lexer_t *lex) return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) { + /* continuation of placeholder start */ + if (lex->is_placeholder) { + if (*ptr == '{') { + buf_consume(lex, 1); + rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); + + if (!rv) + rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); + + return rv; + } + + lex->is_placeholder = false; + lookbehind_append(lex, "$", 1); + } + /* continuation of escape sequence */ if (lex->is_escape) { if (lex->esclen == 0) { @@ -486,10 +506,10 @@ parse_string(uc_lexer_t *lex) lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); buf_consume(lex, (ptr + 1) - lex->bufstart); - rv = lookbehind_to_text(lex, lex->lastoff, TK_STRING, NULL); + rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); if (!rv) - rv = emit_op(lex, lex->lastoff, TK_STRING, ucv_string_new_length("", 0)); + rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); return rv; } @@ -500,6 +520,13 @@ parse_string(uc_lexer_t *lex) lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); buf_consume(lex, (ptr - lex->bufstart) + 1); } + + /* potential placeholder start */ + else if (q == '`' && *ptr == '$') { + lex->is_placeholder = true; + lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); + 
buf_consume(lex, (ptr - lex->bufstart) + 1); + } } lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); @@ -721,7 +748,7 @@ lex_step(uc_lexer_t *lex, FILE *fp) uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; union { uint32_t n; char str[4]; } search; const struct token *tok; - size_t rlen, rem; + size_t rlen, rem, *nest; char *ptr, c; uc_token_t *rv; size_t i; @@ -966,6 +993,26 @@ lex_step(uc_lexer_t *lex, FILE *fp) lex->block = NONE; } + /* track opening braces */ + else if (tok->type == TK_LBRACE && lex->templates.count > 0) { + nest = uc_vector_last(&lex->templates); + (*nest)++; + } + + /* check end of placeholder expression */ + else if (tok->type == TK_RBRACE && lex->templates.count > 0) { + nest = uc_vector_last(&lex->templates); + + if (*nest == 0) { + lex->templates.count--; + lex->state = UC_LEX_PARSE_TOKEN; + lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */ + } + else { + (*nest)--; + } + } + /* do not report statement tags to the parser */ if (tok->type != 0 && tok->type != TK_LSTM) rv = emit_op(lex, lex->source->off, @@ -1001,7 +1048,8 @@ lex_step(uc_lexer_t *lex, FILE *fp) if (rv) { memset(lex->esc, 0, sizeof(lex->esc)); - lex->state = UC_LEX_IDENTIFY_TOKEN; + lex->state = lex->is_placeholder ? 
UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN; + lex->is_placeholder = false; lex->tok = NULL; if (rv == UC_LEX_CONTINUE_PARSING) @@ -1013,6 +1061,14 @@ lex_step(uc_lexer_t *lex, FILE *fp) break; + case UC_LEX_PLACEHOLDER: + lex->state = UC_LEX_IDENTIFY_TOKEN; + + uc_vector_push(&lex->templates, 0); + + return emit_op(lex, lex->source->off, TK_PLACEH, NULL); + + case UC_LEX_EOF: break; } @@ -1051,6 +1107,9 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) lex->lastoff = 0; + lex->templates.count = 0; + lex->templates.entries = NULL; + if (config && config->raw_mode) { lex->state = UC_LEX_IDENTIFY_TOKEN; lex->block = STATEMENTS; @@ -1060,6 +1119,7 @@ uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) void uc_lexer_free(uc_lexer_t *lex) { + uc_vector_clear(&lex->templates); uc_source_put(lex->source); free(lex->lookbehind); @@ -1095,12 +1155,13 @@ uc_tokenname(unsigned type) size_t i; switch (type) { - case 0: return "End of file"; - case TK_STRING: return "String"; - case TK_LABEL: return "Label"; - case TK_NUMBER: return "Number"; - case TK_DOUBLE: return "Double"; - case TK_REGEXP: return "Regexp"; + case 0: return "End of file"; + case TK_TEMPLATE: return "Template"; + case TK_STRING: return "String"; + case TK_LABEL: return "Label"; + case TK_NUMBER: return "Number"; + case TK_DOUBLE: return "Double"; + case TK_REGEXP: return "Regexp"; } for (i = 0; i < ARRAY_SIZE(tokens); i++) { diff --git a/tests/custom/00_syntax/27_template_literals b/tests/custom/00_syntax/27_template_literals new file mode 100644 index 00000000..40fa9ce0 --- /dev/null +++ b/tests/custom/00_syntax/27_template_literals @@ -0,0 +1,102 @@ +The ucode language supports ES6 template literals for easy interpolation +of expression results into strings. + + +1. Simple template literals are equivalent to strings. + +-- Testcase -- +{{ `foo` === 'foo' }} +-- End -- + +-- Expect stdout -- +true +-- End -- + + +2. 
Template literals may embed expressions using `${...}` placeholder notation. + +-- Testcase -- +{% + let x = 2; + let y = 4; + + print(`The result of ${x} * ${y} is ${x * y}\n`); +%} +-- End -- + +-- Expect stdout -- +The result of 2 * 4 is 8 +-- End -- + + +3. Template literals may be nested. + +-- Testcase -- +{% + let isFoo = false; + let isBar = true; + + print(`Foo is ${isFoo} and ${isBar ? `bar is ${isBar}` : `nothing else`}!\n`); +%} +-- End -- + +-- Expect stdout -- +Foo is false and bar is true! +-- End -- + + +4. Placeholder expression results are implicitly stringified. + +-- Testcase -- +{% + let o1 = { foo: true }; + let o2 = proto({ color: "red" }, { tostring: function() { return `I am a ${this.color} object` } }); + + print(`The first object is ${o1} and the second says "${o2}".\n`); +%} +-- End -- + +-- Expect stdout -- +The first object is { "foo": true } and the second says "I am a red object". +-- End -- + + +5. Escaping either `$` or `{` prevents interpolation as placeholder, sole `$` + characters bear no special meaning. + +-- Testcase -- +{% + printf("%.J\n", [ + `foo \${bar} baz`, + `foo $\{bar} baz`, + `foo $bar baz` + ]); +%} +-- End -- + +-- Expect stdout -- +[ + "foo ${bar} baz", + "foo ${bar} baz", + "foo $bar baz" +] +-- End -- + + +6. Unterminated placeholder expressions are a syntax error. + +-- Testcase -- +{{ + `foo ${ bar` +}} +-- End -- + +-- Expect stderr -- +Syntax error: Unterminated string +In line 2, byte 13: + + ` `foo ${ bar`` + Near here -----^ + + +-- End --