From b5212ceaebdecdd47288018d7291207b3c2598fe Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 2 Feb 2020 23:28:55 +0100 Subject: [PATCH 01/65] awk: fix more "length" cases, closes 12486 function old new delta next_token 808 831 +23 Signed-off-by: Denys Vlasenko (cherry picked from commit bd8b05ba1b0901bbd6a913dfd5186ac7c8beffed) --- editors/awk.c | 22 ++++++++++++++++++---- testsuite/awk.tests | 23 ++++++++++++++++++++++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index d25508e5d..e58c72700 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -272,7 +272,8 @@ typedef struct tsplitter_s { /* if previous token class is CONCAT1 and next is CONCAT2, concatenation */ /* operator is inserted between them */ #define TC_CONCAT1 (TC_VARIABLE | TC_ARRTERM | TC_SEQTERM \ - | TC_STRING | TC_NUMBER | TC_UOPPOST) + | TC_STRING | TC_NUMBER | TC_UOPPOST \ + | TC_LENGTH) #define TC_CONCAT2 (TC_OPERAND | TC_UOPPRE) #define OF_RES1 0x010000 @@ -1070,8 +1071,10 @@ static uint32_t next_token(uint32_t expected) const uint32_t *ti; if (t_rollback) { + debug_printf_parse("%s: using rolled-back token\n", __func__); t_rollback = FALSE; } else if (concat_inserted) { + debug_printf_parse("%s: using concat-inserted token\n", __func__); concat_inserted = FALSE; t_tclass = save_tclass; t_info = save_info; @@ -1200,7 +1203,11 @@ static uint32_t next_token(uint32_t expected) goto readnext; /* insert concatenation operator when needed */ - if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP)) { + debug_printf_parse("%s: %x %x %x concat_inserted?\n", __func__, + (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP)); + if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) + && !(ltclass == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." 
*/ + ) { concat_inserted = TRUE; save_tclass = tc; save_info = t_info; @@ -1208,6 +1215,7 @@ static uint32_t next_token(uint32_t expected) t_info = OC_CONCAT | SS | P(35); } + debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, t_tclass); t_tclass = tc; } ltclass = t_tclass; @@ -1218,6 +1226,7 @@ static uint32_t next_token(uint32_t expected) EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); } + debug_printf_parse("%s: returning, ltclass:%x t_double:%f\n", __func__, ltclass, t_double); return ltclass; #undef concat_inserted #undef save_tclass @@ -1282,7 +1291,7 @@ static node *parse_expr(uint32_t iexp) glptr = NULL; } else if (tc & (TC_BINOP | TC_UOPPOST)) { - debug_printf_parse("%s: TC_BINOP | TC_UOPPOST\n", __func__); + debug_printf_parse("%s: TC_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); /* for binary and postfix-unary operators, jump back over * previous operators with higher priority */ vn = cn; @@ -1387,7 +1396,12 @@ static node *parse_expr(uint32_t iexp) case TC_LENGTH: debug_printf_parse("%s: TC_LENGTH\n", __func__); - next_token(TC_SEQSTART | TC_OPTERM | TC_GRPTERM); + next_token(TC_SEQSTART /* length(...) */ + | TC_OPTERM /* length; (or newline)*/ + | TC_GRPTERM /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); rollback_token(); if (t_tclass & TC_SEQSTART) { /* It was a "(" token. 
Handle just like TC_BUILTIN */ diff --git a/testsuite/awk.tests b/testsuite/awk.tests index a7a533ba0..b5008290f 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -85,7 +85,8 @@ testing "awk floating const with leading zeroes" \ "" "\n" # long field seps requiring regex -testing "awk long field sep" "awk -F-- '{ print NF, length(\$NF), \$NF }'" \ +testing "awk long field sep" \ + "awk -F-- '{ print NF, length(\$NF), \$NF }'" \ "2 0 \n3 0 \n4 0 \n5 0 \n" \ "" \ "a--\na--b--\na--b--c--\na--b--c--d--" @@ -317,6 +318,26 @@ testing "awk length()" \ "3\n3\n3\n3\n" \ "" "qwe" +testing "awk print length, 1" \ + "awk '{ print length, 1 }'" \ + "0 1\n" \ + "" "\n" + +testing "awk print length 1" \ + "awk '{ print length 1 }'" \ + "01\n" \ + "" "\n" + +testing "awk length == 0" \ + "awk 'length == 0 { print \"foo\" }'" \ + "foo\n" \ + "" "\n" + +testing "awk if (length == 0)" \ + "awk '{ if (length == 0) { print \"bar\" } }'" \ + "bar\n" \ + "" "\n" + testing "awk -f and ARGC" \ "awk -f - input" \ "re\n2\n" \ -- 2.27.0 From 2a117a751e6747287c2df92ae9635d12c065a70c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 9 Jun 2020 01:33:54 +0200 Subject: [PATCH 02/65] awk: disallow "str"++, closes bug 12981 function old new delta parse_expr 887 896 +9 Signed-off-by: Denys Vlasenko (cherry picked from commit 6f7a0096496a5a9e90638dc01e947015cc776110) --- editors/awk.c | 4 +++- testsuite/awk.tests | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/editors/awk.c b/editors/awk.c index e58c72700..c693aa505 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1359,8 +1359,10 @@ static node *parse_expr(uint32_t iexp) v = cn->l.v = xzalloc(sizeof(var)); if (tc & TC_NUMBER) setvar_i(v, t_double); - else + else { setvar_s(v, t_string); + xtc &= ~TC_UOPPOST; /* "str"++ is not allowed */ + } break; case TC_REGEXP: diff --git a/testsuite/awk.tests b/testsuite/awk.tests index b5008290f..87f6b5007 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ 
-390,5 +390,13 @@ testing 'awk negative field access' \ '' \ 'anything' +# was misinterpreted as (("str"++) i) instead of ("str" (++i)) +# (and was executed: "str"++ is "0", thus concatenating "0" and "1"): +testing 'awk do not allow "str"++' \ + 'awk -v i=1 "BEGIN {print \"str\" ++i}"' \ + "str2\n" \ + '' \ + 'anything' + exit $FAILCOUNT -- 2.27.0 From edcd27a4fbfc93e0f8d271f6090718f921176307 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 16 Nov 2020 10:40:32 +0100 Subject: [PATCH 03/65] awk: fix dodgy multi-char separators splitting logic function old new delta awk_split 521 484 -37 Signed-off-by: Denys Vlasenko (cherry picked from commit 5323af7f51808d5ff35c624ba70bdae4807f3717) --- editors/awk.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index c693aa505..7f5dace9e 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1765,10 +1765,9 @@ static void fsrealloc(int size) static int awk_split(const char *s, node *spl, char **slist) { - int l, n; + int n; char c[4]; char *s1; - regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... /* in worst case, each char would be a separate field */ *slist = s1 = xzalloc(strlen(s) * 2 + 3); @@ -1785,12 +1784,18 @@ static int awk_split(const char *s, node *spl, char **slist) return n; /* "": zero fields */ n++; /* at least one field will be there */ do { + int l; + regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... + l = strcspn(s, c+2); /* len till next NUL or \n */ if (regexec(icase ? 
spl->r.ire : spl->l.re, s, 1, pmatch, 0) == 0 && pmatch[0].rm_so <= l ) { l = pmatch[0].rm_so; if (pmatch[0].rm_eo == 0) { + /* For example, happens when FS can match + * an empthy string (awk -F ' *') + */ l++; pmatch[0].rm_eo++; } @@ -1800,14 +1805,16 @@ static int awk_split(const char *s, node *spl, char **slist) if (s[l]) pmatch[0].rm_eo++; } - memcpy(s1, s, l); - /* make sure we remove *all* of the separator chars */ - do { - s1[l] = '\0'; - } while (++l < pmatch[0].rm_eo); - nextword(&s1); + s1 = mempcpy(s1, s, l); + *s1++ = '\0'; s += pmatch[0].rm_eo; } while (*s); + + /* echo a-- | awk -F-- '{ print NF, length($NF), $NF }' + * should print "2 0 ": + */ + *s1 = '\0'; + return n; } if (c[0] == '\0') { /* null split */ @@ -2011,7 +2018,7 @@ static int ptest(node *pattern) static int awk_getline(rstream *rsm, var *v) { char *b; - regmatch_t pmatch[2]; + regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... int size, a, p, pp = 0; int fd, so, eo, r, rp; char c, *m, *s; -- 2.27.0 From f87d87eb8da06660b7badd274640deb57023b74d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 2 Dec 2020 19:07:31 +0100 Subject: [PATCH 04/65] awk: FS regex matches only non-empty separators (gawk compat) function old new delta awk_split 484 553 +69 Signed-off-by: Denys Vlasenko (cherry picked from commit 665a65953076ea21be49250b8279ddb1f0f99f38) --- editors/awk.c | 33 +++++++++++++++++++++++++-------- testsuite/awk.tests | 7 +++++++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 7f5dace9e..2a3e29db1 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1763,6 +1763,29 @@ static void fsrealloc(int size) nfields = size; } +static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[]) +{ + int r = regexec(preg, s, 1, pmatch, 0); + if (r == 0 && pmatch[0].rm_eo == 0) { + /* For example, happens when FS can match + * an empty string (awk -F ' *'). Logically, + * this should split into one-char fields. 
+ * However, gawk 5.0.1 searches for first + * _non-empty_ separator string match: + */ + size_t ofs = 0; + do { + ofs++; + if (!s[ofs]) + return REG_NOMATCH; + regexec(preg, s + ofs, 1, pmatch, 0); + } while (pmatch[0].rm_eo == 0); + pmatch[0].rm_so += ofs; + pmatch[0].rm_eo += ofs; + } + return r; +} + static int awk_split(const char *s, node *spl, char **slist) { int n; @@ -1788,17 +1811,11 @@ static int awk_split(const char *s, node *spl, char **slist) regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... l = strcspn(s, c+2); /* len till next NUL or \n */ - if (regexec(icase ? spl->r.ire : spl->l.re, s, 1, pmatch, 0) == 0 + if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0 && pmatch[0].rm_so <= l ) { + /* if (pmatch[0].rm_eo == 0) ... - impossible */ l = pmatch[0].rm_so; - if (pmatch[0].rm_eo == 0) { - /* For example, happens when FS can match - * an empthy string (awk -F ' *') - */ - l++; - pmatch[0].rm_eo++; - } n++; /* we saw yet another delimiter */ } else { pmatch[0].rm_eo = l; diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 87f6b5007..06a531d96 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -398,5 +398,12 @@ testing 'awk do not allow "str"++' \ '' \ 'anything' +#gawk compat: FS regex matches only non-empty separators: +#with -*, the splitting is NOT f o o b a r, but foo bar: +testing 'awk FS regex which can match empty string' \ + "awk -F '-*' '{print \$1 \"-\" \$2 \"=\" \$3 \"*\" \$4}'" \ + "foo-bar=*\n" \ + '' \ + 'foo--bar' exit $FAILCOUNT -- 2.27.0 From 0e29eba7f36fac849a0be3431cc5829cdfc906e9 Mon Sep 17 00:00:00 2001 From: Ron Yorston Date: Wed, 27 Jan 2021 11:19:14 +0000 Subject: [PATCH 05/65] awk: allow printf('%c') to output NUL, closes 13486 Treat the output of printf as binary rather than a null-terminated string so that NUL characters can be output. This is considered to be a GNU extension, though it's also available in mawk and FreeBSD's awk. 
function old new delta evaluate 3487 3504 +17 awk_printf 504 519 +15 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/0 up/down: 32/0) Total: 32 bytes Signed-off-by: Ron Yorston Signed-off-by: Denys Vlasenko --- editors/awk.c | 18 +++++++++++++++--- testsuite/awk.tests | 5 +++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 2a3e29db1..a4b9898e2 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2155,7 +2155,10 @@ static int fmt_num(char *b, int size, const char *format, double n, int int_as_i } /* formatted output into an allocated buffer, return ptr to buffer */ -static char *awk_printf(node *n) +#if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS +# define awk_printf(a, b) awk_printf(a) +#endif +static char *awk_printf(node *n, int *len) { char *b = NULL; char *fmt, *s, *f; @@ -2209,6 +2212,10 @@ static char *awk_printf(node *n) nvfree(v); b = xrealloc(b, i + 1); b[i] = '\0'; +#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + if (len) + *len = i; +#endif return b; } @@ -2666,6 +2673,7 @@ static var *evaluate(node *op, var *res) case XC( OC_PRINT ): case XC( OC_PRINTF ): { FILE *F = stdout; + IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) if (op->r.n) { rstream *rsm = newfile(R.s); @@ -2703,8 +2711,12 @@ static var *evaluate(node *op, var *res) fputs(getvar_s(intvar[ORS]), F); } else { /* OC_PRINTF */ - char *s = awk_printf(op1); + char *s = awk_printf(op1, &len); +#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS + fwrite(s, len, 1, F); +#else fputs(s, F); +#endif free(s); } fflush(F); @@ -2978,7 +2990,7 @@ static var *evaluate(node *op, var *res) break; case XC( OC_SPRINTF ): - setvar_p(res, awk_printf(op1)); + setvar_p(res, awk_printf(op1, NULL)); break; case XC( OC_UNARY ): { diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 06a531d96..6489dc082 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -383,6 +383,11 @@ testing "awk errors on missing delete arg" \ "awk -e 
'{delete}' 2>&1" "awk: cmd. line:1: Too few arguments\n" "" "" SKIP= +optional FEATURE_AWK_GNU_EXTENSIONS +testing "awk printf('%c') can output NUL" \ + "awk '{printf(\"hello%c null\n\", 0)}'" "hello\0 null\n" "" "\n" +SKIP= + # testing "description" "command" "result" "infile" "stdin" testing 'awk negative field access' \ 'awk 2>&1 -- '\''{ $(-1) }'\' \ -- 2.27.0 From e39199e2b9fe6443eed4e676d159cdd9f84f8b43 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 16 Jun 2021 09:18:08 +0200 Subject: [PATCH 06/65] awk: fix use-after-free in "$BIGNUM1 $BIGGERNUM2" concat op Second reference to a field reallocs/moves Fields[] array, but first ref still tries to use the element where it was before move. function old new delta fsrealloc 94 106 +12 Signed-off-by: Denys Vlasenko --- editors/awk.c | 85 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a4b9898e2..4f36b4b90 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1745,12 +1745,22 @@ static char* qrealloc(char *b, int n, int *size) /* resize field storage space */ static void fsrealloc(int size) { - int i; + int i, newsize; if (size >= maxfields) { + /* Sanity cap, easier than catering for overflows */ + if (size > 0xffffff) + bb_die_memory_exhausted(); + i = maxfields; maxfields = size + 16; - Fields = xrealloc(Fields, maxfields * sizeof(Fields[0])); + + newsize = maxfields * sizeof(Fields[0]); + debug_printf_eval("fsrealloc: xrealloc(%p, %u)\n", Fields, newsize); + Fields = xrealloc(Fields, newsize); + debug_printf_eval("fsrealloc: Fields=%p..%p\n", Fields, (char*)Fields + newsize - 1); + /* ^^^ did Fields[] move? 
debug aid for L.v getting "upstaged" by R.v in evaluate() */ + for (; i < maxfields; i++) { Fields[i].type = VF_SPECIAL; Fields[i].string = NULL; @@ -2614,20 +2624,30 @@ static var *evaluate(node *op, var *res) /* execute inevitable things */ if (opinfo & OF_RES1) L.v = evaluate(op1, v1); - if (opinfo & OF_RES2) - R.v = evaluate(op->r.n, v1+1); if (opinfo & OF_STR1) { L.s = getvar_s(L.v); debug_printf_eval("L.s:'%s'\n", L.s); } - if (opinfo & OF_STR2) { - R.s = getvar_s(R.v); - debug_printf_eval("R.s:'%s'\n", R.s); - } if (opinfo & OF_NUM1) { L_d = getvar_i(L.v); debug_printf_eval("L_d:%f\n", L_d); } + /* NB: Must get string/numeric values of L (done above) + * _before_ evaluate()'ing R.v: if both L and R are $NNNs, + * and right one is large, then L.v points to Fields[NNN1], + * second evaluate() reallocates and moves (!) Fields[], + * R.v points to Fields[NNN2] but L.v now points to freed mem! + * (Seen trying to evaluate "$444 $44444") + */ + if (opinfo & OF_RES2) { + R.v = evaluate(op->r.n, v1+1); + //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? + //L.v = NULL; + } + if (opinfo & OF_STR2) { + R.s = getvar_s(R.v); + debug_printf_eval("R.s:'%s'\n", R.s); + } debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK)); switch (XC(opinfo & OPCLSMASK)) { @@ -2636,6 +2656,7 @@ static var *evaluate(node *op, var *res) /* test pattern */ case XC( OC_TEST ): + debug_printf_eval("TEST\n"); if ((op1->info & OPCLSMASK) == OC_COMMA) { /* it's range pattern */ if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) { @@ -2653,25 +2674,32 @@ static var *evaluate(node *op, var *res) /* just evaluate an expression, also used as unconditional jump */ case XC( OC_EXEC ): + debug_printf_eval("EXEC\n"); break; /* branch, used in if-else and various loops */ case XC( OC_BR ): + debug_printf_eval("BR\n"); op = istrue(L.v) ? 
op->a.n : op->r.n; break; /* initialize for-in loop */ case XC( OC_WALKINIT ): + debug_printf_eval("WALKINIT\n"); hashwalk_init(L.v, iamarray(R.v)); break; /* get next array item */ case XC( OC_WALKNEXT ): + debug_printf_eval("WALKNEXT\n"); op = hashwalk_next(L.v) ? op->a.n : op->r.n; break; case XC( OC_PRINT ): - case XC( OC_PRINTF ): { + debug_printf_eval("PRINT /\n"); + case XC( OC_PRINTF ): + debug_printf_eval("PRINTF\n"); + { FILE *F = stdout; IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) @@ -2726,22 +2754,28 @@ static var *evaluate(node *op, var *res) /* case XC( OC_DELETE ): - moved to happen before arg evaluation */ case XC( OC_NEWSOURCE ): + debug_printf_eval("NEWSOURCE\n"); g_progname = op->l.new_progname; break; case XC( OC_RETURN ): + debug_printf_eval("RETURN\n"); copyvar(res, L.v); break; case XC( OC_NEXTFILE ): + debug_printf_eval("NEXTFILE\n"); nextfile = TRUE; case XC( OC_NEXT ): + debug_printf_eval("NEXT\n"); nextrec = TRUE; case XC( OC_DONE ): + debug_printf_eval("DONE\n"); clrvar(res); break; case XC( OC_EXIT ): + debug_printf_eval("EXIT\n"); awk_exit(L_d); /* -- recursive node type -- */ @@ -2761,15 +2795,18 @@ static var *evaluate(node *op, var *res) break; case XC( OC_IN ): + debug_printf_eval("IN\n"); setvar_i(res, hash_search(iamarray(R.v), L.s) ? 1 : 0); break; case XC( OC_REGEXP ): + debug_printf_eval("REGEXP\n"); op1 = op; L.s = getvar_s(intvar[F0]); goto re_cont; case XC( OC_MATCH ): + debug_printf_eval("MATCH\n"); op1 = op->r.n; re_cont: { @@ -2795,6 +2832,7 @@ static var *evaluate(node *op, var *res) break; case XC( OC_TERNARY ): + debug_printf_eval("TERNARY\n"); if ((op->r.n->info & OPCLSMASK) != OC_COLON) syntax_error(EMSG_POSSIBLE_ERROR); res = evaluate(istrue(L.v) ? 
op->r.n->l.n : op->r.n->r.n, res); @@ -2803,6 +2841,7 @@ static var *evaluate(node *op, var *res) case XC( OC_FUNC ): { var *vbeg, *v; const char *sv_progname; + debug_printf_eval("FUNC\n"); /* The body might be empty, still has to eval the args */ if (!op->r.n->info && !op->r.f->body.first) @@ -2832,7 +2871,10 @@ static var *evaluate(node *op, var *res) } case XC( OC_GETLINE ): - case XC( OC_PGETLINE ): { + debug_printf_eval("GETLINE /\n"); + case XC( OC_PGETLINE ): + debug_printf_eval("PGETLINE\n"); + { rstream *rsm; int i; @@ -2873,6 +2915,7 @@ static var *evaluate(node *op, var *res) /* simple builtins */ case XC( OC_FBLTIN ): { double R_d = R_d; /* for compiler */ + debug_printf_eval("FBLTIN\n"); switch (opn) { case F_in: @@ -2986,14 +3029,18 @@ static var *evaluate(node *op, var *res) } case XC( OC_BUILTIN ): + debug_printf_eval("BUILTIN\n"); res = exec_builtin(op, res); break; case XC( OC_SPRINTF ): + debug_printf_eval("SPRINTF\n"); setvar_p(res, awk_printf(op1, NULL)); break; - case XC( OC_UNARY ): { + case XC( OC_UNARY ): + debug_printf_eval("UNARY\n"); + { double Ld, R_d; Ld = R_d = getvar_i(R.v); @@ -3023,7 +3070,9 @@ static var *evaluate(node *op, var *res) break; } - case XC( OC_FIELD ): { + case XC( OC_FIELD ): + debug_printf_eval("FIELD\n"); + { int i = (int)getvar_i(R.v); if (i < 0) syntax_error(EMSG_NEGATIVE_FIELD); @@ -3040,8 +3089,10 @@ static var *evaluate(node *op, var *res) /* concatenation (" ") and index joining (",") */ case XC( OC_CONCAT ): + debug_printf_eval("CONCAT /\n"); case XC( OC_COMMA ): { const char *sep = ""; + debug_printf_eval("COMMA\n"); if ((opinfo & OPCLSMASK) == OC_COMMA) sep = getvar_s(intvar[SUBSEP]); setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s)); @@ -3049,17 +3100,22 @@ static var *evaluate(node *op, var *res) } case XC( OC_LAND ): + debug_printf_eval("LAND\n"); setvar_i(res, istrue(L.v) ? ptest(op->r.n) : 0); break; case XC( OC_LOR ): + debug_printf_eval("LOR\n"); setvar_i(res, istrue(L.v) ? 
1 : ptest(op->r.n)); break; case XC( OC_BINARY ): - case XC( OC_REPLACE ): { + debug_printf_eval("BINARY /\n"); + case XC( OC_REPLACE ): + debug_printf_eval("REPLACE\n"); + { double R_d = getvar_i(R.v); - debug_printf_eval("BINARY/REPLACE: R_d:%f opn:%c\n", R_d, opn); + debug_printf_eval("R_d:%f opn:%c\n", R_d, opn); switch (opn) { case '+': L_d += R_d; @@ -3095,6 +3151,7 @@ static var *evaluate(node *op, var *res) case XC( OC_COMPARE ): { int i = i; /* for compiler */ double Ld; + debug_printf_eval("COMPARE\n"); if (is_numeric(L.v) && is_numeric(R.v)) { Ld = getvar_i(L.v) - getvar_i(R.v); -- 2.27.0 From facddf1c2e7675a23fae484f62427266c9ffc0bb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Jun 2021 16:35:27 +0200 Subject: [PATCH 07/65] awk: after preinc/dec, only allow variable, field ref, array ref, or another preinc/dec Accepting nonsense like "--4", and even "-- -4" is confusing. function old new delta parse_expr 917 938 +21 Signed-off-by: Denys Vlasenko --- editors/awk.c | 87 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 18 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 4f36b4b90..aec883038 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -66,6 +66,8 @@ #endif #ifndef debug_printf_parse # define debug_printf_parse(...) (fprintf(stderr, __VA_ARGS__)) +#else +# define debug_parse_print_tc(...) 
((void)0) #endif @@ -210,13 +212,13 @@ typedef struct tsplitter_s { #define TC_SEQTERM (1 << 1) /* ) */ #define TC_REGEXP (1 << 2) /* /.../ */ #define TC_OUTRDR (1 << 3) /* | > >> */ -#define TC_UOPPOST (1 << 4) /* unary postfix operator */ -#define TC_UOPPRE1 (1 << 5) /* unary prefix operator */ +#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ +#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ #define TC_BINOPX (1 << 6) /* two-opnd operator */ #define TC_IN (1 << 7) #define TC_COMMA (1 << 8) #define TC_PIPE (1 << 9) /* input redirection pipe */ -#define TC_UOPPRE2 (1 << 10) /* unary prefix operator */ +#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ #define TC_ARRTERM (1 << 11) /* ] */ #define TC_GRPSTART (1 << 12) /* { */ #define TC_GRPTERM (1 << 13) /* } */ @@ -243,14 +245,51 @@ typedef struct tsplitter_s { #define TC_STRING (1 << 29) #define TC_NUMBER (1 << 30) -#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) +#ifndef debug_parse_print_tc +#define debug_parse_print_tc(n) do { \ +if ((n) & TC_SEQSTART) debug_printf_parse(" SEQSTART"); \ +if ((n) & TC_SEQTERM ) debug_printf_parse(" SEQTERM" ); \ +if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ +if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ +if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ +if ((n) & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); \ +if ((n) & TC_BINOPX ) debug_printf_parse(" BINOPX" ); \ +if ((n) & TC_IN ) debug_printf_parse(" IN" ); \ +if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ +if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ +if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ +if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ +if ((n) & TC_GRPSTART) debug_printf_parse(" GRPSTART"); \ +if ((n) & TC_GRPTERM ) debug_printf_parse(" GRPTERM" ); \ +if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ +if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ +if ((n) & TC_STATX ) debug_printf_parse(" 
STATX" ); \ +if ((n) & TC_WHILE ) debug_printf_parse(" WHILE" ); \ +if ((n) & TC_ELSE ) debug_printf_parse(" ELSE" ); \ +if ((n) & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); \ +if ((n) & TC_LENGTH ) debug_printf_parse(" LENGTH" ); \ +if ((n) & TC_GETLINE ) debug_printf_parse(" GETLINE" ); \ +if ((n) & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); \ +if ((n) & TC_BEGIN ) debug_printf_parse(" BEGIN" ); \ +if ((n) & TC_END ) debug_printf_parse(" END" ); \ +if ((n) & TC_EOF ) debug_printf_parse(" EOF" ); \ +if ((n) & TC_VARIABLE) debug_printf_parse(" VARIABLE"); \ +if ((n) & TC_ARRAY ) debug_printf_parse(" ARRAY" ); \ +if ((n) & TC_FUNCTION) debug_printf_parse(" FUNCTION"); \ +if ((n) & TC_STRING ) debug_printf_parse(" STRING" ); \ +if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ +} while (0) +#endif /* combined token classes */ +#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) + #define TC_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) //#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST) #define TC_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ | TC_SEQSTART | TC_STRING | TC_NUMBER) +#define TC_LVALUE (TC_VARIABLE | TC_ARRAY) #define TC_STATEMNT (TC_STATX | TC_WHILE) #define TC_OPTERM (TC_SEMICOL | TC_NEWLINE) @@ -284,7 +323,6 @@ typedef struct tsplitter_s { #define OF_CHECKED 0x200000 #define OF_REQUIRED 0x400000 - /* combined operator flags */ #define xx 0 #define xV OF_RES2 @@ -313,10 +351,8 @@ typedef struct tsplitter_s { #define PRIMASK2 0x7E000000 /* Operation classes */ - #define SHIFT_TIL_THIS 0x0600 #define RECUR_FROM_THIS 0x1000 - enum { OC_DELETE = 0x0100, OC_EXEC = 0x0200, OC_NEWSOURCE = 0x0300, OC_PRINT = 0x0400, OC_PRINTF = 0x0500, OC_WALKINIT = 0x0600, @@ -411,7 +447,9 @@ static const uint32_t tokeninfo[] = { OC_REGEXP, xS|'a', xS|'w', xS|'|', OC_UNARY|xV|P(9)|'p', OC_UNARY|xV|P(9)|'m', - OC_UNARY|xV|P(9)|'P', OC_UNARY|xV|P(9)|'M', OC_FIELD|xV|P(5), +#define TI_PREINC (OC_UNARY|xV|P(9)|'P') +#define TI_PREDEC 
(OC_UNARY|xV|P(9)|'M') + TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5), OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(74), OC_REPLACE|NV|P(74)|'+', OC_REPLACE|NV|P(74)|'-', OC_REPLACE|NV|P(74)|'*', OC_REPLACE|NV|P(74)|'/', OC_REPLACE|NV|P(74)|'%', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', @@ -1070,6 +1108,10 @@ static uint32_t next_token(uint32_t expected) uint32_t tc; const uint32_t *ti; + debug_printf_parse("%s() expected(%x):", __func__, expected); + debug_parse_print_tc(expected); + debug_printf_parse("\n"); + if (t_rollback) { debug_printf_parse("%s: using rolled-back token\n", __func__); t_rollback = FALSE; @@ -1226,7 +1268,9 @@ static uint32_t next_token(uint32_t expected) EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); } - debug_printf_parse("%s: returning, ltclass:%x t_double:%f\n", __func__, ltclass, t_double); + debug_printf_parse("%s: returning, t_double:%f ltclass:", __func__, t_double); + debug_parse_print_tc(ltclass); + debug_printf_parse("\n"); return ltclass; #undef concat_inserted #undef save_tclass @@ -1266,7 +1310,7 @@ static node *condition(void) /* parse expression terminated by given argument, return ptr * to built subtree. 
Terminator is eaten by parse_expr */ -static node *parse_expr(uint32_t iexp) +static node *parse_expr(uint32_t term_tc) { node sn; node *cn = &sn; @@ -1274,13 +1318,15 @@ static node *parse_expr(uint32_t iexp) uint32_t tc, xtc; var *v; - debug_printf_parse("%s(%x)\n", __func__, iexp); + debug_printf_parse("%s() term_tc(%x):", __func__, term_tc); + debug_parse_print_tc(term_tc); + debug_printf_parse("\n"); sn.info = PRIMASK; sn.r.n = sn.a.n = glptr = NULL; - xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | iexp; + xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | term_tc; - while (!((tc = next_token(xtc)) & iexp)) { + while (!((tc = next_token(xtc)) & term_tc)) { if (glptr && (t_info == (OC_COMPARE | VV | P(39) | 2))) { /* input redirection (<) attached to glptr node */ @@ -1313,25 +1359,28 @@ static node *parse_expr(uint32_t iexp) next_token(TC_GETLINE); /* give maximum priority to this pipe */ cn->info &= ~PRIMASK; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; + xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; } } else { cn->r.n = vn; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; + xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; } vn->a.n = cn; } else { - debug_printf_parse("%s: other\n", __func__); + debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); /* for operands and prefix-unary operators, attach them * to last node */ vn = cn; cn = vn->r.n = new_node(t_info); cn->a.n = vn; + xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; + if (t_info == TI_PREINC || t_info == TI_PREDEC) + xtc = TC_LVALUE | TC_UOPPRE1; if (tc & (TC_OPERAND | TC_REGEXP)) { debug_printf_parse("%s: TC_OPERAND | TC_REGEXP\n", __func__); - xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | iexp; + xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | term_tc; /* one should be very careful with switch on tclass - * only simple tclasses should be used! 
*/ switch (tc) { @@ -1388,7 +1437,7 @@ static node *parse_expr(uint32_t iexp) case TC_GETLINE: debug_printf_parse("%s: TC_GETLINE\n", __func__); glptr = cn; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp; + xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; break; case TC_BUILTIN: @@ -1603,6 +1652,8 @@ static void parse_program(char *p) func *f; var *v; + debug_printf_parse("%s()\n", __func__); + g_pos = p; t_lineno = 1; while ((tclass = next_token(TC_EOF | TC_OPSEQ | TC_GRPSTART | -- 2.27.0 From fbab842d8fcfcb87bede0c3f16fe97641cef0aaf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 20 Jun 2021 22:52:29 +0200 Subject: [PATCH 08/65] qwk: make code clearer, no actual code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index aec883038..25f2c4810 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -455,7 +455,8 @@ static const uint32_t tokeninfo[] = { OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, - OC_COMPARE|VV|P(39)|2, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), +#define TI_LESS (OC_COMPARE|VV|P(39)|2) + TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), OC_LOR|Vx|P(59), OC_TERNARY|Vx|P(64)|'?', OC_COLON|xx|P(67)|':', OC_IN|SV|P(49), /* TC_IN */ OC_COMMA|SS|P(80), @@ -1328,7 +1329,7 @@ static node *parse_expr(uint32_t term_tc) while (!((tc = next_token(xtc)) & term_tc)) { - if (glptr && (t_info == (OC_COMPARE | VV | P(39) | 2))) { + if (glptr && (t_info == TI_LESS)) { /* input redirection (<) attached to glptr node */ debug_printf_parse("%s: input redir\n", __func__); cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37)); -- 2.27.0 From 0cb86ebf5d44fc92660c4cfd11fca53d6a03e9bf 
Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 25 Jun 2021 19:38:27 +0200 Subject: [PATCH 09/65] awk: more efficient -f FILE, document what "some trick in next_token" is function old new delta awk_main 890 898 +8 Signed-off-by: Denys Vlasenko --- editors/awk.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 25f2c4810..d12645b92 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1217,6 +1217,8 @@ static uint32_t next_token(uint32_t expected) if (!isalnum_(*p)) syntax_error(EMSG_UNEXP_TOKEN); /* no */ /* yes */ +/* "move name one char back" trick: we need a byte for NUL terminator */ +/* NB: this results in argv[i][-1] being used (!!!) in e.g. "awk -e 'NAME'" case */ t_string = --p; while (isalnum_(*++p)) { p[-1] = *p; @@ -3345,7 +3347,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS llist_t *list_e = NULL; #endif - int i, j; + int i; var *v; var tv; char **envp; @@ -3417,30 +3419,43 @@ int awk_main(int argc UNUSED_PARAM, char **argv) bb_show_usage(); } while (list_f) { - char *s = NULL; - FILE *from_file; + int fd; + char *s; g_progname = llist_pop(&list_f); - from_file = xfopen_stdin(g_progname); - /* one byte is reserved for some trick in next_token */ - for (i = j = 1; j > 0; i += j) { - s = xrealloc(s, i + 4096); - j = fread(s + i, 1, 4094, from_file); + fd = xopen_stdin(g_progname); + /* 1st byte is reserved for "move name one char back" trick in next_token */ + i = 1; + s = NULL; + for (;;) { + int sz; + s = xrealloc(s, i + 1000); + sz = safe_read(fd, s + i, 1000); + if (sz <= 0) + break; + i += sz; } + s = xrealloc(s, i + 1); /* trim unused 999 bytes */ s[i] = '\0'; - fclose(from_file); + close(fd); parse_program(s + 1); free(s); } g_progname = "cmd. line"; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS while (list_e) { + /* NB: "move name one char back" trick in next_token + * can use argv[i][-1] here. 
+ */ parse_program(llist_pop(&list_e)); } #endif if (!(opt & (OPT_f | OPT_e))) { if (!*argv) bb_show_usage(); + /* NB: "move name one char back" trick in next_token + * can use argv[i][-1] here. + */ parse_program(*argv++); } -- 2.27.0 From d6c14489cea2ead6ea4ec02bd1650b2d4da1ebcd Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 25 Jun 2021 19:41:05 +0200 Subject: [PATCH 10/65] awk: move locals deeper into scopes where they are used, no logic changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 62 ++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index d12645b92..a9207df21 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -3254,20 +3254,19 @@ static var *evaluate(node *op, var *res) static int awk_exit(int r) { - var tv; unsigned i; - hash_item *hi; - - zero_out_var(&tv); if (!exiting) { + var tv; exiting = TRUE; nextrec = FALSE; + zero_out_var(&tv); evaluate(endseq.first, &tv); } /* waiting for children */ for (i = 0; i < fdhash->csize; i++) { + hash_item *hi; hi = fdhash->items[i]; while (hi) { if (hi->data.rs.F && hi->data.rs.is_pipe) @@ -3348,11 +3347,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) llist_t *list_e = NULL; #endif int i; - var *v; var tv; - char **envp; - char *vnames = (char *)vNames; /* cheat */ - char *vvalues = (char *)vValues; INIT_G(); @@ -3361,8 +3356,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_LOCALE_SUPPORT) setlocale(LC_NUMERIC, "C"); - zero_out_var(&tv); - /* allocate global buffer */ g_buf = xmalloc(MAXVARFMT + 1); @@ -3372,16 +3365,21 @@ int awk_main(int argc UNUSED_PARAM, char **argv) fnhash = hash_init(); /* initialize variables */ - for (i = 0; *vnames; i++) { - intvar[i] = v = newvar(nextword(&vnames)); - if (*vvalues != '\377') - setvar_s(v, nextword(&vvalues)); - else - setvar_i(v, 0); - - if (*vnames == '*') { - v->type |= VF_SPECIAL; - vnames++; + { + char *vnames = (char *)vNames; /* cheat */ 
+ char *vvalues = (char *)vValues; + for (i = 0; *vnames; i++) { + var *v; + intvar[i] = v = newvar(nextword(&vnames)); + if (*vvalues != '\377') + setvar_s(v, nextword(&vvalues)); + else + setvar_i(v, 0); + + if (*vnames == '*') { + v->type |= VF_SPECIAL; + vnames++; + } } } @@ -3393,16 +3391,19 @@ int awk_main(int argc UNUSED_PARAM, char **argv) newfile("/dev/stderr")->F = stderr; /* Huh, people report that sometimes environ is NULL. Oh well. */ - if (environ) for (envp = environ; *envp; envp++) { - /* environ is writable, thus we don't strdup it needlessly */ - char *s = *envp; - char *s1 = strchr(s, '='); - if (s1) { - *s1 = '\0'; - /* Both findvar and setvar_u take const char* - * as 2nd arg -> environment is not trashed */ - setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1); - *s1 = '='; + if (environ) { + char **envp; + for (envp = environ; *envp; envp++) { + /* environ is writable, thus we don't strdup it needlessly */ + char *s = *envp; + char *s1 = strchr(s, '='); + if (s1) { + *s1 = '\0'; + /* Both findvar and setvar_u take const char* + * as 2nd arg -> environment is not trashed */ + setvar_u(findvar(iamarray(intvar[ENVIRON]), s), s1 + 1); + *s1 = '='; + } } } opt = getopt32(argv, OPTSTR_AWK, &opt_F, &list_v, &list_f, IF_FEATURE_AWK_GNU_EXTENSIONS(&list_e,) NULL); @@ -3466,6 +3467,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) setari_u(intvar[ARGV], ++i, *argv++); setvar_i(intvar[ARGC], i + 1); + zero_out_var(&tv); evaluate(beginseq.first, &tv); if (!mainseq.first && !endseq.first) awk_exit(EXIT_SUCCESS); -- 2.27.0 From 8ed2c6b6414bfcd8eb0a02424e74a4abcc74570e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 01:03:42 +0200 Subject: [PATCH 11/65] awk: remove redundant check function old new delta next_token 785 784 -1 parse_program 337 328 -9 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-10) Total: -10 bytes Signed-off-by: Denys Vlasenko --- 
editors/awk.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a9207df21..1ccc7bd98 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1093,8 +1093,9 @@ static void nvfree(var *v) /* ------- awk program text parsing ------- */ -/* Parse next token pointed by global pos, place results into global ttt. - * If token isn't expected, give away. Return token class +/* Parse next token pointed by global pos, place results into global t_XYZ variables. + * If token isn't expected, print error message and die. + * Return token class (also store it in t_tclass). */ static uint32_t next_token(uint32_t expected) { @@ -1248,33 +1249,35 @@ static uint32_t next_token(uint32_t expected) goto readnext; /* insert concatenation operator when needed */ - debug_printf_parse("%s: %x %x %x concat_inserted?\n", __func__, - (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP)); + debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, + (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), + !(ltclass == TC_LENGTH && tc == TC_SEQSTART)); if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) && !(ltclass == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ ) { concat_inserted = TRUE; save_tclass = tc; save_info = t_info; - tc = TC_BINOP; + tc = TC_BINOPX; t_info = OC_CONCAT | SS | P(35); } - debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, t_tclass); t_tclass = tc; + debug_printf_parse("%s: t_tclass=tc=%x\n", __func__, tc); } - ltclass = t_tclass; - /* Are we ready for this? */ - if (!(ltclass & expected)) { + if (!(t_tclass & expected)) { syntax_error((ltclass & (TC_NEWLINE | TC_EOF)) ? 
EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); } - debug_printf_parse("%s: returning, t_double:%f ltclass:", __func__, t_double); - debug_parse_print_tc(ltclass); + debug_printf_parse("%s: returning, t_double:%f t_tclass:", __func__, t_double); + debug_parse_print_tc(t_tclass); debug_printf_parse("\n"); - return ltclass; + + ltclass = t_tclass; + + return t_tclass; #undef concat_inserted #undef save_tclass #undef save_info @@ -1700,8 +1703,9 @@ static void parse_program(char *p) /* Arg followed either by end of arg list or 1 comma */ if (next_token(TC_COMMA | TC_SEQTERM) & TC_SEQTERM) break; - if (t_tclass != TC_COMMA) - syntax_error(EMSG_UNEXP_TOKEN); +//Impossible: next_token() above would error out and die +// if (t_tclass != TC_COMMA) +// syntax_error(EMSG_UNEXP_TOKEN); } seq = &f->body; chain_group(); -- 2.27.0 From edce3d162d9d0f83c8a1a76c3130619903cc7404 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 01:09:08 +0200 Subject: [PATCH 12/65] awk: make ltclass ("last token class") local to next_token() function old new delta next_token 784 790 +6 next_input_file 219 216 -3 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 6/-3) Total: 3 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 1ccc7bd98..f3ca8aa5f 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -556,7 +556,6 @@ struct globals2 { uint32_t next_token__save_tclass; uint32_t next_token__save_info; - uint32_t next_token__ltclass; smallint next_token__concat_inserted; smallint next_input_file__files_happen; @@ -615,7 +614,7 @@ struct globals2 { #define rsplitter (G.rsplitter ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ - G.next_token__ltclass = TC_OPTERM; \ + t_tclass = TC_OPTERM; \ G.evaluate__seed = 1; \ } while (0) @@ -1102,13 +1101,13 @@ 
static uint32_t next_token(uint32_t expected) #define concat_inserted (G.next_token__concat_inserted) #define save_tclass (G.next_token__save_tclass) #define save_info (G.next_token__save_info) -/* Initialized to TC_OPTERM: */ -#define ltclass (G.next_token__ltclass) char *p, *s; const char *tl; - uint32_t tc; const uint32_t *ti; + uint32_t tc, last_token_class; + + last_token_class = t_tclass; /* t_tclass is initialized to TC_OPTERM */ debug_printf_parse("%s() expected(%x):", __func__, expected); debug_parse_print_tc(expected); @@ -1245,15 +1244,15 @@ static uint32_t next_token(uint32_t expected) g_pos = p; /* skipping newlines in some cases */ - if ((ltclass & TC_NOTERM) && (tc & TC_NEWLINE)) + if ((last_token_class & TC_NOTERM) && (tc & TC_NEWLINE)) goto readnext; /* insert concatenation operator when needed */ debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, - (ltclass & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), - !(ltclass == TC_LENGTH && tc == TC_SEQSTART)); - if ((ltclass & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) - && !(ltclass == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ + (last_token_class & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), + !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); + if ((last_token_class & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) + && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ ) { concat_inserted = TRUE; save_tclass = tc; @@ -1267,7 +1266,7 @@ static uint32_t next_token(uint32_t expected) } /* Are we ready for this? */ if (!(t_tclass & expected)) { - syntax_error((ltclass & (TC_NEWLINE | TC_EOF)) ? + syntax_error((last_token_class & (TC_NEWLINE | TC_EOF)) ? 
EMSG_UNEXP_EOS : EMSG_UNEXP_TOKEN); } @@ -1275,13 +1274,10 @@ static uint32_t next_token(uint32_t expected) debug_parse_print_tc(t_tclass); debug_printf_parse("\n"); - ltclass = t_tclass; - return t_tclass; #undef concat_inserted #undef save_tclass #undef save_info -#undef ltclass } static void rollback_token(void) -- 2.27.0 From ccd57ce9bacb02dd06286d7aa2ef4ca23fbcaa21 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 01:23:37 +0200 Subject: [PATCH 13/65] awk: use TS_foo for combined token classes. No code changes Confusion with "simple" classes was the cause of a bug fixed by previous commit Signed-off-by: Denys Vlasenko --- editors/awk.c | 128 +++++++++++++++++++++++++------------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index f3ca8aa5f..6f79dd138 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -281,39 +281,39 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ } while (0) #endif -/* combined token classes */ -#define TC_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) +/* combined token classes ("token [class] sets") */ +#define TS_UOPPRE (TC_UOPPRE1 | TC_UOPPRE2) -#define TC_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) -//#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST) -#define TC_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ - | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ - | TC_SEQSTART | TC_STRING | TC_NUMBER) -#define TC_LVALUE (TC_VARIABLE | TC_ARRAY) +#define TS_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN) +//#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST) +#define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ + | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ + | TC_SEQSTART | TC_STRING | TC_NUMBER) -#define TC_STATEMNT (TC_STATX | TC_WHILE) -#define TC_OPTERM (TC_SEMICOL | TC_NEWLINE) +#define TS_LVALUE (TC_VARIABLE | TC_ARRAY) +#define TS_STATEMNT (TC_STATX | TC_WHILE) +#define TS_OPTERM (TC_SEMICOL | TC_NEWLINE) /* word tokens, cannot mean something else if not expected */ -#define 
TC_WORD (TC_IN | TC_STATEMNT | TC_ELSE \ - | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ - | TC_FUNCDECL | TC_BEGIN | TC_END) +#define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \ + | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ + | TC_FUNCDECL | TC_BEGIN | TC_END) /* discard newlines after these */ -#define TC_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ - | TC_BINOP | TC_OPTERM) +#define TS_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ + | TS_BINOP | TS_OPTERM) /* what can expression begin with */ -#define TC_OPSEQ (TC_OPERAND | TC_UOPPRE | TC_REGEXP) +#define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) /* what can group begin with */ -#define TC_GRPSEQ (TC_OPSEQ | TC_OPTERM | TC_STATEMNT | TC_GRPSTART) +#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_GRPSTART) -/* if previous token class is CONCAT1 and next is CONCAT2, concatenation */ +/* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ /* operator is inserted between them */ -#define TC_CONCAT1 (TC_VARIABLE | TC_ARRTERM | TC_SEQTERM \ +#define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_SEQTERM \ | TC_STRING | TC_NUMBER | TC_UOPPOST \ | TC_LENGTH) -#define TC_CONCAT2 (TC_OPERAND | TC_UOPPRE) +#define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE) #define OF_RES1 0x010000 #define OF_RES2 0x020000 @@ -614,7 +614,7 @@ struct globals2 { #define rsplitter (G.rsplitter ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ - t_tclass = TC_OPTERM; \ + t_tclass = TS_OPTERM; \ G.evaluate__seed = 1; \ } while (0) @@ -1107,7 +1107,7 @@ static uint32_t next_token(uint32_t expected) const uint32_t *ti; uint32_t tc, last_token_class; - last_token_class = t_tclass; /* t_tclass is initialized to TC_OPTERM */ + last_token_class = t_tclass; /* t_tclass is initialized to TS_OPTERM */ debug_printf_parse("%s() expected(%x):", __func__, expected); debug_parse_print_tc(expected); @@ -1198,9 +1198,9 @@ static uint32_t next_token(uint32_t expected) * token matches, * and it's not 
a longer word, */ - if ((tc & (expected | TC_WORD | TC_NEWLINE)) + if ((tc & (expected | TS_WORD | TC_NEWLINE)) && strncmp(p, tl, l) == 0 - && !((tc & TC_WORD) && isalnum_(p[l])) + && !((tc & TS_WORD) && isalnum_(p[l])) ) { /* then this is what we are looking for */ t_info = *ti; @@ -1244,14 +1244,14 @@ static uint32_t next_token(uint32_t expected) g_pos = p; /* skipping newlines in some cases */ - if ((last_token_class & TC_NOTERM) && (tc & TC_NEWLINE)) + if ((last_token_class & TS_NOTERM) && (tc & TC_NEWLINE)) goto readnext; /* insert concatenation operator when needed */ debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, - (last_token_class & TC_CONCAT1), (tc & TC_CONCAT2), (expected & TC_BINOP), + (last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP), !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); - if ((last_token_class & TC_CONCAT1) && (tc & TC_CONCAT2) && (expected & TC_BINOP) + if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP) && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." 
*/ ) { concat_inserted = TRUE; @@ -1317,7 +1317,7 @@ static node *parse_expr(uint32_t term_tc) node sn; node *cn = &sn; node *vn, *glptr; - uint32_t tc, xtc; + uint32_t tc, expected_tc; var *v; debug_printf_parse("%s() term_tc(%x):", __func__, term_tc); @@ -1326,20 +1326,20 @@ static node *parse_expr(uint32_t term_tc) sn.info = PRIMASK; sn.r.n = sn.a.n = glptr = NULL; - xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | term_tc; + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc; - while (!((tc = next_token(xtc)) & term_tc)) { + while (!((tc = next_token(expected_tc)) & term_tc)) { if (glptr && (t_info == TI_LESS)) { /* input redirection (<) attached to glptr node */ debug_printf_parse("%s: input redir\n", __func__); cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37)); cn->a.n = glptr; - xtc = TC_OPERAND | TC_UOPPRE; + expected_tc = TS_OPERAND | TS_UOPPRE; glptr = NULL; - } else if (tc & (TC_BINOP | TC_UOPPOST)) { - debug_printf_parse("%s: TC_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); + } else if (tc & (TS_BINOP | TC_UOPPOST)) { + debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); /* for binary and postfix-unary operators, jump back over * previous operators with higher priority */ vn = cn; @@ -1353,19 +1353,19 @@ static node *parse_expr(uint32_t term_tc) t_info += P(6); cn = vn->a.n->r.n = new_node(t_info); cn->a.n = vn->a.n; - if (tc & TC_BINOP) { + if (tc & TS_BINOP) { cn->l.n = vn; - xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; if ((t_info & OPCLSMASK) == OC_PGETLINE) { /* it's a pipe */ next_token(TC_GETLINE); /* give maximum priority to this pipe */ cn->info &= ~PRIMASK; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; } } else { cn->r.n = vn; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; } vn->a.n = cn; @@ -1377,14 +1377,14 @@ static node 
*parse_expr(uint32_t term_tc) cn = vn->r.n = new_node(t_info); cn->a.n = vn; - xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP; + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; if (t_info == TI_PREINC || t_info == TI_PREDEC) - xtc = TC_LVALUE | TC_UOPPRE1; - if (tc & (TC_OPERAND | TC_REGEXP)) { - debug_printf_parse("%s: TC_OPERAND | TC_REGEXP\n", __func__); - xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | term_tc; + expected_tc = TS_LVALUE | TC_UOPPRE1; + if (tc & (TS_OPERAND | TC_REGEXP)) { + debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); + expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; /* one should be very careful with switch on tclass - - * only simple tclasses should be used! */ + * only simple tclasses should be used (TC_xyz, not TS_xyz) */ switch (tc) { case TC_VARIABLE: case TC_ARRAY: @@ -1412,7 +1412,7 @@ static node *parse_expr(uint32_t term_tc) setvar_i(v, t_double); else { setvar_s(v, t_string); - xtc &= ~TC_UOPPOST; /* "str"++ is not allowed */ + expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ } break; @@ -1439,7 +1439,7 @@ static node *parse_expr(uint32_t term_tc) case TC_GETLINE: debug_printf_parse("%s: TC_GETLINE\n", __func__); glptr = cn; - xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | term_tc; + expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; break; case TC_BUILTIN: @@ -1450,7 +1450,7 @@ static node *parse_expr(uint32_t term_tc) case TC_LENGTH: debug_printf_parse("%s: TC_LENGTH\n", __func__); next_token(TC_SEQSTART /* length(...) 
*/ - | TC_OPTERM /* length; (or newline)*/ + | TS_OPTERM /* length; (or newline)*/ | TC_GRPTERM /* length } */ | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ @@ -1464,7 +1464,7 @@ static node *parse_expr(uint32_t term_tc) } } } - } + } /* while() */ debug_printf_parse("%s() returns %p\n", __func__, sn.r.n); return sn.r.n; @@ -1497,7 +1497,7 @@ static void chain_expr(uint32_t info) n = chain_node(info); - n->l.n = parse_expr(TC_OPTERM | TC_GRPTERM); + n->l.n = parse_expr(TS_OPTERM | TC_GRPTERM); if ((info & OF_REQUIRED) && !n->l.n) syntax_error(EMSG_TOO_FEW_ARGS); @@ -1535,12 +1535,12 @@ static void chain_group(void) node *n, *n2, *n3; do { - c = next_token(TC_GRPSEQ); + c = next_token(TS_GRPSEQ); } while (c & TC_NEWLINE); if (c & TC_GRPSTART) { debug_printf_parse("%s: TC_GRPSTART\n", __func__); - while (next_token(TC_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { + while (next_token(TS_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { debug_printf_parse("%s: !TC_GRPTERM\n", __func__); if (t_tclass & TC_NEWLINE) continue; @@ -1548,13 +1548,13 @@ static void chain_group(void) chain_group(); } debug_printf_parse("%s: TC_GRPTERM\n", __func__); - } else if (c & (TC_OPSEQ | TC_OPTERM)) { - debug_printf_parse("%s: TC_OPSEQ | TC_OPTERM\n", __func__); + } else if (c & (TS_OPSEQ | TS_OPTERM)) { + debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); rollback_token(); chain_expr(OC_EXEC | Vx); } else { - /* TC_STATEMNT */ - debug_printf_parse("%s: TC_STATEMNT(?)\n", __func__); + /* TS_STATEMNT */ + debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); switch (t_info & OPCLSMASK) { case ST_IF: debug_printf_parse("%s: ST_IF\n", __func__); @@ -1563,7 +1563,7 @@ static void chain_group(void) chain_group(); n2 = chain_node(OC_EXEC); n->r.n = seq->last; - if (next_token(TC_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { + if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { chain_group(); n2->a.n = seq->last; } else { @@ -1616,10 +1616,10 @@ static void chain_group(void) 
case OC_PRINTF: debug_printf_parse("%s: OC_PRINT[F]\n", __func__); n = chain_node(t_info); - n->l.n = parse_expr(TC_OPTERM | TC_OUTRDR | TC_GRPTERM); + n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); if (t_tclass & TC_OUTRDR) { n->info |= t_info; - n->r.n = parse_expr(TC_OPTERM | TC_GRPTERM); + n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); } if (t_tclass & TC_GRPTERM) rollback_token(); @@ -1658,11 +1658,11 @@ static void parse_program(char *p) g_pos = p; t_lineno = 1; - while ((tclass = next_token(TC_EOF | TC_OPSEQ | TC_GRPSTART | - TC_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { + while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_GRPSTART | + TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { - if (tclass & TC_OPTERM) { - debug_printf_parse("%s: TC_OPTERM\n", __func__); + if (tclass & TS_OPTERM) { + debug_printf_parse("%s: TS_OPTERM\n", __func__); continue; } @@ -1706,11 +1706,11 @@ static void parse_program(char *p) seq = &f->body; chain_group(); clear_array(ahash); - } else if (tclass & TC_OPSEQ) { - debug_printf_parse("%s: TC_OPSEQ\n", __func__); + } else if (tclass & TS_OPSEQ) { + debug_printf_parse("%s: TS_OPSEQ\n", __func__); rollback_token(); cn = chain_node(OC_TEST); - cn->l.n = parse_expr(TC_OPTERM | TC_EOF | TC_GRPSTART); + cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_GRPSTART); if (t_tclass & TC_GRPSTART) { debug_printf_parse("%s: TC_GRPSTART\n", __func__); rollback_token(); -- 2.27.0 From 4f7c401cc4cc53e792ea2fc811618c0f9ae44c82 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 01:30:49 +0200 Subject: [PATCH 14/65] awk: deindent code block, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 177 +++++++++++++++++++++++++------------------------- 1 file changed, 90 insertions(+), 87 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 6f79dd138..a979a8aba 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1337,8 +1337,9 @@ static node *parse_expr(uint32_t term_tc) cn->a.n = glptr; 
expected_tc = TS_OPERAND | TS_UOPPRE; glptr = NULL; - - } else if (tc & (TS_BINOP | TC_UOPPOST)) { + continue; + } + if (tc & (TS_BINOP | TC_UOPPOST)) { debug_printf_parse("%s: TS_BINOP | TC_UOPPOST tc:%x\n", __func__, tc); /* for binary and postfix-unary operators, jump back over * previous operators with higher priority */ @@ -1368,101 +1369,103 @@ static node *parse_expr(uint32_t term_tc) expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; } vn->a.n = cn; + continue; + } - } else { - debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); - /* for operands and prefix-unary operators, attach them - * to last node */ - vn = cn; - cn = vn->r.n = new_node(t_info); - cn->a.n = vn; + debug_printf_parse("%s: other, t_info:%x\n", __func__, t_info); + /* for operands and prefix-unary operators, attach them + * to last node */ + vn = cn; + cn = vn->r.n = new_node(t_info); + cn->a.n = vn; - expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; - if (t_info == TI_PREINC || t_info == TI_PREDEC) - expected_tc = TS_LVALUE | TC_UOPPRE1; - if (tc & (TS_OPERAND | TC_REGEXP)) { - debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); - expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; - /* one should be very careful with switch on tclass - - * only simple tclasses should be used (TC_xyz, not TS_xyz) */ - switch (tc) { - case TC_VARIABLE: - case TC_ARRAY: - debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__); - cn->info = OC_VAR; - v = hash_search(ahash, t_string); - if (v != NULL) { - cn->info = OC_FNARG; - cn->l.aidx = v->x.aidx; - } else { - cn->l.v = newvar(t_string); - } - if (tc & TC_ARRAY) { - cn->info |= xS; - cn->r.n = parse_expr(TC_ARRTERM); - } - break; + expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; + if (t_info == TI_PREINC || t_info == TI_PREDEC) + expected_tc = TS_LVALUE | TC_UOPPRE1; - case TC_NUMBER: - case TC_STRING: - debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__); - cn->info = OC_VAR; - v = 
cn->l.v = xzalloc(sizeof(var)); - if (tc & TC_NUMBER) - setvar_i(v, t_double); - else { - setvar_s(v, t_string); - expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ - } - break; + if (!(tc & (TS_OPERAND | TC_REGEXP))) + continue; - case TC_REGEXP: - debug_printf_parse("%s: TC_REGEXP\n", __func__); - mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2)); - break; + debug_printf_parse("%s: TS_OPERAND | TC_REGEXP\n", __func__); + expected_tc = TS_UOPPRE | TC_UOPPOST | TS_BINOP | TS_OPERAND | term_tc; + /* one should be very careful with switch on tclass - + * only simple tclasses should be used (TC_xyz, not TS_xyz) */ + switch (tc) { + case TC_VARIABLE: + case TC_ARRAY: + debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__); + cn->info = OC_VAR; + v = hash_search(ahash, t_string); + if (v != NULL) { + cn->info = OC_FNARG; + cn->l.aidx = v->x.aidx; + } else { + cn->l.v = newvar(t_string); + } + if (tc & TC_ARRAY) { + cn->info |= xS; + cn->r.n = parse_expr(TC_ARRTERM); + } + break; - case TC_FUNCTION: - debug_printf_parse("%s: TC_FUNCTION\n", __func__); - cn->info = OC_FUNC; - cn->r.f = newfunc(t_string); - cn->l.n = condition(); - break; + case TC_NUMBER: + case TC_STRING: + debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__); + cn->info = OC_VAR; + v = cn->l.v = xzalloc(sizeof(var)); + if (tc & TC_NUMBER) + setvar_i(v, t_double); + else { + setvar_s(v, t_string); + expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */ + } + break; - case TC_SEQSTART: - debug_printf_parse("%s: TC_SEQSTART\n", __func__); - cn = vn->r.n = parse_expr(TC_SEQTERM); - if (!cn) - syntax_error("Empty sequence"); - cn->a.n = vn; - break; + case TC_REGEXP: + debug_printf_parse("%s: TC_REGEXP\n", __func__); + mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2)); + break; - case TC_GETLINE: - debug_printf_parse("%s: TC_GETLINE\n", __func__); - glptr = cn; - expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; - break; + case TC_FUNCTION: + debug_printf_parse("%s: 
TC_FUNCTION\n", __func__); + cn->info = OC_FUNC; + cn->r.f = newfunc(t_string); + cn->l.n = condition(); + break; - case TC_BUILTIN: - debug_printf_parse("%s: TC_BUILTIN\n", __func__); - cn->l.n = condition(); - break; + case TC_SEQSTART: + debug_printf_parse("%s: TC_SEQSTART\n", __func__); + cn = vn->r.n = parse_expr(TC_SEQTERM); + if (!cn) + syntax_error("Empty sequence"); + cn->a.n = vn; + break; - case TC_LENGTH: - debug_printf_parse("%s: TC_LENGTH\n", __func__); - next_token(TC_SEQSTART /* length(...) */ - | TS_OPTERM /* length; (or newline)*/ - | TC_GRPTERM /* length } */ - | TC_BINOPX /* length NUM */ - | TC_COMMA /* print length, 1 */ - ); - rollback_token(); - if (t_tclass & TC_SEQSTART) { - /* It was a "(" token. Handle just like TC_BUILTIN */ - cn->l.n = condition(); - } - break; - } + case TC_GETLINE: + debug_printf_parse("%s: TC_GETLINE\n", __func__); + glptr = cn; + expected_tc = TS_OPERAND | TS_UOPPRE | TS_BINOP | term_tc; + break; + + case TC_BUILTIN: + debug_printf_parse("%s: TC_BUILTIN\n", __func__); + cn->l.n = condition(); + break; + + case TC_LENGTH: + debug_printf_parse("%s: TC_LENGTH\n", __func__); + next_token(TC_SEQSTART /* length(...) */ + | TS_OPTERM /* length; (or newline)*/ + | TC_GRPTERM /* length } */ + | TC_BINOPX /* length NUM */ + | TC_COMMA /* print length, 1 */ + ); + rollback_token(); + if (t_tclass & TC_SEQSTART) { + /* It was a "(" token. 
Handle just like TC_BUILTIN */ + cn->l.n = condition(); } + break; } } /* while() */ -- 2.27.0 From aa11eb5a7f0b4f4067d9d8f188c607b5715a8133 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 01:50:47 +0200 Subject: [PATCH 15/65] awk: rename TC_SEQSTART/END to L/RPAREN, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 94 +++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a979a8aba..794a21e5d 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -207,48 +207,48 @@ typedef struct tsplitter_s { } tsplitter; /* simple token classes */ -/* Order and hex values are very important!!! See next_token() */ -#define TC_SEQSTART (1 << 0) /* ( */ -#define TC_SEQTERM (1 << 1) /* ) */ +/* order and hex values are very important!!! See next_token() */ +#define TC_LPAREN (1 << 0) /* ( */ +#define TC_RPAREN (1 << 1) /* ) */ #define TC_REGEXP (1 << 2) /* /.../ */ #define TC_OUTRDR (1 << 3) /* | > >> */ #define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ #define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ #define TC_BINOPX (1 << 6) /* two-opnd operator */ -#define TC_IN (1 << 7) -#define TC_COMMA (1 << 8) -#define TC_PIPE (1 << 9) /* input redirection pipe */ +#define TC_IN (1 << 7) /* 'in' */ +#define TC_COMMA (1 << 8) /* , */ +#define TC_PIPE (1 << 9) /* input redirection pipe | */ #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ #define TC_ARRTERM (1 << 11) /* ] */ #define TC_GRPSTART (1 << 12) /* { */ #define TC_GRPTERM (1 << 13) /* } */ -#define TC_SEMICOL (1 << 14) +#define TC_SEMICOL (1 << 14) /* ; */ #define TC_NEWLINE (1 << 15) #define TC_STATX (1 << 16) /* ctl statement (for, next...) */ -#define TC_WHILE (1 << 17) -#define TC_ELSE (1 << 18) +#define TC_WHILE (1 << 17) /* 'while' */ +#define TC_ELSE (1 << 18) /* 'else' */ #define TC_BUILTIN (1 << 19) /* This costs ~50 bytes of code. 
* A separate class to support deprecated "length" form. If we don't need that * (i.e. if we demand that only "length()" with () is valid), then TC_LENGTH * can be merged with TC_BUILTIN: */ -#define TC_LENGTH (1 << 20) -#define TC_GETLINE (1 << 21) +#define TC_LENGTH (1 << 20) /* 'length' */ +#define TC_GETLINE (1 << 21) /* 'getline' */ #define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ -#define TC_BEGIN (1 << 23) -#define TC_END (1 << 24) +#define TC_BEGIN (1 << 23) /* 'BEGIN' */ +#define TC_END (1 << 24) /* 'END' */ #define TC_EOF (1 << 25) -#define TC_VARIABLE (1 << 26) -#define TC_ARRAY (1 << 27) -#define TC_FUNCTION (1 << 28) -#define TC_STRING (1 << 29) +#define TC_VARIABLE (1 << 26) /* name */ +#define TC_ARRAY (1 << 27) /* name[ */ +#define TC_FUNCTION (1 << 28) /* name( - but unlike TC_ARRAY, parser does not consume '(' */ +#define TC_STRING (1 << 29) /* "..." */ #define TC_NUMBER (1 << 30) #ifndef debug_parse_print_tc #define debug_parse_print_tc(n) do { \ -if ((n) & TC_SEQSTART) debug_printf_parse(" SEQSTART"); \ -if ((n) & TC_SEQTERM ) debug_printf_parse(" SEQTERM" ); \ +if ((n) & TC_LPAREN ) debug_printf_parse(" LPAREN" ); \ +if ((n) & TC_RPAREN ) debug_printf_parse(" RPAREN" ); \ if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ @@ -288,7 +288,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ //#define TS_UNARYOP (TS_UOPPRE | TC_UOPPOST) #define TS_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \ | TC_BUILTIN | TC_LENGTH | TC_GETLINE \ - | TC_SEQSTART | TC_STRING | TC_NUMBER) + | TC_LPAREN | TC_STRING | TC_NUMBER) #define TS_LVALUE (TC_VARIABLE | TC_ARRAY) #define TS_STATEMNT (TC_STATX | TC_WHILE) @@ -310,7 +310,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ /* operator is inserted between them */ -#define TS_CONCAT_L (TC_VARIABLE 
| TC_ARRTERM | TC_SEQTERM \ +#define TS_CONCAT_L (TC_VARIABLE | TC_ARRTERM | TC_RPAREN \ | TC_STRING | TC_NUMBER | TC_UOPPOST \ | TC_LENGTH) #define TS_CONCAT_R (TS_OPERAND | TS_UOPPRE) @@ -394,8 +394,8 @@ enum { #define NTCC '\377' static const char tokenlist[] ALIGN1 = - "\1(" NTC /* TC_SEQSTART */ - "\1)" NTC /* TC_SEQTERM */ + "\1(" NTC /* TC_LPAREN */ + "\1)" NTC /* TC_RPAREN */ "\1/" NTC /* TC_REGEXP */ "\2>>" "\1>" "\1|" NTC /* TC_OUTRDR */ "\2++" "\2--" NTC /* TC_UOPPOST */ @@ -1250,9 +1250,9 @@ static uint32_t next_token(uint32_t expected) /* insert concatenation operator when needed */ debug_printf_parse("%s: concat_inserted if all nonzero: %x %x %x %x\n", __func__, (last_token_class & TS_CONCAT_L), (tc & TS_CONCAT_R), (expected & TS_BINOP), - !(last_token_class == TC_LENGTH && tc == TC_SEQSTART)); + !(last_token_class == TC_LENGTH && tc == TC_LPAREN)); if ((last_token_class & TS_CONCAT_L) && (tc & TS_CONCAT_R) && (expected & TS_BINOP) - && !(last_token_class == TC_LENGTH && tc == TC_SEQSTART) /* but not for "length(..." */ + && !(last_token_class == TC_LENGTH && tc == TC_LPAREN) /* but not for "length(..." 
*/ ) { concat_inserted = TRUE; save_tclass = tc; @@ -1304,10 +1304,10 @@ static void mk_re_node(const char *s, node *n, regex_t *re) xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE); } -static node *condition(void) +static node *parse_lrparen_list(void) { - next_token(TC_SEQSTART); - return parse_expr(TC_SEQTERM); + next_token(TC_LPAREN); + return parse_expr(TC_RPAREN); } /* parse expression terminated by given argument, return ptr @@ -1430,12 +1430,12 @@ static node *parse_expr(uint32_t term_tc) debug_printf_parse("%s: TC_FUNCTION\n", __func__); cn->info = OC_FUNC; cn->r.f = newfunc(t_string); - cn->l.n = condition(); + cn->l.n = parse_lrparen_list(); break; - case TC_SEQSTART: - debug_printf_parse("%s: TC_SEQSTART\n", __func__); - cn = vn->r.n = parse_expr(TC_SEQTERM); + case TC_LPAREN: + debug_printf_parse("%s: TC_LPAREN\n", __func__); + cn = vn->r.n = parse_expr(TC_RPAREN); if (!cn) syntax_error("Empty sequence"); cn->a.n = vn; @@ -1449,21 +1449,21 @@ static node *parse_expr(uint32_t term_tc) case TC_BUILTIN: debug_printf_parse("%s: TC_BUILTIN\n", __func__); - cn->l.n = condition(); + cn->l.n = parse_lrparen_list(); break; case TC_LENGTH: debug_printf_parse("%s: TC_LENGTH\n", __func__); - next_token(TC_SEQSTART /* length(...) */ + next_token(TC_LPAREN /* length(...) */ | TS_OPTERM /* length; (or newline)*/ | TC_GRPTERM /* length } */ | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ ); rollback_token(); - if (t_tclass & TC_SEQSTART) { + if (t_tclass & TC_LPAREN) { /* It was a "(" token. 
Handle just like TC_BUILTIN */ - cn->l.n = condition(); + cn->l.n = parse_lrparen_list(); } break; } @@ -1562,7 +1562,7 @@ static void chain_group(void) case ST_IF: debug_printf_parse("%s: ST_IF\n", __func__); n = chain_node(OC_BR | Vx); - n->l.n = condition(); + n->l.n = parse_lrparen_list(); chain_group(); n2 = chain_node(OC_EXEC); n->r.n = seq->last; @@ -1576,7 +1576,7 @@ static void chain_group(void) case ST_WHILE: debug_printf_parse("%s: ST_WHILE\n", __func__); - n2 = condition(); + n2 = parse_lrparen_list(); n = chain_loop(NULL); n->l.n = n2; break; @@ -1587,14 +1587,14 @@ static void chain_group(void) n = chain_loop(NULL); n2->a.n = n->a.n; next_token(TC_WHILE); - n->l.n = condition(); + n->l.n = parse_lrparen_list(); break; case ST_FOR: debug_printf_parse("%s: ST_FOR\n", __func__); - next_token(TC_SEQSTART); - n2 = parse_expr(TC_SEMICOL | TC_SEQTERM); - if (t_tclass & TC_SEQTERM) { /* for-in */ + next_token(TC_LPAREN); + n2 = parse_expr(TC_SEMICOL | TC_RPAREN); + if (t_tclass & TC_RPAREN) { /* for-in */ if (!n2 || (n2->info & OPCLSMASK) != OC_IN) syntax_error(EMSG_UNEXP_TOKEN); n = chain_node(OC_WALKINIT | VV); @@ -1607,7 +1607,7 @@ static void chain_group(void) n = chain_node(OC_EXEC | Vx); n->l.n = n2; n2 = parse_expr(TC_SEMICOL); - n3 = parse_expr(TC_SEQTERM); + n3 = parse_expr(TC_RPAREN); n = chain_loop(n3); n->l.n = n2; if (!n2) @@ -1686,13 +1686,13 @@ static void parse_program(char *p) f->body.first = NULL; f->nargs = 0; /* Match func arg list: a comma sep list of >= 0 args, and a close paren */ - while (next_token(TC_VARIABLE | TC_SEQTERM | TC_COMMA)) { + while (next_token(TC_VARIABLE | TC_RPAREN | TC_COMMA)) { /* Either an empty arg list, or trailing comma from prev iter * must be followed by an arg */ - if (f->nargs == 0 && t_tclass == TC_SEQTERM) + if (f->nargs == 0 && t_tclass == TC_RPAREN) break; - /* TC_SEQSTART/TC_COMMA must be followed by TC_VARIABLE */ + /* TC_LPAREN/TC_COMMA must be followed by TC_VARIABLE */ if (t_tclass != TC_VARIABLE) 
syntax_error(EMSG_UNEXP_TOKEN); @@ -1700,7 +1700,7 @@ static void parse_program(char *p) v->x.aidx = f->nargs++; /* Arg followed either by end of arg list or 1 comma */ - if (next_token(TC_COMMA | TC_SEQTERM) & TC_SEQTERM) + if (next_token(TC_COMMA | TC_RPAREN) & TC_RPAREN) break; //Impossible: next_token() above would error out and die // if (t_tclass != TC_COMMA) -- 2.27.0 From 913c56fb349585648ad1b0860965b8a1d906be53 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 02:32:32 +0200 Subject: [PATCH 16/65] awk: simplify parsing of function declaration function old new delta parse_program 328 313 -15 Signed-off-by: Denys Vlasenko --- editors/awk.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 794a21e5d..8d449dfb6 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -769,7 +769,7 @@ static void hash_remove(xhash *hash, const char *name) static char *skip_spaces(char *p) { - while (1) { + for (;;) { if (*p == '\\' && p[1] == '\n') { p++; t_lineno++; @@ -1685,26 +1685,20 @@ static void parse_program(char *p) f = newfunc(t_string); f->body.first = NULL; f->nargs = 0; - /* Match func arg list: a comma sep list of >= 0 args, and a close paren */ - while (next_token(TC_VARIABLE | TC_RPAREN | TC_COMMA)) { - /* Either an empty arg list, or trailing comma from prev iter - * must be followed by an arg */ - if (f->nargs == 0 && t_tclass == TC_RPAREN) - break; - - /* TC_LPAREN/TC_COMMA must be followed by TC_VARIABLE */ - if (t_tclass != TC_VARIABLE) + /* func arg list: comma sep list of args, and a close paren */ + for (;;) { + if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { + if (f->nargs == 0) + break; /* func() is ok */ + /* func(a,) is not ok */ syntax_error(EMSG_UNEXP_TOKEN); - + } v = findvar(ahash, t_string); v->x.aidx = f->nargs++; - /* Arg followed either by end of arg list or 1 comma */ - if (next_token(TC_COMMA | TC_RPAREN) & TC_RPAREN) + if 
(next_token(TC_COMMA | TC_RPAREN) == TC_RPAREN) break; -//Impossible: next_token() above would error out and die -// if (t_tclass != TC_COMMA) -// syntax_error(EMSG_UNEXP_TOKEN); + /* it was a comma, we ate it */ } seq = &f->body; chain_group(); -- 2.27.0 From 836373527c608ea8beaef994df93175ca38dc015 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 02:43:02 +0200 Subject: [PATCH 17/65] awk: g_buf[] does not need a separate allocation function old new delta exec_builtin 1400 1414 +14 evaluate 3132 3141 +9 getvar_s 121 125 +4 awk_main 902 886 -16 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 3/1 up/down: 27/-16) Total: 11 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 8d449dfb6..e91ab9f97 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -535,7 +535,6 @@ struct globals { var *Fields; nvblock *g_cb; char *g_pos; - char *g_buf; smallint icase; smallint exiting; smallint nextrec; @@ -571,6 +570,8 @@ struct globals2 { /* biggest and least used members go last */ tsplitter fsplitter, rsplitter; + + char g_buf[MAXVARFMT + 1]; }; #define G1 (ptr_to_globals[-1]) #define G (*(struct globals2 *)ptr_to_globals) @@ -598,7 +599,6 @@ struct globals2 { #define Fields (G1.Fields ) #define g_cb (G1.g_cb ) #define g_pos (G1.g_pos ) -#define g_buf (G1.g_buf ) #define icase (G1.icase ) #define exiting (G1.exiting ) #define nextrec (G1.nextrec ) @@ -612,6 +612,7 @@ struct globals2 { #define intvar (G.intvar ) #define fsplitter (G.fsplitter ) #define rsplitter (G.rsplitter ) +#define g_buf (G.g_buf ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ t_tclass = TS_OPTERM; \ @@ -3353,9 +3354,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_LOCALE_SUPPORT) setlocale(LC_NUMERIC, "C"); - /* allocate global buffer */ - g_buf = 
xmalloc(MAXVARFMT + 1); - vhash = hash_init(); ahash = hash_init(); fdhash = hash_init(); -- 2.27.0 From 10b73e0d6fc5df5e30d1479f3460aab793706d5f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 03:02:21 +0200 Subject: [PATCH 18/65] awk: when parsing TC_FUNCTION token, eat its opening '(' ...like we do for array references. function old new delta parse_expr 938 948 +10 next_token 788 791 +3 parse_program 313 310 -3 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/1 up/down: 13/-3) Total: 10 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index e91ab9f97..661081e24 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -241,7 +241,7 @@ typedef struct tsplitter_s { #define TC_EOF (1 << 25) #define TC_VARIABLE (1 << 26) /* name */ #define TC_ARRAY (1 << 27) /* name[ */ -#define TC_FUNCTION (1 << 28) /* name( - but unlike TC_ARRAY, parser does not consume '(' */ +#define TC_FUNCTION (1 << 28) /* name( */ #define TC_STRING (1 << 29) /* "..." */ #define TC_NUMBER (1 << 30) @@ -959,6 +959,7 @@ static double getvar_i(var *v) v->number = my_strtod(&s); debug_printf_eval("%f (s:'%s')\n", v->number, s); if (v->type & VF_USER) { +//TODO: skip_spaces() also skips backslash+newline, is it intended here? 
s = skip_spaces(s); if (*s != '\0') v->type &= ~VF_USER; @@ -1103,7 +1104,7 @@ static uint32_t next_token(uint32_t expected) #define save_tclass (G.next_token__save_tclass) #define save_info (G.next_token__save_info) - char *p, *s; + char *p; const char *tl; const uint32_t *ti; uint32_t tc, last_token_class; @@ -1131,15 +1132,12 @@ static uint32_t next_token(uint32_t expected) while (*p != '\n' && *p != '\0') p++; - if (*p == '\n') - t_lineno++; - if (*p == '\0') { tc = TC_EOF; debug_printf_parse("%s: token found: TC_EOF\n", __func__); } else if (*p == '\"') { /* it's a string */ - t_string = s = ++p; + char *s = t_string = ++p; while (*p != '\"') { char *pp; if (*p == '\0' || *p == '\n') @@ -1154,7 +1152,7 @@ static uint32_t next_token(uint32_t expected) debug_printf_parse("%s: token found:'%s' TC_STRING\n", __func__, t_string); } else if ((expected & TC_REGEXP) && *p == '/') { /* it's regexp */ - t_string = s = ++p; + char *s = t_string = ++p; while (*p != '/') { if (*p == '\0' || *p == '\n') syntax_error(EMSG_UNEXP_EOS); @@ -1185,6 +1183,9 @@ static uint32_t next_token(uint32_t expected) tc = TC_NUMBER; debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double); } else { + if (*p == '\n') + t_lineno++; + /* search for something known */ tl = tokenlist; tc = 0x00000001; @@ -1230,15 +1231,15 @@ static uint32_t next_token(uint32_t expected) if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) p = skip_spaces(p); if (*p == '(') { + p++; tc = TC_FUNCTION; debug_printf_parse("%s: token found:'%s' TC_FUNCTION\n", __func__, t_string); + } else if (*p == '[') { + p++; + tc = TC_ARRAY; + debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); } else { - if (*p == '[') { - p++; - tc = TC_ARRAY; - debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); - } else - debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); + debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); } 
} token_found: @@ -1431,7 +1432,7 @@ static node *parse_expr(uint32_t term_tc) debug_printf_parse("%s: TC_FUNCTION\n", __func__); cn->info = OC_FUNC; cn->r.f = newfunc(t_string); - cn->l.n = parse_lrparen_list(); + cn->l.n = parse_expr(TC_RPAREN); break; case TC_LPAREN: @@ -1682,7 +1683,6 @@ static void parse_program(char *p) } else if (tclass & TC_FUNCDECL) { debug_printf_parse("%s: TC_FUNCDECL\n", __func__); next_token(TC_FUNCTION); - g_pos++; f = newfunc(t_string); f->body.first = NULL; f->nargs = 0; -- 2.27.0 From f73157f8e78b4905ffc89117112393100d873061 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 03:27:07 +0200 Subject: [PATCH 19/65] awk: get rid of "move name one char back" trick in next_token() function old new delta next_token 791 812 +21 awk_main 886 831 -55 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 21/-55) Total: -34 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 661081e24..94f51c529 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -535,6 +535,7 @@ struct globals { var *Fields; nvblock *g_cb; char *g_pos; + char g_saved_ch; smallint icase; smallint exiting; smallint nextrec; @@ -599,6 +600,7 @@ struct globals2 { #define Fields (G1.Fields ) #define g_cb (G1.g_cb ) #define g_pos (G1.g_pos ) +#define g_saved_ch (G1.g_saved_ch ) #define icase (G1.icase ) #define exiting (G1.exiting ) #define nextrec (G1.nextrec ) @@ -1125,6 +1127,10 @@ static uint32_t next_token(uint32_t expected) t_info = save_info; } else { p = g_pos; + if (g_saved_ch != '\0') { + *p = g_saved_ch; + g_saved_ch = '\0'; + } readnext: p = skip_spaces(p); g_lineno = t_lineno; @@ -1183,6 +1189,8 @@ static uint32_t next_token(uint32_t expected) tc = TC_NUMBER; debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double); 
} else { + char *end_of_name; + if (*p == '\n') t_lineno++; @@ -1219,16 +1227,14 @@ static uint32_t next_token(uint32_t expected) if (!isalnum_(*p)) syntax_error(EMSG_UNEXP_TOKEN); /* no */ /* yes */ -/* "move name one char back" trick: we need a byte for NUL terminator */ -/* NB: this results in argv[i][-1] being used (!!!) in e.g. "awk -e 'NAME'" case */ - t_string = --p; - while (isalnum_(*++p)) { - p[-1] = *p; - } - p[-1] = '\0'; + t_string = p; + while (isalnum_(*p)) + p++; + end_of_name = p; tc = TC_VARIABLE; /* also consume whitespace between functionname and bracket */ if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) +//TODO: why if variable can be here (but not array ref), skipping is not allowed? Example where it matters? p = skip_spaces(p); if (*p == '(') { p++; @@ -1240,7 +1246,19 @@ static uint32_t next_token(uint32_t expected) debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); } else { debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); + if (end_of_name == p) { + /* there is no space for trailing NUL in t_string! + * We need to save the char we are going to NUL. + * (we'll use it in future call to next_token()) + */ + g_saved_ch = *end_of_name; +// especially pathological example is V="abc"; V.2 - it's V concatenated to .2 +// (it evaluates to "abc0.2"). Because of this case, we can't simply cache +// '.' and analyze it later: we also have to *store it back* in next +// next_token(), in order to give my_strtod() the undamaged ".2" string. 
+ } } + *end_of_name = '\0'; /* terminate t_string */ } token_found: g_pos = p; @@ -3420,38 +3438,20 @@ int awk_main(int argc UNUSED_PARAM, char **argv) g_progname = llist_pop(&list_f); fd = xopen_stdin(g_progname); - /* 1st byte is reserved for "move name one char back" trick in next_token */ - i = 1; - s = NULL; - for (;;) { - int sz; - s = xrealloc(s, i + 1000); - sz = safe_read(fd, s + i, 1000); - if (sz <= 0) - break; - i += sz; - } - s = xrealloc(s, i + 1); /* trim unused 999 bytes */ - s[i] = '\0'; + s = xmalloc_read(fd, NULL); /* it's NUL-terminated */ close(fd); - parse_program(s + 1); + parse_program(s); free(s); } g_progname = "cmd. line"; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS while (list_e) { - /* NB: "move name one char back" trick in next_token - * can use argv[i][-1] here. - */ parse_program(llist_pop(&list_e)); } #endif if (!(opt & (OPT_f | OPT_e))) { if (!*argv) bb_show_usage(); - /* NB: "move name one char back" trick in next_token - * can use argv[i][-1] here. - */ parse_program(*argv++); } -- 2.27.0 From f2e57cb548857109fbd7f5227cf82fb40b733441 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 03:44:56 +0200 Subject: [PATCH 20/65] awk: code shrink function old new delta parse_expr 948 945 -3 chain_expr 65 62 -3 chain_group 655 649 -6 parse_program 310 303 -7 rollback_token 10 - -10 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-29) Total: -29 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 94f51c529..93f0beb1a 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1300,7 +1300,7 @@ static uint32_t next_token(uint32_t expected) #undef save_info } -static void rollback_token(void) +static ALWAYS_INLINE void rollback_token(void) { t_rollback = TRUE; } @@ -1474,14 +1474,14 @@ static node *parse_expr(uint32_t term_tc) case TC_LENGTH: 
debug_printf_parse("%s: TC_LENGTH\n", __func__); - next_token(TC_LPAREN /* length(...) */ + tc = next_token(TC_LPAREN /* length(...) */ | TS_OPTERM /* length; (or newline)*/ | TC_GRPTERM /* length } */ | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ ); rollback_token(); - if (t_tclass & TC_LPAREN) { + if (tc & TC_LPAREN) { /* It was a "(" token. Handle just like TC_BUILTIN */ cn->l.n = parse_lrparen_list(); } @@ -1563,19 +1563,23 @@ static void chain_group(void) if (c & TC_GRPSTART) { debug_printf_parse("%s: TC_GRPSTART\n", __func__); - while (next_token(TS_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) { + while ((c = next_token(TS_GRPSEQ | TC_GRPTERM)) != TC_GRPTERM) { debug_printf_parse("%s: !TC_GRPTERM\n", __func__); - if (t_tclass & TC_NEWLINE) + if (c & TC_NEWLINE) continue; rollback_token(); chain_group(); } debug_printf_parse("%s: TC_GRPTERM\n", __func__); - } else if (c & (TS_OPSEQ | TS_OPTERM)) { + return; + } + if (c & (TS_OPSEQ | TS_OPTERM)) { debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); rollback_token(); chain_expr(OC_EXEC | Vx); - } else { + return; + } + { /* TS_STATEMNT */ debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); switch (t_info & OPCLSMASK) { -- 2.27.0 From 160a953d8a3327b96dd81ae8f2096a0265076462 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 03:47:46 +0200 Subject: [PATCH 21/65] awk: deindent a block, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 167 +++++++++++++++++++++++++------------------------- 1 file changed, 83 insertions(+), 84 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 93f0beb1a..a4708516c 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1579,98 +1579,97 @@ static void chain_group(void) chain_expr(OC_EXEC | Vx); return; } - { - /* TS_STATEMNT */ - debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); - switch (t_info & OPCLSMASK) { - case ST_IF: - debug_printf_parse("%s: ST_IF\n", __func__); - n = chain_node(OC_BR | Vx); - n->l.n = 
parse_lrparen_list(); + + /* TS_STATEMNT */ + debug_printf_parse("%s: TS_STATEMNT(?)\n", __func__); + switch (t_info & OPCLSMASK) { + case ST_IF: + debug_printf_parse("%s: ST_IF\n", __func__); + n = chain_node(OC_BR | Vx); + n->l.n = parse_lrparen_list(); + chain_group(); + n2 = chain_node(OC_EXEC); + n->r.n = seq->last; + if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { chain_group(); - n2 = chain_node(OC_EXEC); - n->r.n = seq->last; - if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { - chain_group(); - n2->a.n = seq->last; - } else { - rollback_token(); - } - break; + n2->a.n = seq->last; + } else { + rollback_token(); + } + break; - case ST_WHILE: - debug_printf_parse("%s: ST_WHILE\n", __func__); - n2 = parse_lrparen_list(); - n = chain_loop(NULL); - n->l.n = n2; - break; + case ST_WHILE: + debug_printf_parse("%s: ST_WHILE\n", __func__); + n2 = parse_lrparen_list(); + n = chain_loop(NULL); + n->l.n = n2; + break; - case ST_DO: - debug_printf_parse("%s: ST_DO\n", __func__); - n2 = chain_node(OC_EXEC); - n = chain_loop(NULL); - n2->a.n = n->a.n; - next_token(TC_WHILE); - n->l.n = parse_lrparen_list(); - break; + case ST_DO: + debug_printf_parse("%s: ST_DO\n", __func__); + n2 = chain_node(OC_EXEC); + n = chain_loop(NULL); + n2->a.n = n->a.n; + next_token(TC_WHILE); + n->l.n = parse_lrparen_list(); + break; - case ST_FOR: - debug_printf_parse("%s: ST_FOR\n", __func__); - next_token(TC_LPAREN); - n2 = parse_expr(TC_SEMICOL | TC_RPAREN); - if (t_tclass & TC_RPAREN) { /* for-in */ - if (!n2 || (n2->info & OPCLSMASK) != OC_IN) - syntax_error(EMSG_UNEXP_TOKEN); - n = chain_node(OC_WALKINIT | VV); - n->l.n = n2->l.n; - n->r.n = n2->r.n; - n = chain_loop(NULL); - n->info = OC_WALKNEXT | Vx; - n->l.n = n2->l.n; - } else { /* for (;;) */ - n = chain_node(OC_EXEC | Vx); - n->l.n = n2; - n2 = parse_expr(TC_SEMICOL); - n3 = parse_expr(TC_RPAREN); - n = chain_loop(n3); - n->l.n = n2; - if (!n2) - n->info = OC_EXEC; - } - break; + case ST_FOR: + 
debug_printf_parse("%s: ST_FOR\n", __func__); + next_token(TC_LPAREN); + n2 = parse_expr(TC_SEMICOL | TC_RPAREN); + if (t_tclass & TC_RPAREN) { /* for-in */ + if (!n2 || (n2->info & OPCLSMASK) != OC_IN) + syntax_error(EMSG_UNEXP_TOKEN); + n = chain_node(OC_WALKINIT | VV); + n->l.n = n2->l.n; + n->r.n = n2->r.n; + n = chain_loop(NULL); + n->info = OC_WALKNEXT | Vx; + n->l.n = n2->l.n; + } else { /* for (;;) */ + n = chain_node(OC_EXEC | Vx); + n->l.n = n2; + n2 = parse_expr(TC_SEMICOL); + n3 = parse_expr(TC_RPAREN); + n = chain_loop(n3); + n->l.n = n2; + if (!n2) + n->info = OC_EXEC; + } + break; - case OC_PRINT: - case OC_PRINTF: - debug_printf_parse("%s: OC_PRINT[F]\n", __func__); - n = chain_node(t_info); - n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); - if (t_tclass & TC_OUTRDR) { - n->info |= t_info; - n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); - } - if (t_tclass & TC_GRPTERM) - rollback_token(); - break; + case OC_PRINT: + case OC_PRINTF: + debug_printf_parse("%s: OC_PRINT[F]\n", __func__); + n = chain_node(t_info); + n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); + if (t_tclass & TC_OUTRDR) { + n->info |= t_info; + n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); + } + if (t_tclass & TC_GRPTERM) + rollback_token(); + break; - case OC_BREAK: - debug_printf_parse("%s: OC_BREAK\n", __func__); - n = chain_node(OC_EXEC); - n->a.n = break_ptr; - chain_expr(t_info); - break; + case OC_BREAK: + debug_printf_parse("%s: OC_BREAK\n", __func__); + n = chain_node(OC_EXEC); + n->a.n = break_ptr; + chain_expr(t_info); + break; - case OC_CONTINUE: - debug_printf_parse("%s: OC_CONTINUE\n", __func__); - n = chain_node(OC_EXEC); - n->a.n = continue_ptr; - chain_expr(t_info); - break; + case OC_CONTINUE: + debug_printf_parse("%s: OC_CONTINUE\n", __func__); + n = chain_node(OC_EXEC); + n->a.n = continue_ptr; + chain_expr(t_info); + break; - /* delete, next, nextfile, return, exit */ - default: - debug_printf_parse("%s: default\n", __func__); - 
chain_expr(t_info); - } + /* delete, next, nextfile, return, exit */ + default: + debug_printf_parse("%s: default\n", __func__); + chain_expr(t_info); } } -- 2.27.0 From d383cc84ba220eb2c620ffa72853585240e383d1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 12:16:36 +0200 Subject: [PATCH 22/65] awk: fix parsing of expressions such as "v (a)" function old new delta next_token 812 825 +13 Signed-off-by: Denys Vlasenko --- editors/awk.c | 22 ++++++++++++++++++---- testsuite/awk.tests | 11 +++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a4708516c..d9e326074 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1231,11 +1231,24 @@ static uint32_t next_token(uint32_t expected) while (isalnum_(*p)) p++; end_of_name = p; - tc = TC_VARIABLE; - /* also consume whitespace between functionname and bracket */ - if (!(expected & TC_VARIABLE) || (expected & TC_ARRAY)) -//TODO: why if variable can be here (but not array ref), skipping is not allowed? Example where it matters? + + if (last_token_class == TC_FUNCDECL) + /* eat space in "function FUNC (...) {...}" declaration */ p = skip_spaces(p); + else if (expected & TC_ARRAY) { + /* eat space between array name and [ */ + char *s = skip_spaces(p); + if (*s == '[') /* array ref, not just a name? */ + p = s; + } + /* else: do NOT consume whitespace after variable name! + * gawk allows definition "function FUNC (p) {...}" - note space, + * but disallows the call "FUNC (p)" because it isn't one - + * expression "v (a)" should NOT be parsed as TC_FUNCTION: + * it is a valid concatenation if "v" is a variable, + * not a function name (and type of name is not known at parse time). 
+ */ + if (*p == '(') { p++; tc = TC_FUNCTION; @@ -1245,6 +1258,7 @@ static uint32_t next_token(uint32_t expected) tc = TC_ARRAY; debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string); } else { + tc = TC_VARIABLE; debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string); if (end_of_name == p) { /* there is no space for trailing NUL in t_string! diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 6489dc082..c9d0ef9e5 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -71,6 +71,17 @@ testing "awk properly handles undefined function" \ "L1\n\nawk: cmd. line:5: Call to undefined function\n" \ "" "" +prg=' +BEGIN { + v=1 + a=2 + print v (a) +}' +testing "'v (a)' is not a function call, it is a concatenation" \ + "awk '$prg' 2>&1" \ + "12\n" \ + "" "" + optional DESKTOP testing "awk hex const 1" "awk '{ print or(0xffffffff,1) }'" "4294967295\n" "" "\n" -- 2.27.0 From cbfa3d396b7c15c41f90af7d009aa7fc7161a629 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 14:33:04 +0200 Subject: [PATCH 23/65] awk: document which hashes are used at what state (parse/execute) We can free them after they are no longer needed. (Currently, being a NOEXEC applet is much larger waste of memory for the case of long-running awk script). 
function old new delta awk_main 831 827 -4 Signed-off-by: Denys Vlasenko --- editors/awk.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index d9e326074..72c8dfacf 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -527,7 +527,10 @@ struct globals { chain *seq; node *break_ptr, *continue_ptr; rstream *iF; - xhash *vhash, *ahash, *fdhash, *fnhash; + xhash *ahash; /* argument names, used only while parsing function bodies */ + xhash *fnhash; /* function names, used only in parsing stage */ + xhash *vhash; /* variables and arrays */ + xhash *fdhash; /* file objects, used only in execution stage */ const char *g_progname; int g_lineno; int nfields; @@ -1719,6 +1722,7 @@ static void parse_program(char *p) debug_printf_parse("%s: TC_FUNCDECL\n", __func__); next_token(TC_FUNCTION); f = newfunc(t_string); +//FIXME: dup check: functions can't be redefined, this is not ok: awk 'func f(){}; func f(){}' f->body.first = NULL; f->nargs = 0; /* func arg list: comma sep list of args, and a close paren */ @@ -3389,12 +3393,8 @@ int awk_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_LOCALE_SUPPORT) setlocale(LC_NUMERIC, "C"); - vhash = hash_init(); - ahash = hash_init(); - fdhash = hash_init(); - fnhash = hash_init(); - /* initialize variables */ + vhash = hash_init(); { char *vnames = (char *)vNames; /* cheat */ char *vvalues = (char *)vValues; @@ -3416,10 +3416,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) handle_special(intvar[FS]); handle_special(intvar[RS]); - newfile("/dev/stdin")->F = stdin; - newfile("/dev/stdout")->F = stdout; - newfile("/dev/stderr")->F = stderr; - /* Huh, people report that sometimes environ is NULL. Oh well. 
*/ if (environ) { char **envp; @@ -3449,6 +3445,10 @@ int awk_main(int argc UNUSED_PARAM, char **argv) if (!is_assignment(llist_pop(&list_v))) bb_show_usage(); } + + /* Parse all supplied programs */ + fnhash = hash_init(); + ahash = hash_init(); while (list_f) { int fd; char *s; @@ -3471,6 +3471,11 @@ int awk_main(int argc UNUSED_PARAM, char **argv) bb_show_usage(); parse_program(*argv++); } + //free_hash(ahash) // ~250 bytes, arg names, used only during parse of function bodies + //ahash = NULL; // debug + //free_hash(fnhash) // ~250 bytes, used only for function names + //fnhash = NULL; // debug + /* parsing done, on to executing */ /* fill in ARGV array */ setari_u(intvar[ARGV], 0, "awk"); @@ -3479,6 +3484,11 @@ int awk_main(int argc UNUSED_PARAM, char **argv) setari_u(intvar[ARGV], ++i, *argv++); setvar_i(intvar[ARGC], i + 1); + fdhash = hash_init(); + newfile("/dev/stdin")->F = stdin; + newfile("/dev/stdout")->F = stdout; + newfile("/dev/stderr")->F = stderr; + zero_out_var(&tv); evaluate(beginseq.first, &tv); if (!mainseq.first && !endseq.first) -- 2.27.0 From 117b2bb381a726f938fdb49283961f843ad55e28 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 18:33:25 +0200 Subject: [PATCH 24/65] awk: free unused parsing structures after parse is done function old new delta hash_clear - 90 +90 awk_main 827 849 +22 clear_array 90 - -90 ------------------------------------------------------------------------------ (add/remove: 1/1 grow/shrink: 1/0 up/down: 112/-90) Total: 22 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 74 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 72c8dfacf..e603f72db 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -530,7 +530,8 @@ struct globals { xhash *ahash; /* argument names, used only while parsing function bodies */ xhash *fnhash; /* function names, used only in parsing stage */ xhash *vhash; /* variables and 
arrays */ - xhash *fdhash; /* file objects, used only in execution stage */ + //xhash *fdhash; /* file objects, used only in execution stage */ + //we are reusing ahash as fdhash, via define (see later) const char *g_progname; int g_lineno; int nfields; @@ -592,10 +593,13 @@ struct globals2 { #define break_ptr (G1.break_ptr ) #define continue_ptr (G1.continue_ptr) #define iF (G1.iF ) -#define vhash (G1.vhash ) #define ahash (G1.ahash ) -#define fdhash (G1.fdhash ) #define fnhash (G1.fnhash ) +#define vhash (G1.vhash ) +#define fdhash ahash +//^^^^^^^^^^^^^^^^^^ ahash is cleared after every function parsing, +// and ends up empty after parsing phase. Thus, we can simply reuse it +// for fdhash in execution stage. #define g_progname (G1.g_progname ) #define g_lineno (G1.g_lineno ) #define nfields (G1.nfields ) @@ -682,6 +686,33 @@ static xhash *hash_init(void) return newhash; } +static void hash_clear(xhash *hash) +{ + unsigned i; + hash_item *hi, *thi; + + for (i = 0; i < hash->csize; i++) { + hi = hash->items[i]; + while (hi) { + thi = hi; + hi = hi->next; + free(thi->data.v.string); + free(thi); + } + hash->items[i] = NULL; + } + hash->glen = hash->nel = 0; +} + +#if 0 //UNUSED +static void hash_free(xhash *hash) +{ + hash_clear(hash); + free(hash->items); + free(hash); +} +#endif + /* find item in hash, return ptr to data, NULL if not found */ static void *hash_search(xhash *hash, const char *name) { @@ -869,23 +900,7 @@ static xhash *iamarray(var *v) return a->x.array; } -static void clear_array(xhash *array) -{ - unsigned i; - hash_item *hi, *thi; - - for (i = 0; i < array->csize; i++) { - hi = array->items[i]; - while (hi) { - thi = hi; - hi = hi->next; - free(thi->data.v.string); - free(thi); - } - array->items[i] = NULL; - } - array->glen = array->nel = 0; -} +#define clear_array(array) hash_clear(array) /* clear a variable */ static var *clrvar(var *v) @@ -1742,7 +1757,7 @@ static void parse_program(char *p) } seq = &f->body; chain_group(); - 
clear_array(ahash); + hash_clear(ahash); } else if (tclass & TS_OPSEQ) { debug_printf_parse("%s: TS_OPSEQ\n", __func__); rollback_token(); @@ -3471,11 +3486,16 @@ int awk_main(int argc UNUSED_PARAM, char **argv) bb_show_usage(); parse_program(*argv++); } - //free_hash(ahash) // ~250 bytes, arg names, used only during parse of function bodies - //ahash = NULL; // debug - //free_hash(fnhash) // ~250 bytes, used only for function names - //fnhash = NULL; // debug - /* parsing done, on to executing */ + /* Free unused parse structures */ + //hash_free(fnhash); // ~250 bytes when empty, used only for function names + //^^^^^^^^^^^^^^^^^ does not work, hash_clear() inside SEGVs + // (IOW: hash_clear() assumes it's a hash of variables. fnhash is not). + free(fnhash->items); + free(fnhash); + fnhash = NULL; // debug + //hash_free(ahash); // empty after parsing, will reuse as fdhash instead of freeing + + /* Parsing done, on to executing */ /* fill in ARGV array */ setari_u(intvar[ARGV], 0, "awk"); @@ -3484,7 +3504,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) setari_u(intvar[ARGV], ++i, *argv++); setvar_i(intvar[ARGC], i + 1); - fdhash = hash_init(); + //fdhash = ahash - done via define newfile("/dev/stdin")->F = stdin; newfile("/dev/stdout")->F = stdout; newfile("/dev/stderr")->F = stderr; -- 2.27.0 From 3457d8f7713f5b00631f8b761cda36608435ddf3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Jun 2021 19:07:36 +0200 Subject: [PATCH 25/65] awk: assorted optimizations hash_find(): do not calculate hash twice. Do not divide - can use cheap multiply-by-8 shift. nextword(): do not repeatedly increment in-memory value, do it in register, then store final result. hashwalk_init(): do not strlen() twice. 
function old new delta hash_search3 - 49 +49 hash_find 259 281 +22 nextword 19 16 -3 evaluate 3141 3137 -4 hash_search 54 28 -26 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 1/3 up/down: 71/-33) Total: 38 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index e603f72db..ca7444e55 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -696,6 +696,7 @@ static void hash_clear(xhash *hash) while (hi) { thi = hi; hi = hi->next; +//FIXME: this assumes that it's a hash of *variables*: free(thi->data.v.string); free(thi); } @@ -714,11 +715,11 @@ static void hash_free(xhash *hash) #endif /* find item in hash, return ptr to data, NULL if not found */ -static void *hash_search(xhash *hash, const char *name) +static NOINLINE void *hash_search3(xhash *hash, const char *name, unsigned idx) { hash_item *hi; - hi = hash->items[hashidx(name) % hash->csize]; + hi = hash->items[idx % hash->csize]; while (hi) { if (strcmp(hi->name, name) == 0) return &hi->data; @@ -727,6 +728,11 @@ static void *hash_search(xhash *hash, const char *name) return NULL; } +static void *hash_search(xhash *hash, const char *name) +{ + return hash_search3(hash, name, hashidx(name)); +} + /* grow hash if it becomes too big */ static void hash_rebuild(xhash *hash) { @@ -762,16 +768,17 @@ static void *hash_find(xhash *hash, const char *name) unsigned idx; int l; - hi = hash_search(hash, name); + idx = hashidx(name); + hi = hash_search3(hash, name, idx); if (!hi) { - if (++hash->nel / hash->csize > 10) + if (++hash->nel > hash->csize * 8) hash_rebuild(hash); l = strlen(name) + 1; hi = xzalloc(sizeof(*hi) + l); strcpy(hi->name, name); - idx = hashidx(name) % hash->csize; + idx = idx % hash->csize; hi->next = hash->items[idx]; hash->items[idx] = hi; hash->glen += l; @@ -822,8 +829,10 @@ static char *skip_spaces(char *p) static 
char *nextword(char **s) { char *p = *s; - while (*(*s)++ != '\0') + char *q = p; + while (*q++ != '\0') continue; + *s = q; return p; } @@ -2116,8 +2125,7 @@ static void hashwalk_init(var *v, xhash *array) for (i = 0; i < array->csize; i++) { hi = array->items[i]; while (hi) { - strcpy(w->end, hi->name); - nextword(&w->end); + w->end = stpcpy(w->end, hi->name) + 1; hi = hi->next; } } @@ -3504,7 +3512,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) setari_u(intvar[ARGV], ++i, *argv++); setvar_i(intvar[ARGC], i + 1); - //fdhash = ahash - done via define + //fdhash = ahash; // done via define newfile("/dev/stdin")->F = stdin; newfile("/dev/stdout")->F = stdout; newfile("/dev/stderr")->F = stderr; -- 2.27.0 From 4e4286534be38052e4eb33c1c233d391ba09f2ee Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 30 Jun 2021 02:12:27 +0200 Subject: [PATCH 26/65] awk: remove custom pool allocator for temporary awk variables It seems to be designed to reduce overhead of malloc's auxiliary data, by allocating at least 64 variables as a block. With "struct var" being about 20-32 bytes long (32/64 bits), malloc overhead for one temporary indeed is high, ~33% more memory used than needed. 
function old new delta evaluate 3137 3145 +8 modprobe_main 798 803 +5 exec_builtin 1414 1419 +5 awk_printf 476 481 +5 as_regex 132 137 +5 EMSG_INTERNAL_ERROR 15 - -15 nvfree 169 116 -53 nvalloc 145 - -145 ------------------------------------------------------------------------------ (add/remove: 0/2 grow/shrink: 5/1 up/down: 28/-213) Total: -185 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 164 +++++++++++++++++++------------------------------- 1 file changed, 61 insertions(+), 103 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index ca7444e55..dd4830461 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -93,7 +93,6 @@ enum { }; #define MAXVARFMT 240 -#define MINNVBLOCK 64 /* variable flags */ #define VF_NUMBER 0x0001 /* 1 = primary type is number */ @@ -120,8 +119,8 @@ typedef struct walker_list { /* Variable */ typedef struct var_s { unsigned type; /* flags */ - double number; char *string; + double number; union { int aidx; /* func arg idx (for compilation stage) */ struct xhash_s *array; /* array ptr */ @@ -192,15 +191,6 @@ typedef struct node_s { } a; } node; -/* Block of temporary variables */ -typedef struct nvblock_s { - int size; - var *pos; - struct nvblock_s *prev; - struct nvblock_s *next; - var nv[]; -} nvblock; - typedef struct tsplitter_s { node n; regex_t re[2]; @@ -537,7 +527,6 @@ struct globals { int nfields; int maxfields; /* used in fsrealloc() only */ var *Fields; - nvblock *g_cb; char *g_pos; char g_saved_ch; smallint icase; @@ -605,7 +594,6 @@ struct globals2 { #define nfields (G1.nfields ) #define maxfields (G1.maxfields ) #define Fields (G1.Fields ) -#define g_cb (G1.g_cb ) #define g_pos (G1.g_pos ) #define g_saved_ch (G1.g_saved_ch ) #define icase (G1.icase ) @@ -640,7 +628,6 @@ static int awk_exit(int) NORETURN; /* ---- error handling ---- */ -static const char EMSG_INTERNAL_ERROR[] ALIGN1 = "Internal error"; static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string"; static const char EMSG_UNEXP_TOKEN[] 
ALIGN1 = "Unexpected token"; static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero"; @@ -1050,77 +1037,6 @@ static int istrue(var *v) return (v->string && v->string[0]); } -/* temporary variables allocator. Last allocated should be first freed */ -static var *nvalloc(int n) -{ - nvblock *pb = NULL; - var *v, *r; - int size; - - while (g_cb) { - pb = g_cb; - if ((g_cb->pos - g_cb->nv) + n <= g_cb->size) - break; - g_cb = g_cb->next; - } - - if (!g_cb) { - size = (n <= MINNVBLOCK) ? MINNVBLOCK : n; - g_cb = xzalloc(sizeof(nvblock) + size * sizeof(var)); - g_cb->size = size; - g_cb->pos = g_cb->nv; - g_cb->prev = pb; - /*g_cb->next = NULL; - xzalloc did it */ - if (pb) - pb->next = g_cb; - } - - v = r = g_cb->pos; - g_cb->pos += n; - - while (v < g_cb->pos) { - v->type = 0; - v->string = NULL; - v++; - } - - return r; -} - -static void nvfree(var *v) -{ - var *p; - - if (v < g_cb->nv || v >= g_cb->pos) - syntax_error(EMSG_INTERNAL_ERROR); - - for (p = v; p < g_cb->pos; p++) { - if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) { - clear_array(iamarray(p)); - free(p->x.array->items); - free(p->x.array); - } - if (p->type & VF_WALK) { - walker_list *n; - walker_list *w = p->x.walker; - debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker); - p->x.walker = NULL; - while (w) { - n = w->prev; - debug_printf_walker(" free(%p)\n", w); - free(w); - w = n; - } - } - clrvar(p); - } - - g_cb->pos = v; - while (g_cb->prev && g_cb->pos == g_cb->nv) { - g_cb = g_cb->prev; - } -} - /* ------- awk program text parsing ------- */ /* Parse next token pointed by global pos, place results into global t_XYZ variables. 
@@ -1793,6 +1709,41 @@ static void parse_program(char *p) /* -------- program execution part -------- */ +/* temporary variables allocator */ +static var *nvalloc(int sz) +{ + return xzalloc(sz * sizeof(var)); +} + +static void nvfree(var *v, int sz) +{ + var *p = v; + + while (--sz >= 0) { + if ((p->type & (VF_ARRAY | VF_CHILD)) == VF_ARRAY) { + clear_array(iamarray(p)); + free(p->x.array->items); + free(p->x.array); + } + if (p->type & VF_WALK) { + walker_list *n; + walker_list *w = p->x.walker; + debug_printf_walker("nvfree: freeing walker @%p\n", &p->x.walker); + p->x.walker = NULL; + while (w) { + n = w->prev; + debug_printf_walker(" free(%p)\n", w); + free(w); + w = n; + } + } + clrvar(p); + p++; + } + + free(v); +} + static node *mk_splitter(const char *s, tsplitter *spl) { regex_t *re, *ire; @@ -1814,9 +1765,9 @@ static node *mk_splitter(const char *s, tsplitter *spl) return n; } -/* use node as a regular expression. Supplied with node ptr and regex_t +/* Use node as a regular expression. Supplied with node ptr and regex_t * storage space. Return ptr to regex (if result points to preg, it should - * be later regfree'd manually + * be later regfree'd manually). */ static regex_t *as_regex(node *op, regex_t *preg) { @@ -1840,7 +1791,7 @@ static regex_t *as_regex(node *op, regex_t *preg) cflags &= ~REG_EXTENDED; xregcomp(preg, s, cflags); } - nvfree(v); + nvfree(v, 1); return preg; } @@ -2292,6 +2243,8 @@ static char *awk_printf(node *n, int *len) var *v, *arg; v = nvalloc(1); +//TODO: above, to avoid allocating a single temporary var, take a pointer +//to a temporary that our caller (evaluate()) already has? 
fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), v))); i = 0; @@ -2333,7 +2286,7 @@ static char *awk_printf(node *n, int *len) } free(fmt); - nvfree(v); + nvfree(v, 1); b = xrealloc(b, i + 1); b[i] = '\0'; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS @@ -2661,14 +2614,14 @@ static NOINLINE var *exec_builtin(node *op, var *res) break; } - nvfree(tv); + nvfree(tv, 4); return res; #undef tspl } /* * Evaluate node - the heart of the program. Supplied with subtree - * and place where to store result. returns ptr to result. + * and place where to store result. Returns ptr to result. */ #define XC(n) ((n) >> 8) @@ -2953,33 +2906,38 @@ static var *evaluate(node *op, var *res) break; case XC( OC_FUNC ): { - var *vbeg, *v; + var *tv, *sv_fnargs; const char *sv_progname; + int nargs1, i; + debug_printf_eval("FUNC\n"); - /* The body might be empty, still has to eval the args */ if (!op->r.n->info && !op->r.f->body.first) syntax_error(EMSG_UNDEF_FUNC); - vbeg = v = nvalloc(op->r.f->nargs + 1); + /* The body might be empty, still has to eval the args */ + nargs1 = op->r.f->nargs + 1; + tv = nvalloc(nargs1); + i = 0; while (op1) { +//TODO: explain why one iteration is done even for the case p->r.f->nargs == 0 var *arg = evaluate(nextarg(&op1), v1); - copyvar(v, arg); - v->type |= VF_CHILD; - v->x.parent = arg; - if (++v - vbeg >= op->r.f->nargs) + copyvar(&tv[i], arg); + tv[i].type |= VF_CHILD; + tv[i].x.parent = arg; + if (++i >= op->r.f->nargs) break; } - v = fnargs; - fnargs = vbeg; + sv_fnargs = fnargs; sv_progname = g_progname; + fnargs = tv; res = evaluate(op->r.f->body.first, res); + nvfree(fnargs, nargs1); g_progname = sv_progname; - nvfree(fnargs); - fnargs = v; + fnargs = sv_fnargs; break; } @@ -3301,7 +3259,7 @@ static var *evaluate(node *op, var *res) break; } /* while (op) */ - nvfree(v1); + nvfree(v1, 2); debug_printf_eval("returning from %s(): %p\n", __func__, res); return res; #undef fnargs -- 2.27.0 From ea4c15af5ffb47c9da4d649cc26f929ff32031b1 Mon Sep 17 00:00:00 2001 
From: Denys Vlasenko Date: Wed, 30 Jun 2021 12:12:20 +0200 Subject: [PATCH 27/65] awk: replace incorrect use of union in undefined function check (no code changes) ...which reveals that it's buggy: it thinks "func f(){}" is an undefined function! Signed-off-by: Denys Vlasenko --- editors/awk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editors/awk.c b/editors/awk.c index dd4830461..afbb7d9e9 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2912,7 +2912,7 @@ static var *evaluate(node *op, var *res) debug_printf_eval("FUNC\n"); - if (!op->r.n->info && !op->r.f->body.first) + if (op->r.f->nargs == 0 && !op->r.f->body.first) syntax_error(EMSG_UNDEF_FUNC); /* The body might be empty, still has to eval the args */ -- 2.27.0 From 08523f47e352f4e2d96b98c4e63feb1edad2dc5b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 30 Jun 2021 12:23:51 +0200 Subject: [PATCH 28/65] awk: allow empty functions with no arguments, disallow function redefinitions function old new delta .rodata 103681 103700 +19 parse_program 303 307 +4 evaluate 3145 3141 -4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/1 up/down: 23/-4) Total: 19 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 11 +++++++---- testsuite/awk.tests | 10 ++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index afbb7d9e9..a217e8804 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -139,6 +139,7 @@ typedef struct chain_s { /* Function */ typedef struct func_s { unsigned nargs; + smallint defined; struct chain_s body; } func; @@ -1662,9 +1663,11 @@ static void parse_program(char *p) debug_printf_parse("%s: TC_FUNCDECL\n", __func__); next_token(TC_FUNCTION); f = newfunc(t_string); -//FIXME: dup check: functions can't be redefined, this is not ok: awk 'func f(){}; func f(){}' - f->body.first = NULL; - f->nargs = 0; + if (f->defined) + syntax_error("Duplicate function"); +
f->defined = 1; + //f->body.first = NULL; - already is + //f->nargs = 0; - already is /* func arg list: comma sep list of args, and a close paren */ for (;;) { if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { @@ -2912,7 +2915,7 @@ static var *evaluate(node *op, var *res) debug_printf_eval("FUNC\n"); - if (op->r.f->nargs == 0 && !op->r.f->body.first) + if (!op->r.f->defined) syntax_error(EMSG_UNDEF_FUNC); /* The body might be empty, still has to eval the args */ diff --git a/testsuite/awk.tests b/testsuite/awk.tests index c9d0ef9e5..124ec5cb3 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -44,6 +44,16 @@ testing "awk handles empty function f(arg){}" \ "L1\n\nL2\n\n" \ "" "" +prg=' +function empty_fun(){} +END {empty_fun() + print "Ok" +}' +testing "awk handles empty function f(){}" \ + "awk '$prg'" \ + "Ok\n" \ + "" "" + prg=' function outer_fun() { return 1 -- 2.27.0 From 66e2d139d3b8a02c23af2de584eee3f2b3ff39a5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 30 Jun 2021 12:42:39 +0200 Subject: [PATCH 29/65] awk: rewrite "print" logic a bit to make it clearer Signed-off-by: Denys Vlasenko --- editors/awk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a217e8804..bc9644c34 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2792,7 +2792,7 @@ static var *evaluate(node *op, var *res) if (!op1) { fputs(getvar_s(intvar[F0]), F); } else { - while (op1) { + for (;;) { var *v = evaluate(nextarg(&op1), v1); if (v->type & VF_NUMBER) { fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), @@ -2801,13 +2801,12 @@ static var *evaluate(node *op, var *res) } else { fputs(getvar_s(v), F); } - - if (op1) - fputs(getvar_s(intvar[OFS]), F); + if (!op1) + break; + fputs(getvar_s(intvar[OFS]), F); } } fputs(getvar_s(intvar[ORS]), F); - } else { /* OC_PRINTF */ char *s = awk_printf(op1, &len); #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS -- 2.27.0 From 5a2b9e8a9f8b5881121581c3caabcdbc45113634 Mon Sep 17 
00:00:00 2001 From: Denys Vlasenko Date: Wed, 30 Jun 2021 12:52:51 +0200 Subject: [PATCH 30/65] awk: evaluate all, even superfluous function args function old new delta evaluate 3128 3135 +7 Signed-off-by: Denys Vlasenko --- editors/awk.c | 19 ++++++++++++------- testsuite/awk.tests | 8 +++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index bc9644c34..b674eabea 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2910,7 +2910,7 @@ static var *evaluate(node *op, var *res) case XC( OC_FUNC ): { var *tv, *sv_fnargs; const char *sv_progname; - int nargs1, i; + int nargs, i; debug_printf_eval("FUNC\n"); @@ -2918,17 +2918,22 @@ static var *evaluate(node *op, var *res) syntax_error(EMSG_UNDEF_FUNC); /* The body might be empty, still has to eval the args */ - nargs1 = op->r.f->nargs + 1; - tv = nvalloc(nargs1); + nargs = op->r.f->nargs; + tv = nvalloc(nargs); i = 0; while (op1) { -//TODO: explain why one iteration is done even for the case p->r.f->nargs == 0 var *arg = evaluate(nextarg(&op1), v1); + if (i == nargs) { + /* call with more arguments than function takes. + * (gawk warns: "warning: function 'f' called with more arguments than declared"). 
+ * They are still evaluated, but discarded: */ + clrvar(arg); + continue; + } copyvar(&tv[i], arg); tv[i].type |= VF_CHILD; tv[i].x.parent = arg; - if (++i >= op->r.f->nargs) - break; + i++; } sv_fnargs = fnargs; @@ -2936,7 +2941,7 @@ static var *evaluate(node *op, var *res) fnargs = tv; res = evaluate(op->r.f->body.first, res); - nvfree(fnargs, nargs1); + nvfree(fnargs, nargs); g_progname = sv_progname; fnargs = sv_fnargs; diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 124ec5cb3..eda5ea0e0 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -87,11 +87,17 @@ BEGIN { a=2 print v (a) }' -testing "'v (a)' is not a function call, it is a concatenation" \ +testing "awk 'v (a)' is not a function call, it is a concatenation" \ "awk '$prg' 2>&1" \ "12\n" \ "" "" +prg='func f(){print"F"};func g(){print"G"};BEGIN{f(g(),g())}' +testing "awk unused function args are evaluated" \ + "awk '$prg' 2>&1" \ + "G\nG\nF\n" \ + "" "" + optional DESKTOP testing "awk hex const 1" "awk '{ print or(0xffffffff,1) }'" "4294967295\n" "" "\n" -- 2.27.0 From 74290b10bd4f480d462e9583be9cee9516ead981 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 1 Jul 2021 16:02:16 +0200 Subject: [PATCH 31/65] awk: rename temp variables, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 76 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index b674eabea..49e6af0dc 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1775,14 +1775,14 @@ static node *mk_splitter(const char *s, tsplitter *spl) static regex_t *as_regex(node *op, regex_t *preg) { int cflags; - var *v; + var *tmpvar; const char *s; if ((op->info & OPCLSMASK) == OC_REGEXP) { return icase ? op->r.ire : op->l.re; } - v = nvalloc(1); - s = getvar_s(evaluate(op, v)); + tmpvar = nvalloc(1); + s = getvar_s(evaluate(op, tmpvar)); cflags = icase ? 
REG_EXTENDED | REG_ICASE : REG_EXTENDED; /* Testcase where REG_EXTENDED fails (unpaired '{'): @@ -1794,7 +1794,7 @@ static regex_t *as_regex(node *op, regex_t *preg) cflags &= ~REG_EXTENDED; xregcomp(preg, s, cflags); } - nvfree(v, 1); + nvfree(tmpvar, 1); return preg; } @@ -2243,12 +2243,12 @@ static char *awk_printf(node *n, int *len) const char *s1; int i, j, incr, bsize; char c, c1; - var *v, *arg; + var *tmpvar, *arg; - v = nvalloc(1); + tmpvar = nvalloc(1); //TODO: above, to avoid allocating a single temporary var, take a pointer //to a temporary that our caller (evaluate()) already has? - fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), v))); + fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), tmpvar))); i = 0; while (*f) { @@ -2268,7 +2268,7 @@ static char *awk_printf(node *n, int *len) f++; c1 = *f; *f = '\0'; - arg = evaluate(nextarg(&n), v); + arg = evaluate(nextarg(&n), tmpvar); j = i; if (c == 'c' || !c) { @@ -2289,7 +2289,7 @@ static char *awk_printf(node *n, int *len) } free(fmt); - nvfree(v, 1); + nvfree(tmpvar, 1); b = xrealloc(b, i + 1); b[i] = '\0'; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS @@ -2429,7 +2429,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) { #define tspl (G.exec_builtin__tspl) - var *tv; + var *tmpvars; node *an[4]; var *av[4]; const char *as[4]; @@ -2441,7 +2441,12 @@ static NOINLINE var *exec_builtin(node *op, var *res) time_t tt; int i, l, ll, n; - tv = nvalloc(4); + tmpvars = nvalloc(4); +#define TMPVAR0 (tmpvars) +#define TMPVAR1 (tmpvars + 1) +#define TMPVAR2 (tmpvars + 2) +#define TMPVAR3 (tmpvars + 3) +#define TMPVAR(i) (tmpvars + (i)) isr = info = op->info; op = op->l.n; @@ -2449,7 +2454,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) for (i = 0; i < 4 && op; i++) { an[i] = nextarg(&op); if (isr & 0x09000000) - av[i] = evaluate(an[i], &tv[i]); + av[i] = evaluate(an[i], TMPVAR(i)); if (isr & 0x08000000) as[i] = getvar_s(av[i]); isr >>= 1; @@ -2474,7 +2479,7 @@ static NOINLINE var *exec_builtin(node *op, var 
*res) if (nargs > 2) { spl = (an[2]->info & OPCLSMASK) == OC_REGEXP ? - an[2] : mk_splitter(getvar_s(evaluate(an[2], &tv[2])), &tspl); + an[2] : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); } else { spl = &fsplitter.n; } @@ -2617,7 +2622,13 @@ static NOINLINE var *exec_builtin(node *op, var *res) break; } - nvfree(tv, 4); + nvfree(tmpvars, 4); +#undef TMPVAR0 +#undef TMPVAR1 +#undef TMPVAR2 +#undef TMPVAR3 +#undef TMPVAR + return res; #undef tspl } @@ -2636,14 +2647,16 @@ static var *evaluate(node *op, var *res) #define seed (G.evaluate__seed) #define sreg (G.evaluate__sreg) - var *v1; + var *tmpvars; +#define TMPVAR0 (tmpvars) +#define TMPVAR1 (tmpvars + 1) if (!op) return setvar_s(res, NULL); debug_printf_eval("entered %s()\n", __func__); - v1 = nvalloc(2); + tmpvars = nvalloc(2); while (op) { struct { @@ -2683,7 +2696,7 @@ static var *evaluate(node *op, var *res) } if (op1->r.n) { /* array ref? */ const char *s; - s = getvar_s(evaluate(op1->r.n, v1)); + s = getvar_s(evaluate(op1->r.n, TMPVAR0)); hash_remove(iamarray(v), s); } else { clear_array(iamarray(v)); @@ -2693,7 +2706,7 @@ static var *evaluate(node *op, var *res) /* execute inevitable things */ if (opinfo & OF_RES1) - L.v = evaluate(op1, v1); + L.v = evaluate(op1, TMPVAR0); if (opinfo & OF_STR1) { L.s = getvar_s(L.v); debug_printf_eval("L.s:'%s'\n", L.s); @@ -2710,7 +2723,7 @@ static var *evaluate(node *op, var *res) * (Seen trying to evaluate "$444 $44444") */ if (opinfo & OF_RES2) { - R.v = evaluate(op->r.n, v1+1); + R.v = evaluate(op->r.n, TMPVAR1); //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? 
//L.v = NULL; } @@ -2793,7 +2806,7 @@ static var *evaluate(node *op, var *res) fputs(getvar_s(intvar[F0]), F); } else { for (;;) { - var *v = evaluate(nextarg(&op1), v1); + var *v = evaluate(nextarg(&op1), TMPVAR0); if (v->type & VF_NUMBER) { fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), getvar_i(v), TRUE); @@ -2892,7 +2905,7 @@ static var *evaluate(node *op, var *res) /* if source is a temporary string, jusk relink it to dest */ //Disabled: if R.v is numeric but happens to have cached R.v->string, //then L.v ends up being a string, which is wrong -// if (R.v == v1+1 && R.v->string) { +// if (R.v == TMPVAR1 && R.v->string) { // res = setvar_p(L.v, R.v->string); // R.v->string = NULL; // } else { @@ -2908,7 +2921,7 @@ static var *evaluate(node *op, var *res) break; case XC( OC_FUNC ): { - var *tv, *sv_fnargs; + var *argvars, *sv_fnargs; const char *sv_progname; int nargs, i; @@ -2919,10 +2932,10 @@ static var *evaluate(node *op, var *res) /* The body might be empty, still has to eval the args */ nargs = op->r.f->nargs; - tv = nvalloc(nargs); + argvars = nvalloc(nargs); i = 0; while (op1) { - var *arg = evaluate(nextarg(&op1), v1); + var *arg = evaluate(nextarg(&op1), TMPVAR0); if (i == nargs) { /* call with more arguments than function takes. * (gawk warns: "warning: function 'f' called with more arguments than declared"). 
@@ -2930,18 +2943,18 @@ static var *evaluate(node *op, var *res) clrvar(arg); continue; } - copyvar(&tv[i], arg); - tv[i].type |= VF_CHILD; - tv[i].x.parent = arg; + copyvar(&argvars[i], arg); + argvars[i].type |= VF_CHILD; + argvars[i].x.parent = arg; i++; } sv_fnargs = fnargs; sv_progname = g_progname; - fnargs = tv; + fnargs = argvars; res = evaluate(op->r.f->body.first, res); - nvfree(fnargs, nargs); + nvfree(argvars, nargs); g_progname = sv_progname; fnargs = sv_fnargs; @@ -3266,7 +3279,10 @@ static var *evaluate(node *op, var *res) break; } /* while (op) */ - nvfree(v1, 2); + nvfree(tmpvars, 2); +#undef TMPVAR0 +#undef TMPVAR1 + debug_printf_eval("returning from %s(): %p\n", __func__, res); return res; #undef fnargs -- 2.27.0 From a99d2cf65f27d7c75bf95c4cb9944ec52d1868fd Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 1 Jul 2021 17:50:26 +0200 Subject: [PATCH 32/65] awk: use static tmpvars instead of nvalloc(1)ed ones ptest() was using this idea already. As far as I can see, this is safe. Testsuite passes. One downside is that a temporary from e.g. printf invocation won't be freed until the next printf call.
function old new delta awk_printf 481 468 -13 as_regex 137 111 -26 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-39) Total: -39 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 49e6af0dc..7fdc51cfd 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -559,7 +559,9 @@ struct globals2 { unsigned evaluate__seed; regex_t evaluate__sreg; - var ptest__v; + var ptest__tmpvar; + var awk_printf__tmpvar; + var as_regex__tmpvar; tsplitter exec_builtin__tspl; @@ -1775,14 +1777,19 @@ static node *mk_splitter(const char *s, tsplitter *spl) static regex_t *as_regex(node *op, regex_t *preg) { int cflags; - var *tmpvar; const char *s; if ((op->info & OPCLSMASK) == OC_REGEXP) { return icase ? op->r.ire : op->l.re; } - tmpvar = nvalloc(1); - s = getvar_s(evaluate(op, tmpvar)); + +#define TMPVAR (&G.as_regex__tmpvar) + //tmpvar = nvalloc(1); + // We use a single "static" tmpvar (instead of on-stack or malloced one) + // to decrease memory consumption in deeply-recursive awk programs. + // The rule to work safely is to never call evaluate() while our static + // TMPVAR's value is still needed. + s = getvar_s(evaluate(op, TMPVAR)); cflags = icase ? REG_EXTENDED | REG_ICASE : REG_EXTENDED; /* Testcase where REG_EXTENDED fails (unpaired '{'): @@ -1794,7 +1801,8 @@ static regex_t *as_regex(node *op, regex_t *preg) cflags &= ~REG_EXTENDED; xregcomp(preg, s, cflags); } - nvfree(tmpvar, 1); + //nvfree(tmpvar, 1); +#undef TMPVAR return preg; } @@ -2105,8 +2113,11 @@ static int hashwalk_next(var *v) /* evaluate node, return 1 when result is true, 0 otherwise */ static int ptest(node *pattern) { - /* ptest__v is "static": to save stack space? 
*/ - return istrue(evaluate(pattern, &G.ptest__v)); + // We use a single "static" tmpvar (instead of on-stack or malloced one) + // to decrease memory consumption in deeply-recursive awk programs. + // The rule to work safely is to never call evaluate() while our static + // TMPVAR's value is still needed. + return istrue(evaluate(pattern, &G.ptest__tmpvar)); } /* read next record from stream rsm into a variable v */ @@ -2243,12 +2254,18 @@ static char *awk_printf(node *n, int *len) const char *s1; int i, j, incr, bsize; char c, c1; - var *tmpvar, *arg; - - tmpvar = nvalloc(1); -//TODO: above, to avoid allocating a single temporary var, take a pointer -//to a temporary that our caller (evaluate()) already has? - fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), tmpvar))); + var *arg; + + //tmpvar = nvalloc(1); +#define TMPVAR (&G.awk_printf__tmpvar) + // We use a single "static" tmpvar (instead of on-stack or malloced one) + // to decrease memory consumption in deeply-recursive awk programs. + // The rule to work safely is to never call evaluate() while our static + // TMPVAR's value is still needed. + fmt = f = xstrdup(getvar_s(evaluate(nextarg(&n), TMPVAR))); + // ^^^^^^^^^ here we immediately strdup() the value, so the later call + // to evaluate() potentially recursing into another awk_printf() can't + // mangle the value. 
i = 0; while (*f) { @@ -2268,7 +2285,7 @@ static char *awk_printf(node *n, int *len) f++; c1 = *f; *f = '\0'; - arg = evaluate(nextarg(&n), tmpvar); + arg = evaluate(nextarg(&n), TMPVAR); j = i; if (c == 'c' || !c) { @@ -2289,7 +2306,9 @@ static char *awk_printf(node *n, int *len) } free(fmt); - nvfree(tmpvar, 1); +// nvfree(tmpvar, 1); +#undef TMPVAR + b = xrealloc(b, i + 1); b[i] = '\0'; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS -- 2.27.0 From 86ecfff67829c449fc8d609412b97fb4867ce3bc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 14:27:40 +0200 Subject: [PATCH 33/65] awk: shuffle functions to reduce forward declarations, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 192 ++++++++++++++++++++++++-------------------------- 1 file changed, 94 insertions(+), 98 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 7fdc51cfd..392461704 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -619,18 +619,6 @@ struct globals2 { G.evaluate__seed = 1; \ } while (0) - -/* function prototypes */ -static void handle_special(var *); -static node *parse_expr(uint32_t); -static void chain_group(void); -static var *evaluate(node *, var *); -static rstream *next_input_file(void); -static int fmt_num(char *, int, const char *, double, int); -static int awk_exit(int) NORETURN; - -/* ---- error handling ---- */ - static const char EMSG_UNEXP_EOS[] ALIGN1 = "Unexpected end of string"; static const char EMSG_UNEXP_TOKEN[] ALIGN1 = "Unexpected token"; static const char EMSG_DIV_BY_ZERO[] ALIGN1 = "Division by zero"; @@ -642,10 +630,7 @@ static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function"; static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in"; static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field"; -static void zero_out_var(var *vp) -{ - memset(vp, 0, sizeof(*vp)); -} +static int awk_exit(int) NORETURN; static void syntax_error(const char *message) NORETURN; static void 
syntax_error(const char *message) @@ -653,6 +638,11 @@ static void syntax_error(const char *message) bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message); } +static void zero_out_var(var *vp) +{ + memset(vp, 0, sizeof(*vp)); +} + /* ---- hash stuff ---- */ static unsigned hashidx(const char *name) @@ -885,10 +875,29 @@ static double my_strtod(char **pp) /* -------- working with variables (set/get/copy/etc) -------- */ -static xhash *iamarray(var *v) +static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) { - var *a = v; + int r = 0; + char c; + const char *s = format; + + if (int_as_int && n == (long long)n) { + r = snprintf(b, size, "%lld", (long long)n); + } else { + do { c = *s; } while (c && *++s); + if (strchr("diouxX", c)) { + r = snprintf(b, size, format, (int)n); + } else if (strchr("eEfgG", c)) { + r = snprintf(b, size, format, n); + } else { + syntax_error(EMSG_INV_FMT); + } + } + return r; +} +static xhash *iamarray(var *a) +{ while (a->type & VF_CHILD) a = a->x.parent; @@ -913,6 +922,8 @@ static var *clrvar(var *v) return v; } +static void handle_special(var *); + /* assign string value to variable */ static var *setvar_p(var *v, char *value) { @@ -1284,6 +1295,8 @@ static void mk_re_node(const char *s, node *n, regex_t *re) xregcomp(re + 1, s, REG_EXTENDED | REG_ICASE); } +static node *parse_expr(uint32_t); + static node *parse_lrparen_list(void) { next_token(TC_LPAREN); @@ -1488,6 +1501,8 @@ static void chain_expr(uint32_t info) rollback_token(); } +static void chain_group(void); + static node *chain_loop(node *nn) { node *n, *n2, *save_brk, *save_cont; @@ -1770,6 +1785,8 @@ static node *mk_splitter(const char *s, tsplitter *spl) return n; } +static var *evaluate(node *, var *); + /* Use node as a regular expression. Supplied with node ptr and regex_t * storage space. Return ptr to regex (if result points to preg, it should * be later regfree'd manually). 
@@ -2222,27 +2239,6 @@ static int awk_getline(rstream *rsm, var *v) return r; } -static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) -{ - int r = 0; - char c; - const char *s = format; - - if (int_as_int && n == (long long)n) { - r = snprintf(b, size, "%lld", (long long)n); - } else { - do { c = *s; } while (c && *++s); - if (strchr("diouxX", c)) { - r = snprintf(b, size, format, (int)n); - } else if (strchr("eEfgG", c)) { - r = snprintf(b, size, format, n); - } else { - syntax_error(EMSG_INV_FMT); - } - } - return r; -} - /* formatted output into an allocated buffer, return ptr to buffer */ #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS # define awk_printf(a, b) awk_printf(a) @@ -2306,7 +2302,7 @@ static char *awk_printf(node *n, int *len) } free(fmt); -// nvfree(tmpvar, 1); + //nvfree(tmpvar, 1); #undef TMPVAR b = xrealloc(b, i + 1); @@ -2652,6 +2648,64 @@ static NOINLINE var *exec_builtin(node *op, var *res) #undef tspl } +/* if expr looks like "var=value", perform assignment and return 1, + * otherwise return 0 */ +static int is_assignment(const char *expr) +{ + char *exprc, *val; + + if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { + return FALSE; + } + + exprc = xstrdup(expr); + val = exprc + (val - expr); + *val++ = '\0'; + + unescape_string_in_place(val); + setvar_u(newvar(exprc), val); + free(exprc); + return TRUE; +} + +/* switch to next input file */ +static rstream *next_input_file(void) +{ +#define rsm (G.next_input_file__rsm) +#define files_happen (G.next_input_file__files_happen) + + FILE *F; + const char *fname, *ind; + + if (rsm.F) + fclose(rsm.F); + rsm.F = NULL; + rsm.pos = rsm.adv = 0; + + for (;;) { + if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) { + if (files_happen) + return NULL; + fname = "-"; + F = stdin; + break; + } + ind = getvar_s(incvar(intvar[ARGIND])); + fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); + if (fname && *fname && !is_assignment(fname)) { + F = xfopen_stdin(fname); 
+ break; + } + } + + files_happen = TRUE; + setvar_s(intvar[FILENAME], fname); + rsm.F = F; + return &rsm; +#undef rsm +#undef files_happen +} + /* * Evaluate node - the heart of the program. Supplied with subtree * and place where to store result. Returns ptr to result. @@ -3338,64 +3392,6 @@ static int awk_exit(int r) exit(r); } -/* if expr looks like "var=value", perform assignment and return 1, - * otherwise return 0 */ -static int is_assignment(const char *expr) -{ - char *exprc, *val; - - if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { - return FALSE; - } - - exprc = xstrdup(expr); - val = exprc + (val - expr); - *val++ = '\0'; - - unescape_string_in_place(val); - setvar_u(newvar(exprc), val); - free(exprc); - return TRUE; -} - -/* switch to next input file */ -static rstream *next_input_file(void) -{ -#define rsm (G.next_input_file__rsm) -#define files_happen (G.next_input_file__files_happen) - - FILE *F; - const char *fname, *ind; - - if (rsm.F) - fclose(rsm.F); - rsm.F = NULL; - rsm.pos = rsm.adv = 0; - - for (;;) { - if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) { - if (files_happen) - return NULL; - fname = "-"; - F = stdin; - break; - } - ind = getvar_s(incvar(intvar[ARGIND])); - fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); - if (fname && *fname && !is_assignment(fname)) { - F = xfopen_stdin(fname); - break; - } - } - - files_happen = TRUE; - setvar_s(intvar[FILENAME], fname); - rsm.F = F; - return &rsm; -#undef rsm -#undef files_happen -} - int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int awk_main(int argc UNUSED_PARAM, char **argv) { -- 2.27.0 From 19af40d06d24dcf9a416dc6bf494a5b933222030 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 14:29:01 +0200 Subject: [PATCH 34/65] awk: when parsing length(), simplify eating of LPAREN function old new delta parse_expr 945 948 +3 Signed-off-by: Denys Vlasenko --- editors/awk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff 
--git a/editors/awk.c b/editors/awk.c index 392461704..eabcd9b75 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1453,10 +1453,11 @@ static node *parse_expr(uint32_t term_tc) | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ ); - rollback_token(); - if (tc & TC_LPAREN) { + if (tc != TC_LPAREN) + rollback_token(); + else { /* It was a "(" token. Handle just like TC_BUILTIN */ - cn->l.n = parse_lrparen_list(); + cn->l.n = parse_expr(TC_RPAREN); } break; } -- 2.27.0 From dad02e0aaa77ea5abf959216c31e5968460d519e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 14:33:13 +0200 Subject: [PATCH 35/65] awk: use "static" tmpvars in main and exit function old new delta awk_exit 103 93 -10 awk_main 850 832 -18 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-28) Total: -28 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index eabcd9b75..462d3947b 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -562,6 +562,8 @@ struct globals2 { var ptest__tmpvar; var awk_printf__tmpvar; var as_regex__tmpvar; + var exit__tmpvar; + var main__tmpvar; tsplitter exec_builtin__tspl; @@ -638,11 +640,6 @@ static void syntax_error(const char *message) bb_error_msg_and_die("%s:%i: %s", g_progname, g_lineno, message); } -static void zero_out_var(var *vp) -{ - memset(vp, 0, sizeof(*vp)); -} - /* ---- hash stuff ---- */ static unsigned hashidx(const char *name) @@ -3372,11 +3369,9 @@ static int awk_exit(int r) unsigned i; if (!exiting) { - var tv; exiting = TRUE; nextrec = FALSE; - zero_out_var(&tv); - evaluate(endseq.first, &tv); + evaluate(endseq.first, &G.exit__tmpvar); } /* waiting for children */ @@ -3404,7 +3399,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) llist_t *list_e = NULL; #endif int i; - var tv; INIT_G(); @@ -3514,8 +3508,7 @@ int awk_main(int argc 
UNUSED_PARAM, char **argv) newfile("/dev/stdout")->F = stdout; newfile("/dev/stderr")->F = stderr; - zero_out_var(&tv); - evaluate(beginseq.first, &tv); + evaluate(beginseq.first, &G.main__tmpvar); if (!mainseq.first && !endseq.first) awk_exit(EXIT_SUCCESS); @@ -3532,7 +3525,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) nextrec = FALSE; incvar(intvar[NR]); incvar(intvar[FNR]); - evaluate(mainseq.first, &tv); + evaluate(mainseq.first, &G.main__tmpvar); if (nextfile) break; -- 2.27.0 From 571beaf1570fb81ba34c776aae1266f76e0da323 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 14:53:52 +0200 Subject: [PATCH 36/65] awk: shuffle globals for smaller offsets function old new delta awk_main 832 829 -3 evaluate 3229 3223 -6 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-9) Total: -9 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 462d3947b..6c660577d 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -536,6 +536,11 @@ struct globals { smallint nextfile; smallint is_f0_split; smallint t_rollback; + + /* former statics from various functions */ + smallint next_token__concat_inserted; + uint32_t next_token__save_tclass; + uint32_t next_token__save_info; }; struct globals2 { uint32_t t_info; /* often used */ @@ -548,15 +553,11 @@ struct globals2 { /* former statics from various functions */ char *split_f0__fstrings; - uint32_t next_token__save_tclass; - uint32_t next_token__save_info; - smallint next_token__concat_inserted; - - smallint next_input_file__files_happen; rstream next_input_file__rsm; + smallint next_input_file__files_happen; - var *evaluate__fnargs; unsigned evaluate__seed; + var *evaluate__fnargs; regex_t evaluate__sreg; var ptest__tmpvar; @@ -575,10 +576,10 @@ struct globals2 { #define G1 (ptr_to_globals[-1]) #define G 
(*(struct globals2 *)ptr_to_globals) /* For debug. nm --size-sort awk.o | grep -vi ' [tr] ' */ -/*char G1size[sizeof(G1)]; - 0x74 */ -/*char Gsize[sizeof(G)]; - 0x1c4 */ +//char G1size[sizeof(G1)]; // 0x70 +//char Gsize[sizeof(G)]; // 0x2f8 /* Trying to keep most of members accessible with short offsets: */ -/*char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; - 0x90 */ +//char Gofs_seed[offsetof(struct globals2, evaluate__seed)]; // 0x7c #define t_double (G1.t_double ) #define beginseq (G1.beginseq ) #define mainseq (G1.mainseq ) @@ -1056,9 +1057,9 @@ static int istrue(var *v) */ static uint32_t next_token(uint32_t expected) { -#define concat_inserted (G.next_token__concat_inserted) -#define save_tclass (G.next_token__save_tclass) -#define save_info (G.next_token__save_info) +#define concat_inserted (G1.next_token__concat_inserted) +#define save_tclass (G1.next_token__save_tclass) +#define save_info (G1.next_token__save_info) char *p; const char *tl; -- 2.27.0 From 879c82fc8f14aa5811a4c93f758985ce72ded707 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 15:19:14 +0200 Subject: [PATCH 37/65] awk: do not special-case "delete" Rework of the previous fix: Can use operation attributes to disable arg evaluation instead of special-casing. 
function old new delta .rodata 104032 104036 +4 evaluate 3223 3215 -8 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 4/-8) Total: -4 bytes Signed-off-by: Denys Vlasenko --- editors/awk.c | 56 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 6c660577d..a484b909a 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -319,7 +319,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ #define xV OF_RES2 #define xS (OF_RES2 | OF_STR2) #define Vx OF_RES1 -#define Rx (OF_RES1 | OF_NUM1 | OF_REQUIRED) +#define Rx OF_REQUIRED #define VV (OF_RES1 | OF_RES2) #define Nx (OF_RES1 | OF_NUM1) #define NV (OF_RES1 | OF_NUM1 | OF_RES2) @@ -2750,32 +2750,6 @@ static var *evaluate(node *op, var *res) op1 = op->l.n; debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); - /* "delete" is special: - * "delete array[var--]" must evaluate index expr only once, - * must not evaluate it in "execute inevitable things" part. - */ - if (XC(opinfo & OPCLSMASK) == XC(OC_DELETE)) { - uint32_t info = op1->info & OPCLSMASK; - var *v; - - debug_printf_eval("DELETE\n"); - if (info == OC_VAR) { - v = op1->l.v; - } else if (info == OC_FNARG) { - v = &fnargs[op1->l.aidx]; - } else { - syntax_error(EMSG_NOT_ARRAY); - } - if (op1->r.n) { /* array ref? */ - const char *s; - s = getvar_s(evaluate(op1->r.n, TMPVAR0)); - hash_remove(iamarray(v), s); - } else { - clear_array(iamarray(v)); - } - goto next; - } - /* execute inevitable things */ if (opinfo & OF_RES1) L.v = evaluate(op1, TMPVAR0); @@ -2905,7 +2879,31 @@ static var *evaluate(node *op, var *res) break; } - /* case XC( OC_DELETE ): - moved to happen before arg evaluation */ + case XC( OC_DELETE ): + debug_printf_eval("DELETE\n"); + { + /* "delete" is special: + * "delete array[var--]" must evaluate index expr only once. 
+ */ + uint32_t info = op1->info & OPCLSMASK; + var *v; + + if (info == OC_VAR) { + v = op1->l.v; + } else if (info == OC_FNARG) { + v = &fnargs[op1->l.aidx]; + } else { + syntax_error(EMSG_NOT_ARRAY); + } + if (op1->r.n) { /* array ref? */ + const char *s; + s = getvar_s(evaluate(op1->r.n, TMPVAR0)); + hash_remove(iamarray(v), s); + } else { + clear_array(iamarray(v)); + } + break; + } case XC( OC_NEWSOURCE ): debug_printf_eval("NEWSOURCE\n"); @@ -3342,7 +3340,7 @@ static var *evaluate(node *op, var *res) default: syntax_error(EMSG_POSSIBLE_ERROR); } /* switch */ - next: + if ((opinfo & OPCLSMASK) <= SHIFT_TIL_THIS) op = op->a.n; if ((opinfo & OPCLSMASK) >= RECUR_FROM_THIS) -- 2.27.0 From 3dc2dbb1eb5c0816cbf1fc8360570d6503e35bf8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 17:32:08 +0200 Subject: [PATCH 38/65] awk: make builtin definitions more understandable, no code changes Signed-off-by: Denys Vlasenko --- editors/awk.c | 71 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a484b909a..ad0eb482b 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -331,8 +331,7 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ #define OPNMASK 0x007F /* operator priority is a highest byte (even: r->l, odd: l->r grouping) - * For builtins it has different meaning: n n s3 s2 s1 v3 v2 v1, - * n - min. 
number of args, vN - resolve Nth arg to var, sN - resolve to string + * (for builtins it has different meaning) */ #undef P #undef PRIMASK @@ -430,8 +429,6 @@ static const char tokenlist[] ALIGN1 = /* compiler adds trailing "\0" */ ; -#define OC_B OC_BUILTIN - static const uint32_t tokeninfo[] = { 0, 0, @@ -464,20 +461,43 @@ static const uint32_t tokeninfo[] = { OC_RETURN|Vx, OC_EXIT|Nx, ST_WHILE, 0, /* else */ - OC_B|B_an|P(0x83), OC_B|B_co|P(0x41), OC_B|B_ls|P(0x83), OC_B|B_or|P(0x83), - OC_B|B_rs|P(0x83), OC_B|B_xo|P(0x83), - OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|P(0x83), - OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, - OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, - OC_B|B_ge|P(0xd6), OC_B|B_gs|P(0xb6), OC_B|B_ix|P(0x9b), /* OC_FBLTIN|Sx|F_le, was here */ - OC_B|B_ma|P(0x89), OC_B|B_sp|P(0x8b), OC_SPRINTF, OC_B|B_su|P(0xb6), - OC_B|B_ss|P(0x8f), OC_FBLTIN|F_ti, OC_B|B_ti|P(0x0b), OC_B|B_mt|P(0x0b), - OC_B|B_lo|P(0x49), OC_B|B_up|P(0x49), - OC_FBLTIN|Sx|F_le, /* TC_LENGTH */ - OC_GETLINE|SV|P(0), - 0, 0, - 0, - 0 /* TC_END */ +// OC_B's are builtins with enforced minimum number of arguments (two upper bits). +// Highest byte bit pattern: nn s3s2s1 v3v2v1 +// nn - min. 
number of args, sN - resolve Nth arg to string, vN - resolve to var +// OC_FBLTIN's are builtins with one optional argument, +// TODO: enforce exactly one arg for: system, close, cos, sin, exp, int, log, sqrt +// zero args for: rand systime +// Do have one optional arg: fflush, srand, length +#define OC_B OC_BUILTIN +#define A1 P(0x40) /*one arg*/ +#define A2 P(0x80) /*two args*/ +#define A3 P(0xc0) /*three args*/ +#define __v P(1) +#define _vv P(3) +#define __s__v P(9) +#define __s_vv P(0x0b) +#define __svvv P(0x0f) +#define _ss_vv P(0x1b) +#define _s_vv_ P(0x16) +#define ss_vv_ P(0x36) + OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or + OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor + OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 + OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, // cos exp int log + OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand + OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ + OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub + OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime + OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper + OC_FBLTIN|Sx|F_le, // length + OC_GETLINE|SV, // getline + 0, 0, // func function + 0, // BEGIN + 0 // END +#undef A1 +#undef A2 +#undef A3 +#undef OC_B }; /* internal variable names and their initial values */ @@ -1630,6 +1650,7 @@ static void chain_group(void) debug_printf_parse("%s: OC_BREAK\n", __func__); n = chain_node(OC_EXEC); n->a.n = break_ptr; +//TODO: if break_ptr is NULL, syntax error (not in the loop)? 
chain_expr(t_info); break; @@ -1637,6 +1658,7 @@ static void chain_group(void) debug_printf_parse("%s: OC_CONTINUE\n", __func__); n = chain_node(OC_EXEC); n->a.n = continue_ptr; +//TODO: if continue_ptr is NULL, syntax error (not in the loop)? chain_expr(t_info); break; @@ -1799,8 +1821,8 @@ static regex_t *as_regex(node *op, regex_t *preg) return icase ? op->r.ire : op->l.re; } -#define TMPVAR (&G.as_regex__tmpvar) //tmpvar = nvalloc(1); +#define TMPVAR (&G.as_regex__tmpvar) // We use a single "static" tmpvar (instead of on-stack or malloced one) // to decrease memory consumption in deeply-recursive awk programs. // The rule to work safely is to never call evaluate() while our static @@ -2720,8 +2742,6 @@ static var *evaluate(node *op, var *res) #define sreg (G.evaluate__sreg) var *tmpvars; -#define TMPVAR0 (tmpvars) -#define TMPVAR1 (tmpvars + 1) if (!op) return setvar_s(res, NULL); @@ -2729,6 +2749,8 @@ static var *evaluate(node *op, var *res) debug_printf_eval("entered %s()\n", __func__); tmpvars = nvalloc(2); +#define TMPVAR0 (tmpvars) +#define TMPVAR1 (tmpvars + 1) while (op) { struct { @@ -3166,7 +3188,7 @@ static var *evaluate(node *op, var *res) rstream *rsm; int err = 0; rsm = (rstream *)hash_search(fdhash, L.s); - debug_printf_eval("OC_FBLTIN F_cl rsm:%p\n", rsm); + debug_printf_eval("OC_FBLTIN close: op1:%p s:'%s' rsm:%p\n", op1, L.s, rsm); if (rsm) { debug_printf_eval("OC_FBLTIN F_cl " "rsm->is_pipe:%d, ->F:%p\n", @@ -3177,6 +3199,11 @@ static var *evaluate(node *op, var *res) */ if (rsm->F) err = rsm->is_pipe ? 
pclose(rsm->F) : fclose(rsm->F); +//TODO: fix this case: +// $ awk 'BEGIN { print close(""); print ERRNO }' +// -1 +// close of redirection that was never opened +// (we print 0, 0) free(rsm->buffer); hash_remove(fdhash, L.s); } -- 2.27.0 From e0ae7592f9ccdc9d63730393b85162202db47cdc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 18:28:12 +0200 Subject: [PATCH 39/65] awk: enforce simple builtins' argument number function old new delta evaluate 3215 3303 +88 .rodata 104036 104107 +71 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/0 up/down: 159/0) Total: 159 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 47d9133896f0de6b17393309193051e4bd52015e) --- editors/awk.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index ad0eb482b..69f9474d9 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -464,11 +464,11 @@ static const uint32_t tokeninfo[] = { // OC_B's are builtins with enforced minimum number of arguments (two upper bits). // Highest byte bit pattern: nn s3s2s1 v3v2v1 // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var -// OC_FBLTIN's are builtins with one optional argument, -// TODO: enforce exactly one arg for: system, close, cos, sin, exp, int, log, sqrt -// zero args for: rand systime -// Do have one optional arg: fflush, srand, length -#define OC_B OC_BUILTIN +// OC_FBLTIN's are builtins with zero or one argument. +// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. +// Check for no args is present in builtins' code (not in this table): rand, systime. 
+// Have one _optional_ arg: fflush, srand, length +#define OC_B OC_BUILTIN #define A1 P(0x40) /*one arg*/ #define A2 P(0x80) /*two args*/ #define A3 P(0xc0) /*three args*/ @@ -480,15 +480,15 @@ static const uint32_t tokeninfo[] = { #define _ss_vv P(0x1b) #define _s_vv_ P(0x16) #define ss_vv_ P(0x36) - OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or - OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor - OC_FBLTIN|Sx|F_cl, OC_FBLTIN|Sx|F_sy, OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 - OC_FBLTIN|Nx|F_co, OC_FBLTIN|Nx|F_ex, OC_FBLTIN|Nx|F_in, OC_FBLTIN|Nx|F_lg, // cos exp int log - OC_FBLTIN|F_rn, OC_FBLTIN|Nx|F_si, OC_FBLTIN|Nx|F_sq, OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand - OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ - OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub - OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime - OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper + OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or + OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor + OC_FBLTIN|Sx|Rx|F_cl,OC_FBLTIN|Sx|Rx|F_sy,OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 + OC_FBLTIN|Nx|Rx|F_co,OC_FBLTIN|Nx|Rx|F_ex,OC_FBLTIN|Nx|Rx|F_in,OC_FBLTIN|Nx|Rx|F_lg,// cos exp int log + OC_FBLTIN|F_rn, OC_FBLTIN|Nx|Rx|F_si,OC_FBLTIN|Nx|Rx|F_sq,OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand + OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ + OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub + OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime + OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper 
OC_FBLTIN|Sx|F_le, // length OC_GETLINE|SV, // getline 0, 0, // func function @@ -2773,8 +2773,11 @@ static var *evaluate(node *op, var *res) debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); /* execute inevitable things */ - if (opinfo & OF_RES1) + if (opinfo & OF_RES1) { + if ((opinfo & OF_REQUIRED) && !op1) + syntax_error(EMSG_TOO_FEW_ARGS); L.v = evaluate(op1, TMPVAR0); + } if (opinfo & OF_STR1) { L.s = getvar_s(L.v); debug_printf_eval("L.s:'%s'\n", L.s); @@ -3101,12 +3104,18 @@ static var *evaluate(node *op, var *res) double R_d = R_d; /* for compiler */ debug_printf_eval("FBLTIN\n"); + if (op1 && (op1->info & OPCLSMASK) == OC_COMMA) + /* Simple builtins take one arg maximum */ + syntax_error("Too many arguments"); + switch (opn) { case F_in: R_d = (long long)L_d; break; - case F_rn: + case F_rn: /*rand*/ + if (op1) + syntax_error("Too many arguments"); R_d = (double)rand() / (double)RAND_MAX; break; @@ -3149,7 +3158,9 @@ static var *evaluate(node *op, var *res) srand(seed); break; - case F_ti: + case F_ti: /*systime*/ + if (op1) + syntax_error("Too many arguments"); R_d = time(NULL); break; -- 2.27.0 From 05bc5c8319d1da17e84fe235550de95bbd17f6b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 18:55:00 +0200 Subject: [PATCH 40/65] awk: beautify builtins table, no code changes Signed-off-by: Denys Vlasenko (cherry picked from commit 37ae8cdc6e428e68ad76f6b446881ecff305ebd3) --- editors/awk.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 69f9474d9..679d2a346 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -464,11 +464,12 @@ static const uint32_t tokeninfo[] = { // OC_B's are builtins with enforced minimum number of arguments (two upper bits). // Highest byte bit pattern: nn s3s2s1 v3v2v1 // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var -// OC_FBLTIN's are builtins with zero or one argument. 
+// OC_F's are builtins with zero or one argument. // |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. // Check for no args is present in builtins' code (not in this table): rand, systime. // Have one _optional_ arg: fflush, srand, length #define OC_B OC_BUILTIN +#define OC_F OC_FBLTIN #define A1 P(0x40) /*one arg*/ #define A2 P(0x80) /*two args*/ #define A3 P(0xc0) /*three args*/ @@ -480,17 +481,17 @@ static const uint32_t tokeninfo[] = { #define _ss_vv P(0x1b) #define _s_vv_ P(0x16) #define ss_vv_ P(0x36) - OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or - OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor - OC_FBLTIN|Sx|Rx|F_cl,OC_FBLTIN|Sx|Rx|F_sy,OC_FBLTIN|Sx|F_ff, OC_B|B_a2|_vv|A2, // close system fflush atan2 - OC_FBLTIN|Nx|Rx|F_co,OC_FBLTIN|Nx|Rx|F_ex,OC_FBLTIN|Nx|Rx|F_in,OC_FBLTIN|Nx|Rx|F_lg,// cos exp int log - OC_FBLTIN|F_rn, OC_FBLTIN|Nx|Rx|F_si,OC_FBLTIN|Nx|Rx|F_sq,OC_FBLTIN|Nx|F_sr, // rand sin sqrt srand - OC_B|B_ge|_s_vv_|A3, OC_B|B_gs|ss_vv_|A2, OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ - OC_B|B_ma|__s__v|A2, OC_B|B_sp|__s_vv|A2, OC_SPRINTF, OC_B|B_su|ss_vv_|A2, // match split sprintf sub - OC_B|B_ss|__svvv|A2, OC_FBLTIN|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime - OC_B|B_lo|__s__v|A1, OC_B|B_up|__s__v|A1, // tolower toupper - OC_FBLTIN|Sx|F_le, // length - OC_GETLINE|SV, // getline + OC_B|B_an|_vv|A2, OC_B|B_co|__v|A1, OC_B|B_ls|_vv|A2, OC_B|B_or|_vv|A2, // and compl lshift or + OC_B|B_rs|_vv|A2, OC_B|B_xo|_vv|A2, // rshift xor + OC_F|F_cl|Sx|Rx, OC_F|F_sy|Sx|Rx, OC_F|F_ff|Sx, OC_B|B_a2|_vv|A2, // close system fflush atan2 + OC_F|F_co|Nx|Rx, OC_F|F_ex|Nx|Rx, OC_F|F_in|Nx|Rx, OC_F|F_lg|Nx|Rx, // cos exp int log + OC_F|F_rn, OC_F|F_si|Nx|Rx, OC_F|F_sq|Nx|Rx, OC_F|F_sr|Nx, // rand sin sqrt srand + OC_B|B_ge|_s_vv_|A3,OC_B|B_gs|ss_vv_|A2,OC_B|B_ix|_ss_vv|A2, // gensub gsub index /*length was here*/ + 
OC_B|B_ma|__s__v|A2,OC_B|B_sp|__s_vv|A2,OC_SPRINTF, OC_B|B_su|ss_vv_|A2,// match split sprintf sub + OC_B|B_ss|__svvv|A2,OC_F|F_ti, OC_B|B_ti|__s_vv, OC_B|B_mt|__s_vv, // substr systime strftime mktime + OC_B|B_lo|__s__v|A1,OC_B|B_up|__s__v|A1, // tolower toupper + OC_F|F_le|Sx, // length + OC_GETLINE|SV, // getline 0, 0, // func function 0, // BEGIN 0 // END @@ -498,6 +499,7 @@ static const uint32_t tokeninfo[] = { #undef A2 #undef A3 #undef OC_B +#undef OC_F }; /* internal variable names and their initial values */ -- 2.27.0 From 79e294b4ffc00aa019ad4d65955b66e9f63e6909 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 19:38:03 +0200 Subject: [PATCH 41/65] awk: rand() could return 1.0, fix this - should be in [0,1) While at it, make it finer-grained (63 bits of randomness) function old new delta evaluate 3303 3336 +33 .rodata 104107 104111 +4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/0 up/down: 37/0) Total: 37 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 8bb03da906e1f8f750123214b15a19d7d4e166c1) --- editors/awk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 679d2a346..878098bf9 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -3118,9 +3118,20 @@ static var *evaluate(node *op, var *res) case F_rn: /*rand*/ if (op1) syntax_error("Too many arguments"); - R_d = (double)rand() / (double)RAND_MAX; + { +#if RAND_MAX >= 0x7fffffff + uint32_t u = ((uint32_t)rand() << 16) ^ rand(); + uint64_t v = ((uint64_t)rand() << 32) | u; + /* the above shift+or is optimized out on 32-bit arches */ +# if RAND_MAX > 0x7fffffff + v &= 0x7fffffffffffffffUL; +# endif + R_d = (double)v / 0x8000000000000000UL; +#else +# error Not implemented for this value of RAND_MAX +#endif break; - + } case F_co: if (ENABLE_FEATURE_AWK_LIBM) { R_d = cos(L_d); -- 2.27.0 From ef1bc694752c05f61107c44a21e181e92f4f3652 Mon Sep 17 
00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 22:28:51 +0200 Subject: [PATCH 42/65] awk: fix behavior of "exit" without parameter function old new delta evaluate 3336 3339 +3 awk_exit 93 94 +1 awk_main 829 827 -2 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/1 up/down: 4/-2) Total: 2 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 4d902ea9def573cd15271177abbfa50fbf30c84f) --- editors/awk.c | 40 ++++++++++++++++++++++------------------ testsuite/awk.tests | 12 ++++++++++++ 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 878098bf9..fc82cb850 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -578,6 +578,8 @@ struct globals2 { rstream next_input_file__rsm; smallint next_input_file__files_happen; + smalluint exitcode; + unsigned evaluate__seed; var *evaluate__fnargs; regex_t evaluate__sreg; @@ -655,7 +657,7 @@ static const char EMSG_UNDEF_FUNC[] ALIGN1 = "Call to undefined function"; static const char EMSG_NO_MATH[] ALIGN1 = "Math support is not compiled in"; static const char EMSG_NEGATIVE_FIELD[] ALIGN1 = "Access to negative field"; -static int awk_exit(int) NORETURN; +static int awk_exit(void) NORETURN; static void syntax_error(const char *message) NORETURN; static void syntax_error(const char *message) @@ -2779,14 +2781,14 @@ static var *evaluate(node *op, var *res) if ((opinfo & OF_REQUIRED) && !op1) syntax_error(EMSG_TOO_FEW_ARGS); L.v = evaluate(op1, TMPVAR0); - } - if (opinfo & OF_STR1) { - L.s = getvar_s(L.v); - debug_printf_eval("L.s:'%s'\n", L.s); - } - if (opinfo & OF_NUM1) { - L_d = getvar_i(L.v); - debug_printf_eval("L_d:%f\n", L_d); + if (opinfo & OF_STR1) { + L.s = getvar_s(L.v); + debug_printf_eval("L.s:'%s'\n", L.s); + } + if (opinfo & OF_NUM1) { + L_d = getvar_i(L.v); + debug_printf_eval("L_d:%f\n", L_d); + } } /* NB: Must get string/numeric values of L (done above) * _before_ evaluate()'ing R.v: if both L and R
are $NNNs, @@ -2799,10 +2801,10 @@ static var *evaluate(node *op, var *res) R.v = evaluate(op->r.n, TMPVAR1); //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? //L.v = NULL; - } - if (opinfo & OF_STR2) { - R.s = getvar_s(R.v); - debug_printf_eval("R.s:'%s'\n", R.s); + if (opinfo & OF_STR2) { + R.s = getvar_s(R.v); + debug_printf_eval("R.s:'%s'\n", R.s); + } } debug_printf_eval("switch(0x%x)\n", XC(opinfo & OPCLSMASK)); @@ -2955,7 +2957,9 @@ static var *evaluate(node *op, var *res) case XC( OC_EXIT ): debug_printf_eval("EXIT\n"); - awk_exit(L_d); + if (op1) + G.exitcode = (int)L_d; + awk_exit(); /* -- recursive node type -- */ @@ -3414,7 +3418,7 @@ static var *evaluate(node *op, var *res) /* -------- main & co. -------- */ -static int awk_exit(int r) +static int awk_exit(void) { unsigned i; @@ -3435,7 +3439,7 @@ static int awk_exit(int r) } } - exit(r); + exit(G.exitcode); } int awk_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; @@ -3560,7 +3564,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv) evaluate(beginseq.first, &G.main__tmpvar); if (!mainseq.first && !endseq.first) - awk_exit(EXIT_SUCCESS); + awk_exit(); /* input file could already be opened in BEGIN block */ if (!iF) @@ -3587,6 +3591,6 @@ int awk_main(int argc UNUSED_PARAM, char **argv) iF = next_input_file(); } - awk_exit(EXIT_SUCCESS); + awk_exit(); /*return 0;*/ } diff --git a/testsuite/awk.tests b/testsuite/awk.tests index eda5ea0e0..11fd17599 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -438,4 +438,16 @@ testing 'awk FS regex which can match empty string' \ '' \ 'foo--bar' +# last+1 field should be empty (had a bug where it wasn't) +testing 'awk $NF is empty' \ + "awk -F '=+' '{print \$NF}'" \ + "\n" \ + '' \ + 'a=====123=' + +testing "awk exit N propagates through END's exit" \ + "awk 'BEGIN { exit 42 } END { exit }'; echo \$?" 
\ + "42\n" \ + '' '' + exit $FAILCOUNT -- 2.27.0 From 3d3c3979b6e91faa95e30c12d3f613dc68b96abf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 23:07:21 +0200 Subject: [PATCH 43/65] awk: fix detection of VAR=VAL arguments 1NAME=VAL is not it, neither is VA.R=VAL function old new delta next_input_file 216 214 -2 is_assignment 115 91 -24 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-26) Total: -26 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit a5d7b0f4f4e9728c3eb7a06d38227d9f3351e677) --- editors/awk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index fc82cb850..fbee5c6d2 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2679,7 +2679,8 @@ static int is_assignment(const char *expr) { char *exprc, *val; - if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) { + val = (char*)endofname(expr); + if (val == (char*)expr || *val != '=') { return FALSE; } @@ -2699,7 +2700,6 @@ static rstream *next_input_file(void) #define rsm (G.next_input_file__rsm) #define files_happen (G.next_input_file__files_happen) - FILE *F; const char *fname, *ind; if (rsm.F) @@ -2712,20 +2712,19 @@ static rstream *next_input_file(void) if (files_happen) return NULL; fname = "-"; - F = stdin; + rsm.F = stdin; break; } ind = getvar_s(incvar(intvar[ARGIND])); fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); if (fname && *fname && !is_assignment(fname)) { - F = xfopen_stdin(fname); + rsm.F = xfopen_stdin(fname); break; } } files_happen = TRUE; setvar_s(intvar[FILENAME], fname); - rsm.F = F; return &rsm; #undef rsm #undef files_happen -- 2.27.0 From 11f66eb4a59b9570fb113ff9e61797b4dd5b4ac1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 23:24:52 +0200 Subject: [PATCH 44/65] awk: use smaller regmatch_t arrays, they had 2 elements for no apparent reason function old new delta exec_builtin 1479 1434 -45 
Signed-off-by: Denys Vlasenko (cherry picked from commit 646429e05e2f62250da80aa8d98111f3a9818e9a) --- editors/awk.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index fbee5c6d2..e81b82bb5 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1937,7 +1937,7 @@ static int awk_split(const char *s, node *spl, char **slist) n++; /* at least one field will be there */ do { int l; - regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... + regmatch_t pmatch[1]; l = strcspn(s, c+2); /* len till next NUL or \n */ if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0 @@ -2166,7 +2166,7 @@ static int ptest(node *pattern) static int awk_getline(rstream *rsm, var *v) { char *b; - regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough... + regmatch_t pmatch[1]; int size, a, p, pp = 0; int fd, so, eo, r, rp; char c, *m, *s; @@ -2473,7 +2473,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) node *an[4]; var *av[4]; const char *as[4]; - regmatch_t pmatch[2]; + regmatch_t pmatch[1]; regex_t sreg, *re; node *spl; uint32_t isr, info; @@ -3533,6 +3533,8 @@ int awk_main(int argc UNUSED_PARAM, char **argv) parse_program(llist_pop(&list_e)); } #endif +//FIXME: preserve order of -e and -f +//TODO: implement -i LIBRARY and -E FILE too, they are easy-ish if (!(opt & (OPT_f | OPT_e))) { if (!*argv) bb_show_usage(); -- 2.27.0 From fa4c0d4cbbdcdf539d2c8d8abf4c63635e431594 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 2 Jul 2021 23:38:50 +0200 Subject: [PATCH 45/65] awk: move match() code out-of-line function old new delta exec_builtin_match - 202 +202 exec_builtin 1434 1157 -277 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 202/-277) Total: -75 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit b705bf55395bf338f9b9888d87e418f67d4f1a29) --- editors/awk.c | 45 ++++++++++++++++++++++++++++----------------- 1 file 
changed, 28 insertions(+), 17 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index e81b82bb5..a4d283465 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2465,6 +2465,30 @@ static NOINLINE int do_mktime(const char *ds) return mktime(&then); } +/* Reduce stack usage in exec_builtin() by keeping match() code separate */ +static NOINLINE void exec_builtin_match(node *an1, const char *as0, var *res) +{ + regmatch_t pmatch[1]; + regex_t sreg, *re; + int n; + + re = as_regex(an1, &sreg); + n = regexec(re, as0, 1, pmatch, 0); + if (n == 0) { + pmatch[0].rm_so++; + pmatch[0].rm_eo++; + } else { + pmatch[0].rm_so = 0; + pmatch[0].rm_eo = -1; + } + if (re == &sreg) + regfree(re); + setvar_i(newvar("RSTART"), pmatch[0].rm_so); + setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); + setvar_i(res, pmatch[0].rm_so); +} + +/* Reduce stack usage in evaluate() by keeping builtins' code separate */ static NOINLINE var *exec_builtin(node *op, var *res) { #define tspl (G.exec_builtin__tspl) @@ -2473,8 +2497,6 @@ static NOINLINE var *exec_builtin(node *op, var *res) node *an[4]; var *av[4]; const char *as[4]; - regmatch_t pmatch[1]; - regex_t sreg, *re; node *spl; uint32_t isr, info; int nargs; @@ -2633,20 +2655,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) break; case B_ma: - re = as_regex(an[1], &sreg); - n = regexec(re, as[0], 1, pmatch, 0); - if (n == 0) { - pmatch[0].rm_so++; - pmatch[0].rm_eo++; - } else { - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = -1; - } - setvar_i(newvar("RSTART"), pmatch[0].rm_so); - setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); - setvar_i(res, pmatch[0].rm_so); - if (re == &sreg) - regfree(re); + exec_builtin_match(an[1], as[0], res); break; case B_ge: @@ -2732,7 +2741,9 @@ static rstream *next_input_file(void) /* * Evaluate node - the heart of the program. Supplied with subtree - * and place where to store result. Returns ptr to result. 
+ * and "res" variable to assign the result to if we evaluate an expression. + * If node refers to e.g. a variable or a field, no assignment happens. + * Return ptr to the result (which may or may not be the "res" variable!) */ #define XC(n) ((n) >> 8) -- 2.27.0 From 7d2bd9fee85874bf1c8633e6d9024f27e3945534 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 00:39:55 +0200 Subject: [PATCH 46/65] awk: rename GRPSTART/END to L/RBRACE, no code changes Signed-off-by: Denys Vlasenko (cherry picked from commit 717200eb43c9420773c0f8b751aadabba3052027) --- editors/awk.c | 60 ++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a4d283465..56e3277c8 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -211,8 +211,8 @@ typedef struct tsplitter_s { #define TC_PIPE (1 << 9) /* input redirection pipe | */ #define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ #define TC_ARRTERM (1 << 11) /* ] */ -#define TC_GRPSTART (1 << 12) /* { */ -#define TC_GRPTERM (1 << 13) /* } */ +#define TC_LBRACE (1 << 12) /* { */ +#define TC_RBRACE (1 << 13) /* } */ #define TC_SEMICOL (1 << 14) /* ; */ #define TC_NEWLINE (1 << 15) #define TC_STATX (1 << 16) /* ctl statement (for, next...) 
*/ @@ -250,8 +250,8 @@ if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ -if ((n) & TC_GRPSTART) debug_printf_parse(" GRPSTART"); \ -if ((n) & TC_GRPTERM ) debug_printf_parse(" GRPTERM" ); \ +if ((n) & TC_LBRACE ) debug_printf_parse(" LBRACE" ); \ +if ((n) & TC_RBRACE ) debug_printf_parse(" RBRACE" ); \ if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ @@ -291,13 +291,13 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ | TC_FUNCDECL | TC_BEGIN | TC_END) /* discard newlines after these */ -#define TS_NOTERM (TC_COMMA | TC_GRPSTART | TC_GRPTERM \ +#define TS_NOTERM (TC_COMMA | TC_LBRACE | TC_RBRACE \ | TS_BINOP | TS_OPTERM) /* what can expression begin with */ #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) /* what can group begin with */ -#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_GRPSTART) +#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_LBRACE) /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ /* operator is inserted between them */ @@ -402,8 +402,8 @@ static const char tokenlist[] ALIGN1 = "\1|" NTC /* TC_PIPE */ "\1+" "\1-" "\1!" NTC /* TC_UOPPRE2 */ "\1]" NTC /* TC_ARRTERM */ - "\1{" NTC /* TC_GRPSTART */ - "\1}" NTC /* TC_GRPTERM */ + "\1{" NTC /* TC_LBRACE */ + "\1}" NTC /* TC_RBRACE */ "\1;" NTC /* TC_SEMICOL */ "\1\n" NTC /* TC_NEWLINE */ "\2if" "\2do" "\3for" "\5break" /* TC_STATX */ @@ -1471,7 +1471,7 @@ static node *parse_expr(uint32_t term_tc) debug_printf_parse("%s: TC_LENGTH\n", __func__); tc = next_token(TC_LPAREN /* length(...) 
*/ | TS_OPTERM /* length; (or newline)*/ - | TC_GRPTERM /* length } */ + | TC_RBRACE /* length } */ | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ ); @@ -1516,11 +1516,11 @@ static void chain_expr(uint32_t info) n = chain_node(info); - n->l.n = parse_expr(TS_OPTERM | TC_GRPTERM); + n->l.n = parse_expr(TS_OPTERM | TC_RBRACE); if ((info & OF_REQUIRED) && !n->l.n) syntax_error(EMSG_TOO_FEW_ARGS); - if (t_tclass & TC_GRPTERM) + if (t_tclass & TC_RBRACE) rollback_token(); } @@ -1559,16 +1559,16 @@ static void chain_group(void) c = next_token(TS_GRPSEQ); } while (c & TC_NEWLINE); - if (c & TC_GRPSTART) { - debug_printf_parse("%s: TC_GRPSTART\n", __func__); - while ((c = next_token(TS_GRPSEQ | TC_GRPTERM)) != TC_GRPTERM) { - debug_printf_parse("%s: !TC_GRPTERM\n", __func__); + if (c & TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); + while ((c = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { + debug_printf_parse("%s: !TC_RBRACE\n", __func__); if (c & TC_NEWLINE) continue; rollback_token(); chain_group(); } - debug_printf_parse("%s: TC_GRPTERM\n", __func__); + debug_printf_parse("%s: TC_RBRACE\n", __func__); return; } if (c & (TS_OPSEQ | TS_OPTERM)) { @@ -1588,7 +1588,7 @@ static void chain_group(void) chain_group(); n2 = chain_node(OC_EXEC); n->r.n = seq->last; - if (next_token(TS_GRPSEQ | TC_GRPTERM | TC_ELSE) == TC_ELSE) { + if (next_token(TS_GRPSEQ | TC_RBRACE | TC_ELSE) == TC_ELSE) { chain_group(); n2->a.n = seq->last; } else { @@ -1641,12 +1641,12 @@ static void chain_group(void) case OC_PRINTF: debug_printf_parse("%s: OC_PRINT[F]\n", __func__); n = chain_node(t_info); - n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_GRPTERM); + n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_RBRACE); if (t_tclass & TC_OUTRDR) { n->info |= t_info; - n->r.n = parse_expr(TS_OPTERM | TC_GRPTERM); + n->r.n = parse_expr(TS_OPTERM | TC_RBRACE); } - if (t_tclass & TC_GRPTERM) + if (t_tclass & TC_RBRACE) rollback_token(); break; @@ -1684,7 +1684,7 @@ static 
void parse_program(char *p) g_pos = p; t_lineno = 1; - while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_GRPSTART | + while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { if (tclass & TS_OPTERM) { @@ -1696,10 +1696,14 @@ static void parse_program(char *p) if (tclass & TC_BEGIN) { debug_printf_parse("%s: TC_BEGIN\n", __func__); seq = &beginseq; +//TODO: ensure there is no newline between BEGIN and { +//next_token(TC_LBRACE); rollback_token(); chain_group(); } else if (tclass & TC_END) { debug_printf_parse("%s: TC_END\n", __func__); seq = &endseq; +//TODO: ensure there is no newline between END and { +//next_token(TC_LBRACE); rollback_token(); chain_group(); } else if (tclass & TC_FUNCDECL) { debug_printf_parse("%s: TC_FUNCDECL\n", __func__); @@ -1726,24 +1730,26 @@ static void parse_program(char *p) /* it was a comma, we ate it */ } seq = &f->body; +//TODO: ensure there is { after "func F(...)" - but newlines are allowed +//while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) continue; rollback_token(); chain_group(); hash_clear(ahash); } else if (tclass & TS_OPSEQ) { debug_printf_parse("%s: TS_OPSEQ\n", __func__); rollback_token(); cn = chain_node(OC_TEST); - cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_GRPSTART); - if (t_tclass & TC_GRPSTART) { - debug_printf_parse("%s: TC_GRPSTART\n", __func__); + cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); + if (t_tclass & TC_LBRACE) { + debug_printf_parse("%s: TC_LBRACE\n", __func__); rollback_token(); chain_group(); } else { - debug_printf_parse("%s: !TC_GRPSTART\n", __func__); + debug_printf_parse("%s: !TC_LBRACE\n", __func__); chain_node(OC_PRINT); } cn->r.n = mainseq.last; - } else /* if (tclass & TC_GRPSTART) */ { - debug_printf_parse("%s: TC_GRPSTART(?)\n", __func__); + } else /* if (tclass & TC_LBRACE) */ { + debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); rollback_token(); chain_group(); } -- 2.27.0 From 
ce69ee405914c1f62ffa34bf8595e3d6e05880c5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 01:16:48 +0200 Subject: [PATCH 47/65] awk: tighten rules in action parsing Disallow: BEGIN { action } - must start on the same line Disallow: func f() print "hello" - must be in {...} function old new delta chain_until_rbrace - 41 +41 parse_program 307 336 +29 chain_group 649 616 -33 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 1/1 up/down: 70/-33) Total: 37 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 2b65e73db3254a7228802886546152c72217017d) --- editors/awk.c | 108 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 56e3277c8..4d97e2103 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1549,29 +1549,35 @@ static node *chain_loop(node *nn) return n; } +static void chain_until_rbrace(void) +{ + uint32_t tc; + while ((tc = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { + debug_printf_parse("%s: !TC_RBRACE\n", __func__); + if (tc == TC_NEWLINE) + continue; + rollback_token(); + chain_group(); + } + debug_printf_parse("%s: TC_RBRACE\n", __func__); +} + /* parse group and attach it to chain */ static void chain_group(void) { - uint32_t c; + uint32_t tc; node *n, *n2, *n3; do { - c = next_token(TS_GRPSEQ); - } while (c & TC_NEWLINE); + tc = next_token(TS_GRPSEQ); + } while (tc == TC_NEWLINE); - if (c & TC_LBRACE) { + if (tc == TC_LBRACE) { debug_printf_parse("%s: TC_LBRACE\n", __func__); - while ((c = next_token(TS_GRPSEQ | TC_RBRACE)) != TC_RBRACE) { - debug_printf_parse("%s: !TC_RBRACE\n", __func__); - if (c & TC_NEWLINE) - continue; - rollback_token(); - chain_group(); - } - debug_printf_parse("%s: TC_RBRACE\n", __func__); + chain_until_rbrace(); return; } - if (c & (TS_OPSEQ | TS_OPTERM)) { + if (tc & (TS_OPSEQ | TS_OPTERM)) { debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", 
__func__); rollback_token(); chain_expr(OC_EXEC | Vx); @@ -1675,37 +1681,48 @@ static void chain_group(void) static void parse_program(char *p) { - uint32_t tclass; - node *cn; - func *f; - var *v; - debug_printf_parse("%s()\n", __func__); g_pos = p; t_lineno = 1; - while ((tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | - TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) { + for (;;) { + uint32_t tclass; - if (tclass & TS_OPTERM) { + tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | + TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL); + + if (tclass == TC_EOF) { + debug_printf_parse("%s: TC_EOF\n", __func__); + break; + } + if (tclass & TS_OPTERM) { /* ; or */ debug_printf_parse("%s: TS_OPTERM\n", __func__); +//NB: gawk allows many newlines, but does not allow more than one semicolon: +// BEGIN {...};; +//would complain "each rule must have a pattern or an action part". +//Same message for +// ; BEGIN {...} continue; } - - seq = &mainseq; - if (tclass & TC_BEGIN) { + if (tclass == TC_BEGIN) { debug_printf_parse("%s: TC_BEGIN\n", __func__); seq = &beginseq; -//TODO: ensure there is no newline between BEGIN and { -//next_token(TC_LBRACE); rollback_token(); - chain_group(); - } else if (tclass & TC_END) { + /* ensure there is no newline between BEGIN and { */ + next_token(TC_LBRACE); + chain_until_rbrace(); + continue; + } + if (tclass == TC_END) { debug_printf_parse("%s: TC_END\n", __func__); seq = &endseq; -//TODO: ensure there is no newline between END and { -//next_token(TC_LBRACE); rollback_token(); - chain_group(); - } else if (tclass & TC_FUNCDECL) { + /* ensure there is no newline between END and { */ + next_token(TC_LBRACE); + chain_until_rbrace(); + continue; + } + if (tclass == TC_FUNCDECL) { + func *f; + debug_printf_parse("%s: TC_FUNCDECL\n", __func__); next_token(TC_FUNCTION); f = newfunc(t_string); @@ -1716,6 +1733,7 @@ static void parse_program(char *p) //f->nargs = 0; - already is /* func arg list: comma sep list of args, and a close paren */ 
for (;;) { + var *v; if (next_token(TC_VARIABLE | TC_RPAREN) == TC_RPAREN) { if (f->nargs == 0) break; /* func() is ok */ @@ -1730,31 +1748,37 @@ static void parse_program(char *p) /* it was a comma, we ate it */ } seq = &f->body; -//TODO: ensure there is { after "func F(...)" - but newlines are allowed -//while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) continue; rollback_token(); - chain_group(); + /* ensure there is { after "func F(...)" - but newlines are allowed */ + while (next_token(TC_LBRACE | TC_NEWLINE) == TC_NEWLINE) + continue; + chain_until_rbrace(); hash_clear(ahash); - } else if (tclass & TS_OPSEQ) { + continue; + } + seq = &mainseq; + if (tclass & TS_OPSEQ) { + node *cn; + debug_printf_parse("%s: TS_OPSEQ\n", __func__); rollback_token(); cn = chain_node(OC_TEST); cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); - if (t_tclass & TC_LBRACE) { + if (t_tclass == TC_LBRACE) { debug_printf_parse("%s: TC_LBRACE\n", __func__); rollback_token(); chain_group(); } else { + /* no action, assume default "{ print }" */ debug_printf_parse("%s: !TC_LBRACE\n", __func__); chain_node(OC_PRINT); } cn->r.n = mainseq.last; - } else /* if (tclass & TC_LBRACE) */ { - debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); - rollback_token(); - chain_group(); + continue; } + /* tclass == TC_LBRACE */ + debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); + chain_until_rbrace(); } - debug_printf_parse("%s: TC_EOF\n", __func__); } -- 2.27.0 From 6bde7949fb63d01452bc4ad5f028a507cd464ef7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 01:32:03 +0200 Subject: [PATCH 48/65] awk: open-code TS_OPTERM, no logic changes Signed-off-by: Denys Vlasenko (cherry picked from commit 1f765709ed9c9595647853ac2cd7905f218c3044) --- editors/awk.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 4d97e2103..2200553d3 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -283,7 +283,6 @@ 
if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ #define TS_LVALUE (TC_VARIABLE | TC_ARRAY) #define TS_STATEMNT (TC_STATX | TC_WHILE) -#define TS_OPTERM (TC_SEMICOL | TC_NEWLINE) /* word tokens, cannot mean something else if not expected */ #define TS_WORD (TC_IN | TS_STATEMNT | TC_ELSE \ @@ -291,13 +290,14 @@ if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ | TC_FUNCDECL | TC_BEGIN | TC_END) /* discard newlines after these */ -#define TS_NOTERM (TC_COMMA | TC_LBRACE | TC_RBRACE \ - | TS_BINOP | TS_OPTERM) +#define TS_NOTERM (TS_BINOP | TC_COMMA | TC_LBRACE | TC_RBRACE \ + | TC_SEMICOL | TC_NEWLINE) /* what can expression begin with */ #define TS_OPSEQ (TS_OPERAND | TS_UOPPRE | TC_REGEXP) /* what can group begin with */ -#define TS_GRPSEQ (TS_OPSEQ | TS_OPTERM | TS_STATEMNT | TC_LBRACE) +#define TS_GRPSEQ (TS_OPSEQ | TS_STATEMNT \ + | TC_SEMICOL | TC_NEWLINE | TC_LBRACE) /* if previous token class is CONCAT_L and next is CONCAT_R, concatenation */ /* operator is inserted between them */ @@ -642,7 +642,7 @@ struct globals2 { #define g_buf (G.g_buf ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS((char*)xzalloc(sizeof(G1)+sizeof(G)) + sizeof(G1)); \ - t_tclass = TS_OPTERM; \ + t_tclass = TC_NEWLINE; \ G.evaluate__seed = 1; \ } while (0) @@ -1090,7 +1090,7 @@ static uint32_t next_token(uint32_t expected) const uint32_t *ti; uint32_t tc, last_token_class; - last_token_class = t_tclass; /* t_tclass is initialized to TS_OPTERM */ + last_token_class = t_tclass; /* t_tclass is initialized to TC_NEWLINE */ debug_printf_parse("%s() expected(%x):", __func__, expected); debug_parse_print_tc(expected); @@ -1470,7 +1470,8 @@ static node *parse_expr(uint32_t term_tc) case TC_LENGTH: debug_printf_parse("%s: TC_LENGTH\n", __func__); tc = next_token(TC_LPAREN /* length(...) 
*/ - | TS_OPTERM /* length; (or newline)*/ + | TC_SEMICOL /* length; */ + | TC_NEWLINE /* length */ | TC_RBRACE /* length } */ | TC_BINOPX /* length NUM */ | TC_COMMA /* print length, 1 */ @@ -1516,7 +1517,7 @@ static void chain_expr(uint32_t info) n = chain_node(info); - n->l.n = parse_expr(TS_OPTERM | TC_RBRACE); + n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE); if ((info & OF_REQUIRED) && !n->l.n) syntax_error(EMSG_TOO_FEW_ARGS); @@ -1577,8 +1578,8 @@ static void chain_group(void) chain_until_rbrace(); return; } - if (tc & (TS_OPSEQ | TS_OPTERM)) { - debug_printf_parse("%s: TS_OPSEQ | TS_OPTERM\n", __func__); + if (tc & (TS_OPSEQ | TC_SEMICOL | TC_NEWLINE)) { + debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL | TC_NEWLINE\n", __func__); rollback_token(); chain_expr(OC_EXEC | Vx); return; @@ -1647,10 +1648,10 @@ static void chain_group(void) case OC_PRINTF: debug_printf_parse("%s: OC_PRINT[F]\n", __func__); n = chain_node(t_info); - n->l.n = parse_expr(TS_OPTERM | TC_OUTRDR | TC_RBRACE); + n->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_OUTRDR | TC_RBRACE); if (t_tclass & TC_OUTRDR) { n->info |= t_info; - n->r.n = parse_expr(TS_OPTERM | TC_RBRACE); + n->r.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_RBRACE); } if (t_tclass & TC_RBRACE) rollback_token(); @@ -1689,14 +1690,14 @@ static void parse_program(char *p) uint32_t tclass; tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | - TS_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL); + TC_SEMICOL | TC_NEWLINE | TC_BEGIN | TC_END | TC_FUNCDECL); if (tclass == TC_EOF) { debug_printf_parse("%s: TC_EOF\n", __func__); break; } - if (tclass & TS_OPTERM) { /* ; or */ - debug_printf_parse("%s: TS_OPTERM\n", __func__); + if (tclass & (TC_SEMICOL | TC_NEWLINE)) { + debug_printf_parse("%s: TC_SEMICOL | TC_NEWLINE\n", __func__); //NB: gawk allows many newlines, but does not allow more than one semicolon: // BEGIN {...};; //would complain "each rule must have a pattern or an action part". 
@@ -1762,7 +1763,7 @@ static void parse_program(char *p) debug_printf_parse("%s: TS_OPSEQ\n", __func__); rollback_token(); cn = chain_node(OC_TEST); - cn->l.n = parse_expr(TS_OPTERM | TC_EOF | TC_LBRACE); + cn->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_EOF | TC_LBRACE); if (t_tclass == TC_LBRACE) { debug_printf_parse("%s: TC_LBRACE\n", __func__); rollback_token(); -- 2.27.0 From 34c622bf253f1159ec918dc938aeb88662ed0cd4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 01:59:36 +0200 Subject: [PATCH 49/65] awk: support %F %a %A in printf function old new delta .rodata 104111 104120 +9 Signed-off-by: Denys Vlasenko (cherry picked from commit e1e7ad6b6005b2265667040fc9d7f69b73b0d5b0) --- editors/awk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editors/awk.c b/editors/awk.c index 2200553d3..fa7359439 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -909,7 +909,7 @@ static int fmt_num(char *b, int size, const char *format, double n, int int_as_i do { c = *s; } while (c && *++s); if (strchr("diouxX", c)) { r = snprintf(b, size, format, (int)n); - } else if (strchr("eEfgG", c)) { + } else if (strchr("eEfFgGaA", c)) { r = snprintf(b, size, format, n); } else { syntax_error(EMSG_INV_FMT); -- 2.27.0 From e3dc3967432d1f42ead9775095f028adb030595a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 11:54:01 +0200 Subject: [PATCH 50/65] awk: do not use a copy of g_progname for node->l.new_progname We never destroy g_progname's, the strings still exist, no need to copy function old new delta chain_node 104 97 -7 Signed-off-by: Denys Vlasenko (cherry picked from commit 2211fa70ccad29fc7bccd34c13141850ebb199da) --- editors/awk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index fa7359439..8e93fecdf 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -179,7 +179,7 @@ typedef struct node_s { struct node_s *n; var *v; int aidx; - char *new_progname; + const char 
*new_progname; regex_t *re; } l; union { @@ -1501,7 +1501,7 @@ static node *chain_node(uint32_t info) if (seq->programname != g_progname) { seq->programname = g_progname; n = chain_node(OC_NEWSOURCE); - n->l.new_progname = xstrdup(g_progname); + n->l.new_progname = g_progname; } n = seq->last; -- 2.27.0 From 478b6510f687c6d1178233ef9db6733b90fd5db2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 11:57:59 +0200 Subject: [PATCH 51/65] awk: rand(): 64-bit constants should be ULL Signed-off-by: Denys Vlasenko (cherry picked from commit 0e3ef4efb061366bfa4b9609fe3a03f3a1e40f0e) --- editors/awk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 8e93fecdf..83862daad 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -3169,9 +3169,9 @@ static var *evaluate(node *op, var *res) uint64_t v = ((uint64_t)rand() << 32) | u; /* the above shift+or is optimized out on 32-bit arches */ # if RAND_MAX > 0x7fffffff - v &= 0x7fffffffffffffffUL; + v &= 0x7fffffffffffffffULL; # endif - R_d = (double)v / 0x8000000000000000UL; + R_d = (double)v / 0x8000000000000000ULL; #else # error Not implemented for this value of RAND_MAX #endif -- 2.27.0 From e04b59e3c50080123295bd7bb5622ab6534fc529 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 12:20:36 +0200 Subject: [PATCH 52/65] awk: match(): code shrink function old new delta do_match - 165 +165 exec_builtin_match 202 - -202 ------------------------------------------------------------------------------ (add/remove: 1/1 grow/shrink: 0/0 up/down: 165/-202) Total: -37 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 90404ed2f62a872ffd9a555660b7ce17fae372d8) --- editors/awk.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 83862daad..eaabc3178 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2497,26 +2497,24 @@ static NOINLINE int do_mktime(const char *ds) } /* 
Reduce stack usage in exec_builtin() by keeping match() code separate */ -static NOINLINE void exec_builtin_match(node *an1, const char *as0, var *res) +static NOINLINE var *do_match(node *an1, const char *as0) { regmatch_t pmatch[1]; regex_t sreg, *re; - int n; + int n, start, len; re = as_regex(an1, &sreg); n = regexec(re, as0, 1, pmatch, 0); - if (n == 0) { - pmatch[0].rm_so++; - pmatch[0].rm_eo++; - } else { - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = -1; - } if (re == &sreg) regfree(re); - setvar_i(newvar("RSTART"), pmatch[0].rm_so); - setvar_i(newvar("RLENGTH"), pmatch[0].rm_eo - pmatch[0].rm_so); - setvar_i(res, pmatch[0].rm_so); + start = 0; + len = -1; + if (n == 0) { + start = pmatch[0].rm_so + 1; + len = pmatch[0].rm_eo - pmatch[0].rm_so; + } + setvar_i(newvar("RLENGTH"), len); + return setvar_i(newvar("RSTART"), start); } /* Reduce stack usage in evaluate() by keeping builtins' code separate */ @@ -2686,7 +2684,7 @@ static NOINLINE var *exec_builtin(node *op, var *res) break; case B_ma: - exec_builtin_match(an[1], as[0], res); + res = do_match(an[1], as[0]); break; case B_ge: -- 2.27.0 From 0442a7ff6b3ef7f04c9f32e91ed774987879b5ab Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 13:29:32 +0200 Subject: [PATCH 53/65] awk: restore strdup elision optimization in assignment function old new delta evaluate 3339 3387 +48 Signed-off-by: Denys Vlasenko (cherry picked from commit cb042b05828c4c89320bc9c7454c04c2761bbb9a) --- editors/awk.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index eaabc3178..8ac9a6e80 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -102,7 +102,7 @@ enum { #define VF_USER 0x0200 /* 1 = user input (may be numeric string) */ #define VF_SPECIAL 0x0400 /* 1 = requires extra handling when changed */ #define VF_WALK 0x0800 /* 1 = variable has alloc'd x.walker list */ -#define VF_FSTR 0x1000 /* 1 = var::string points to fstring buffer */ +#define 
VF_FSTR 0x1000 /* 1 = don't free() var::string (not malloced, or is owned by something else) */ #define VF_CHILD 0x2000 /* 1 = function arg; x.parent points to source */ #define VF_DIRTY 0x4000 /* 1 = variable was set explicitly */ @@ -1371,6 +1371,12 @@ static node *parse_expr(uint32_t term_tc) cn->a.n = vn->a.n; if (tc & TS_BINOP) { cn->l.n = vn; +//FIXME: this is the place to detect and reject assignments to non-lvalues. +//Currently we allow "assignments" to consts and temporaries, nonsense like this: +// awk 'BEGIN { "qwe" = 1 }' +// awk 'BEGIN { 7 *= 7 }' +// awk 'BEGIN { length("qwe") = 1 }' +// awk 'BEGIN { (1+1) += 3 }' expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; if ((t_info & OPCLSMASK) == OC_PGETLINE) { /* it's a pipe */ @@ -3043,14 +3049,17 @@ static var *evaluate(node *op, var *res) case XC( OC_MOVE ): debug_printf_eval("MOVE\n"); /* if source is a temporary string, jusk relink it to dest */ -//Disabled: if R.v is numeric but happens to have cached R.v->string, -//then L.v ends up being a string, which is wrong -// if (R.v == TMPVAR1 && R.v->string) { -// res = setvar_p(L.v, R.v->string); -// R.v->string = NULL; -// } else { + if (R.v == TMPVAR1 + && !(R.v->type & VF_NUMBER) + /* Why check !NUMBER? if R.v is a number but has cached R.v->string, + * L.v ends up a string, which is wrong */ + /*&& R.v->string - always not NULL (right?) */ + ) { + res = setvar_p(L.v, R.v->string); /* avoids strdup */ + R.v->string = NULL; + } else { res = copyvar(L.v, R.v); -// } + } break; case XC( OC_TERNARY ): -- 2.27.0 From 491e8d089a05e87ac208fae1225d307645aaeed2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 3 Jul 2021 13:57:47 +0200 Subject: [PATCH 54/65] awk: simplify tests for operation class Usually, an operation class has only one possible value of "info" word. In this case, just compare the entire info word, do not bother to mask OPCLSMASK bits. 
(Example where this is not the case: OC_REPLACE for "=") function old new delta mk_splitter 106 100 -6 chain_group 616 610 -6 nextarg 40 32 -8 exec_builtin 1157 1149 -8 as_regex 111 103 -8 awk_split 553 543 -10 parse_expr 948 936 -12 awk_getline 656 642 -14 evaluate 3387 3343 -44 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/9 up/down: 0/-116) Total: -116 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit 08ca313d7edb99687068b93b5d2435b59f3db23a) --- editors/awk.c | 64 +++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 8ac9a6e80..4905a5aa7 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -432,7 +432,8 @@ static const char tokenlist[] ALIGN1 = static const uint32_t tokeninfo[] = { 0, 0, - OC_REGEXP, +#define TI_REGEXP OC_REGEXP + TI_REGEXP, xS|'a', xS|'w', xS|'|', OC_UNARY|xV|P(9)|'p', OC_UNARY|xV|P(9)|'m', #define TI_PREINC (OC_UNARY|xV|P(9)|'P') @@ -443,12 +444,17 @@ static const uint32_t tokeninfo[] = { OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, -#define TI_LESS (OC_COMPARE|VV|P(39)|2) +#define TI_LESS (OC_COMPARE|VV|P(39)|2) TI_LESS, OC_MATCH|Sx|P(45)|'!', OC_MATCH|Sx|P(45)|'~', OC_LAND|Vx|P(55), - OC_LOR|Vx|P(59), OC_TERNARY|Vx|P(64)|'?', OC_COLON|xx|P(67)|':', - OC_IN|SV|P(49), /* TC_IN */ - OC_COMMA|SS|P(80), - OC_PGETLINE|SV|P(37), +#define TI_TERNARY (OC_TERNARY|Vx|P(64)|'?') +#define TI_COLON (OC_COLON|xx|P(67)|':') + OC_LOR|Vx|P(59), TI_TERNARY, TI_COLON, +#define TI_IN (OC_IN|SV|P(49)) + TI_IN, +#define TI_COMMA (OC_COMMA|SS|P(80)) + TI_COMMA, +#define TI_PGETLINE (OC_PGETLINE|SV|P(37)) + TI_PGETLINE, OC_UNARY|xV|P(19)|'+', 
OC_UNARY|xV|P(19)|'-', OC_UNARY|xV|P(19)|'!', 0, /* ] */ 0, @@ -456,7 +462,8 @@ static const uint32_t tokeninfo[] = { 0, 0, /* \n */ ST_IF, ST_DO, ST_FOR, OC_BREAK, - OC_CONTINUE, OC_DELETE|Rx, OC_PRINT, +#define TI_PRINT OC_PRINT + OC_CONTINUE, OC_DELETE|Rx, TI_PRINT, OC_PRINTF, OC_NEXT, OC_NEXTFILE, OC_RETURN|Vx, OC_EXIT|Nx, ST_WHILE, @@ -465,8 +472,8 @@ static const uint32_t tokeninfo[] = { // Highest byte bit pattern: nn s3s2s1 v3v2v1 // nn - min. number of args, sN - resolve Nth arg to string, vN - resolve to var // OC_F's are builtins with zero or one argument. -// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt. -// Check for no args is present in builtins' code (not in this table): rand, systime. +// |Rx| enforces that arg is present for: system, close, cos, sin, exp, int, log, sqrt +// Check for no args is present in builtins' code (not in this table): rand, systime // Have one _optional_ arg: fflush, srand, length #define OC_B OC_BUILTIN #define OC_F OC_FBLTIN @@ -1310,7 +1317,7 @@ static node *new_node(uint32_t info) static void mk_re_node(const char *s, node *n, regex_t *re) { - n->info = OC_REGEXP; + n->info = TI_REGEXP; n->l.re = re; n->r.ire = re + 1; xregcomp(re, s, REG_EXTENDED); @@ -1360,12 +1367,13 @@ static node *parse_expr(uint32_t term_tc) * previous operators with higher priority */ vn = cn; while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2)) - || ((t_info == vn->info) && ((t_info & OPCLSMASK) == OC_COLON)) + || ((t_info == vn->info) && t_info == TI_COLON) ) { vn = vn->a.n; if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN); } - if ((t_info & OPCLSMASK) == OC_TERNARY) + if (t_info == TI_TERNARY) +//TODO: why? 
t_info += P(6); cn = vn->a.n->r.n = new_node(t_info); cn->a.n = vn->a.n; @@ -1378,7 +1386,7 @@ static node *parse_expr(uint32_t term_tc) // awk 'BEGIN { length("qwe") = 1 }' // awk 'BEGIN { (1+1) += 3 }' expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; - if ((t_info & OPCLSMASK) == OC_PGETLINE) { + if (t_info == TI_PGETLINE) { /* it's a pipe */ next_token(TC_GETLINE); /* give maximum priority to this pipe */ @@ -1630,7 +1638,7 @@ static void chain_group(void) next_token(TC_LPAREN); n2 = parse_expr(TC_SEMICOL | TC_RPAREN); if (t_tclass & TC_RPAREN) { /* for-in */ - if (!n2 || (n2->info & OPCLSMASK) != OC_IN) + if (!n2 || n2->info != TI_IN) syntax_error(EMSG_UNEXP_TOKEN); n = chain_node(OC_WALKINIT | VV); n->l.n = n2->l.n; @@ -1834,7 +1842,7 @@ static node *mk_splitter(const char *s, tsplitter *spl) re = &spl->re[0]; ire = &spl->re[1]; n = &spl->n; - if ((n->info & OPCLSMASK) == OC_REGEXP) { + if (n->info == TI_REGEXP) { regfree(re); regfree(ire); // TODO: nuke ire, use re+1? } @@ -1858,7 +1866,7 @@ static regex_t *as_regex(node *op, regex_t *preg) int cflags; const char *s; - if ((op->info & OPCLSMASK) == OC_REGEXP) { + if (op->info == TI_REGEXP) { return icase ? op->r.ire : op->l.re; } @@ -1968,7 +1976,7 @@ static int awk_split(const char *s, node *spl, char **slist) c[2] = '\n'; n = 0; - if ((spl->info & OPCLSMASK) == OC_REGEXP) { /* regex split */ + if (spl->info == TI_REGEXP) { /* regex split */ if (!*s) return n; /* "": zero fields */ n++; /* at least one field will be there */ @@ -2135,7 +2143,7 @@ static node *nextarg(node **pn) node *n; n = *pn; - if (n && (n->info & OPCLSMASK) == OC_COMMA) { + if (n && n->info == TI_COMMA) { *pn = n->r.n; n = n->l.n; } else { @@ -2229,7 +2237,7 @@ static int awk_getline(rstream *rsm, var *v) so = eo = p; r = 1; if (p > 0) { - if ((rsplitter.n.info & OPCLSMASK) == OC_REGEXP) { + if (rsplitter.n.info == TI_REGEXP) { if (regexec(icase ? 
rsplitter.n.r.ire : rsplitter.n.l.re, b, 1, pmatch, 0) == 0) { so = pmatch[0].rm_so; @@ -2575,8 +2583,8 @@ static NOINLINE var *exec_builtin(node *op, var *res) char *s, *s1; if (nargs > 2) { - spl = (an[2]->info & OPCLSMASK) == OC_REGEXP ? - an[2] : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); + spl = (an[2]->info == TI_REGEXP) ? an[2] + : mk_splitter(getvar_s(evaluate(an[2], TMPVAR2)), &tspl); } else { spl = &fsplitter.n; } @@ -2860,7 +2868,7 @@ static var *evaluate(node *op, var *res) /* test pattern */ case XC( OC_TEST ): debug_printf_eval("TEST\n"); - if ((op1->info & OPCLSMASK) == OC_COMMA) { + if (op1->info == TI_COMMA) { /* it's range pattern */ if ((opinfo & OF_CHECKED) || ptest(op1->l.n)) { op->info |= OF_CHECKED; @@ -2921,7 +2929,7 @@ static var *evaluate(node *op, var *res) F = rsm->F; } - if ((opinfo & OPCLSMASK) == OC_PRINT) { + if (opinfo == TI_PRINT) { if (!op1) { fputs(getvar_s(intvar[F0]), F); } else { @@ -2940,7 +2948,7 @@ static var *evaluate(node *op, var *res) } } fputs(getvar_s(intvar[ORS]), F); - } else { /* OC_PRINTF */ + } else { /* PRINTF */ char *s = awk_printf(op1, &len); #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS fwrite(s, len, 1, F); @@ -3064,7 +3072,7 @@ static var *evaluate(node *op, var *res) case XC( OC_TERNARY ): debug_printf_eval("TERNARY\n"); - if ((op->r.n->info & OPCLSMASK) != OC_COLON) + if (op->r.n->info != TI_COLON) syntax_error(EMSG_POSSIBLE_ERROR); res = evaluate(istrue(L.v) ? 
op->r.n->l.n : op->r.n->r.n, res); break; @@ -3122,7 +3130,7 @@ static var *evaluate(node *op, var *res) if (op1) { rsm = newfile(L.s); if (!rsm->F) { - if ((opinfo & OPCLSMASK) == OC_PGETLINE) { + if (opinfo == TI_PGETLINE) { rsm->F = popen(L.s, "r"); rsm->is_pipe = TRUE; } else { @@ -3158,7 +3166,7 @@ static var *evaluate(node *op, var *res) double R_d = R_d; /* for compiler */ debug_printf_eval("FBLTIN\n"); - if (op1 && (op1->info & OPCLSMASK) == OC_COMMA) + if (op1 && op1->info == TI_COMMA) /* Simple builtins take one arg maximum */ syntax_error("Too many arguments"); @@ -3358,7 +3366,7 @@ static var *evaluate(node *op, var *res) case XC( OC_COMMA ): { const char *sep = ""; debug_printf_eval("COMMA\n"); - if ((opinfo & OPCLSMASK) == OC_COMMA) + if (opinfo == TI_COMMA) sep = getvar_s(intvar[SUBSEP]); setvar_p(res, xasprintf("%s%s%s", L.s, sep, R.s)); break; -- 2.27.0 From 1b83e0c3a70693e958ae43a16ec35943dd24dd22 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 4 Jul 2021 01:25:34 +0200 Subject: [PATCH 55/65] awk: fix printf buffer overflow function old new delta awk_printf 468 546 +78 fmt_num 239 247 +8 getvar_s 125 111 -14 evaluate 3343 3329 -14 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/2 up/down: 86/-28) Total: 58 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit e2e3802987266c98df0efdf40ad5da4b07df0113) --- editors/awk.c | 94 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 4905a5aa7..a414317f1 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -904,25 +904,23 @@ static double my_strtod(char **pp) /* -------- working with variables (set/get/copy/etc) -------- */ -static int fmt_num(char *b, int size, const char *format, double n, int int_as_int) +static void fmt_num(const char *format, double n) { - int r = 0; - char c; - const char *s = format; - - if (int_as_int && n == 
(long long)n) { - r = snprintf(b, size, "%lld", (long long)n); + if (n == (long long)n) { + snprintf(g_buf, MAXVARFMT, "%lld", (long long)n); } else { + const char *s = format; + char c; + do { c = *s; } while (c && *++s); if (strchr("diouxX", c)) { - r = snprintf(b, size, format, (int)n); + snprintf(g_buf, MAXVARFMT, format, (int)n); } else if (strchr("eEfFgGaA", c)) { - r = snprintf(b, size, format, n); + snprintf(g_buf, MAXVARFMT, format, n); } else { syntax_error(EMSG_INV_FMT); } } - return r; } static xhash *iamarray(var *a) @@ -999,7 +997,7 @@ static const char *getvar_s(var *v) { /* if v is numeric and has no cached string, convert it to string */ if ((v->type & (VF_NUMBER | VF_CACHED)) == VF_NUMBER) { - fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[CONVFMT]), v->number, TRUE); + fmt_num(getvar_s(intvar[CONVFMT]), v->number); v->string = xstrdup(g_buf); v->type |= VF_CACHED; } @@ -2315,12 +2313,9 @@ static int awk_getline(rstream *rsm, var *v) #endif static char *awk_printf(node *n, int *len) { - char *b = NULL; - char *fmt, *s, *f; - const char *s1; - int i, j, incr, bsize; - char c, c1; - var *arg; + char *b; + char *fmt, *f; + int i; //tmpvar = nvalloc(1); #define TMPVAR (&G.awk_printf__tmpvar) @@ -2333,8 +2328,14 @@ static char *awk_printf(node *n, int *len) // to evaluate() potentially recursing into another awk_printf() can't // mangle the value. 
+ b = NULL; i = 0; - while (*f) { + while (*f) { /* "print one format spec" loop */ + char *s; + char c; + char sv; + var *arg; + s = f; while (*f && (*f != '%' || *++f == '%')) f++; @@ -2343,40 +2344,55 @@ static char *awk_printf(node *n, int *len) syntax_error("%*x formats are not supported"); f++; } - - incr = (f - s) + MAXVARFMT; - b = qrealloc(b, incr + i, &bsize); c = *f; - if (c != '\0') - f++; - c1 = *f; + if (!c) { + /* Tail of fmt with no percent chars, + * or "....%" (percent seen, but no format specifier char found) + */ + goto tail; + } + sv = *++f; *f = '\0'; arg = evaluate(nextarg(&n), TMPVAR); - j = i; - if (c == 'c' || !c) { - i += sprintf(b+i, s, is_numeric(arg) ? + /* Result can be arbitrarily long. Example: + * printf "%99999s", "BOOM" + */ + if (c == 'c') { + s = xasprintf(s, is_numeric(arg) ? (char)getvar_i(arg) : *getvar_s(arg)); } else if (c == 's') { - s1 = getvar_s(arg); - b = qrealloc(b, incr+i+strlen(s1), &bsize); - i += sprintf(b+i, s, s1); + s = xasprintf(s, getvar_s(arg)); } else { - i += fmt_num(b+i, incr, s, getvar_i(arg), FALSE); + double d = getvar_i(arg); + if (strchr("diouxX", c)) { +//TODO: make it wider here (%x -> %llx etc)? + s = xasprintf(s, (int)d); + } else if (strchr("eEfFgGaA", c)) { + s = xasprintf(s, d); + } else { + syntax_error(EMSG_INV_FMT); + } } - *f = c1; + *f = sv; - /* if there was an error while sprintf, return value is negative */ - if (i < j) - i = j; + if (i == 0) { + b = s; + i = strlen(b); + continue; + } + tail: + b = xrealloc(b, i + strlen(s) + 1); + i = stpcpy(b + i, s) - b; + if (!c) /* tail? 
*/ + break; + free(s); } free(fmt); //nvfree(tmpvar, 1); #undef TMPVAR - b = xrealloc(b, i + 1); - b[i] = '\0'; #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS if (len) *len = i; @@ -2936,8 +2952,8 @@ static var *evaluate(node *op, var *res) for (;;) { var *v = evaluate(nextarg(&op1), TMPVAR0); if (v->type & VF_NUMBER) { - fmt_num(g_buf, MAXVARFMT, getvar_s(intvar[OFMT]), - getvar_i(v), TRUE); + fmt_num(getvar_s(intvar[OFMT]), + getvar_i(v)); fputs(g_buf, F); } else { fputs(getvar_s(v), F); -- 2.27.0 From 826860644307b063d826a8dfa815a89108a5274e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 11 Jul 2021 11:46:21 +0200 Subject: [PATCH 56/65] awk: rollback_token() + chain_group() == chain_until_rbrace() function old new delta parse_program 336 332 -4 Signed-off-by: Denys Vlasenko (cherry picked from commit 49c3ce64f092fd5434fc67056f312bd32f82bae3) --- editors/awk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index a414317f1..598c74285 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1778,8 +1778,7 @@ static void parse_program(char *p) cn->l.n = parse_expr(TC_SEMICOL | TC_NEWLINE | TC_EOF | TC_LBRACE); if (t_tclass == TC_LBRACE) { debug_printf_parse("%s: TC_LBRACE\n", __func__); - rollback_token(); - chain_group(); + chain_until_rbrace(); } else { /* no action, assume default "{ print }" */ debug_printf_parse("%s: !TC_LBRACE\n", __func__); -- 2.27.0 From 0879453754160313fc6fc1df1d055afb2eb48a98 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 11 Jul 2021 12:00:31 +0200 Subject: [PATCH 57/65] awk: undo TI_PRINT, it introduced a bug (print with any redirect acting as printf) function old new delta evaluate 3329 3337 +8 Patch by Ron Yorston Signed-off-by: Denys Vlasenko (cherry picked from commit 3d57a8490738d9febaa4496eba791e4fbfc91826) --- editors/awk.c | 8 +++++--- testsuite/awk.tests | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 
598c74285..00f1df36c 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -462,8 +462,7 @@ static const uint32_t tokeninfo[] = { 0, 0, /* \n */ ST_IF, ST_DO, ST_FOR, OC_BREAK, -#define TI_PRINT OC_PRINT - OC_CONTINUE, OC_DELETE|Rx, TI_PRINT, + OC_CONTINUE, OC_DELETE|Rx, OC_PRINT, OC_PRINTF, OC_NEXT, OC_NEXTFILE, OC_RETURN|Vx, OC_EXIT|Nx, ST_WHILE, @@ -2944,7 +2943,10 @@ static var *evaluate(node *op, var *res) F = rsm->F; } - if (opinfo == TI_PRINT) { + /* Can't just check 'opinfo == OC_PRINT' here, parser ORs + * additional bits to opinfos of print/printf with redirects + */ + if ((opinfo & OPCLSMASK) == OC_PRINT) { if (!op1) { fputs(getvar_s(intvar[F0]), F); } else { diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 11fd17599..e4781990f 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -450,4 +450,9 @@ testing "awk exit N propagates through END's exit" \ "42\n" \ '' '' +testing "awk print + redirect" \ + "awk 'BEGIN { print \"STDERR %s\" >\"/dev/stderr\" }' 2>&1" \ + "STDERR %s\n" \ + '' '' + exit $FAILCOUNT -- 2.27.0 From 3316efb8c5f3e6416a4662a0fc8c4d7ecc4a953f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 11 Jul 2021 12:25:33 +0200 Subject: [PATCH 58/65] awk: unbreak "printf('%c') can output NUL" testcase function old new delta awk_printf 546 593 +47 Signed-off-by: Denys Vlasenko (cherry picked from commit 4ef8841b21e27e7c7f58d3c9901c833b4fa5a862) --- editors/awk.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 00f1df36c..d77185f37 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2309,11 +2309,11 @@ static int awk_getline(rstream *rsm, var *v) #if !ENABLE_FEATURE_AWK_GNU_EXTENSIONS # define awk_printf(a, b) awk_printf(a) #endif -static char *awk_printf(node *n, int *len) +static char *awk_printf(node *n, size_t *len) { char *b; char *fmt, *f; - int i; + size_t i; //tmpvar = nvalloc(1); #define TMPVAR (&G.awk_printf__tmpvar) @@ 
-2333,6 +2333,7 @@ static char *awk_printf(node *n, int *len) char c; char sv; var *arg; + size_t slen; s = f; while (*f && (*f != '%' || *++f == '%')) @@ -2347,6 +2348,7 @@ static char *awk_printf(node *n, int *len) /* Tail of fmt with no percent chars, * or "....%" (percent seen, but no format specifier char found) */ + slen = strlen(s); goto tail; } sv = *++f; @@ -2357,31 +2359,38 @@ static char *awk_printf(node *n, int *len) * printf "%99999s", "BOOM" */ if (c == 'c') { - s = xasprintf(s, is_numeric(arg) ? - (char)getvar_i(arg) : *getvar_s(arg)); - } else if (c == 's') { - s = xasprintf(s, getvar_s(arg)); + c = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); + s = xasprintf(s, c); + /* + 1 if c == NUL: handle printf "%c" 0 case + * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ + slen = strlen(s) + (c == '\0'); } else { - double d = getvar_i(arg); - if (strchr("diouxX", c)) { -//TODO: make it wider here (%x -> %llx etc)? - s = xasprintf(s, (int)d); - } else if (strchr("eEfFgGaA", c)) { - s = xasprintf(s, d); + if (c == 's') { + s = xasprintf(s, getvar_s(arg)); } else { - syntax_error(EMSG_INV_FMT); + double d = getvar_i(arg); + if (strchr("diouxX", c)) { +//TODO: make it wider here (%x -> %llx etc)? + s = xasprintf(s, (int)d); + } else if (strchr("eEfFgGaA", c)) { + s = xasprintf(s, d); + } else { + syntax_error(EMSG_INV_FMT); + } } + slen = strlen(s); } *f = sv; if (i == 0) { b = s; - i = strlen(b); + i = slen; continue; } tail: - b = xrealloc(b, i + strlen(s) + 1); - i = stpcpy(b + i, s) - b; + b = xrealloc(b, i + slen + 1); + strcpy(b + i, s); + i += slen; if (!c) /* tail? 
*/ break; free(s); @@ -2926,7 +2935,6 @@ static var *evaluate(node *op, var *res) debug_printf_eval("PRINTF\n"); { FILE *F = stdout; - IF_FEATURE_AWK_GNU_EXTENSIONS(int len;) if (op->r.n) { rstream *rsm = newfile(R.s); @@ -2966,6 +2974,7 @@ static var *evaluate(node *op, var *res) } fputs(getvar_s(intvar[ORS]), F); } else { /* PRINTF */ + IF_FEATURE_AWK_GNU_EXTENSIONS(size_t len;) char *s = awk_printf(op1, &len); #if ENABLE_FEATURE_AWK_GNU_EXTENSIONS fwrite(s, len, 1, F); -- 2.27.0 From de6c0d93db4673817ea7f4058f2aed4f208da541 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 11 Jul 2021 12:51:43 +0200 Subject: [PATCH 59/65] awk: unbreak "cmd" | getline function old new delta evaluate 3337 3343 +6 Signed-off-by: Denys Vlasenko (cherry picked from commit 39aabfe8f033c9c62acf676b660dc979714d26a7) --- editors/awk.c | 3 ++- testsuite/awk.tests | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/editors/awk.c b/editors/awk.c index d77185f37..3691785d1 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -3156,7 +3156,8 @@ static var *evaluate(node *op, var *res) if (op1) { rsm = newfile(L.s); if (!rsm->F) { - if (opinfo == TI_PGETLINE) { + /* NB: can't use "opinfo == TI_PGETLINE", would break "cmd" | getline */ + if ((opinfo & OPCLSMASK) == OC_PGETLINE) { rsm->F = popen(L.s, "r"); rsm->is_pipe = TRUE; } else { diff --git a/testsuite/awk.tests b/testsuite/awk.tests index e4781990f..261d16947 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -455,4 +455,9 @@ testing "awk print + redirect" \ "STDERR %s\n" \ '' '' +testing "awk \"cmd\" | getline" \ + "awk 'BEGIN { \"echo HELLO\" | getline; print }'" \ + "HELLO\n" \ + '' '' + exit $FAILCOUNT -- 2.27.0 From 574499b1f2c8a95ecfb7715b110dd93f4cdd766e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 11 Jul 2021 18:16:10 +0200 Subject: [PATCH 60/65] awk: fix corner case in awk_printf Example where it wasn't working: awk 'BEGIN { printf "qwe %s rty %c uio\n", "a", 0, "c" }' - the NUL 
printing in %c caused premature stop of printing. function old new delta awk_printf 593 596 +3 Signed-off-by: Denys Vlasenko (cherry picked from commit caa93ecdd3a9b998a69dcbfafdddbc9c58887ec3) --- editors/awk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 3691785d1..65d292dcf 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2359,11 +2359,11 @@ static char *awk_printf(node *n, size_t *len) * printf "%99999s", "BOOM" */ if (c == 'c') { - c = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); - s = xasprintf(s, c); - /* + 1 if c == NUL: handle printf "%c" 0 case + char cc = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); + s = xasprintf(s, cc); + /* + 1 if cc == NUL: handle printf "%c" 0 case * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ - slen = strlen(s) + (c == '\0'); + slen = strlen(s) + (cc == '\0'); } else { if (c == 's') { s = xasprintf(s, getvar_s(arg)); -- 2.27.0 From 94a0eb5b4a6fcad000e9ded98ca80478e277da23 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 12 Jul 2021 11:27:11 +0200 Subject: [PATCH 61/65] awk: fix printf "%-10c", 0 function old new delta awk_printf 596 626 +30 Signed-off-by: Denys Vlasenko (cherry picked from commit 8d269ef85984f6476e7fdbec2c5a70f3b5c48a72) --- editors/awk.c | 9 +++++---- testsuite/awk.tests | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 65d292dcf..ed1d08de5 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2360,10 +2360,11 @@ static char *awk_printf(node *n, size_t *len) */ if (c == 'c') { char cc = is_numeric(arg) ? getvar_i(arg) : *getvar_s(arg); - s = xasprintf(s, cc); - /* + 1 if cc == NUL: handle printf "%c" 0 case - * (and printf "%22c" 0 etc, but still fails for e.g. printf "%-22c" 0) */ - slen = strlen(s) + (cc == '\0'); + char *r = xasprintf(s, cc ? 
cc : '^' /* else strlen will be wrong */); + slen = strlen(r); + if (cc == '\0') /* if cc is NUL, re-format the string with it */ + sprintf(r, s, cc); + s = r; } else { if (c == 's') { s = xasprintf(s, getvar_s(arg)); diff --git a/testsuite/awk.tests b/testsuite/awk.tests index 261d16947..c0a6257d7 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -415,6 +415,14 @@ testing "awk printf('%c') can output NUL" \ "awk '{printf(\"hello%c null\n\", 0)}'" "hello\0 null\n" "" "\n" SKIP= +optional FEATURE_AWK_GNU_EXTENSIONS +testing "awk printf('%-10c') can output NUL" \ + "awk 'BEGIN { printf \"[%-10c]\n\", 0 }' | od -tx1" "\ +0000000 5b 00 20 20 20 20 20 20 20 20 20 5d 0a +0000015 +" "" "" +SKIP= + # testing "description" "command" "result" "infile" "stdin" testing 'awk negative field access' \ 'awk 2>&1 -- '\''{ $(-1) }'\' \ -- 2.27.0 From 0a36ff146ed77e568cc4cc0df458bb29e92ef170 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 12 Jul 2021 13:30:30 +0200 Subject: [PATCH 62/65] awk: in parsing, remove superfluous NEWLINE check; optimize builtin arg evaluation function old new delta exec_builtin 1149 1145 -4 Signed-off-by: Denys Vlasenko (cherry picked from commit ab755e3717cefc06fd28ce8db56f0402412afaa3) --- editors/awk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index ed1d08de5..26509c398 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1589,8 +1589,8 @@ static void chain_group(void) chain_until_rbrace(); return; } - if (tc & (TS_OPSEQ | TC_SEMICOL | TC_NEWLINE)) { - debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL | TC_NEWLINE\n", __func__); + if (tc & (TS_OPSEQ | TC_SEMICOL)) { + debug_printf_parse("%s: TS_OPSEQ | TC_SEMICOL\n", __func__); rollback_token(); chain_expr(OC_EXEC | Vx); return; @@ -2582,10 +2582,11 @@ static NOINLINE var *exec_builtin(node *op, var *res) av[2] = av[3] = NULL; for (i = 0; i < 4 && op; i++) { an[i] = nextarg(&op); - if (isr & 0x09000000) + if (isr & 
0x09000000) { av[i] = evaluate(an[i], TMPVAR(i)); - if (isr & 0x08000000) - as[i] = getvar_s(av[i]); + if (isr & 0x08000000) + as[i] = getvar_s(av[i]); + } isr >>= 1; } -- 2.27.0 From 5a58022237a3ce26ebf7a7c85daa6254039ecd15 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 14 Jul 2021 14:25:07 +0200 Subject: [PATCH 63/65] awk: tighten parsing - disallow extra semicolons '; BEGIN {...}' and 'BEGIN {...} ;; {...}' are not accepted by gawk function old new delta parse_program 332 353 +21 Signed-off-by: Denys Vlasenko (cherry picked from commit d62627487a44d9175b05d49846aeef83fed97019) --- editors/awk.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 26509c398..b491f5ce9 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1634,7 +1634,7 @@ static void chain_group(void) debug_printf_parse("%s: ST_FOR\n", __func__); next_token(TC_LPAREN); n2 = parse_expr(TC_SEMICOL | TC_RPAREN); - if (t_tclass & TC_RPAREN) { /* for-in */ + if (t_tclass & TC_RPAREN) { /* for (I in ARRAY) */ if (!n2 || n2->info != TI_IN) syntax_error(EMSG_UNEXP_TOKEN); n = chain_node(OC_WALKINIT | VV); @@ -1700,20 +1700,15 @@ static void parse_program(char *p) for (;;) { uint32_t tclass; - tclass = next_token(TC_EOF | TS_OPSEQ | TC_LBRACE | - TC_SEMICOL | TC_NEWLINE | TC_BEGIN | TC_END | TC_FUNCDECL); - + tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL + | TC_EOF | TC_NEWLINE /* but not TC_SEMICOL */); + got_tok: if (tclass == TC_EOF) { debug_printf_parse("%s: TC_EOF\n", __func__); break; } - if (tclass & (TC_SEMICOL | TC_NEWLINE)) { - debug_printf_parse("%s: TC_SEMICOL | TC_NEWLINE\n", __func__); -//NB: gawk allows many newlines, but does not allow more than one semicolon: -// BEGIN {...};; -//would complain "each rule must have a pattern or an action part". 
-//Same message for -// ; BEGIN {...} + if (tclass == TC_NEWLINE) { + debug_printf_parse("%s: TC_NEWLINE\n", __func__); continue; } if (tclass == TC_BEGIN) { @@ -1722,7 +1717,7 @@ static void parse_program(char *p) /* ensure there is no newline between BEGIN and { */ next_token(TC_LBRACE); chain_until_rbrace(); - continue; + goto next_tok; } if (tclass == TC_END) { debug_printf_parse("%s: TC_END\n", __func__); @@ -1730,7 +1725,7 @@ static void parse_program(char *p) /* ensure there is no newline between END and { */ next_token(TC_LBRACE); chain_until_rbrace(); - continue; + goto next_tok; } if (tclass == TC_FUNCDECL) { func *f; @@ -1765,7 +1760,7 @@ static void parse_program(char *p) continue; chain_until_rbrace(); hash_clear(ahash); - continue; + goto next_tok; } seq = &mainseq; if (tclass & TS_OPSEQ) { @@ -1784,12 +1779,25 @@ static void parse_program(char *p) chain_node(OC_PRINT); } cn->r.n = mainseq.last; - continue; + goto next_tok; } /* tclass == TC_LBRACE */ debug_printf_parse("%s: TC_LBRACE(?)\n", __func__); chain_until_rbrace(); - } + next_tok: + /* Same as next_token() at the top of the loop, + TC_SEMICOL */ + tclass = next_token(TS_OPSEQ | TC_LBRACE | TC_BEGIN | TC_END | TC_FUNCDECL + | TC_EOF | TC_NEWLINE | TC_SEMICOL); + /* gawk allows many newlines, but does not allow more than one semicolon: + * BEGIN {...};; + * would complain "each rule must have a pattern or an action part". 
+ * Same message for + * ; BEGIN {...} + */ + if (tclass != TC_SEMICOL) + goto got_tok; /* use this token */ + /* else: loop back - ate the semicolon, get and use _next_ token */ + } /* for (;;) */ } -- 2.27.0 From b3194ba37205897e9f807ea73881aaeac090aa75 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 14 Jul 2021 14:33:37 +0200 Subject: [PATCH 64/65] awk: disallow break/continue outside of loops function old new delta .rodata 104139 104186 +47 chain_group 610 633 +23 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 2/0 up/down: 70/0) Total: 70 bytes Signed-off-by: Denys Vlasenko (cherry picked from commit d3480dd58211d9d8c06ec7ef00089262603003ff) --- editors/awk.c | 6 ++++-- testsuite/awk.tests | 9 ++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index b491f5ce9..51924c057 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -1671,16 +1671,18 @@ static void chain_group(void) case OC_BREAK: debug_printf_parse("%s: OC_BREAK\n", __func__); n = chain_node(OC_EXEC); + if (!break_ptr) + syntax_error("'break' not in a loop"); n->a.n = break_ptr; -//TODO: if break_ptr is NULL, syntax error (not in the loop)? chain_expr(t_info); break; case OC_CONTINUE: debug_printf_parse("%s: OC_CONTINUE\n", __func__); n = chain_node(OC_EXEC); + if (!continue_ptr) + syntax_error("'continue' not in a loop"); n->a.n = continue_ptr; -//TODO: if continue_ptr is NULL, syntax error (not in the loop)? chain_expr(t_info); break; diff --git a/testsuite/awk.tests b/testsuite/awk.tests index c0a6257d7..64ca9fd9f 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -379,19 +379,14 @@ testing "awk -e and ARGC" \ "" SKIP= -# The examples are in fact not valid awk programs (break/continue -# can only be used inside loops). -# But we do accept them outside of loops. -# We had a bug with misparsing "break ; else" sequence. 
-# Test that *that* bug is fixed, using simplest possible scripts: testing "awk break" \ "awk -f - 2>&1; echo \$?" \ - "0\n" \ + "awk: -:1: 'break' not in a loop\n1\n" \ "" \ 'BEGIN { if (1) break; else a = 1 }' testing "awk continue" \ "awk -f - 2>&1; echo \$?" \ - "0\n" \ + "awk: -:1: 'continue' not in a loop\n1\n" \ "" \ 'BEGIN { if (1) continue; else a = 1 }' -- 2.27.0 From e36503b06e052abef68de4d746700795d50cbcf1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 14 Jul 2021 16:58:05 +0200 Subject: [PATCH 65/65] awk: whitespace and debugging tweaks Signed-off-by: Denys Vlasenko (cherry picked from commit dabbeeb79356eef78528acd55e1f143ae80372f7) --- editors/awk.c | 133 +++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/editors/awk.c b/editors/awk.c index 51924c057..62cd019f1 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -199,77 +199,78 @@ typedef struct tsplitter_s { /* simple token classes */ /* order and hex values are very important!!! See next_token() */ -#define TC_LPAREN (1 << 0) /* ( */ -#define TC_RPAREN (1 << 1) /* ) */ -#define TC_REGEXP (1 << 2) /* /.../ */ -#define TC_OUTRDR (1 << 3) /* | > >> */ -#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ -#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ -#define TC_BINOPX (1 << 6) /* two-opnd operator */ -#define TC_IN (1 << 7) /* 'in' */ -#define TC_COMMA (1 << 8) /* , */ -#define TC_PIPE (1 << 9) /* input redirection pipe | */ -#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ -#define TC_ARRTERM (1 << 11) /* ] */ -#define TC_LBRACE (1 << 12) /* { */ -#define TC_RBRACE (1 << 13) /* } */ -#define TC_SEMICOL (1 << 14) /* ; */ -#define TC_NEWLINE (1 << 15) -#define TC_STATX (1 << 16) /* ctl statement (for, next...) 
*/ -#define TC_WHILE (1 << 17) /* 'while' */ -#define TC_ELSE (1 << 18) /* 'else' */ -#define TC_BUILTIN (1 << 19) +#define TC_LPAREN (1 << 0) /* ( */ +#define TC_RPAREN (1 << 1) /* ) */ +#define TC_REGEXP (1 << 2) /* /.../ */ +#define TC_OUTRDR (1 << 3) /* | > >> */ +#define TC_UOPPOST (1 << 4) /* unary postfix operator ++ -- */ +#define TC_UOPPRE1 (1 << 5) /* unary prefix operator ++ -- $ */ +#define TC_BINOPX (1 << 6) /* two-opnd operator */ +#define TC_IN (1 << 7) /* 'in' */ +#define TC_COMMA (1 << 8) /* , */ +#define TC_PIPE (1 << 9) /* input redirection pipe | */ +#define TC_UOPPRE2 (1 << 10) /* unary prefix operator + - ! */ +#define TC_ARRTERM (1 << 11) /* ] */ +#define TC_LBRACE (1 << 12) /* { */ +#define TC_RBRACE (1 << 13) /* } */ +#define TC_SEMICOL (1 << 14) /* ; */ +#define TC_NEWLINE (1 << 15) +#define TC_STATX (1 << 16) /* ctl statement (for, next...) */ +#define TC_WHILE (1 << 17) /* 'while' */ +#define TC_ELSE (1 << 18) /* 'else' */ +#define TC_BUILTIN (1 << 19) /* This costs ~50 bytes of code. * A separate class to support deprecated "length" form. If we don't need that * (i.e. if we demand that only "length()" with () is valid), then TC_LENGTH * can be merged with TC_BUILTIN: */ -#define TC_LENGTH (1 << 20) /* 'length' */ -#define TC_GETLINE (1 << 21) /* 'getline' */ -#define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ -#define TC_BEGIN (1 << 23) /* 'BEGIN' */ -#define TC_END (1 << 24) /* 'END' */ -#define TC_EOF (1 << 25) -#define TC_VARIABLE (1 << 26) /* name */ -#define TC_ARRAY (1 << 27) /* name[ */ -#define TC_FUNCTION (1 << 28) /* name( */ -#define TC_STRING (1 << 29) /* "..." 
*/ -#define TC_NUMBER (1 << 30) +#define TC_LENGTH (1 << 20) /* 'length' */ +#define TC_GETLINE (1 << 21) /* 'getline' */ +#define TC_FUNCDECL (1 << 22) /* 'function' 'func' */ +#define TC_BEGIN (1 << 23) /* 'BEGIN' */ +#define TC_END (1 << 24) /* 'END' */ +#define TC_EOF (1 << 25) +#define TC_VARIABLE (1 << 26) /* name */ +#define TC_ARRAY (1 << 27) /* name[ */ +#define TC_FUNCTION (1 << 28) /* name( */ +#define TC_STRING (1 << 29) /* "..." */ +#define TC_NUMBER (1 << 30) #ifndef debug_parse_print_tc -#define debug_parse_print_tc(n) do { \ -if ((n) & TC_LPAREN ) debug_printf_parse(" LPAREN" ); \ -if ((n) & TC_RPAREN ) debug_printf_parse(" RPAREN" ); \ -if ((n) & TC_REGEXP ) debug_printf_parse(" REGEXP" ); \ -if ((n) & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); \ -if ((n) & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); \ -if ((n) & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); \ -if ((n) & TC_BINOPX ) debug_printf_parse(" BINOPX" ); \ -if ((n) & TC_IN ) debug_printf_parse(" IN" ); \ -if ((n) & TC_COMMA ) debug_printf_parse(" COMMA" ); \ -if ((n) & TC_PIPE ) debug_printf_parse(" PIPE" ); \ -if ((n) & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); \ -if ((n) & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); \ -if ((n) & TC_LBRACE ) debug_printf_parse(" LBRACE" ); \ -if ((n) & TC_RBRACE ) debug_printf_parse(" RBRACE" ); \ -if ((n) & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); \ -if ((n) & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); \ -if ((n) & TC_STATX ) debug_printf_parse(" STATX" ); \ -if ((n) & TC_WHILE ) debug_printf_parse(" WHILE" ); \ -if ((n) & TC_ELSE ) debug_printf_parse(" ELSE" ); \ -if ((n) & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); \ -if ((n) & TC_LENGTH ) debug_printf_parse(" LENGTH" ); \ -if ((n) & TC_GETLINE ) debug_printf_parse(" GETLINE" ); \ -if ((n) & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); \ -if ((n) & TC_BEGIN ) debug_printf_parse(" BEGIN" ); \ -if ((n) & TC_END ) debug_printf_parse(" END" ); \ -if ((n) & TC_EOF ) 
debug_printf_parse(" EOF" ); \ -if ((n) & TC_VARIABLE) debug_printf_parse(" VARIABLE"); \ -if ((n) & TC_ARRAY ) debug_printf_parse(" ARRAY" ); \ -if ((n) & TC_FUNCTION) debug_printf_parse(" FUNCTION"); \ -if ((n) & TC_STRING ) debug_printf_parse(" STRING" ); \ -if ((n) & TC_NUMBER ) debug_printf_parse(" NUMBER" ); \ -} while (0) +static void debug_parse_print_tc(uint32_t n) +{ + if (n & TC_LPAREN ) debug_printf_parse(" LPAREN" ); + if (n & TC_RPAREN ) debug_printf_parse(" RPAREN" ); + if (n & TC_REGEXP ) debug_printf_parse(" REGEXP" ); + if (n & TC_OUTRDR ) debug_printf_parse(" OUTRDR" ); + if (n & TC_UOPPOST ) debug_printf_parse(" UOPPOST" ); + if (n & TC_UOPPRE1 ) debug_printf_parse(" UOPPRE1" ); + if (n & TC_BINOPX ) debug_printf_parse(" BINOPX" ); + if (n & TC_IN ) debug_printf_parse(" IN" ); + if (n & TC_COMMA ) debug_printf_parse(" COMMA" ); + if (n & TC_PIPE ) debug_printf_parse(" PIPE" ); + if (n & TC_UOPPRE2 ) debug_printf_parse(" UOPPRE2" ); + if (n & TC_ARRTERM ) debug_printf_parse(" ARRTERM" ); + if (n & TC_LBRACE ) debug_printf_parse(" LBRACE" ); + if (n & TC_RBRACE ) debug_printf_parse(" RBRACE" ); + if (n & TC_SEMICOL ) debug_printf_parse(" SEMICOL" ); + if (n & TC_NEWLINE ) debug_printf_parse(" NEWLINE" ); + if (n & TC_STATX ) debug_printf_parse(" STATX" ); + if (n & TC_WHILE ) debug_printf_parse(" WHILE" ); + if (n & TC_ELSE ) debug_printf_parse(" ELSE" ); + if (n & TC_BUILTIN ) debug_printf_parse(" BUILTIN" ); + if (n & TC_LENGTH ) debug_printf_parse(" LENGTH" ); + if (n & TC_GETLINE ) debug_printf_parse(" GETLINE" ); + if (n & TC_FUNCDECL) debug_printf_parse(" FUNCDECL"); + if (n & TC_BEGIN ) debug_printf_parse(" BEGIN" ); + if (n & TC_END ) debug_printf_parse(" END" ); + if (n & TC_EOF ) debug_printf_parse(" EOF" ); + if (n & TC_VARIABLE) debug_printf_parse(" VARIABLE"); + if (n & TC_ARRAY ) debug_printf_parse(" ARRAY" ); + if (n & TC_FUNCTION) debug_printf_parse(" FUNCTION"); + if (n & TC_STRING ) debug_printf_parse(" STRING" ); + if (n & 
TC_NUMBER ) debug_printf_parse(" NUMBER" ); +} #endif /* combined token classes ("token [class] sets") */ @@ -417,7 +418,7 @@ static const char tokenlist[] ALIGN1 = "\5close" "\6system" "\6fflush" "\5atan2" "\3cos" "\3exp" "\3int" "\3log" "\4rand" "\3sin" "\4sqrt" "\5srand" - "\6gensub" "\4gsub" "\5index" /* "\6length" was here */ + "\6gensub" "\4gsub" "\5index" /* "\6length" was here */ "\5match" "\5split" "\7sprintf" "\3sub" "\6substr" "\7systime" "\10strftime" "\6mktime" "\7tolower" "\7toupper" NTC @@ -1802,7 +1803,6 @@ static void parse_program(char *p) } /* for (;;) */ } - /* -------- program execution part -------- */ /* temporary variables allocator */ @@ -3510,7 +3510,6 @@ static var *evaluate(node *op, var *res) #undef sreg } - /* -------- main & co. -------- */ static int awk_exit(void) -- 2.27.0