89 lines
2.8 KiB
Diff
89 lines
2.8 KiB
Diff
From 5e3b760f65f13856e5717e5b9d935f5b4a615be3 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
|
|
Date: Fri, 6 Jan 2023 19:34:56 -0800
|
|
Subject: [PATCH] pcre: use UCP in UTF mode
|
|
|
|
This fixes a serious bug affecting word-boundary and word-constituent regular
|
|
expressions when the desired match involves non-ASCII UTF8 characters.
|
|
* src/pcresearch.c: Set PCRE_UCP together with PCRE_UTF8 (PCRE2_UCP and PCRE2_UTF upstream).
|
|
* tests/pcre-utf8-w: New file.
|
|
* tests/Makefile.am (TESTS): Add it.
|
|
* NEWS (Bug fixes): Mention this.
|
|
* THANKS.in: Add Gro-Tsen and Karl Pettersson.
|
|
Reported by Gro-Tsen https://twitter.com/gro_tsen/status/1610972356972875777
|
|
via Karl Pettersson in https://github.com/PCRE2Project/pcre2/issues/185
|
|
This bug was present from grep-2.5, when --perl-regexp (-P) support was added.
|
|
|
|
Reference:https://git.savannah.gnu.org/cgit/grep.git/commit?id=5e3b760f65f13856e5717e5b9d935f5b4a615be3
|
|
Conflict: delete NEWS, THANKS.in and change src/pcresearch.c
|
|
---
|
|
src/pcresearch.c | 2 +-
|
|
tests/Makefile.am | 1 +
|
|
tests/pcre-utf8-w | 28 ++++++++++++++++++++++++++++
|
|
3 files changed, 30 insertions(+), 1 deletion(-)
|
|
create mode 100755 tests/pcre-utf8-w
|
|
|
|
diff --git a/src/pcresearch.c b/src/pcresearch.c
|
|
index 577995f..0127073 100644
|
|
--- a/src/pcresearch.c
|
|
+++ b/src/pcresearch.c
|
|
@@ -136,7 +136,7 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact)
|
|
{
|
|
if (! localeinfo.using_utf8)
|
|
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
|
|
- flags |= PCRE_UTF8;
|
|
+ flags |= (PCRE_UTF8 | PCRE_UCP);
|
|
}
|
|
|
|
/* FIXME: Remove this restriction. */
|
|
diff --git a/tests/Makefile.am b/tests/Makefile.am
|
|
index b05a126..d2968c6 100644
|
|
--- a/tests/Makefile.am
|
|
+++ b/tests/Makefile.am
|
|
@@ -143,6 +143,7 @@ TESTS = \
|
|
pcre-jitstack \
|
|
pcre-o \
|
|
pcre-utf8 \
|
|
+ pcre-utf8-w \
|
|
pcre-w \
|
|
pcre-wx-backref \
|
|
pcre-z \
|
|
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
|
|
new file mode 100755
|
|
index 0000000..4cd7db6
|
|
--- /dev/null
|
|
+++ b/tests/pcre-utf8-w
|
|
@@ -0,0 +1,28 @@
|
|
+#!/bin/sh
|
|
+# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent
|
|
+#
|
|
+# Copyright (C) 2023 Free Software Foundation, Inc.
|
|
+#
|
|
+# Copying and distribution of this file, with or without modification,
|
|
+# are permitted in any medium without royalty provided the copyright
|
|
+# notice and this notice are preserved.
|
|
+
|
|
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
|
+require_en_utf8_locale_
|
|
+LC_ALL=en_US.UTF-8
|
|
+export LC_ALL
|
|
+require_pcre_
|
|
+
|
|
+fail=0
|
|
+
|
|
+echo 'Perú'> in || framework_failure_
|
|
+
|
|
+echo 'ú' > exp || framework_failure_
|
|
+grep -Po '.\b' in > out || fail=1
|
|
+compare exp out || fail=1
|
|
+
|
|
+echo 'rú' > exp || framework_failure_
|
|
+grep -Po 'r\w' in > out || fail=1
|
|
+compare exp out || fail=1
|
|
+
|
|
+Exit $fail
|
|
--
|
|
2.27.0
|
|
|
|
|