From 8374ca9a2f66ad1b36dbd4b53abba9c692fccee6 Mon Sep 17 00:00:00 2001 From: Sajan Karumanchi Date: Tue, 3 Sep 2024 08:23:27 +0000 Subject: [PATCH] backport-x86-Optimizing-memcpy-for-AMD-Zen-architecture Modify the shareable cache size '__x86_shared_cache_size', which is a factor in computing the non-temporal threshold parameter '__x86_shared_non_temporal_threshold', to optimize memcpy for AMD Zen architectures. In the existing implementation, the shareable cache is computed as 'L3 per thread, L2 per core'. Recomputing this shareable cache as 'L3 per CCX (Core Complex)' brings performance gains. According to the results of the large bench variant, this patch also addresses the regression on AMD Zen architectures. Origin backport: https://sourceware.org/git/?p=glibc.git;a=commit;h=8813b2682e4094e43b0cf1634e99619f1b8b2c62 --- sysdeps/x86/cacheinfo.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c index 5b43fa78..37a03af0 100644 --- a/sysdeps/x86/cacheinfo.c +++ b/sysdeps/x86/cacheinfo.c @@ -728,7 +728,7 @@ intel_bug_no_cache_info: threads = 1 << ((ecx >> 12) & 0x0f); } - if (threads == 0) + if (threads == 0 || cpu_features->family >= 0x17) { /* If APIC ID width is not available, use logical processor count. */ @@ -743,8 +743,22 @@ intel_bug_no_cache_info: if (threads > 0) shared /= threads; - /* Account for exclusive L2 and L3 caches. */ - shared += core; + /* Get shared cache per ccx for Zen architectures. */ + if (cpu_features->family >= 0x17) + { + unsigned int eax; + + /* Get number of threads share the L3 cache in CCX. */ + __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); + + unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; + shared *= threads_per_ccx; + } + else + { + /* Account for exclusive L2 and L3 caches. */ + shared += core; + } } #ifndef DISABLE_PREFETCHW -- 2.27.0