62 lines
2.0 KiB
Diff
62 lines
2.0 KiB
Diff
|
|
From 8374ca9a2f66ad1b36dbd4b53abba9c692fccee6 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Sajan Karumanchi <sajan.karumanchi@amd.com>
|
||
|
|
Date: Tue, 3 Sep 2024 08:23:27 +0000
|
||
|
|
Subject: [PATCH] backport-x86-Optimizing-memcpy-for-AMD-Zen-architecture
|
||
|
|
|
||
|
|
Modify the shareable cache '__x86_shared_cache_size', which is a
|
||
|
|
factor in computing the non-temporal threshold parameter
|
||
|
|
'__x86_shared_non_temporal_threshold', to optimize memcpy for AMD Zen
|
||
|
|
architectures.
|
||
|
|
In the existing implementation, the shareable cache is computed as 'L3
|
||
|
|
per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
|
||
|
|
CCX (Core Complex)' has brought performance gains.
|
||
|
|
As per the large bench variant results, this patch also addresses the
|
||
|
|
regression problem on AMD Zen architectures.
|
||
|
|
|
||
|
|
Origin backport: https://sourceware.org/git/?p=glibc.git;a=commit;h=8813b2682e4094e43b0cf1634e99619f1b8b2c62
|
||
|
|
---
|
||
|
|
sysdeps/x86/cacheinfo.c | 20 +++++++++++++++++---
|
||
|
|
1 file changed, 17 insertions(+), 3 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
|
||
|
|
index 5b43fa78..37a03af0 100644
|
||
|
|
--- a/sysdeps/x86/cacheinfo.c
|
||
|
|
+++ b/sysdeps/x86/cacheinfo.c
|
||
|
|
@@ -728,7 +728,7 @@ intel_bug_no_cache_info:
|
||
|
|
threads = 1 << ((ecx >> 12) & 0x0f);
|
||
|
|
}
|
||
|
|
|
||
|
|
- if (threads == 0)
|
||
|
|
+ if (threads == 0 || cpu_features->family >= 0x17)
|
||
|
|
{
|
||
|
|
/* If APIC ID width is not available, use logical
|
||
|
|
processor count. */
|
||
|
|
@@ -743,8 +743,22 @@ intel_bug_no_cache_info:
|
||
|
|
if (threads > 0)
|
||
|
|
shared /= threads;
|
||
|
|
|
||
|
|
- /* Account for exclusive L2 and L3 caches. */
|
||
|
|
- shared += core;
|
||
|
|
+ /* Get shared cache per ccx for Zen architectures. */
|
||
|
|
+ if (cpu_features->family >= 0x17)
|
||
|
|
+ {
|
||
|
|
+ unsigned int eax;
|
||
|
|
+
|
||
|
|
+ /* Get number of threads sharing the L3 cache in CCX. */
|
||
|
|
+ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
|
||
|
|
+
|
||
|
|
+ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
|
||
|
|
+ shared *= threads_per_ccx;
|
||
|
|
+ }
|
||
|
|
+ else
|
||
|
|
+ {
|
||
|
|
+ /* Account for exclusive L2 and L3 caches. */
|
||
|
|
+ shared += core;
|
||
|
|
+ }
|
||
|
|
}
|
||
|
|
|
||
|
|
#ifndef DISABLE_PREFETCHW
|
||
|
|
--
|
||
|
|
2.27.0
|
||
|
|
|