x86: backport x86 optimization patches

Backport three x86 optimization patches. The first patch, 'Reversing calculation of __x86_shared_non_temporal_threshold', changes the calculation of __x86_shared_non_temporal_threshold back from 3/4 of the entire shared cache size of a multi-threaded system to 3/4 of one thread's share of the cache size, which improves memcpy performance. The second patch, 'Optimizing memcpy for AMD Zen architecture', recomputes the shareable cache as 'L3 per CCX (Core Complex)', which improves performance on AMD processors. The third patch, 'Add Hygon Dhyana support', fixes the Hygon Dhyana processor CPU vendor ID detection problem in the glibc sysdep module.
2024-09-04 06:01:47 +00:00 · 2024-09-04 06:01:47 +00:00 · 8dee2397c0
commit 8dee2397c0
parent a0a8755c54
4 changed files with 206 additions and 1 deletions
--- a/backport-Reversing-calculation-of-__x86_shared_non_t.patch
+++ b/backport-Reversing-calculation-of-__x86_shared_non_t.patch
@ -0,0 +1,95 @@
+From 4d1d91c7fdb52e847a6a7ff096736968e10c6509 Mon Sep 17 00:00:00 2001
+From: Patrick McGehearty <patrick.mcgehearty@oracle.com>
+Date: Wed, 4 Sep 2024 05:47:15 +0000
+Subject: [PATCH] 
+ backport-Reversing-calculation-of-__x86_shared_non_temporal_threshold
+
+The __x86_shared_non_temporal_threshold determines when memcpy on x86
+uses non_temporal stores to avoid pushing other data out of the last
+level cache.
+
+This patch proposes to revert the calculation change made by H.J. Lu's
+patch of June 2, 2017.
+
+H.J. Lu's patch selected a threshold suitable for a single thread
+getting maximum performance. It was tuned using the single threaded
+large memcpy micro benchmark on an 8 core processor. The last change
+changes the threshold from using 3/4 of one thread's share of the
+cache to using 3/4 of the entire cache of a multi-threaded system
+before switching to non-temporal stores. Multi-threaded systems with
+more than a few threads are server-class and typically have many
+active threads. If one thread consumes 3/4 of the available cache for
+all threads, it will cause other active threads to have data removed
+from the cache. Two examples show the range of the effect. John
+McCalpin's widely parallel Stream benchmark, which runs in parallel
+and fetches data sequentially, saw a 20% slowdown with this patch on
+an internal system test of 128 threads. This regression was discovered
+when comparing OL8 performance to OL7.  An example that compares
+normal stores to non-temporal stores may be found at
+https://vgatherps.github.io/2018-09-02-nontemporal/.  A simple test
+shows performance loss of 400 to 500% due to a failure to use
+nontemporal stores. These performance losses are most likely to occur
+when the system load is heaviest and good performance is critical.
+
+The tunable x86_non_temporal_threshold can be used to override the
+default for the knowledgeable user who really wants maximum cache
+allocation to a single thread in a multi-threaded system.
+The manual entry for the tunable has been expanded to provide
+more information about its purpose.
+
+Origin backport: https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=d3c57027470
+---
+ manual/tunables.texi    |  6 +++++-
+ sysdeps/x86/cacheinfo.c | 16 +++++++++++-----
+ 2 files changed, 16 insertions(+), 6 deletions(-)
+
+diff --git a/manual/tunables.texi b/manual/tunables.texi
+index 124b39b6..79347bf3 100644
+--- a/manual/tunables.texi
+++ b/manual/tunables.texi
+@@ -352,7 +352,11 @@ set shared cache size in bytes for use in memory and string routines.
+ 
+ @deftp Tunable glibc.tune.x86_non_temporal_threshold
+ The @code{glibc.tune.x86_non_temporal_threshold} tunable allows the user
+-to set threshold in bytes for non temporal store.
+to set threshold in bytes for non temporal store. Non temporal stores
+give a hint to the hardware to move data directly to memory without
+displacing other data from the cache. This tunable is used by some
+platforms to determine when to use non temporal stores in operations
+like memmove and memcpy.
+ 
+ This tunable is specific to i386 and x86-64.
+ @end deftp
+diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
+index 28bcf2f6..5b43fa78 100644
+--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
+@@ -784,14 +784,20 @@ intel_bug_no_cache_info:
+       __x86_shared_cache_size = shared;
+     }
+ 
+-  /* The large memcpy micro benchmark in glibc shows that 6 times of
+-     shared cache size is the approximate value above which non-temporal
+-     store becomes faster on a 8-core processor.  This is the 3/4 of the
+-     total shared cache size.  */
+  /* The default setting for the non_temporal threshold is 3/4 of one
+     thread's share of the chip's cache. For most Intel and AMD processors
+     with an initial release date between 2017 and 2020, a thread's typical
+     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
+     threshold leaves 125 KBytes to 500 KBytes of the thread's data
+     in cache after a maximum temporal copy, which will maintain
+     in cache a reasonable portion of the thread's stack and other
+     active data. If the threshold is set higher than one thread's
+     share of the cache, it has a substantial risk of negatively
+     impacting the performance of other threads running on the chip. */
+   __x86_shared_non_temporal_threshold
+     = (cpu_features->non_temporal_threshold != 0
+        ? cpu_features->non_temporal_threshold
+-       : __x86_shared_cache_size * threads * 3 / 4);
+       : __x86_shared_cache_size * 3 / 4);
+ }
+ 
+ #endif
+-- 
+2.27.0
+
--- a/backport-x86-Add-Hygon-support.patch
+++ b/backport-x86-Add-Hygon-support.patch
@ -0,0 +1,38 @@
+From 2b46bc9b5a148f6da198321a8396a6c2c6a1b070 Mon Sep 17 00:00:00 2001
+From: Feifei Wang1994 <wangfeifei@hygon.cn>
+Date: Tue, 3 Sep 2024 08:30:43 +0000
+Subject: [PATCH] backport-x86-Add-Hygon-support
+
+This patch fixes the Hygon processor CPU vendor ID detection problem
+in the glibc sysdep module. The current glibc-2.28 does not recognize
+the Hygon CPU vendor ID ("HygonGenuine") and sets kind to arch_kind_other,
+which results in an incorrect zero value from the __cache_sysconf() call.
+
+This patch adds a Hygon CPU vendor ID check, sets kind to arch_kind_amd,
+and reuses the AMD code path, which leads to the correct return value from the
+__cache_sysconf() call. Test cases show no failures with this patch on the Hygon architecture.
+
+Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
+---
+ sysdeps/x86/cpu-features.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index ea0b64fd..4b1a0169 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -344,8 +344,9 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	cpu_features->feature[index_arch_Prefer_No_AVX512]
+ 	  |= bit_arch_Prefer_No_AVX512;
+     }
+-  /* This spells out "AuthenticAMD".  */
+-  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+   /* This spells out "AuthenticAMD" or "HygonGenuine".  */
+  else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+	       || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
+     {
+       unsigned int extended_model, stepping;
+ 
+-- 
+2.27.0
+
--- a/backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch
+++ b/backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch
@ -0,0 +1,61 @@
+From 8374ca9a2f66ad1b36dbd4b53abba9c692fccee6 Mon Sep 17 00:00:00 2001
+From: Sajan Karumanchi <sajan.karumanchi@amd.com>
+Date: Tue, 3 Sep 2024 08:23:27 +0000
+Subject: [PATCH] backport-x86-Optimizing-memcpy-for-AMD-Zen-architecture
+
+Modifying the shareable cache '__x86_shared_cache_size', which is a
+factor in computing the non-temporal threshold parameter
+'__x86_shared_non_temporal_threshold' to optimize memcpy for AMD Zen
+architectures.
+In the existing implementation, the shareable cache is computed as 'L3
+per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
+CCX(Core-Complex)' has brought in performance gains.
+As per the large bench variant results, this patch also addresses the
+regression problem on AMD Zen architectures.
+
+Origin backport: https://sourceware.org/git/?p=glibc.git;a=commit;h=8813b2682e4094e43b0cf1634e99619f1b8b2c62
+---
+ sysdeps/x86/cacheinfo.c | 20 +++++++++++++++++---
+ 1 file changed, 17 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
+index 5b43fa78..37a03af0 100644
+--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
+@@ -728,7 +728,7 @@ intel_bug_no_cache_info:
+ 	      threads = 1 << ((ecx >> 12) & 0x0f);
+ 	    }
+ 
+-	  if (threads == 0)
+	  if (threads == 0 || cpu_features->family >= 0x17)
+ 	    {
+ 	      /* If APIC ID width is not available, use logical
+ 		 processor count.  */
+@@ -743,8 +743,22 @@ intel_bug_no_cache_info:
+ 	  if (threads > 0)
+ 	    shared /= threads;
+ 
+-	  /* Account for exclusive L2 and L3 caches.  */
+-	  shared += core;
+	  /* Get shared cache per ccx for Zen architectures.  */
+	  if (cpu_features->family >= 0x17)
+	     {
+	        unsigned int eax;
+
+		/* Get number of threads share the L3 cache in CCX.  */
+		__cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+
+		unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+		shared *= threads_per_ccx;
+	     }
+	   else
+	     {
+		/* Account for exclusive L2 and L3 caches.  */
+		shared += core;
+	     }
+ 	}
+ 
+ #ifndef DISABLE_PREFETCHW
+-- 
+2.27.0
+
--- a/glibc.spec
+++ b/glibc.spec
@ -62,7 +62,7 @@
 ##############################################################################
 Name: 	 	glibc
 Version: 	2.28
-Release: 	101
+Release: 	102
 Summary: 	The GNU libc libraries
 License:	%{all_license}
 URL: 		http://www.gnu.org/software/glibc/
@ -167,6 +167,9 @@ Patch80: backport-CVE-2024-33601-CVE-2024-33602-nscd-Use-two-buffer-in-addgetnet
 Patch81: iconv-ISO-2022-CN-EXT-fix-out-of-bound-writes-when-w.patch
 Patch82: backport-Use-errval-not-errno-to-guide-cache-update.patch
 Patch83: backport-Skip-unusable-entries-in-first-pass-in-prune_cache.patch
+Patch84: backport-Reversing-calculation-of-__x86_shared_non_t.patch
+Patch85: backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch
+Patch86: backport-x86-Add-Hygon-support.patch

 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)

@ -1283,6 +1286,14 @@ fi
 %endif

 %changelog
+* Tue Sep 03 2024 Feifei Wang <wangfeifei@hygon.cn> - 2.28-102
+- x86: Reversing calculation of __x86_shared_non_temporal_threshold
+  x86: Optimizing memcpy for AMD Zen architecture
+  x86: Add Hygon Dhyana support
+  https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=d3c57027470b
+  https://sourceware.org/git/?p=glibc.git;a=commit;h=59803e81f96b479c17f583b31eac44b57591a1bf
+  https://sourceware.org/git/?p=glibc.git;a=commit;h=ade8b817fead73b302d08c88cd44ea2ea56793d4
+
 * Mon May 06 2024 chengyechun <chengyechun1@huaiwe.com> - 2.28-101
 - Type:bugfix
 - ID: