bugfix: remove cpu logical index counting code
add relationship between this patch and other patches
This commit is contained in:
parent
567ecf057c
commit
afcde7da58
234
bugfix-modify-the-way-counting-cpu-logical-index.patch
Normal file
234
bugfix-modify-the-way-counting-cpu-logical-index.patch
Normal file
@ -0,0 +1,234 @@
|
||||
From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Wed, 24 Nov 2021 09:43:52 +0800
|
||||
Subject: [PATCH] modify the way of counting cpu logical index
|
||||
|
||||
It's hard to compute the cpu logical index from the mpidr in userspace,
|
||||
so the index is now computed in the kernel before being reported to userspace.
|
||||
|
||||
Related patches:
|
||||
0006-add-cpu-online-fault-isolation.patch
|
||||
0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
||||
|
||||
---
|
||||
ras-arm-handler.c | 8 ++-
|
||||
ras-cpu-isolation.c | 127 ++------------------------------------------
|
||||
ras-cpu-isolation.h | 6 +--
|
||||
3 files changed, 11 insertions(+), 130 deletions(-)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 8a229b4..47f9a57 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ int cpu;
|
||||
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ cpu = val;
|
||||
+ trace_seq_printf(s, "\n cpu: %d", cpu);
|
||||
+
|
||||
/* record cpu error */
|
||||
if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
return -1;
|
||||
@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
nums = count_errors(event, ev.error_info, len);
|
||||
if (nums > 0) {
|
||||
struct error_info err_info = {nums, now, val};
|
||||
- ras_record_cpu_error(&err_info, ev.mpidr);
|
||||
+ ras_record_cpu_error(&err_info, cpu);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index b1643c4..bca7e0b 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -24,13 +24,9 @@
|
||||
#include "ras-cpu-isolation.h"
|
||||
|
||||
static struct cpu_info *cpu_infos = NULL;
|
||||
-static unsigned int ncores, cores_per_socket, cores_per_die;
|
||||
-static unsigned int cores_per_cluster = 4;
|
||||
-static unsigned int sockets, dies = 1;
|
||||
+static unsigned int ncores;
|
||||
static unsigned int enabled = 1;
|
||||
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list";
|
||||
-static const char *node_path = "/sys/devices/system/node/possible";
|
||||
|
||||
static const struct param normal_units[] = {
|
||||
{ "", 1 },
|
||||
@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format)
|
||||
return fd;
|
||||
}
|
||||
|
||||
-static int get_sockets(void)
|
||||
-{
|
||||
- int fd, j;
|
||||
- char buf[MAX_BUF_LEN] = "";
|
||||
- cores_per_socket = ncores;
|
||||
- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores);
|
||||
-
|
||||
- if (!cpu_sets) {
|
||||
- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- for (int i = 0; i < ncores; ++i) {
|
||||
- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
|
||||
- if (fd == -1) {
|
||||
- continue;
|
||||
- }
|
||||
- memset(buf, '\0', strlen(buf));
|
||||
- if (read(fd, buf, sizeof(buf)) <= 0) {
|
||||
- close(fd);
|
||||
- continue;
|
||||
- }
|
||||
- for (j = 0; j < sockets; ++j) {
|
||||
- if (strcmp(cpu_sets[j].buf, buf) == 0) {
|
||||
- break;
|
||||
- }
|
||||
- }
|
||||
- if (j == sockets) {
|
||||
- strcpy(cpu_sets[sockets].buf, buf);
|
||||
- sockets++;
|
||||
- }
|
||||
- close(fd);
|
||||
- }
|
||||
-
|
||||
- free(cpu_sets);
|
||||
- cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int get_dies(void)
|
||||
-{
|
||||
- int fd, begin, end;
|
||||
- char buf[20] = "";
|
||||
- cores_per_die = ncores;
|
||||
- fd = open(node_path, O_RDONLY);
|
||||
-
|
||||
- if (fd == -1) {
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- if (read(fd, buf, sizeof(buf))) {
|
||||
- if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
|
||||
- dies = end > begin ? end - begin + 1 : 1;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- close(fd);
|
||||
- cores_per_die = ncores / dies;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int get_cpu_status(unsigned cpu)
|
||||
{
|
||||
int fd, num;
|
||||
@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus)
|
||||
cpu_limit.limit = cpus - 1;
|
||||
cpu_limit.value = 0;
|
||||
|
||||
- if (get_sockets() < 0 || get_dies() < 0) {
|
||||
- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info)
|
||||
}
|
||||
}
|
||||
|
||||
-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
{
|
||||
- value >>= offset;
|
||||
- unsigned long res = 0;
|
||||
- int i = 0;
|
||||
-
|
||||
- while (i < size) {
|
||||
- res |= (value & (0x1 << (i++)));
|
||||
- }
|
||||
-
|
||||
- return res;
|
||||
-}
|
||||
-
|
||||
-static unsigned get_cpu_index(int64_t mpidr)
|
||||
-{
|
||||
- unsigned core_id, cluster_id, socket_id, die_id, cpu;
|
||||
- /*
|
||||
- * Adapt to certain BIOS
|
||||
- * In the MPIDR:
|
||||
- * bit 8:15: core id
|
||||
- * bit 16:18: cluster id
|
||||
- * bit 19:20: die_id
|
||||
- * bit 21:22: socket_id
|
||||
- */
|
||||
- core_id = get_bit_value(mpidr, 8, 8);
|
||||
- cluster_id = get_bit_value(mpidr, 16, 3);
|
||||
- socket_id = get_bit_value(mpidr, 21, 2);
|
||||
- die_id = get_bit_value(mpidr, 19, 2);
|
||||
-
|
||||
- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
|
||||
- * it means TotemB. When cores per die equal to cores per socket, it means
|
||||
- * that there is only one die in the socket, in case that the only die is
|
||||
- * TotemB in CPU 1620s, we set die id to 0 directly.
|
||||
- */
|
||||
- if (cores_per_die == cores_per_socket) {
|
||||
- die_id = 0;
|
||||
- }
|
||||
- else {
|
||||
- die_id = (die_id == 1 ? 0:1);
|
||||
- }
|
||||
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
|
||||
- cluster_id * cores_per_cluster;
|
||||
-
|
||||
- return cpu;
|
||||
-}
|
||||
-
|
||||
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
|
||||
-{
|
||||
- unsigned cpu;
|
||||
int ret;
|
||||
|
||||
if (enabled == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
- cpu = get_cpu_index(mpidr);
|
||||
-
|
||||
- if (cpu >= ncores) {
|
||||
+ if (cpu >= ncores || cpu < 0) {
|
||||
log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
|
||||
return;
|
||||
}
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
index a7d3fdb..95dedc1 100644
|
||||
--- a/ras-cpu-isolation.h
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -65,12 +65,8 @@ struct error_info {
|
||||
enum error_type err_type;
|
||||
};
|
||||
|
||||
-struct cpu_set {
|
||||
- char buf[MAX_BUF_LEN];
|
||||
-};
|
||||
-
|
||||
void ras_error_count_init(unsigned cpus);
|
||||
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
|
||||
void cpu_infos_free(void);
|
||||
|
||||
#endif
|
||||
\ No newline at end of file
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.6
|
||||
Release: 6
|
||||
Release: 7
|
||||
License: GPLv2
|
||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||
URL: https://github.com/mchehab/rasdaemon.git
|
||||
@ -35,6 +35,7 @@ Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
|
||||
Patch14: 0006-add-cpu-online-fault-isolation.patch
|
||||
Patch15: 0007-add-trace-print-and-add-sqlite-store.patch
|
||||
Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
||||
Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch
|
||||
|
||||
%description
|
||||
The rasdaemon program is a daemon which monitors the platform
|
||||
@ -81,6 +82,13 @@ rm INSTALL %{buildroot}/usr/include/*.h
|
||||
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
|
||||
|
||||
%changelog
|
||||
* Wed Dec 1 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-7
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Since the cpu logical index is now counted in the kernel, remove
|
||||
- the related code from rasdaemon.
|
||||
|
||||
* Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-6
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user