!35 因为内核已经上报cpu id，去掉用户态计算cpu id的代码

From: @Lostwayzxc Reviewed-by: @solarhu,@openeuler-basic Signed-off-by: @solarhu,@openeuler-basic
2021-12-06 06:13:51 +00:00 · 2021-12-06 06:13:51 +00:00 · 567e1b13f2
commit 567e1b13f2
parent 567ecf057c afcde7da58
2 changed files with 243 additions and 1 deletions
--- a/bugfix-modify-the-way-counting-cpu-logical-index.patch
+++ b/bugfix-modify-the-way-counting-cpu-logical-index.patch
@ -0,0 +1,234 @@
+From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001
+From: Lostwayzxc <luoshengwei@huawei.com>
+Date: Wed, 24 Nov 2021 09:43:52 +0800
+Subject: [PATCH] modify the way counting cpu logical index
+
+It is hard to derive the cpu logical index from the MPIDR in userspace, so
+the index is now computed in the kernel before being reported to userspace.
+
+Related patches: 
+0006-add-cpu-online-fault-isolation.patch
+0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
+
+---
+ ras-arm-handler.c   |   8 ++-
+ ras-cpu-isolation.c | 127 ++------------------------------------------
+ ras-cpu-isolation.h |   6 +--
+ 3 files changed, 11 insertions(+), 130 deletions(-)
+
+diff --git a/ras-arm-handler.c b/ras-arm-handler.c
+index 8a229b4..47f9a57 100644
+--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
+@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s,
+ 	trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
+ 
+ #ifdef HAVE_CPU_FAULT_ISOLATION
+	int cpu;
+	if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
+		return -1;
+	cpu = val;
+	trace_seq_printf(s, "\n cpu: %d", cpu);
+
+ 	/* record cpu error */
+ 	if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
+ 		return -1;
+@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s,
+ 		nums = count_errors(event, ev.error_info, len);
+ 		if (nums > 0) {
+ 			struct error_info err_info = {nums, now, val};
+-			ras_record_cpu_error(&err_info, ev.mpidr);
+			ras_record_cpu_error(&err_info, cpu);
+ 		}
+ 	}
+ #endif
+diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
+index b1643c4..bca7e0b 100644
+--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
+@@ -24,13 +24,9 @@
+ #include "ras-cpu-isolation.h"
+ 
+ static struct cpu_info *cpu_infos = NULL;
+-static unsigned int ncores, cores_per_socket, cores_per_die;
+-static unsigned int cores_per_cluster = 4;
+-static unsigned int sockets, dies = 1;
+static unsigned int ncores;
+ static unsigned int enabled = 1;
+ static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
+-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list";
+-static const char *node_path = "/sys/devices/system/node/possible";
+ 
+ static const struct param normal_units[] = {
+     { "", 1 },
+@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format)
+     return fd;
+ }
+ 
+-static int get_sockets(void)
+-{
+-    int fd, j;
+-    char buf[MAX_BUF_LEN] = "";
+-    cores_per_socket = ncores;
+-    struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores);
+-
+-    if (!cpu_sets) {
+-        log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
+-        return -1;
+-    }
+-
+-    for (int i = 0; i < ncores; ++i) {
+-        fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
+-        if (fd == -1) {
+-            continue;
+-        }
+-        memset(buf, '\0', strlen(buf));
+-        if (read(fd, buf, sizeof(buf)) <= 0) {
+-            close(fd);
+-            continue;
+-        }
+-        for (j = 0; j < sockets; ++j) {
+-            if (strcmp(cpu_sets[j].buf, buf) == 0) {
+-                break;
+-            }
+-        }
+-        if (j == sockets) {
+-            strcpy(cpu_sets[sockets].buf, buf);
+-            sockets++;
+-        }
+-        close(fd);
+-    }
+-
+-    free(cpu_sets);
+-    cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
+-
+-    return 0;
+-}
+-
+-static int get_dies(void)
+-{
+-    int fd, begin, end;
+-    char buf[20] = "";
+-    cores_per_die = ncores;
+-    fd = open(node_path, O_RDONLY);
+-
+-    if (fd == -1) {
+-        return -1;
+-    }
+-
+-    if (read(fd, buf, sizeof(buf))) {
+-        if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
+-            dies = end > begin ? end - begin + 1 : 1;
+-        }
+-    }
+-
+-    close(fd);
+-    cores_per_die = ncores / dies;
+-
+-    return 0;
+-}
+-
+ static int get_cpu_status(unsigned cpu)
+ {
+     int fd, num;
+@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus)
+     cpu_limit.limit = cpus - 1;
+     cpu_limit.value = 0;
+ 
+-    if (get_sockets() < 0 || get_dies() < 0) {
+-        log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
+-        return -1;
+-    }
+-
+     return 0;
+ }
+ 
+@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info)
+     }
+ }
+ 
+-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
+ {
+-    value >>= offset;
+-    unsigned long res = 0;
+-    int i = 0;
+-
+-    while (i < size) {
+-        res |= (value & (0x1 << (i++)));
+-    }
+-
+-    return res;
+-}
+-
+-static unsigned get_cpu_index(int64_t mpidr)
+-{
+-    unsigned core_id, cluster_id, socket_id, die_id, cpu;
+-    /*
+-     * Adapt to certain BIOS
+-     * In the MPIDR:
+-     * bit 8:15: core id
+-     * bit 16:18: cluster id
+-     * bit 19:20: die_id
+-     * bit 21:22: socket_id 
+-     */
+-    core_id = get_bit_value(mpidr, 8, 8);
+-    cluster_id = get_bit_value(mpidr, 16, 3);
+-    socket_id = get_bit_value(mpidr, 21, 2);
+-    die_id = get_bit_value(mpidr, 19, 2);
+-
+-    /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
+-     * it means TotemB. When cores per die equal to cores per socket, it means
+-     * that there is only one die in the socket, in case that the only die is
+-     * TotemB in CPU 1620s, we set die id to 0 directly.
+-     */
+-    if (cores_per_die == cores_per_socket) {
+-        die_id = 0;
+-    }
+-    else {
+-        die_id = (die_id == 1 ? 0:1);
+-    }
+-    cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
+-        cluster_id * cores_per_cluster;
+-
+-    return cpu;
+-}
+-
+-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
+-{
+-    unsigned cpu;
+     int ret;
+ 
+     if (enabled == 0) {
+         return;
+     }
+ 
+-    cpu = get_cpu_index(mpidr);
+-
+-    if (cpu >= ncores) {
+    if (cpu >= ncores || cpu < 0) {
+         log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
+         return;
+     }
+diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
+index a7d3fdb..95dedc1 100644
+--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
+@@ -65,12 +65,8 @@ struct error_info {
+     enum error_type err_type;
+ };
+ 
+-struct cpu_set {
+-    char buf[MAX_BUF_LEN];
+-};
+-
+ void ras_error_count_init(unsigned cpus);
+-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
+ void cpu_infos_free(void);
+ 
+ #endif
+\ No newline at end of file
+-- 
+2.27.0
+
--- a/rasdaemon.spec
+++ b/rasdaemon.spec
@ -1,6 +1,6 @@
 Name:			rasdaemon
 Version:		0.6.6
-Release:		6
+Release:		7
 License:		GPLv2
 Summary:		Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
 URL:			https://github.com/mchehab/rasdaemon.git
@ -35,6 +35,7 @@ Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
 Patch14: 0006-add-cpu-online-fault-isolation.patch
 Patch15: 0007-add-trace-print-and-add-sqlite-store.patch
 Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
+Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch

 %description
 The  rasdaemon  program  is  a  daemon which monitors the platform
@ -81,6 +82,13 @@ rm INSTALL %{buildroot}/usr/include/*.h
 /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :

 %changelog
+* Wed Dec 1 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-7
+- Type:bugfix
+- ID:NA
+- SUG:NA
+- DESC: The kernel now reports the cpu logical index, so remove the
+- userspace code in rasdaemon that computed it.
+
 * Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-6
 - Type:feature
 - ID:NA