!35 因为内核已经上报cpu id,去掉用户态计算cpu id的代码

From: @Lostwayzxc
Reviewed-by: @solarhu,@openeuler-basic
Signed-off-by: @solarhu,@openeuler-basic
This commit is contained in:
openeuler-ci-bot 2021-12-06 06:13:51 +00:00 committed by Gitee
commit 567e1b13f2
2 changed files with 243 additions and 1 deletions

View File

@ -0,0 +1,234 @@
From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001
From: Lostwayzxc <luoshengwei@huawei.com>
Date: Wed, 24 Nov 2021 09:43:52 +0800
Subject: [PATCH] modify the way counting cpu logical index
It is hard to derive the cpu logical index from the mpidr in userspace,
so the index is now computed in the kernel before being reported to userspace.
Related patches:
0006-add-cpu-online-fault-isolation.patch
0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
---
ras-arm-handler.c | 8 ++-
ras-cpu-isolation.c | 127 ++------------------------------------------
ras-cpu-isolation.h | 6 +--
3 files changed, 11 insertions(+), 130 deletions(-)
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 8a229b4..47f9a57 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s,
trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
#ifdef HAVE_CPU_FAULT_ISOLATION
+ int cpu;
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
+ return -1;
+ cpu = val;
+ trace_seq_printf(s, "\n cpu: %d", cpu);
+
/* record cpu error */
if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
return -1;
@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s,
nums = count_errors(event, ev.error_info, len);
if (nums > 0) {
struct error_info err_info = {nums, now, val};
- ras_record_cpu_error(&err_info, ev.mpidr);
+ ras_record_cpu_error(&err_info, cpu);
}
}
#endif
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
index b1643c4..bca7e0b 100644
--- a/ras-cpu-isolation.c
+++ b/ras-cpu-isolation.c
@@ -24,13 +24,9 @@
#include "ras-cpu-isolation.h"
static struct cpu_info *cpu_infos = NULL;
-static unsigned int ncores, cores_per_socket, cores_per_die;
-static unsigned int cores_per_cluster = 4;
-static unsigned int sockets, dies = 1;
+static unsigned int ncores;
static unsigned int enabled = 1;
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list";
-static const char *node_path = "/sys/devices/system/node/possible";
static const struct param normal_units[] = {
{ "", 1 },
@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format)
return fd;
}
-static int get_sockets(void)
-{
- int fd, j;
- char buf[MAX_BUF_LEN] = "";
- cores_per_socket = ncores;
- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores);
-
- if (!cpu_sets) {
- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
- return -1;
- }
-
- for (int i = 0; i < ncores; ++i) {
- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
- if (fd == -1) {
- continue;
- }
- memset(buf, '\0', strlen(buf));
- if (read(fd, buf, sizeof(buf)) <= 0) {
- close(fd);
- continue;
- }
- for (j = 0; j < sockets; ++j) {
- if (strcmp(cpu_sets[j].buf, buf) == 0) {
- break;
- }
- }
- if (j == sockets) {
- strcpy(cpu_sets[sockets].buf, buf);
- sockets++;
- }
- close(fd);
- }
-
- free(cpu_sets);
- cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
-
- return 0;
-}
-
-static int get_dies(void)
-{
- int fd, begin, end;
- char buf[20] = "";
- cores_per_die = ncores;
- fd = open(node_path, O_RDONLY);
-
- if (fd == -1) {
- return -1;
- }
-
- if (read(fd, buf, sizeof(buf))) {
- if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
- dies = end > begin ? end - begin + 1 : 1;
- }
- }
-
- close(fd);
- cores_per_die = ncores / dies;
-
- return 0;
-}
-
static int get_cpu_status(unsigned cpu)
{
int fd, num;
@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus)
cpu_limit.limit = cpus - 1;
cpu_limit.value = 0;
- if (get_sockets() < 0 || get_dies() < 0) {
- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
- return -1;
- }
-
return 0;
}
@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info)
}
}
-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
{
- value >>= offset;
- unsigned long res = 0;
- int i = 0;
-
- while (i < size) {
- res |= (value & (0x1 << (i++)));
- }
-
- return res;
-}
-
-static unsigned get_cpu_index(int64_t mpidr)
-{
- unsigned core_id, cluster_id, socket_id, die_id, cpu;
- /*
- * Adapt to certain BIOS
- * In the MPIDR:
- * bit 8:15: core id
- * bit 16:18: cluster id
- * bit 19:20: die_id
- * bit 21:22: socket_id
- */
- core_id = get_bit_value(mpidr, 8, 8);
- cluster_id = get_bit_value(mpidr, 16, 3);
- socket_id = get_bit_value(mpidr, 21, 2);
- die_id = get_bit_value(mpidr, 19, 2);
-
- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
- * it means TotemB. When cores per die equal to cores per socket, it means
- * that there is only one die in the socket, in case that the only die is
- * TotemB in CPU 1620s, we set die id to 0 directly.
- */
- if (cores_per_die == cores_per_socket) {
- die_id = 0;
- }
- else {
- die_id = (die_id == 1 ? 0:1);
- }
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
- cluster_id * cores_per_cluster;
-
- return cpu;
-}
-
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
-{
- unsigned cpu;
int ret;
if (enabled == 0) {
return;
}
- cpu = get_cpu_index(mpidr);
-
- if (cpu >= ncores) {
+ if (cpu >= ncores || cpu < 0) {
log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
return;
}
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
index a7d3fdb..95dedc1 100644
--- a/ras-cpu-isolation.h
+++ b/ras-cpu-isolation.h
@@ -65,12 +65,8 @@ struct error_info {
enum error_type err_type;
};
-struct cpu_set {
- char buf[MAX_BUF_LEN];
-};
-
void ras_error_count_init(unsigned cpus);
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
void cpu_infos_free(void);
#endif
\ No newline at end of file
--
2.27.0

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.6
Release: 6
Release: 7
License: GPLv2
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
URL: https://github.com/mchehab/rasdaemon.git
@ -35,6 +35,7 @@ Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
Patch14: 0006-add-cpu-online-fault-isolation.patch
Patch15: 0007-add-trace-print-and-add-sqlite-store.patch
Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch
%description
The rasdaemon program is a daemon which monitors the platform
@ -81,6 +82,13 @@ rm INSTALL %{buildroot}/usr/include/*.h
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
%changelog
* Wed Dec 1 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-7
- Type:bugfix
- ID:NA
- SUG:NA
- DESC: Since the cpu logical index is now counted in the kernel, remove
- the related userspace code in rasdaemon.
* Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-6
- Type:feature
- ID:NA