235 lines
6.5 KiB
Diff
235 lines
6.5 KiB
Diff
From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001
|
|
From: Lostwayzxc <luoshengwei@huawei.com>
|
|
Date: Wed, 24 Nov 2021 09:43:52 +0800
|
|
Subject: [PATCH] modify the way the cpu logical index is counted
|
|
|
|
It's hard to compute the cpu logical index from the MPIDR in userspace,
|
|
so the index is now computed in the kernel before being reported to userspace.
|
|
|
|
Related patches:
|
|
0006-add-cpu-online-fault-isolation.patch
|
|
0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
|
|
|
---
|
|
ras-arm-handler.c | 8 ++-
|
|
ras-cpu-isolation.c | 127 ++------------------------------------------
|
|
ras-cpu-isolation.h | 6 +--
|
|
3 files changed, 11 insertions(+), 130 deletions(-)
|
|
|
|
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
|
index 8a229b4..47f9a57 100644
|
|
--- a/ras-arm-handler.c
|
|
+++ b/ras-arm-handler.c
|
|
@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s,
|
|
trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
|
|
|
|
#ifdef HAVE_CPU_FAULT_ISOLATION
|
|
+ int cpu;
|
|
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
|
|
+ return -1;
|
|
+ cpu = val;
|
|
+ trace_seq_printf(s, "\n cpu: %d", cpu);
|
|
+
|
|
/* record cpu error */
|
|
if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
|
return -1;
|
|
@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
|
nums = count_errors(event, ev.error_info, len);
|
|
if (nums > 0) {
|
|
struct error_info err_info = {nums, now, val};
|
|
- ras_record_cpu_error(&err_info, ev.mpidr);
|
|
+ ras_record_cpu_error(&err_info, cpu);
|
|
}
|
|
}
|
|
#endif
|
|
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
|
index b1643c4..bca7e0b 100644
|
|
--- a/ras-cpu-isolation.c
|
|
+++ b/ras-cpu-isolation.c
|
|
@@ -24,13 +24,9 @@
|
|
#include "ras-cpu-isolation.h"
|
|
|
|
static struct cpu_info *cpu_infos = NULL;
|
|
-static unsigned int ncores, cores_per_socket, cores_per_die;
|
|
-static unsigned int cores_per_cluster = 4;
|
|
-static unsigned int sockets, dies = 1;
|
|
+static unsigned int ncores;
|
|
static unsigned int enabled = 1;
|
|
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
|
-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list";
|
|
-static const char *node_path = "/sys/devices/system/node/possible";
|
|
|
|
static const struct param normal_units[] = {
|
|
{ "", 1 },
|
|
@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format)
|
|
return fd;
|
|
}
|
|
|
|
-static int get_sockets(void)
|
|
-{
|
|
- int fd, j;
|
|
- char buf[MAX_BUF_LEN] = "";
|
|
- cores_per_socket = ncores;
|
|
- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores);
|
|
-
|
|
- if (!cpu_sets) {
|
|
- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
|
|
- return -1;
|
|
- }
|
|
-
|
|
- for (int i = 0; i < ncores; ++i) {
|
|
- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
|
|
- if (fd == -1) {
|
|
- continue;
|
|
- }
|
|
- memset(buf, '\0', strlen(buf));
|
|
- if (read(fd, buf, sizeof(buf)) <= 0) {
|
|
- close(fd);
|
|
- continue;
|
|
- }
|
|
- for (j = 0; j < sockets; ++j) {
|
|
- if (strcmp(cpu_sets[j].buf, buf) == 0) {
|
|
- break;
|
|
- }
|
|
- }
|
|
- if (j == sockets) {
|
|
- strcpy(cpu_sets[sockets].buf, buf);
|
|
- sockets++;
|
|
- }
|
|
- close(fd);
|
|
- }
|
|
-
|
|
- free(cpu_sets);
|
|
- cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static int get_dies(void)
|
|
-{
|
|
- int fd, begin, end;
|
|
- char buf[20] = "";
|
|
- cores_per_die = ncores;
|
|
- fd = open(node_path, O_RDONLY);
|
|
-
|
|
- if (fd == -1) {
|
|
- return -1;
|
|
- }
|
|
-
|
|
- if (read(fd, buf, sizeof(buf))) {
|
|
- if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
|
|
- dies = end > begin ? end - begin + 1 : 1;
|
|
- }
|
|
- }
|
|
-
|
|
- close(fd);
|
|
- cores_per_die = ncores / dies;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
static int get_cpu_status(unsigned cpu)
|
|
{
|
|
int fd, num;
|
|
@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus)
|
|
cpu_limit.limit = cpus - 1;
|
|
cpu_limit.value = 0;
|
|
|
|
- if (get_sockets() < 0 || get_dies() < 0) {
|
|
- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
|
|
- return -1;
|
|
- }
|
|
-
|
|
return 0;
|
|
}
|
|
|
|
@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info)
|
|
}
|
|
}
|
|
|
|
-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
|
|
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
|
{
|
|
- value >>= offset;
|
|
- unsigned long res = 0;
|
|
- int i = 0;
|
|
-
|
|
- while (i < size) {
|
|
- res |= (value & (0x1 << (i++)));
|
|
- }
|
|
-
|
|
- return res;
|
|
-}
|
|
-
|
|
-static unsigned get_cpu_index(int64_t mpidr)
|
|
-{
|
|
- unsigned core_id, cluster_id, socket_id, die_id, cpu;
|
|
- /*
|
|
- * Adapt to certain BIOS
|
|
- * In the MPIDR:
|
|
- * bit 8:15: core id
|
|
- * bit 16:18: cluster id
|
|
- * bit 19:20: die_id
|
|
- * bit 21:22: socket_id
|
|
- */
|
|
- core_id = get_bit_value(mpidr, 8, 8);
|
|
- cluster_id = get_bit_value(mpidr, 16, 3);
|
|
- socket_id = get_bit_value(mpidr, 21, 2);
|
|
- die_id = get_bit_value(mpidr, 19, 2);
|
|
-
|
|
- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
|
|
- * it means TotemB. When cores per die equal to cores per socket, it means
|
|
- * that there is only one die in the socket, in case that the only die is
|
|
- * TotemB in CPU 1620s, we set die id to 0 directly.
|
|
- */
|
|
- if (cores_per_die == cores_per_socket) {
|
|
- die_id = 0;
|
|
- }
|
|
- else {
|
|
- die_id = (die_id == 1 ? 0:1);
|
|
- }
|
|
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
|
|
- cluster_id * cores_per_cluster;
|
|
-
|
|
- return cpu;
|
|
-}
|
|
-
|
|
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
|
|
-{
|
|
- unsigned cpu;
|
|
int ret;
|
|
|
|
if (enabled == 0) {
|
|
return;
|
|
}
|
|
|
|
- cpu = get_cpu_index(mpidr);
|
|
-
|
|
- if (cpu >= ncores) {
|
|
+ if (cpu >= ncores || cpu < 0) {
|
|
log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
|
|
return;
|
|
}
|
|
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
|
index a7d3fdb..95dedc1 100644
|
|
--- a/ras-cpu-isolation.h
|
|
+++ b/ras-cpu-isolation.h
|
|
@@ -65,12 +65,8 @@ struct error_info {
|
|
enum error_type err_type;
|
|
};
|
|
|
|
-struct cpu_set {
|
|
- char buf[MAX_BUF_LEN];
|
|
-};
|
|
-
|
|
void ras_error_count_init(unsigned cpus);
|
|
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
|
|
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
|
|
void cpu_infos_free(void);
|
|
|
|
#endif
|
|
\ No newline at end of file
|
|
--
|
|
2.27.0
|
|
|