From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001 From: Lostwayzxc Date: Wed, 24 Nov 2021 09:43:52 +0800 Subject: [PATCH] modify the way counting cpu logical index It is hard to derive the cpu logical index from the MPIDR in userspace, so the index is now computed in the kernel before being reported to userspace. Related patches: 0006-add-cpu-online-fault-isolation.patch 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch --- ras-arm-handler.c | 8 ++- ras-cpu-isolation.c | 127 ++------------------------------------------ ras-cpu-isolation.h | 6 +-- 3 files changed, 11 insertions(+), 130 deletions(-) diff --git a/ras-arm-handler.c b/ras-arm-handler.c index 8a229b4..47f9a57 100644 --- a/ras-arm-handler.c +++ b/ras-arm-handler.c @@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s, trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); #ifdef HAVE_CPU_FAULT_ISOLATION + int cpu; + if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0) + return -1; + cpu = val; + trace_seq_printf(s, "\n cpu: %d", cpu); + /* record cpu error */ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) return -1; @@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s, nums = count_errors(event, ev.error_info, len); if (nums > 0) { struct error_info err_info = {nums, now, val}; - ras_record_cpu_error(&err_info, ev.mpidr); + ras_record_cpu_error(&err_info, cpu); } } #endif diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c index b1643c4..bca7e0b 100644 --- a/ras-cpu-isolation.c +++ b/ras-cpu-isolation.c @@ -24,13 +24,9 @@ #include "ras-cpu-isolation.h" static struct cpu_info *cpu_infos = NULL; -static unsigned int ncores, cores_per_socket, cores_per_die; -static unsigned int cores_per_cluster = 4; -static unsigned int sockets, dies = 1; +static unsigned int ncores; static unsigned int enabled = 1; static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online"; -static const char 
*core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"; -static const char *node_path = "/sys/devices/system/node/possible"; static const struct param normal_units[] = { { "", 1 }, @@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format) return fd; } -static int get_sockets(void) -{ - int fd, j; - char buf[MAX_BUF_LEN] = ""; - cores_per_socket = ncores; - struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores); - - if (!cpu_sets) { - log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__); - return -1; - } - - for (int i = 0; i < ncores; ++i) { - fd = open_sys_file(i, O_RDONLY, core_siblings_list_path); - if (fd == -1) { - continue; - } - memset(buf, '\0', strlen(buf)); - if (read(fd, buf, sizeof(buf)) <= 0) { - close(fd); - continue; - } - for (j = 0; j < sockets; ++j) { - if (strcmp(cpu_sets[j].buf, buf) == 0) { - break; - } - } - if (j == sockets) { - strcpy(cpu_sets[sockets].buf, buf); - sockets++; - } - close(fd); - } - - free(cpu_sets); - cores_per_socket = sockets > 0 ? ncores / sockets : ncores; - - return 0; -} - -static int get_dies(void) -{ - int fd, begin, end; - char buf[20] = ""; - cores_per_die = ncores; - fd = open(node_path, O_RDONLY); - - if (fd == -1) { - return -1; - } - - if (read(fd, buf, sizeof(buf))) { - if (sscanf(buf, "%d-%d", &begin, &end) == 2) { - dies = end > begin ? 
end - begin + 1 : 1; - } - } - - close(fd); - cores_per_die = ncores / dies; - - return 0; -} - static int get_cpu_status(unsigned cpu) { int fd, num; @@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus) cpu_limit.limit = cpus - 1; cpu_limit.value = 0; - if (get_sockets() < 0 || get_dies() < 0) { - log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n"); - return -1; - } - return 0; } @@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info) } } -static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size) +void ras_record_cpu_error(struct error_info *err_info, int cpu) { - value >>= offset; - unsigned long res = 0; - int i = 0; - - while (i < size) { - res |= (value & (0x1 << (i++))); - } - - return res; -} - -static unsigned get_cpu_index(int64_t mpidr) -{ - unsigned core_id, cluster_id, socket_id, die_id, cpu; - /* - * Adapt to certain BIOS - * In the MPIDR: - * bit 8:15: core id - * bit 16:18: cluster id - * bit 19:20: die_id - * bit 21:22: socket_id - */ - core_id = get_bit_value(mpidr, 8, 8); - cluster_id = get_bit_value(mpidr, 16, 3); - socket_id = get_bit_value(mpidr, 21, 2); - die_id = get_bit_value(mpidr, 19, 2); - - /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3, - * it means TotemB. When cores per die equal to cores per socket, it means - * that there is only one die in the socket, in case that the only die is - * TotemB in CPU 1620s, we set die id to 0 directly. - */ - if (cores_per_die == cores_per_socket) { - die_id = 0; - } - else { - die_id = (die_id == 1 ? 
0:1); - } - cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die + - cluster_id * cores_per_cluster; - - return cpu; -} - -void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr) -{ - unsigned cpu; int ret; if (enabled == 0) { return; } - cpu = get_cpu_index(mpidr); - - if (cpu >= ncores) { + if (cpu >= ncores || cpu < 0) { log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores); return; } diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h index a7d3fdb..95dedc1 100644 --- a/ras-cpu-isolation.h +++ b/ras-cpu-isolation.h @@ -65,12 +65,8 @@ struct error_info { enum error_type err_type; }; -struct cpu_set { - char buf[MAX_BUF_LEN]; -}; - void ras_error_count_init(unsigned cpus); -void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr); +void ras_record_cpu_error(struct error_info *err_info, int cpu); void cpu_infos_free(void); #endif \ No newline at end of file -- 2.27.0