!98 同步22.03-LTS-SP2分支代码至20.03-SP4分支
From: @Lostwayzxc Reviewed-by: @hunan4222, @znzjugod Signed-off-by: @znzjugod
This commit is contained in:
commit
5ad5b8223b
165
0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch
Normal file
165
0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch
Normal file
@ -0,0 +1,165 @@
|
||||
From e53389e7d7bd805900386b979fb3d48f1e79a7bc Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sun, 5 Mar 2023 23:14:42 +0000
|
||||
Subject: [PATCH] rasdaemon: Fix for regression in ras_mc_create_table() if
|
||||
some cpus are offline at the system start
|
||||
|
||||
Issues:
|
||||
Regression in the ras_mc_create_table() if some of the cpus are offline
|
||||
at system start when the rasdaemon is run. This issue is
|
||||
reproducible in ras_mc_create_table() with decode and record
|
||||
non-standard events and reproducible sometimes with
|
||||
ras_mc_create_table() for the standard events.
|
||||
Also in the multi thread way, there is memory leak in ras_mc_event_opendb()
|
||||
as struct sqlite3_priv *priv and sqlite3 *db allocated/initialized per
|
||||
thread, but stored in the common struct ras_events ras in pthread data,
|
||||
which is shared across the threads.
|
||||
|
||||
Reason:
|
||||
When the system starts with some of the cpus offline and then runs
|
||||
the rasdaemon, read_ras_event_all_cpus() exits with an error and switches to
|
||||
the multi-thread way. However, read() in read_ras_event() returns an error in
|
||||
threads for each of the offline CPUs and does clean up including calling
|
||||
ras_mc_event_closedb().
|
||||
Since the 'struct ras_events ras' passed in the pthread_data to each of the
|
||||
threads is common, struct sqlite3_priv *priv and sqlite3 *db allocated/
|
||||
initialized per thread and stored in the common 'struct ras_events ras',
|
||||
are getting overwritten in each ras_mc_event_opendb()(which called from
|
||||
pthread per cpu), resulting in a memory leak. Also, when ras_mc_event_closedb()
|
||||
is called in the above error case from the threads corresponding to the
|
||||
offline cpus, it closes the sqlite3 *db and frees the sqlite3_priv *priv stored
|
||||
in the common 'struct ras_events ras', resulting in a regression when accessing
|
||||
priv->db in the ras_mc_create_table() from another context later.
|
||||
|
||||
Proposed solution:
|
||||
In ras_mc_event_opendb(), allocate struct sqlite3_priv *priv,
|
||||
init sqlite3 *db and create tables common for the threads with shared
|
||||
'struct ras_events ras' based on a reference count and free them in the
|
||||
same way.
|
||||
Also protect critical code ras_mc_event_opendb() and ras_mc_event_closedb()
|
||||
using mutex in the multi thread case from any regression caused by the
|
||||
thread pre-emption.
|
||||
|
||||
Reported-by: Lei Feng <fenglei47@h-partners.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
ras-events.c | 16 +++++++++++++++-
|
||||
ras-events.h | 4 +++-
|
||||
ras-record.c | 12 ++++++++++++
|
||||
3 files changed, 30 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 49e4f9a..5fe8e19 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -625,19 +625,25 @@ static void *handle_ras_events_cpu(void *priv)
|
||||
|
||||
log(TERM, LOG_INFO, "Listening to events on cpu %d\n", pdata->cpu);
|
||||
if (pdata->ras->record_events) {
|
||||
+ pthread_mutex_lock(&pdata->ras->db_lock);
|
||||
if (ras_mc_event_opendb(pdata->cpu, pdata->ras)) {
|
||||
+ pthread_mutex_unlock(&pdata->ras->db_lock);
|
||||
log(TERM, LOG_ERR, "Can't open database\n");
|
||||
close(fd);
|
||||
kbuffer_free(kbuf);
|
||||
free(page);
|
||||
return 0;
|
||||
}
|
||||
+ pthread_mutex_unlock(&pdata->ras->db_lock);
|
||||
}
|
||||
|
||||
read_ras_event(fd, pdata, kbuf, page);
|
||||
|
||||
- if (pdata->ras->record_events)
|
||||
+ if (pdata->ras->record_events) {
|
||||
+ pthread_mutex_lock(&pdata->ras->db_lock);
|
||||
ras_mc_event_closedb(pdata->cpu, pdata->ras);
|
||||
+ pthread_mutex_unlock(&pdata->ras->db_lock);
|
||||
+ }
|
||||
|
||||
close(fd);
|
||||
kbuffer_free(kbuf);
|
||||
@@ -993,6 +999,11 @@ int handle_ras_events(int record_events)
|
||||
|
||||
/* Poll doesn't work on this kernel. Fallback to pthread way */
|
||||
if (rc == -255) {
|
||||
+ if (pthread_mutex_init(&ras->db_lock, NULL) != 0) {
|
||||
+ log(SYSLOG, LOG_INFO, "sqlite db lock init has failed\n");
|
||||
+ goto err;
|
||||
+ }
|
||||
+
|
||||
log(SYSLOG, LOG_INFO,
|
||||
"Opening one thread per cpu (%d threads)\n", cpus);
|
||||
for (i = 0; i < cpus; i++) {
|
||||
@@ -1005,6 +1016,8 @@ int handle_ras_events(int record_events)
|
||||
i);
|
||||
while (--i)
|
||||
pthread_cancel(data[i].thread);
|
||||
+
|
||||
+ pthread_mutex_destroy(&ras->db_lock);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
@@ -1012,6 +1025,7 @@ int handle_ras_events(int record_events)
|
||||
/* Wait for all threads to complete */
|
||||
for (i = 0; i < cpus; i++)
|
||||
pthread_join(data[i].thread, NULL);
|
||||
+ pthread_mutex_destroy(&ras->db_lock);
|
||||
}
|
||||
|
||||
log(SYSLOG, LOG_INFO, "Huh! something got wrong. Aborting.\n");
|
||||
diff --git a/ras-events.h b/ras-events.h
|
||||
index 6c9f507..649b0c0 100644
|
||||
--- a/ras-events.h
|
||||
+++ b/ras-events.h
|
||||
@@ -56,7 +56,9 @@ struct ras_events {
|
||||
time_t uptime_diff;
|
||||
|
||||
/* For ras-record */
|
||||
- void *db_priv;
|
||||
+ void *db_priv;
|
||||
+ int db_ref_count;
|
||||
+ pthread_mutex_t db_lock;
|
||||
|
||||
/* For the mce handler */
|
||||
struct mce_priv *mce_priv;
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index a367939..adc97a4 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -763,6 +763,10 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
|
||||
printf("Calling %s()\n", __FUNCTION__);
|
||||
|
||||
+ ras->db_ref_count++;
|
||||
+ if (ras->db_ref_count > 1)
|
||||
+ return 0;
|
||||
+
|
||||
ras->db_priv = NULL;
|
||||
|
||||
priv = calloc(1, sizeof(*priv));
|
||||
@@ -912,6 +916,13 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
|
||||
|
||||
printf("Calling %s()\n", __func__);
|
||||
|
||||
+ if (ras->db_ref_count > 0)
|
||||
+ ras->db_ref_count--;
|
||||
+ else
|
||||
+ return -1;
|
||||
+ if (ras->db_ref_count > 0)
|
||||
+ return 0;
|
||||
+
|
||||
if (!priv)
|
||||
return -1;
|
||||
|
||||
@@ -1018,6 +1029,7 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
|
||||
log(TERM, LOG_ERR,
|
||||
"cpu %u: Failed to shutdown sqlite: error = %d\n", cpu, rc);
|
||||
free(priv);
|
||||
+ ras->db_priv = NULL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.25.1
|
||||
|
||||
938
0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch
Normal file
938
0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch
Normal file
@ -0,0 +1,938 @@
|
||||
From b9999d40d73dfff8b1cfb515f3b81b2c2891f6a7 Mon Sep 17 00:00:00 2001
|
||||
From: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Date: Wed, 23 Feb 2022 17:21:58 +0800
|
||||
Subject: [PATCH 01/10] rasdaemon: Support cpu fault isolation for corrected
|
||||
errors
|
||||
|
||||
When the corrected errors exceed the set limit in cycle, try to
|
||||
offline the related cpu core.
|
||||
|
||||
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
|
||||
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
Makefile.am | 6 +-
|
||||
configure.ac | 11 ++
|
||||
misc/rasdaemon.env | 17 ++
|
||||
queue.c | 119 ++++++++++++++
|
||||
queue.h | 39 +++++
|
||||
ras-arm-handler.c | 97 +++++++++++
|
||||
ras-arm-handler.h | 18 ++
|
||||
ras-cpu-isolation.c | 388 ++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-cpu-isolation.h | 68 ++++++++
|
||||
ras-events.c | 9 +-
|
||||
10 files changed, 770 insertions(+), 2 deletions(-)
|
||||
create mode 100644 queue.c
|
||||
create mode 100644 queue.h
|
||||
create mode 100644 ras-cpu-isolation.c
|
||||
create mode 100644 ras-cpu-isolation.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index a322b9a..36e7d4e 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -69,13 +69,17 @@ endif
|
||||
if WITH_AMP_NS_DECODE
|
||||
rasdaemon_SOURCES += non-standard-ampere.c
|
||||
endif
|
||||
+if WITH_CPU_FAULT_ISOLATION
|
||||
+ rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
|
||||
+endif
|
||||
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
|
||||
|
||||
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
|
||||
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
|
||||
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
|
||||
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
|
||||
- non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
|
||||
+ non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
|
||||
+ ras-cpu-isolation.h queue.h
|
||||
|
||||
# This rule can't be called with more than one Makefile job (like make -j8)
|
||||
# I can't figure out a way to fix that
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index a77991f..e0ed751 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
|
||||
AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
|
||||
AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
|
||||
|
||||
+AC_ARG_ENABLE([cpu_fault_isolation],
|
||||
+ AS_HELP_STRING([--enable-cpu-fault-isolation], [enable cpu online fault isolation]))
|
||||
+
|
||||
+AS_IF([test "x$enable_cpu_fault_isolation" = "xyes" || test "x$enable_all" == "xyes"], [
|
||||
+ AC_DEFINE(HAVE_CPU_FAULT_ISOLATION,1,"have cpu online fault isolation")
|
||||
+ AC_SUBST([WITH_CPU_FAULT_ISOLATION])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_CPU_FAULT_ISOLATION], [test x$enable_cpu_fault_isolation = xyes || test x$enable_all == xyes])
|
||||
+AM_COND_IF([WITH_CPU_FAULT_ISOLATION], [USE_CPU_FAULT_ISOLATION="yes"], [USE_CPU_FAULT_ISOLATION="no"])
|
||||
+
|
||||
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||||
|
||||
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||||
@@ -201,4 +211,5 @@ compile time options summary
|
||||
Memory Failure : $USE_MEMORY_FAILURE
|
||||
Memory CE PFA : $USE_MEMORY_CE_PFA
|
||||
AMP RAS errors : $USE_AMP_NS_DECODE
|
||||
+ CPU fault isolation : $USE_CPU_FAULT_ISOLATION
|
||||
EOF
|
||||
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
|
||||
index 12fd766..7cb18e8 100644
|
||||
--- a/misc/rasdaemon.env
|
||||
+++ b/misc/rasdaemon.env
|
||||
@@ -27,3 +27,20 @@ PAGE_CE_THRESHOLD="50"
|
||||
# soft-then-hard First try to soft offline, then try hard offlining.
|
||||
# Note: default offline choice is "soft".
|
||||
PAGE_CE_ACTION="soft"
|
||||
+
|
||||
+# CPU Online Fault Isolation
|
||||
+# Whether to enable cpu online fault isolation (yes|no).
|
||||
+CPU_ISOLATION_ENABLE="no"
|
||||
+# Specify the threshold of CE numbers.
|
||||
+#
|
||||
+# Format:
|
||||
+# [0-9]+[unit]
|
||||
+#
|
||||
+# Supported units:
|
||||
+# CPU_CE_THRESHOLD: no unit
|
||||
+# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second), default is in second
|
||||
+CPU_CE_THRESHOLD="18"
|
||||
+CPU_ISOLATION_CYCLE="24h"
|
||||
+
|
||||
+# Prevent excessive isolation from causing an avalanche effect
|
||||
+CPU_ISOLATION_LIMIT="10"
|
||||
\ No newline at end of file
|
||||
diff --git a/queue.c b/queue.c
|
||||
new file mode 100644
|
||||
index 0000000..65b6fb8
|
||||
--- /dev/null
|
||||
+++ b/queue.c
|
||||
@@ -0,0 +1,119 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include "queue.h"
|
||||
+#include "ras-logger.h"
|
||||
+
|
||||
+int is_empty(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue)
|
||||
+ return queue->size == 0;
|
||||
+
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+struct link_queue *init_queue(void)
|
||||
+{
|
||||
+ struct link_queue *queue = NULL;
|
||||
+
|
||||
+ queue = (struct link_queue *)malloc(sizeof(struct link_queue));
|
||||
+ if (queue == NULL) {
|
||||
+ log(TERM, LOG_ERR, "Failed to allocate memory for queue.\n");
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ queue->size = 0;
|
||||
+ queue->head = NULL;
|
||||
+ queue->tail = NULL;
|
||||
+
|
||||
+ return queue;
|
||||
+}
|
||||
+
|
||||
+void clear_queue(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue == NULL)
|
||||
+ return;
|
||||
+
|
||||
+ struct queue_node *node = queue->head;
|
||||
+ struct queue_node *tmp = NULL;
|
||||
+
|
||||
+ while (node != NULL) {
|
||||
+ tmp = node;
|
||||
+ node = node->next;
|
||||
+ free(tmp);
|
||||
+ }
|
||||
+
|
||||
+ queue->head = NULL;
|
||||
+ queue->tail = NULL;
|
||||
+ queue->size = 0;
|
||||
+}
|
||||
+
|
||||
+void free_queue(struct link_queue *queue)
|
||||
+{
|
||||
+ clear_queue(queue);
|
||||
+
|
||||
+ if (queue)
|
||||
+ free(queue);
|
||||
+}
|
||||
+
|
||||
+/* It should be guaranteed that the param is not NULL */
|
||||
+void push(struct link_queue *queue, struct queue_node *node)
|
||||
+{
|
||||
+ /* there is no element in the queue */
|
||||
+ if (queue->head == NULL)
|
||||
+ queue->head = node;
|
||||
+ else
|
||||
+ queue->tail->next = node;
|
||||
+
|
||||
+ queue->tail = node;
|
||||
+ (queue->size)++;
|
||||
+}
|
||||
+
|
||||
+int pop(struct link_queue *queue)
|
||||
+{
|
||||
+ struct queue_node *tmp = NULL;
|
||||
+
|
||||
+ if (queue == NULL || is_empty(queue))
|
||||
+ return -1;
|
||||
+
|
||||
+ tmp = queue->head;
|
||||
+ queue->head = queue->head->next;
|
||||
+ free(tmp);
|
||||
+ (queue->size)--;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+struct queue_node *front(struct link_queue *queue)
|
||||
+{
|
||||
+ if (queue == NULL)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return queue->head;
|
||||
+}
|
||||
+
|
||||
+struct queue_node *node_create(time_t time, unsigned int value)
|
||||
+{
|
||||
+ struct queue_node *node = NULL;
|
||||
+
|
||||
+ node = (struct queue_node *)malloc(sizeof(struct queue_node));
|
||||
+ if (node != NULL) {
|
||||
+ node->time = time;
|
||||
+ node->value = value;
|
||||
+ node->next = NULL;
|
||||
+ }
|
||||
+
|
||||
+ return node;
|
||||
+}
|
||||
diff --git a/queue.h b/queue.h
|
||||
new file mode 100644
|
||||
index 0000000..5459f40
|
||||
--- /dev/null
|
||||
+++ b/queue.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_QUEUE_H
|
||||
+#define __RAS_QUEUE_H
|
||||
+
|
||||
+struct queue_node {
|
||||
+ time_t time;
|
||||
+ unsigned int value;
|
||||
+ struct queue_node *next;
|
||||
+};
|
||||
+
|
||||
+struct link_queue {
|
||||
+ struct queue_node *head;
|
||||
+ struct queue_node *tail;
|
||||
+ int size;
|
||||
+};
|
||||
+
|
||||
+int is_empty(struct link_queue *queue);
|
||||
+struct link_queue *init_queue(void);
|
||||
+void clear_queue(struct link_queue *queue);
|
||||
+void free_queue(struct link_queue *queue);
|
||||
+void push(struct link_queue *queue, struct queue_node *node);
|
||||
+int pop(struct link_queue *queue);
|
||||
+struct queue_node *front(struct link_queue *queue);
|
||||
+struct queue_node *node_create(time_t time, unsigned int value);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 1149dc6..9c7a3c3 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -22,6 +22,10 @@
|
||||
#include "ras-report.h"
|
||||
#include "ras-non-standard-handler.h"
|
||||
#include "non-standard-ampere.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
+
|
||||
+#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
|
||||
+#define ARM_ERR_VALID_FLAGS BIT(1)
|
||||
|
||||
void display_raw_data(struct trace_seq *s,
|
||||
const uint8_t *buf,
|
||||
@@ -42,6 +46,93 @@ void display_raw_data(struct trace_seq *s,
|
||||
}
|
||||
}
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+static int count_errors(struct ras_arm_event *ev)
|
||||
+{
|
||||
+ struct ras_arm_err_info *err_info;
|
||||
+ int num_pei;
|
||||
+ int err_info_size = sizeof(struct ras_arm_err_info);
|
||||
+ int num = 0;
|
||||
+ int i;
|
||||
+ int error_count;
|
||||
+
|
||||
+ if (ev->pei_len % err_info_size != 0) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "The event data does not match to the ARM Processor Error Information Structure\n");
|
||||
+ return num;
|
||||
+ }
|
||||
+ num_pei = ev->pei_len / err_info_size;
|
||||
+ err_info = (struct ras_arm_err_info *)(ev->pei_error);
|
||||
+
|
||||
+ for (i = 0; i < num_pei; ++i) {
|
||||
+ error_count = 1;
|
||||
+ if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) {
|
||||
+ /*
|
||||
+ * The value of this field is defined as follows:
|
||||
+ * 0: Single Error
|
||||
+ * 1: Multiple Errors
|
||||
+ * 2-65535: Error Count
|
||||
+ */
|
||||
+ error_count = err_info->multiple_error + 1;
|
||||
+ }
|
||||
+
|
||||
+ num += error_count;
|
||||
+ err_info += 1;
|
||||
+ }
|
||||
+ log(TERM, LOG_INFO, "%d error in cpu core catched\n", num);
|
||||
+ return num;
|
||||
+}
|
||||
+
|
||||
+static int ras_handle_cpu_error(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event,
|
||||
+ struct ras_arm_event *ev, time_t now)
|
||||
+{
|
||||
+ unsigned long long val;
|
||||
+ int cpu;
|
||||
+ char *severity;
|
||||
+ struct error_info err_info;
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ cpu = val;
|
||||
+ trace_seq_printf(s, "\n cpu: %d", cpu);
|
||||
+
|
||||
+ /* record cpu error */
|
||||
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ /* refer to UEFI_2_9 specification chapter N2.2 Table N-5 */
|
||||
+ switch (val) {
|
||||
+ case GHES_SEV_NO:
|
||||
+ severity = "Informational";
|
||||
+ break;
|
||||
+ case GHES_SEV_CORRECTED:
|
||||
+ severity = "Corrected";
|
||||
+ break;
|
||||
+ case GHES_SEV_RECOVERABLE:
|
||||
+ severity = "Recoverable";
|
||||
+ break;
|
||||
+ default:
|
||||
+ case GHES_SEV_PANIC:
|
||||
+ severity = "Fatal";
|
||||
+ }
|
||||
+ trace_seq_printf(s, "\n severity: %s", severity);
|
||||
+
|
||||
+ if (val == GHES_SEV_CORRECTED) {
|
||||
+ int nums = count_errors(ev);
|
||||
+
|
||||
+ if (nums > 0) {
|
||||
+ err_info.nums = nums;
|
||||
+ err_info.time = now;
|
||||
+ err_info.err_type = val;
|
||||
+ ras_record_cpu_error(&err_info, cpu);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context)
|
||||
@@ -52,6 +143,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct tm *tm;
|
||||
struct ras_arm_event ev;
|
||||
int len = 0;
|
||||
+
|
||||
memset(&ev, 0, sizeof(ev));
|
||||
|
||||
/*
|
||||
@@ -139,6 +231,11 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
display_raw_data(s, ev.vsei_error, ev.oem_len);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ if (ras_handle_cpu_error(s, record, event, &ev, now) < 0)
|
||||
+ return -1;
|
||||
+#endif
|
||||
+
|
||||
/* Insert data into the SGBD */
|
||||
#ifdef HAVE_SQLITE3
|
||||
ras_store_arm_record(ras, &ev);
|
||||
diff --git a/ras-arm-handler.h b/ras-arm-handler.h
|
||||
index 563a2d3..52813e7 100644
|
||||
--- a/ras-arm-handler.h
|
||||
+++ b/ras-arm-handler.h
|
||||
@@ -17,6 +17,24 @@
|
||||
#include "ras-events.h"
|
||||
#include "libtrace/event-parse.h"
|
||||
|
||||
+/*
|
||||
+ * ARM Processor Error Information Structure, According to
|
||||
+ * UEFI_2_9 specification chapter N2.4.4.
|
||||
+ */
|
||||
+#pragma pack(1)
|
||||
+struct ras_arm_err_info {
|
||||
+ uint8_t version;
|
||||
+ uint8_t length;
|
||||
+ uint16_t validation_bits;
|
||||
+ uint8_t type;
|
||||
+ uint16_t multiple_error;
|
||||
+ uint8_t flags;
|
||||
+ uint64_t error_info;
|
||||
+ uint64_t virt_fault_addr;
|
||||
+ uint64_t physical_fault_addr;
|
||||
+};
|
||||
+#pragma pack()
|
||||
+
|
||||
int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context);
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
new file mode 100644
|
||||
index 0000000..abcf451
|
||||
--- /dev/null
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -0,0 +1,388 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <fcntl.h>
|
||||
+#include <errno.h>
|
||||
+#include <unistd.h>
|
||||
+#include <limits.h>
|
||||
+#include <ctype.h>
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
+
|
||||
+#define SECOND_OF_MON (30 * 24 * 60 * 60)
|
||||
+#define SECOND_OF_DAY (24 * 60 * 60)
|
||||
+#define SECOND_OF_HOU (60 * 60)
|
||||
+#define SECOND_OF_MIN (60)
|
||||
+
|
||||
+#define LIMIT_OF_CPU_THRESHOLD 10000
|
||||
+#define INIT_OF_CPU_THRESHOLD 18
|
||||
+#define DEC_CHECK 10
|
||||
+#define LAST_BIT_OF_UL 5
|
||||
+
|
||||
+static struct cpu_info *cpu_infos;
|
||||
+static unsigned int ncores;
|
||||
+static unsigned int enabled = 1;
|
||||
+static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
+
|
||||
+static const struct param normal_units[] = {
|
||||
+ {"", 1},
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static const struct param cycle_units[] = {
|
||||
+ {"d", SECOND_OF_DAY},
|
||||
+ {"h", SECOND_OF_HOU},
|
||||
+ {"m", SECOND_OF_MIN},
|
||||
+ {"s", 1},
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param threshold = {
|
||||
+ .name = "CPU_CE_THRESHOLD",
|
||||
+ .units = normal_units,
|
||||
+ .value = INIT_OF_CPU_THRESHOLD,
|
||||
+ .limit = LIMIT_OF_CPU_THRESHOLD
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param cpu_limit = {
|
||||
+ .name = "CPU_ISOLATION_LIMIT",
|
||||
+ .units = normal_units
|
||||
+};
|
||||
+
|
||||
+static struct isolation_param cycle = {
|
||||
+ .name = "CPU_ISOLATION_CYCLE",
|
||||
+ .units = cycle_units,
|
||||
+ .value = SECOND_OF_DAY,
|
||||
+ .limit = SECOND_OF_MON
|
||||
+};
|
||||
+
|
||||
+static const char * const cpu_state[] = {
|
||||
+ [CPU_OFFLINE] = "offline",
|
||||
+ [CPU_ONLINE] = "online",
|
||||
+ [CPU_OFFLINE_FAILED] = "offline-failed",
|
||||
+ [CPU_UNKNOWN] = "unknown"
|
||||
+};
|
||||
+
|
||||
+static int open_sys_file(unsigned int cpu, int __oflag, const char *format)
|
||||
+{
|
||||
+ int fd;
|
||||
+ char path[MAX_PATH_LEN + 1] = "";
|
||||
+ char real_path[MAX_PATH_LEN + 1] = "";
|
||||
+
|
||||
+ snprintf(path, sizeof(path), format, cpu);
|
||||
+ if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) {
|
||||
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ fd = open(real_path, __oflag);
|
||||
+ if (fd == -1) {
|
||||
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, real_path);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return fd;
|
||||
+}
|
||||
+
|
||||
+static int get_cpu_status(unsigned int cpu)
|
||||
+{
|
||||
+ int fd, num;
|
||||
+ char buf[2] = "";
|
||||
+
|
||||
+ fd = open_sys_file(cpu, O_RDONLY, cpu_path_format);
|
||||
+ if (fd == -1)
|
||||
+ return CPU_UNKNOWN;
|
||||
+
|
||||
+ if (read(fd, buf, 1) <= 0 || sscanf(buf, "%d", &num) != 1)
|
||||
+ num = CPU_UNKNOWN;
|
||||
+
|
||||
+ close(fd);
|
||||
+
|
||||
+ return (num < 0 || num > CPU_UNKNOWN) ? CPU_UNKNOWN : num;
|
||||
+}
|
||||
+
|
||||
+static int init_cpu_info(unsigned int cpus)
|
||||
+{
|
||||
+ ncores = cpus;
|
||||
+ cpu_infos = (struct cpu_info *)malloc(sizeof(*cpu_infos) * cpus);
|
||||
+ if (!cpu_infos) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to allocate memory for cpu infos in %s.\n", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ for (unsigned int i = 0; i < cpus; ++i) {
|
||||
+ cpu_infos[i].ce_nums = 0;
|
||||
+ cpu_infos[i].state = get_cpu_status(i);
|
||||
+ cpu_infos[i].ce_queue = init_queue();
|
||||
+
|
||||
+ if (cpu_infos[i].ce_queue == NULL) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to allocate memory for cpu ce queue in %s.\n", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+ /* set the limit on offlined cpus according to the number of cpus */
|
||||
+ cpu_limit.limit = cpus - 1;
|
||||
+ cpu_limit.value = 0;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void check_config(struct isolation_param *config)
|
||||
+{
|
||||
+ if (config->value > config->limit) {
|
||||
+ log(TERM, LOG_WARNING, "Value: %lu exceed limit: %lu, set to limit\n",
|
||||
+ config->value, config->limit);
|
||||
+ config->value = config->limit;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int parse_ul_config(struct isolation_param *config, char *env, unsigned long *value)
|
||||
+{
|
||||
+ char *unit = NULL;
|
||||
+ int env_size, has_unit = 0;
|
||||
+
|
||||
+ if (!env || strlen(env) == 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ env_size = strlen(env);
|
||||
+ unit = env + env_size - 1;
|
||||
+
|
||||
+ if (isalpha(*unit)) {
|
||||
+ has_unit = 1;
|
||||
+ env_size--;
|
||||
+ if (env_size <= 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < env_size; ++i) {
|
||||
+ if (isdigit(env[i])) {
|
||||
+ if (*value > ULONG_MAX / DEC_CHECK ||
|
||||
+ (*value == ULONG_MAX / DEC_CHECK && env[i] - '0' > LAST_BIT_OF_UL)) {
|
||||
+ log(TERM, LOG_ERR, "%s is out of range: %lu\n", env, ULONG_MAX);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ *value = DEC_CHECK * (*value) + (env[i] - '0');
|
||||
+ } else
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ if (!has_unit)
|
||||
+ return 0;
|
||||
+
|
||||
+ for (const struct param *units = config->units; units->name; units++) {
|
||||
+ /* value character and unit character are both valid */
|
||||
+ if (!strcasecmp(unit, units->name)) {
|
||||
+ if (*value > (ULONG_MAX / units->value)) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "%s is out of range: %lu\n", env, ULONG_MAX);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ *value = (*value) * units->value;
|
||||
+ return 0;
|
||||
+ }
|
||||
+ }
|
||||
+ log(TERM, LOG_ERR, "Invalid unit %s\n", unit);
|
||||
+ return -1;
|
||||
+}
|
||||
+
|
||||
+static void init_config(struct isolation_param *config)
|
||||
+{
|
||||
+ char *env = getenv(config->name);
|
||||
+ unsigned long value = 0;
|
||||
+
|
||||
+ if (parse_ul_config(config, env, &value) < 0) {
|
||||
+ log(TERM, LOG_ERR, "Invalid %s: %s! Use default value %lu.\n",
|
||||
+ config->name, env, config->value);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ config->value = value;
|
||||
+ check_config(config);
|
||||
+}
|
||||
+
|
||||
+static int check_config_status(void)
|
||||
+{
|
||||
+ char *env = getenv("CPU_ISOLATION_ENABLE");
|
||||
+
|
||||
+ if (env == NULL || strcasecmp(env, "yes"))
|
||||
+ return -1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+void ras_cpu_isolation_init(unsigned int cpus)
|
||||
+{
|
||||
+ if (init_cpu_info(cpus) < 0 || check_config_status() < 0) {
|
||||
+ enabled = 0;
|
||||
+ log(TERM, LOG_WARNING, "Cpu fault isolation is disabled\n");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ log(TERM, LOG_INFO, "Cpu fault isolation is enabled\n");
|
||||
+ init_config(&threshold);
|
||||
+ init_config(&cpu_limit);
|
||||
+ init_config(&cycle);
|
||||
+}
|
||||
+
|
||||
+void cpu_infos_free(void)
|
||||
+{
|
||||
+ if (cpu_infos) {
|
||||
+ for (int i = 0; i < ncores; ++i)
|
||||
+ free_queue(cpu_infos[i].ce_queue);
|
||||
+
|
||||
+ free(cpu_infos);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int do_cpu_offline(unsigned int cpu)
|
||||
+{
|
||||
+ int fd, rc;
|
||||
+ char buf[2] = "";
|
||||
+
|
||||
+ cpu_infos[cpu].state = CPU_OFFLINE_FAILED;
|
||||
+ fd = open_sys_file(cpu, O_RDWR, cpu_path_format);
|
||||
+ if (fd == -1)
|
||||
+ return HANDLE_FAILED;
|
||||
+
|
||||
+ strcpy(buf, "0");
|
||||
+ rc = write(fd, buf, strlen(buf));
|
||||
+ if (rc < 0) {
|
||||
+ log(TERM, LOG_ERR, "cpu%u offline failed, errno:%d\n", cpu, errno);
|
||||
+ close(fd);
|
||||
+ return HANDLE_FAILED;
|
||||
+ }
|
||||
+
|
||||
+ close(fd);
|
||||
+ /* check whether the cpu is isolated successfully */
|
||||
+ cpu_infos[cpu].state = get_cpu_status(cpu);
|
||||
+
|
||||
+ if (cpu_infos[cpu].state == CPU_OFFLINE)
|
||||
+ return HANDLE_SUCCEED;
|
||||
+
|
||||
+ return HANDLE_FAILED;
|
||||
+}
|
||||
+
|
||||
+static int do_ce_handler(unsigned int cpu)
|
||||
+{
|
||||
+ struct link_queue *queue = cpu_infos[cpu].ce_queue;
|
||||
+ unsigned int tmp;
|
||||
+ /*
|
||||
+ * Since we just count all error numbers in the set cycle, we store the time
|
||||
+ * and error numbers from current event to the queue, then everytime we
|
||||
+ * calculate the period from beginning time to ending time, if the period
|
||||
+ * exceeds the set cycle, we pop the beginning time and error until the period
|
||||
+ * from new beginning time to ending time is less than cycle.
|
||||
+ */
|
||||
+ while (queue->head && queue->tail && queue->tail->time - queue->head->time > cycle.value) {
|
||||
+ tmp = queue->head->value;
|
||||
+ if (pop(queue) == 0)
|
||||
+ cpu_infos[cpu].ce_nums -= tmp;
|
||||
+ }
|
||||
+ log(TERM, LOG_INFO,
|
||||
+ "Current number of Corrected Errors in cpu%d in the cycle is %lu\n",
|
||||
+ cpu, cpu_infos[cpu].ce_nums);
|
||||
+
|
||||
+ if (cpu_infos[cpu].ce_nums >= threshold.value) {
|
||||
+ log(TERM, LOG_INFO,
|
||||
+ "Corrected Errors exceeded threshold %lu, try to offline cpu%u\n",
|
||||
+ threshold.value, cpu);
|
||||
+ return do_cpu_offline(cpu);
|
||||
+ }
|
||||
+ return HANDLE_NOTHING;
|
||||
+}
|
||||
+
|
||||
+static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
+{
|
||||
+ int ret = HANDLE_NOTHING;
|
||||
+
|
||||
+ switch (err_info->err_type) {
|
||||
+ case CE:
|
||||
+ ret = do_ce_handler(cpu);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static void record_error_info(unsigned int cpu, struct error_info *err_info)
|
||||
+{
|
||||
+ switch (err_info->err_type) {
|
||||
+ case CE:
|
||||
+ {
|
||||
+ struct queue_node *node = node_create(err_info->time, err_info->nums);
|
||||
+
|
||||
+ if (node == NULL) {
|
||||
+ log(TERM, LOG_ERR, "Fail to allocate memory for queue node\n");
|
||||
+ return;
|
||||
+ }
|
||||
+ push(cpu_infos[cpu].ce_queue, node);
|
||||
+ cpu_infos[cpu].ce_nums += err_info->nums;
|
||||
+ break;
|
||||
+ }
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
+{
|
||||
+ int ret;
|
||||
+
|
||||
+ if (enabled == 0)
|
||||
+ return;
|
||||
+
|
||||
+ if (cpu >= ncores || cpu < 0) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "The current cpu %d has exceed the total number of cpu:%u\n", cpu, ncores);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ log(TERM, LOG_INFO, "Handling error on cpu%d\n", cpu);
|
||||
+ cpu_infos[cpu].state = get_cpu_status(cpu);
|
||||
+
|
||||
+ if (cpu_infos[cpu].state != CPU_ONLINE) {
|
||||
+ log(TERM, LOG_INFO, "Cpu%d is not online or unknown, ignore\n", cpu);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ record_error_info(cpu, err_info);
|
||||
+ /*
|
||||
+ * Since user may change cpu state, we get current offlined
|
||||
+ * cpu numbers every recording time.
|
||||
+ */
|
||||
+ if (ncores - sysconf(_SC_NPROCESSORS_ONLN) >= cpu_limit.value) {
|
||||
+ log(TERM, LOG_WARNING,
|
||||
+ "Offlined cpus have exceeded limit: %lu, choose to do nothing\n",
|
||||
+ cpu_limit.value);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ ret = error_handler(cpu, err_info);
|
||||
+ if (ret == HANDLE_NOTHING)
|
||||
+ log(TERM, LOG_WARNING, "Doing nothing in the cpu%d\n", cpu);
|
||||
+ else if (ret == HANDLE_SUCCEED) {
|
||||
+ log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n",
|
||||
+ cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
+ clear_queue(cpu_infos[cpu].ce_queue);
|
||||
+ cpu_infos[cpu].ce_nums = 0;
|
||||
+ } else
|
||||
+ log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
|
||||
+ cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
+}
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
new file mode 100644
|
||||
index 0000000..1159853
|
||||
--- /dev/null
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -0,0 +1,68 @@
|
||||
+/*
|
||||
+ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_CPU_ISOLATION_H
|
||||
+#define __RAS_CPU_ISOLATION_H
|
||||
+
|
||||
+#include "queue.h"
|
||||
+
|
||||
+#define MAX_PATH_LEN 100
|
||||
+#define MAX_BUF_LEN 1024
|
||||
+
|
||||
+struct param {
|
||||
+ char *name;
|
||||
+ unsigned long value;
|
||||
+};
|
||||
+
|
||||
+struct isolation_param {
|
||||
+ char *name;
|
||||
+ const struct param *units;
|
||||
+ unsigned long value;
|
||||
+ unsigned long limit;
|
||||
+};
|
||||
+
|
||||
+enum cpu_state {
|
||||
+ CPU_OFFLINE,
|
||||
+ CPU_ONLINE,
|
||||
+ CPU_OFFLINE_FAILED,
|
||||
+ CPU_UNKNOWN,
|
||||
+};
|
||||
+
|
||||
+enum error_handle_result {
|
||||
+ HANDLE_FAILED = -1,
|
||||
+ HANDLE_SUCCEED,
|
||||
+ HANDLE_NOTHING,
|
||||
+};
|
||||
+
|
||||
+enum error_type {
|
||||
+ CE = 1
|
||||
+};
|
||||
+
|
||||
+struct cpu_info {
|
||||
+ unsigned long ce_nums;
|
||||
+ struct link_queue *ce_queue;
|
||||
+ enum cpu_state state;
|
||||
+};
|
||||
+
|
||||
+struct error_info {
|
||||
+ unsigned long nums;
|
||||
+ time_t time;
|
||||
+ enum error_type err_type;
|
||||
+};
|
||||
+
|
||||
+void ras_cpu_isolation_init(unsigned int cpus);
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
|
||||
+void cpu_infos_free(void);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 39cab20..beda655 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -42,6 +42,7 @@
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
#include "ras-page-isolation.h"
|
||||
+#include "ras-cpu-isolation.h"
|
||||
|
||||
/*
|
||||
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
|
||||
@@ -856,6 +857,10 @@ int handle_ras_events(int record_events)
|
||||
|
||||
cpus = get_num_cpus(ras);
|
||||
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ ras_cpu_isolation_init(cpus);
|
||||
+#endif
|
||||
+
|
||||
#ifdef HAVE_MCE
|
||||
rc = register_mce_handler(ras, cpus);
|
||||
if (rc)
|
||||
@@ -982,6 +987,8 @@ err:
|
||||
}
|
||||
free(ras);
|
||||
}
|
||||
-
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ cpu_infos_free();
|
||||
+#endif
|
||||
return rc;
|
||||
}
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,95 @@
|
||||
From 2eea64bc7437b0a5dabff52632a372446ddc4765 Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Thu, 11 May 2023 10:54:26 +0800
|
||||
Subject: [PATCH 1/3] rasdaemon: fix return value type issue of read/write
|
||||
function from unistd.h
|
||||
|
||||
The return value type of read/write function from unistd.h is ssize_t.
|
||||
It is normally signed, and the functions return -1 on error. Fix the incorrect use in the
|
||||
function read_ras_event_all_cpus().
|
||||
|
||||
Also, move the setting of buffer_percent into a separate function.
|
||||
|
||||
Fixes: 94750bcf9309 ("rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks indefinitely")
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
ras-events.c | 45 ++++++++++++++++++++++++++++++---------------
|
||||
1 file changed, 30 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 6e928a3..d08bf37 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -376,10 +376,37 @@ static int get_num_cpus(struct ras_events *ras)
|
||||
#endif
|
||||
}
|
||||
|
||||
+static int set_buffer_percent(struct ras_events *ras, int percent)
|
||||
+{
|
||||
+ char buf[16];
|
||||
+ ssize_t size;
|
||||
+ int res = 0;
|
||||
+ int fd;
|
||||
+
|
||||
+ fd = open_trace(ras, "buffer_percent", O_WRONLY);
|
||||
+ if (fd >= 0) {
|
||||
+ /* For the backward compatibility to the old kernels, do not return
|
||||
+ * if fail to set the buffer_percent.
|
||||
+ */
|
||||
+ snprintf(buf, sizeof(buf), "%d", percent);
|
||||
+ size = write(fd, buf, strlen(buf));
|
||||
+ if (size <= 0) {
|
||||
+ log(TERM, LOG_WARNING, "can't write to buffer_percent\n");
|
||||
+ res = -1;
|
||||
+ }
|
||||
+ close(fd);
|
||||
+ } else {
|
||||
+ log(TERM, LOG_WARNING, "Can't open buffer_percent\n");
|
||||
+ res = -1;
|
||||
+ }
|
||||
+
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
unsigned n_cpus)
|
||||
{
|
||||
- unsigned size;
|
||||
+ ssize_t size;
|
||||
unsigned long long time_stamp;
|
||||
void *data;
|
||||
int ready, i, count_nready;
|
||||
@@ -391,8 +418,6 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
int warnonce[n_cpus];
|
||||
char pipe_raw[PATH_MAX];
|
||||
int legacy_kernel = 0;
|
||||
- int fd;
|
||||
- char buf[16];
|
||||
#if 0
|
||||
int need_sleep = 0;
|
||||
#endif
|
||||
@@ -419,18 +444,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
* Set buffer_percent to 0 so that poll() will return immediately
|
||||
* when the trace data is available in the ras per_cpu trace pipe_raw
|
||||
*/
|
||||
- fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY);
|
||||
- if (fd >= 0) {
|
||||
- /* For the backward compatibility to the old kernels, do not return
|
||||
- * if fail to set the buffer_percent.
|
||||
- */
|
||||
- snprintf(buf, sizeof(buf), "0");
|
||||
- size = write(fd, buf, strlen(buf));
|
||||
- if (size <= 0)
|
||||
- log(TERM, LOG_WARNING, "can't write to buffer_percent\n");
|
||||
- close(fd);
|
||||
- } else
|
||||
- log(TERM, LOG_WARNING, "Can't open buffer_percent\n");
|
||||
+ if (set_buffer_percent(pdata[0].ras, 0))
|
||||
+ log(TERM, LOG_WARNING, "Set buffer_percent failed\n");
|
||||
|
||||
for (i = 0; i < (n_cpus + 1); i++)
|
||||
fds[i].fd = -1;
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,46 @@
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Sat, 20 Aug 2022 09:49:25 +0000
|
||||
Subject: [PATCH] rasdaemon: use standard length PATH_MAX for path name
|
||||
|
||||
Use standard length PATH_MAX for path name space allocation
|
||||
to replace the macro MAX_PATH_LEN.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
---
|
||||
ras-cpu-isolation.c | 6 +++---
|
||||
ras-cpu-isolation.h | 1 -
|
||||
2 files changed, 3 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index ba5ccd1..24c07e9 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -80,11 +80,11 @@ static const char * const cpu_state[] = {
|
||||
static int open_sys_file(unsigned int cpu, int __oflag, const char *format)
|
||||
{
|
||||
int fd;
|
||||
- char path[MAX_PATH_LEN + 1] = "";
|
||||
- char real_path[MAX_PATH_LEN + 1] = "";
|
||||
+ char path[PATH_MAX] = "";
|
||||
+ char real_path[PATH_MAX] = "";
|
||||
|
||||
snprintf(path, sizeof(path), format, cpu);
|
||||
- if (strlen(path) > MAX_PATH_LEN || realpath(path, real_path) == NULL) {
|
||||
+ if (strlen(path) > PATH_MAX || realpath(path, real_path) == NULL) {
|
||||
log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
|
||||
return -1;
|
||||
}
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
index 024a68b..5682106 100644
|
||||
--- a/ras-cpu-isolation.h
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -17,7 +17,6 @@
|
||||
|
||||
#include "queue.h"
|
||||
|
||||
-#define MAX_PATH_LEN 100
|
||||
#define MAX_BUF_LEN 1024
|
||||
|
||||
struct param {
|
||||
--
|
||||
2.17.1
|
||||
@ -0,0 +1,85 @@
|
||||
From 6986d818e6d2c846c001fc7211b5a4153e5ecd11 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 4 Feb 2023 19:15:55 +0000
|
||||
Subject: [PATCH] rasdaemon: Fix poll() on per_cpu trace_pipe_raw blocks
|
||||
indefinitely
|
||||
|
||||
The error events are not received in the rasdaemon since kernel 6.1-rc6.
|
||||
This issue was first detected and reported when testing the CXL error
|
||||
events in the rasdaemon.
|
||||
|
||||
Debugging showed that poll() on trace_pipe_raw in ras-events.c does not
|
||||
return and this issue is seen after the commit
|
||||
42fb0a1e84ff525ebe560e2baf9451ab69127e2b ("tracing/ring-buffer: Have
|
||||
polling block on watermark").
|
||||
|
||||
This issue is also verified using a test application for poll()
|
||||
and select() on per_cpu trace_pipe_raw.
|
||||
|
||||
There is also a bug reported on this issue,
|
||||
https://lore.kernel.org/all/31eb3b12-3350-90a4-a0d9-d1494db7cf74@oracle.com/
|
||||
|
||||
This issue occurs for the per_cpu case, which calls the ring_buffer_poll_wait(),
|
||||
in kernel/trace/ring_buffer.c, with the buffer_percent > 0 and then wait until
|
||||
the percentage of pages are available. The default value set for the
|
||||
buffer_percent is 50 in the kernel/trace/trace.c. However poll() does not return
|
||||
even when the percentage-of-pages condition is met.
|
||||
|
||||
As a fix, rasdaemon sets buffer_percent to 0 through the
|
||||
/sys/kernel/debug/tracing/instances/rasdaemon/buffer_percent, then the
|
||||
task will wake up as soon as data is added to any of the specific cpu
|
||||
buffer and poll() on per_cpu/cpuX/trace_pipe_raw does not block
|
||||
indefinitely.
|
||||
|
||||
Dependency on the kernel fix commit
|
||||
3e46d910d8acf94e5360126593b68bf4fee4c4a1("tracing: Fix poll() and select()
|
||||
do not work on per_cpu trace_pipe and trace_pipe_raw")
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
ras-events.c | 22 ++++++++++++++++++++++
|
||||
1 file changed, 22 insertions(+)
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 39f9ce2..49e4f9a 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -376,6 +376,8 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
int warnonce[n_cpus];
|
||||
char pipe_raw[PATH_MAX];
|
||||
int legacy_kernel = 0;
|
||||
+ int fd;
|
||||
+ char buf[16];
|
||||
#if 0
|
||||
int need_sleep = 0;
|
||||
#endif
|
||||
@@ -395,6 +397,26 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
+ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks
|
||||
+ * indefinitely with the default buffer_percent in the kernel trace system,
|
||||
+ * which is introduced by the following change in the kernel.
|
||||
+ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u.
|
||||
+ * Set buffer_percent to 0 so that poll() will return immediately
|
||||
+ * when the trace data is available in the ras per_cpu trace pipe_raw
|
||||
+ */
|
||||
+ fd = open_trace(pdata[0].ras, "buffer_percent", O_WRONLY);
|
||||
+ if (fd >= 0) {
|
||||
+ /* For the backward compatibility to the old kernels, do not return
|
||||
+ * if fail to set the buffer_percent.
|
||||
+ */
|
||||
+ snprintf(buf, sizeof(buf), "0");
|
||||
+ size = write(fd, buf, strlen(buf));
|
||||
+ if (size <= 0)
|
||||
+ log(TERM, LOG_WARNING, "can't write to buffer_percent\n");
|
||||
+ close(fd);
|
||||
+ } else
|
||||
+ log(TERM, LOG_WARNING, "Can't open buffer_percent\n");
|
||||
+
|
||||
for (i = 0; i < (n_cpus + 1); i++)
|
||||
fds[i].fd = -1;
|
||||
|
||||
--
|
||||
2.25.1
|
||||
|
||||
150
0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch
Normal file
150
0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch
Normal file
@ -0,0 +1,150 @@
|
||||
From fefa2d689f96302e64ad2375695703039e2ca951 Mon Sep 17 00:00:00 2001
|
||||
From: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Date: Wed, 23 Feb 2022 17:23:27 +0800
|
||||
Subject: [PATCH 02/10] rasdaemon: Support cpu fault isolation for recoverable
|
||||
errors
|
||||
|
||||
When recoverable errors occur in a cpu core, try to offline
|
||||
the related cpu core.
|
||||
|
||||
Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
|
||||
Signed-off-by: Junchong Pan <panjunchong@hisilicon.com>
|
||||
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
ras-arm-handler.c | 22 +++++++++++++++++++---
|
||||
ras-cpu-isolation.c | 17 +++++++++++++++++
|
||||
ras-cpu-isolation.h | 4 +++-
|
||||
3 files changed, 39 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 9c7a3c3..a0dfc51 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
#define ARM_ERR_VALID_ERROR_COUNT BIT(0)
|
||||
#define ARM_ERR_VALID_FLAGS BIT(1)
|
||||
+#define BIT2 2
|
||||
|
||||
void display_raw_data(struct trace_seq *s,
|
||||
const uint8_t *buf,
|
||||
@@ -47,7 +48,20 @@ void display_raw_data(struct trace_seq *s,
|
||||
}
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
-static int count_errors(struct ras_arm_event *ev)
|
||||
+static int is_core_failure(struct ras_arm_err_info *err_info)
|
||||
+{
|
||||
+ if (err_info->validation_bits & ARM_ERR_VALID_FLAGS) {
|
||||
+ /*
|
||||
+ * core failure:
|
||||
+ * Bit 0\1\3: (at lease 1)
|
||||
+ * Bit 2: 0
|
||||
+ */
|
||||
+ return (err_info->flags & 0xf) && !(err_info->flags & (0x1 << BIT2));
|
||||
+ }
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int count_errors(struct ras_arm_event *ev, int sev)
|
||||
{
|
||||
struct ras_arm_err_info *err_info;
|
||||
int num_pei;
|
||||
@@ -75,6 +89,8 @@ static int count_errors(struct ras_arm_event *ev)
|
||||
*/
|
||||
error_count = err_info->multiple_error + 1;
|
||||
}
|
||||
+ if (sev == GHES_SEV_RECOVERABLE && !is_core_failure(err_info))
|
||||
+ error_count = 0;
|
||||
|
||||
num += error_count;
|
||||
err_info += 1;
|
||||
@@ -118,8 +134,8 @@ static int ras_handle_cpu_error(struct trace_seq *s,
|
||||
}
|
||||
trace_seq_printf(s, "\n severity: %s", severity);
|
||||
|
||||
- if (val == GHES_SEV_CORRECTED) {
|
||||
- int nums = count_errors(ev);
|
||||
+ if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
|
||||
+ int nums = count_errors(ev, val);
|
||||
|
||||
if (nums > 0) {
|
||||
err_info.nums = nums;
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index abcf451..fd23e4e 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -126,6 +126,7 @@ static int init_cpu_info(unsigned int cpus)
|
||||
|
||||
for (unsigned int i = 0; i < cpus; ++i) {
|
||||
cpu_infos[i].ce_nums = 0;
|
||||
+ cpu_infos[i].uce_nums = 0;
|
||||
cpu_infos[i].state = get_cpu_status(i);
|
||||
cpu_infos[i].ce_queue = init_queue();
|
||||
|
||||
@@ -306,6 +307,15 @@ static int do_ce_handler(unsigned int cpu)
|
||||
return HANDLE_NOTHING;
|
||||
}
|
||||
|
||||
+static int do_uce_handler(unsigned int cpu)
|
||||
+{
|
||||
+ if (cpu_infos[cpu].uce_nums > 0) {
|
||||
+ log(TERM, LOG_INFO, "Uncorrected Errors occurred, try to offline cpu%u\n", cpu);
|
||||
+ return do_cpu_offline(cpu);
|
||||
+ }
|
||||
+ return HANDLE_NOTHING;
|
||||
+}
|
||||
+
|
||||
static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
{
|
||||
int ret = HANDLE_NOTHING;
|
||||
@@ -314,6 +324,9 @@ static int error_handler(unsigned int cpu, struct error_info *err_info)
|
||||
case CE:
|
||||
ret = do_ce_handler(cpu);
|
||||
break;
|
||||
+ case UCE:
|
||||
+ ret = do_uce_handler(cpu);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -336,6 +349,9 @@ static void record_error_info(unsigned int cpu, struct error_info *err_info)
|
||||
cpu_infos[cpu].ce_nums += err_info->nums;
|
||||
break;
|
||||
}
|
||||
+ case UCE:
|
||||
+ cpu_infos[cpu].uce_nums++;
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -382,6 +398,7 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
clear_queue(cpu_infos[cpu].ce_queue);
|
||||
cpu_infos[cpu].ce_nums = 0;
|
||||
+ cpu_infos[cpu].uce_nums = 0;
|
||||
} else
|
||||
log(TERM, LOG_WARNING, "Offline cpu%d fail, the state is %s\n",
|
||||
cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
index 1159853..024a68b 100644
|
||||
--- a/ras-cpu-isolation.h
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -46,10 +46,12 @@ enum error_handle_result {
|
||||
};
|
||||
|
||||
enum error_type {
|
||||
- CE = 1
|
||||
+ CE = 1,
|
||||
+ UCE
|
||||
};
|
||||
|
||||
struct cpu_info {
|
||||
+ unsigned long uce_nums;
|
||||
unsigned long ce_nums;
|
||||
struct link_queue *ce_queue;
|
||||
enum cpu_state state;
|
||||
--
|
||||
2.25.1
|
||||
|
||||
114
0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
Normal file
114
0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
Normal file
@ -0,0 +1,114 @@
|
||||
From ad9c1bc8ea907d6faebfb916916b5f898a8e0518 Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Tue, 30 May 2023 11:44:12 +0100
|
||||
Subject: [PATCH 2/3] rasdaemon: fix issue of signed and unsigned integer
|
||||
comparison and remove redundant header file
|
||||
|
||||
1. The return value of ARRAY_SIZE() is an unsigned integer. It isn't right to
|
||||
compare it with a signed integer. This patch fixes those comparisons.
|
||||
|
||||
2. Remove a redundant header file and adjust the order of the header files.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
non-standard-hisi_hip08.c | 2 +-
|
||||
non-standard-hisilicon.c | 8 ++++----
|
||||
ras-diskerror-handler.c | 2 +-
|
||||
ras-memory-failure-handler.c | 7 +++----
|
||||
4 files changed, 9 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 4ef47ea..61f12eb 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -1029,7 +1029,7 @@ static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = {
|
||||
|
||||
static void __attribute__((constructor)) hip08_init(void)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++)
|
||||
register_ns_ev_decoder(&hip08_ns_ev_decoder[i]);
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 6ee9271..0d5fe6b 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -362,13 +362,13 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
trace_seq_printf(s, "%s\n", hevent.error_msg);
|
||||
|
||||
if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) {
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
trace_seq_printf(s, "Register Dump:\n");
|
||||
for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) {
|
||||
- trace_seq_printf(s, "reg%02d=0x%08x\n", i,
|
||||
+ trace_seq_printf(s, "reg%02u=0x%08x\n", i,
|
||||
err->reg_array[i]);
|
||||
- HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x",
|
||||
+ HISI_SNPRINTF(hevent.reg_msg, "reg%02u=0x%08x",
|
||||
i, err->reg_array[i]);
|
||||
}
|
||||
}
|
||||
@@ -394,7 +394,7 @@ static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
|
||||
|
||||
static void __attribute__((constructor)) hisi_ns_init(void)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++)
|
||||
register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]);
|
||||
diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c
|
||||
index b16319f..b46f859 100644
|
||||
--- a/ras-diskerror-handler.c
|
||||
+++ b/ras-diskerror-handler.c
|
||||
@@ -52,7 +52,7 @@ static const struct {
|
||||
|
||||
static const char *get_blk_error(int err)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(blk_errors); i++)
|
||||
if (blk_errors[i].error == err)
|
||||
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
|
||||
index 9941e68..8fd7117 100644
|
||||
--- a/ras-memory-failure-handler.c
|
||||
+++ b/ras-memory-failure-handler.c
|
||||
@@ -15,11 +15,10 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
-#include "libtrace/kbuffer.h"
|
||||
-#include "ras-memory-failure-handler.h"
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
#include "ras-report.h"
|
||||
+#include "ras-memory-failure-handler.h"
|
||||
|
||||
/* Memory failure - various types of pages */
|
||||
enum mf_action_page_type {
|
||||
@@ -99,7 +98,7 @@ static const struct {
|
||||
|
||||
static const char *get_page_type(int page_type)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(mf_page_type); i++)
|
||||
if (mf_page_type[i].type == page_type)
|
||||
@@ -110,7 +109,7 @@ static const char *get_page_type(int page_type)
|
||||
|
||||
static const char *get_action_result(int result)
|
||||
{
|
||||
- int i;
|
||||
+ unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(mf_action_result); i++)
|
||||
if (mf_action_result[i].result == result)
|
||||
--
|
||||
2.25.1
|
||||
|
||||
325
0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
Normal file
325
0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
Normal file
@ -0,0 +1,325 @@
|
||||
From 9fd84965e70b6d245699d36f8ac4f260d87013cb Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Thu, 1 Jun 2023 15:34:53 +0100
|
||||
Subject: [PATCH 3/3] rasdaemon: Add support for creating the vendor error
|
||||
tables at startup
|
||||
|
||||
1. Support creating/opening the vendor error tables at rasdaemon startup.
|
||||
2. Make changes in the HiSilicon error handling code for the same.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
non-standard-hisi_hip08.c | 66 ++++++++++++++++++++++++++------------
|
||||
non-standard-hisilicon.c | 28 ++++++++++------
|
||||
ras-events.c | 17 +++++++++-
|
||||
ras-non-standard-handler.c | 35 +++++++++++++++++++-
|
||||
ras-non-standard-handler.h | 3 ++
|
||||
5 files changed, 116 insertions(+), 33 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 61f12eb..be84c22 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -654,6 +654,20 @@ static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab");
|
||||
}
|
||||
|
||||
+static int add_hip08_oem_type1_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
+ &hip08_oem_type1_event_tab) != SQLITE_OK) {
|
||||
+ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type1_event_tab\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
/* error data decoding functions */
|
||||
static int decode_hip08_oem_type1_error(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
@@ -669,17 +683,6 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras,
|
||||
return -1;
|
||||
}
|
||||
|
||||
-#ifdef HAVE_SQLITE3
|
||||
- if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
- &hip08_oem_type1_event_tab)
|
||||
- != SQLITE_OK) {
|
||||
- trace_seq_printf(s,
|
||||
- "create sql hip08_oem_type1_event_tab fail\n");
|
||||
- return -1;
|
||||
- }
|
||||
- }
|
||||
-#endif
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
@@ -827,6 +830,20 @@ static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab");
|
||||
}
|
||||
|
||||
+static int add_hip08_oem_type2_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
+ &hip08_oem_type2_event_tab) != SQLITE_OK) {
|
||||
+ log(TERM, LOG_WARNING, "Failed to create sql hip08_oem_type2_event_tab\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int decode_hip08_oem_type2_error(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
@@ -977,6 +994,20 @@ static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab");
|
||||
}
|
||||
|
||||
+static int add_hip08_pcie_local_table(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
+ &hip08_pcie_local_event_tab) != SQLITE_OK) {
|
||||
+ log(TERM, LOG_WARNING, "Failed to create sql hip08_pcie_local_event_tab\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
@@ -991,16 +1022,6 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
return -1;
|
||||
}
|
||||
|
||||
-#ifdef HAVE_SQLITE3
|
||||
- if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
- &hip08_pcie_local_event_tab) != SQLITE_OK) {
|
||||
- trace_seq_printf(s,
|
||||
- "create sql hip08_pcie_local_event_tab fail\n");
|
||||
- return -1;
|
||||
- }
|
||||
- }
|
||||
-#endif
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_PCIE_LOCAL_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
@@ -1015,14 +1036,17 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = {
|
||||
{
|
||||
.sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5",
|
||||
+ .add_table = add_hip08_oem_type1_table,
|
||||
.decode = decode_hip08_oem_type1_error,
|
||||
},
|
||||
{
|
||||
.sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d",
|
||||
+ .add_table = add_hip08_oem_type2_table,
|
||||
.decode = decode_hip08_oem_type2_error,
|
||||
},
|
||||
{
|
||||
.sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772",
|
||||
+ .add_table = add_hip08_pcie_local_table,
|
||||
.decode = decode_hip08_pcie_local_error,
|
||||
},
|
||||
};
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 0d5fe6b..0ddb5ec 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -337,6 +337,23 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
HISI_SNPRINTF(event->error_msg, "]");
|
||||
}
|
||||
|
||||
+static int add_hisi_common_table(struct ras_events *ras,
|
||||
+ struct ras_ns_ev_decoder *ev_decoder)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (ras->record_events &&
|
||||
+ !ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
+ &hisi_common_section_tab) != SQLITE_OK) {
|
||||
+ log(TERM, LOG_WARNING, "Failed to create sql hisi_common_section_tab\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int decode_hisi_common_section(struct ras_events *ras,
|
||||
struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
@@ -346,16 +363,6 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
(struct hisi_common_error_section *)event->error;
|
||||
struct hisi_event hevent;
|
||||
|
||||
-#ifdef HAVE_SQLITE3
|
||||
- if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
- &hisi_common_section_tab) != SQLITE_OK) {
|
||||
- trace_seq_printf(s, "create sql hisi_common_section_tab fail\n");
|
||||
- return -1;
|
||||
- }
|
||||
- }
|
||||
-#endif
|
||||
-
|
||||
memset(&hevent, 0, sizeof(struct hisi_event));
|
||||
trace_seq_printf(s, "\nHisilicon Common Error Section:\n");
|
||||
decode_hisi_common_section_hdr(ev_decoder, err, &hevent);
|
||||
@@ -388,6 +395,7 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
|
||||
{
|
||||
.sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586",
|
||||
+ .add_table = add_hisi_common_table,
|
||||
.decode = decode_hisi_common_section,
|
||||
},
|
||||
};
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index d08bf37..fc54325 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -482,6 +482,10 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
if (pdata[0].ras->record_events) {
|
||||
if (ras_mc_event_opendb(pdata[0].cpu, pdata[0].ras))
|
||||
goto error;
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ if (ras_ns_add_vendor_tables(pdata[0].ras))
|
||||
+ log(TERM, LOG_ERR, "Can't add vendor table\n");
|
||||
+#endif
|
||||
}
|
||||
|
||||
do {
|
||||
@@ -566,8 +570,12 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
"Old kernel detected. Stop listening and fall back to pthread way.\n");
|
||||
|
||||
cleanup:
|
||||
- if (pdata[0].ras->record_events)
|
||||
+ if (pdata[0].ras->record_events) {
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ ras_ns_finalize_vendor_tables();
|
||||
+#endif
|
||||
ras_mc_event_closedb(pdata[0].cpu, pdata[0].ras);
|
||||
+ }
|
||||
|
||||
error:
|
||||
kbuffer_free(kbuf);
|
||||
@@ -664,6 +672,10 @@ static void *handle_ras_events_cpu(void *priv)
|
||||
free(page);
|
||||
return 0;
|
||||
}
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ if (ras_ns_add_vendor_tables(pdata->ras))
|
||||
+ log(TERM, LOG_ERR, "Can't add vendor table\n");
|
||||
+#endif
|
||||
pthread_mutex_unlock(&pdata->ras->db_lock);
|
||||
}
|
||||
|
||||
@@ -671,6 +683,9 @@ static void *handle_ras_events_cpu(void *priv)
|
||||
|
||||
if (pdata->ras->record_events) {
|
||||
pthread_mutex_lock(&pdata->ras->db_lock);
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ ras_ns_finalize_vendor_tables();
|
||||
+#endif
|
||||
ras_mc_event_closedb(pdata->cpu, pdata->ras);
|
||||
pthread_mutex_unlock(&pdata->ras->db_lock);
|
||||
}
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index 6932e58..20d514b 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -75,6 +75,32 @@ int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder)
|
||||
return 0;
|
||||
}
|
||||
|
||||
+int ras_ns_add_vendor_tables(struct ras_events *ras)
|
||||
+{
|
||||
+ struct ras_ns_ev_decoder *ns_ev_decoder;
|
||||
+ int error = 0;
|
||||
+
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (!ras)
|
||||
+ return -1;
|
||||
+
|
||||
+ ns_ev_decoder = ras_ns_ev_dec_list;
|
||||
+ while (ns_ev_decoder) {
|
||||
+ if (ns_ev_decoder->add_table && !ns_ev_decoder->stmt_dec_record) {
|
||||
+ error = ns_ev_decoder->add_table(ras, ns_ev_decoder);
|
||||
+ if (error)
|
||||
+ break;
|
||||
+ }
|
||||
+ ns_ev_decoder = ns_ev_decoder->next;
|
||||
+ }
|
||||
+
|
||||
+ if (error)
|
||||
+ return -1;
|
||||
+#endif
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec)
|
||||
{
|
||||
struct ras_ns_ev_decoder *ns_ev_decoder;
|
||||
@@ -96,7 +122,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static void unregister_ns_ev_decoder(void)
|
||||
+void ras_ns_finalize_vendor_tables(void)
|
||||
{
|
||||
#ifdef HAVE_SQLITE3
|
||||
struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list;
|
||||
@@ -108,6 +134,13 @@ static void unregister_ns_ev_decoder(void)
|
||||
}
|
||||
ns_ev_decoder = ns_ev_decoder->next;
|
||||
}
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+static void unregister_ns_ev_decoder(void)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ ras_ns_finalize_vendor_tables();
|
||||
#endif
|
||||
ras_ns_ev_dec_list = NULL;
|
||||
}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
index 57d4cb5..341206a 100644
|
||||
--- a/ras-non-standard-handler.h
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -23,6 +23,7 @@
|
||||
struct ras_ns_ev_decoder {
|
||||
struct ras_ns_ev_decoder *next;
|
||||
const char *sec_type;
|
||||
+ int (*add_table)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder);
|
||||
int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s, struct ras_non_standard_event *event);
|
||||
#ifdef HAVE_SQLITE3
|
||||
@@ -39,6 +40,8 @@ void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index);
|
||||
|
||||
#ifdef HAVE_NON_STANDARD
|
||||
int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder);
|
||||
+int ras_ns_add_vendor_tables(struct ras_events *ras);
|
||||
+void ras_ns_finalize_vendor_tables(void);
|
||||
#else
|
||||
static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
|
||||
#endif
|
||||
--
|
||||
2.25.1
|
||||
|
||||
228
0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
Normal file
228
0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
Normal file
@ -0,0 +1,228 @@
|
||||
From 9c4665f33c39ea84db7d69079ab27205d2fbd07e Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Wed, 2 Mar 2022 12:20:40 +0000
|
||||
Subject: [PATCH 03/10] rasdaemon: Modify recording Hisilicon common error data
|
||||
|
||||
The error statistics for the Hisilicon common
|
||||
error need to be computed based on module, error severity, etc.
|
||||
|
||||
Modify recording Hisilicon common error data as separate fields
|
||||
in the sql db table instead of the combined single field.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
non-standard-hisilicon.c | 126 ++++++++++++++++++++++++++++++++-------
|
||||
1 file changed, 104 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 1432163..d1e1774 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "non-standard-hisilicon.h"
|
||||
|
||||
#define HISI_BUF_LEN 2048
|
||||
+#define HISI_PCIE_INFO_BUF_LEN 256
|
||||
|
||||
struct hisi_common_error_section {
|
||||
uint32_t val_bits;
|
||||
@@ -63,12 +64,25 @@ enum {
|
||||
enum {
|
||||
HISI_COMMON_FIELD_ID,
|
||||
HISI_COMMON_FIELD_TIMESTAMP,
|
||||
- HISI_COMMON_FIELD_ERR_INFO,
|
||||
+ HISI_COMMON_FIELD_VERSION,
|
||||
+ HISI_COMMON_FIELD_SOC_ID,
|
||||
+ HISI_COMMON_FIELD_SOCKET_ID,
|
||||
+ HISI_COMMON_FIELD_TOTEM_ID,
|
||||
+ HISI_COMMON_FIELD_NIMBUS_ID,
|
||||
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
|
||||
+ HISI_COMMON_FIELD_CORE_ID,
|
||||
+ HISI_COMMON_FIELD_PORT_ID,
|
||||
+ HISI_COMMON_FIELD_ERR_TYPE,
|
||||
+ HISI_COMMON_FIELD_PCIE_INFO,
|
||||
+ HISI_COMMON_FIELD_ERR_SEVERITY,
|
||||
HISI_COMMON_FIELD_REGS_DUMP,
|
||||
};
|
||||
|
||||
struct hisi_event {
|
||||
char error_msg[HISI_BUF_LEN];
|
||||
+ char pcie_info[HISI_PCIE_INFO_BUF_LEN];
|
||||
char reg_msg[HISI_BUF_LEN];
|
||||
};
|
||||
|
||||
@@ -132,14 +146,26 @@ int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
static const struct db_fields hisi_common_section_fields[] = {
|
||||
- { .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
- { .name = "timestamp", .type = "TEXT" },
|
||||
- { .name = "err_info", .type = "TEXT" },
|
||||
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
+ { .name = "timestamp", .type = "TEXT" },
|
||||
+ { .name = "version", .type = "INTEGER" },
|
||||
+ { .name = "soc_id", .type = "INTEGER" },
|
||||
+ { .name = "socket_id", .type = "INTEGER" },
|
||||
+ { .name = "totem_id", .type = "INTEGER" },
|
||||
+ { .name = "nimbus_id", .type = "INTEGER" },
|
||||
+ { .name = "sub_system_id", .type = "INTEGER" },
|
||||
+ { .name = "module_id", .type = "TEXT" },
|
||||
+ { .name = "sub_module_id", .type = "INTEGER" },
|
||||
+ { .name = "core_id", .type = "INTEGER" },
|
||||
+ { .name = "port_id", .type = "INTEGER" },
|
||||
+ { .name = "err_type", .type = "INTEGER" },
|
||||
+ { .name = "pcie_info", .type = "TEXT" },
|
||||
+ { .name = "err_severity", .type = "TEXT" },
|
||||
{ .name = "regs_dump", .type = "TEXT" },
|
||||
};
|
||||
|
||||
static const struct db_table_descriptor hisi_common_section_tab = {
|
||||
- .name = "hisi_common_section",
|
||||
+ .name = "hisi_common_section_v2",
|
||||
.fields = hisi_common_section_fields,
|
||||
.num_fields = ARRAY_SIZE(hisi_common_section_fields),
|
||||
};
|
||||
@@ -199,12 +225,20 @@ static const char* get_soc_desc(uint8_t soc_id)
|
||||
return soc_desc[soc_id];
|
||||
}
|
||||
|
||||
-static void decode_module(struct hisi_event *event, uint8_t module_id)
|
||||
+static void decode_module(struct ras_ns_ev_decoder *ev_decoder,
|
||||
+ struct hisi_event *event, uint8_t module_id)
|
||||
{
|
||||
- if (module_id >= sizeof(module_name)/sizeof(char *))
|
||||
+ if (module_id >= sizeof(module_name)/sizeof(char *)) {
|
||||
HISI_SNPRINTF(event->error_msg, "module=unknown(id=%hhu) ", module_id);
|
||||
- else
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ 0, "unknown");
|
||||
+ } else {
|
||||
HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_MODULE_ID,
|
||||
+ 0, module_name[module_id]);
|
||||
+ }
|
||||
}
|
||||
|
||||
static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
@@ -212,43 +246,93 @@ static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct hisi_event *event)
|
||||
{
|
||||
HISI_SNPRINTF(event->error_msg, "[ table_version=%hhu", err->version);
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID))
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_VERSION,
|
||||
+ err->version, NULL);
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id));
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SOC_ID,
|
||||
+ err->soc_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "socket_id=%hhu", err->socket_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SOCKET_ID,
|
||||
+ err->socket_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "totem_id=%hhu", err->totem_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_TOTEM_ID,
|
||||
+ err->totem_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "nimbus_id=%hhu", err->nimbus_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_NIMBUS_ID,
|
||||
+ err->nimbus_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "subsystem_id=%hhu", err->subsystem_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SUB_SYSTEM_ID,
|
||||
+ err->subsystem_id, NULL);
|
||||
+ }
|
||||
|
||||
if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID))
|
||||
- decode_module(event, err->module_id);
|
||||
+ decode_module(ev_decoder, event, err->module_id);
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "submodule_id=%hhu", err->submodule_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_SUB_MODULE_ID,
|
||||
+ err->submodule_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "core_id=%hhu", err->core_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_CORE_ID,
|
||||
+ err->core_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) {
|
||||
HISI_SNPRINTF(event->error_msg, "port_id=%hhu", err->port_id);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_PORT_ID,
|
||||
+ err->port_id, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) {
|
||||
HISI_SNPRINTF(event->error_msg, "err_type=%hu", err->err_type);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_COMMON_FIELD_ERR_TYPE,
|
||||
+ err->err_type, NULL);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) {
|
||||
HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x",
|
||||
err->pcie_info.segment, err->pcie_info.bus,
|
||||
err->pcie_info.device, err->pcie_info.function);
|
||||
+ HISI_SNPRINTF(event->pcie_info, "%04x:%02x:%02x.%x",
|
||||
+ err->pcie_info.segment, err->pcie_info.bus,
|
||||
+ err->pcie_info.device, err->pcie_info.function);
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_PCIE_INFO,
|
||||
+ 0, event->pcie_info);
|
||||
+ }
|
||||
|
||||
- if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY))
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) {
|
||||
HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity));
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_ERR_SEVERITY,
|
||||
+ 0, err_severity(err->err_severity));
|
||||
+ }
|
||||
|
||||
HISI_SNPRINTF(event->error_msg, "]");
|
||||
}
|
||||
@@ -293,8 +377,6 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
- record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
- HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
|
||||
record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
|
||||
step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
From c46f65e1315aab8585e24d24223bd56c8931202a Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Mon, 31 Oct 2022 18:36:26 +0800
|
||||
Subject: [PATCH 4/4] rasdaemon: Add four modules supported by HiSilicon common
|
||||
section
|
||||
|
||||
Add four modules supported by HiSilicon common error section.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
---
|
||||
non-standard-hisilicon.c | 6 +++++-
|
||||
1 file changed, 5 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index 0ddb5ec..7296d28 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -214,7 +214,11 @@ static const char* module_name[] = {
|
||||
"Tsensor",
|
||||
"ROH",
|
||||
"BTC",
|
||||
- "HILINK"
|
||||
+ "HILINK",
|
||||
+ "STARS",
|
||||
+ "SDMA",
|
||||
+ "UC",
|
||||
+ "HBMC",
|
||||
};
|
||||
|
||||
static const char* get_soc_desc(uint8_t soc_id)
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,97 @@
|
||||
From 4f706ff3b1a04de3be506a309e153b99e04b3445 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Thu, 24 Feb 2022 18:02:14 +0000
|
||||
Subject: [PATCH 04/10] rasdaemon: ras-mc-ctl: Modify error statistics for
|
||||
HiSilicon KunPeng9xx common errors
|
||||
|
||||
Modify the error statistics for the HiSilicon KunPeng9xx platforms common errors
|
||||
to display the statistics and error info based on the module and the error severity.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 40 +++++++++++++++++++++++++++++-----------
|
||||
1 file changed, 29 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index b22dd60..08eb287 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1537,7 +1537,7 @@ sub vendor_errors_summary
|
||||
require DBI;
|
||||
my ($num_args, $platform_id);
|
||||
my ($query, $query_handle, $count, $out);
|
||||
- my ($module_id, $sub_module_id, $err_severity, $err_sev, $err_info);
|
||||
+ my ($module_id, $sub_module_id, $err_severity, $err_sev);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
@@ -1614,13 +1614,18 @@ sub vendor_errors_summary
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $query = "select err_info, count(*) from hisi_common_section";
|
||||
+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($err_info, $count));
|
||||
+ $query_handle->bind_columns(\($err_severity, $module_id, $count));
|
||||
$out = "";
|
||||
+ $err_sev = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "\terrors: $count\n";
|
||||
+ if ($err_severity ne $err_sev) {
|
||||
+ $out .= "$err_severity errors:\n";
|
||||
+ $err_sev = $err_severity;
|
||||
+ }
|
||||
+ $out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
|
||||
@@ -1638,8 +1643,8 @@ sub vendor_errors
|
||||
require DBI;
|
||||
my ($num_args, $platform_id);
|
||||
my ($query, $query_handle, $id, $timestamp, $out);
|
||||
- my ($version, $soc_id, $socket_id, $nimbus_id, $core_id, $port_id);
|
||||
- my ($module_id, $sub_module_id, $err_severity, $err_type, $err_info, $regs);
|
||||
+ my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
|
||||
+ my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
@@ -1727,15 +1732,28 @@ sub vendor_errors
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $query = "select id, timestamp, err_info, regs_dump from hisi_common_section order by id";
|
||||
+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $timestamp, $err_info, $regs));
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp ";
|
||||
- $out .= "Error Info:$err_info \n" if ($err_info);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs" if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
From f5c3c03039be28bb6b5bbe00e12e9586b19a1060 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 16:18:55 +0000
|
||||
Subject: [PATCH 05/10] rasdaemon: ras-mc-ctl: Reformat error info of the
|
||||
HiSilicon Kunpeng920
|
||||
|
||||
Reformat the code to display the error info of HiSilicon Kunpeng920.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 15 +++++++++------
|
||||
1 file changed, 9 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 08eb287..8755b6f 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1671,8 +1671,9 @@ sub vendor_errors
|
||||
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
$out .= "module_id=$module_id, " if ($module_id);
|
||||
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
@@ -1694,8 +1695,9 @@ sub vendor_errors
|
||||
$out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
$out .= "module_id=$module_id, " if ($module_id);
|
||||
$out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, \n" if ($err_severity);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
@@ -1719,8 +1721,9 @@ sub vendor_errors
|
||||
$out .= "core_id=$core_id, " if ($core_id);
|
||||
$out .= "port_id=$port_id, " if ($port_id);
|
||||
$out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "err_type=$err_type, \n" if ($err_type);
|
||||
- $out .= "Error Registers: $regs\n\n" if ($regs);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
--
|
||||
2.25.1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,37 @@
|
||||
From d595a9d61f9d8341a5e30d4d800e3237d6e0f390 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 17:01:35 +0000
|
||||
Subject: [PATCH 06/10] rasdaemon: ras-mc-ctl: Add printing usage if necessary
|
||||
parameters are not passed for the vendor-error options
|
||||
|
||||
Print the usage message if the necessary parameters are not passed
|
||||
for the vendor-errors options.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 8755b6f..959ea6b 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1544,6 +1544,7 @@ sub vendor_errors_summary
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
} else {
|
||||
+ usage(1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1651,6 +1652,7 @@ sub vendor_errors
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
} else {
|
||||
+ usage(1);
|
||||
return;
|
||||
}
|
||||
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -1,78 +0,0 @@
|
||||
From 57640072aead2e00037749d66f05fc26e3fe3071 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Tue, 25 May 2021 20:07:26 +0800
|
||||
Subject: [PATCH 2/2] add trace print of new information and add it to sqilte
|
||||
|
||||
Since we add new information of the event, we add trace print and store it to
|
||||
Sqlite.
|
||||
|
||||
Signed-off-by: Luo Shengwei <luoshengwei@huawei.com>
|
||||
---
|
||||
ras-arm-handler.c | 10 ++++++++++
|
||||
ras-record.c | 8 ++++++++
|
||||
2 files changed, 18 insertions(+)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 10d0099..23ad470 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -23,6 +23,13 @@
|
||||
#include "ras-cpu-isolation.h"
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+static void trace_print_hex(struct trace_seq *s, const uint8_t *buf, int buf_len)
|
||||
+{
|
||||
+ for (int i = 0; i < buf_len; ++i) {
|
||||
+ trace_seq_printf(s, "%2.2x", buf[i]);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static int is_core_failure(unsigned long value)
|
||||
{
|
||||
/*
|
||||
@@ -135,6 +142,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
case GHES_SEV_PANIC:
|
||||
ev.severity = "Fatal";
|
||||
}
|
||||
+ trace_seq_printf(s, "\n severity: %s", ev.severity);
|
||||
|
||||
if (val == GHES_SEV_CORRECTED || val == GHES_SEV_RECOVERABLE) {
|
||||
int len, nums;
|
||||
@@ -142,6 +150,8 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
if (!ev.error_info)
|
||||
return -1;
|
||||
ev.length = len;
|
||||
+ trace_seq_printf(s, "\n processor_err_info: ");
|
||||
+ trace_print_hex(s, ev.error_info, len);
|
||||
/* relate to enum error_type */
|
||||
nums = count_errors(event, ev.error_info, len);
|
||||
if (nums > 0) {
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 549c494..33d4741 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -210,6 +210,10 @@ static const struct db_fields arm_event_fields[] = {
|
||||
{ .name="mpidr", .type="INTEGER" },
|
||||
{ .name="running_state", .type="INTEGER" },
|
||||
{ .name="psci_state", .type="INTEGER" },
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ { .name="severity", .type="TEXT" },
|
||||
+ { .name="error_info", .type="BLOB" },
|
||||
+#endif
|
||||
};
|
||||
|
||||
static const struct db_table_descriptor arm_event_tab = {
|
||||
@@ -233,6 +237,10 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
|
||||
sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr);
|
||||
sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state);
|
||||
sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state);
|
||||
+#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ sqlite3_bind_text (priv->stmt_arm_record, 7, ev->severity, -1, NULL);
|
||||
+ sqlite3_bind_blob (priv->stmt_arm_record, 8, ev->error_info, ev->length, NULL);
|
||||
+#endif
|
||||
|
||||
rc = sqlite3_step(priv->stmt_arm_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
274
0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
Normal file
274
0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
Normal file
@ -0,0 +1,274 @@
|
||||
From 0643011831e5fb4e81edff16ad55f9a5196ec7a9 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Sat, 5 Mar 2022 18:19:38 +0000
|
||||
Subject: [PATCH 07/10] rasdaemon: ras-mc-ctl: Add support to display the
|
||||
HiSilicon vendor errors for a specified module
|
||||
|
||||
Add support to display the HiSilicon vendor errors for a specified module.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 145 +++++++++++++++++++++++++++------------------
|
||||
1 file changed, 87 insertions(+), 58 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 959ea6b..296eb87 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -96,8 +96,9 @@ Usage: $prog [OPTIONS...]
|
||||
--errors Shows the errors stored at the error database.
|
||||
--error-count Shows the corrected and uncorrected error counts using sysfs.
|
||||
--vendor-errors-summary <platform-id> Presents a summary of the vendor-specific logged errors.
|
||||
- --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
|
||||
- --vendor-platforms Shows the supported platforms with platform-ids for the vendor-specific errors.
|
||||
+ --vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
|
||||
+ --vendor-errors <platform-id> <module-name> Shows the vendor-specific errors for a specific module stored in the error database.
|
||||
+ --vendor-platforms List the supported platforms with platform-ids for the vendor-specific errors.
|
||||
--help This help message.
|
||||
EOF
|
||||
|
||||
@@ -1535,12 +1536,14 @@ use constant {
|
||||
sub vendor_errors_summary
|
||||
{
|
||||
require DBI;
|
||||
- my ($num_args, $platform_id);
|
||||
+ my ($num_args, $platform_id, $found_platform);
|
||||
my ($query, $query_handle, $count, $out);
|
||||
my ($module_id, $sub_module_id, $err_severity, $err_sev);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
+ $found_platform = 0;
|
||||
+
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
} else {
|
||||
@@ -1552,6 +1555,7 @@ sub vendor_errors_summary
|
||||
|
||||
# HiSilicon Kunpeng920 errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ $found_platform = 1;
|
||||
$query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1615,6 +1619,7 @@ sub vendor_errors_summary
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
+ $found_platform = 1;
|
||||
$query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1636,21 +1641,31 @@ sub vendor_errors_summary
|
||||
$query_handle->finish;
|
||||
}
|
||||
|
||||
+ if ($platform_id && !($found_platform)) {
|
||||
+ print "Platform ID $platform_id is not valid\n";
|
||||
+ }
|
||||
+
|
||||
undef($dbh);
|
||||
}
|
||||
|
||||
sub vendor_errors
|
||||
{
|
||||
require DBI;
|
||||
- my ($num_args, $platform_id);
|
||||
+ my ($num_args, $platform_id, $found_platform, $module, $found_module);
|
||||
my ($query, $query_handle, $id, $timestamp, $out);
|
||||
my ($version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $core_id, $port_id);
|
||||
my ($module_id, $sub_module_id, $err_severity, $err_type, $pcie_info, $regs);
|
||||
|
||||
$num_args = $#ARGV + 1;
|
||||
$platform_id = 0;
|
||||
+ $found_platform = 0;
|
||||
+ $module = 0;
|
||||
+ $found_module = 0;
|
||||
if ($num_args ne 0) {
|
||||
$platform_id = $ARGV[0];
|
||||
+ if ($num_args gt 1) {
|
||||
+ $module = $ARGV[1];
|
||||
+ }
|
||||
} else {
|
||||
usage(1);
|
||||
return;
|
||||
@@ -1660,27 +1675,29 @@ sub vendor_errors
|
||||
|
||||
# HiSilicon Kunpeng920 errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ $found_platform = 1;
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ $found_module = 1;
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1690,21 +1707,22 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ $found_module = 1;
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1714,51 +1732,56 @@ sub vendor_errors
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "core_id=$core_id, " if ($core_id);
|
||||
- $out .= "port_id=$port_id, " if ($port_id);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "err_type=$err_type, " if ($err_type);
|
||||
- $out .= "Error Registers: $regs " if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($sub_module_id && uc($module) eq uc($sub_module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "Error Registers: $regs " if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ $found_module = 1;
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
|
||||
# HiSilicon Kunpeng9xx common errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
+ $found_platform = 1;
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id. $timestamp Error Info: ";
|
||||
- $out .= "version=$version, ";
|
||||
- $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
- $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
- $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
- $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
- $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
- $out .= "module_id=$module_id, " if ($module_id);
|
||||
- $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
- $out .= "core_id=$core_id, " if ($core_id);
|
||||
- $out .= "port_id=$port_id, " if ($port_id);
|
||||
- $out .= "err_type=$err_type, " if ($err_type);
|
||||
- $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
- $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
- $out .= "Error Registers: $regs" if ($regs);
|
||||
- $out .= "\n\n";
|
||||
+ if ($module eq 0 || ($module_id && uc($module) eq uc($module_id))) {
|
||||
+ $out .= "$id. $timestamp Error Info: ";
|
||||
+ $out .= "version=$version, ";
|
||||
+ $out .= "soc_id=$soc_id, " if ($soc_id);
|
||||
+ $out .= "socket_id=$socket_id, " if ($socket_id);
|
||||
+ $out .= "totem_id=$totem_id, " if ($totem_id);
|
||||
+ $out .= "nimbus_id=$nimbus_id, " if ($nimbus_id);
|
||||
+ $out .= "sub_system_id=$sub_system_id, " if ($sub_system_id);
|
||||
+ $out .= "module_id=$module_id, " if ($module_id);
|
||||
+ $out .= "sub_module_id=$sub_module_id, " if ($sub_module_id);
|
||||
+ $out .= "core_id=$core_id, " if ($core_id);
|
||||
+ $out .= "port_id=$port_id, " if ($port_id);
|
||||
+ $out .= "err_type=$err_type, " if ($err_type);
|
||||
+ $out .= "pcie_info=$pcie_info, " if ($pcie_info);
|
||||
+ $out .= "err_severity=$err_severity, " if ($err_severity);
|
||||
+ $out .= "Error Registers: $regs" if ($regs);
|
||||
+ $out .= "\n\n";
|
||||
+ $found_module = 1;
|
||||
+ }
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
@@ -1768,6 +1791,12 @@ sub vendor_errors
|
||||
$query_handle->finish;
|
||||
}
|
||||
|
||||
+ if ($platform_id && !($found_platform)) {
|
||||
+ print "Platform ID $platform_id is not valid\n";
|
||||
+ } elsif ($module && !($found_module)) {
|
||||
+ print "No error record for the module $module\n";
|
||||
+ }
|
||||
+
|
||||
undef($dbh);
|
||||
}
|
||||
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
From 6b767a2fce615384f062ecb392cd332452bf4482 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Wed, 1 Sep 2021 21:00:16 +0800
|
||||
Subject: [PATCH] modify cpu parse for adapting to new bios version
|
||||
|
||||
---
|
||||
ras-cpu-isolation.c | 20 ++++++++++++++++++--
|
||||
1 file changed, 18 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index 6dcff70..b1643c4 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -25,6 +25,7 @@
|
||||
|
||||
static struct cpu_info *cpu_infos = NULL;
|
||||
static unsigned int ncores, cores_per_socket, cores_per_die;
|
||||
+static unsigned int cores_per_cluster = 4;
|
||||
static unsigned int sockets, dies = 1;
|
||||
static unsigned int enabled = 1;
|
||||
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
@@ -432,18 +433,33 @@ static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size
|
||||
|
||||
static unsigned get_cpu_index(int64_t mpidr)
|
||||
{
|
||||
- unsigned core_id, socket_id, die_id, cpu;
|
||||
+ unsigned core_id, cluster_id, socket_id, die_id, cpu;
|
||||
/*
|
||||
* Adapt to certain BIOS
|
||||
* In the MPIDR:
|
||||
* bit 8:15: core id
|
||||
+ * bit 16:18: cluster id
|
||||
* bit 19:20: die_id
|
||||
* bit 21:22: socket_id
|
||||
*/
|
||||
core_id = get_bit_value(mpidr, 8, 8);
|
||||
+ cluster_id = get_bit_value(mpidr, 16, 3);
|
||||
socket_id = get_bit_value(mpidr, 21, 2);
|
||||
die_id = get_bit_value(mpidr, 19, 2);
|
||||
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die;
|
||||
+
|
||||
+ /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
|
||||
+ * it means TotemB. When cores per die equal to cores per socket, it means
|
||||
+ * that there is only one die in the socket, in case that the only die is
|
||||
+ * TotemB in CPU 1620s, we set die id to 0 directly.
|
||||
+ */
|
||||
+ if (cores_per_die == cores_per_socket) {
|
||||
+ die_id = 0;
|
||||
+ }
|
||||
+ else {
|
||||
+ die_id = (die_id == 1 ? 0:1);
|
||||
+ }
|
||||
+ cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
|
||||
+ cluster_id * cores_per_cluster;
|
||||
|
||||
return cpu;
|
||||
}
|
||||
--
|
||||
2.27.0
|
||||
|
||||
150
0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
Normal file
150
0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
Normal file
@ -0,0 +1,150 @@
|
||||
From 2f23b5dc6e5831c8ef2e179bb936e13502f75041 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Mon, 7 Mar 2022 12:38:45 +0000
|
||||
Subject: [PATCH 08/10] rasdaemon: ras-mc-ctl: Relocate reading and display
|
||||
Kunpeng920 errors to under Kunpeng9xx
|
||||
|
||||
Relocate the reading and display of Kunpeng920 errors under Kunpeng9xx.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 40 ++++++++++------------------------------
|
||||
1 file changed, 10 insertions(+), 30 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 296eb87..75981a0 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1529,7 +1529,6 @@ sub errors
|
||||
|
||||
# Definitions of the vendor platform IDs.
|
||||
use constant {
|
||||
- HISILICON_KUNPENG_920 => "Kunpeng920",
|
||||
HISILICON_KUNPENG_9XX => "Kunpeng9xx",
|
||||
};
|
||||
|
||||
@@ -1553,8 +1552,8 @@ sub vendor_errors_summary
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng920 errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ # HiSilicon Kunpeng9xx errors
|
||||
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$found_platform = 1;
|
||||
$query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
@@ -1570,9 +1569,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type1 error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type1 errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1590,9 +1587,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type2 error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 OEM type2 errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1610,16 +1605,10 @@ sub vendor_errors_summary
|
||||
$out .= "\t$sub_module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 PCIe controller error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng920 PCIe controller errors.\n\n";
|
||||
+ print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
- }
|
||||
|
||||
- # HiSilicon Kunpeng9xx common errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $found_platform = 1;
|
||||
$query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1635,8 +1624,6 @@ sub vendor_errors_summary
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng9xx common errors.\n\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1673,8 +1660,8 @@ sub vendor_errors
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng920 errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_920) {
|
||||
+ # HiSilicon Kunpeng9xx errors
|
||||
+ if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$found_platform = 1;
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
@@ -1697,7 +1684,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type1 error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1722,7 +1709,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 OEM type2 error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1749,14 +1736,10 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng920 PCIe controller error events:\n$out\n";
|
||||
+ print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
- }
|
||||
|
||||
- # HiSilicon Kunpeng9xx common errors
|
||||
- if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
- $found_platform = 1;
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
@@ -1785,8 +1768,6 @@ sub vendor_errors
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
- } else {
|
||||
- print "No HiSilicon Kunpeng9xx common errors.\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1803,7 +1784,6 @@ sub vendor_errors
|
||||
sub vendor_platforms
|
||||
{
|
||||
print "\nSupported platforms for the vendor-specific errors:\n";
|
||||
- print "\tHiSilicon Kunpeng920, platform-id=\"", HISILICON_KUNPENG_920, "\"\n";
|
||||
print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
|
||||
print "\n";
|
||||
}
|
||||
--
|
||||
2.25.1
|
||||
|
||||
127
0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch
Normal file
127
0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch
Normal file
@ -0,0 +1,127 @@
|
||||
From df6011fed2bb45989f9e5c2ea30b33937b08d06c Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Thu, 28 Apr 2022 18:58:43 +0100
|
||||
Subject: [PATCH 09/10] rasdaemon: ras-mc-ctl: Updated HiSilicon platform name
|
||||
|
||||
Update the HiSilicon platform name to KunPeng9xx.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 24 ++++++++++++------------
|
||||
1 file changed, 12 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 75981a0..1cc19b3 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1529,7 +1529,7 @@ sub errors
|
||||
|
||||
# Definitions of the vendor platform IDs.
|
||||
use constant {
|
||||
- HISILICON_KUNPENG_9XX => "Kunpeng9xx",
|
||||
+ HISILICON_KUNPENG_9XX => "KunPeng9xx",
|
||||
};
|
||||
|
||||
sub vendor_errors_summary
|
||||
@@ -1552,7 +1552,7 @@ sub vendor_errors_summary
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng9xx errors
|
||||
+ # HiSilicon KunPeng9xx errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$found_platform = 1;
|
||||
$query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
|
||||
@@ -1569,7 +1569,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx OEM type1 error events summary:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx OEM type1 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1587,7 +1587,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx OEM type2 error events summary:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx OEM type2 error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1605,7 +1605,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$sub_module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx PCIe controller error events summary:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx PCIe controller error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1623,7 +1623,7 @@ sub vendor_errors_summary
|
||||
$out .= "\t$module_id: $count\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx common error events summary:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx common error events summary:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1660,7 +1660,7 @@ sub vendor_errors
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- # HiSilicon Kunpeng9xx errors
|
||||
+ # HiSilicon KunPeng9xx errors
|
||||
if ($platform_id eq HISILICON_KUNPENG_9XX) {
|
||||
$found_platform = 1;
|
||||
$query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
|
||||
@@ -1684,7 +1684,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx OEM type1 error events:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx OEM type1 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1709,7 +1709,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx OEM type2 error events:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx OEM type2 error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1736,7 +1736,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx PCIe controller error events:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx PCIe controller error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -1767,7 +1767,7 @@ sub vendor_errors
|
||||
}
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "HiSilicon Kunpeng9xx common error events:\n$out\n";
|
||||
+ print "HiSilicon KunPeng9xx common error events:\n$out\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
}
|
||||
@@ -1784,7 +1784,7 @@ sub vendor_errors
|
||||
sub vendor_platforms
|
||||
{
|
||||
print "\nSupported platforms for the vendor-specific errors:\n";
|
||||
- print "\tHiSilicon Kunpeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
|
||||
+ print "\tHiSilicon KunPeng9xx, platform-id=\"", HISILICON_KUNPENG_9XX, "\"\n";
|
||||
print "\n";
|
||||
}
|
||||
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -0,0 +1,90 @@
|
||||
From c019f2f82b7f224e95968037f2afc16f63cc1d1d Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Thu, 28 Apr 2022 22:59:04 +0100
|
||||
Subject: [PATCH 10/10] rasdaemon: Fix for a memory out-of-bounds issue and
|
||||
optimized code to remove duplicate function.
|
||||
|
||||
Fixed a memory out-of-bounds issue with string pointers and
|
||||
optimized the code structure to remove a duplicate function.
|
||||
|
||||
Signed-off-by: Lei Feng <fenglei47@h-partners.com>
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
non-standard-hisi_hip08.c | 6 +++---
|
||||
non-standard-hisilicon.c | 2 +-
|
||||
ras-non-standard-handler.c | 16 +---------------
|
||||
3 files changed, 5 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 9092183..4ef47ea 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -1014,15 +1014,15 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
|
||||
static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = {
|
||||
{
|
||||
- .sec_type = "1f8161e155d641e6bd107afd1dc5f7c5",
|
||||
+ .sec_type = "1f8161e1-55d6-41e6-bd10-7afd1dc5f7c5",
|
||||
.decode = decode_hip08_oem_type1_error,
|
||||
},
|
||||
{
|
||||
- .sec_type = "45534ea6ce2341158535e07ab3aef91d",
|
||||
+ .sec_type = "45534ea6-ce23-4115-8535-e07ab3aef91d",
|
||||
.decode = decode_hip08_oem_type2_error,
|
||||
},
|
||||
{
|
||||
- .sec_type = "b2889fc9e7d74f9da867af42e98be772",
|
||||
+ .sec_type = "b2889fc9-e7d7-4f9d-a867-af42e98be772",
|
||||
.decode = decode_hip08_pcie_local_error,
|
||||
},
|
||||
};
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index d1e1774..6ee9271 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -387,7 +387,7 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
|
||||
static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
|
||||
{
|
||||
- .sec_type = "c8b328a899174af69a132e08ab2e7586",
|
||||
+ .sec_type = "c8b328a8-9917-4af6-9a13-2e08ab2e7586",
|
||||
.decode = decode_hisi_common_section,
|
||||
},
|
||||
};
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index 6d5a6f8..6932e58 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -52,20 +52,6 @@ static char *uuid_le(const char *uu)
|
||||
return uuid;
|
||||
}
|
||||
|
||||
-static int uuid_le_cmp(const char *sec_type, const char *uuid2)
|
||||
-{
|
||||
- static char uuid1[32];
|
||||
- char *p = uuid1;
|
||||
- int i;
|
||||
- static const unsigned char le[16] = {
|
||||
- 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
-
|
||||
- for (i = 0; i < 16; i++)
|
||||
- p += sprintf(p, "%.2x", (unsigned char) sec_type[le[i]]);
|
||||
- *p = 0;
|
||||
- return strncmp(uuid1, uuid2, 32);
|
||||
-}
|
||||
-
|
||||
int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder)
|
||||
{
|
||||
struct ras_ns_ev_decoder *list;
|
||||
@@ -96,7 +82,7 @@ static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p
|
||||
|
||||
ns_ev_decoder = ras_ns_ev_dec_list;
|
||||
while (ns_ev_decoder) {
|
||||
- if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) {
|
||||
+ if (strcmp(uuid_le(sec_type), ns_ev_decoder->sec_type) == 0) {
|
||||
*p_ns_ev_dec = ns_ev_decoder;
|
||||
match = 1;
|
||||
break;
|
||||
--
|
||||
2.25.1
|
||||
|
||||
@ -1,785 +0,0 @@
|
||||
From 1c085f983f01ec09e5b0dd67dbb8b4afa89e7300 Mon Sep 17 00:00:00 2001
|
||||
From: Shiju Jose <shiju.jose@huawei.com>
|
||||
Date: Mon, 10 Aug 2020 15:42:56 +0100
|
||||
Subject: [PATCH] rasdaemon: Modify non-standard error decoding interface using
|
||||
linked list
|
||||
|
||||
Replace the current non-standard error decoding interface with the
|
||||
interface based on the linked list to avoid using realloc and
|
||||
to improve the interface.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
non-standard-hisi_hip08.c | 114 +++++++++++++++++-----------------
|
||||
non-standard-hisilicon.c | 46 +++++++-------
|
||||
non-standard-hisilicon.h | 4 +-
|
||||
ras-non-standard-handler.c | 122 ++++++++++++++++++++-----------------
|
||||
ras-non-standard-handler.h | 13 ++--
|
||||
5 files changed, 155 insertions(+), 144 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 2197f81..ebf03e1 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -528,7 +528,7 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = {
|
||||
#endif
|
||||
|
||||
#define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end))
|
||||
-static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_oem_type1_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_oem_type1_err_sec *err)
|
||||
{
|
||||
@@ -537,26 +537,26 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
char *end = buf + HISI_BUF_LEN;
|
||||
|
||||
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE1_FIELD_VERSION, err->version, NULL);
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE1_FIELD_SOC_ID,
|
||||
err->soc_id, NULL);
|
||||
}
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE1_FIELD_SOCKET_ID,
|
||||
err->socket_id, NULL);
|
||||
}
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE1_FIELD_NIMBUS_ID,
|
||||
err->nimbus_id, NULL);
|
||||
}
|
||||
@@ -566,7 +566,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
err->module_id);
|
||||
|
||||
p += snprintf(p, end - p, "module=%s ", str);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_MODULE_ID,
|
||||
0, str);
|
||||
}
|
||||
@@ -578,7 +578,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
err->sub_module_id);
|
||||
|
||||
p += snprintf(p, end - p, "submodule=%s ", str);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_SUB_MODULE_ID,
|
||||
0, str);
|
||||
}
|
||||
@@ -587,7 +587,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "error_severity=%s ",
|
||||
err_severity(err->err_severity));
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_ERR_SEV,
|
||||
0, err_severity(err->err_severity));
|
||||
}
|
||||
@@ -598,7 +598,7 @@ static void decode_oem_type1_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
trace_seq_printf(s, "%s\n", buf);
|
||||
}
|
||||
|
||||
-static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_oem_type1_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_oem_type1_err_sec *err)
|
||||
{
|
||||
@@ -649,14 +649,14 @@ static void decode_oem_type1_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
*p = '\0';
|
||||
}
|
||||
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_REGS_DUMP, 0, buf);
|
||||
- step_vendor_data_tab(dec_tab, "hip08_oem_type1_event_tab");
|
||||
+ step_vendor_data_tab(ev_decoder, "hip08_oem_type1_event_tab");
|
||||
}
|
||||
|
||||
/* error data decoding functions */
|
||||
static int decode_hip08_oem_type1_error(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
+ struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
struct ras_non_standard_event *event)
|
||||
{
|
||||
@@ -670,8 +670,8 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras,
|
||||
}
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
- if (!dec_tab->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
|
||||
+ if (!ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
&hip08_oem_type1_event_tab)
|
||||
!= SQLITE_OK) {
|
||||
trace_seq_printf(s,
|
||||
@@ -680,18 +680,18 @@ static int decode_hip08_oem_type1_error(struct ras_events *ras,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE1_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
|
||||
trace_seq_printf(s, "\nHISI HIP08: OEM Type-1 Error\n");
|
||||
- decode_oem_type1_err_hdr(dec_tab, s, err);
|
||||
- decode_oem_type1_err_regs(dec_tab, s, err);
|
||||
+ decode_oem_type1_err_hdr(ev_decoder, s, err);
|
||||
+ decode_oem_type1_err_regs(ev_decoder, s, err);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_oem_type2_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_oem_type2_err_sec *err)
|
||||
{
|
||||
@@ -700,26 +700,26 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
char *end = buf + HISI_BUF_LEN;
|
||||
|
||||
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE2_FIELD_VERSION, err->version, NULL);
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_SOC_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE2_FIELD_SOC_ID,
|
||||
err->soc_id, NULL);
|
||||
}
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_SOCKET_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE2_FIELD_SOCKET_ID,
|
||||
err->socket_id, NULL);
|
||||
}
|
||||
|
||||
if (err->val_bits & HISI_OEM_VALID_NIMBUS_ID && IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_OEM_TYPE2_FIELD_NIMBUS_ID,
|
||||
err->nimbus_id, NULL);
|
||||
}
|
||||
@@ -729,7 +729,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
err->module_id);
|
||||
|
||||
p += snprintf(p, end - p, "module=%s ", str);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE2_FIELD_MODULE_ID,
|
||||
0, str);
|
||||
}
|
||||
@@ -741,7 +741,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
err->sub_module_id);
|
||||
|
||||
p += snprintf(p, end - p, "submodule=%s ", str);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE2_FIELD_SUB_MODULE_ID,
|
||||
0, str);
|
||||
}
|
||||
@@ -750,7 +750,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "error_severity=%s ",
|
||||
err_severity(err->err_severity));
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE2_FIELD_ERR_SEV,
|
||||
0, err_severity(err->err_severity));
|
||||
}
|
||||
@@ -761,7 +761,7 @@ static void decode_oem_type2_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
trace_seq_printf(s, "%s\n", buf);
|
||||
}
|
||||
|
||||
-static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_oem_type2_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_oem_type2_err_sec *err)
|
||||
{
|
||||
@@ -822,13 +822,13 @@ static void decode_oem_type2_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
*p = '\0';
|
||||
}
|
||||
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE2_FIELD_REGS_DUMP, 0, buf);
|
||||
- step_vendor_data_tab(dec_tab, "hip08_oem_type2_event_tab");
|
||||
+ step_vendor_data_tab(ev_decoder, "hip08_oem_type2_event_tab");
|
||||
}
|
||||
|
||||
static int decode_hip08_oem_type2_error(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
+ struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
struct ras_non_standard_event *event)
|
||||
{
|
||||
@@ -842,8 +842,8 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras,
|
||||
}
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
- if (!dec_tab->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
|
||||
+ if (!ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
&hip08_oem_type2_event_tab) != SQLITE_OK) {
|
||||
trace_seq_printf(s,
|
||||
"create sql hip08_oem_type2_event_tab fail\n");
|
||||
@@ -851,18 +851,18 @@ static int decode_hip08_oem_type2_error(struct ras_events *ras,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_OEM_TYPE2_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
|
||||
trace_seq_printf(s, "\nHISI HIP08: OEM Type-2 Error\n");
|
||||
- decode_oem_type2_err_hdr(dec_tab, s, err);
|
||||
- decode_oem_type2_err_regs(dec_tab, s, err);
|
||||
+ decode_oem_type2_err_hdr(ev_decoder, s, err);
|
||||
+ decode_oem_type2_err_regs(ev_decoder, s, err);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_pcie_local_err_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_pcie_local_err_sec *err)
|
||||
{
|
||||
@@ -871,14 +871,14 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
char *end = buf + HISI_BUF_LEN;
|
||||
|
||||
p += snprintf(p, end - p, "[ table_version=%d ", err->version);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_VERSION,
|
||||
err->version, NULL);
|
||||
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "SOC_ID=%d ", err->soc_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_SOC_ID,
|
||||
err->soc_id, NULL);
|
||||
}
|
||||
@@ -886,7 +886,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "socket_ID=%d ", err->socket_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_SOCKET_ID,
|
||||
err->socket_id, NULL);
|
||||
}
|
||||
@@ -894,7 +894,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "nimbus_ID=%d ", err->nimbus_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_NIMBUS_ID,
|
||||
err->nimbus_id, NULL);
|
||||
}
|
||||
@@ -903,7 +903,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "submodule=%s ",
|
||||
pcie_local_sub_module_name(err->sub_module_id));
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_PCIE_LOCAL_FIELD_SUB_MODULE_ID,
|
||||
0, pcie_local_sub_module_name(err->sub_module_id));
|
||||
}
|
||||
@@ -911,7 +911,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "core_ID=core%d ", err->core_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_CORE_ID,
|
||||
err->core_id, NULL);
|
||||
}
|
||||
@@ -919,7 +919,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "port_ID=port%d ", err->port_id);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_PORT_ID,
|
||||
err->port_id, NULL);
|
||||
}
|
||||
@@ -928,7 +928,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "error_severity=%s ",
|
||||
err_severity(err->err_severity));
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_PCIE_LOCAL_FIELD_ERR_SEV,
|
||||
0, err_severity(err->err_severity));
|
||||
}
|
||||
@@ -936,7 +936,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
if (err->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE &&
|
||||
IN_RANGE(p, buf, end)) {
|
||||
p += snprintf(p, end - p, "error_type=0x%x ", err->err_type);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_INT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_INT,
|
||||
HIP08_PCIE_LOCAL_FIELD_ERR_TYPE,
|
||||
err->err_type, NULL);
|
||||
}
|
||||
@@ -947,7 +947,7 @@ static void decode_pcie_local_err_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
trace_seq_printf(s, "%s\n", buf);
|
||||
}
|
||||
|
||||
-static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_pcie_local_err_regs(struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
const struct hisi_pcie_local_err_sec *err)
|
||||
{
|
||||
@@ -972,13 +972,13 @@ static void decode_pcie_local_err_regs(struct ras_ns_dec_tab *dec_tab,
|
||||
*p = '\0';
|
||||
}
|
||||
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_PCIE_LOCAL_FIELD_REGS_DUMP, 0, buf);
|
||||
- step_vendor_data_tab(dec_tab, "hip08_pcie_local_event_tab");
|
||||
+ step_vendor_data_tab(ev_decoder, "hip08_pcie_local_event_tab");
|
||||
}
|
||||
|
||||
static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
+ struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
struct ras_non_standard_event *event)
|
||||
{
|
||||
@@ -992,8 +992,8 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
}
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
- if (!dec_tab->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
|
||||
+ if (!ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
&hip08_pcie_local_event_tab) != SQLITE_OK) {
|
||||
trace_seq_printf(s,
|
||||
"create sql hip08_pcie_local_event_tab fail\n");
|
||||
@@ -1001,18 +1001,18 @@ static int decode_hip08_pcie_local_error(struct ras_events *ras,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HIP08_PCIE_LOCAL_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
|
||||
trace_seq_printf(s, "\nHISI HIP08: PCIe local error\n");
|
||||
- decode_pcie_local_err_hdr(dec_tab, s, err);
|
||||
- decode_pcie_local_err_regs(dec_tab, s, err);
|
||||
+ decode_pcie_local_err_hdr(ev_decoder, s, err);
|
||||
+ decode_pcie_local_err_regs(ev_decoder, s, err);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
-struct ras_ns_dec_tab hip08_ns_oem_tab[] = {
|
||||
+static struct ras_ns_ev_decoder hip08_ns_ev_decoder[] = {
|
||||
{
|
||||
.sec_type = "1f8161e155d641e6bd107afd1dc5f7c5",
|
||||
.decode = decode_hip08_oem_type1_error,
|
||||
@@ -1025,10 +1025,12 @@ struct ras_ns_dec_tab hip08_ns_oem_tab[] = {
|
||||
.sec_type = "b2889fc9e7d74f9da867af42e98be772",
|
||||
.decode = decode_hip08_pcie_local_error,
|
||||
},
|
||||
- { /* sentinel */ }
|
||||
};
|
||||
|
||||
static void __attribute__((constructor)) hip08_init(void)
|
||||
{
|
||||
- register_ns_dec_tab(hip08_ns_oem_tab);
|
||||
+ int i;
|
||||
+
|
||||
+ for (i = 0; i < ARRAY_SIZE(hip08_ns_ev_decoder); i++)
|
||||
+ register_ns_ev_decoder(&hip08_ns_ev_decoder[i]);
|
||||
}
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
index c9e1fa9..a6f5e78 100644
|
||||
--- a/non-standard-hisilicon.c
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -73,38 +73,38 @@ struct hisi_event {
|
||||
};
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
|
||||
enum hisi_oem_data_type data_type,
|
||||
int id, int64_t data, const char *text)
|
||||
{
|
||||
switch (data_type) {
|
||||
case HISI_OEM_DATA_TYPE_INT:
|
||||
- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
|
||||
+ sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data);
|
||||
break;
|
||||
case HISI_OEM_DATA_TYPE_INT64:
|
||||
- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
|
||||
+ sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data);
|
||||
break;
|
||||
case HISI_OEM_DATA_TYPE_TEXT:
|
||||
- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
|
||||
+ sqlite3_bind_text(ev_decoder->stmt_dec_record, id, text, -1, NULL);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
|
||||
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
|
||||
{
|
||||
int rc;
|
||||
|
||||
- rc = sqlite3_step(dec_tab->stmt_dec_record);
|
||||
+ rc = sqlite3_step(ev_decoder->stmt_dec_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
log(TERM, LOG_ERR,
|
||||
"Failed to do %s step on sqlite: error = %d\n", name, rc);
|
||||
|
||||
- rc = sqlite3_reset(dec_tab->stmt_dec_record);
|
||||
+ rc = sqlite3_reset(ev_decoder->stmt_dec_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
log(TERM, LOG_ERR,
|
||||
"Failed to reset %s on sqlite: error = %d\n", name, rc);
|
||||
|
||||
- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
|
||||
+ rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
log(TERM, LOG_ERR,
|
||||
"Failed to clear bindings %s on sqlite: error = %d\n",
|
||||
@@ -113,12 +113,12 @@ int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
|
||||
enum hisi_oem_data_type data_type,
|
||||
int id, int64_t data, const char *text)
|
||||
{ }
|
||||
|
||||
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
|
||||
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -197,7 +197,7 @@ static void decode_module(struct hisi_event *event, uint8_t module_id)
|
||||
HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
|
||||
}
|
||||
|
||||
-static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
+static void decode_hisi_common_section_hdr(struct ras_ns_ev_decoder *ev_decoder,
|
||||
const struct hisi_common_error_section *err,
|
||||
struct hisi_event *event)
|
||||
{
|
||||
@@ -244,7 +244,7 @@ static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
}
|
||||
|
||||
static int decode_hisi_common_section(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
+ struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s,
|
||||
struct ras_non_standard_event *event)
|
||||
{
|
||||
@@ -253,8 +253,8 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
struct hisi_event hevent;
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
- if (ras->record_events && !dec_tab->stmt_dec_record) {
|
||||
- if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
|
||||
+ if (ras->record_events && !ev_decoder->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
|
||||
&hisi_common_section_tab) != SQLITE_OK) {
|
||||
trace_seq_printf(s, "create sql hisi_common_section_tab fail\n");
|
||||
return -1;
|
||||
@@ -264,7 +264,7 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
|
||||
memset(&hevent, 0, sizeof(struct hisi_event));
|
||||
trace_seq_printf(s, "\nHisilicon Common Error Section:\n");
|
||||
- decode_hisi_common_section_hdr(dec_tab, err, &hevent);
|
||||
+ decode_hisi_common_section_hdr(ev_decoder, err, &hevent);
|
||||
trace_seq_printf(s, "%s\n", hevent.error_msg);
|
||||
|
||||
if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) {
|
||||
@@ -280,28 +280,30 @@ static int decode_hisi_common_section(struct ras_events *ras,
|
||||
}
|
||||
|
||||
if (ras->record_events) {
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_TIMESTAMP,
|
||||
0, event->timestamp);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
|
||||
- record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ record_vendor_data(ev_decoder, HISI_OEM_DATA_TYPE_TEXT,
|
||||
HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
|
||||
- step_vendor_data_tab(dec_tab, "hisi_common_section_tab");
|
||||
+ step_vendor_data_tab(ev_decoder, "hisi_common_section_tab");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
-struct ras_ns_dec_tab hisi_section_ns_tab[] = {
|
||||
+static struct ras_ns_ev_decoder hisi_section_ns_ev_decoder[] = {
|
||||
{
|
||||
.sec_type = "c8b328a899174af69a132e08ab2e7586",
|
||||
.decode = decode_hisi_common_section,
|
||||
},
|
||||
- { /* sentinel */ }
|
||||
};
|
||||
|
||||
static void __attribute__((constructor)) hisi_ns_init(void)
|
||||
{
|
||||
- register_ns_dec_tab(hisi_section_ns_tab);
|
||||
+ int i;
|
||||
+
|
||||
+ for (i = 0; i < ARRAY_SIZE(hisi_section_ns_ev_decoder); i++)
|
||||
+ register_ns_ev_decoder(&hisi_section_ns_ev_decoder[i]);
|
||||
}
|
||||
diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h
|
||||
index 1ce210a..75b911e 100644
|
||||
--- a/non-standard-hisilicon.h
|
||||
+++ b/non-standard-hisilicon.h
|
||||
@@ -41,9 +41,9 @@ static inline char *err_severity(uint8_t err_sev)
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
-void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+void record_vendor_data(struct ras_ns_ev_decoder *ev_decoder,
|
||||
enum hisi_oem_data_type data_type,
|
||||
int id, int64_t data, const char *text);
|
||||
-int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name);
|
||||
+int step_vendor_data_tab(struct ras_ns_ev_decoder *ev_decoder, const char *name);
|
||||
|
||||
#endif
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index d92fd42..1862335 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -22,46 +22,7 @@
|
||||
#include "ras-logger.h"
|
||||
#include "ras-report.h"
|
||||
|
||||
-static p_ns_dec_tab * ns_dec_tab;
|
||||
-static size_t dec_tab_count;
|
||||
-
|
||||
-int register_ns_dec_tab(const p_ns_dec_tab tab)
|
||||
-{
|
||||
- ns_dec_tab = (p_ns_dec_tab *)realloc(ns_dec_tab,
|
||||
- (dec_tab_count + 1) * sizeof(tab));
|
||||
- if (ns_dec_tab == NULL) {
|
||||
- printf("%s p_ns_dec_tab malloc failed", __func__);
|
||||
- return -1;
|
||||
- }
|
||||
- ns_dec_tab[dec_tab_count] = tab;
|
||||
- dec_tab_count++;
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-void unregister_ns_dec_tab(void)
|
||||
-{
|
||||
- if (ns_dec_tab) {
|
||||
-#ifdef HAVE_SQLITE3
|
||||
- p_ns_dec_tab dec_tab;
|
||||
- int i, count;
|
||||
-
|
||||
- for (count = 0; count < dec_tab_count; count++) {
|
||||
- dec_tab = ns_dec_tab[count];
|
||||
- for (i = 0; dec_tab[i].decode; i++) {
|
||||
- if (dec_tab[i].stmt_dec_record) {
|
||||
- ras_mc_finalize_vendor_table(
|
||||
- dec_tab[i].stmt_dec_record);
|
||||
- dec_tab[i].stmt_dec_record = NULL;
|
||||
- }
|
||||
- }
|
||||
- }
|
||||
-#endif
|
||||
-
|
||||
- free(ns_dec_tab);
|
||||
- ns_dec_tab = NULL;
|
||||
- dec_tab_count = 0;
|
||||
- }
|
||||
-}
|
||||
+static struct ras_ns_ev_decoder *ras_ns_ev_dec_list;
|
||||
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
|
||||
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
|
||||
@@ -105,18 +66,75 @@ static int uuid_le_cmp(const char *sec_type, const char *uuid2)
|
||||
return strncmp(uuid1, uuid2, 32);
|
||||
}
|
||||
|
||||
+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder)
|
||||
+{
|
||||
+ struct ras_ns_ev_decoder *list;
|
||||
+
|
||||
+ if (!ns_ev_decoder)
|
||||
+ return -1;
|
||||
+
|
||||
+ ns_ev_decoder->next = NULL;
|
||||
+ ns_ev_decoder->stmt_dec_record = NULL;
|
||||
+ if (!ras_ns_ev_dec_list) {
|
||||
+ ras_ns_ev_dec_list = ns_ev_decoder;
|
||||
+ } else {
|
||||
+ list = ras_ns_ev_dec_list;
|
||||
+ while (list->next)
|
||||
+ list = list->next;
|
||||
+ list->next = ns_ev_decoder;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int find_ns_ev_decoder(const char *sec_type, struct ras_ns_ev_decoder **p_ns_ev_dec)
|
||||
+{
|
||||
+ struct ras_ns_ev_decoder *ns_ev_decoder;
|
||||
+ int match = 0;
|
||||
+
|
||||
+ ns_ev_decoder = ras_ns_ev_dec_list;
|
||||
+ while (ns_ev_decoder) {
|
||||
+ if (uuid_le_cmp(sec_type, ns_ev_decoder->sec_type) == 0) {
|
||||
+ *p_ns_ev_dec = ns_ev_decoder;
|
||||
+ match = 1;
|
||||
+ break;
|
||||
+ }
|
||||
+ ns_ev_decoder = ns_ev_decoder->next;
|
||||
+ }
|
||||
+
|
||||
+ if (!match)
|
||||
+ return -1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void unregister_ns_ev_decoder(void)
|
||||
+{
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ struct ras_ns_ev_decoder *ns_ev_decoder = ras_ns_ev_dec_list;
|
||||
+
|
||||
+ while (ns_ev_decoder) {
|
||||
+ if (ns_ev_decoder->stmt_dec_record) {
|
||||
+ ras_mc_finalize_vendor_table(ns_ev_decoder->stmt_dec_record);
|
||||
+ ns_ev_decoder->stmt_dec_record = NULL;
|
||||
+ }
|
||||
+ ns_ev_decoder = ns_ev_decoder->next;
|
||||
+ }
|
||||
+#endif
|
||||
+ ras_ns_ev_dec_list = NULL;
|
||||
+}
|
||||
+
|
||||
int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context)
|
||||
{
|
||||
- int len, i, line_count, count;
|
||||
+ int len, i, line_count;
|
||||
unsigned long long val;
|
||||
struct ras_events *ras = context;
|
||||
time_t now;
|
||||
struct tm *tm;
|
||||
struct ras_non_standard_event ev;
|
||||
- p_ns_dec_tab dec_tab;
|
||||
- bool dec_done = false;
|
||||
+ struct ras_ns_ev_decoder *ns_ev_decoder;
|
||||
|
||||
/*
|
||||
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
@@ -177,19 +195,9 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
if(!ev.error)
|
||||
return -1;
|
||||
|
||||
- for (count = 0; count < dec_tab_count && !dec_done; count++) {
|
||||
- dec_tab = ns_dec_tab[count];
|
||||
- for (i = 0; dec_tab[i].decode; i++) {
|
||||
- if (uuid_le_cmp(ev.sec_type,
|
||||
- dec_tab[i].sec_type) == 0) {
|
||||
- dec_tab[i].decode(ras, &dec_tab[i], s, &ev);
|
||||
- dec_done = true;
|
||||
- break;
|
||||
- }
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- if (!dec_done) {
|
||||
+ if (!find_ns_ev_decoder(ev.sec_type, &ns_ev_decoder)) {
|
||||
+ ns_ev_decoder->decode(ras, ns_ev_decoder, s, &ev);
|
||||
+ } else {
|
||||
len = ev.length;
|
||||
i = 0;
|
||||
line_count = 0;
|
||||
@@ -222,5 +230,5 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
__attribute__((destructor))
|
||||
static void ns_exit(void)
|
||||
{
|
||||
- unregister_ns_dec_tab();
|
||||
+ unregister_ns_ev_decoder();
|
||||
}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
index 2b9bf40..57d4cb5 100644
|
||||
--- a/ras-non-standard-handler.h
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -20,15 +20,16 @@
|
||||
#define BIT(nr) (1UL << (nr))
|
||||
#define BIT_ULL(nr) (1ULL << (nr))
|
||||
|
||||
-typedef struct ras_ns_dec_tab {
|
||||
+struct ras_ns_ev_decoder {
|
||||
+ struct ras_ns_ev_decoder *next;
|
||||
const char *sec_type;
|
||||
- int (*decode)(struct ras_events *ras, struct ras_ns_dec_tab *dec_tab,
|
||||
+ int (*decode)(struct ras_events *ras, struct ras_ns_ev_decoder *ev_decoder,
|
||||
struct trace_seq *s, struct ras_non_standard_event *event);
|
||||
#ifdef HAVE_SQLITE3
|
||||
#include <sqlite3.h>
|
||||
sqlite3_stmt *stmt_dec_record;
|
||||
#endif
|
||||
-} *p_ns_dec_tab;
|
||||
+};
|
||||
|
||||
int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
@@ -37,11 +38,9 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index);
|
||||
|
||||
#ifdef HAVE_NON_STANDARD
|
||||
-int register_ns_dec_tab(const p_ns_dec_tab tab);
|
||||
-void unregister_ns_dec_tab(void);
|
||||
+int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder);
|
||||
#else
|
||||
-static inline int register_ns_dec_tab(const p_ns_dec_tab tab) { return 0; };
|
||||
-static inline void unregister_ns_dec_tab(void) { return; };
|
||||
+static inline int register_ns_ev_decoder(struct ras_ns_ev_decoder *ns_ev_decoder) { return 0; };
|
||||
#endif
|
||||
|
||||
#endif
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
From b98880e2cf5fd15e4261676760b719963b956a0e Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Mon, 27 Jul 2020 15:38:37 +0800
|
||||
Subject: [PATCH 1/3] rasdaemon: delete the duplicate code about the definition
|
||||
of hip08 DB fields
|
||||
|
||||
Delete the duplicate code about the definition of DB fields for hip08 OEM
|
||||
event format1 and format2, because the two OEM event formats are the same.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
non-standard-hisi_hip08.c | 23 +++++------------------
|
||||
1 file changed, 5 insertions(+), 18 deletions(-)
|
||||
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 8bf10c1..7fc6939 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -504,7 +504,7 @@ static char *pcie_local_sub_module_name(uint8_t id)
|
||||
}
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
-static const struct db_fields hip08_oem_type1_event_fields[] = {
|
||||
+static const struct db_fields hip08_oem_event_fields[] = {
|
||||
{ .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
{ .name = "timestamp", .type = "TEXT" },
|
||||
{ .name = "version", .type = "INTEGER" },
|
||||
@@ -519,27 +519,14 @@ static const struct db_fields hip08_oem_type1_event_fields[] = {
|
||||
|
||||
static const struct db_table_descriptor hip08_oem_type1_event_tab = {
|
||||
.name = "hip08_oem_type1_event_v2",
|
||||
- .fields = hip08_oem_type1_event_fields,
|
||||
- .num_fields = ARRAY_SIZE(hip08_oem_type1_event_fields),
|
||||
-};
|
||||
-
|
||||
-static const struct db_fields hip08_oem_type2_event_fields[] = {
|
||||
- { .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
- { .name = "timestamp", .type = "TEXT" },
|
||||
- { .name = "version", .type = "INTEGER" },
|
||||
- { .name = "soc_id", .type = "INTEGER" },
|
||||
- { .name = "socket_id", .type = "INTEGER" },
|
||||
- { .name = "nimbus_id", .type = "INTEGER" },
|
||||
- { .name = "module_id", .type = "TEXT" },
|
||||
- { .name = "sub_module_id", .type = "TEXT" },
|
||||
- { .name = "err_severity", .type = "TEXT" },
|
||||
- { .name = "regs_dump", .type = "TEXT" },
|
||||
+ .fields = hip08_oem_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields),
|
||||
};
|
||||
|
||||
static const struct db_table_descriptor hip08_oem_type2_event_tab = {
|
||||
.name = "hip08_oem_type2_event_v2",
|
||||
- .fields = hip08_oem_type2_event_fields,
|
||||
- .num_fields = ARRAY_SIZE(hip08_oem_type2_event_fields),
|
||||
+ .fields = hip08_oem_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields),
|
||||
};
|
||||
|
||||
static const struct db_fields hip08_pcie_local_event_fields[] = {
|
||||
--
|
||||
2.7.4
|
||||
|
||||
@ -1,190 +0,0 @@
|
||||
From 6ee76565274f31052868e970bce8768c314f6bb7 Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Mon, 27 Jul 2020 15:38:38 +0800
|
||||
Subject: [PATCH 2/3] rasdaemon: delete the code of non-standard error decoder
|
||||
for hip07
|
||||
|
||||
Delete the code of non-standard error decoder for hip07 that was never
|
||||
used, because the corresponding code in the Linux kernel wasn't accepted.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
Makefile.am | 2 +-
|
||||
non-standard-hisi_hip07.c | 151 ----------------------------------------------
|
||||
2 files changed, 1 insertion(+), 152 deletions(-)
|
||||
delete mode 100644 non-standard-hisi_hip07.c
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 51ef4de..23b4d60 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT
|
||||
rasdaemon_SOURCES += ras-report.c
|
||||
endif
|
||||
if WITH_HISI_NS_DECODE
|
||||
- rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c
|
||||
+ rasdaemon_SOURCES += non-standard-hisi_hip08.c
|
||||
endif
|
||||
if WITH_MEMORY_CE_PFA
|
||||
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
|
||||
diff --git a/non-standard-hisi_hip07.c b/non-standard-hisi_hip07.c
|
||||
deleted file mode 100644
|
||||
index 09ddcb2..0000000
|
||||
--- a/non-standard-hisi_hip07.c
|
||||
+++ /dev/null
|
||||
@@ -1,151 +0,0 @@
|
||||
-/*
|
||||
- * Copyright (c) 2017 Hisilicon Limited.
|
||||
- *
|
||||
- * This program is free software; you can redistribute it and/or modify
|
||||
- * it under the terms of the GNU General Public License as published by
|
||||
- * the Free Software Foundation; either version 2 of the License, or
|
||||
- * (at your option) any later version.
|
||||
- *
|
||||
- */
|
||||
-
|
||||
-#include <stdio.h>
|
||||
-#include <stdlib.h>
|
||||
-#include <string.h>
|
||||
-#include "ras-record.h"
|
||||
-#include "ras-logger.h"
|
||||
-#include "ras-report.h"
|
||||
-#include "ras-non-standard-handler.h"
|
||||
-
|
||||
-/* common definitions */
|
||||
-
|
||||
-/* HISI SAS definitions */
|
||||
-#define HISI_SAS_VALID_PA BIT(0)
|
||||
-#define HISI_SAS_VALID_MB_ERR BIT(1)
|
||||
-#define HISI_SAS_VALID_ERR_TYPE BIT(2)
|
||||
-#define HISI_SAS_VALID_AXI_ERR_INFO BIT(3)
|
||||
-
|
||||
-struct hisi_sas_err_sec {
|
||||
- uint64_t val_bits;
|
||||
- uint64_t physical_addr;
|
||||
- uint32_t mb;
|
||||
- uint32_t type;
|
||||
- uint32_t axi_err_info;
|
||||
-};
|
||||
-
|
||||
-/* Common Functions */
|
||||
-static char *err_bit_type(int etype)
|
||||
-{
|
||||
- switch (etype) {
|
||||
- case 0x0: return "single-bit ecc";
|
||||
- case 0x1: return "multi-bit ecc";
|
||||
- }
|
||||
- return "unknown error";
|
||||
-}
|
||||
-
|
||||
-/* SAS Functions */
|
||||
-static char *sas_err_type(int etype)
|
||||
-{
|
||||
- switch (etype) {
|
||||
- case 0x0001: return "hgc_dqe ecc";
|
||||
- case 0x0002: return "hgc_iost ecc";
|
||||
- case 0x0004: return "hgc_itct ecc";
|
||||
- case 0x0008: return "hgc_iostl ecc";
|
||||
- case 0x0010: return "hgc_itctl ecc";
|
||||
- case 0x0020: return "hgc_cqe ecc";
|
||||
- case 0x0040: return "rxm_mem0 ecc";
|
||||
- case 0x0080: return "rxm_mem1 ecc";
|
||||
- case 0x0100: return "rxm_mem2 ecc";
|
||||
- case 0x0200: return "rxm_mem3 ecc";
|
||||
- case 0x0400: return "wp_depth";
|
||||
- case 0x0800: return "iptt_slot_no_match";
|
||||
- case 0x1000: return "rp_depth";
|
||||
- case 0x2000: return "axi err";
|
||||
- case 0x4000: return "fifo err";
|
||||
- case 0x8000: return "lm_add_fetch_list";
|
||||
- case 0x10000: return "hgc_abt_fetch_lm";
|
||||
- }
|
||||
- return "unknown error";
|
||||
-}
|
||||
-
|
||||
-static char *sas_axi_err_type(int etype)
|
||||
-{
|
||||
- switch (etype) {
|
||||
- case 0x0001: return "IOST_AXI_W_ERR";
|
||||
- case 0x0002: return "IOST_AXI_R_ERR";
|
||||
- case 0x0004: return "ITCT_AXI_W_ERR";
|
||||
- case 0x0008: return "ITCT_AXI_R_ERR";
|
||||
- case 0x0010: return "SATA_AXI_W_ERR";
|
||||
- case 0x0020: return "SATA_AXI_R_ERR";
|
||||
- case 0x0040: return "DQE_AXI_R_ERR";
|
||||
- case 0x0080: return "CQE_AXI_W_ERR";
|
||||
- case 0x0100: return "CQE_WINFO_FIFO";
|
||||
- case 0x0200: return "CQE_MSG_FIFIO";
|
||||
- case 0x0400: return "GETDQE_FIFO";
|
||||
- case 0x0800: return "CMDP_FIFO";
|
||||
- case 0x1000: return "AWTCTRL_FIFO";
|
||||
- }
|
||||
- return "unknown error";
|
||||
-}
|
||||
-
|
||||
-static int decode_hip07_sas_error(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
- struct trace_seq *s,
|
||||
- struct ras_non_standard_event *event)
|
||||
-{
|
||||
- char buf[1024];
|
||||
- char *p = buf;
|
||||
- const struct hisi_sas_err_sec *err =
|
||||
- (struct hisi_sas_err_sec *)event->error;
|
||||
-
|
||||
- if (err->val_bits == 0) {
|
||||
- trace_seq_printf(s, "%s: no valid error data\n",
|
||||
- __func__);
|
||||
- return -1;
|
||||
- }
|
||||
- p += sprintf(p, "[");
|
||||
- if (err->val_bits & HISI_SAS_VALID_PA)
|
||||
- p += sprintf(p, "phy addr = 0x%p: ",
|
||||
- (void *)err->physical_addr);
|
||||
-
|
||||
- if (err->val_bits & HISI_SAS_VALID_MB_ERR)
|
||||
- p += sprintf(p, "%s: ", err_bit_type(err->mb));
|
||||
-
|
||||
- if (err->val_bits & HISI_SAS_VALID_ERR_TYPE)
|
||||
- p += sprintf(p, "error type = %s: ",
|
||||
- sas_err_type(err->type));
|
||||
-
|
||||
- if (err->val_bits & HISI_SAS_VALID_AXI_ERR_INFO)
|
||||
- p += sprintf(p, "axi error type = %s",
|
||||
- sas_axi_err_type(err->axi_err_info));
|
||||
-
|
||||
- p += sprintf(p, "]");
|
||||
-
|
||||
- trace_seq_printf(s, "\nHISI HIP07: SAS error: %s\n", buf);
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int decode_hip07_hns_error(struct ras_events *ras,
|
||||
- struct ras_ns_dec_tab *dec_tab,
|
||||
- struct trace_seq *s,
|
||||
- struct ras_non_standard_event *event)
|
||||
-{
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-struct ras_ns_dec_tab hisi_ns_dec_tab[] = {
|
||||
- {
|
||||
- .sec_type = "daffd8146eba4d8c8a91bc9bbf4aa301",
|
||||
- .decode = decode_hip07_sas_error,
|
||||
- },
|
||||
- {
|
||||
- .sec_type = "fbc2d923ea7a453dab132949f5af9e53",
|
||||
- .decode = decode_hip07_hns_error,
|
||||
- },
|
||||
- { /* sentinel */ }
|
||||
-};
|
||||
-
|
||||
-__attribute__((constructor))
|
||||
-static void hip07_init(void)
|
||||
-{
|
||||
- register_ns_dec_tab(hisi_ns_dec_tab);
|
||||
-}
|
||||
--
|
||||
2.7.4
|
||||
|
||||
@ -1,527 +0,0 @@
|
||||
From 8c30a852493a6204ded59872bb3a0f0e43537713 Mon Sep 17 00:00:00 2001
|
||||
From: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Date: Mon, 27 Jul 2020 15:38:39 +0800
|
||||
Subject: [PATCH 3/3] rasdaemon: add support for hisilicon common section
|
||||
decoder
|
||||
|
||||
Add a new non-standard error section, Hisilicon common section.
|
||||
It is defined for the next generation SoC Kunpeng930. It also supports
|
||||
Kunpeng920 and some modules of Kunpeng920 could be changed to use
|
||||
this section.
|
||||
|
||||
We put the code in a new source file, as it supports multiple hardware
|
||||
platforms. Some code of hip08 can be shared, so it is moved to this new file.
|
||||
|
||||
Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
Makefile.am | 2 +-
|
||||
non-standard-hisi_hip08.c | 79 +-----------
|
||||
non-standard-hisilicon.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++
|
||||
non-standard-hisilicon.h | 49 ++++++++
|
||||
4 files changed, 358 insertions(+), 79 deletions(-)
|
||||
create mode 100644 non-standard-hisilicon.c
|
||||
create mode 100644 non-standard-hisilicon.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 23b4d60..18d1a92 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT
|
||||
rasdaemon_SOURCES += ras-report.c
|
||||
endif
|
||||
if WITH_HISI_NS_DECODE
|
||||
- rasdaemon_SOURCES += non-standard-hisi_hip08.c
|
||||
+ rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
|
||||
endif
|
||||
if WITH_MEMORY_CE_PFA
|
||||
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
|
||||
diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c
|
||||
index 7fc6939..2197f81 100644
|
||||
--- a/non-standard-hisi_hip08.c
|
||||
+++ b/non-standard-hisi_hip08.c
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "ras-logger.h"
|
||||
#include "ras-report.h"
|
||||
#include "ras-non-standard-handler.h"
|
||||
+#include "non-standard-hisilicon.h"
|
||||
|
||||
/* HISI OEM error definitions */
|
||||
/* HISI OEM format1 error definitions */
|
||||
@@ -83,11 +84,6 @@
|
||||
#define HISI_PCIE_LOCAL_ERR_MISC_MAX 33
|
||||
#define HISI_BUF_LEN 1024
|
||||
|
||||
-#define HISI_ERR_SEVERITY_NFE 0
|
||||
-#define HISI_ERR_SEVERITY_FE 1
|
||||
-#define HISI_ERR_SEVERITY_CE 2
|
||||
-#define HISI_ERR_SEVERITY_NONE 3
|
||||
-
|
||||
struct hisi_oem_type1_err_sec {
|
||||
uint32_t val_bits;
|
||||
uint8_t version;
|
||||
@@ -145,12 +141,6 @@ struct hisi_pcie_local_err_sec {
|
||||
uint32_t err_misc[HISI_PCIE_LOCAL_ERR_MISC_MAX];
|
||||
};
|
||||
|
||||
-enum hisi_oem_data_type {
|
||||
- HISI_OEM_DATA_TYPE_INT,
|
||||
- HISI_OEM_DATA_TYPE_INT64,
|
||||
- HISI_OEM_DATA_TYPE_TEXT,
|
||||
-};
|
||||
-
|
||||
enum {
|
||||
HIP08_OEM_TYPE1_FIELD_ID,
|
||||
HIP08_OEM_TYPE1_FIELD_TIMESTAMP,
|
||||
@@ -199,20 +189,6 @@ struct hisi_module_info {
|
||||
int sub_num;
|
||||
};
|
||||
|
||||
-/* helper functions */
|
||||
-static char *err_severity(uint8_t err_sev)
|
||||
-{
|
||||
- switch (err_sev) {
|
||||
- case HISI_ERR_SEVERITY_NFE: return "recoverable";
|
||||
- case HISI_ERR_SEVERITY_FE: return "fatal";
|
||||
- case HISI_ERR_SEVERITY_CE: return "corrected";
|
||||
- case HISI_ERR_SEVERITY_NONE: return "none";
|
||||
- default:
|
||||
- break;
|
||||
- }
|
||||
- return "unknown";
|
||||
-}
|
||||
-
|
||||
static const char *pll_submodule_name[] = {
|
||||
"TB_PLL0",
|
||||
"TB_PLL1",
|
||||
@@ -549,59 +525,6 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = {
|
||||
.fields = hip08_pcie_local_event_fields,
|
||||
.num_fields = ARRAY_SIZE(hip08_pcie_local_event_fields),
|
||||
};
|
||||
-
|
||||
-static void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
- enum hisi_oem_data_type data_type,
|
||||
- int id, int64_t data, const char *text)
|
||||
-{
|
||||
- switch (data_type) {
|
||||
- case HISI_OEM_DATA_TYPE_INT:
|
||||
- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
|
||||
- break;
|
||||
- case HISI_OEM_DATA_TYPE_INT64:
|
||||
- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
|
||||
- break;
|
||||
- case HISI_OEM_DATA_TYPE_TEXT:
|
||||
- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
|
||||
- break;
|
||||
- default:
|
||||
- break;
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab,
|
||||
- const char *name)
|
||||
-{
|
||||
- int rc;
|
||||
-
|
||||
- rc = sqlite3_step(dec_tab->stmt_dec_record);
|
||||
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR,
|
||||
- "Failed to do %s step on sqlite: error = %d\n", name, rc);
|
||||
-
|
||||
- rc = sqlite3_reset(dec_tab->stmt_dec_record);
|
||||
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR,
|
||||
- "Failed to reset %s on sqlite: error = %d\n", name, rc);
|
||||
-
|
||||
- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
|
||||
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR,
|
||||
- "Failed to clear bindings %s on sqlite: error = %d\n",
|
||||
- name, rc);
|
||||
-
|
||||
- return rc;
|
||||
-}
|
||||
-#else
|
||||
-static void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
- enum hisi_oem_data_type data_type,
|
||||
- int id, int64_t data, const char *text)
|
||||
-{ }
|
||||
-
|
||||
-static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, char *name)
|
||||
-{
|
||||
- return 0;
|
||||
-}
|
||||
#endif
|
||||
|
||||
#define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end))
|
||||
diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c
|
||||
new file mode 100644
|
||||
index 0000000..c9e1fa9
|
||||
--- /dev/null
|
||||
+++ b/non-standard-hisilicon.c
|
||||
@@ -0,0 +1,307 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2020 Hisilicon Limited.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include "ras-record.h"
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
+#include "non-standard-hisilicon.h"
|
||||
+
|
||||
+#define HISI_BUF_LEN 2048
|
||||
+
|
||||
+struct hisi_common_error_section {
|
||||
+ uint32_t val_bits;
|
||||
+ uint8_t version;
|
||||
+ uint8_t soc_id;
|
||||
+ uint8_t socket_id;
|
||||
+ uint8_t totem_id;
|
||||
+ uint8_t nimbus_id;
|
||||
+ uint8_t subsystem_id;
|
||||
+ uint8_t module_id;
|
||||
+ uint8_t submodule_id;
|
||||
+ uint8_t core_id;
|
||||
+ uint8_t port_id;
|
||||
+ uint16_t err_type;
|
||||
+ struct {
|
||||
+ uint8_t function;
|
||||
+ uint8_t device;
|
||||
+ uint16_t segment;
|
||||
+ uint8_t bus;
|
||||
+ uint8_t reserved[3];
|
||||
+ } pcie_info;
|
||||
+ uint8_t err_severity;
|
||||
+ uint8_t reserved[3];
|
||||
+ uint32_t reg_array_size;
|
||||
+ uint32_t reg_array[];
|
||||
+};
|
||||
+
|
||||
+enum {
|
||||
+ HISI_COMMON_VALID_SOC_ID,
|
||||
+ HISI_COMMON_VALID_SOCKET_ID,
|
||||
+ HISI_COMMON_VALID_TOTEM_ID,
|
||||
+ HISI_COMMON_VALID_NIMBUS_ID,
|
||||
+ HISI_COMMON_VALID_SUBSYSTEM_ID,
|
||||
+ HISI_COMMON_VALID_MODULE_ID,
|
||||
+ HISI_COMMON_VALID_SUBMODULE_ID,
|
||||
+ HISI_COMMON_VALID_CORE_ID,
|
||||
+ HISI_COMMON_VALID_PORT_ID,
|
||||
+ HISI_COMMON_VALID_ERR_TYPE,
|
||||
+ HISI_COMMON_VALID_PCIE_INFO,
|
||||
+ HISI_COMMON_VALID_ERR_SEVERITY,
|
||||
+ HISI_COMMON_VALID_REG_ARRAY_SIZE,
|
||||
+};
|
||||
+
|
||||
+enum {
|
||||
+ HISI_COMMON_FIELD_ID,
|
||||
+ HISI_COMMON_FIELD_TIMESTAMP,
|
||||
+ HISI_COMMON_FIELD_ERR_INFO,
|
||||
+ HISI_COMMON_FIELD_REGS_DUMP,
|
||||
+};
|
||||
+
|
||||
+struct hisi_event {
|
||||
+ char error_msg[HISI_BUF_LEN];
|
||||
+ char reg_msg[HISI_BUF_LEN];
|
||||
+};
|
||||
+
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+ enum hisi_oem_data_type data_type,
|
||||
+ int id, int64_t data, const char *text)
|
||||
+{
|
||||
+ switch (data_type) {
|
||||
+ case HISI_OEM_DATA_TYPE_INT:
|
||||
+ sqlite3_bind_int(dec_tab->stmt_dec_record, id, data);
|
||||
+ break;
|
||||
+ case HISI_OEM_DATA_TYPE_INT64:
|
||||
+ sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data);
|
||||
+ break;
|
||||
+ case HISI_OEM_DATA_TYPE_TEXT:
|
||||
+ sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL);
|
||||
+ break;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
|
||||
+{
|
||||
+ int rc;
|
||||
+
|
||||
+ rc = sqlite3_step(dec_tab->stmt_dec_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do %s step on sqlite: error = %d\n", name, rc);
|
||||
+
|
||||
+ rc = sqlite3_reset(dec_tab->stmt_dec_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to reset %s on sqlite: error = %d\n", name, rc);
|
||||
+
|
||||
+ rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to clear bindings %s on sqlite: error = %d\n",
|
||||
+ name, rc);
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#else
|
||||
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+ enum hisi_oem_data_type data_type,
|
||||
+ int id, int64_t data, const char *text)
|
||||
+{ }
|
||||
+
|
||||
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+static const struct db_fields hisi_common_section_fields[] = {
|
||||
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
|
||||
+ { .name = "timestamp", .type = "TEXT" },
|
||||
+ { .name = "err_info", .type = "TEXT" },
|
||||
+ { .name = "regs_dump", .type = "TEXT" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor hisi_common_section_tab = {
|
||||
+ .name = "hisi_common_section",
|
||||
+ .fields = hisi_common_section_fields,
|
||||
+ .num_fields = ARRAY_SIZE(hisi_common_section_fields),
|
||||
+};
|
||||
+#endif
|
||||
+
|
||||
+static const char* soc_desc[] = {
|
||||
+ "Kunpeng916",
|
||||
+ "Kunpeng920",
|
||||
+ "Kunpeng930",
|
||||
+};
|
||||
+
|
||||
+static const char* module_name[] = {
|
||||
+ "MN",
|
||||
+ "PLL",
|
||||
+ "SLLC",
|
||||
+ "AA",
|
||||
+ "SIOE",
|
||||
+ "POE",
|
||||
+ "CPA",
|
||||
+ "DISP",
|
||||
+ "GIC",
|
||||
+ "ITS",
|
||||
+ "AVSBUS",
|
||||
+ "CS",
|
||||
+ "PPU",
|
||||
+ "SMMU",
|
||||
+ "PA",
|
||||
+ "HLLC",
|
||||
+ "DDRC",
|
||||
+ "L3TAG",
|
||||
+ "L3DATA",
|
||||
+ "PCS",
|
||||
+ "MATA",
|
||||
+ "PCIe Local",
|
||||
+ "SAS",
|
||||
+ "SATA",
|
||||
+ "NIC",
|
||||
+ "RoCE",
|
||||
+ "USB",
|
||||
+ "ZIP",
|
||||
+ "HPRE",
|
||||
+ "SEC",
|
||||
+ "RDE",
|
||||
+ "MEE",
|
||||
+ "HHA",
|
||||
+};
|
||||
+
|
||||
+static const char* get_soc_desc(uint8_t soc_id)
|
||||
+{
|
||||
+ if (soc_id >= sizeof(soc_desc)/sizeof(char *))
|
||||
+ return "unknown";
|
||||
+
|
||||
+ return soc_desc[soc_id];
|
||||
+}
|
||||
+
|
||||
+static void decode_module(struct hisi_event *event, uint8_t module_id)
|
||||
+{
|
||||
+ if (module_id >= sizeof(module_name)/sizeof(char *))
|
||||
+ HISI_SNPRINTF(event->error_msg, "module=unknown(id=%d) ", module_id);
|
||||
+ else
|
||||
+ HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]);
|
||||
+}
|
||||
+
|
||||
+static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab,
|
||||
+ const struct hisi_common_error_section *err,
|
||||
+ struct hisi_event *event)
|
||||
+{
|
||||
+ HISI_SNPRINTF(event->error_msg, "[ table_version=%d", err->version);
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id));
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "socket_id=%d", err->socket_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "totem_id=%d", err->totem_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "nimbus_id=%d", err->nimbus_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "subsystem_id=%d", err->subsystem_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID))
|
||||
+ decode_module(event, err->module_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "submodule_id=%d", err->submodule_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "core_id=%d", err->core_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID))
|
||||
+ HISI_SNPRINTF(event->error_msg, "port_id=%d", err->port_id);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE))
|
||||
+ HISI_SNPRINTF(event->error_msg, "err_type=%d", err->err_type);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO))
|
||||
+ HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x",
|
||||
+ err->pcie_info.segment, err->pcie_info.bus,
|
||||
+ err->pcie_info.device, err->pcie_info.function);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY))
|
||||
+ HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity));
|
||||
+
|
||||
+ HISI_SNPRINTF(event->error_msg, "]");
|
||||
+}
|
||||
+
|
||||
+static int decode_hisi_common_section(struct ras_events *ras,
|
||||
+ struct ras_ns_dec_tab *dec_tab,
|
||||
+ struct trace_seq *s,
|
||||
+ struct ras_non_standard_event *event)
|
||||
+{
|
||||
+ const struct hisi_common_error_section *err =
|
||||
+ (struct hisi_common_error_section *)event->error;
|
||||
+ struct hisi_event hevent;
|
||||
+
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ if (ras->record_events && !dec_tab->stmt_dec_record) {
|
||||
+ if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record,
|
||||
+ &hisi_common_section_tab) != SQLITE_OK) {
|
||||
+ trace_seq_printf(s, "create sql hisi_common_section_tab fail\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
+ memset(&hevent, 0, sizeof(struct hisi_event));
|
||||
+ trace_seq_printf(s, "\nHisilicon Common Error Section:\n");
|
||||
+ decode_hisi_common_section_hdr(dec_tab, err, &hevent);
|
||||
+ trace_seq_printf(s, "%s\n", hevent.error_msg);
|
||||
+
|
||||
+ if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) {
|
||||
+ int i;
|
||||
+
|
||||
+ trace_seq_printf(s, "Register Dump:\n");
|
||||
+ for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) {
|
||||
+ trace_seq_printf(s, "reg%02d=0x%08x\n", i,
|
||||
+ err->reg_array[i]);
|
||||
+ HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x",
|
||||
+ i, err->reg_array[i]);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (ras->record_events) {
|
||||
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_TIMESTAMP,
|
||||
+ 0, event->timestamp);
|
||||
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg);
|
||||
+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT,
|
||||
+ HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg);
|
||||
+ step_vendor_data_tab(dec_tab, "hisi_common_section_tab");
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+struct ras_ns_dec_tab hisi_section_ns_tab[] = {
|
||||
+ {
|
||||
+ .sec_type = "c8b328a899174af69a132e08ab2e7586",
|
||||
+ .decode = decode_hisi_common_section,
|
||||
+ },
|
||||
+ { /* sentinel */ }
|
||||
+};
|
||||
+
|
||||
+static void __attribute__((constructor)) hisi_ns_init(void)
|
||||
+{
|
||||
+ register_ns_dec_tab(hisi_section_ns_tab);
|
||||
+}
|
||||
diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h
|
||||
new file mode 100644
|
||||
index 0000000..1ce210a
|
||||
--- /dev/null
|
||||
+++ b/non-standard-hisilicon.h
|
||||
@@ -0,0 +1,49 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2020 Hisilicon Limited.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+#ifndef __NON_STANDARD_HISILICON_H
|
||||
+#define __NON_STANDARD_HISILICON_H
|
||||
+
|
||||
+#include "ras-non-standard-handler.h"
|
||||
+#include "ras-mc-handler.h"
|
||||
+
|
||||
+#define HISI_SNPRINTF mce_snprintf
|
||||
+
|
||||
+#define HISI_ERR_SEVERITY_NFE 0
|
||||
+#define HISI_ERR_SEVERITY_FE 1
|
||||
+#define HISI_ERR_SEVERITY_CE 2
|
||||
+#define HISI_ERR_SEVERITY_NONE 3
|
||||
+
|
||||
+enum hisi_oem_data_type {
|
||||
+ HISI_OEM_DATA_TYPE_INT,
|
||||
+ HISI_OEM_DATA_TYPE_INT64,
|
||||
+ HISI_OEM_DATA_TYPE_TEXT,
|
||||
+};
|
||||
+
|
||||
+/* helper functions */
|
||||
+static inline char *err_severity(uint8_t err_sev)
|
||||
+{
|
||||
+ switch (err_sev) {
|
||||
+ case HISI_ERR_SEVERITY_NFE: return "recoverable";
|
||||
+ case HISI_ERR_SEVERITY_FE: return "fatal";
|
||||
+ case HISI_ERR_SEVERITY_CE: return "corrected";
|
||||
+ case HISI_ERR_SEVERITY_NONE: return "none";
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ return "unknown";
|
||||
+}
|
||||
+
|
||||
+void record_vendor_data(struct ras_ns_dec_tab *dec_tab,
|
||||
+ enum hisi_oem_data_type data_type,
|
||||
+ int id, int64_t data, const char *text);
|
||||
+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name);
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
2.7.4
|
||||
|
||||
@ -1,454 +0,0 @@
|
||||
From 546cf713f667437fb6e283cc3dc090679eb47d08 Mon Sep 17 00:00:00 2001
|
||||
From: Subhendu Saha <subhends@akamai.com>
|
||||
Date: Tue, 12 Jan 2021 03:29:55 -0500
|
||||
Subject: [PATCH] Fix ras-mc-ctl script.
|
||||
|
||||
When rasdaemon is compiled without enabling aer, mce, devlink,
|
||||
etc., those tables are not created in the database file. Then
|
||||
ras-mc-ctl script breaks trying to query data from non-existent
|
||||
tables.
|
||||
|
||||
Signed-off-by: Subhendu Saha <subhends@akamai.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
util/ras-mc-ctl.in | 384 ++++++++++++++++++++++++---------------------
|
||||
1 file changed, 208 insertions(+), 176 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 665a042..be9d983 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -41,6 +41,18 @@ my $sysconfdir = "@sysconfdir@";
|
||||
my $dmidecode = find_prog ("dmidecode");
|
||||
my $modprobe = find_prog ("modprobe") or exit (1);
|
||||
|
||||
+my $has_aer = 0;
|
||||
+my $has_devlink = 0;
|
||||
+my $has_disk_errors = 0;
|
||||
+my $has_extlog = 0;
|
||||
+my $has_mce = 0;
|
||||
+
|
||||
+@WITH_AER_TRUE@$has_aer = 1;
|
||||
+@WITH_DEVLINK_TRUE@$has_devlink = 1;
|
||||
+@WITH_DISKERROR_TRUE@$has_disk_errors = 1;
|
||||
+@WITH_EXTLOG_TRUE@$has_extlog = 1;
|
||||
+@WITH_MCE_TRUE@$has_mce = 1;
|
||||
+
|
||||
my %conf = ();
|
||||
my %bus = ();
|
||||
my %dimm_size = ();
|
||||
@@ -1143,86 +1155,96 @@ sub summary
|
||||
$query_handle->finish;
|
||||
|
||||
# PCIe AER aer_event errors
|
||||
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($err_type, $msg, $count));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "\t$count $err_type errors: $msg\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "PCIe AER events summary:\n$out\n";
|
||||
- } else {
|
||||
- print "No PCIe AER errors.\n\n";
|
||||
+ if ($has_aer == 1) {
|
||||
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($err_type, $msg, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$count $err_type errors: $msg\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "PCIe AER events summary:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No PCIe AER errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# extlog errors
|
||||
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($etype, $severity, $count));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $etype_string = get_extlog_type($etype);
|
||||
- $severity_string = get_extlog_severity($severity);
|
||||
- $out .= "\t$count $etype_string $severity_string errors\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Extlog records summary:\n$out";
|
||||
- } else {
|
||||
- print "No Extlog errors.\n\n";
|
||||
+ if ($has_extlog == 1) {
|
||||
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($etype, $severity, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $etype_string = get_extlog_type($etype);
|
||||
+ $severity_string = get_extlog_severity($severity);
|
||||
+ $out .= "\t$count $etype_string $severity_string errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Extlog records summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No Extlog errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# devlink errors
|
||||
- $query = "select dev_name, count(*) from devlink_event group by dev_name";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($dev_name, $count));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "\t$dev_name has $count errors\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Devlink records summary:\n$out";
|
||||
- } else {
|
||||
- print "No devlink errors.\n";
|
||||
+ if ($has_devlink == 1) {
|
||||
+ $query = "select dev_name, count(*) from devlink_event group by dev_name";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($dev_name, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$dev_name has $count errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Devlink records summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No devlink errors.\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# Disk errors
|
||||
- $query = "select dev, count(*) from disk_errors group by dev";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($dev, $count));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "\t$dev has $count errors\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Disk errors summary:\n$out";
|
||||
- } else {
|
||||
- print "No disk errors.\n";
|
||||
+ if ($has_disk_errors == 1) {
|
||||
+ $query = "select dev, count(*) from disk_errors group by dev";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($dev, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$dev has $count errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Disk errors summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No disk errors.\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# MCE mce_record errors
|
||||
- $query = "select error_msg, count(*) from mce_record group by error_msg";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($msg, $count));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "\t$count $msg errors\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "MCE records summary:\n$out";
|
||||
- } else {
|
||||
- print "No MCE errors.\n";
|
||||
+ if ($has_mce == 1) {
|
||||
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($msg, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$count $msg errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "MCE records summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No MCE errors.\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
undef($dbh);
|
||||
}
|
||||
@@ -1259,128 +1281,138 @@ sub errors
|
||||
$query_handle->finish;
|
||||
|
||||
# PCIe AER aer_event errors
|
||||
- $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "$id $time $devname $type error: $msg\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "PCIe AER events:\n$out\n";
|
||||
- } else {
|
||||
- print "No PCIe AER errors.\n\n";
|
||||
+ if ($has_aer == 1) {
|
||||
+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $time $devname $type error: $msg\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "PCIe AER events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No PCIe AER errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# Extlog errors
|
||||
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $etype_string = get_extlog_type($etype);
|
||||
- $severity_string = get_extlog_severity($severity);
|
||||
- $out .= "$id $timestamp error: ";
|
||||
- $out .= "type=$etype_string, ";
|
||||
- $out .= "severity=$severity_string, ";
|
||||
- $out .= sprintf "address=0x%08x, ", $addr;
|
||||
- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
|
||||
- $out .= "fru_text='$fru_text', ";
|
||||
- $out .= get_cper_data_text($cper_data) if ($cper_data);
|
||||
- $out .= "\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Extlog events:\n$out\n";
|
||||
- } else {
|
||||
- print "No Extlog errors.\n\n";
|
||||
+ if ($has_extlog == 1) {
|
||||
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $etype_string = get_extlog_type($etype);
|
||||
+ $severity_string = get_extlog_severity($severity);
|
||||
+ $out .= "$id $timestamp error: ";
|
||||
+ $out .= "type=$etype_string, ";
|
||||
+ $out .= "severity=$severity_string, ";
|
||||
+ $out .= sprintf "address=0x%08x, ", $addr;
|
||||
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
|
||||
+ $out .= "fru_text='$fru_text', ";
|
||||
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Extlog events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No Extlog errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# devlink errors
|
||||
- $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "$id $timestamp error: ";
|
||||
- $out .= "bus_name=$bus_name, ";
|
||||
- $out .= "dev_name=$dev_name, ";
|
||||
- $out .= "driver_name=$driver_name, ";
|
||||
- $out .= "reporter_name=$reporter_name, ";
|
||||
- $out .= "message='$msg', ";
|
||||
- $out .= "\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Devlink events:\n$out\n";
|
||||
- } else {
|
||||
- print "No devlink errors.\n\n";
|
||||
+ if ($has_devlink == 1) {
|
||||
+ $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $timestamp error: ";
|
||||
+ $out .= "bus_name=$bus_name, ";
|
||||
+ $out .= "dev_name=$dev_name, ";
|
||||
+ $out .= "driver_name=$driver_name, ";
|
||||
+ $out .= "reporter_name=$reporter_name, ";
|
||||
+ $out .= "message='$msg', ";
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Devlink events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No devlink errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# Disk errors
|
||||
- $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "$id $timestamp error: ";
|
||||
- $out .= "dev=$dev, ";
|
||||
- $out .= "sector=$sector, ";
|
||||
- $out .= "nr_sector=$nr_sector, ";
|
||||
- $out .= "error='$error', ";
|
||||
- $out .= "rwbs='$rwbs', ";
|
||||
- $out .= "cmd='$cmd', ";
|
||||
- $out .= "\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "Disk errors\n$out\n";
|
||||
- } else {
|
||||
- print "No disk errors.\n\n";
|
||||
+ if ($has_disk_errors == 1) {
|
||||
+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $timestamp error: ";
|
||||
+ $out .= "dev=$dev, ";
|
||||
+ $out .= "sector=$sector, ";
|
||||
+ $out .= "nr_sector=$nr_sector, ";
|
||||
+ $out .= "error='$error', ";
|
||||
+ $out .= "rwbs='$rwbs', ";
|
||||
+ $out .= "cmd='$cmd', ";
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Disk errors\n$out\n";
|
||||
+ } else {
|
||||
+ print "No disk errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
# MCE mce_record errors
|
||||
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
|
||||
- $query_handle = $dbh->prepare($query);
|
||||
- $query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
|
||||
- $out = "";
|
||||
- while($query_handle->fetch()) {
|
||||
- $out .= "$id $time error: $msg";
|
||||
- $out .= ", CPU $cpuvendor" if ($cpuvendor);
|
||||
- $out .= ", bank $bank_name" if ($bank_name);
|
||||
- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
|
||||
- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
|
||||
- $out .= ", $mc_location" if ($mc_location);
|
||||
- $out .= ", $user_action" if ($user_action);
|
||||
- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
|
||||
- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
|
||||
- $out .= sprintf ", status=0x%08x", $status if ($status);
|
||||
- $out .= sprintf ", addr=0x%08x", $addr if ($addr);
|
||||
- $out .= sprintf ", misc=0x%08x", $misc if ($misc);
|
||||
- $out .= sprintf ", ip=0x%08x", $ip if ($ip);
|
||||
- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
|
||||
- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
|
||||
- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
|
||||
- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
|
||||
- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
|
||||
- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
|
||||
- $out .= sprintf ", cs=0x%08x", $cs if ($cs);
|
||||
- $out .= sprintf ", bank=0x%08x", $bank if ($bank);
|
||||
-
|
||||
- $out .= "\n";
|
||||
- }
|
||||
- if ($out ne "") {
|
||||
- print "MCE events:\n$out\n";
|
||||
- } else {
|
||||
- print "No MCE errors.\n\n";
|
||||
+ if ($has_mce == 1) {
|
||||
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $time error: $msg";
|
||||
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
|
||||
+ $out .= ", bank $bank_name" if ($bank_name);
|
||||
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
|
||||
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
|
||||
+ $out .= ", $mc_location" if ($mc_location);
|
||||
+ $out .= ", $user_action" if ($user_action);
|
||||
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
|
||||
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
|
||||
+ $out .= sprintf ", status=0x%08x", $status if ($status);
|
||||
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
|
||||
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
|
||||
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
|
||||
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
|
||||
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
|
||||
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
|
||||
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
|
||||
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
|
||||
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
|
||||
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
|
||||
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
|
||||
+
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "MCE events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No MCE errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
}
|
||||
- $query_handle->finish;
|
||||
|
||||
undef($dbh);
|
||||
}
|
||||
--
|
||||
2.27.0
|
||||
|
||||
37
backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
Normal file
37
backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
Normal file
@ -0,0 +1,37 @@
|
||||
From 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 Mon Sep 17 00:00:00 2001
|
||||
From: Matt Whitlock <whitslack@users.noreply.github.com>
|
||||
Date: Wed, 9 Jun 2021 10:25:18 -0400
|
||||
Subject: [PATCH] configure.ac: fix SYSCONFDEFDIR default value
|
||||
|
||||
configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like:
|
||||
|
||||
# Check whether --with-sysconfdefdir was given.
|
||||
if test "${with_sysconfdefdir+set}" = set; then :
|
||||
withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval
|
||||
else
|
||||
"/etc/sysconfig"
|
||||
fi
|
||||
|
||||
This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
configure.ac | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index f7d1947..33b81fe 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR])
|
||||
AC_ARG_WITH(sysconfdefdir,
|
||||
AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]),
|
||||
[SYSCONFDEFDIR=$withval],
|
||||
- ["/etc/sysconfig"])
|
||||
+ [SYSCONFDEFDIR=/etc/sysconfig])
|
||||
AC_SUBST([SYSCONFDEFDIR])
|
||||
|
||||
AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database])
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
From 059a901e97f4091e31c50ce55027daf707638f8d Mon Sep 17 00:00:00 2001
|
||||
From: dann frazier <dann.frazier@canonical.com>
|
||||
Date: Tue, 21 Apr 2020 15:56:04 -0600
|
||||
Subject: [PATCH] ras-mc-ctl: PCIe AER: display PCIe dev name
|
||||
|
||||
Storage of PCIe dev name was added in commit 8e96ca2c1c59 ("rasdaemon:
|
||||
store PCIe dev name and TLP header for the aer event"). This makes
|
||||
ras-mc-ctl extract and emit it like so:
|
||||
|
||||
PCIe AER events:
|
||||
1 2020-04-16 22:09:48 +0000 0000:0b:00.0 Corrected error: Receiver Error
|
||||
2 2020-04-16 22:23:24 +0000 0000:0b:00.0 Corrected error: Receiver Error
|
||||
3 2020-04-17 23:00:37 +0000 0000:d9:01.0 Corrected error: Advisory Non-Fatal, BIT15
|
||||
4 2020-04-17 23:21:52 +0000 0000:d9:01.0 Corrected error: Advisory Non-Fatal
|
||||
5 2020-04-18 02:04:24 +0000 0000:5e:00.0 Corrected error: Receiver Error
|
||||
|
||||
Signed-off-by: Dann Frazier <dann.frazier@canonical.com>
|
||||
Tested-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 8d6d866..665a042 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -1230,7 +1230,7 @@ sub summary
|
||||
sub errors
|
||||
{
|
||||
require DBI;
|
||||
- my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
+ my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
|
||||
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
|
||||
my ($bus_name, $dev_name, $driver_name, $reporter_name);
|
||||
@@ -1259,13 +1259,13 @@ sub errors
|
||||
$query_handle->finish;
|
||||
|
||||
# PCIe AER aer_event errors
|
||||
- $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
|
||||
+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $type, $msg));
|
||||
+ $query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- $out .= "$id $time $type error: $msg\n";
|
||||
+ $out .= "$id $time $devname $type error: $msg\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
print "PCIe AER events:\n$out\n";
|
||||
@ -1,29 +0,0 @@
|
||||
From 00115dda854f4a50681ccc6c017daa991234411b Mon Sep 17 00:00:00 2001
|
||||
From: Liguang Zhang <zhangliguang@linux.alibaba.com>
|
||||
Date: Mon, 10 Aug 2020 11:07:43 +0800
|
||||
Subject: [PATCH] rasdaemon: Fix error print
|
||||
|
||||
Fix error print handle_ras_events.
|
||||
|
||||
Signed-off-by: Liguang Zhang <zhangliguang@linux.alibaba.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
---
|
||||
ras-events.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index a99fd29..c797b20 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -874,7 +874,7 @@ int handle_ras_events(int record_events)
|
||||
num_events++;
|
||||
} else
|
||||
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
||||
- "ras", "aer_event");
|
||||
+ "ras", "extlog_mem_event");
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_DEVLINK
|
||||
--
|
||||
2.18.4
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
From 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b Mon Sep 17 00:00:00 2001
|
||||
From: Muralidhara M K <muralimk@amd.com>
|
||||
Date: Tue, 27 Jul 2021 06:36:45 -0500
|
||||
Subject: [PATCH] rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes
|
||||
|
||||
Removes trailing spaces at the end of a line from
|
||||
file location and fixes --layout option to parse dimm nodes
|
||||
to get the size of each dimm from ras-mc-ctl.
|
||||
|
||||
The issue is reported at https://github.com/mchehab/rasdaemon/issues/43
|
||||
Where '> ras-mc-ctl --layout' reports all 0s
|
||||
|
||||
With this change the layout option prints the correct dimm sizes
|
||||
> sudo ras-mc-ctl --layout
|
||||
+-----------------------------------------------+
|
||||
| mc0 |
|
||||
| csrow0 | csrow1 | csrow2 | csrow3 |
|
||||
----------+-----------------------------------------------+
|
||||
...
|
||||
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
|
||||
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
|
||||
...
|
||||
----------+-----------------------------------------------+
|
||||
|
||||
Signed-off-by: Muralidhara M K <muralimk@amd.com>
|
||||
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
|
||||
Cc: Yazen Ghannam <yazen.ghannam@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/
|
||||
---
|
||||
util/ras-mc-ctl.in | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 1e3aeb7..b22dd60 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
|
||||
if (($file =~ /max_location$/)) {
|
||||
open IN, $file;
|
||||
my $location = <IN>;
|
||||
+ $location =~ s/\s+$//;
|
||||
close IN;
|
||||
my @temp = split(/ /, $location);
|
||||
|
||||
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
|
||||
|
||||
open IN, $file;
|
||||
my $location = <IN>;
|
||||
+ $location =~ s/\s+$//;
|
||||
close IN;
|
||||
|
||||
my @pos;
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
From ce33041e0abfa20054ff5d6874ffbd1ab592558d Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Thu, 19 Jan 2023 08:45:57 -0500
|
||||
Subject: [PATCH] rasdaemon: ras-memory-failure-handler: handle localtime()
|
||||
failure correctly
|
||||
|
||||
We could just have an empty string but keeping the format could prevent
|
||||
issues if someone is actually parsing this.
|
||||
Found with covscan.
|
||||
|
||||
v2: fixed the timestamp as pointed by Robert Elliott
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
---
|
||||
ras-memory-failure-handler.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
|
||||
index 9941e68..1951456 100644
|
||||
--- a/ras-memory-failure-handler.c
|
||||
+++ b/ras-memory-failure-handler.c
|
||||
@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
|
||||
if (tm)
|
||||
strftime(ev.timestamp, sizeof(ev.timestamp),
|
||||
"%Y-%m-%d %H:%M:%S %z", tm);
|
||||
+ else
|
||||
+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
|
||||
trace_seq_printf(s, "%s ", ev.timestamp);
|
||||
|
||||
if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -0,0 +1,93 @@
|
||||
From 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Thu, 19 Jan 2023 08:45:57 -0500
|
||||
Subject: [PATCH] rasdaemon: ras-report: fix possible but unlikely file
|
||||
descriptor leak
|
||||
|
||||
Found with covscan.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
---
|
||||
ras-report.c | 16 ++++++++--------
|
||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ras-report.c b/ras-report.c
|
||||
index ea3a9b6..62d5eb7 100644
|
||||
--- a/ras-report.c
|
||||
+++ b/ras-report.c
|
||||
@@ -434,7 +434,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){
|
||||
|
||||
mc_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -484,7 +484,7 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){
|
||||
|
||||
aer_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -533,7 +533,7 @@ int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standar
|
||||
|
||||
non_standard_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -578,7 +578,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){
|
||||
|
||||
arm_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -624,7 +624,7 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
|
||||
|
||||
mce_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -674,7 +674,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev){
|
||||
|
||||
devlink_fail:
|
||||
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -723,7 +723,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e
|
||||
done = 1;
|
||||
|
||||
diskerror_fail:
|
||||
- if(sockfd > 0){
|
||||
+ if(sockfd >= 0){
|
||||
close(sockfd);
|
||||
}
|
||||
|
||||
@@ -768,7 +768,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
|
||||
done = 1;
|
||||
|
||||
mf_fail:
|
||||
- if (sockfd > 0)
|
||||
+ if (sockfd >= 0)
|
||||
close(sockfd);
|
||||
|
||||
if (done)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -15,7 +15,7 @@ index e73a08a..04a0489 100644
|
||||
@@ -4,7 +4,7 @@ After=syslog.target
|
||||
|
||||
[Service]
|
||||
EnvironmentFile=/etc/sysconfig/rasdaemon
|
||||
EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon
|
||||
-ExecStart=@sbindir@/rasdaemon -f -r
|
||||
+ExecStart=@sbindir@/rasdaemon -f
|
||||
ExecStartPost=@sbindir@/rasdaemon --enable
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
From fd8c8d1f66a9058a27c2d1fbfb11225499abebb1 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Wed, 15 Dec 2021 12:54:41 +0800
|
||||
Subject: [PATCH] fix where local variables are not initialized
|
||||
|
||||
---
|
||||
ras-cpu-isolation.c | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index bca7e0b..acef1ad 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -112,6 +112,8 @@ static int init_cpu_info(unsigned cpus)
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < cpus; ++i) {
|
||||
+ cpu_infos[i].ce_nums = 0;
|
||||
+ cpu_infos[i].uce_nums = 0;
|
||||
cpu_infos[i].state = get_cpu_status(i);
|
||||
cpu_infos[i].ce_queue = init_queue();
|
||||
if (cpu_infos[i].ce_queue == NULL) {
|
||||
@@ -384,6 +386,8 @@ void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
log(TERM, LOG_INFO, "Offline cpu%d succeed, the state is %s\n",
|
||||
cpu, cpu_state[cpu_infos[cpu].state]);
|
||||
clear_queue(cpu_infos[cpu].ce_queue);
|
||||
+ cpu_infos[cpu].ce_nums = 0;
|
||||
+ cpu_infos[cpu].uce_nums = 0;
|
||||
}
|
||||
else {
|
||||
log(TERM, LOG_INFO, "Offline cpu%d fail, the state is %s\n",
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,234 +0,0 @@
|
||||
From b82767ec717976223134d4e279f874352e7910c9 Mon Sep 17 00:00:00 2001
|
||||
From: Lostwayzxc <luoshengwei@huawei.com>
|
||||
Date: Wed, 24 Nov 2021 09:43:52 +0800
|
||||
Subject: [PATCH] modify the way counting cpu logical index
|
||||
|
||||
It's hard to count cpu logical index according to the mpidr in the userspace,
|
||||
so the index will be counted in the kernel before reported to userspace now.
|
||||
|
||||
Related patches:
|
||||
0006-add-cpu-online-fault-isolation.patch
|
||||
0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
||||
|
||||
---
|
||||
ras-arm-handler.c | 8 ++-
|
||||
ras-cpu-isolation.c | 127 ++------------------------------------------
|
||||
ras-cpu-isolation.h | 6 +--
|
||||
3 files changed, 11 insertions(+), 130 deletions(-)
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index 8a229b4..47f9a57 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -124,6 +124,12 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
|
||||
|
||||
#ifdef HAVE_CPU_FAULT_ISOLATION
|
||||
+ int cpu;
|
||||
+ if (pevent_get_field_val(s, event, "cpu", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ cpu = val;
|
||||
+ trace_seq_printf(s, "\n cpu: %d", cpu);
|
||||
+
|
||||
/* record cpu error */
|
||||
if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
return -1;
|
||||
@@ -156,7 +162,7 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
nums = count_errors(event, ev.error_info, len);
|
||||
if (nums > 0) {
|
||||
struct error_info err_info = {nums, now, val};
|
||||
- ras_record_cpu_error(&err_info, ev.mpidr);
|
||||
+ ras_record_cpu_error(&err_info, cpu);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
diff --git a/ras-cpu-isolation.c b/ras-cpu-isolation.c
|
||||
index b1643c4..bca7e0b 100644
|
||||
--- a/ras-cpu-isolation.c
|
||||
+++ b/ras-cpu-isolation.c
|
||||
@@ -24,13 +24,9 @@
|
||||
#include "ras-cpu-isolation.h"
|
||||
|
||||
static struct cpu_info *cpu_infos = NULL;
|
||||
-static unsigned int ncores, cores_per_socket, cores_per_die;
|
||||
-static unsigned int cores_per_cluster = 4;
|
||||
-static unsigned int sockets, dies = 1;
|
||||
+static unsigned int ncores;
|
||||
static unsigned int enabled = 1;
|
||||
static const char *cpu_path_format = "/sys/devices/system/cpu/cpu%d/online";
|
||||
-static const char *core_siblings_list_path = "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list";
|
||||
-static const char *node_path = "/sys/devices/system/node/possible";
|
||||
|
||||
static const struct param normal_units[] = {
|
||||
{ "", 1 },
|
||||
@@ -86,69 +82,6 @@ static int open_sys_file(unsigned cpu, int __oflag, const char *format)
|
||||
return fd;
|
||||
}
|
||||
|
||||
-static int get_sockets(void)
|
||||
-{
|
||||
- int fd, j;
|
||||
- char buf[MAX_BUF_LEN] = "";
|
||||
- cores_per_socket = ncores;
|
||||
- struct cpu_set *cpu_sets = (struct cpu_set *) malloc(sizeof(*cpu_sets) * ncores);
|
||||
-
|
||||
- if (!cpu_sets) {
|
||||
- log(TERM, LOG_ERR, "Failed to allocate memory for cpu sets in %s.\n", __func__);
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- for (int i = 0; i < ncores; ++i) {
|
||||
- fd = open_sys_file(i, O_RDONLY, core_siblings_list_path);
|
||||
- if (fd == -1) {
|
||||
- continue;
|
||||
- }
|
||||
- memset(buf, '\0', strlen(buf));
|
||||
- if (read(fd, buf, sizeof(buf)) <= 0) {
|
||||
- close(fd);
|
||||
- continue;
|
||||
- }
|
||||
- for (j = 0; j < sockets; ++j) {
|
||||
- if (strcmp(cpu_sets[j].buf, buf) == 0) {
|
||||
- break;
|
||||
- }
|
||||
- }
|
||||
- if (j == sockets) {
|
||||
- strcpy(cpu_sets[sockets].buf, buf);
|
||||
- sockets++;
|
||||
- }
|
||||
- close(fd);
|
||||
- }
|
||||
-
|
||||
- free(cpu_sets);
|
||||
- cores_per_socket = sockets > 0 ? ncores / sockets : ncores;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int get_dies(void)
|
||||
-{
|
||||
- int fd, begin, end;
|
||||
- char buf[20] = "";
|
||||
- cores_per_die = ncores;
|
||||
- fd = open(node_path, O_RDONLY);
|
||||
-
|
||||
- if (fd == -1) {
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- if (read(fd, buf, sizeof(buf))) {
|
||||
- if (sscanf(buf, "%d-%d", &begin, &end) == 2) {
|
||||
- dies = end > begin ? end - begin + 1 : 1;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- close(fd);
|
||||
- cores_per_die = ncores / dies;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int get_cpu_status(unsigned cpu)
|
||||
{
|
||||
int fd, num;
|
||||
@@ -190,11 +123,6 @@ static int init_cpu_info(unsigned cpus)
|
||||
cpu_limit.limit = cpus - 1;
|
||||
cpu_limit.value = 0;
|
||||
|
||||
- if (get_sockets() < 0 || get_dies() < 0) {
|
||||
- log(TERM, LOG_ERR, "Failed to get sockets or nodes of the system\n");
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -418,64 +346,15 @@ static void record_error_info(unsigned cpu, struct error_info *err_info)
|
||||
}
|
||||
}
|
||||
|
||||
-static unsigned long get_bit_value(int64_t value, unsigned offset, unsigned size)
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu)
|
||||
{
|
||||
- value >>= offset;
|
||||
- unsigned long res = 0;
|
||||
- int i = 0;
|
||||
-
|
||||
- while (i < size) {
|
||||
- res |= (value & (0x1 << (i++)));
|
||||
- }
|
||||
-
|
||||
- return res;
|
||||
-}
|
||||
-
|
||||
-static unsigned get_cpu_index(int64_t mpidr)
|
||||
-{
|
||||
- unsigned core_id, cluster_id, socket_id, die_id, cpu;
|
||||
- /*
|
||||
- * Adapt to certain BIOS
|
||||
- * In the MPIDR:
|
||||
- * bit 8:15: core id
|
||||
- * bit 16:18: cluster id
|
||||
- * bit 19:20: die_id
|
||||
- * bit 21:22: socket_id
|
||||
- */
|
||||
- core_id = get_bit_value(mpidr, 8, 8);
|
||||
- cluster_id = get_bit_value(mpidr, 16, 3);
|
||||
- socket_id = get_bit_value(mpidr, 21, 2);
|
||||
- die_id = get_bit_value(mpidr, 19, 2);
|
||||
-
|
||||
- /* When die id parsed from MPIDR is 1, it means TotemA, and when it's 3,
|
||||
- * it means TotemB. When cores per die equal to cores per socket, it means
|
||||
- * that there is only one die in the socket, in case that the only die is
|
||||
- * TotemB in CPU 1620s, we set die id to 0 directly.
|
||||
- */
|
||||
- if (cores_per_die == cores_per_socket) {
|
||||
- die_id = 0;
|
||||
- }
|
||||
- else {
|
||||
- die_id = (die_id == 1 ? 0:1);
|
||||
- }
|
||||
- cpu = core_id + socket_id * cores_per_socket + die_id * cores_per_die +
|
||||
- cluster_id * cores_per_cluster;
|
||||
-
|
||||
- return cpu;
|
||||
-}
|
||||
-
|
||||
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr)
|
||||
-{
|
||||
- unsigned cpu;
|
||||
int ret;
|
||||
|
||||
if (enabled == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
- cpu = get_cpu_index(mpidr);
|
||||
-
|
||||
- if (cpu >= ncores) {
|
||||
+ if (cpu >= ncores || cpu < 0) {
|
||||
log(TERM, LOG_ERR, "The current cpu %d has exceed the total number of cpu:%d\n", cpu, ncores);
|
||||
return;
|
||||
}
|
||||
diff --git a/ras-cpu-isolation.h b/ras-cpu-isolation.h
|
||||
index a7d3fdb..95dedc1 100644
|
||||
--- a/ras-cpu-isolation.h
|
||||
+++ b/ras-cpu-isolation.h
|
||||
@@ -65,12 +65,8 @@ struct error_info {
|
||||
enum error_type err_type;
|
||||
};
|
||||
|
||||
-struct cpu_set {
|
||||
- char buf[MAX_BUF_LEN];
|
||||
-};
|
||||
-
|
||||
void ras_error_count_init(unsigned cpus);
|
||||
-void ras_record_cpu_error(struct error_info *err_info, int64_t mpidr);
|
||||
+void ras_record_cpu_error(struct error_info *err_info, int cpu);
|
||||
void cpu_infos_free(void);
|
||||
|
||||
#endif
|
||||
\ No newline at end of file
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,18 +0,0 @@
|
||||
From d59e4d224b3271cf7a7fe53cd7c5d539b58eac32 Mon Sep 17 00:00:00 2001
|
||||
From: lvying <lvying6@huawei.com>
|
||||
Date: Sat, 26 Jan 2019 15:54:17 +0800
|
||||
Subject: [PATCH] rasdaemon:fix ras events memory leak
|
||||
|
||||
reason:fix ras events memory leak
|
||||
|
||||
diff -uprN a/ras-events.c b/ras-events.c
|
||||
--- a/ras-events.c 2018-06-22 14:20:42.880878700 +0800
|
||||
+++ b/ras-events.c 2018-06-22 14:38:24.420726900 +0800
|
||||
@@ -314,6 +314,7 @@ static void parse_ras_data(struct pthrea
|
||||
trace_seq_init(&s);
|
||||
pevent_print_event(pdata->ras->pevent, &s, &record);
|
||||
trace_seq_do_printf(&s);
|
||||
+ trace_seq_destroy(&s);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
@ -0,0 +1,41 @@
|
||||
From d439975850f947ced01423dc4bb4d6406022b4e1 Mon Sep 17 00:00:00 2001
|
||||
From: hubin <hubin73@huawei.com>
|
||||
Date: Thu, 18 May 2023 16:14:41 +0800
|
||||
Subject: [PATCH] ras-events: quit loop in read_ras_event when kbuf data is
|
||||
broken
|
||||
|
||||
When kbuf data is broken, kbuffer_next_event() may move kbuf->index back to
|
||||
the current kbuf->index position, causing an infinite loop.
|
||||
|
||||
In this situation, rasdaemon will repeatedly parse an invalid event, and
|
||||
print warnings like "ug! negative record size -8!", pushing CPU utilization
|
||||
rate to 100%.
|
||||
|
||||
When kbuf data is broken, discard the current page and continue reading the next page
|
||||
kbuf.
|
||||
|
||||
Signed-off-by: hubin <hubin73@huawei.com>
|
||||
---
|
||||
ras-events.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 1479732..11ecb4d 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -498,6 +498,11 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
|
||||
kbuffer_load_subbuffer(kbuf, page);
|
||||
|
||||
while ((data = kbuffer_read_event(kbuf, &time_stamp))) {
|
||||
+ if (kbuffer_curr_size(kbuf) < 0) {
|
||||
+ log(TERM, LOG_ERR, "invalid kbuf data, discard\n");
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
parse_ras_data(&pdata[i],
|
||||
kbuf, data, time_stamp);
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
From fd9341f5f7f3896c4de2a9a90d7dc366fd2ffedc Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Thu, 1 Dec 2022 12:39:11 +0000
|
||||
Subject: [PATCH] fix ras-mc-ctl.service startup failed when selinux is on
|
||||
|
||||
---
|
||||
util/ras-mc-ctl.in | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 9198a23..888b4e8 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -39,7 +39,7 @@ my $dbname = "@RASSTATEDIR@/@RAS_DB_FNAME@";
|
||||
my $prefix = "@prefix@";
|
||||
my $sysconfdir = "@sysconfdir@";
|
||||
my $dmidecode = find_prog ("dmidecode");
|
||||
-my $modprobe = find_prog ("modprobe") or exit (1);
|
||||
+my $modprobe = find_prog ("modprobe");
|
||||
|
||||
my $has_aer = 0;
|
||||
my $has_arm = 0;
|
||||
--
|
||||
2.33.0
|
||||
|
||||
Binary file not shown.
BIN
rasdaemon-0.6.7.tar.gz
Normal file
BIN
rasdaemon-0.6.7.tar.gz
Normal file
Binary file not shown.
62
rasdaemon-diskerror-fix-incomplete-diskerror-log.patch
Normal file
62
rasdaemon-diskerror-fix-incomplete-diskerror-log.patch
Normal file
@ -0,0 +1,62 @@
|
||||
From be5ea839fd52453f01ceb131813fb2e6919684ab Mon Sep 17 00:00:00 2001
|
||||
From: Lv Ying <lvying6@huawei.com>
|
||||
Date: Thu, 15 Dec 2022 21:01:59 +0800
|
||||
Subject: [PATCH] rasdaemon/diskerror: fix incomplete diskerror log
|
||||
|
||||
Currently, rasdaemon outputs an incomplete diskerror log (containing only the timestamp):
|
||||
<idle>-0 [000] 0.017915: block_rq_complete: 2022-12-16 04:17:32 +0800
|
||||
|
||||
Fix the incomplete diskerror log so that it matches the block_rq_complete tracepoint output format:
|
||||
<idle>-0 [042] d.h. 177962.715669: block_rq_complete: 21,0 N () 18446744073709551615 + 0 [-121]
|
||||
---
|
||||
ras-diskerror-handler.c | 22 ++++++++++++++--------
|
||||
1 file changed, 14 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ras-diskerror-handler.c b/ras-diskerror-handler.c
|
||||
index b16319f..0a6e315 100644
|
||||
--- a/ras-diskerror-handler.c
|
||||
+++ b/ras-diskerror-handler.c
|
||||
@@ -97,26 +97,32 @@ int ras_diskerror_event_handler(struct trace_seq *s,
|
||||
dev = (dev_t)val;
|
||||
if (asprintf(&ev.dev, "%u:%u", major(dev), minor(dev)) < 0)
|
||||
return -1;
|
||||
+ trace_seq_printf(s, "%s ", ev.dev);
|
||||
+
|
||||
+ ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1);
|
||||
+ if (!ev.rwbs)
|
||||
+ return -1;
|
||||
+ trace_seq_printf(s, "%s ", ev.rwbs);
|
||||
+
|
||||
+ ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1);
|
||||
+ if (!ev.cmd)
|
||||
+ return -1;
|
||||
+ trace_seq_printf(s, "(%s) ", ev.cmd);
|
||||
|
||||
if (pevent_get_field_val(s, event, "sector", record, &val, 1) < 0)
|
||||
return -1;
|
||||
ev.sector = val;
|
||||
+ trace_seq_printf(s, "%llu ", ev.sector);
|
||||
|
||||
if (pevent_get_field_val(s, event, "nr_sector", record, &val, 1) < 0)
|
||||
return -1;
|
||||
ev.nr_sector = (unsigned int)val;
|
||||
+ trace_seq_printf(s, "+ %u ", ev.nr_sector);
|
||||
|
||||
if (pevent_get_field_val(s, event, "error", record, &val, 1) < 0)
|
||||
return -1;
|
||||
ev.error = get_blk_error((int)val);
|
||||
-
|
||||
- ev.rwbs = pevent_get_field_raw(s, event, "rwbs", record, &len, 1);
|
||||
- if (!ev.rwbs)
|
||||
- return -1;
|
||||
-
|
||||
- ev.cmd = pevent_get_field_raw(s, event, "cmd", record, &len, 1);
|
||||
- if (!ev.cmd)
|
||||
- return -1;
|
||||
+ trace_seq_printf(s, "[%s]", ev.error);
|
||||
|
||||
/* Insert data into the SGBD */
|
||||
#ifdef HAVE_SQLITE3
|
||||
--
|
||||
2.38.1
|
||||
|
||||
192
rasdaemon.spec
192
rasdaemon.spec
@ -1,10 +1,10 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.6
|
||||
Release: 10
|
||||
Version: 0.6.7
|
||||
Release: 13
|
||||
License: GPLv2
|
||||
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
|
||||
URL: https://github.com/mchehab/rasdaemon.git
|
||||
Source0: %{name}-%{version}.tar.gz
|
||||
Source0: https://github.com/mchehab/rasdaemon/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz
|
||||
|
||||
ExcludeArch: s390 s390x
|
||||
BuildRequires: gcc, gettext-devel, perl-generators, sqlite-devel, systemd, git, libtool
|
||||
@ -19,28 +19,42 @@ Requires(post): systemd
|
||||
Requires(preun): systemd
|
||||
Requires(postun): systemd
|
||||
|
||||
Patch1: bugfix-ras-events-memory-leak.patch
|
||||
Patch2: bugfix-rasdaemon-wait-for-file-access.patch
|
||||
Patch3: bugfix-fix-fd-check.patch
|
||||
Patch4: bugfix-fix-disk-error-log-storm.patch
|
||||
Patch5: backport-rasdaemon-Fix-error-print.patch
|
||||
Patch6: backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch
|
||||
Patch7: backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch
|
||||
Patch8: backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch
|
||||
Patch9: backport-0001-rasdaemon-Modify-non-standard-error-decoding-interfa.patch
|
||||
Patch10: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
|
||||
Patch11: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
|
||||
Patch12: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
|
||||
Patch13: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
|
||||
Patch14: 0006-add-cpu-online-fault-isolation.patch
|
||||
Patch15: 0007-add-trace-print-and-add-sqlite-store.patch
|
||||
Patch16: 0008-modify-cpu-parse-for-adapting-to-new-bios-version.patch
|
||||
Patch17: bugfix-modify-the-way-counting-cpu-logical-index.patch
|
||||
Patch18: bugfix-fix-where-local-variables-are-not-initialized.patch
|
||||
Patch19: backport-ras-mc-ctl-PCIe-AER-display-PCIe-dev-name.patch
|
||||
Patch20: backport-Fix-ras-mc-ctl-script.patch
|
||||
Patch1: bugfix-rasdaemon-wait-for-file-access.patch
|
||||
Patch2: bugfix-fix-fd-check.patch
|
||||
Patch3: bugfix-fix-disk-error-log-storm.patch
|
||||
Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch
|
||||
Patch5: 0001-rasdaemon-Support-cpu-fault-isolation-for-corrected-.patch
|
||||
Patch6: 0002-rasdaemon-Support-cpu-fault-isolation-for-recoverabl.patch
|
||||
Patch7: 0001-rasdaemon-Fix-the-issue-of-sprintf-data-type-mismatc.patch
|
||||
Patch8: 0002-rasdaemon-Fix-the-issue-of-command-option-r-for-hip0.patch
|
||||
Patch9: 0003-rasdaemon-Fix-some-print-format-issues-for-hisi-comm.patch
|
||||
Patch10: 0004-rasdaemon-Add-some-modules-supported-by-hisi-common-.patch
|
||||
Patch11: 0003-rasdaemon-Modify-recording-Hisilicon-common-error-da.patch
|
||||
Patch12: 0004-rasdaemon-ras-mc-ctl-Modify-error-statistics-for-HiS.patch
|
||||
Patch13: 0005-rasdaemon-ras-mc-ctl-Reformat-error-info-of-the-HiSi.patch
|
||||
Patch14: 0006-rasdaemon-ras-mc-ctl-Add-printing-usage-if-necessary.patch
|
||||
Patch15: 0007-rasdaemon-ras-mc-ctl-Add-support-to-display-the-HiSi.patch
|
||||
Patch16: 0008-rasdaemon-ras-mc-ctl-Relocate-reading-and-display-Ku.patch
|
||||
Patch17: 0009-rasdaemon-ras-mc-ctl-Updated-HiSilicon-platform-name.patch
|
||||
Patch18: 0010-rasdaemon-Fix-for-a-memory-out-of-bounds-issue-and-o.patch
|
||||
Patch19: 0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch
|
||||
Patch20: rasdaemon-diskerror-fix-incomplete-diskerror-log.patch
|
||||
Patch21: backport-traceevent-Add-proper-KBUFFER_TYPE_TIME_STAMP-handling.patch
|
||||
|
||||
Patch6000: backport-rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch
|
||||
Patch6001: backport-rasdaemon-ras-memory-failure-handler-handle-localtim.patch
|
||||
Patch6002: backport-rasdaemon-ras-report-fix-possible-but-unlikely-file-.patch
|
||||
|
||||
Patch9000: fix-ras-mc-ctl.service-startup-failed-when-selinux-is-no.patch
|
||||
Patch9001: 0001-rasdaemon-Fix-for-regression-in-ras_mc_create_table-.patch
|
||||
Patch9002: 0002-rasdaemon-Fix-poll-on-per_cpu-trace_pipe_raw-blocks-.patch
|
||||
Patch9003: 0001-rasdaemon-fix-return-value-type-issue-of-read-write-.patch
|
||||
Patch9004: 0002-rasdaemon-fix-issue-of-signed-and-unsigned-integer-c.patch
|
||||
Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
|
||||
Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
|
||||
Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch
|
||||
|
||||
|
||||
%description
|
||||
The rasdaemon program is a daemon which monitors the platform
|
||||
Reliability, Availability and Serviceability (RAS) reports from the
|
||||
@ -58,7 +72,7 @@ autoheader
|
||||
libtoolize --automake --copy --debug --force
|
||||
automake --add-missing
|
||||
%ifarch %{arm} aarch64
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror --enable-non-standard --enable-hisi-ns-decode --enable-arm --enable-memory-failure --enable-memory-ce-pfa --enable-cpu-fault-isolation
|
||||
%else
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-devlink --enable-diskerror
|
||||
%endif
|
||||
@ -78,7 +92,6 @@ rm INSTALL %{buildroot}/usr/include/*.h
|
||||
%{_sbindir}/ras-mc-ctl
|
||||
%{_mandir}/*/*
|
||||
%{_unitdir}/*.service
|
||||
%{_sharedstatedir}/rasdaemon
|
||||
%{_sysconfdir}/ras/dimm_labels.d
|
||||
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
|
||||
|
||||
@ -86,69 +99,142 @@ rm INSTALL %{buildroot}/usr/include/*.h
|
||||
/usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || :
|
||||
|
||||
%changelog
|
||||
* Sat Jun 17 2023 yanglongkang <yanglongkang@h-partners.com> - 0.6.6-10
|
||||
* Tue Jun 20 2023 zhangnan <zhangnan134@huawei.com> - 0.6.7-13
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:ras-events:quit loop in read_ras_event when kbuf data is broken
|
||||
|
||||
* Sat Jun 17 2023 yanglongkang <yanglongkang@h-partners.com> - 0.6.7-12
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:backport libtraceevent patch to adapt to kernel ftrace ring buffer change
|
||||
|
||||
* Tue Mar 21 2023 shixuantong <shixuantong1@huawei.com> - 0.6.6-9
|
||||
* Fri Jun 2 2023 Shiju Jose<shiju.jose@huawei.com> - 0.6.7-11
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:Fix ras-mc-ctl script
|
||||
- DESC:
|
||||
1. Fix return value type issue of read/write function from unistd.h.
|
||||
2. Fix issue of signed and unsigned integer comparison.
|
||||
3. Remove redundant header file and do some cleanup.
|
||||
4. Add support for create/open the vendor error tables at rasdaemon startup.
|
||||
5. Make changes in the HiSilicon error handling code for the same.
|
||||
6. Add four modules supported by HiSilicon common section.
|
||||
|
||||
* Wed Dec 15 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-8
|
||||
* Tue Apr 4 2023 huangfangrun <huangfangrun1@h-partners.com> - 0.6.7-10
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Add initialization to some local variables when they are cleaned
|
||||
- or defined.
|
||||
- DESC:
|
||||
1.Fix for regression in ras_mc_create_table() if some cpus are offline at the system start.
|
||||
2.Fix poll() on per_cpu trace_pipe_raw blocks indefinitely.
|
||||
|
||||
* Wed Dec 1 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-7
|
||||
* Wed Mar 29 2023 Lv Ying <lvying6@huawei.com> - 0.6.7-9
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Since the cpu logical index has been counted in kernel, remove
|
||||
- related code in ras.
|
||||
- DESC:fix ras-mc-ctl.service startup failed when selinux is on
|
||||
|
||||
* Wed Oct 27 2021 luoshengwei<luoshengwei@huawei.com> - 0.6.6-6
|
||||
* Thu Mar 23 2023 renhongxun <renhongxun@h-partners.com> - 0.6.7-8
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:backport patches from upstream
|
||||
|
||||
* Thu Feb 16 2023 Lv Ying <lvying6@huawei.com> - 0.6.7-7
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:rasdaemon/diskerror: fix incomplete diskerror log
|
||||
|
||||
* Thu Oct 27 2022 Lei Feng <fenglei47@h-partners.com> - 0.6.7-6
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:
|
||||
Add the following patch to fix startup core dumped issue.
|
||||
0001-rasdaemon-use-standard-length-PATH_MAX-for-path-name.patch
|
||||
|
||||
* Mon May 23 2022 Shiju Jose<shiju.jose@huawei.com> - 0.6.7-5
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Sync three patches, add cpu online fault isolation.
|
||||
- DESC:
|
||||
Update with the latest patches for the following:
|
||||
1. CPU online fault isolation for arm event.
|
||||
2. Modify recording Hisilicon common error data in the rasdaemon
|
||||
3. In the ras-mc-ctl,
|
||||
3.1. Improve Hisilicon common error statistics.
|
||||
3.2. Add support to display the HiSilicon vendor-errors for a specified module.
|
||||
3.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options.
|
||||
3.4. Reformat error info of the HiSilicon Kunpeng920.
|
||||
3.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
|
||||
3.6. Updated the HiSilicon platform name as KunPeng9xx.
|
||||
4. Fixed a memory out-of-bounds issue in the rasdaemon.
|
||||
|
||||
* Wed Oct 20 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-5
|
||||
- Type:Bugfix
|
||||
* Mon Mar 07 2022 Shiju Jose<shiju.jose@huawei.com> - 0.6.7-4
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Backport one patch, and some little fixes and add some modules
|
||||
support for kunpeng series:
|
||||
1. Modify non-standard error decoding interface using linked list
|
||||
2. Fix the issue of sprintf data type mismatch in uuid_le()
|
||||
3. Fix the issue of command option -r for hip08
|
||||
4. Fix some print format issues for hisi common error section
|
||||
5. Add some modules supported by hisi common error section
|
||||
- DESC:
|
||||
1. Modify recording Hisilicon common error data in the rasdaemon.
|
||||
2. In the ras-mc-ctl,
|
||||
2.1. Improve Hisilicon common error statistics.
|
||||
2.2. Add support to display the HiSilicon vendor-errors for a specified module.
|
||||
2.3. Add printing usage if necessary parameters are not passed for the HiSilicon vendor-errors options.
|
||||
2.4. Reformat error info of the HiSilicon Kunpeng920.
|
||||
2.5. Relocate reading and display Kunpeng920 errors to under Kunpeng9xx.
|
||||
|
||||
* Sat July 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-4
|
||||
* Wed Mar 2 2022 tanxiaofei<tanxiaofei@huawei.com> - 0.6.7-3
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:
|
||||
1. Backport 4 patches from openEuler master branch.
|
||||
1) Fix the issue of sprintf data type mismatch in uuid_le()
|
||||
2) Fix the issue of command option -r for hip08
|
||||
3) Fix some print format issues for hisi common error section
|
||||
4) Add some modules supported by hisi common error section
|
||||
2.Enable compilation of the feature memory fault prediction based on
|
||||
corrected error.
|
||||
3.Fix changelog date error of this spec file.
|
||||
|
||||
* Wed Feb 23 2022 luoshengwei<luoshengwei@huawei.com> - 0.6.7-2
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC: Add cpu online fault isolation for arm event.
|
||||
|
||||
* Wed Dec 8 2021 xujing <xujing99@huawei.com> - 0.6.7-1
|
||||
- Update software to v0.6.7
|
||||
|
||||
* Thu Jul 29 2021 tanxiaofei<tanxiaofei@huawei.com> - 0.6.6-6
|
||||
- Type:feature
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:Add support for hisilicon common section that some IIO devices may
|
||||
- used in new firmware of Kunpeng920, and Kunpeng930 will also use it too.
|
||||
|
||||
* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-3
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:fix error print in handle_ras_events
|
||||
|
||||
* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-2
|
||||
* Sat May 15 2021 xujing<17826839720@163.com> - 0.6.6-5
|
||||
- Type:bugfix
|
||||
- ID:NA
|
||||
- SUG:NA
|
||||
- DESC:fix disk error log storm
|
||||
|
||||
* Wed Apr 28 2021 Lv Ying <lvying6@huawei.com> - 0.6.6-4
|
||||
- backport bugfix patches from community:
|
||||
1. Fix error print handle_ras_events.
|
||||
|
||||
* Wed Mar 31 2021 Lv Ying <lvying6@huawei.com> - 0.6.6-3
|
||||
- backport bugfix patches from community:
|
||||
1. ras-page-isolation: do_page_offline always considers page offline was successful
|
||||
2. ras-page-isolation: page which is PAGE_OFFLINE_FAILED can be offlined again
|
||||
|
||||
* Fri Sep 25 2020 openEuler Buildteam <buildteam@openeuler.org> - 0.6.6-2
|
||||
- Update software source URL
|
||||
|
||||
* Fri Jul 24 2020 openEuler Buildteam <buildteam@openeuler.org> - 0.6.6-1
|
||||
- Update software to v0.6.6
|
||||
|
||||
|
||||
4
rasdaemon.yaml
Normal file
4
rasdaemon.yaml
Normal file
@ -0,0 +1,4 @@
|
||||
version_control: github
|
||||
src_repo: mchehab/rasdaemon
|
||||
tag_prefix: ^v
|
||||
seperator: .
|
||||
Loading…
x
Reference in New Issue
Block a user