504 lines
11 KiB
C
504 lines
11 KiB
C
/*
|
|
* Copyright (c) 2020 Intel Corporation.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
#define DT_DRV_COMPAT intel_ibecc
|
|
|
|
#include <zephyr.h>
|
|
#include <device.h>
|
|
#include <drivers/pcie/pcie.h>
|
|
|
|
#include <drivers/edac.h>
|
|
#include "ibecc.h"
|
|
|
|
/**
|
|
* In the driver 64 bit registers are used and not all of them at the
|
|
* moment may be correctly logged.
|
|
*/
|
|
#include <logging/log.h>
|
|
LOG_MODULE_REGISTER(edac_ibecc, CONFIG_EDAC_LOG_LEVEL);
|
|
|
|
/* Devicetree node of the IBECC device, looked up by its "ibecc" node label */
#define DEVICE_NODE DT_NODELABEL(ibecc)
|
|
/* PCI host bridge used for all config space accesses in this driver (0, 0, 0) */
#define PCI_HOST_BRIDGE PCIE_BDF(0, 0, 0)
|
|
|
|
struct ibecc_data {
|
|
mem_addr_t mchbar;
|
|
edac_notify_callback_f cb;
|
|
uint32_t error_type;
|
|
|
|
/* Error count */
|
|
unsigned int errors_cor;
|
|
unsigned int errors_uc;
|
|
};
|
|
|
|
static void ibecc_write_reg64(const struct device *dev,
|
|
uint16_t reg, uint64_t value)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
mem_addr_t reg_addr = data->mchbar + reg;
|
|
|
|
sys_write64(value, reg_addr);
|
|
}
|
|
|
|
static uint64_t ibecc_read_reg64(const struct device *dev, uint16_t reg)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
mem_addr_t reg_addr = data->mchbar + reg;
|
|
|
|
return sys_read64(reg_addr);
|
|
}
|
|
|
|
#if defined(CONFIG_EDAC_ERROR_INJECT)
|
|
static void ibecc_write_reg32(const struct device *dev,
|
|
uint16_t reg, uint32_t value)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
mem_addr_t reg_addr = data->mchbar + reg;
|
|
|
|
sys_write32(value, reg_addr);
|
|
}
|
|
#endif
|
|
|
|
static uint32_t ibecc_read_reg32(const struct device *dev, uint16_t reg)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
mem_addr_t reg_addr = data->mchbar + reg;
|
|
|
|
return sys_read32(reg_addr);
|
|
}
|
|
|
|
static bool ibecc_enabled(const pcie_bdf_t bdf)
|
|
{
|
|
return !!(pcie_conf_read(bdf, CAPID0_C_REG) & CAPID0_C_IBECC_ENABLED);
|
|
}
|
|
|
|
static void ibecc_errcmd_setup(const pcie_bdf_t bdf, bool enable)
|
|
{
|
|
uint32_t errcmd;
|
|
|
|
errcmd = pcie_conf_read(bdf, ERRCMD_REG);
|
|
|
|
if (enable) {
|
|
errcmd |= (ERRCMD_IBECC_COR | ERRCMD_IBECC_UC) << 16;
|
|
} else {
|
|
errcmd &= ~(ERRCMD_IBECC_COR | ERRCMD_IBECC_UC) << 16;
|
|
}
|
|
|
|
pcie_conf_write(bdf, ERRCMD_REG, errcmd);
|
|
}
|
|
|
|
static void ibecc_errsts_clear(const pcie_bdf_t bdf)
|
|
{
|
|
uint32_t errsts;
|
|
|
|
errsts = pcie_conf_read(bdf, ERRSTS_REG);
|
|
|
|
if (!(errsts & (ERRSTS_IBECC_COR | ERRSTS_IBECC_UC))) {
|
|
return;
|
|
}
|
|
|
|
pcie_conf_write(bdf, ERRSTS_REG, errsts);
|
|
}
|
|
|
|
/**
 * Translate a DDR type code into a printable name.
 *
 * @param type DDR type code
 *
 * @return "DDR4" for 0, "LPDDR4" for 3, "Unknown" for anything else
 */
static const char *get_ddr_type(uint8_t type)
{
	if (type == 0) {
		return "DDR4";
	}

	if (type == 3) {
		return "LPDDR4";
	}

	return "Unknown";
}
|
|
|
|
/**
 * Translate a DIMM width code into a printable name.
 *
 * @param type DIMM width code
 *
 * @return "X8" for 0, "X16" for 1, "X32" for 2, "Unknown" for anything else
 */
static const char *get_dimm_width(uint8_t type)
{
	static const char *const width_names[] = { "X8", "X16", "X32" };

	if (type < sizeof(width_names) / sizeof(width_names[0])) {
		return width_names[type];
	}

	return "Unknown";
}
|
|
|
|
static void mchbar_regs_dump(const struct device *dev)
|
|
{
|
|
uint32_t mad_inter_chan, chan_hash;
|
|
|
|
/* Memory configuration */
|
|
|
|
chan_hash = ibecc_read_reg32(dev, CHANNEL_HASH);
|
|
LOG_DBG("Channel hash %x", chan_hash);
|
|
|
|
mad_inter_chan = ibecc_read_reg32(dev, MAD_INTER_CHAN);
|
|
LOG_DBG("DDR memory type %s",
|
|
get_ddr_type(INTER_CHAN_DDR_TYPE(mad_inter_chan)));
|
|
|
|
for (int ch = 0; ch < DRAM_MAX_CHANNELS; ch++) {
|
|
uint32_t intra_ch = ibecc_read_reg32(dev, MAD_INTRA_CH(ch));
|
|
uint32_t dimm_ch = ibecc_read_reg32(dev, MAD_DIMM_CH(ch));
|
|
uint64_t l_size = DIMM_L_SIZE(dimm_ch);
|
|
uint64_t s_size = DIMM_S_SIZE(dimm_ch);
|
|
uint8_t l_map = DIMM_L_MAP(intra_ch);
|
|
|
|
LOG_DBG("channel %d: l_size 0x%llx s_size 0x%llx l_map %d\n",
|
|
ch, l_size, s_size, l_map);
|
|
|
|
for (int d = 0; d < DRAM_MAX_DIMMS; d++) {
|
|
uint64_t size;
|
|
const char *type;
|
|
|
|
if (d ^ l_map) {
|
|
type = get_dimm_width(DIMM_S_WIDTH(dimm_ch));
|
|
size = s_size;
|
|
} else {
|
|
type = get_dimm_width(DIMM_L_WIDTH(dimm_ch));
|
|
size = l_size;
|
|
}
|
|
|
|
if (!size) {
|
|
continue;
|
|
}
|
|
|
|
LOG_DBG("Channel %d DIMM %d size %llu GiB width %s",
|
|
ch, d, size >> 30, type);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void parse_ecclog(const struct device *dev, const uint64_t ecclog,
|
|
struct ibecc_error *error_data)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
|
|
if (!ecclog) {
|
|
return;
|
|
}
|
|
|
|
error_data->type = ECC_ERROR_ERRTYPE(ecclog);
|
|
error_data->address = ECC_ERROR_ERRADD(ecclog);
|
|
error_data->syndrome = ECC_ERROR_ERRSYND(ecclog);
|
|
|
|
if (ecclog & ECC_ERROR_MERRSTS) {
|
|
data->errors_uc++;
|
|
}
|
|
|
|
if (ecclog & ECC_ERROR_CERRSTS) {
|
|
data->errors_cor++;
|
|
}
|
|
}
|
|
|
|
#if defined(CONFIG_EDAC_ERROR_INJECT)
|
|
static int inject_set_param1(const struct device *dev, uint64_t addr)
|
|
{
|
|
if (addr & ~INJ_ADDR_BASE_MASK) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
ibecc_write_reg64(dev, IBECC_INJ_ADDR_BASE, addr);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static uint64_t inject_get_param1(const struct device *dev)
|
|
{
|
|
return ibecc_read_reg64(dev, IBECC_INJ_ADDR_BASE);
|
|
}
|
|
|
|
static int inject_set_param2(const struct device *dev, uint64_t mask)
|
|
{
|
|
if (mask & ~INJ_ADDR_BASE_MASK_MASK) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
ibecc_write_reg64(dev, IBECC_INJ_ADDR_MASK, mask);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static uint64_t inject_get_param2(const struct device *dev)
|
|
{
|
|
return ibecc_read_reg64(dev, IBECC_INJ_ADDR_MASK);
|
|
}
|
|
|
|
static int inject_set_error_type(const struct device *dev,
|
|
uint32_t error_type)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
|
|
data->error_type = error_type;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static uint32_t inject_get_error_type(const struct device *dev)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
|
|
return data->error_type;
|
|
}
|
|
|
|
static int inject_error_trigger(const struct device *dev)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
uint32_t ctrl = 0;
|
|
|
|
switch (data->error_type) {
|
|
case EDAC_ERROR_TYPE_DRAM_COR:
|
|
ctrl |= INJ_CTRL_COR;
|
|
break;
|
|
case EDAC_ERROR_TYPE_DRAM_UC:
|
|
ctrl |= INJ_CTRL_UC;
|
|
break;
|
|
default:
|
|
/* This would clear error injection */
|
|
break;
|
|
}
|
|
|
|
ibecc_write_reg32(dev, IBECC_INJ_ADDR_CTRL, ctrl);
|
|
|
|
return 0;
|
|
}
|
|
#endif /* CONFIG_EDAC_ERROR_INJECT */
|
|
|
|
static uint64_t ecc_error_log_get(const struct device *dev)
|
|
{
|
|
return ibecc_read_reg64(dev, IBECC_ECC_ERROR_LOG);
|
|
}
|
|
|
|
static void ecc_error_log_clear(const struct device *dev)
|
|
{
|
|
/* Clear all error bits */
|
|
ibecc_write_reg64(dev, IBECC_ECC_ERROR_LOG,
|
|
ECC_ERROR_MERRSTS | ECC_ERROR_CERRSTS);
|
|
}
|
|
|
|
static uint64_t parity_error_log_get(const struct device *dev)
|
|
{
|
|
return ibecc_read_reg64(dev, IBECC_PARITY_ERROR_LOG);
|
|
}
|
|
|
|
static void parity_error_log_clear(const struct device *dev)
|
|
{
|
|
ibecc_write_reg64(dev, IBECC_PARITY_ERROR_LOG, PARITY_ERROR_ERRSTS);
|
|
}
|
|
|
|
static unsigned int errors_cor_get(const struct device *dev)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
|
|
return data->errors_cor;
|
|
}
|
|
|
|
static unsigned int errors_uc_get(const struct device *dev)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
|
|
return data->errors_uc;
|
|
}
|
|
|
|
static int notify_callback_set(const struct device *dev,
|
|
edac_notify_callback_f cb)
|
|
{
|
|
struct ibecc_data *data = dev->data;
|
|
int key = irq_lock();
|
|
|
|
data->cb = cb;
|
|
irq_unlock(key);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct edac_driver_api api = {
|
|
#if defined(CONFIG_EDAC_ERROR_INJECT)
|
|
/* Error Injection functions */
|
|
.inject_set_param1 = inject_set_param1,
|
|
.inject_get_param1 = inject_get_param1,
|
|
.inject_set_param2 = inject_set_param2,
|
|
.inject_get_param2 = inject_get_param2,
|
|
.inject_set_error_type = inject_set_error_type,
|
|
.inject_get_error_type = inject_get_error_type,
|
|
.inject_error_trigger = inject_error_trigger,
|
|
#endif /* CONFIG_EDAC_ERROR_INJECT */
|
|
|
|
/* Error reporting & clearing functions */
|
|
.ecc_error_log_get = ecc_error_log_get,
|
|
.ecc_error_log_clear = ecc_error_log_clear,
|
|
.parity_error_log_get = parity_error_log_get,
|
|
.parity_error_log_clear = parity_error_log_clear,
|
|
|
|
/* Get error stats */
|
|
.errors_cor_get = errors_cor_get,
|
|
.errors_uc_get = errors_uc_get,
|
|
|
|
/* Notification callback set */
|
|
.notify_cb_set = notify_callback_set,
|
|
};
|
|
|
|
int edac_ibecc_init(const struct device *dev)
|
|
{
|
|
const pcie_bdf_t bdf = PCI_HOST_BRIDGE;
|
|
struct ibecc_data *data = dev->data;
|
|
uint32_t tolud;
|
|
uint64_t touud, tom, mchbar;
|
|
|
|
LOG_INF("EDAC IBECC initialization");
|
|
|
|
if (!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU5)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU6)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU7)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU8)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU9)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU10)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU11)) &&
|
|
!pcie_probe(bdf, PCIE_ID(PCI_VENDOR_ID_INTEL,
|
|
PCI_DEVICE_ID_SKU12))) {
|
|
LOG_ERR("PCI Probe failed");
|
|
return -ENODEV;
|
|
}
|
|
|
|
if (!ibecc_enabled(bdf)) {
|
|
LOG_ERR("IBECC is not enabled");
|
|
return -ENODEV;
|
|
}
|
|
|
|
mchbar = pcie_conf_read(bdf, MCHBAR_REG);
|
|
mchbar |= (uint64_t)pcie_conf_read(bdf, MCHBAR_REG + 1) << 32;
|
|
|
|
/* Check that MCHBAR is enabled */
|
|
if (!(mchbar & MCHBAR_ENABLE)) {
|
|
LOG_ERR("MCHBAR is not enabled");
|
|
return -ENODEV;
|
|
}
|
|
|
|
mchbar &= MCHBAR_MASK;
|
|
|
|
/* workaround for 32 bit read */
|
|
touud = pcie_conf_read(bdf, TOUUD_REG);
|
|
touud |= (uint64_t)pcie_conf_read(bdf, TOUUD_REG + 1) << 32;
|
|
touud &= TOUUD_MASK;
|
|
|
|
/* workaround for 32 bit read */
|
|
tom = pcie_conf_read(bdf, TOM_REG);
|
|
tom |= (uint64_t)pcie_conf_read(bdf, TOM_REG + 1) << 32;
|
|
tom &= TOM_MASK;
|
|
|
|
tolud = pcie_conf_read(bdf, TOLUD_REG) & TOLUD_MASK;
|
|
|
|
device_map(&data->mchbar, mchbar, MCH_SIZE, K_MEM_CACHE_NONE);
|
|
|
|
LOG_DBG("MCHBAR\t%llx", mchbar);
|
|
LOG_DBG("TOUUD\t%llx", touud);
|
|
LOG_DBG("TOM\t%llx", tom);
|
|
LOG_DBG("TOLUD\t%x", tolud);
|
|
|
|
mchbar_regs_dump(dev);
|
|
|
|
/* Enable Host Bridge generated SERR event */
|
|
ibecc_errcmd_setup(bdf, true);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static struct ibecc_data ibecc_data;
|
|
|
|
DEVICE_DT_DEFINE(DEVICE_NODE, &edac_ibecc_init,
|
|
NULL, &ibecc_data, NULL, POST_KERNEL,
|
|
CONFIG_KERNEL_INIT_PRIORITY_DEVICE, &api);
|
|
|
|
/**
|
|
* An IBECC error causes SERR_NMI_STS set and is indicated by
|
|
* ERRSTS PCI registers by IBECC_UC and IBECC_COR fields.
|
|
* The following needs to be done:
|
|
* - Read ECC_ERR_LOG register
|
|
* - Clear IBECC_UC and IBECC_COR fields of ERRSTS PCI
|
|
* - Clear MERRSTS & CERRSTS fields of ECC_ERR_LOG register
|
|
*/
|
|
|
|
/* Spinlock held by z_x86_do_kernel_nmi() for the whole NMI handling sequence */
static struct k_spinlock nmi_lock;
|
|
|
|
/* NMI handling */
|
|
|
|
static bool handle_nmi(void)
|
|
{
|
|
uint8_t status;
|
|
|
|
status = sys_in8(NMI_STS_CNT_REG);
|
|
|
|
if (!(status & NMI_STS_SRC_SERR)) {
|
|
LOG_DBG("Skip NMI, NMI_STS_CNT: 0x%x", status);
|
|
/**
|
|
* We should be able to find that this NMI we
|
|
* should not handle and return false. However this
|
|
* does not work for some older SKUs
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
LOG_DBG("core: %d status 0x%x", arch_curr_cpu()->id, status);
|
|
|
|
/* Re-enable */
|
|
|
|
status = (status & NMI_STS_MASK_EN) | NMI_STS_SERR_EN;
|
|
sys_out8(status, NMI_STS_CNT_REG);
|
|
|
|
status &= ~NMI_STS_SERR_EN;
|
|
sys_out8(status, NMI_STS_CNT_REG);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool z_x86_do_kernel_nmi(const z_arch_esf_t *esf)
|
|
{
|
|
const struct device *dev = DEVICE_DT_GET(DEVICE_NODE);
|
|
struct ibecc_data *data = dev->data;
|
|
struct ibecc_error error_data;
|
|
k_spinlock_key_t key;
|
|
bool ret = true;
|
|
uint64_t ecclog;
|
|
|
|
key = k_spin_lock(&nmi_lock);
|
|
|
|
if (!handle_nmi()) {
|
|
/* Indicate that we do not handle this NMI */
|
|
ret = false;
|
|
goto out;
|
|
}
|
|
|
|
/* Skip the same NMI handling for other cores and return handled */
|
|
if (arch_curr_cpu()->id) {
|
|
ret = true;
|
|
goto out;
|
|
}
|
|
|
|
ecclog = edac_ecc_error_log_get(dev);
|
|
parse_ecclog(dev, ecclog, &error_data);
|
|
|
|
if (data->cb) {
|
|
data->cb(dev, &error_data);
|
|
}
|
|
|
|
edac_ecc_error_log_clear(dev);
|
|
|
|
ibecc_errsts_clear(PCI_HOST_BRIDGE);
|
|
|
|
out:
|
|
k_spin_unlock(&nmi_lock, key);
|
|
|
|
return ret;
|
|
}
|