diff --git a/include/lib/extensions/ras.h b/include/lib/extensions/ras.h index 4fc8f04..793ab9f 100644 --- a/include/lib/extensions/ras.h +++ b/include/lib/extensions/ras.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -192,6 +193,7 @@ probe_data); } +const char *ras_serr_to_str(unsigned int serr); int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, void *handle, uint64_t flags); void ras_init(void); diff --git a/include/lib/extensions/ras_arch.h b/include/lib/extensions/ras_arch.h index 0c98c4a..55760b0 100644 --- a/include/lib/extensions/ras_arch.h +++ b/include/lib/extensions/ras_arch.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -151,6 +152,9 @@ #define ERROR_STATUS_SET_UC 0x2 /* Uncontainable */ #define ERROR_STATUS_SET_CE 0x3 /* Corrected */ +/* Number of architecturally-defined primary error codes */ +#define ERROR_STATUS_NUM_SERR U(22) + /* Implementation Defined Syndrome bit in ESR */ #define SERROR_IDS_BIT U(24) diff --git a/lib/extensions/ras/ras_common.c b/lib/extensions/ras/ras_common.c index 64a4852..36f9a95 100644 --- a/lib/extensions/ras/ras_common.c +++ b/lib/extensions/ras/ras_common.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018-2019, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -18,6 +19,47 @@ # error Platform must define RAS priority value #endif +/* + * Function to convert architecturally-defined primary error code SERR, + * bits[7:0] from ERRSTATUS to its corresponding error string. + */ +const char *ras_serr_to_str(unsigned int serr) +{ + const char *str[ERROR_STATUS_NUM_SERR] = { + "No error", + "IMPLEMENTATION DEFINED error", + "Data value from (non-associative) internal memory", + "IMPLEMENTATION DEFINED pin", + "Assertion failure", + "Error detected on internal data path", + "Data value from associative memory", + "Address/control value from associative memory", + "Data value from a TLB", + "Address/control value from a TLB", + "Data value from producer", + "Address/control value from producer", + "Data value from (non-associative) external memory", + "Illegal address (software fault)", + "Illegal access (software fault)", + "Illegal state (software fault)", + "Internal data register", + "Internal control register", + "Error response from slave", + "External timeout", + "Internal timeout", + "Deferred error from slave not supported at master" + }; + + /* + * All other values are reserved. Reserved values might be defined + * in a future version of the architecture + */ + if (serr >= ERROR_STATUS_NUM_SERR) + return "unknown SERR"; + + return str[serr]; +} + /* Handler that receives External Aborts on RAS-capable systems */ int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, void *handle, uint64_t flags) diff --git a/plat/nvidia/tegra/include/platform_def.h b/plat/nvidia/tegra/include/platform_def.h index 678b15c..2331869 100644 --- a/plat/nvidia/tegra/include/platform_def.h +++ b/plat/nvidia/tegra/include/platform_def.h @@ -95,6 +95,7 @@ * Platform macros to support exception handling framework ******************************************************************************/ #define PLAT_PRI_BITS U(3) +#define PLAT_RAS_PRI U(0x10) #define PLAT_SDEI_CRITICAL_PRI U(0x20) #define PLAT_SDEI_NORMAL_PRI U(0x30) #define PLAT_TEGRA_WDT_PRIO U(0x40) diff --git a/plat/nvidia/tegra/include/t194/tegra194_ras_private.h b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h new file mode 100644 index 0000000..c867b9d --- /dev/null +++ b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef TEGRA194_RAS_PRIVATE +#define TEGRA194_RAS_PRIVATE + +#include + +/* Implementation defined RAS error and corresponding error message */ +struct ras_error { + const char *error_msg; + /* IERR(bits[15:8]) from ERRSTATUS */ + uint8_t error_code; +}; + +/* RAS error node-specific auxiliary data */ +struct ras_aux_data { + /* point to null-terminated ras_error array to convert error code to msg. */ + const struct ras_error *error_records; + /* + * function to return an value which needs to be programmed into ERXCTLR_EL1 + * to enable all specified RAS errors for current node. + */ + uint64_t (*err_ctrl)(void); +}; + +/* IFU Uncorrectable RAS ERROR */ +#define IFU_UNCORR_RAS_ERROR_LIST(X) + +/* JSR_RET Uncorrectable RAS ERROR */ +#define JSR_RET_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(JSR_RET, 35, 0x13, "Floating Point Register File Parity Error") \ + X(JSR_RET, 34, 0x12, "Integer Register File Parity Error") \ + X(JSR_RET, 33, 0x11, "Garbage Bundle") \ + X(JSR_RET, 32, 0x10, "Bundle Completion Timeout") + +/* JSR_MTS Uncorrectable RAS ERROR */ +#define JSR_MTS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(JSR_MTS, 40, 0x28, "CoreSight Access Error") \ + X(JSR_MTS, 39, 0x27, "Dual Execution Uncorrectable Error") \ + X(JSR_MTS, 37, 0x25, "CTU MMIO Region") \ + X(JSR_MTS, 36, 0x24, "MTS MMCRAB Region Access") \ + X(JSR_MTS, 35, 0x23, "MTS_CARVEOUT Access from ARM SW") \ + X(JSR_MTS, 34, 0x22, "NAFLL PLL Failure to Lock") \ + X(JSR_MTS, 32, 0x20, "Internal Uncorrectable MTS Error") + +/* LSD_STQ Uncorrectable RAS ERROR */ +#define LSD_STQ_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(LSD_STQ, 41, 0x39, "Coherent Cache Data Store Multi-Line ECC Error") \ + X(LSD_STQ, 40, 0x38, "Coherent Cache Data Store Uncorrectable ECC Error") \ + X(LSD_STQ, 38, 0x36, "Coherent Cache Data Load Uncorrectable ECC Error") \ + X(LSD_STQ, 33, 0x31, "Coherent Cache Tag Store Parity Error") \ + X(LSD_STQ, 32, 0x30, "Coherent Cache Tag Load Parity Error") + +/* LSD_DCC Uncorrectable RAS ERROR */ +#define LSD_DCC_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(LSD_DCC, 41, 0x49, "BTU Copy Mini-Cache PPN Multi-Hit Error") \ + X(LSD_DCC, 39, 0x47, "Coherent Cache Data Uncorrectable ECC Error") \ + X(LSD_DCC, 37, 0x45, "Version Cache Byte-Enable Parity Error") \ + X(LSD_DCC, 36, 0x44, "Version Cache Data Uncorrectable ECC Error") \ + X(LSD_DCC, 33, 0x41, "BTU Copy Coherent Cache PPN Parity Error") \ + X(LSD_DCC, 32, 0x40, "BTU Copy Coherent Cache VPN Parity Error") + +/* LSD_L1HPF Uncorrectable RAS ERROR */ +#define LSD_L1HPF_UNCORR_RAS_ERROR_LIST(X) + +/* L2 Uncorrectable RAS ERROR */ +#define L2_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(L2, 56, 0x68, "URT Timeout") \ + X(L2, 55, 0x67, "L2 Protocol Violation") \ + X(L2, 54, 0x66, "SCF to L2 Slave Error Read") \ + X(L2, 53, 0x65, "SCF to L2 Slave Error Write") \ + X(L2, 52, 0x64, "SCF to L2 Decode Error Read") \ + X(L2, 51, 0x63, "SCF to L2 Decode Error Write") \ + X(L2, 50, 0x62, "SCF to L2 Request Response Interface Parity Errors") \ + X(L2, 49, 0x61, "SCF to L2 Advance notice interface parity errors") \ + X(L2, 48, 0x60, "SCF to L2 Filldata Parity Errors") \ + X(L2, 47, 0x5F, "SCF to L2 UnCorrectable ECC Data Error on interface") \ + X(L2, 45, 0x5D, "Core 1 to L2 Parity Error") \ + X(L2, 44, 0x5C, "Core 0 to L2 Parity Error") \ + X(L2, 43, 0x5B, "L2 Multi-Hit") \ + X(L2, 42, 0x5A, "L2 URT Tag Parity Error") \ + X(L2, 41, 0x59, "L2 NTT Tag Parity Error") \ + X(L2, 40, 0x58, "L2 MLT Tag Parity Error") \ + X(L2, 39, 0x57, "L2 URD Data") \ + X(L2, 38, 0x56, "L2 NTP Data") \ + X(L2, 36, 0x54, "L2 MLC Uncorrectable Clean") \ + X(L2, 35, 0x53, "L2 URD Uncorrectable Dirty") \ + X(L2, 34, 0x52, "L2 MLC Uncorrectable Dirty") + +/* CLUSTER_CLOCKS Uncorrectable RAS ERROR */ +#define CLUSTER_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CLUSTER_CLOCKS, 32, 0xE4, "Frequency Monitor Error") + +/* MMU Uncorrectable RAS ERROR */ +#define MMU_UNCORR_RAS_ERROR_LIST(X) + +/* L3 Uncorrectable RAS ERROR */ +#define L3_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(L3, 43, 0x7B, "SNOC Interface Parity Error") \ + X(L3, 42, 0x7A, "MCF Interface Parity Error") \ + X(L3, 41, 0x79, "L3 Tag Parity Error") \ + X(L3, 40, 0x78, "L3 Dir Parity Error") \ + X(L3, 39, 0x77, "L3 Uncorrectable ECC Error") \ + X(L3, 37, 0x75, "Multi-Hit CAM Error") \ + X(L3, 36, 0x74, "Multi-Hit Tag Error") \ + X(L3, 35, 0x73, "Unrecognized Command Error") \ + X(L3, 34, 0x72, "L3 Protocol Error") + +/* CCPMU Uncorrectable RAS ERROR */ +#define CCPMU_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CCPMU, 40, 0x87, "CoreSight Access Error") \ + X(CCPMU, 36, 0x84, "MCE Ucode Error") \ + X(CCPMU, 35, 0x83, "MCE IL1 Parity Error") \ + X(CCPMU, 34, 0x82, "MCE Timeout Error") \ + X(CCPMU, 33, 0x81, "CRAB Access Error") \ + X(CCPMU, 32, 0x80, "MCE Memory Access Error") + +/* SCF_IOB Uncorrectable RAS ERROR */ +#define SCF_IOB_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_IOB, 41, 0x99, "Request parity error") \ + X(SCF_IOB, 40, 0x98, "Putdata parity error") \ + X(SCF_IOB, 39, 0x97, "Uncorrectable ECC on Putdata") \ + X(SCF_IOB, 38, 0x96, "CBB Interface Error") \ + X(SCF_IOB, 37, 0x95, "MMCRAB Error") \ + X(SCF_IOB, 36, 0x94, "IHI Interface Error") \ + X(SCF_IOB, 35, 0x93, "CRI Error") \ + X(SCF_IOB, 34, 0x92, "TBX Interface Error") \ + X(SCF_IOB, 33, 0x91, "EVP Interface Error") + +/* SCF_SNOC Uncorrectable RAS ERROR */ +#define SCF_SNOC_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_SNOC, 42, 0xAA, "Misc Client Parity Error") \ + X(SCF_SNOC, 41, 0xA9, "Misc Filldata Parity Error") \ + X(SCF_SNOC, 40, 0xA8, "Uncorrectable ECC Misc Client") \ + X(SCF_SNOC, 39, 0xA7, "DVMU Interface Parity Error") \ + X(SCF_SNOC, 38, 0xA6, "DVMU Interface Timeout Error") \ + X(SCF_SNOC, 37, 0xA5, "CPE Request Error") \ + X(SCF_SNOC, 36, 0xA4, "CPE Response Error") \ + X(SCF_SNOC, 35, 0xA3, "CPE Timeout Error") \ + X(SCF_SNOC, 34, 0xA2, "Uncorrectable Carveout Error") + +/* SCF_CTU Uncorrectable RAS ERROR */ +#define SCF_CTU_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_CTU, 39, 0xB7, "Timeout error for TRC_DMA request") \ + X(SCF_CTU, 38, 0xB6, "Timeout error for CTU Snp") \ + X(SCF_CTU, 37, 0xB5, "Parity error in CTU TAG RAM") \ + X(SCF_CTU, 36, 0xB3, "Parity error in CTU DATA RAM") \ + X(SCF_CTU, 35, 0xB4, "Parity error for Cluster Rsp") \ + X(SCF_CTU, 34, 0xB2, "Parity error for TRL requests from 9 agents") \ + X(SCF_CTU, 33, 0xB1, "Parity error for MCF request") \ + X(SCF_CTU, 32, 0xB0, "TRC DMA fillsnoop parity error") + +/* CMU_CLOCKS Uncorrectable RAS ERROR */ +#define CMU_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CMU_CLOCKS, 39, 0xC7, "Cluster 3 frequency monitor error") \ + X(CMU_CLOCKS, 38, 0xC6, "Cluster 2 frequency monitor error") \ + X(CMU_CLOCKS, 37, 0xC5, "Cluster 1 frequency monitor error") \ + X(CMU_CLOCKS, 36, 0xC3, "Cluster 0 frequency monitor error") \ + X(CMU_CLOCKS, 35, 0xC4, "Voltage error on ADC1 Monitored Logic") \ + X(CMU_CLOCKS, 34, 0xC2, "Voltage error on ADC0 Monitored Logic") \ + X(CMU_CLOCKS, 33, 0xC1, "Lookup Table 1 Parity Error") \ + X(CMU_CLOCKS, 32, 0xC0, "Lookup Table 0 Parity Error") + +/* + * Define one ras_error entry. + * + * This macro wille be used to to generate ras_error records for each node + * defined by _UNCORR_RAS_ERROR_LIST macro. + */ +#define DEFINE_ONE_RAS_ERROR_MSG(unit, ras_bit, ierr, msg) \ + { \ + .error_msg = (msg), \ + .error_code = (ierr) \ + }, + +/* + * Set one implementation defined bit in ERRCTLR + * + * This macro will be used to collect all defined ERR_CTRL bits for each node + * defined by _UNCORR_RAS_ERROR_LIST macro. + */ +#define DEFINE_ENABLE_RAS_BIT(unit, ras_bit, ierr, msg) \ + do { \ + val |= (1ULL << ras_bit##U); \ + } while (0); + +/* Represent one RAS node with 0 or more error bits (ERR_CTLR) enabled */ +#define DEFINE_ONE_RAS_NODE(node) \ +static const struct ras_error node##_uncorr_ras_errors[] = { \ + node##_UNCORR_RAS_ERROR_LIST(DEFINE_ONE_RAS_ERROR_MSG) \ + { \ + NULL, \ + 0U \ + }, \ +}; \ +static inline uint64_t node##_err_ctrl(void) \ +{ \ + uint64_t val = 0ULL; \ + node##_UNCORR_RAS_ERROR_LIST(DEFINE_ENABLE_RAS_BIT) \ + return val; \ +} + +#define DEFINE_ONE_RAS_AUX_DATA(node) \ + { \ + .error_records = node##_uncorr_ras_errors, \ + .err_ctrl = &node##_err_ctrl \ + }, + +#define PER_CORE_RAS_NODE_LIST(X) \ + X(IFU) \ + X(JSR_RET) \ + X(JSR_MTS) \ + X(LSD_STQ) \ + X(LSD_DCC) \ + X(LSD_L1HPF) + +#define PER_CORE_RAS_GROUP_NODES PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define PER_CLUSTER_RAS_NODE_LIST(X) \ + X(L2) \ + X(CLUSTER_CLOCKS) \ + X(MMU) + +#define PER_CLUSTER_RAS_GROUP_NODES PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define SCF_L3_BANK_RAS_NODE_LIST(X) X(L3) + +/* we have 4 SCF_L3 nodes:3*256 + L3_Bank_ID(0-3) */ +#define SCF_L3_BANK_RAS_GROUP_NODES \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define CCPLEX_RAS_NODE_LIST(X) \ + X(CCPMU) \ + X(SCF_IOB) \ + X(SCF_SNOC) \ + X(SCF_CTU) \ + X(CMU_CLOCKS) + +#define CCPLEX_RAS_GROUP_NODES CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#endif /* TEGRA194_RAS_PRIVATE */ diff --git a/plat/nvidia/tegra/include/tegra_private.h b/plat/nvidia/tegra/include/tegra_private.h index f72c9cf..a6d8e68 100644 --- a/plat/nvidia/tegra/include/tegra_private.h +++ b/plat/nvidia/tegra/include/tegra_private.h @@ -89,7 +89,7 @@ /* Declarations for tegra_fiq_glue.c */ void tegra_fiq_handler_setup(void); -int tegra_fiq_get_intr_context(void); +int32_t tegra_fiq_get_intr_context(void); void tegra_fiq_set_ns_entrypoint(uint64_t entrypoint); /* Declarations for tegra_security.c */ @@ -157,4 +157,8 @@ void *handle, uint64_t flags); +#if RAS_EXTENSION +void tegra194_ras_enable(void); +#endif + #endif /* TEGRA_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/plat_ras.c b/plat/nvidia/tegra/soc/t194/plat_ras.c new file mode 100644 index 0000000..f9ebb37 --- /dev/null +++ b/plat/nvidia/tegra/soc/t194/plat_ras.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * ERRFR bits[63:32], it indicates supported RAS errors which can be enabled + * by setting corresponding bits in ERRCTLR + */ +#define ERR_FR_EN_BITS_MASK 0xFFFFFFFF00000000ULL + +/* bakery lock for platform RAS handler. */ +static DEFINE_BAKERY_LOCK(ras_handler_lock); +#define ras_lock() bakery_lock_get(&ras_handler_lock) +#define ras_unlock() bakery_lock_release(&ras_handler_lock) + +/* + * Function to handle an External Abort received at EL3. + * This function is invoked by RAS framework. + */ +static void tegra194_ea_handler(unsigned int ea_reason, uint64_t syndrome, + void *cookie, void *handle, uint64_t flags) +{ + int32_t ret; + + ras_lock(); + + ERROR("exception reason=%u syndrome=0x%llx on 0x%lx at EL3.\n", + ea_reason, syndrome, read_mpidr_el1()); + + /* Call RAS EA handler */ + ret = ras_ea_handler(ea_reason, syndrome, cookie, handle, flags); + if (ret != 0) { + ERROR("RAS error handled!\n"); + ret = sdei_dispatch_event(TEGRA_SDEI_EP_EVENT_0 + + plat_my_core_pos()); + if (ret != 0) + ERROR("sdei_dispatch_event returned %d\n", ret); + } else { + ERROR("Not a RAS error!\n"); + } + + ras_unlock(); +} + +/* Function to enable uncorrectable errors as External abort (SError) */ +void tegra194_ras_enable(void) +{ + VERBOSE("%s\n", __func__); + + /* skip RAS enablement if not a silicon platform. */ + if (!tegra_platform_is_silicon()) { + return; + } + + /* + * Iterate for each group(num_idx ERRSELRs starting from idx_start) + * use normal for loop instead of for_each_err_record_info to get rid + * of MISRA noise.. + */ + for (uint32_t i = 0U; i < err_record_mappings.num_err_records; i++) { + + const struct err_record_info *info = &err_record_mappings.err_records[i]; + + uint32_t idx_start = info->sysreg.idx_start; + uint32_t num_idx = info->sysreg.num_idx; + const struct ras_aux_data *aux_data = (const struct ras_aux_data *)info->aux_data; + + assert(aux_data != NULL); + + for (uint32_t j = 0; j < num_idx; j++) { + uint64_t err_ctrl = 0ULL; + + /* enable SError reporting for uncorrectable error */ + ERR_CTLR_ENABLE_FIELD(err_ctrl, UE); + ERR_CTLR_ENABLE_FIELD(err_ctrl, ED); + + /* + * Catch error if something wrong with the RAS aux data + * record table. + */ + assert(aux_data[j].err_ctrl != NULL); + + /* enable the specified errors */ + err_ctrl |= aux_data[j].err_ctrl(); + + /* Write to ERRSELR_EL1 to select the error record */ + ser_sys_select_record(idx_start + j); + + /* enable specified errors */ + write_erxctlr_el1(err_ctrl); + + /* + * Check if all the bit settings have been enabled to detect + * uncorrected/corrected errors, if not assert. + */ + assert(read_erxctlr_el1() == err_ctrl); + } + } +} + +/* Function to probe an error from error record group. */ +static int32_t tegra194_ras_record_probe(const struct err_record_info *info, + int *probe_data) +{ + /* Skip probing if not a silicon platform */ + if (!tegra_platform_is_silicon()) { + return 0; + } + + return ser_probe_sysreg(info->sysreg.idx_start, info->sysreg.num_idx, probe_data); +} + +/* Function to handle error from one given node */ +static int32_t tegra194_ras_node_handler(const struct ras_error *errors, uint64_t status) +{ + bool found = false; + uint32_t ierr = (uint32_t)ERR_STATUS_GET_FIELD(status, IERR); + uint32_t serr = (uint32_t)ERR_STATUS_GET_FIELD(status, SERR); + + /* IERR to error message */ + for (uint32_t i = 0; errors[i].error_msg != NULL; i++) { + if (ierr == errors[i].error_code) { + ERROR("IERR = %s(0x%x)\n", + errors[i].error_msg, errors[i].error_code); + found = true; + break; + } + } + if (!found) { + ERROR("unknown IERR: 0x%x\n", ierr); + } + + ERROR("SERR = %s(0x%x)\n", ras_serr_to_str(serr), serr); + + /* Write to clear reported errors. */ + write_erxstatus_el1(status); + + return 0; +} + +/* Function to handle one error node from an error record group. */ +static int32_t tegra194_ras_record_handler(const struct err_record_info *info, + int probe_data, const struct err_handler_data *const data) +{ + uint32_t num_idx = info->sysreg.num_idx; + uint32_t idx_start = info->sysreg.idx_start; + const struct ras_aux_data *aux_data = info->aux_data; + + uint64_t status = 0ULL; + + VERBOSE("%s\n", __func__); + + assert(probe_data >= 0); + assert((uint32_t)probe_data < num_idx); + + uint32_t offset = (uint32_t)probe_data; + const struct ras_error *errors = aux_data[offset].error_records; + + assert(errors != NULL); + + /* Write to ERRSELR_EL1 to select the error record */ + ser_sys_select_record(idx_start + offset); + + /* Retrieve status register from the error record */ + status = read_erxstatus_el1(); + + assert(ERR_STATUS_GET_FIELD(status, V) != 0U); + assert(ERR_STATUS_GET_FIELD(status, UE) != 0U); + + return tegra194_ras_node_handler(errors, status); +} + + +/* Instantiate RAS nodes */ +PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) + +/* Instantiate RAS node groups */ +static struct ras_aux_data per_core_ras_group[] = { + PER_CORE_RAS_GROUP_NODES +}; + +static struct ras_aux_data per_cluster_ras_group[] = { + PER_CLUSTER_RAS_GROUP_NODES +}; + +static struct ras_aux_data scf_l3_ras_group[] = { + SCF_L3_BANK_RAS_GROUP_NODES +}; + +static struct ras_aux_data ccplex_ras_group[] = { + CCPLEX_RAS_GROUP_NODES +}; + +/* + * We have same probe and handler for each error record group, use a macro to + * simply the record definition. + */ +#define ADD_ONE_ERR_GROUP(errselr_start, group) \ + ERR_RECORD_SYSREG_V1((errselr_start), (uint32_t)ARRAY_SIZE((group)), \ + &tegra194_ras_record_probe, \ + &tegra194_ras_record_handler, (group)) + +/* RAS error record group information */ +static struct err_record_info carmel_ras_records[] = { + /* + * Per core ras error records + * ERRSELR starts from 0*256 + Logical_CPU_ID*16 + 0 to + * 0*256 + Logical_CPU_ID*16 + 5 for each group. + * 8 cores/groups, 6 * 8 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x000, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x010, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x020, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x030, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x040, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x050, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x060, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x070, per_core_ras_group), + + /* + * Per cluster ras error records + * ERRSELR starts from 2*256 + Logical_Cluster_ID*16 + 0 to + * 2*256 + Logical_Cluster_ID*16 + 3. + * 4 clusters/groups, 3 * 4 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x200, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x210, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x220, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x230, per_cluster_ras_group), + + /* + * SCF L3_Bank ras error records + * ERRSELR: 3*256 + L3_Bank_ID, L3_Bank_ID: 0-3 + * 1 groups, 4 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x300, scf_l3_ras_group), + + /* + * CCPLEX ras error records + * ERRSELR: 4*256 + Unit_ID, Unit_ID: 0 - 4 + * 1 groups, 5 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x400, ccplex_ras_group), +}; + +REGISTER_ERR_RECORD_INFO(carmel_ras_records); + +/* dummy RAS interrupt */ +static struct ras_interrupt carmel_ras_interrupts[] = {}; +REGISTER_RAS_INTERRUPTS(carmel_ras_interrupts); + +/******************************************************************************* + * RAS handler for the platform + ******************************************************************************/ +void plat_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, + void *handle, uint64_t flags) +{ +#if RAS_EXTENSION + tegra194_ea_handler(ea_reason, syndrome, cookie, handle, flags); +#else + ERROR("Unhandled External Abort received on 0x%llx at EL3!\n", + read_mpidr_el1()); + ERROR(" exception reason=%u syndrome=0x%lx\n", ea_reason, syndrome); + panic(); +#endif +} diff --git a/plat/nvidia/tegra/soc/t194/plat_setup.c b/plat/nvidia/tegra/soc/t194/plat_setup.c index 5d6c60b..399aebb 100644 --- a/plat/nvidia/tegra/soc/t194/plat_setup.c +++ b/plat/nvidia/tegra/soc/t194/plat_setup.c @@ -208,6 +208,11 @@ /* sanity check MCE firmware compatibility */ mce_verify_firmware_version(); +#if RAS_EXTENSION + /* Enable Uncorrectable RAS error */ + tegra194_ras_enable(); +#endif + /* * Program XUSB STREAMIDs * ====================== diff --git a/plat/nvidia/tegra/soc/t194/platform_t194.mk b/plat/nvidia/tegra/soc/t194/platform_t194.mk index c02128c..d7d15f5 100644 --- a/plat/nvidia/tegra/soc/t194/platform_t194.mk +++ b/plat/nvidia/tegra/soc/t194/platform_t194.mk @@ -30,6 +30,10 @@ MAX_MMAP_REGIONS := 30 $(eval $(call add_define,MAX_MMAP_REGIONS)) +# enable RAS handling +HANDLE_EA_EL3_FIRST := 1 +RAS_EXTENSION := 1 + # platform files PLAT_INCLUDES += -Iplat/nvidia/tegra/include/t194 \ -I${SOC_DIR}/drivers/include @@ -56,3 +60,10 @@ ifeq (${ENABLE_CONSOLE_SPE},1) BL31_SOURCES += ${COMMON_DIR}/drivers/spe/shared_console.S endif + +# RAS sources +ifeq (${RAS_EXTENSION},1) +BL31_SOURCES += lib/extensions/ras/std_err_record.c \ + lib/extensions/ras/ras_common.c \ + ${SOC_DIR}/plat_ras.c +endif