diff --git a/Makefile b/Makefile index a0d2ae0..bc5604b 100644 --- a/Makefile +++ b/Makefile @@ -900,6 +900,7 @@ $(eval $(call assert_boolean,ENCRYPT_BL31)) $(eval $(call assert_boolean,ENCRYPT_BL32)) $(eval $(call assert_boolean,ERRATA_SPECULATIVE_AT)) +$(eval $(call assert_boolean,RAS_TRAP_LOWER_EL_ERR_ACCESS)) $(eval $(call assert_numeric,ARM_ARCH_MAJOR)) $(eval $(call assert_numeric,ARM_ARCH_MINOR)) @@ -979,6 +980,7 @@ $(eval $(call add_define,BL2_INV_DCACHE)) $(eval $(call add_define,USE_SPINLOCK_CAS)) $(eval $(call add_define,ERRATA_SPECULATIVE_AT)) +$(eval $(call add_define,RAS_TRAP_LOWER_EL_ERR_ACCESS)) ifeq (${SANITIZE_UB},trap) $(eval $(call add_define,MONITOR_TRAPS)) diff --git a/docs/components/ras.rst b/docs/components/ras.rst index 3d81f17..86529d7 100644 --- a/docs/components/ras.rst +++ b/docs/components/ras.rst @@ -32,7 +32,8 @@ The build option ``RAS_EXTENSION`` when set to ``1`` includes the RAS in run time firmware; ``EL3_EXCEPTION_HANDLING`` and ``HANDLE_EA_EL3_FIRST`` must also -be set ``1``. +be set ``1``. ``RAS_TRAP_LOWER_EL_ERR_ACCESS`` controls the access to the RAS +error record registers from lower ELs. .. _ras-figure: diff --git a/docs/getting_started/build-options.rst b/docs/getting_started/build-options.rst index 920f934..f207886 100644 --- a/docs/getting_started/build-options.rst +++ b/docs/getting_started/build-options.rst @@ -707,6 +707,10 @@ | 1530924 | Cortex-A53 | +---------+--------------+ +- ``RAS_TRAP_LOWER_EL_ERR_ACCESS``: This flag enables/disables the SCR_EL3.TERR + bit, to trap access to the RAS ERR and RAS ERX registers from lower ELs. + This flag is disabled by default. 
+ GICv3 driver options -------------------- diff --git a/include/arch/aarch64/arch.h b/include/arch/aarch64/arch.h index 10fe926..90569c3 100644 --- a/include/arch/aarch64/arch.h +++ b/include/arch/aarch64/arch.h @@ -342,6 +342,7 @@ #define SCR_EEL2_BIT (U(1) << 18) #define SCR_API_BIT (U(1) << 17) #define SCR_APK_BIT (U(1) << 16) +#define SCR_TERR_BIT (U(1) << 15) #define SCR_TWE_BIT (U(1) << 13) #define SCR_TWI_BIT (U(1) << 12) #define SCR_ST_BIT (U(1) << 11) diff --git a/include/lib/extensions/ras.h b/include/lib/extensions/ras.h index 4fc8f04..793ab9f 100644 --- a/include/lib/extensions/ras.h +++ b/include/lib/extensions/ras.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -192,6 +193,7 @@ probe_data); } +const char *ras_serr_to_str(unsigned int serr); int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, void *handle, uint64_t flags); void ras_init(void); diff --git a/include/lib/extensions/ras_arch.h b/include/lib/extensions/ras_arch.h index 0c98c4a..55760b0 100644 --- a/include/lib/extensions/ras_arch.h +++ b/include/lib/extensions/ras_arch.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. 
* * SPDX-License-Identifier: BSD-3-Clause */ @@ -151,6 +152,9 @@ #define ERROR_STATUS_SET_UC 0x2 /* Uncontainable */ #define ERROR_STATUS_SET_CE 0x3 /* Corrected */ +/* Number of architecturally-defined primary error codes */ +#define ERROR_STATUS_NUM_SERR U(22) + /* Implementation Defined Syndrome bit in ESR */ #define SERROR_IDS_BIT U(24) diff --git a/lib/el3_runtime/aarch64/context_mgmt.c b/lib/el3_runtime/aarch64/context_mgmt.c index 53b4ea3..f4a34bf 100644 --- a/lib/el3_runtime/aarch64/context_mgmt.c +++ b/lib/el3_runtime/aarch64/context_mgmt.c @@ -108,6 +108,14 @@ if (EP_GET_ST(ep->h.attr) != 0U) scr_el3 |= SCR_ST_BIT; +#if RAS_TRAP_LOWER_EL_ERR_ACCESS + /* + * SCR_EL3.TERR: Trap Error record accesses. Accesses to the RAS ERR + * and RAS ERX registers from EL1 and EL2 are trapped to EL3. + */ + scr_el3 |= SCR_TERR_BIT; +#endif + #if !HANDLE_EA_EL3_FIRST /* * SCR_EL3.EA: Do not route External Abort and SError Interrupt External diff --git a/lib/extensions/ras/ras_common.c b/lib/extensions/ras/ras_common.c index 64a4852..36f9a95 100644 --- a/lib/extensions/ras/ras_common.c +++ b/lib/extensions/ras/ras_common.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018-2019, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -18,6 +19,47 @@ # error Platform must define RAS priority value #endif +/* + * Function to convert architecturally-defined primary error code SERR, + * bits[7:0] from ERRSTATUS to its corresponding error string. 
+ */
+const char *ras_serr_to_str(unsigned int serr)
+{
+	static const char *const str[ERROR_STATUS_NUM_SERR] = { /* indexed by SERR */
+		"No error",
+		"IMPLEMENTATION DEFINED error",
+		"Data value from (non-associative) internal memory",
+		"IMPLEMENTATION DEFINED pin",
+		"Assertion failure",
+		"Error detected on internal data path",
+		"Data value from associative memory",
+		"Address/control value from associative memory",
+		"Data value from a TLB",
+		"Address/control value from a TLB",
+		"Data value from producer",
+		"Address/control value from producer",
+		"Data value from (non-associative) external memory",
+		"Illegal address (software fault)",
+		"Illegal access (software fault)",
+		"Illegal state (software fault)",
+		"Internal data register",
+		"Internal control register",
+		"Error response from slave",
+		"External timeout",
+		"Internal timeout",
+		"Deferred error from slave not supported at master"
+	};
+
+	/*
+	 * Codes at or beyond ERROR_STATUS_NUM_SERR are reserved (a future
+	 * architecture version may define them): never index past the table.
+	 */
+	if (serr >= ERROR_STATUS_NUM_SERR)
+		return "unknown SERR";
+
+	return str[serr];
+}
+
 /* Handler that receives External Aborts on RAS-capable systems */
 int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie,
 		void *handle, uint64_t flags)
diff --git a/make_helpers/defaults.mk b/make_helpers/defaults.mk
index 585f06f..6db228f 100644
--- a/make_helpers/defaults.mk
+++ b/make_helpers/defaults.mk
@@ -302,3 +302,6 @@
 
 # Select workaround for AT speculative behaviour.
 ERRATA_SPECULATIVE_AT := 0
+
+# Trap RAS error record access from lower EL
+RAS_TRAP_LOWER_EL_ERR_ACCESS := 0
diff --git a/plat/nvidia/tegra/include/platform_def.h b/plat/nvidia/tegra/include/platform_def.h
index 678b15c..2331869 100644
--- a/plat/nvidia/tegra/include/platform_def.h
+++ b/plat/nvidia/tegra/include/platform_def.h
@@ -95,6 +95,7 @@
  * Platform macros to support exception handling framework
  ******************************************************************************/
 #define PLAT_PRI_BITS U(3)
+#define PLAT_RAS_PRI U(0x10)
 #define PLAT_SDEI_CRITICAL_PRI U(0x20)
 #define PLAT_SDEI_NORMAL_PRI U(0x30)
 #define PLAT_TEGRA_WDT_PRIO U(0x40)
diff --git a/plat/nvidia/tegra/include/t194/tegra194_ras_private.h b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h
new file mode 100644
index 0000000..336461a
--- /dev/null
+++ b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef TEGRA194_RAS_PRIVATE
+#define TEGRA194_RAS_PRIVATE
+
+#include
+
+/* Implementation defined RAS error and corresponding error message */
+struct ras_error {
+	const char *error_msg;
+	/* IERR(bits[15:8]) from ERRSTATUS */
+	uint8_t error_code;
+};
+
+/* RAS error node-specific auxiliary data */
+struct ras_aux_data {
+	/* name for current RAS node. */
+	const char *name;
+	/* ras_error array, ended by an entry whose error_msg is NULL; maps IERR to msg. */
+	const struct ras_error *error_records;
+	/*
+	 * function returning the value which needs to be programmed into
+	 * ERXCTLR_EL1 to enable all specified RAS errors for the current node.
+	 */
+	uint64_t (*err_ctrl)(void);
+};
+
+/* IFU Uncorrectable RAS ERROR (no entries: the list expands to nothing) */
+#define IFU_UNCORR_RAS_ERROR_LIST(X)
+
+/* JSR_RET Uncorrectable RAS ERROR */
+#define JSR_RET_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(JSR_RET, 35, 0x13, "Floating Point Register File Parity Error") \
+	X(JSR_RET, 34, 0x12, "Integer Register File Parity Error") \
+	X(JSR_RET, 33, 0x11, "Garbage Bundle") \
+	X(JSR_RET, 32, 0x10, "Bundle Completion Timeout")
+
+/* JSR_MTS Uncorrectable RAS ERROR */
+#define JSR_MTS_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(JSR_MTS, 40, 0x28, "CoreSight Access Error") \
+	X(JSR_MTS, 39, 0x27, "Dual Execution Uncorrectable Error") \
+	X(JSR_MTS, 37, 0x25, "CTU MMIO Region") \
+	X(JSR_MTS, 36, 0x24, "MTS MMCRAB Region Access") \
+	X(JSR_MTS, 35, 0x23, "MTS_CARVEOUT Access from ARM SW") \
+	X(JSR_MTS, 34, 0x22, "NAFLL PLL Failure to Lock") \
+	X(JSR_MTS, 32, 0x20, "Internal Uncorrectable MTS Error")
+
+/* LSD_STQ Uncorrectable RAS ERROR */
+#define LSD_STQ_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(LSD_STQ, 41, 0x39, "Coherent Cache Data Store Multi-Line ECC Error") \
+	X(LSD_STQ, 40, 0x38, "Coherent Cache Data Store Uncorrectable ECC Error") \
+	X(LSD_STQ, 38, 0x36, "Coherent Cache Data Load Uncorrectable ECC Error") \
+	X(LSD_STQ, 33, 0x31, "Coherent Cache Tag Store Parity Error") \
+	X(LSD_STQ, 32, 0x30, "Coherent Cache Tag Load Parity Error")
+
+/* LSD_DCC Uncorrectable RAS ERROR */
+#define LSD_DCC_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(LSD_DCC, 41, 0x49, "BTU Copy Mini-Cache PPN Multi-Hit Error") \
+	X(LSD_DCC, 39, 0x47, "Coherent Cache Data Uncorrectable ECC Error") \
+	X(LSD_DCC, 37, 0x45, "Version Cache Byte-Enable Parity Error") \
+	X(LSD_DCC, 36, 0x44, "Version Cache Data Uncorrectable ECC Error") \
+	X(LSD_DCC, 33, 0x41, "BTU Copy Coherent Cache PPN Parity Error") \
+	X(LSD_DCC, 32, 0x40, "BTU Copy Coherent Cache VPN Parity Error")
+
+/* LSD_L1HPF Uncorrectable RAS ERROR (no entries: the list expands to nothing) */
+#define LSD_L1HPF_UNCORR_RAS_ERROR_LIST(X)
+
+/* L2 Uncorrectable RAS ERROR */
+#define L2_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(L2, 56, 0x68, "URT Timeout") \
+	X(L2, 55, 0x67, "L2 Protocol Violation") \
+	X(L2, 54, 0x66, "SCF to L2 Slave Error Read") \
+	X(L2, 53, 0x65, "SCF to L2 Slave Error Write") \
+	X(L2, 52, 0x64, "SCF to L2 Decode Error Read") \
+	X(L2, 51, 0x63, "SCF to L2 Decode Error Write") \
+	X(L2, 50, 0x62, "SCF to L2 Request Response Interface Parity Errors") \
+	X(L2, 49, 0x61, "SCF to L2 Advance notice interface parity errors") \
+	X(L2, 48, 0x60, "SCF to L2 Filldata Parity Errors") \
+	X(L2, 47, 0x5F, "SCF to L2 UnCorrectable ECC Data Error on interface") \
+	X(L2, 45, 0x5D, "Core 1 to L2 Parity Error") \
+	X(L2, 44, 0x5C, "Core 0 to L2 Parity Error") \
+	X(L2, 43, 0x5B, "L2 Multi-Hit") \
+	X(L2, 42, 0x5A, "L2 URT Tag Parity Error") \
+	X(L2, 41, 0x59, "L2 NTT Tag Parity Error") \
+	X(L2, 40, 0x58, "L2 MLT Tag Parity Error") \
+	X(L2, 39, 0x57, "L2 URD Data") \
+	X(L2, 38, 0x56, "L2 NTP Data") \
+	X(L2, 36, 0x54, "L2 MLC Uncorrectable Clean") \
+	X(L2, 35, 0x53, "L2 URD Uncorrectable Dirty") \
+	X(L2, 34, 0x52, "L2 MLC Uncorrectable Dirty")
+
+/* CLUSTER_CLOCKS Uncorrectable RAS ERROR */
+#define CLUSTER_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(CLUSTER_CLOCKS, 32, 0xE4, "Frequency Monitor Error")
+
+/* MMU Uncorrectable RAS ERROR (no entries: the list expands to nothing) */
+#define MMU_UNCORR_RAS_ERROR_LIST(X)
+
+/* L3 Uncorrectable RAS ERROR */
+#define L3_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(L3, 43, 0x7B, "SNOC Interface Parity Error") \
+	X(L3, 42, 0x7A, "MCF Interface Parity Error") \
+	X(L3, 41, 0x79, "L3 Tag Parity Error") \
+	X(L3, 40, 0x78, "L3 Dir Parity Error") \
+	X(L3, 39, 0x77, "L3 Uncorrectable ECC Error") \
+	X(L3, 37, 0x75, "Multi-Hit CAM Error") \
+	X(L3, 36, 0x74, "Multi-Hit Tag Error") \
+	X(L3, 35, 0x73, "Unrecognized Command Error") \
+	X(L3, 34, 0x72, "L3 Protocol Error")
+
+/* CCPMU Uncorrectable RAS ERROR. NOTE(review): bit 40 -> IERR 0x87 breaks the base+(bit-32) pattern of the other rows; confirm. */
+#define CCPMU_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(CCPMU, 40, 0x87, "CoreSight Access Error") \
+	X(CCPMU, 36, 0x84, "MCE Ucode Error") \
+	X(CCPMU, 35, 0x83, "MCE IL1 Parity Error") \
+	X(CCPMU, 34, 0x82, "MCE Timeout Error") \
+	X(CCPMU, 33, 0x81, "CRAB Access Error") \
+	X(CCPMU, 32, 0x80, "MCE Memory Access Error")
+
+/* SCF_IOB Uncorrectable RAS ERROR */
+#define SCF_IOB_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(SCF_IOB, 41, 0x99, "Request parity error") \
+	X(SCF_IOB, 40, 0x98, "Putdata parity error") \
+	X(SCF_IOB, 39, 0x97, "Uncorrectable ECC on Putdata") \
+	X(SCF_IOB, 38, 0x96, "CBB Interface Error") \
+	X(SCF_IOB, 37, 0x95, "MMCRAB Error") \
+	X(SCF_IOB, 36, 0x94, "IHI Interface Error") \
+	X(SCF_IOB, 35, 0x93, "CRI Error") \
+	X(SCF_IOB, 34, 0x92, "TBX Interface Error") \
+	X(SCF_IOB, 33, 0x91, "EVP Interface Error")
+
+/* SCF_SNOC Uncorrectable RAS ERROR */
+#define SCF_SNOC_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(SCF_SNOC, 42, 0xAA, "Misc Client Parity Error") \
+	X(SCF_SNOC, 41, 0xA9, "Misc Filldata Parity Error") \
+	X(SCF_SNOC, 40, 0xA8, "Uncorrectable ECC Misc Client") \
+	X(SCF_SNOC, 39, 0xA7, "DVMU Interface Parity Error") \
+	X(SCF_SNOC, 38, 0xA6, "DVMU Interface Timeout Error") \
+	X(SCF_SNOC, 37, 0xA5, "CPE Request Error") \
+	X(SCF_SNOC, 36, 0xA4, "CPE Response Error") \
+	X(SCF_SNOC, 35, 0xA3, "CPE Timeout Error") \
+	X(SCF_SNOC, 34, 0xA2, "Uncorrectable Carveout Error")
+
+/* SCF_CTU Uncorrectable RAS ERROR. NOTE(review): bits 36/35 -> IERR 0xB3/0xB4 are out of sequence with the other rows; confirm. */
+#define SCF_CTU_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(SCF_CTU, 39, 0xB7, "Timeout error for TRC_DMA request") \
+	X(SCF_CTU, 38, 0xB6, "Timeout error for CTU Snp") \
+	X(SCF_CTU, 37, 0xB5, "Parity error in CTU TAG RAM") \
+	X(SCF_CTU, 36, 0xB3, "Parity error in CTU DATA RAM") \
+	X(SCF_CTU, 35, 0xB4, "Parity error for Cluster Rsp") \
+	X(SCF_CTU, 34, 0xB2, "Parity error for TRL requests from 9 agents") \
+	X(SCF_CTU, 33, 0xB1, "Parity error for MCF request") \
+	X(SCF_CTU, 32, 0xB0, "TRC DMA fillsnoop parity error")
+
+/* CMU_CLOCKS Uncorrectable RAS ERROR. NOTE(review): bits 36/35 -> IERR 0xC3/0xC4 are out of sequence with the other rows; confirm. */
+#define CMU_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \
+	/* Name, ERR_CTRL, IERR, ISA Desc */ \
+	X(CMU_CLOCKS, 39, 0xC7, "Cluster 3 frequency monitor error") \
+	X(CMU_CLOCKS, 38, 0xC6, "Cluster 2 frequency monitor error") \
+	X(CMU_CLOCKS, 37, 0xC5, "Cluster 1 frequency monitor error") \
+	X(CMU_CLOCKS, 36, 0xC3, "Cluster 0 frequency monitor error") \
+	X(CMU_CLOCKS, 35, 0xC4, "Voltage error on ADC1 Monitored Logic") \
+	X(CMU_CLOCKS, 34, 0xC2, "Voltage error on ADC0 Monitored Logic") \
+	X(CMU_CLOCKS, 33, 0xC1, "Lookup Table 1 Parity Error") \
+	X(CMU_CLOCKS, 32, 0xC0, "Lookup Table 0 Parity Error")
+
+/*
+ * Define one ras_error entry.
+ *
+ * This macro will be used to generate the ras_error records of each node
+ * from its node##_UNCORR_RAS_ERROR_LIST macro; only ierr and msg are used.
+ */
+#define DEFINE_ONE_RAS_ERROR_MSG(unit, ras_bit, ierr, msg) \
+	{ \
+		.error_msg = (msg), \
+		.error_code = (ierr) \
+	},
+
+/*
+ * Set one implementation defined bit in ERRCTLR
+ *
+ * Used to OR every ERR_CTRL bit listed in node##_UNCORR_RAS_ERROR_LIST into a
+ * local 'val' (see DEFINE_ONE_RAS_NODE); the trailing ';' separates expansions.
+ */
+#define DEFINE_ENABLE_RAS_BIT(unit, ras_bit, ierr, msg) \
+	do { \
+		val |= (1ULL << ras_bit##U); \
+	} while (0);
+
+/* Represent one RAS node with 0 or more error bits (ERR_CTLR) enabled */
+#define DEFINE_ONE_RAS_NODE(node) \
+static const struct ras_error node##_uncorr_ras_errors[] = { \
+	node##_UNCORR_RAS_ERROR_LIST(DEFINE_ONE_RAS_ERROR_MSG) \
+	{ \
+		NULL, \
+		0U \
+	}, \
+}; \
+static inline uint64_t node##_err_ctrl(void) \
+{ \
+	uint64_t val = 0ULL; \
+	node##_UNCORR_RAS_ERROR_LIST(DEFINE_ENABLE_RAS_BIT) \
+	return val; \
+}
+
+#define DEFINE_ONE_RAS_AUX_DATA(node) \
+	{ \
+		.name = #node, \
+		.error_records = node##_uncorr_ras_errors, \
+		.err_ctrl = &node##_err_ctrl \
+	},
+
+#define PER_CORE_RAS_NODE_LIST(X) \
+	X(IFU) \
+	X(JSR_RET) \
+	X(JSR_MTS) \
+	X(LSD_STQ) \
+	X(LSD_DCC) \
+	X(LSD_L1HPF)
+
+#define PER_CORE_RAS_GROUP_NODES PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA)
+
+#define PER_CLUSTER_RAS_NODE_LIST(X) \
+	X(L2) \
+	X(CLUSTER_CLOCKS) \
+	X(MMU)
+
+#define PER_CLUSTER_RAS_GROUP_NODES PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA)
+
+#define SCF_L3_BANK_RAS_NODE_LIST(X) X(L3)
+
+/* we have 4 SCF_L3 nodes: 3*256 + L3_Bank_ID(0-3); all four reuse the L3 tables */
+#define SCF_L3_BANK_RAS_GROUP_NODES \
+	SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \
+	SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \
+	SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \
+	SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA)
+
+#define CCPLEX_RAS_NODE_LIST(X) \
+	X(CCPMU) \
+	X(SCF_IOB) \
+	X(SCF_SNOC) \
+	X(SCF_CTU) \
+	X(CMU_CLOCKS)
+
+#define CCPLEX_RAS_GROUP_NODES CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA)
+
+#endif /* TEGRA194_RAS_PRIVATE */
diff --git a/plat/nvidia/tegra/include/tegra_private.h b/plat/nvidia/tegra/include/tegra_private.h
index f72c9cf..c181c36 100644
--- a/plat/nvidia/tegra/include/tegra_private.h
+++ b/plat/nvidia/tegra/include/tegra_private.h
@@ -89,7 +89,7 @@
 
 /* Declarations for tegra_fiq_glue.c */
 void tegra_fiq_handler_setup(void);
-int
tegra_fiq_get_intr_context(void); +int32_t tegra_fiq_get_intr_context(void); void tegra_fiq_set_ns_entrypoint(uint64_t entrypoint); /* Declarations for tegra_security.c */ @@ -157,4 +157,9 @@ void *handle, uint64_t flags); +#if RAS_EXTENSION +void tegra194_ras_enable(void); +void tegra194_ras_corrected_err_clear(void); +#endif + #endif /* TEGRA_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h b/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h index 1fe3aad..6dafeb2 100644 --- a/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h +++ b/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h @@ -58,6 +58,7 @@ void nvg_enable_strict_checking_mode(void); void nvg_system_shutdown(void); void nvg_system_reboot(void); +void nvg_clear_hsm_corr_status(void); /* declarations for assembly functions */ void nvg_set_request_data(uint64_t req, uint64_t data); @@ -71,5 +72,6 @@ void mce_enable_strict_checking(void); void mce_system_shutdown(void); void mce_system_reboot(void); +void mce_clear_hsm_corr_status(void); #endif /* MCE_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c b/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c index 7edd7a0..4663a3d 100644 --- a/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c +++ b/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c @@ -234,3 +234,11 @@ { nvg_system_reboot(); } + +/******************************************************************************* + * Handler to clear CCPLEX->HSM correctable RAS error signal. 
+ ******************************************************************************/
+void mce_clear_hsm_corr_status(void)
+{
+	nvg_clear_hsm_corr_status();
+}
diff --git a/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c b/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c
index ef740a1..fdf9429 100644
--- a/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c
+++ b/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c
@@ -236,3 +236,15 @@
 	nvg_set_request_data((uint64_t)TEGRA_NVG_CHANNEL_SHUTDOWN,
 		(uint64_t)TEGRA_NVG_SHUTDOWN);
 }
+
+/*
+ * Request to clear CCPLEX->HSM correctable error signal.
+ * NVGDATA[1]: A write of 1 clears the CCPLEX->HSM correctable error signal,
+ * A write of 0 has no effect.
+ */
+void nvg_clear_hsm_corr_status(void)
+{
+	nvg_hsm_error_ctrl_channel_t status = { .bits = { .corr = 1U, }, };
+
+	nvg_set_request_data((uint64_t)TEGRA_NVG_CHANNEL_HSM_ERROR_CTRL, status.flat);
+}
diff --git a/plat/nvidia/tegra/soc/t194/plat_ras.c b/plat/nvidia/tegra/soc/t194/plat_ras.c
new file mode 100644
index 0000000..54c2924
--- /dev/null
+++ b/plat/nvidia/tegra/soc/t194/plat_ras.c
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * ERRFR bits[63:32], it indicates supported RAS errors which can be enabled
+ * by setting corresponding bits in ERRCTLR
+ */
+#define ERR_FR_EN_BITS_MASK 0xFFFFFFFF00000000ULL
+
+/* bakery lock for platform RAS handler. */
+static DEFINE_BAKERY_LOCK(ras_handler_lock);
+#define ras_lock() bakery_lock_get(&ras_handler_lock)
+#define ras_unlock() bakery_lock_release(&ras_handler_lock)
+
+/*
+ * Handle an External Abort received at EL3; called from plat_ea_handler().
+ * Runs ras_ea_handler() and, if it returns non-zero, dispatches an SDEI event.
+ */
+static void tegra194_ea_handler(unsigned int ea_reason, uint64_t syndrome,
+		void *cookie, void *handle, uint64_t flags)
+{
+	int32_t ret;
+
+	ras_lock();
+
+	ERROR("MPIDR 0x%lx: exception reason=%u syndrome=0x%llx\n",
+		read_mpidr(), ea_reason, syndrome);
+
+	/* Call RAS EA handler; a non-zero return is treated as "handled" below */
+	ret = ras_ea_handler(ea_reason, syndrome, cookie, handle, flags);
+	if (ret != 0) {
+		ERROR("RAS error handled!\n");
+		ret = sdei_dispatch_event(TEGRA_SDEI_EP_EVENT_0 +
+				plat_my_core_pos());
+		if (ret != 0)
+			ERROR("sdei_dispatch_event returned %d\n", ret);
+	} else {
+		ERROR("Not a RAS error!\n");
+	}
+
+	ras_unlock();
+}
+
+/*
+ * Function to enable reporting of all supported RAS errors.
+ *
+ * Uncorrected errors are set to report as External abort (SError)
+ * Corrected errors are set to report as interrupt.
+ */
+void tegra194_ras_enable(void)
+{
+	VERBOSE("%s\n", __func__);
+
+	/* skip RAS enablement if not a silicon platform. */
+	if (!tegra_platform_is_silicon()) {
+		return;
+	}
+
+	/*
+	 * Iterate over each group (num_idx ERRSELRs starting from idx_start).
+	 * A plain for loop is used instead of for_each_err_record_info to
+	 * get rid of MISRA noise.
+	 */
+	for (uint32_t i = 0U; i < err_record_mappings.num_err_records; i++) {
+
+		const struct err_record_info *info = &err_record_mappings.err_records[i];
+
+		uint32_t idx_start = info->sysreg.idx_start;
+		uint32_t num_idx = info->sysreg.num_idx;
+		const struct ras_aux_data *aux_data = (const struct ras_aux_data *)info->aux_data;
+
+		assert(aux_data != NULL);
+
+		for (uint32_t j = 0; j < num_idx; j++) {
+
+			/* ERRCTLR register value. */
+			uint64_t err_ctrl = 0ULL;
+			/* all supported errors for this node. */
+			uint64_t err_fr;
+			/* uncorrectable errors */
+			uint64_t uncorr_errs;
+			/* correctable errors */
+			uint64_t corr_errs;
+
+			/*
+			 * Catch error if something wrong with the RAS aux data
+			 * record table.
+			 */
+			assert(aux_data[j].err_ctrl != NULL);
+
+			/*
+			 * Write to ERRSELR_EL1 to select the RAS error node.
+			 * Always program this first, to select the corresponding
+			 * RAS node before any other RAS register r/w.
+			 */
+			ser_sys_select_record(idx_start + j);
+
+			err_fr = read_erxfr_el1() & ERR_FR_EN_BITS_MASK;
+			uncorr_errs = aux_data[j].err_ctrl();
+			corr_errs = ~uncorr_errs & err_fr;
+
+			/* enable error reporting */
+			ERR_CTLR_ENABLE_FIELD(err_ctrl, ED);
+
+			/* enable SError reporting for uncorrectable errors */
+			if ((uncorr_errs & err_fr) != 0ULL) {
+				ERR_CTLR_ENABLE_FIELD(err_ctrl, UE);
+			}
+
+			/* generate interrupt for corrected errors. */
+			if (corr_errs != 0ULL) {
+				ERR_CTLR_ENABLE_FIELD(err_ctrl, CFI);
+			}
+
+			/* enable the supported errors */
+			err_ctrl |= err_fr;
+
+			VERBOSE("errselr_el1:0x%x, erxfr:0x%llx, err_ctrl:0x%llx\n",
+				idx_start + j, err_fr, err_ctrl);
+
+			/* enable specified errors, or set to 0 if no supported error */
+			write_erxctlr_el1(err_ctrl);
+
+			/*
+			 * Check if all the bit settings have been enabled to detect
+			 * uncorrected/corrected errors, if not assert.
+			 */
+			assert(read_erxctlr_el1() == err_ctrl);
+		}
+	}
+}
+
+/*
+ * Function to clear RAS ERRSTATUS for corrected RAS error.
+ * This function ignores any new RAS error signaled during clearing; it is not
+ * multi-core safe (no ras_lock is taken to reduce overhead).
+ */
+void tegra194_ras_corrected_err_clear(void)
+{
+	uint64_t clear_ce_status = 0ULL;
+
+	ERR_STATUS_SET_FIELD(clear_ce_status, AV, 0x1UL);
+	ERR_STATUS_SET_FIELD(clear_ce_status, V, 0x1UL);
+	ERR_STATUS_SET_FIELD(clear_ce_status, OF, 0x1UL);
+	ERR_STATUS_SET_FIELD(clear_ce_status, MV, 0x1UL);
+	ERR_STATUS_SET_FIELD(clear_ce_status, CE, 0x3UL);
+
+	for (uint32_t i = 0U; i < err_record_mappings.num_err_records; i++) {
+		/* one group = num_idx consecutive ERRSELR values from idx_start */
+		const struct err_record_info *info = &err_record_mappings.err_records[i];
+		uint32_t idx_start = info->sysreg.idx_start;
+		uint32_t num_idx = info->sysreg.num_idx;
+
+		for (uint32_t j = 0U; j < num_idx; j++) {
+			/* NOTE(review): assumes the accessor synchronises the select before ERX* use — confirm */
+			uint64_t status;
+			uint32_t err_idx = idx_start + j;
+
+			ser_sys_select_record(err_idx);
+			status = read_erxstatus_el1();
+			/* only records reporting a corrected error are written back */
+			if (ERR_STATUS_GET_FIELD(status, CE) != 0U) {
+				write_erxstatus_el1(clear_ce_status);
+			}
+		}
+	}
+}
+
+/* Function to probe an error from error record group. */
+static int32_t tegra194_ras_record_probe(const struct err_record_info *info,
+		int *probe_data)
+{
+	/* Skip probing if not a silicon platform */
+	if (!tegra_platform_is_silicon()) {
+		return 0;
+	}
+
+	return ser_probe_sysreg(info->sysreg.idx_start, info->sysreg.num_idx, probe_data);
+}
+
+/* Function to handle error from one given node */
+static int32_t tegra194_ras_node_handler(uint32_t errselr, const char *name,
+		const struct ras_error *errors, uint64_t status)
+{
+	bool found = false;
+	uint32_t ierr = (uint32_t)ERR_STATUS_GET_FIELD(status, IERR);
+	uint32_t serr = (uint32_t)ERR_STATUS_GET_FIELD(status, SERR);
+	uint64_t val = 0;
+
+	/* not a valid error. */
+	if (ERR_STATUS_GET_FIELD(status, V) == 0U) {
+		return 0;
+	}
+
+	ERR_STATUS_SET_FIELD(val, V, 1);
+
+	/* keep the log print same as linux arm64_ras driver. */
+	ERROR("**************************************\n");
+	ERROR("RAS Error in %s, ERRSELR_EL1=0x%x:\n", name, errselr);
+	ERROR("\tStatus = 0x%llx\n", status);
+
+	/* Print uncorrectable error information. */
+	if (ERR_STATUS_GET_FIELD(status, UE) != 0U) {
+
+		ERR_STATUS_SET_FIELD(val, UE, 1);
+		ERR_STATUS_SET_FIELD(val, UET, 1);
+
+		/* IERR to error message; the table ends at the NULL error_msg entry */
+		for (uint32_t i = 0; errors[i].error_msg != NULL; i++) {
+			if (ierr == errors[i].error_code) {
+				ERROR("\tIERR = %s: 0x%x\n",
+					errors[i].error_msg, ierr);
+
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			ERROR("\tUnknown IERR: 0x%x\n", ierr);
+		}
+
+		ERROR("SERR = %s: 0x%x\n", ras_serr_to_str(serr), serr);
+
+		/* Overflow, multiple errors have been detected. */
+		if (ERR_STATUS_GET_FIELD(status, OF) != 0U) {
+			ERROR("\tOverflow (there may be more errors) - "
+				"Uncorrectable\n");
+			ERR_STATUS_SET_FIELD(val, OF, 1);
+		}
+
+		ERROR("\tUncorrectable (this is fatal)\n");
+
+		/* Miscellaneous Register Valid. */
+		if (ERR_STATUS_GET_FIELD(status, MV) != 0U) {
+			ERROR("\tMISC0 = 0x%lx\n", read_erxmisc0_el1());
+			ERROR("\tMISC1 = 0x%lx\n", read_erxmisc1_el1());
+			ERR_STATUS_SET_FIELD(val, MV, 1);
+		}
+
+		/* Address Valid. */
+		if (ERR_STATUS_GET_FIELD(status, AV) != 0U) {
+			ERROR("\tADDR = 0x%lx\n", read_erxaddr_el1());
+			ERR_STATUS_SET_FIELD(val, AV, 1);
+		}
+
+		/* Deferred error */
+		if (ERR_STATUS_GET_FIELD(status, DE) != 0U) {
+			ERROR("\tDeferred error\n");
+			ERR_STATUS_SET_FIELD(val, DE, 1);
+		}
+
+	} else {
+		/* Corrected error: just clear it. NOTE(review): writes CE=1, tegra194_ras_corrected_err_clear() writes CE=0x3 — confirm. */
+		VERBOSE("corrected RAS error is cleared: ERRSELR_EL1:0x%x, "
+			"IERR:0x%x, SERR:0x%x\n", errselr, ierr, serr);
+		ERR_STATUS_SET_FIELD(val, CE, 1);
+	}
+
+	ERROR("**************************************\n");
+
+	/* Write to clear reported errors: val holds a 1 for every field seen set. */
+	write_erxstatus_el1(val);
+
+	/* error handled */
+	return 0;
+}
+
+/* Function to handle one error node from an error record group.
*/
+static int32_t tegra194_ras_record_handler(const struct err_record_info *info,
+		int probe_data, const struct err_handler_data *const data __unused)
+{
+	uint32_t num_idx = info->sysreg.num_idx;
+	uint32_t idx_start = info->sysreg.idx_start;
+	const struct ras_aux_data *aux_data = info->aux_data;
+	const struct ras_error *errors;
+	uint32_t offset;
+	const char *node_name;
+
+	uint64_t status = 0ULL;
+
+	VERBOSE("%s\n", __func__);
+
+	/* probe_data is used as the index of the node inside this group */
+	assert(probe_data >= 0);
+	assert((uint32_t)probe_data < num_idx);
+
+	offset = (uint32_t)probe_data;
+	errors = aux_data[offset].error_records;
+	node_name = aux_data[offset].name;
+
+	assert(errors != NULL);
+
+	/* Write to ERRSELR_EL1 to select the error record */
+	ser_sys_select_record(idx_start + offset);
+
+	/* Retrieve status register from the error record */
+	status = read_erxstatus_el1();
+
+	return tegra194_ras_node_handler(idx_start + offset, node_name,
+			errors, status);
+}
+
+
+/* Instantiate RAS nodes: one IERR table and one err_ctrl() helper per node */
+PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE)
+PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE)
+SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE)
+CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE)
+
+/* Instantiate RAS node groups */
+static struct ras_aux_data per_core_ras_group[] = {
+	PER_CORE_RAS_GROUP_NODES
+};
+
+static struct ras_aux_data per_cluster_ras_group[] = {
+	PER_CLUSTER_RAS_GROUP_NODES
+};
+
+static struct ras_aux_data scf_l3_ras_group[] = {
+	SCF_L3_BANK_RAS_GROUP_NODES
+};
+
+static struct ras_aux_data ccplex_ras_group[] = {
+	CCPLEX_RAS_GROUP_NODES
+};
+
+/*
+ * We have same probe and handler for each error record group, use a macro to
+ * simplify the record definition.
+ */
+#define ADD_ONE_ERR_GROUP(errselr_start, group) \
+	ERR_RECORD_SYSREG_V1((errselr_start), (uint32_t)ARRAY_SIZE((group)), \
+			&tegra194_ras_record_probe, \
+			&tegra194_ras_record_handler, (group))
+
+/* RAS error record group information */
+static struct err_record_info carmel_ras_records[] = {
+	/*
+	 * Per core ras error records
+	 * ERRSELR starts from 0*256 + Logical_CPU_ID*16 + 0 to
+	 * 0*256 + Logical_CPU_ID*16 + 5 for each group.
+	 * 8 cores/groups, 6 * 8 nodes in total.
+	 */
+	ADD_ONE_ERR_GROUP(0x000, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x010, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x020, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x030, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x040, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x050, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x060, per_core_ras_group),
+	ADD_ONE_ERR_GROUP(0x070, per_core_ras_group),
+
+	/*
+	 * Per cluster ras error records
+	 * ERRSELR starts from 2*256 + Logical_Cluster_ID*16 + 0 to
+	 * 2*256 + Logical_Cluster_ID*16 + 2.
+	 * 4 clusters/groups, 3 * 4 nodes in total.
+	 */
+	ADD_ONE_ERR_GROUP(0x200, per_cluster_ras_group),
+	ADD_ONE_ERR_GROUP(0x210, per_cluster_ras_group),
+	ADD_ONE_ERR_GROUP(0x220, per_cluster_ras_group),
+	ADD_ONE_ERR_GROUP(0x230, per_cluster_ras_group),
+
+	/*
+	 * SCF L3_Bank ras error records
+	 * ERRSELR: 3*256 + L3_Bank_ID, L3_Bank_ID: 0-3
+	 * 1 group, 4 nodes in total.
+	 */
+	ADD_ONE_ERR_GROUP(0x300, scf_l3_ras_group),
+
+	/*
+	 * CCPLEX ras error records
+	 * ERRSELR: 4*256 + Unit_ID, Unit_ID: 0 - 4
+	 * 1 group, 5 nodes in total.
+	 */
+	ADD_ONE_ERR_GROUP(0x400, ccplex_ras_group),
+};
+
+REGISTER_ERR_RECORD_INFO(carmel_ras_records);
+
+/* dummy RAS interrupt: the table is empty, no RAS interrupts are registered */
+static struct ras_interrupt carmel_ras_interrupts[] = {};
+REGISTER_RAS_INTERRUPTS(carmel_ras_interrupts);
+
+/*******************************************************************************
+ * RAS handler for the platform
+ ******************************************************************************/
+void plat_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie,
+		void *handle, uint64_t flags)
+{
+#if RAS_EXTENSION
+	tegra194_ea_handler(ea_reason, syndrome, cookie, handle, flags);
+#else
+	ERROR("Unhandled External Abort received on 0x%llx at EL3!\n",
+			read_mpidr_el1());
+	ERROR(" exception reason=%u syndrome=0x%lx\n", ea_reason, syndrome);
+	panic();
+#endif
+}
diff --git a/plat/nvidia/tegra/soc/t194/plat_setup.c b/plat/nvidia/tegra/soc/t194/plat_setup.c
index 5d6c60b..399aebb 100644
--- a/plat/nvidia/tegra/soc/t194/plat_setup.c
+++ b/plat/nvidia/tegra/soc/t194/plat_setup.c
@@ -208,6 +208,11 @@
 	/* sanity check MCE firmware compatibility */
 	mce_verify_firmware_version();
 
+#if RAS_EXTENSION
+	/* Enable RAS error reporting (uncorrected and corrected) */
+	tegra194_ras_enable();
+#endif
+
 	/*
 	 * Program XUSB STREAMIDs
 	 * ======================
diff --git a/plat/nvidia/tegra/soc/t194/plat_sip_calls.c b/plat/nvidia/tegra/soc/t194/plat_sip_calls.c
index 884762d..a3f996d 100644
--- a/plat/nvidia/tegra/soc/t194/plat_sip_calls.c
+++ b/plat/nvidia/tegra/soc/t194/plat_sip_calls.c
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -23,6 +24,7 @@
  * Tegra194 SiP SMCs
  ******************************************************************************/
 #define TEGRA_SIP_GET_SMMU_PER 0xC200FF00U
+#define TEGRA_SIP_CLEAR_RAS_CORRECTED_ERRORS 0xC200FF01U
 
 /*******************************************************************************
  * This function is responsible for handling all T194 SiP calls
@@ -69,6 +71,15 @@
 
 		break;
 
+#if RAS_EXTENSION
+	case TEGRA_SIP_CLEAR_RAS_CORRECTED_ERRORS:
+		/* first clear all RAS error records that report corrected errors. */
+		tegra194_ras_corrected_err_clear();
+		/* then clear the HSM corrected error status. */
+		mce_clear_hsm_corr_status();
+		break;
+#endif
+
 	default:
 		ret = -ENOTSUP;
 		break;
diff --git a/plat/nvidia/tegra/soc/t194/platform_t194.mk b/plat/nvidia/tegra/soc/t194/platform_t194.mk
index c02128c..d7d15f5 100644
--- a/plat/nvidia/tegra/soc/t194/platform_t194.mk
+++ b/plat/nvidia/tegra/soc/t194/platform_t194.mk
@@ -30,6 +30,10 @@
 MAX_MMAP_REGIONS := 30
 $(eval $(call add_define,MAX_MMAP_REGIONS))
 
+# enable RAS handling. NOTE(review): docs say RAS_EXTENSION also needs EL3_EXCEPTION_HANDLING=1; confirm it is set elsewhere
+HANDLE_EA_EL3_FIRST := 1
+RAS_EXTENSION := 1
+
 # platform files
 PLAT_INCLUDES += -Iplat/nvidia/tegra/include/t194 \
 	-I${SOC_DIR}/drivers/include
@@ -56,3 +60,10 @@
 ifeq (${ENABLE_CONSOLE_SPE},1)
 BL31_SOURCES += ${COMMON_DIR}/drivers/spe/shared_console.S
 endif
+
+# RAS sources
+ifeq (${RAS_EXTENSION},1)
+BL31_SOURCES += lib/extensions/ras/std_err_record.c \
+	lib/extensions/ras/ras_common.c \
+	${SOC_DIR}/plat_ras.c
+endif