diff --git a/Makefile b/Makefile index 1727f84..782c489 100644 --- a/Makefile +++ b/Makefile @@ -306,6 +306,11 @@ endif endif +# When building for systems with hardware-assisted coherency, there's no need to +# use USE_COHERENT_MEM. Require that USE_COHERENT_MEM must be set to 0 too. +ifeq ($(HW_ASSISTED_COHERENCY)-$(USE_COHERENT_MEM),1-1) +$(error USE_COHERENT_MEM cannot be enabled with HW_ASSISTED_COHERENCY) +endif ################################################################################ # Process platform overrideable behaviour @@ -386,6 +391,7 @@ $(eval $(call assert_boolean,ENABLE_RUNTIME_INSTRUMENTATION)) $(eval $(call assert_boolean,ERROR_DEPRECATED)) $(eval $(call assert_boolean,GENERATE_COT)) +$(eval $(call assert_boolean,HW_ASSISTED_COHERENCY)) $(eval $(call assert_boolean,LOAD_IMAGE_V2)) $(eval $(call assert_boolean,NS_TIMER_SWITCH)) $(eval $(call assert_boolean,PL011_GENERIC_UART)) @@ -420,6 +426,7 @@ $(eval $(call add_define,ENABLE_PSCI_STAT)) $(eval $(call add_define,ENABLE_RUNTIME_INSTRUMENTATION)) $(eval $(call add_define,ERROR_DEPRECATED)) +$(eval $(call add_define,HW_ASSISTED_COHERENCY)) $(eval $(call add_define,LOAD_IMAGE_V2)) $(eval $(call add_define,LOG_LEVEL)) $(eval $(call add_define,NS_TIMER_SWITCH)) diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S index 4a710bd..6238329 100644 --- a/bl31/aarch64/bl31_entrypoint.S +++ b/bl31/aarch64/bl31_entrypoint.S @@ -180,24 +180,29 @@ _init_c_runtime=0 \ _exception_vectors=runtime_exceptions - /* -------------------------------------------- - * Enable the MMU with the DCache disabled. It - * is safe to use stacks allocated in normal - * memory as a result. All memory accesses are - * marked nGnRnE when the MMU is disabled. So - * all the stack writes will make it to memory. - * All memory accesses are marked Non-cacheable - * when the MMU is enabled but D$ is disabled. 
- * So used stack memory is guaranteed to be - * visible immediately after the MMU is enabled - * Enabling the DCache at the same time as the - * MMU can lead to speculatively fetched and - * possibly stale stack memory being read from - * other caches. This can lead to coherency - * issues. - * -------------------------------------------- + /* + * We're about to enable MMU and participate in PSCI state coordination. + * + * The PSCI implementation invokes platform routines that enable CPUs to + * participate in coherency. On a system where CPUs are not + * cache-coherent out of reset, having caches enabled until such time + * might lead to coherency issues (resulting from stale data getting + * speculatively fetched, among others). Therefore we keep data caches + * disabled while enabling the MMU, thereby forcing data accesses to + * have non-cacheable, nGnRnE attributes (these will always be coherent + * with main memory). + * + * On systems with hardware-assisted coherency, where CPUs are expected + * to be cache-coherent out of reset without needing explicit software + * intervention, PSCI need not invoke platform routines to enter + * coherency (as CPUs already are); and there's no reason to have caches + * disabled either. */ +#if HW_ASSISTED_COHERENCY + mov x0, #0 +#else mov x0, #DISABLE_DCACHE +#endif bl bl31_plat_enable_mmu bl psci_warmboot_entrypoint diff --git a/bl32/sp_min/aarch32/entrypoint.S b/bl32/sp_min/aarch32/entrypoint.S index 3163f52..c7f60b5 100644 --- a/bl32/sp_min/aarch32/entrypoint.S +++ b/bl32/sp_min/aarch32/entrypoint.S @@ -231,24 +231,27 @@ _init_c_runtime=0 \ _exception_vectors=sp_min_vector_table - /* -------------------------------------------- - * Enable the MMU with the DCache disabled. It - * is safe to use stacks allocated in normal - * memory as a result. All memory accesses are - * marked nGnRnE when the MMU is disabled. So - * all the stack writes will make it to memory. 
- * All memory accesses are marked Non-cacheable - * when the MMU is enabled but D$ is disabled. - * So used stack memory is guaranteed to be - * visible immediately after the MMU is enabled - * Enabling the DCache at the same time as the - * MMU can lead to speculatively fetched and - * possibly stale stack memory being read from - * other caches. This can lead to coherency - * issues. - * -------------------------------------------- + /* + * We're about to enable MMU and participate in PSCI state coordination. + * + * The PSCI implementation invokes platform routines that enable CPUs to + * participate in coherency. On a system where CPUs are not + * cache-coherent out of reset, having caches enabled until such time + * might lead to coherency issues (resulting from stale data getting + * speculatively fetched, among others). Therefore we keep data caches + * disabled while enabling the MMU, thereby forcing data accesses to + * have non-cacheable, nGnRnE attributes (these will always be coherent + * with main memory). + * + * On systems where CPUs are cache-coherent out of reset, however, PSCI + * need not invoke platform routines to enter coherency (as CPUs already + * are), and there's no reason to have caches disabled either. */ +#if HW_ASSISTED_COHERENCY + mov r0, #0 +#else mov r0, #DISABLE_DCACHE +#endif bl bl32_plat_enable_mmu bl sp_min_warm_boot diff --git a/docs/psci-lib-integration-guide.md b/docs/psci-lib-integration-guide.md index f290966..d81b328 100644 --- a/docs/psci-lib-integration-guide.md +++ b/docs/psci-lib-integration-guide.md @@ -176,7 +176,9 @@ * The page tables must be setup and the MMU enabled * The C runtime environment must be setup and stack initialized * The Data cache must be enabled prior to invoking any of the PSCI library - interfaces except for `psci_warmboot_entrypoint()`. + interfaces except for `psci_warmboot_entrypoint()`. 
For + `psci_warmboot_entrypoint()`, if the build option `HW_ASSISTED_COHERENCY` + is enabled however, data caches are expected to be enabled. Further requirements for each interface can be found in the interface description. @@ -270,11 +272,11 @@ Return : void This function performs the warm boot initialization/restoration as mandated by -[PSCI spec]. For AArch32, on wakeup from power down the CPU resets to secure -SVC mode and the EL3 Runtime Software must perform the prerequisite -initializations mentioned at top of this section. This function must be called -with Data cache disabled but with MMU initialized and enabled. The major -actions performed by this function are: +[PSCI spec]. For AArch32, on wakeup from power down the CPU resets to secure SVC +mode and the EL3 Runtime Software must perform the prerequisite initializations +mentioned at top of this section. This function must be called with Data cache +disabled (unless build option `HW_ASSISTED_COHERENCY` is enabled) but with MMU +initialized and enabled. The major actions performed by this function are: * Invalidates the stack and enables the data cache. * Initializes architecture and PSCI state coordination. diff --git a/docs/user-guide.md b/docs/user-guide.md index 9d91c2a..c6c8e19 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -328,6 +328,15 @@ * `HANDLE_EA_EL3_FIRST`: When defined External Aborts and SError Interrupts will be always trapped in EL3 i.e. in BL31 at runtime. +* `HW_ASSISTED_COHERENCY`: On most ARM systems to-date, platform-specific + software operations are required for CPUs to enter and exit coherency. + However, there exists newer systems where CPUs' entry to and exit from + coherency is managed in hardware. Such systems require software to only + initiate the operations, and the rest is managed in hardware, minimizing + active software management. 
In such systems, this boolean option enables ARM + Trusted Firmware to carry out build and run-time optimizations during boot + and power management operations. This option defaults to 0. + * `LOAD_IMAGE_V2`: Boolean option to enable support for new version (v2) of image loading, which provides more flexibility and scalability around what images are loaded and executed during boot. Default is 0. diff --git a/lib/psci/psci_common.c b/lib/psci/psci_common.c index 9fdce49..1be37c0 100644 --- a/lib/psci/psci_common.c +++ b/lib/psci/psci_common.c @@ -79,7 +79,8 @@ #endif ; -DEFINE_BAKERY_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]); +/* Lock for PSCI state coordination */ +DEFINE_PSCI_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]); cpu_pd_node_t psci_cpu_pd_nodes[PLATFORM_CORE_COUNT]; @@ -247,6 +248,50 @@ return &psci_req_local_pwr_states[pwrlvl - 1][cpu_idx]; } +/* + * psci_non_cpu_pd_nodes can be placed either in normal memory or coherent + * memory. + * + * With !USE_COHERENT_MEM, psci_non_cpu_pd_nodes is placed in normal memory, + * it's accessed by both cached and non-cached participants. To serve the common + * minimum, perform a cache flush before read and after write so that non-cached + * participants operate on latest data in main memory. + * + * When USE_COHERENT_MEM is used, psci_non_cpu_pd_nodes is placed in coherent + * memory. With HW_ASSISTED_COHERENCY, all PSCI participants are cache-coherent. + * In both cases, no cache operations are required. + */ + +/* + * Retrieve local state of non-CPU power domain node from a non-cached CPU, + * after any required cache maintenance operation. 
+ */
+static plat_local_state_t get_non_cpu_pd_node_local_state(
+		unsigned int parent_idx)
+{
+#if !(USE_COHERENT_MEM || HW_ASSISTED_COHERENCY)
+	flush_dcache_range(
+		(uintptr_t) &psci_non_cpu_pd_nodes[parent_idx],
+		sizeof(psci_non_cpu_pd_nodes[parent_idx]));
+#endif
+	return psci_non_cpu_pd_nodes[parent_idx].local_state;
+}
+
+/*
+ * Update local state of non-CPU power domain node from a cached CPU; perform
+ * any required cache maintenance operation afterwards.
+ */
+static void set_non_cpu_pd_node_local_state(unsigned int parent_idx,
+		plat_local_state_t state)
+{
+	psci_non_cpu_pd_nodes[parent_idx].local_state = state;
+#if !(USE_COHERENT_MEM || HW_ASSISTED_COHERENCY)
+	flush_dcache_range(
+		(uintptr_t) &psci_non_cpu_pd_nodes[parent_idx],
+		sizeof(psci_non_cpu_pd_nodes[parent_idx]));
+#endif
+}
+
 /******************************************************************************
  * Helper function to return the current local power state of each power domain
  * from the current cpu power domain to its ancestor at the 'end_pwrlvl'. This
@@ -264,18 +309,7 @@
 
 	/* Copy the local power state from node to state_info */
 	for (lvl = PSCI_CPU_PWR_LVL + 1; lvl <= end_pwrlvl; lvl++) {
-#if !USE_COHERENT_MEM
-		/*
-		 * If using normal memory for psci_non_cpu_pd_nodes, we need
-		 * to flush before reading the local power state as another
-		 * cpu in the same power domain could have updated it and this
-		 * code runs before caches are enabled.
- */ - flush_dcache_range( - (uintptr_t) &psci_non_cpu_pd_nodes[parent_idx], - sizeof(psci_non_cpu_pd_nodes[parent_idx])); -#endif - pd_state[lvl] = psci_non_cpu_pd_nodes[parent_idx].local_state; + pd_state[lvl] = get_non_cpu_pd_node_local_state(parent_idx); parent_idx = psci_non_cpu_pd_nodes[parent_idx].parent_node; } @@ -299,21 +333,16 @@ psci_set_cpu_local_state(pd_state[PSCI_CPU_PWR_LVL]); /* - * Need to flush as local_state will be accessed with Data Cache + * Need to flush as local_state might be accessed with Data Cache * disabled during power on */ - flush_cpu_data(psci_svc_cpu_data.local_state); + psci_flush_cpu_data(psci_svc_cpu_data.local_state); parent_idx = psci_cpu_pd_nodes[plat_my_core_pos()].parent_node; /* Copy the local_state from state_info */ for (lvl = 1; lvl <= end_pwrlvl; lvl++) { - psci_non_cpu_pd_nodes[parent_idx].local_state = pd_state[lvl]; -#if !USE_COHERENT_MEM - flush_dcache_range( - (uintptr_t)&psci_non_cpu_pd_nodes[parent_idx], - sizeof(psci_non_cpu_pd_nodes[parent_idx])); -#endif + set_non_cpu_pd_node_local_state(parent_idx, pd_state[lvl]); parent_idx = psci_non_cpu_pd_nodes[parent_idx].parent_node; } } @@ -347,13 +376,8 @@ /* Reset the local_state to RUN for the non cpu power domains. 
*/ for (lvl = PSCI_CPU_PWR_LVL + 1; lvl <= end_pwrlvl; lvl++) { - psci_non_cpu_pd_nodes[parent_idx].local_state = - PSCI_LOCAL_STATE_RUN; -#if !USE_COHERENT_MEM - flush_dcache_range( - (uintptr_t) &psci_non_cpu_pd_nodes[parent_idx], - sizeof(psci_non_cpu_pd_nodes[parent_idx])); -#endif + set_non_cpu_pd_node_local_state(parent_idx, + PSCI_LOCAL_STATE_RUN); psci_set_req_local_pwr_state(lvl, cpu_idx, PSCI_LOCAL_STATE_RUN); @@ -364,7 +388,7 @@ psci_set_aff_info_state(AFF_STATE_ON); psci_set_cpu_local_state(PSCI_LOCAL_STATE_RUN); - flush_cpu_data(psci_svc_cpu_data); + psci_flush_cpu_data(psci_svc_cpu_data); } /****************************************************************************** @@ -969,3 +993,33 @@ } #endif + +/******************************************************************************* + * Initiate power down sequence, by calling power down operations registered for + * this CPU. + ******************************************************************************/ +void psci_do_pwrdown_sequence(unsigned int power_level) +{ +#if HW_ASSISTED_COHERENCY + /* + * With hardware-assisted coherency, the CPU drivers only initiate the + * power down sequence, without performing cache-maintenance operations + * in software. Data caches and MMU remain enabled both before and after + * this call. + */ + prepare_cpu_pwr_dwn(power_level); +#else + /* + * Without hardware-assisted coherency, the CPU drivers disable data + * caches and MMU, then perform cache-maintenance operations in + * software. + * + * We ought to call prepare_cpu_pwr_dwn() to initiate power down + * sequence. We currently have data caches and MMU enabled, but the + * function will return with data caches and MMU disabled. We must + * ensure that the stack memory is flushed out to memory before we start + * popping from it again. 
+ */ + psci_do_pwrdown_cache_maintenance(power_level); +#endif +} diff --git a/lib/psci/psci_off.c b/lib/psci/psci_off.c index 394aaa3..4ba7865 100644 --- a/lib/psci/psci_off.c +++ b/lib/psci/psci_off.c @@ -119,10 +119,9 @@ #endif /* - * Arch. management. Perform the necessary steps to flush all - * cpu caches. + * Arch. management. Initiate power down sequence. */ - psci_do_pwrdown_cache_maintenance(psci_find_max_off_lvl(&state_info)); + psci_do_pwrdown_sequence(psci_find_max_off_lvl(&state_info)); #if ENABLE_RUNTIME_INSTRUMENTATION PMF_CAPTURE_TIMESTAMP(rt_instr_svc, @@ -154,17 +153,17 @@ */ if (rc == PSCI_E_SUCCESS) { /* - * Set the affinity info state to OFF. This writes directly to - * main memory as caches are disabled, so cache maintenance is + * Set the affinity info state to OFF. When caches are disabled, + * this writes directly to main memory, so cache maintenance is * required to ensure that later cached reads of aff_info_state - * return AFF_STATE_OFF. A dsbish() ensures ordering of the + * return AFF_STATE_OFF. A dsbish() ensures ordering of the * update to the affinity info state prior to cache line * invalidation. */ - flush_cpu_data(psci_svc_cpu_data.aff_info_state); + psci_flush_cpu_data(psci_svc_cpu_data.aff_info_state); psci_set_aff_info_state(AFF_STATE_OFF); - dsbish(); - inv_cpu_data(psci_svc_cpu_data.aff_info_state); + psci_dsbish(); + psci_inv_cpu_data(psci_svc_cpu_data.aff_info_state); #if ENABLE_RUNTIME_INSTRUMENTATION diff --git a/lib/psci/psci_on.c b/lib/psci/psci_on.c index f4bb797..675ed66 100644 --- a/lib/psci/psci_on.c +++ b/lib/psci/psci_on.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -165,10 +165,12 @@ */ psci_plat_pm_ops->pwr_domain_on_finish(state_info); +#if !HW_ASSISTED_COHERENCY /* * Arch. management: Enable data cache and manage stack memory */ psci_do_pwrup_cache_maintenance(); +#endif /* * All the platform specific actions for turning this cpu diff --git a/lib/psci/psci_private.h b/lib/psci/psci_private.h index ca8291e..a27e215 100644 --- a/lib/psci/psci_private.h +++ b/lib/psci/psci_private.h @@ -38,17 +38,60 @@ #include #include +#if HW_ASSISTED_COHERENCY + /* - * The following helper macros abstract the interface to the Bakery - * Lock API. + * On systems with hardware-assisted coherency, make PSCI cache operations NOP, + * as PSCI participants are cache-coherent, and there's no need for explicit + * cache maintenance operations or barriers to coordinate their state. */ -#define psci_lock_init(non_cpu_pd_node, idx) \ - ((non_cpu_pd_node)[(idx)].lock_index = (idx)) +#define psci_flush_dcache_range(addr, size) +#define psci_flush_cpu_data(member) +#define psci_inv_cpu_data(member) + +#define psci_dsbish() + +/* + * On systems where participant CPUs are cache-coherent, we can use spinlocks + * instead of bakery locks. + */ +#define DEFINE_PSCI_LOCK(_name) spinlock_t _name +#define DECLARE_PSCI_LOCK(_name) extern DEFINE_PSCI_LOCK(_name) + +#define psci_lock_get(non_cpu_pd_node) \ + spin_lock(&psci_locks[(non_cpu_pd_node)->lock_index]) +#define psci_lock_release(non_cpu_pd_node) \ + spin_unlock(&psci_locks[(non_cpu_pd_node)->lock_index]) + +#else + +/* + * If not all PSCI participants are cache-coherent, perform cache maintenance + * and issue barriers wherever required to coordinate state. 
+ */ +#define psci_flush_dcache_range(addr, size) flush_dcache_range(addr, size) +#define psci_flush_cpu_data(member) flush_cpu_data(member) +#define psci_inv_cpu_data(member) inv_cpu_data(member) + +#define psci_dsbish() dsbish() + +/* + * Use bakery locks for state coordination as not all PSCI participants are + * cache coherent. + */ +#define DEFINE_PSCI_LOCK(_name) DEFINE_BAKERY_LOCK(_name) +#define DECLARE_PSCI_LOCK(_name) DECLARE_BAKERY_LOCK(_name) + #define psci_lock_get(non_cpu_pd_node) \ bakery_lock_get(&psci_locks[(non_cpu_pd_node)->lock_index]) #define psci_lock_release(non_cpu_pd_node) \ bakery_lock_release(&psci_locks[(non_cpu_pd_node)->lock_index]) +#endif + +#define psci_lock_init(non_cpu_pd_node, idx) \ + ((non_cpu_pd_node)[(idx)].lock_index = (idx)) + /* * The PSCI capability which are provided by the generic code but does not * depend on the platform or spd capabilities. @@ -166,8 +209,8 @@ extern cpu_pd_node_t psci_cpu_pd_nodes[PLATFORM_CORE_COUNT]; extern unsigned int psci_caps; -/* One bakery lock is required for each non-cpu power domain */ -DECLARE_BAKERY_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]); +/* One lock is required per non-CPU power domain node */ +DECLARE_PSCI_LOCK(psci_locks[PSCI_NUM_NON_CPU_PWR_DOMAINS]); /******************************************************************************* * SPD's power management hooks registered with PSCI @@ -204,6 +247,14 @@ void psci_print_power_domain_map(void); unsigned int psci_is_last_on_cpu(void); int psci_spd_migrate_info(u_register_t *mpidr); +void psci_do_pwrdown_sequence(unsigned int power_level); + +/* + * CPU power down is directly called only when HW_ASSISTED_COHERENCY is + * available. Otherwise, this needs post-call stack maintenance, which is + * handled in assembly. 
+ */ +void prepare_cpu_pwr_dwn(unsigned int power_level); /* Private exported functions from psci_on.c */ int psci_cpu_on_start(u_register_t target_cpu, diff --git a/lib/psci/psci_setup.c b/lib/psci/psci_setup.c index 7327b92..323dc62 100644 --- a/lib/psci/psci_setup.c +++ b/lib/psci/psci_setup.c @@ -86,7 +86,7 @@ /* Set the power state to OFF state */ svc_cpu_data->local_state = PLAT_MAX_OFF_STATE; - flush_dcache_range((uintptr_t)svc_cpu_data, + psci_flush_dcache_range((uintptr_t)svc_cpu_data, sizeof(*svc_cpu_data)); cm_set_context_by_index(node_idx, @@ -242,9 +242,9 @@ /* * Flush `psci_plat_pm_ops` as it will be accessed by secondary CPUs - * during warm boot before data cache is enabled. + * during warm boot, possibly before data cache is enabled. */ - flush_dcache_range((uintptr_t)&psci_plat_pm_ops, + psci_flush_dcache_range((uintptr_t)&psci_plat_pm_ops, sizeof(psci_plat_pm_ops)); /* Initialize the psci capability */ diff --git a/lib/psci/psci_suspend.c b/lib/psci/psci_suspend.c index 302116b..08c8fd6 100644 --- a/lib/psci/psci_suspend.c +++ b/lib/psci/psci_suspend.c @@ -91,10 +91,10 @@ psci_set_suspend_pwrlvl(end_pwrlvl); /* - * Flush the target power level as it will be accessed on power up with + * Flush the target power level as it might be accessed on power up with * Data cache disabled. */ - flush_cpu_data(psci_svc_cpu_data.target_pwrlvl); + psci_flush_cpu_data(psci_svc_cpu_data.target_pwrlvl); /* * Call the cpu suspend handler registered by the Secure Payload @@ -121,13 +121,11 @@ #endif /* - * Arch. management. Perform the necessary steps to flush all - * cpu caches. Currently we assume that the power level correspond - * the cache level. + * Arch. management. Initiate power down sequence. * TODO : Introduce a mechanism to query the cache level to flush * and the cpu-ops power down to perform from the platform. 
*/ - psci_do_pwrdown_cache_maintenance(max_off_lvl); + psci_do_pwrdown_sequence(max_off_lvl); #if ENABLE_RUNTIME_INSTRUMENTATION PMF_CAPTURE_TIMESTAMP(rt_instr_svc, @@ -304,12 +302,10 @@ */ psci_plat_pm_ops->pwr_domain_suspend_finish(state_info); - /* - * Arch. management: Enable the data cache, manage stack memory and - * restore the stashed EL3 architectural context from the 'cpu_context' - * structure for this cpu. - */ +#if !HW_ASSISTED_COHERENCY + /* Arch. management: Enable the data cache, stack memory maintenance. */ psci_do_pwrup_cache_maintenance(); +#endif /* Re-init the cntfrq_el0 register */ counter_freq = plat_get_syscnt_freq2(); diff --git a/make_helpers/defaults.mk b/make_helpers/defaults.mk index b47ea46..de506be 100644 --- a/make_helpers/defaults.mk +++ b/make_helpers/defaults.mk @@ -105,6 +105,10 @@ # For Chain of Trust GENERATE_COT := 0 +# Whether system coherency is managed in hardware, without explicit software +# operations. +HW_ASSISTED_COHERENCY := 0 + # Flag to enable new version of image loading LOAD_IMAGE_V2 := 0