Merge branches 'pm-em' and 'pm-cpuidle'

Marge Energy Model support updates and cpuidle updates for 5.19-rc1:

 - Update the Energy Model support code to allow the Energy Model to be
   artificial, which means that the power values may not be on a uniform
   scale with other devices providing power information, and update the
   cpufreq_cooling and devfreq_cooling thermal drivers to support
   artificial Energy Models (Lukasz Luba).

 - Make DTPM check the Energy Model type (Lukasz Luba).

 - Fix policy counter decrementation in cpufreq if Energy Model is in
   use (Pierre Gondois).

 - Add AlderLake processor support to the intel_idle driver (Zhang Rui).

 - Fix regression leading to no genpd governor in the PSCI cpuidle
   driver and fix the riscv-sbi cpuidle driver to allow a genpd
   governor to be used (Ulf Hansson).

* pm-em:
  PM: EM: Decrement policy counter
  powercap: DTPM: Check for Energy Model type
  thermal: cooling: Check Energy Model type in cpufreq_cooling and devfreq_cooling
  Documentation: EM: Add artificial EM registration description
  PM: EM: Remove old debugfs files and print all 'flags'
  PM: EM: Change the order of arguments in the .active_power() callback
  PM: EM: Use the new .get_cost() callback while registering EM
  PM: EM: Add artificial EM flag
  PM: EM: Add .get_cost() callback

* pm-cpuidle:
  cpuidle: riscv-sbi: Fix code to allow a genpd governor to be used
  cpuidle: psci: Fix regression leading to no genpd governor
  intel_idle: Add AlderLake support
This commit is contained in:
Rafael J. Wysocki 2022-05-23 19:18:51 +02:00
commit 16a23f394d
12 changed files with 240 additions and 51 deletions

View File

@ -123,6 +123,26 @@ allows a platform to register EM power values which are reflecting total power
(static + dynamic). These power values might be coming directly from
experiments and measurements.
Registration of 'artificial' EM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There is an option to provide a custom callback for drivers missing detailed
knowledge about power value for each performance state. The callback
.get_cost() is optional and provides the 'cost' values used by the EAS.
This is useful for platforms that only provide information on relative
efficiency between CPU types, where one could use the information to
create an abstract power model. But even an abstract power model can
sometimes be hard to fit in, given the input power value size restrictions.
The .get_cost() allows to provide the 'cost' values which reflect the
efficiency of the CPUs. This would allow to provide EAS information which
has different relation than what would be forced by the EM internal
formulas calculating 'cost' values. To register an EM for such platform, the
driver must set the flag 'milliwatts' to 0, provide .get_power() callback
and provide .get_cost() callback. The EM framework would handle such platform
properly during registration. A flag EM_PERF_DOMAIN_ARTIFICIAL is set for such
platform. Special care should be taken by other frameworks which are using EM
to test and treat this flag properly.
Registration of 'simple' EM
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -181,8 +201,8 @@ EM framework::
-> drivers/cpufreq/foo_cpufreq.c
01 static int est_power(unsigned long *mW, unsigned long *KHz,
02 struct device *dev)
01 static int est_power(struct device *dev, unsigned long *mW,
02 unsigned long *KHz)
03 {
04 long freq, power;
05

View File

@ -51,8 +51,8 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
};
static int __maybe_unused
mtk_cpufreq_get_cpu_power(unsigned long *mW,
unsigned long *KHz, struct device *cpu_dev)
mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
unsigned long *KHz)
{
struct mtk_cpufreq_data *data;
struct cpufreq_policy *policy;

View File

@ -96,8 +96,8 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
}
static int __maybe_unused
scmi_get_cpu_power(unsigned long *power, unsigned long *KHz,
struct device *cpu_dev)
scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
unsigned long *KHz)
{
unsigned long Hz;
int ret, domain;

View File

@ -52,7 +52,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
struct generic_pm_domain *pd;
struct psci_pd_provider *pd_provider;
struct dev_power_governor *pd_gov;
int ret = -ENOMEM, state_count = 0;
int ret = -ENOMEM;
pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node);
if (!pd)
@ -71,7 +71,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
pd->flags |= GENPD_FLAG_ALWAYS_ON;
/* Use governor for CPU PM domains if it has some states to manage. */
pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL;
pd_gov = pd->states ? &pm_domain_cpu_gov : NULL;
ret = pm_genpd_init(pd, pd_gov, false);
if (ret)

View File

@ -414,7 +414,7 @@ static int sbi_pd_init(struct device_node *np)
struct generic_pm_domain *pd;
struct sbi_pd_provider *pd_provider;
struct dev_power_governor *pd_gov;
int ret = -ENOMEM, state_count = 0;
int ret = -ENOMEM;
pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node);
if (!pd)
@ -433,7 +433,7 @@ static int sbi_pd_init(struct device_node *np)
pd->flags |= GENPD_FLAG_ALWAYS_ON;
/* Use governor for CPU PM domains if it has some states to manage. */
pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL;
pd_gov = pd->states ? &pm_domain_cpu_gov : NULL;
ret = pm_genpd_init(pd, pd_gov, false);
if (ret)

View File

@ -764,6 +764,106 @@ static struct cpuidle_state icx_cstates[] __initdata = {
.enter = NULL }
};
/*
* On AlderLake C1 has to be disabled if C1E is enabled, and vice versa.
* C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL.
* But in this case there is effectively no C1, because C1 requests are
* promoted to C1E. If the "C1E promotion" bit is cleared, then both C1
* and C1E requests end up with C1, so there is effectively no C1E.
*
* By default we enable C1E and disable C1 by marking it with
* 'CPUIDLE_FLAG_UNUSABLE'.
*/
static struct cpuidle_state adl_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE,
.exit_latency = 1,
.target_residency = 1,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 2,
.target_residency = 4,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6",
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 220,
.target_residency = 600,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C8",
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 280,
.target_residency = 800,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C10",
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 680,
.target_residency = 2000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.enter = NULL }
};
static struct cpuidle_state adl_l_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE,
.exit_latency = 1,
.target_residency = 1,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 2,
.target_residency = 4,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6",
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 170,
.target_residency = 500,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C8",
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 200,
.target_residency = 600,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C10",
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 230,
.target_residency = 700,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.enter = NULL }
};
/*
* On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
* versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
@ -1147,6 +1247,14 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_adl __initconst = {
.state_table = adl_cstates,
};
static const struct idle_cpu idle_cpu_adl_l __initconst = {
.state_table = adl_l_cstates,
};
static const struct idle_cpu idle_cpu_spr __initconst = {
.state_table = spr_cstates,
.disable_promotion_to_c1e = true,
@ -1215,6 +1323,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
@ -1573,6 +1683,25 @@ static void __init skx_idle_state_table_update(void)
}
}
/**
* adl_idle_state_table_update - Adjust AlderLake idle states table.
*/
static void __init adl_idle_state_table_update(void)
{
/* Check if user prefers C1 over C1E. */
if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) {
cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE;
cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE;
/* Disable C1E by clearing the "C1E promotion" bit. */
c1e_promotion = C1E_PROMOTION_DISABLE;
return;
}
/* Make sure C1E is enabled by default */
c1e_promotion = C1E_PROMOTION_ENABLE;
}
/**
* spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
*/
@ -1642,6 +1771,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
case INTEL_FAM6_SAPPHIRERAPIDS_X:
spr_idle_state_table_update();
break;
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
adl_idle_state_table_update();
break;
}
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {

View File

@ -1448,7 +1448,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
* Returns 0 on success or a proper -EINVAL value in case of error.
*/
static int __maybe_unused
_get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev)
_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
{
struct dev_pm_opp *opp;
unsigned long opp_freq, opp_power;
@ -1482,8 +1482,8 @@ _get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev)
* Returns -EINVAL if the power calculation failed because of missing
* parameters, 0 otherwise.
*/
static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz,
struct device *dev)
static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
unsigned long *kHz)
{
struct dev_pm_opp *opp;
struct device_node *np;

View File

@ -211,7 +211,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent)
return 0;
pd = em_cpu_get(cpu);
if (!pd)
if (!pd || em_is_artificial(pd))
return -EINVAL;
dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL);

View File

@ -328,7 +328,7 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
struct cpufreq_policy *policy;
unsigned int nr_levels;
if (!em)
if (!em || em_is_artificial(em))
return false;
policy = cpufreq_cdev->policy;

View File

@ -358,6 +358,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
struct thermal_cooling_device *cdev;
struct device *dev = df->dev.parent;
struct devfreq_cooling_device *dfc;
struct em_perf_domain *em;
char *name;
int err, num_opps;
@ -367,8 +368,9 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
dfc->devfreq = df;
dfc->em_pd = em_pd_get(dev);
if (dfc->em_pd) {
em = em_pd_get(dev);
if (em && !em_is_artificial(em)) {
dfc->em_pd = em;
devfreq_cooling_ops.get_requested_power =
devfreq_cooling_get_requested_power;
devfreq_cooling_ops.state2power = devfreq_cooling_state2power;
@ -379,7 +381,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
num_opps = em_pd_nr_perf_states(dfc->em_pd);
} else {
/* Backward compatibility for drivers which do not use IPA */
dev_dbg(dev, "missing EM for cooling device\n");
dev_dbg(dev, "missing proper EM for cooling device\n");
num_opps = dev_pm_opp_get_opp_count(dev);

View File

@ -67,11 +67,16 @@ struct em_perf_domain {
*
* EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
* energy consumption.
*
* EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be
* created by platform missing real power information
*/
#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
#define em_span_cpus(em) (to_cpumask((em)->cpus))
#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
#ifdef CONFIG_ENERGY_MODEL
#define EM_MAX_POWER 0xFFFF
@ -96,11 +101,11 @@ struct em_data_callback {
/**
* active_power() - Provide power at the next performance state of
* a device
* @dev : Device for which we do this operation (can be a CPU)
* @power : Active power at the performance state
* (modified)
* @freq : Frequency at the performance state in kHz
* (modified)
* @dev : Device for which we do this operation (can be a CPU)
*
* active_power() must find the lowest performance state of 'dev' above
* 'freq' and update 'power' and 'freq' to the matching active power
@ -112,11 +117,32 @@ struct em_data_callback {
*
* Return 0 on success.
*/
int (*active_power)(unsigned long *power, unsigned long *freq,
struct device *dev);
int (*active_power)(struct device *dev, unsigned long *power,
unsigned long *freq);
/**
* get_cost() - Provide the cost at the given performance state of
* a device
* @dev : Device for which we do this operation (can be a CPU)
* @freq : Frequency at the performance state in kHz
* @cost : The cost value for the performance state
* (modified)
*
* In case of CPUs, the cost is the one of a single CPU in the domain.
* It is expected to fit in the [0, EM_MAX_POWER] range due to internal
* usage in EAS calculation.
*
* Return 0 on success, or appropriate error value in case of failure.
*/
int (*get_cost)(struct device *dev, unsigned long freq,
unsigned long *cost);
};
#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = cb)
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \
{ .active_power = _active_power_cb, \
.get_cost = _cost_cb }
#define EM_DATA_CB(_active_power_cb) \
EM_ADV_DATA_CB(_active_power_cb, NULL)
struct em_perf_domain *em_cpu_get(int cpu);
struct em_perf_domain *em_pd_get(struct device *dev);
@ -264,6 +290,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
#else
struct em_data_callback {};
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { }
#define EM_DATA_CB(_active_power_cb) { }
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0)

View File

@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
static int em_debug_units_show(struct seq_file *s, void *unused)
static int em_debug_flags_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
"milliWatts" : "bogoWatts";
seq_printf(s, "%s\n", units);
seq_printf(s, "%#lx\n", pd->flags);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_units);
static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
seq_printf(s, "%d\n", enabled);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
&em_debug_cpus_fops);
debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
&em_debug_skip_inefficiencies_fops);
debugfs_create_file("flags", 0444, d, dev->em_pd,
&em_debug_flags_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {}
#endif
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
int nr_states, struct em_data_callback *cb)
int nr_states, struct em_data_callback *cb,
unsigned long flags)
{
unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
ret = cb->active_power(&power, &freq, dev);
ret = cb->active_power(dev, &power, &freq);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res = em_scale_power(table[i].power);
unsigned long power_res, cost;
if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
ret = cb->get_cost(dev, table[i].frequency, &cost);
if (ret || !cost || cost > EM_MAX_POWER) {
dev_err(dev, "EM: invalid cost %lu %d\n",
cost, ret);
goto free_ps_table;
}
} else {
power_res = em_scale_power(table[i].power);
cost = div64_u64(fmax * power_res, table[i].frequency);
}
table[i].cost = cost;
table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
if (table[i].cost >= prev_cost) {
table[i].flags = EM_PERF_STATE_INEFFICIENT;
dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
@ -197,7 +196,8 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
}
static int em_create_pd(struct device *dev, int nr_states,
struct em_data_callback *cb, cpumask_t *cpus)
struct em_data_callback *cb, cpumask_t *cpus,
unsigned long flags)
{
struct em_perf_domain *pd;
struct device *cpu_dev;
@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
ret = em_create_perf_table(dev, pd, nr_states, cb);
ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
if (ret) {
kfree(pd);
return ret;
@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev)
found++;
}
cpufreq_cpu_put(policy);
if (!found)
return;
@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
bool milliwatts)
{
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
int cpu, ret;
if (!dev || !nr_states || !cb)
@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
ret = em_create_pd(dev, nr_states, cb, cpus);
if (milliwatts)
flags |= EM_PERF_DOMAIN_MILLIWATTS;
else if (cb->get_cost)
flags |= EM_PERF_DOMAIN_ARTIFICIAL;
ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
if (milliwatts)
dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
dev->em_pd->flags |= flags;
em_cpufreq_update_efficiencies(dev);