linux/kernel/sched_clock.c
David Miller b9f8fcd55b sched: Fix cpu_clock() in NMIs, on !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
Relax stable-sched-clock architectures to not save/disable/restore
hardirqs in cpu_clock().

The background is that I was trying to resolve a sparc64 perf
issue when I discovered this problem.

On sparc64 I implement pseudo NMIs by simply running the kernel
at IRQ level 14 when local_irq_disable() is called, this allows
performance counter events to still come in at IRQ level 15.

This doesn't work if any code in an NMI handler does
local_irq_save() or local_irq_disable() since the "disable" will
kick us back to cpu IRQ level 14 thus letting NMIs back in and
we recurse.

The only code that does that in the perf event IRQ
handling path is the code supporting frequency based events.  It
uses cpu_clock().

cpu_clock() simply invokes sched_clock() with IRQs disabled.

And that's a fundamental bug all on its own, particularly for
the HAVE_UNSTABLE_SCHED_CLOCK case.  NMIs can thus get into the
sched_clock() code interrupting the local IRQ disable code
sections of it.

Furthermore, for the not-HAVE_UNSTABLE_SCHED_CLOCK case, the IRQ
disabling done by cpu_clock() is just pure overhead and
completely unnecessary.

So the core problem is that sched_clock() is not NMI safe, but
we are invoking it from NMI contexts in the perf events code
(via cpu_clock()).

A less important issue is the overhead of IRQ disabling when it
isn't necessary in cpu_clock().

CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures are not
affected by this patch.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20091213.182502.215092085.davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-15 09:04:36 +01:00

274 lines
5.6 KiB
C

/*
* sched_clock for unstable cpu clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Updates and enhancements:
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
*
* Based on code by:
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
*
* Create a semi stable clock from a mixture of other events, including:
* - gtod
* - sched_clock()
* - explicit idle events
*
* We use gtod as base and the unstable clock deltas. The deltas are filtered,
* making it monotonic and keeping it within an expected window.
*
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
* consistent between cpus (never more than 2 jiffies difference).
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is the default implementation.
* Architectures and sub-architectures can override this.
*/
unsigned long long __attribute__((weak)) sched_clock(void)
{
	/* Nanoseconds covered by a single jiffy at the configured HZ. */
	const unsigned long long nsec_per_jiffy = NSEC_PER_SEC / HZ;
	/* Jiffies elapsed relative to the INITIAL_JIFFIES starting offset. */
	unsigned long long elapsed = (unsigned long long)(jiffies - INITIAL_JIFFIES);

	return elapsed * nsec_per_jiffy;
}
/*
 * Set to 1 by sched_clock_init(). The code below tests it so that no
 * filtered clock value is handed out before initialisation.
 */
static __read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * When non-zero, sched_clock_cpu() returns sched_clock() directly and
 * sched_clock_tick() does nothing. It is not written anywhere in this file.
 */
__read_mostly int sched_clock_stable;
/*
 * Per-cpu state used to build the filtered clock.
 */
struct sched_clock_data {
/* sched_clock() reading stored by sched_clock_tick(); 0 after init */
u64 tick_raw;
/* ktime_get() time in ns, stored at init and by sched_clock_tick() */
u64 tick_gtod;
/* current filtered clock value; updated with cmpxchg64() */
u64 clock;
};
/* One sched_clock_data per cpu; reached through this_scd() and cpu_sdc(). */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
/*
 * Return the sched_clock_data belonging to the cpu we are running on.
 */
static inline struct sched_clock_data *this_scd(void)
{
	struct sched_clock_data *local = &__get_cpu_var(sched_clock_data);

	return local;
}
/*
 * Return the sched_clock_data belonging to @cpu.
 */
static inline struct sched_clock_data *cpu_sdc(int cpu)
{
	struct sched_clock_data *target = &per_cpu(sched_clock_data, cpu);

	return target;
}
/*
 * Seed every possible cpu's clock data from a single ktime_get() reading
 * and then mark the clock as running.
 */
void sched_clock_init(void)
{
	const u64 start_ns = ktime_to_ns(ktime_get());
	int cpu;

	for_each_possible_cpu(cpu) {
		struct sched_clock_data *data = cpu_sdc(cpu);

		/* No raw sample yet; gtod base and clock both start at "now". */
		data->tick_raw = 0;
		data->tick_gtod = start_ns;
		data->clock = start_ns;
	}

	sched_clock_running = 1;
}
/*
* min, max except they take wrapping into account
*/
/*
 * Wrap-aware minimum: the ordering is decided by the sign of the
 * difference (x - y) taken as a signed 64-bit value.
 */
static inline u64 wrap_min(u64 x, u64 y)
{
	s64 diff = (s64)(x - y);

	if (diff < 0)
		return x;

	return y;
}
/*
 * Wrap-aware maximum: the ordering is decided by the sign of the
 * difference (x - y) taken as a signed 64-bit value.
 */
static inline u64 wrap_max(u64 x, u64 y)
{
	s64 diff = (s64)(x - y);

	if (diff > 0)
		return x;

	return y;
}
/*
 * Update the per-cpu clock in @scd from a fresh sched_clock() reading and
 * return the new clock value.
 *
 * - filter out backward motion
 * - use the GTOD tick value to create a window to filter crazy TSC values
 *
 * The new value is published with cmpxchg64(); if scd->clock changed after
 * it was sampled, the whole computation is redone from a new reading.
 */
static u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock;
s64 delta;
again:
now = sched_clock();
/* Raw time elapsed since scd->tick_raw was recorded. */
delta = now - scd->tick_raw;
/* A raw clock that appears to have gone backwards contributes nothing. */
if (unlikely(delta < 0))
delta = 0;
old_clock = scd->clock;
/*
 * scd->clock = clamp(scd->tick_gtod + delta,
 *		      max(scd->tick_gtod, scd->clock),
 *		      max(scd->clock, scd->tick_gtod + TICK_NSEC));
 */
clock = scd->tick_gtod + delta;
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
/* Someone else updated scd->clock in the meantime: start over. */
if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
goto again;
return clock;
}
/*
 * Return a clock value for the remote cpu whose data is @scd, after
 * coupling it with this cpu's clock.
 *
 * This cpu's clock is refreshed first. Then whichever of the two clocks is
 * behind (wrap-aware comparison) is advanced to the other's value with
 * cmpxchg64(), retrying from a fresh sample if the target changed. The
 * value returned is the one both clocks end up at.
 */
static u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
/* Bring the local clock up to date before comparing. */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
/*
 * Couple the two clocks: we take the larger time as the
 * latest time for both cpus (this creates monotonic
 * movement). No lock is held here; the update is done
 * with cmpxchg64() below and retried if it fails.
 */
if (likely((s64)(remote_clock - this_clock) < 0)) {
/* Remote clock is behind: pull it forward to ours. */
ptr = &scd->clock;
old_val = remote_clock;
val = this_clock;
} else {
/*
 * Should be rare, but possible: our clock is not
 * ahead, so move it to the remote value instead.
 */
ptr = &my_scd->clock;
old_val = this_clock;
val = remote_clock;
}
if (cmpxchg64(ptr, old_val, val) != old_val)
goto again;
return val;
}
/*
 * Return the clock for @cpu.
 *
 * Warns once if called with irqs enabled. When sched_clock_stable is set
 * the raw sched_clock() value is returned; before sched_clock_init() has
 * run the result is 0. Otherwise the filtered per-cpu clock is used, via
 * the local or the remote path depending on whether @cpu is the current
 * cpu.
 */
u64 sched_clock_cpu(int cpu)
{
	struct sched_clock_data *target;

	WARN_ON_ONCE(!irqs_disabled());

	if (sched_clock_stable)
		return sched_clock();

	if (unlikely(!sched_clock_running))
		return 0ull;

	target = cpu_sdc(cpu);

	if (cpu == smp_processor_id())
		return sched_clock_local(target);

	return sched_clock_remote(target);
}
/*
 * Record a fresh (raw, gtod) sample pair for this cpu and then update its
 * filtered clock from it.
 *
 * Does nothing when sched_clock_stable is set or before sched_clock_init()
 * has run. Warns once if called with irqs enabled.
 */
void sched_clock_tick(void)
{
	struct sched_clock_data *scd;
	u64 gtod_ns, raw_ns;

	if (sched_clock_stable || unlikely(!sched_clock_running))
		return;

	WARN_ON_ONCE(!irqs_disabled());

	scd = this_scd();

	/* Sample gtod first, then the raw clock, as a matching pair. */
	gtod_ns = ktime_to_ns(ktime_get());
	raw_ns = sched_clock();

	scd->tick_raw = raw_ns;
	scd->tick_gtod = gtod_ns;

	sched_clock_local(scd);
}
/*
* We are going deep-idle (irqs are disabled):
*/
void sched_clock_idle_sleep_event(void)
{
	int this_cpu = smp_processor_id();

	/* Called only for its effect on clock state; the value is unused. */
	(void)sched_clock_cpu(this_cpu);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled delta nanoseconds (called with irqs disabled):
*/
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
	/*
	 * @delta_ns is part of the interface but is not used here.
	 * Nothing is done while timekeeping is suspended.
	 */
	if (!timekeeping_suspended) {
		sched_clock_tick();
		touch_softlockup_watchdog();
	}
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
/*
 * Return sched_clock_cpu(@cpu), callable with irqs in any state.
 *
 * sched_clock_cpu() warns when irqs are enabled, so they are saved and
 * disabled around the call and restored afterwards.
 */
unsigned long long cpu_clock(int cpu)
{
	unsigned long irq_flags;
	unsigned long long now;

	local_irq_save(irq_flags);
	now = sched_clock_cpu(cpu);
	local_irq_restore(irq_flags);

	return now;
}
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
 * Variant without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: there is no per-cpu
 * state to set up, so initialisation only sets sched_clock_running.
 */
void sched_clock_init(void)
{
sched_clock_running = 1;
}
/*
 * Variant without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: @cpu is ignored.
 * Returns 0 until sched_clock_init() has run, then plain sched_clock().
 */
u64 sched_clock_cpu(int cpu)
{
	return likely(sched_clock_running) ? sched_clock() : 0;
}
/*
 * Variant without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: forwards straight to
 * sched_clock_cpu() and does not save/disable/restore hardirqs.
 */
unsigned long long cpu_clock(int cpu)
{
	unsigned long long now = sched_clock_cpu(cpu);

	return now;
}
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/* cpu_clock() is defined in both branches of the #ifdef above; export it once. */
EXPORT_SYMBOL_GPL(cpu_clock);