From 6687a97d4041f996f725902d2990e5de6ef5cbe5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 24 Mar 2006 03:18:41 -0800
Subject: [PATCH] [PATCH] timer-irq-driven soft-watchdog, cleanups

Make the softlockup detector purely timer-interrupt driven, removing
softirq-context (timer) dependencies.  This means that if the softlockup
watchdog triggers, it has truly observed a scheduling delay of more than
10 seconds for a SCHED_FIFO prio 99 task.

(the patch also turns off the softlockup detector during the initial
bootup phase and does small style fixes)

Signed-off-by: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
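The detection scheme described above amounts to a per-CPU timestamp that a
high-priority watchdog thread refreshes once per second, while the timer
interrupt checks whether that timestamp has gone stale for more than 10
seconds.  Below is a minimal user-space sketch of that idea, assuming only
POSIX threads; it is not kernel code.  The names touch_watchdog(), watchdog(),
tick() and the file name softlockup_sketch.c are invented for the
illustration, while touch_timestamp and print_timestamp mirror the per-CPU
variables this patch renames.

/* build with: cc -pthread softlockup_sketch.c   (hypothetical file name) */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static atomic_long touch_timestamp;     /* last time the watchdog thread ran */
static atomic_long print_timestamp;     /* suppresses duplicate reports */

/* analogue of touch_softlockup_watchdog(): record "the watchdog just ran" */
static void touch_watchdog(void)
{
        atomic_store(&touch_timestamp, (long)time(NULL));
}

/* analogue of the per-CPU watchdog kthread: refresh the timestamp each second */
static void *watchdog(void *arg)
{
        (void)arg;
        for (;;) {
                touch_watchdog();
                sleep(1);       /* if this thread stalls, the tick below notices */
        }
        return NULL;
}

/* analogue of softlockup_tick(); a polling thread stands in for the timer irq */
static void *tick(void *arg)
{
        (void)arg;
        for (;;) {
                long touch = atomic_load(&touch_timestamp);

                /* warn once per stall when the timestamp is more than 10s old */
                if (atomic_load(&print_timestamp) != touch &&
                    (long)time(NULL) > touch + 10) {
                        atomic_store(&print_timestamp, touch);
                        fprintf(stderr, "BUG: soft lockup detected!\n");
                }
                sleep(1);
        }
        return NULL;
}

int main(void)
{
        pthread_t w, t;

        touch_watchdog();
        pthread_create(&w, NULL, watchdog, NULL);
        pthread_create(&t, NULL, tick, NULL);
        pthread_join(w, NULL);  /* runs until killed; nothing stalls by itself */
        return 0;
}

To see the warning fire, one would have to stall the watchdog thread (for
example by making it block on a lock that is never released) while the tick
thread keeps running, which is exactly the situation the in-kernel detector
reports.
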
 include/linux/sched.h |  4 ++--
 kernel/softlockup.c   | 54 ++++++++++++++++++++++++-------------------
 kernel/timer.c        |  2 +-
 3 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cda439ece4..e0054c1b9a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -206,11 +206,11 @@ extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-extern void softlockup_tick(struct pt_regs *regs);
+extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 #else
-static inline void softlockup_tick(struct pt_regs *regs)
+static inline void softlockup_tick(void)
 {
 }
 static inline void spawn_softlockup_task(void)
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d5..dd9524fa649 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
 /*
  * Detect Soft Lockups
  *
- * started by Ingo Molnar, (C) 2005, Red Hat
+ * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
  *
  * this code detects soft lockups: incidents in where on a CPU
  * the kernel does not reschedule for 10 seconds or more.
  */
-
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/init.h>
@@ -17,13 +16,14 @@
 
 static DEFINE_SPINLOCK(print_lock);
 
-static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
-static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int did_panic = 0;
-static int softlock_panic(struct notifier_block *this, unsigned long event,
-                void *ptr)
+
+static int
+softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
 {
         did_panic = 1;
 
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
 
 void touch_softlockup_watchdog(void)
 {
-        per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+        per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
  */
-void softlockup_tick(struct pt_regs *regs)
+void softlockup_tick(void)
 {
         int this_cpu = smp_processor_id();
-        unsigned long timestamp = per_cpu(timestamp, this_cpu);
+        unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
 
-        if (per_cpu(print_timestamp, this_cpu) == timestamp)
+        /* prevent double reports: */
+        if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+                        did_panic ||
+                        !per_cpu(watchdog_task, this_cpu))
                 return;
 
-        /* Do not cause a second panic when there already was one */
-        if (did_panic)
+        /* do not print during early bootup: */
+        if (unlikely(system_state != SYSTEM_RUNNING)) {
+                touch_softlockup_watchdog();
                 return;
+        }
 
-        if (time_after(jiffies, timestamp + 10*HZ)) {
-                per_cpu(print_timestamp, this_cpu) = timestamp;
+        /* Wake up the high-prio watchdog task every second: */
+        if (time_after(jiffies, touch_timestamp + HZ))
+                wake_up_process(per_cpu(watchdog_task, this_cpu));
+
+        /* Warn about unreasonable 10+ seconds delays: */
+        if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+                per_cpu(print_timestamp, this_cpu) = touch_timestamp;
 
                 spin_lock(&print_lock);
                 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
                         this_cpu);
-                show_regs(regs);
+                dump_stack();
                 spin_unlock(&print_lock);
         }
 }
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
         sched_setscheduler(current, SCHED_FIFO, &param);
         current->flags |= PF_NOFREEZE;
 
-        set_current_state(TASK_INTERRUPTIBLE);
-
         /*
-         * Run briefly once per second - if this gets delayed for
-         * more than 10 seconds then the debug-printout triggers
-         * in softlockup_tick():
+         * Run briefly once per second to reset the softlockup timestamp.
+         * If this gets delayed for more than 10 seconds then the
+         * debug-printout triggers in softlockup_tick().
          */
         while (!kthread_should_stop()) {
-                msleep_interruptible(1000);
+                set_current_state(TASK_INTERRUPTIBLE);
                 touch_softlockup_watchdog();
+                schedule();
         }
-        __set_current_state(TASK_RUNNING);
 
         return 0;
 }
@@ -114,7 +122,6 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 kthread_bind(p, hotcpu);
                 break;
         case CPU_ONLINE:
-
                 wake_up_process(per_cpu(watchdog_task, hotcpu));
                 break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -146,4 +153,3 @@ __init void spawn_softlockup_task(void)
 
         notifier_chain_register(&panic_notifier_list, &panic_block);
 }
-
diff --git a/kernel/timer.c b/kernel/timer.c
index 4427e725ccd..17d956cebcb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -915,6 +915,7 @@ static void run_timer_softirq(struct softirq_action *h)
 void run_local_timers(void)
 {
         raise_softirq(TIMER_SOFTIRQ);
+        softlockup_tick();
 }
 
 /*
@@ -945,7 +946,6 @@ void do_timer(struct pt_regs *regs)
         /* prevent loading jiffies before storing new jiffies_64 value. */
         barrier();
         update_times();
-        softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
2.41.1