[PATCH] x86-64: Dynamically adjust machine check interval

author Tim Hockin <thockin@google.com>

Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)

committer Andi Kleen <andi@basil.nowhere.org>

Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)
author Tim Hockin <thockin@google.com>
Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)
committer Andi Kleen <andi@basil.nowhere.org>
Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck

index 068a6d9904b99a60a5c554bc5c1cfe2f2e0bcd7a..feaeaf6f6e4dd45dc03028886764d35477d2b3b9 100644 (file)
--- a/Documentation/x86_64/machinecheck
+++ b/Documentation/x86_64/machinecheck
@@ -36,7 +36,12 @@ between all CPUs.
  
  check_interval
         How often to poll for corrected machine check errors, in seconds
-       (Note output is hexademical). Default 5 minutes.
+       (Note output is hexademical). Default 5 minutes.  When the poller
+       finds MCEs it triggers an exponential speedup (poll more often) on
+       the polling interval.  When the poller stops finding MCEs, it
+       triggers an exponential backoff (poll less often) on the polling
+       interval. The check_interval variable is both the initial and
+       maximum polling interval.
  
  tolerant
         Tolerance level. When a machine check exception occurs for a non
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c

index 8011a8e1c7d41d8a5baf2876c87697b37eec39ca..fa26726824775e774682476f072836e5a29cb691 100644 (file)
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -323,10 +323,13 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
  #endif /* CONFIG_X86_MCE_INTEL */
  
  /*
- * Periodic polling timer for "silent" machine check errors.
+ * Periodic polling timer for "silent" machine check errors.  If the
+ * poller finds an MCE, poll 2x faster.  When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
   */
  
  static int check_interval = 5 * 60; /* 5 minutes */
+static int next_interval; /* in jiffies */
  static void mcheck_timer(struct work_struct *work);
  static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
  
@@ -339,7 +342,6 @@ static void mcheck_check_cpu(void *info)
  static void mcheck_timer(struct work_struct *work)
  {
         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
-       schedule_delayed_work(&mcheck_work, check_interval * HZ);
  
         /*
          * It's ok to read stale data here for notify_user and
@@ -349,17 +351,30 @@ static void mcheck_timer(struct work_struct *work)
          * writes.
          */
         if (notify_user && console_logged) {
+               static unsigned long last_print;
+               unsigned long now = jiffies;
+
+               /* if we logged an MCE, reduce the polling interval */
+               next_interval = max(next_interval/2, HZ/100);
                 notify_user = 0;
                 clear_bit(0, &console_logged);
-               printk(KERN_INFO "Machine check events logged\n");
+               if (time_after_eq(now, last_print + (check_interval*HZ))) {
+                       last_print = now;
+                       printk(KERN_INFO "Machine check events logged\n");
+               }
+       } else {
+               next_interval = min(next_interval*2, check_interval*HZ);
         }
+
+       schedule_delayed_work(&mcheck_work, next_interval);
  }
  
  
  static __init int periodic_mcheck_init(void)
  { 
-       if (check_interval)
-               schedule_delayed_work(&mcheck_work, check_interval*HZ);
+       next_interval = check_interval * HZ;
+       if (next_interval)
+               schedule_delayed_work(&mcheck_work, next_interval);
         return 0;
  } 
  __initcall(periodic_mcheck_init);
@@ -597,12 +612,13 @@ static int mce_resume(struct sys_device *dev)
  /* Reinit MCEs after user configuration changes */
  static void mce_restart(void) 
  { 
-       if (check_interval)
+       if (next_interval)
                 cancel_delayed_work(&mcheck_work);
         /* Timer race is harmless here */
         on_each_cpu(mce_init, NULL, 1, 1);       
-       if (check_interval)
-               schedule_delayed_work(&mcheck_work, check_interval*HZ);
+       next_interval = check_interval * HZ;
+       if (next_interval)
+               schedule_delayed_work(&mcheck_work, next_interval);
  }
  
  static struct sysdev_class mce_sysclass = {
author	Tim Hockin <thockin@google.com>
	Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)
committer	Andi Kleen <andi@basil.nowhere.org>
	Wed, 2 May 2007 17:27:19 +0000 (19:27 +0200)
Documentation/x86_64/machinecheck		patch \| blob \| history
arch/x86_64/kernel/mce.c		patch \| blob \| history