diff --git a/arch/mips/include/asm/r4k-timer.h b/arch/mips/include/asm/r4k-timer.h
index 6e7361629348..432e61dd5204 100644
--- a/arch/mips/include/asm/r4k-timer.h
+++ b/arch/mips/include/asm/r4k-timer.h
@@ -12,15 +12,10 @@
 
 #ifdef CONFIG_SYNC_R4K
 
-extern void synchronise_count_master(int cpu);
 extern void synchronise_count_slave(int cpu);
 
 #else
 
-static inline void synchronise_count_master(int cpu)
-{
-}
-
 static inline void synchronise_count_slave(int cpu)
 {
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 0b53d35a116e..0362fc5df7b0 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -462,8 +462,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 		return -EIO;
 	}
 
-	synchronise_count_master(cpu);
-
 	/* Wait for CPU to finish startup & mark itself online before return */
 	wait_for_completion(&cpu_running);
 	return 0;
diff --git a/arch/mips/kernel/sync-r4k.c b/arch/mips/kernel/sync-r4k.c
index abdd7aaa3311..39156592582e 100644
--- a/arch/mips/kernel/sync-r4k.c
+++ b/arch/mips/kernel/sync-r4k.c
@@ -2,121 +2,244 @@
 /*
  * Count register synchronisation.
  *
- * All CPUs will have their count registers synchronised to the CPU0 next time
- * value. This can cause a small timewarp for CPU0. All other CPU's should
- * not have done anything significant (but they may have had interrupts
- * enabled briefly - prom_smp_finish() should not be responsible for enabling
- * interrupts...)
+ * Derived from arch/x86/kernel/tsc_sync.c
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
  */
 
 #include <linux/kernel.h>
 #include <linux/irqflags.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
+#include <linux/nmi.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
 
 #include <asm/r4k-timer.h>
-#include <linux/atomic.h>
-#include <asm/barrier.h>
 #include <asm/mipsregs.h>
+#include <asm/time.h>
 
-static unsigned int initcount = 0;
-static atomic_t count_count_start = ATOMIC_INIT(0);
-static atomic_t count_count_stop = ATOMIC_INIT(0);
+#define COUNTON		100	/* count ticks until the first compare irq */
+#define NR_LOOPS	3	/* initial value of test_runs */
+#define LOOP_TIMEOUT	20	/* measurement window, in milliseconds */
 
-#define COUNTON 100
-#define NR_LOOPS 3
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static atomic_t start_count;
+static atomic_t stop_count;
+static atomic_t test_runs;	/* measurement attempts left; 0 ends the test */
 
-void synchronise_count_master(int cpu)
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove counter time-warps:
+ */
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+static uint32_t last_counter;	/* last count value read under sync_lock */
+static uint32_t max_warp;	/* largest backwards step seen so far */
+static int nr_warps;		/* number of backwards steps seen so far */
+static int random_warps;	/* warps seen after the other CPU saw one */
+
+/*
+ * Warp measurement loop run on both CPUs; returns 0 if this CPU saw no warp.
+ */
+static uint32_t check_counter_warp(void)
 {
-	int i;
-	unsigned long flags;
+	uint32_t start, now, prev, end, cur_max_warp = 0;
+	int i, cur_warps = 0;
 
-	pr_info("Synchronize counters for CPU %u: ", cpu);
+	start = read_c0_count();
+	end = start + (uint32_t) mips_hpt_frequency / 1000 * LOOP_TIMEOUT;
 
-	local_irq_save(flags);
-
-	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronised and
-	 * the master and slaves each set their cycle counters to a known
-	 * value all at once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
-	 */
-
-	for (i = 0; i < NR_LOOPS; i++) {
-		/* slaves loop on '!= 2' */
-		while (atomic_read(&count_count_start) != 1)
-			mb();
-		atomic_set(&count_count_stop, 0);
-		smp_wmb();
-
-		/* Let the slave writes its count register */
-		atomic_inc(&count_count_start);
-
-		/* Count will be initialised to current timer */
-		if (i == 1)
-			initcount = read_c0_count();
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure counter, save the
+		 * previous counter that was measured (possibly on
+		 * another CPU) and update the previous counter timestamp.
+		 */
+		arch_spin_lock(&sync_lock);
+		prev = last_counter;
+		now = read_c0_count();
+		last_counter = now;
+		arch_spin_unlock(&sync_lock);
 
 		/*
-		 * Everyone initialises count in the last loop:
+		 * Be nice every now and then and check whether the
+		 * measurement is done. The end test uses a signed delta so
+		 * it survives 32-bit count wrap; the 10 million loops safety
+		 * exit keeps us from locking up if the counter is broken:
 		 */
-		if (i == NR_LOOPS-1)
-			write_c0_count(initcount);
-
+		if (unlikely(!(i & 7))) {
+			if ((int32_t)(now - end) > 0 || i > 10000000)
+				break;
+			cpu_relax();
+			touch_nmi_watchdog();
+		}
 		/*
-		 * Wait for slave to leave the synchronization point:
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the counter going backwards:
 		 */
-		while (atomic_read(&count_count_stop) != 1)
-			mb();
-		atomic_set(&count_count_start, 0);
-		smp_wmb();
-		atomic_inc(&count_count_stop);
+		if (unlikely(prev > now)) {
+			arch_spin_lock(&sync_lock);
+			max_warp = max(max_warp, prev - now);
+			cur_max_warp = max_warp;
+			/*
+			 * Check whether this bounces back and forth. Only
+			 * one CPU should observe time going backwards.
+			 */
+			if (cur_warps != nr_warps)
+				random_warps++;
+			nr_warps++;
+			cur_warps = nr_warps;
+			arch_spin_unlock(&sync_lock);
+		}
 	}
-	/* Arrange for an interrupt in a short while */
-	write_c0_compare(read_c0_count() + COUNTON);
-
-	local_irq_restore(flags);
-
-	/*
-	 * i386 code reported the skew here, but the
-	 * count registers were almost certainly out of sync
-	 * so no point in alarming people
-	 */
-	pr_cont("done.\n");
+	WARN(!(now-start),
+		"Warning: zero counter calibration delta: %d [max: %d]\n",
+			now-start, end-start);
+	return cur_max_warp;
 }
 
+/*
+ * Control-CPU side of the counter check. The freshly booted CPU initiates
+ * this via an async SMP function call, passing its CPU number in @__cpu.
+ * On return test_runs is 0: the check either passed or failed terminally.
+ */
+static void check_counter_sync_source(void *__cpu)
+{
+	unsigned int cpu = (unsigned long)__cpu;
+	int cpus = 2;
+
+	atomic_set(&test_runs, NR_LOOPS);
+retry:
+	/* Wait for the target to start. */
+	while (atomic_read(&start_count) != cpus - 1)
+		cpu_relax();
+
+	/* Trigger the target to continue into the measurement too: */
+	atomic_inc(&start_count);
+
+	check_counter_warp();
+
+	/* Wait for the target to finish its measurement loop. */
+	while (atomic_read(&stop_count) != cpus - 1)
+		cpu_relax();
+
+	/*
+	 * If the test was successful set the number of runs to zero and
+	 * stop. If not, decrement the number of runs and check if we can
+	 * retry. In case of random warps no retry is attempted.
+	 */
+	if (!nr_warps) {
+		atomic_set(&test_runs, 0);
+
+		pr_info("Counter synchronization [CPU#%d -> CPU#%u]: passed\n",
+			smp_processor_id(), cpu);
+	} else if (atomic_dec_and_test(&test_runs) || random_warps) {
+		/* Force it to 0 if random warps brought us here */
+		atomic_set(&test_runs, 0);
+
+		pr_info("Counter synchronization [CPU#%d -> CPU#%u]:\n",
+			smp_processor_id(), cpu);
+		pr_info("Measured %u cycles counter warp between CPUs\n",
+			max_warp);
+		if (random_warps)
+			pr_warn("Counter warped randomly between CPUs\n");
+	}
+
+	/*
+	 * Reset it - just in case we boot another CPU later:
+	 */
+	atomic_set(&start_count, 0);
+	random_warps = 0;
+	nr_warps = 0;
+	max_warp = 0;
+	last_counter = 0;
+
+	/* Let the target continue with the bootup: */
+	atomic_inc(&stop_count);
+
+	/*
+	 * Retry, if there is a chance to do so.
+	 */
+	if (atomic_read(&test_runs) > 0)
+		goto retry;
+}
+
+/*
+ * Freshly booted CPUs call into this. The check is skipped when the CPU has
+ * no counter or mips_hpt_frequency is unset; otherwise this CPU's count is
+ * compared against the first online CPU and adjusted on each failed run.
+ */
 void synchronise_count_slave(int cpu)
 {
-	int i;
-	unsigned long flags;
+	uint32_t cur_max_warp, gbl_max_warp, count;
+	int cpus = 2;
 
-	local_irq_save(flags);
+	if (!cpu_has_counter || !mips_hpt_frequency)
+		return;
+
+	/* Kick the control CPU into the counter synchronization function */
+	smp_call_function_single(cpumask_first(cpu_online_mask),
+				 check_counter_sync_source,
+				 (void *)(unsigned long)cpu, 0);
+retry:
+	/*
+	 * Register this CPU's participation and wait for the
+	 * source CPU to start the measurement:
+	 */
+	atomic_inc(&start_count);
+	while (atomic_read(&start_count) != cpus)
+		cpu_relax();
+
+	cur_max_warp = check_counter_warp();
 
 	/*
-	 * Not every cpu is online at the time this gets called,
-	 * so we first wait for the master to say everyone is ready
+	 * Store the maximum observed warp value for a potential retry:
 	 */
+	gbl_max_warp = max_warp;
 
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&count_count_start);
-		while (atomic_read(&count_count_start) != 2)
-			mb();
+	/* Tell the source CPU that this CPU's measurement is done: */
+	atomic_inc(&stop_count);
 
-		/*
-		 * Everyone initialises count in the last loop:
-		 */
-		if (i == NR_LOOPS-1)
-			write_c0_count(initcount);
+	/*
+	 * Wait for the source CPU to evaluate and print the result:
+	 */
+	while (atomic_read(&stop_count) != cpus)
+		cpu_relax();
 
-		atomic_inc(&count_count_stop);
-		while (atomic_read(&count_count_stop) != 2)
-			mb();
+	/*
+	 * Reset it for the next sync test:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Check the number of remaining test runs. If not zero, the test
+	 * failed and a retry with adjusted counter is possible. If zero the
+	 * test was either successful or failed terminally.
+	 */
+	if (!atomic_read(&test_runs)) {
+		/* Arrange for an interrupt in a short while */
+		write_c0_compare(read_c0_count() + COUNTON);
+		return;
 	}
-	/* Arrange for an interrupt in a short while */
-	write_c0_compare(read_c0_count() + COUNTON);
 
-	local_irq_restore(flags);
+	/*
+	 * If the warp value of this CPU is 0, then the other CPU
+	 * observed time going backwards so this counter was ahead and
+	 * needs to move backwards.
+	 */
+	if (!cur_max_warp)
+		cur_max_warp = -gbl_max_warp;
+
+	count = read_c0_count();
+	count += cur_max_warp;
+	write_c0_count(count);
+
+	pr_debug("Counter compensate: CPU%u observed %d warp\n", cpu, cur_max_warp);
+
+	goto retry;
+
 }
-#undef NR_LOOPS