diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8b97308e65df3ccf094d79af23266e2da5fc73af..9ca008f0c542935440f3c39c0a5fb070dd6f5e8c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -259,6 +259,9 @@ extern void __bad_size_call_parameter(void);
  * Special handling for cmpxchg_double.  cmpxchg_double is passed two
  * percpu variables.  The first has to be aligned to a double word
  * boundary and the second has to follow directly thereafter.
+ * We enforce this on all architectures, even those without a double
+ * cmpxchg instruction: the alignment is cheap, and it keeps generic code
+ * from violating a layout that architectures with the instruction rely on.
  */
 #define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...)		\
 ({									\
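For context on the requirement documented above: any user of
this_cpu_cmpxchg_double() has to lay out its two per-cpu words adjacently,
with the first on a double-word boundary, and the macro body sanity-checks
that layout. A minimal sketch of a conforming user follows; pcpu_pair,
pair, pair_init() and pair_update() are illustrative names, not code from
this patch:

#include <linux/percpu.h>

/* Two words updated atomically as a pair by this_cpu_cmpxchg_double(). */
struct pcpu_pair {
	void *first;		/* must sit on a 2 * sizeof(void *) boundary */
	unsigned long second;	/* must directly follow 'first' */
};

static struct pcpu_pair __percpu *pair;

static int pair_init(void)
{
	/* The alignment argument satisfies the rule stated above. */
	pair = __alloc_percpu(sizeof(struct pcpu_pair), 2 * sizeof(void *));
	return pair ? 0 : -ENOMEM;
}

static bool pair_update(void *old1, unsigned long old2,
			void *new1, unsigned long new2)
{
	/* Succeeds, replacing both words, only if both still match. */
	return this_cpu_cmpxchg_double(pair->first, pair->second,
				       old1, old2, new1, new2);
}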
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8ae2c9fc273f1baf75d4461ed743052..d96e223de775378f78d7597d8c2d87a9e24e70a2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released.  Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+    void *caller)
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
-	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+	objp = cache_free_debugcheck(cachep, objp, caller);
 
 	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
 
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	debug_check_no_locks_freed(objp, obj_size(cachep));
 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(objp, obj_size(cachep));
-	__cache_free(cachep, objp);
+	__cache_free(cachep, objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 
 	trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
 	c = virt_to_cache(objp);
 	debug_check_no_locks_freed(objp, obj_size(c));
 	debug_check_no_obj_freed(objp, obj_size(c));
-	__cache_free(c, (void *)objp);
+	__cache_free(c, (void *)objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
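The slab.c hunks above all follow one pattern: capture
__builtin_return_address(0) once, at the exported entry points
kmem_cache_free() and kfree(), and thread it down into the inline
__cache_free(), so the debug bookkeeping records the actual last user of
the object rather than an allocator-internal address. The same pattern in
isolation; struct pool, pool_free(), release_obj() and debug_record_free()
are hypothetical names, not from this patch:

struct pool { void *objs; };

/* Hypothetical debug hook: remember 'caller' as the object's last user. */
static void debug_record_free(struct pool *p, void *obj, void *caller)
{
}

static inline void release_obj(struct pool *p, void *obj, void *caller)
{
	debug_record_free(p, obj, caller);
	/* ... actually return obj to the pool ... */
}

void pool_free(struct pool *p, void *obj)
{
	/*
	 * Evaluated here, __builtin_return_address(0) is pool_free()'s
	 * caller, regardless of how the compiler inlines release_obj().
	 */
	release_obj(p, obj, __builtin_return_address(0));
}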
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b090dbd9f716104714bc4d9ca9c7b5..35f351f26193a47145cd8bcb7f55ae091331ca97 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
-#ifdef CONFIG_CMPXCHG_LOCAL
 	/*
-	 * Must align to double word boundary for the double cmpxchg instructions
-	 * to work.
+	 * Must align to double word boundary for the double cmpxchg
+	 * instructions to work; see __pcpu_double_call_return_bool().
 	 */
-	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
-#else
-	/* Regular alignment is sufficient */
-	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#endif
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+				     2 * sizeof(void *));
 
 	if (!s->cpu_slab)
 		return 0;
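With the #ifdef gone, the double-word alignment is requested on every
architecture; per the new percpu.h comment it is cheap even where no
double cmpxchg instruction exists. If one wanted to verify the guarantee
at runtime, a sketch like the following could sit after the allocation
(illustrative only, not part of this patch; 's' is the struct kmem_cache
argument of alloc_kmem_cache_cpus()):

	int cpu;

	for_each_possible_cpu(cpu) {
		struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);

		/* __alloc_percpu(size, 2 * sizeof(void *)) guarantees this. */
		WARN_ON((unsigned long)c & (2 * sizeof(void *) - 1));
	}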