diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index bed05ea7ec27eb236cc65b2007379fa6f61fb865..e7cbbdcdee1337cf8cefa5fbe74dab2e174eadf2 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -215,10 +215,7 @@ static inline unsigned short stap(void)
 /*
  * Give up the time slice of the virtual PU.
  */
-static inline void cpu_relax(void)
-{
-	barrier();
-}
+void cpu_relax(void);
 
 #define cpu_relax_lowlatency()  barrier()
 
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index dbdd33ee010204f0e03c9c1fd48ad30acad59b4c..26108232fcaaf049f4e4caa3938e537af5066ac6 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,16 +8,24 @@
 
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <asm/elf.h>
 #include <asm/lowcore.h>
 #include <asm/param.h>
+#include <asm/smp.h>
 
 static DEFINE_PER_CPU(struct cpuid, cpu_id);
 
+void cpu_relax(void)
+{
+	if (!smp_cpu_mtid && MACHINE_HAS_DIAG44)
+		asm volatile("diag 0,0,0x44");
+	barrier();
+}
+EXPORT_SYMBOL(cpu_relax);
+
 /*
  * cpu_init - initializes state that is per-CPU.
  */