diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8733337e05091c82d9705d8e6c224118f4..36172adda9d0a9314473b00b8f01126a714a8b5b 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -1,6 +1,9 @@
 #ifndef __ASMARM_TLS_H
 #define __ASMARM_TLS_H
 
+#include <linux/compiler.h>
+#include <asm/thread_info.h>
+
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
 	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
@@ -50,6 +53,51 @@
 #endif
 
 #ifndef __ASSEMBLY__
+
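+/*
+ * Update the shadow TLS value in thread_info and then the hardware
+ * state: TPIDRURO when the CPU has the register, otherwise the kuser
+ * helper word at 0xffff0ff0.
+ */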
+static inline void set_tls(unsigned long val)
+{
+	struct thread_info *thread;
+
+	thread = current_thread_info();
+
+	thread->tp_value[0] = val;
+
+	/*
+	 * This code runs with preemption enabled and therefore must
+	 * be reentrant with respect to switch_tls.
+	 *
+	 * We need to ensure ordering between the shadow state and the
+	 * hardware state, so that we don't corrupt the hardware state
+	 * with a stale shadow state during context switch.
+	 *
+	 * If we're preempted here, switch_tls will load TPIDRURO from
+	 * thread_info upon resuming execution and the following mcr
+	 * is merely redundant.
+	 */
+	barrier();
+
+	if (!tls_emu) {
+		if (has_tls_reg) {
+			asm("mcr p15, 0, %0, c13, c0, 3"
+			    : : "r" (val));
+		} else {
+			/*
+			 * User space must never try to access this
+			 * directly.  Expect your app to break
+			 * eventually if you do so.  The user helper
+			 * at 0xffff0fe0 must be used instead.  (see
+			 * entry-armv.S for details)
+			 */
+			*((unsigned int *)0xffff0ff0) = val;
+		}
+	}
+}
+
 static inline unsigned long get_tpuser(void)
 {
 	unsigned long reg = 0;
@@ -59,5 +107,28 @@ static inline unsigned long get_tpuser(void)
 
 	return reg;
 }
+
+static inline void set_tpuser(unsigned long val)
+{
+	/*
+	 * Since TPIDRURW is fully context-switched (unlike TPIDRURO),
+	 * we need not update thread_info.
+	 */
+	if (has_tls_reg && !tls_emu) {
+		asm("mcr p15, 0, %0, c13, c0, 2"
+		    : : "r" (val));
+	}
+}
+
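+/*
+ * Reset both TLS registers; called from flush_thread() on exec so the
+ * new image does not inherit the old thread's TLS values.
+ */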
+static inline void flush_tls(void)
+{
+	set_tls(0);
+	set_tpuser(0);
+}
+
 #endif
 #endif	/* __ASMARM_TLS_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index a4cd7af475e90de4bb57398cfd569e06f15a1135..4767eb9caa78c89fe0c6b447b0d234fb71738285 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -107,8 +107,11 @@ static inline void set_fs(mm_segment_t fs)
 extern int __get_user_1(void *);
 extern int __get_user_2(void *);
 extern int __get_user_4(void *);
-extern int __get_user_lo8(void *);
+extern int __get_user_32t_8(void *);
 extern int __get_user_8(void *);
+extern int __get_user_64t_1(void *);
+extern int __get_user_64t_2(void *);
+extern int __get_user_64t_4(void *);
 
 #define __GUP_CLOBBER_1	"lr", "cc"
 #ifdef CONFIG_CPU_USE_DOMAINS
@@ -117,7 +120,7 @@ extern int __get_user_8(void *);
 #define __GUP_CLOBBER_2 "lr", "cc"
 #endif
 #define __GUP_CLOBBER_4	"lr", "cc"
-#define __GUP_CLOBBER_lo8 "lr", "cc"
+#define __GUP_CLOBBER_32t_8 "lr", "cc"
 #define __GUP_CLOBBER_8	"lr", "cc"
 
 #define __get_user_x(__r2,__p,__e,__l,__s)				\
@@ -131,12 +134,34 @@ extern int __get_user_8(void *);
 
 /* narrowing a double-word get into a single 32bit word register: */
 #ifdef __ARMEB__
-#define __get_user_xb(__r2, __p, __e, __l, __s)				\
-	__get_user_x(__r2, __p, __e, __l, lo8)
+#define __get_user_x_32t(__r2, __p, __e, __l, __s)				\
+	__get_user_x(__r2, __p, __e, __l, 32t_8)
 #else
-#define __get_user_xb __get_user_x
+#define __get_user_x_32t __get_user_x
 #endif
 
+/*
+ * Store the result in the least significant word of a 64-bit target
+ * variable; only the big-endian case differs, since there the LSW of
+ * the 64-bit __r2 register pair is r3.
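+ *
+ * e.g. a get_user() of a 32-bit quantity into a u64 variable must
+ * place the loaded word in r3, the low word of the result that the
+ * C side consumes.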
+ */
+#ifdef __ARMEB__
+#define __get_user_x_64t(__r2, __p, __e, __l, __s)		        \
+	   __asm__ __volatile__ (					\
+		__asmeq("%0", "r0") __asmeq("%1", "r2")			\
+		__asmeq("%3", "r1")					\
+		"bl	__get_user_64t_" #__s				\
+		: "=&r" (__e), "=r" (__r2)				\
+		: "0" (__p), "r" (__l)					\
+		: __GUP_CLOBBER_##__s)
+#else
+#define __get_user_x_64t __get_user_x
+#endif
+
 #define __get_user_check(x,p)							\
 	({								\
 		unsigned long __limit = current_thread_info()->addr_limit - 1; \
@@ -146,17 +171,26 @@ extern int __get_user_8(void *);
 		register int __e asm("r0");				\
 		switch (sizeof(*(__p))) {				\
 		case 1:							\
-			__get_user_x(__r2, __p, __e, __l, 1);		\
+			if (sizeof((x)) >= 8)				\
+				__get_user_x_64t(__r2, __p, __e, __l, 1); \
+			else						\
+				__get_user_x(__r2, __p, __e, __l, 1);	\
 			break;						\
 		case 2:							\
-			__get_user_x(__r2, __p, __e, __l, 2);		\
+			if (sizeof((x)) >= 8)				\
+				__get_user_x_64t(__r2, __p, __e, __l, 2); \
+			else						\
+				__get_user_x(__r2, __p, __e, __l, 2);	\
 			break;						\
 		case 4:							\
-			__get_user_x(__r2, __p, __e, __l, 4);		\
+			if (sizeof((x)) >= 8)				\
+				__get_user_x_64t(__r2, __p, __e, __l, 4); \
+			else						\
+				__get_user_x(__r2, __p, __e, __l, 4);	\
 			break;						\
 		case 8:							\
 			if (sizeof((x)) < 8)				\
-				__get_user_xb(__r2, __p, __e, __l, 4);	\
+				__get_user_x_32t(__r2, __p, __e, __l, 4); \
 			else						\
 				__get_user_x(__r2, __p, __e, __l, 8);	\
 			break;						\
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index f7b450f97e6884bf5d28de653e7d291f809ab1d1..a88671cfe1ffb1e1ee43b06a7c8c6fd704f38580 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -98,6 +98,14 @@ EXPORT_SYMBOL(__clear_user);
 EXPORT_SYMBOL(__get_user_1);
 EXPORT_SYMBOL(__get_user_2);
 EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
+
+#ifdef __ARMEB__
+EXPORT_SYMBOL(__get_user_64t_1);
+EXPORT_SYMBOL(__get_user_64t_2);
+EXPORT_SYMBOL(__get_user_64t_4);
+EXPORT_SYMBOL(__get_user_32t_8);
+#endif
 
 EXPORT_SYMBOL(__put_user_1);
 EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 2c425760451340b41a9d5748f5edf0e0bcbcaaf3..5c4d38e32a5128a48ce20cc9d2757465e4e7b935 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -175,7 +175,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	c = irq_data_get_irq_chip(d);
 	if (!c->irq_set_affinity)
 		pr_debug("IRQ%u: unable to set affinity\n", d->irq);
-	else if (c->irq_set_affinity(d, affinity, true) == IRQ_SET_MASK_OK && ret)
+	else if (c->irq_set_affinity(d, affinity, false) == IRQ_SET_MASK_OK && ret)
 		cpumask_copy(d->affinity, affinity);
 
 	return ret;
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index e6a6edbec6135de27cf0d6426185cec90654f6fc..4bf4cce759fe1cf6d56f25c6d4141124709c7a33 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -76,21 +76,16 @@ static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
 
 static void cpu_pmu_enable_percpu_irq(void *data)
 {
-	struct arm_pmu *cpu_pmu = data;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	int irq = platform_get_irq(pmu_device, 0);
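+	/* the caller resolves the IRQ once and hands us a pointer to it */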
+	int irq = *(int *)data;
 
 	enable_percpu_irq(irq, IRQ_TYPE_NONE);
-	cpumask_set_cpu(smp_processor_id(), &cpu_pmu->active_irqs);
 }
 
 static void cpu_pmu_disable_percpu_irq(void *data)
 {
-	struct arm_pmu *cpu_pmu = data;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	int irq = platform_get_irq(pmu_device, 0);
+	int irq = *(int *)data;
 
-	cpumask_clear_cpu(smp_processor_id(), &cpu_pmu->active_irqs);
 	disable_percpu_irq(irq);
 }
 
@@ -103,7 +98,7 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 
 	irq = platform_get_irq(pmu_device, 0);
 	if (irq >= 0 && irq_is_percpu(irq)) {
-		on_each_cpu(cpu_pmu_disable_percpu_irq, cpu_pmu, 1);
+		on_each_cpu(cpu_pmu_disable_percpu_irq, &irq, 1);
 		free_percpu_irq(irq, &percpu_pmu);
 	} else {
 		for (i = 0; i < irqs; ++i) {
@@ -138,7 +133,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
 				irq);
 			return err;
 		}
-		on_each_cpu(cpu_pmu_enable_percpu_irq, cpu_pmu, 1);
+		on_each_cpu(cpu_pmu_enable_percpu_irq, &irq, 1);
 	} else {
 		for (i = 0; i < irqs; ++i) {
 			err = 0;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 81ef686a91ca18dccfbb85cd75b5a2aa6c303ecc..a35f6ebbd2c2bddecf8f5b63a04e612ad5d38c75 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -334,6 +334,8 @@ void flush_thread(void)
 	memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
 	memset(&thread->fpstate, 0, sizeof(union fp_state));
 
+	flush_tls();
+
 	thread_notify(THREAD_NOTIFY_FLUSH, thread);
 }
 
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 67ca8578c6d8fd9adf35cfebbe0d75cdda8d4786..587fdfe1a72cdb6c582ee03d455ec391dd456a83 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -142,14 +142,6 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
 	while (1) {
 		unsigned long temp;
 
-		/*
-		 * Barrier required between accessing protected resource and
-		 * releasing a lock for it. Legacy code might not have done
-		 * this, and we cannot determine that this is not the case
-		 * being emulated, so insert always.
-		 */
-		smp_mb();
-
 		if (type == TYPE_SWPB)
 			__user_swpb_asm(*data, address, res, temp);
 		else
@@ -162,13 +154,6 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
 	}
 
 	if (res == 0) {
-		/*
-		 * Barrier also required between acquiring a lock for a
-		 * protected resource and accessing the resource. Inserted for
-		 * same reason as above.
-		 */
-		smp_mb();
-
 		if (type == TYPE_SWPB)
 			swpbcounter++;
 		else
diff --git a/arch/arm/kernel/thumbee.c b/arch/arm/kernel/thumbee.c
index 7b8403b7666617f30baca2fec58fb579615e679b..80f0d69205e73f28a72d3917d73162bc23cf3417 100644
--- a/arch/arm/kernel/thumbee.c
+++ b/arch/arm/kernel/thumbee.c
@@ -45,7 +45,8 @@ static int thumbee_notifier(struct notifier_block *self, unsigned long cmd, void
 
 	switch (cmd) {
 	case THREAD_NOTIFY_FLUSH:
-		thread->thumbee_state = 0;
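+		/* clear the hardware TEEHBR rather than just the saved copy */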
+		teehbr_write(0);
 		break;
 	case THREAD_NOTIFY_SWITCH:
 		current_thread_info()->thumbee_state = teehbr_read();
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index c8e4bb7149444ce9c2670c92345ea67ccb94eed8..a964c9f40f875882e3bf4d722eb7290074981709 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -581,7 +581,6 @@ do_cache_op(unsigned long start, unsigned long end, int flags)
 #define NR(x) ((__ARM_NR_##x) - __ARM_NR_BASE)
 asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 {
-	struct thread_info *thread = current_thread_info();
 	siginfo_t info;
 
 	if ((no >> 16) != (__ARM_NR_BASE>> 16))
@@ -632,21 +631,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		return regs->ARM_r0;
 
 	case NR(set_tls):
-		thread->tp_value[0] = regs->ARM_r0;
-		if (tls_emu)
-			return 0;
-		if (has_tls_reg) {
-			asm ("mcr p15, 0, %0, c13, c0, 3"
-				: : "r" (regs->ARM_r0));
-		} else {
-			/*
-			 * User space must never try to access this directly.
-			 * Expect your app to break eventually if you do so.
-			 * The user helper at 0xffff0fe0 must be used instead.
-			 * (see entry-armv.S for details)
-			 */
-			*((unsigned int *)0xffff0ff0) = regs->ARM_r0;
-		}
+		set_tls(regs->ARM_r0);
 		return 0;
 
 #ifdef CONFIG_NEEDS_SYSCALL_FOR_CMPXCHG
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index 938600098b88690507ef22489e3a3d0dbc3cc401..8ecfd15c3a0248db29667fe3dc6ec6429fc9fc7c 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -80,7 +80,7 @@ ENTRY(__get_user_8)
 ENDPROC(__get_user_8)
 
 #ifdef __ARMEB__
-ENTRY(__get_user_lo8)
+ENTRY(__get_user_32t_8)
 	check_uaccess r0, 8, r1, r2, __get_user_bad
 #ifdef CONFIG_CPU_USE_DOMAINS
 	add	r0, r0, #4
@@ -90,7 +90,40 @@ ENTRY(__get_user_lo8)
 #endif
 	mov	r0, #0
 	ret	lr
-ENDPROC(__get_user_lo8)
+ENDPROC(__get_user_32t_8)
+
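+@ The __get_user_64t_* variants load a 1, 2 or 4 byte user value into
+@ r3, the least significant word of the 64-bit r2/r3 result pair on BE.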
+ENTRY(__get_user_64t_1)
+	check_uaccess r0, 1, r1, r2, __get_user_bad8
+8: TUSER(ldrb)	r3, [r0]
+	mov	r0, #0
+	ret	lr
+ENDPROC(__get_user_64t_1)
+
+ENTRY(__get_user_64t_2)
+	check_uaccess r0, 2, r1, r2, __get_user_bad8
+#ifdef CONFIG_CPU_USE_DOMAINS
+rb	.req	ip
+9:	ldrbt	r3, [r0], #1
+10:	ldrbt	rb, [r0], #0
+#else
+rb	.req	r0
+9:	ldrb	r3, [r0]
+10:	ldrb	rb, [r0, #1]
+#endif
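+	@ big-endian halfword: the byte loaded first is the high byte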
+	orr	r3, rb, r3, lsl #8
+	mov	r0, #0
+	ret	lr
+ENDPROC(__get_user_64t_2)
+
+ENTRY(__get_user_64t_4)
+	check_uaccess r0, 4, r1, r2, __get_user_bad8
+11: TUSER(ldr)	r3, [r0]
+	mov	r0, #0
+	ret	lr
+ENDPROC(__get_user_64t_4)
 #endif
 
 __get_user_bad8:
@@ -111,5 +144,9 @@ ENDPROC(__get_user_bad8)
 	.long	6b, __get_user_bad8
 #ifdef __ARMEB__
 	.long   7b, __get_user_bad
+	.long	8b, __get_user_bad8
+	.long	9b, __get_user_bad8
+	.long	10b, __get_user_bad8
+	.long	11b, __get_user_bad8
 #endif
 .popsection
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index 1a24e9232ec8653b70dc163dd524cf1b6f85b951..b64e67c7f176a97072b342a168db19b292cad4a5 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -146,7 +146,6 @@ ENDPROC(cpu_v7_set_pte_ext)
 	mov	\tmp, \ttbr1, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
 	mov	\ttbr1, \ttbr1, lsl #ARCH_PGD_SHIFT		@ lower bits
 	addls	\ttbr1, \ttbr1, #TTBR1_OFFSET
-	adcls	\tmp, \tmp, #0
 	mcrr	p15, 1, \ttbr1, \tmp, c2			@ load TTBR1
 	mov	\tmp, \ttbr0, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
 	mov	\ttbr0, \ttbr0, lsl #ARCH_PGD_SHIFT		@ lower bits