diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index 83d67eb188952efbd941b265b55b005df879e260..a482a9ffe5bc12d9a65a3ac913a121696438972d 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -38,5 +38,5 @@ else
   CMODEL_CFLAG := -m64 -mcmodel=medlow
 endif
 
-head.o: head.S ttable.S itlb_base.S dtlb_base.S dtlb_backend.S dtlb_prot.S \
+head.o: head.S ttable.S itlb_miss.S dtlb_miss.S ktlb.S tsb.S \
 	etrap.S rtrap.S winfixup.S entry.S
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index 202a80c24b6f789cdd11dd7db964158a35a88e56..a57d7f2b6f137ee88a4829ae9e17fa5565a9eccc 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -31,6 +31,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
 
 static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout32_library(struct file*);
@@ -329,15 +330,9 @@ beyond_if:
 
 	current->mm->start_stack =
 		(unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
-	if (!(orig_thr_flags & _TIF_32BIT)) {
-		unsigned long pgd_cache = get_pgd_cache(current->mm->pgd);
-
-		__asm__ __volatile__("stxa\t%0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				       "r" (TSB_REG), "i" (ASI_DMMU));
-	}
+	tsb_context_switch(__pa(current->mm->pgd),
+	                   current->mm->context.sparc64_tsb);
+
 	start_thread32(regs, ex.a_entry, current->mm->start_stack);
 	if (current->ptrace & PT_PTRACED)
 		send_sig(SIGTRAP, current, 0);
diff --git a/arch/sparc64/kernel/dtlb_backend.S b/arch/sparc64/kernel/dtlb_backend.S
deleted file mode 100644
index acc889a7f9c1f3cd78e620cbcaeea37edfb0f5c3..0000000000000000000000000000000000000000
--- a/arch/sparc64/kernel/dtlb_backend.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/* $Id: dtlb_backend.S,v 1.16 2001/10/09 04:02:11 davem Exp $
- * dtlb_backend.S: Back end to DTLB miss replacement strategy.
- *                 This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-#define VALID_SZ_BITS	(_PAGE_VALID | _PAGE_SZBITS)
-
-#define VPTE_BITS		(_PAGE_CP | _PAGE_CV | _PAGE_P )
-#define VPTE_SHIFT		(PAGE_SHIFT - 3)
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings at tl>1.
- * 2) Nucleus loads and stores to/from user/kernel window save areas.
- * 3) VPTE misses from dtlb_base and itlb_base.
- *
- * We need to extract out the PMD and PGDIR indexes from the
- * linear virtual page table access address.  The PTE index
- * is at the bottom, but we are not concerned with it.  Bits
- * 0 to 2 are clear since each PTE is 8 bytes in size.  Each
- * PMD and PGDIR entry are 4 bytes in size.   Thus, this
- * address looks something like:
- *
- * |---------------------------------------------------------------|
- * |  ...   |    PGDIR index    |    PMD index    | PTE index  |   |
- * |---------------------------------------------------------------|
- *   63   F   E               D   C             B   A         3 2 0  <- bit nr
- *
- *  The variable bits above are defined as:
- *  A --> 3 + (PAGE_SHIFT - log2(8))
- *    --> 3 + (PAGE_SHIFT - 3) - 1
- *        (ie. this is "bit 3" + PAGE_SIZE - size of PTE entry in bits - 1)
- *  B --> A + 1
- *  C --> B + (PAGE_SHIFT - log2(4))
- *    -->  B + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit B" + PAGE_SIZE - size of PMD entry in bits - 1)
- *  D --> C + 1
- *  E --> D + (PAGE_SHIFT - log2(4))
- *    --> D + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit D" + PAGE_SIZE - size of PGDIR entry in bits - 1)
- *  F --> E + 1
- *
- * (Note how "B" always evaluates to PAGE_SHIFT, all the other constants
- *  cancel out.)
- *
- * For 8K PAGE_SIZE (thus, PAGE_SHIFT of 13) the bit numbers are:
- * A --> 12
- * B --> 13
- * C --> 23
- * D --> 24
- * E --> 34
- * F --> 35
- *
- * For 64K PAGE_SIZE (thus, PAGE_SHIFT of 16) the bit numbers are:
- * A --> 15
- * B --> 16
- * C --> 29
- * D --> 30
- * E --> 43
- * F --> 44
- *
- * Because bits both above and below each PGDIR and PMD index need to
- * be masked out, and the index can be as long as 14 bits (when using a
- * 64K PAGE_SIZE, and thus a PAGE_SHIFT of 16), we need 3 instructions
- * to extract each index out.
- *
- * Shifts do not pair very well on UltraSPARC-I, II, IIi, and IIe, so
- * we try to avoid using them for the entire operation.  We could setup
- * a mask anywhere from bit 31 down to bit 10 using the sethi instruction.
- *
- * We need a mask covering bits B --> C and one covering D --> E.
- * For 8K PAGE_SIZE these masks are 0x00ffe000 and 0x7ff000000.
- * For 64K PAGE_SIZE these masks are 0x3fff0000 and 0xfffc0000000.
- * The second in each set cannot be loaded with a single sethi
- * instruction, because the upper bits are past bit 32.  We would
- * need to use a sethi + a shift.
- *
- * For the time being, we use 2 shifts and a simple "and" mask.
- * We shift left to clear the bits above the index, we shift down
- * to clear the bits below the index (sans the log2(4 or 8) bits)
- * and a mask to clear the log2(4 or 8) bits.  We need therefore
- * define 4 shift counts, all of which are relative to PAGE_SHIFT.
- *
- * Although unsupportable for other reasons, this does mean that
- * 512K and 4MB page sizes would be generally supported by the
- * kernel.  (ELF binaries would break with > 64K PAGE_SIZE since
- * the sections are only aligned that strongly).
- *
- * The operations performed for extraction are thus:
- *
- *      ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3
- *
- */
-
-#define A (3 + (PAGE_SHIFT - 3) - 1)
-#define B (A + 1)
-#define C (B + (PAGE_SHIFT - 2) - 1)
-#define D (C + 1)
-#define E (D + (PAGE_SHIFT - 2) - 1)
-#define F (E + 1)
-
-#define PMD_SHIFT_LEFT		(64 - D)
-#define PMD_SHIFT_RIGHT		(64 - (D - B) - 2)
-#define PGDIR_SHIFT_LEFT 	(64 - F)
-#define PGDIR_SHIFT_RIGHT	(64 - (F - D) - 2)
-#define LOW_MASK_BITS		0x3
-
-/* TLB1 ** ICACHE line 1: tl1 DTLB and quick VPTE miss	*/
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	add		%g3, %g3, %g5			! Compute VPTE base
-	cmp		%g4, %g5			! VPTE miss?
-	bgeu,pt		%xcc, 1f			! Continue here
-	 andcc		%g4, TAG_CONTEXT_BITS, %g5	! tl0 miss Nucleus test
-	ba,a,pt		%xcc, from_tl1_trap		! Fall to tl0 miss
-1:	sllx		%g6, VPTE_SHIFT, %g4		! Position TAG_ACCESS
-	or		%g4, %g5, %g4			! Prepare TAG_ACCESS
-
-/* TLB1 ** ICACHE line 2: Quick VPTE miss	  	*/
-	mov		TSB_REG, %g1			! Grab TSB reg
-	ldxa		[%g1] ASI_DMMU, %g5		! Doing PGD caching?
-	sllx		%g6, PMD_SHIFT_LEFT, %g1	! Position PMD offset
-	be,pn		%xcc, sparc64_vpte_nucleus	! Is it from Nucleus?
-	 srlx		%g1, PMD_SHIFT_RIGHT, %g1	! Mask PMD offset bits
-	brnz,pt		%g5, sparc64_vpte_continue	! Yep, go like smoke
-	 andn		%g1, LOW_MASK_BITS, %g1		! Final PMD mask
-	sllx		%g6, PGDIR_SHIFT_LEFT, %g5	! Position PGD offset
-
-/* TLB1 ** ICACHE line 3: Quick VPTE miss	  	*/
-	srlx		%g5, PGDIR_SHIFT_RIGHT, %g5	! Mask PGD offset bits
-	andn		%g5, LOW_MASK_BITS, %g5		! Final PGD mask
-	lduwa		[%g7 + %g5] ASI_PHYS_USE_EC, %g5! Load PGD
-	brz,pn		%g5, vpte_noent			! Valid?
-sparc64_kpte_continue:
-	 sllx		%g5, 11, %g5			! Shift into place
-sparc64_vpte_continue:
-	lduwa		[%g5 + %g1] ASI_PHYS_USE_EC, %g5! Load PMD
-	sllx		%g5, 11, %g5			! Shift into place
-	brz,pn		%g5, vpte_noent			! Valid?
-
-/* TLB1 ** ICACHE line 4: Quick VPTE miss	  	*/
-	 mov		(VALID_SZ_BITS >> 61), %g1	! upper vpte into %g1
-	sllx		%g1, 61, %g1			! finish calc
-	or		%g5, VPTE_BITS, %g5		! Prepare VPTE data
-	or		%g5, %g1, %g5			! ...
-	mov		TLB_SFSR, %g1			! Restore %g1 value
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Load VPTE into TLB
-	stxa		%g4, [%g1 + %g1] ASI_DMMU	! Restore previous TAG_ACCESS
-	retry						! Load PTE once again
-
-#undef VALID_SZ_BITS
-#undef VPTE_SHIFT
-#undef VPTE_BITS
-#undef A
-#undef B
-#undef C
-#undef D
-#undef E
-#undef F
-#undef PMD_SHIFT_LEFT
-#undef PMD_SHIFT_RIGHT
-#undef PGDIR_SHIFT_LEFT
-#undef PGDIR_SHIFT_RIGHT
-#undef LOW_MASK_BITS
-
diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S
deleted file mode 100644
index 6528786840c06bfb5c480abdfe706bcfa257eedc..0000000000000000000000000000000000000000
--- a/arch/sparc64/kernel/dtlb_base.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/* $Id: dtlb_base.S,v 1.17 2001/10/11 22:33:52 davem Exp $
- * dtlb_base.S:	Front end to DTLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-/* %g1	TLB_SFSR	(%g1 + %g1 == TLB_TAG_ACCESS)
- * %g2	(KERN_HIGHBITS | KERN_LOWBITS)
- * %g3  VPTE base	(0xfffffffe00000000)	Spitfire/Blackbird (44-bit VA space)
- *			(0xffe0000000000000)	Cheetah		   (64-bit VA space)
- * %g7	__pa(current->mm->pgd)
- *
- * The VPTE base value is completely magic, but note that
- * few places in the kernel other than these TLB miss
- * handlers know anything about the VPTE mechanism or
- * how it works (see VPTE_SIZE, TASK_SIZE and PTRS_PER_PGD).
- * Consider the 44-bit VADDR Ultra-I/II case as an example:
- *
- * VA[0 :  (1<<43)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<43)] produce VPTE index [%g3-(1<<(43-PAGE_SHIFT+3)) : %g3]
- *
- * For Cheetah's 64-bit VADDR space this is:
- *
- * VA[0 :  (1<<63)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<63)] produce VPTE index [%g3-(1<<(63-PAGE_SHIFT+3)) : %g3]
- *
- * If you're paying attention you'll notice that this means half of
- * the VPTE table is above %g3 and half is below, low VA addresses
- * map progressively upwards from %g3, and high VA addresses map
- * progressively upwards towards %g3.  This trick was needed to make
- * the same 8 instruction handler work both for Spitfire/Blackbird's
- * peculiar VA space hole configuration and the full 64-bit VA space
- * one of Cheetah at the same time.
- */
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings.
- * 2) Nucleus loads and stores to/from vmalloc() areas.
- * 3) User loads and stores.
- * 4) User space accesses by nucleus at tl0
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) nop
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				srax	r1, 10, r2
-#else
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif
-
-/* DTLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	andcc		%g4, TAG_CONTEXT_BITS, %g0	! From Nucleus?
-from_tl1_trap:
-	rdpr		%tl, %g5			! For TL==3 test
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	be,pn		%xcc, kvmap			! Yep, special processing
-	 CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	cmp		%g5, 4				! Last trap level?
-
-/* DTLB ** ICACHE line 2: User finish + quick kernel TLB misses	*/
-	be,pn		%xcc, longpath			! Yep, cannot risk VPTE miss
-	 nop						! delay slot
-	ldxa		[%g3 + %g6] ASI_S, %g5		! Load VPTE
-1:	brgez,pn	%g5, longpath			! Invalid, branch out
-	 nop						! Delay-slot
-9:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry						! Trap return
-	nop
-
-/* DTLB ** ICACHE line 3: winfixups+real_faults		*/
-longpath:
-	rdpr		%pstate, %g5			! Move into alternate globals
-	wrpr		%g5, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tl, %g4			! See where we came from.
-	cmp		%g4, 1				! Is etrap/rtrap window fault?
-	mov		TLB_TAG_ACCESS, %g4		! Prepare for fault processing
-	ldxa		[%g4] ASI_DMMU, %g5		! Load faulting VA page
-	be,pt		%xcc, sparc64_realfault_common	! Jump to normal fault handling
-	 mov		FAULT_CODE_DTLB, %g4		! It was read from DTLB
-
-/* DTLB ** ICACHE line 4: Unused...	*/
-	ba,a,pt		%xcc, winfix_trampoline		! Call window fixup code
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2
diff --git a/arch/sparc64/kernel/dtlb_miss.S b/arch/sparc64/kernel/dtlb_miss.S
new file mode 100644
index 0000000000000000000000000000000000000000..d0f1565cb564d86e1b352edc89cee9e644841e06
--- /dev/null
+++ b/arch/sparc64/kernel/dtlb_miss.S
@@ -0,0 +1,39 @@
+/* DTLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_DMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_DMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	brz,pn	%g5, kvmap_dtlb			! Context 0 processing
+	 nop					! Delay slot (fill me)
+	ldda	[%g1] ASI_NUCLEUS_QUAD_LDD, %g4	! Load TSB entry
+	nop					! Push branch to next I$ line
+	cmp	%g4, %g6			! Compare TAG
+
+/* DTLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_dtlb		! Miss
+	 mov	FAULT_CODE_DTLB, %g3
+	stxa	%g5, [%g0] ASI_DTLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 3:				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
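
For readers who do not speak sparc64 assembly, the fast path above (and its I-TLB twin later in the patch) boils down to a three-way decision. The following is a minimal C model of that decision, not part of the patch: the enum, function name and pte_out parameter are invented, no hardware is touched, and the I-TLB variant differs only in its extra _PAGE_EXEC check before the TLB load.

#include <stdint.h>

/* Pure-software model of I-cache lines 1-2 above (illustrative only).
 * tag_target is what the ldxa from ASI_DMMU returns: context in bits
 * 63:48, vaddr>>22 below that.  tsb_tag/tsb_pte stand in for the entry
 * selected by the hardware 8K TSB pointer.
 */
enum tlbmiss_action { GO_KVMAP, LOAD_TLB, GO_TSB_MISS };

static enum tlbmiss_action dtlb_fast_path(uint64_t tag_target,
					  uint64_t tsb_tag, uint64_t tsb_pte,
					  uint64_t *pte_out)
{
	if ((tag_target >> 48) == 0)		/* context 0: kernel address  */
		return GO_KVMAP;		/* branch to kvmap_dtlb       */
	if (tsb_tag == tag_target) {		/* TSB hit                    */
		*pte_out = tsb_pte;		/* stxa ... ASI_DTLB_DATA_IN  */
		return LOAD_TLB;		/* retry                      */
	}
	return GO_TSB_MISS;			/* fall into tsb_miss_dtlb    */
}
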
diff --git a/arch/sparc64/kernel/etrap.S b/arch/sparc64/kernel/etrap.S
index 0d8eba21111b75582ba1f52c946a1121ab7a8774..567dbb765c3494bc16ba97e3ff820d45577e9b05 100644
--- a/arch/sparc64/kernel/etrap.S
+++ b/arch/sparc64/kernel/etrap.S
@@ -99,6 +99,7 @@ etrap_irq:
 		wrpr	%g0, ETRAP_PSTATE2, %pstate
 		mov	%l6, %g6
 #ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 		mov	TSB_REG, %g3
 		ldxa	[%g3] ASI_IMMU, %g5
 #endif
@@ -248,6 +249,7 @@ scetrap:	rdpr	%pil, %g2
 		mov	%l6, %g6
 		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
 #ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 		mov	TSB_REG, %g3
 		ldxa	[%g3] ASI_IMMU, %g5
 #endif
diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S
index b49dcd4504b029e1684730a554fe5cd0684dc92b..d00e20693be1024d4099d32ad33e67b7ba8c2eee 100644
--- a/arch/sparc64/kernel/head.S
+++ b/arch/sparc64/kernel/head.S
@@ -429,17 +429,6 @@ setup_trap_table:
 	 *
 	 * %g6			--> current_thread_info()
 	 *
-	 * MMU Globals (PSTATE_MG):
-	 *
-	 * %g1			--> TLB_SFSR
-	 * %g2			--> ((_PAGE_VALID | _PAGE_SZ4MB |
-	 *			      _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	 *			     ^ 0xfffff80000000000)
-	 * (this %g2 value is used for computing the PAGE_OFFSET kernel
-	 *  TLB entries quickly, the virtual address of the fault XOR'd
-	 *  with this %g2 value is the PTE to load into the TLB)
-	 * %g3			--> VPTE_BASE_CHEETAH or VPTE_BASE_SPITFIRE
-	 *
 	 * Interrupt Globals (PSTATE_IG, setup by init_irqwork_curcpu()):
 	 *
 	 * %g6			--> __irq_work[smp_processor_id()]
@@ -450,40 +439,6 @@ setup_trap_table:
 	wrpr	%o1, PSTATE_AG, %pstate
 	mov	%o2, %g6
 
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	wrpr	%o1, PSTATE_MG, %pstate
-	mov	TSB_REG, %g1
-	stxa	%g0, [%g1] ASI_DMMU
-	membar	#Sync
-	stxa	%g0, [%g1] ASI_IMMU
-	membar	#Sync
-	mov	TLB_SFSR, %g1
-	sethi	%uhi(KERN_HIGHBITS), %g2
-	or	%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx	%g2, 32, %g2
-	or	%g2, KERN_LOWBITS, %g2
-
-	BRANCH_IF_ANY_CHEETAH(g3,g7,8f)
-	ba,pt	%xcc, 9f
-	 nop
-
-8:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
-
-9:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
-
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
-
 	/* Kill PROM timer */
 	sethi	%hi(0x80000000), %o2
 	sllx	%o2, 32, %o2
@@ -538,6 +493,7 @@ sparc64_boot_end:
 
 #include "systbls.S"
 #include "ktlb.S"
+#include "tsb.S"
 #include "etrap.S"
 #include "rtrap.S"
 #include "winfixup.S"
diff --git a/arch/sparc64/kernel/itlb_base.S b/arch/sparc64/kernel/itlb_base.S
deleted file mode 100644
index 4951ff8f68775c52d46420bbeeee9df002614150..0000000000000000000000000000000000000000
--- a/arch/sparc64/kernel/itlb_base.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* $Id: itlb_base.S,v 1.12 2002/02/09 19:49:30 davem Exp $
- * itlb_base.S:	Front end to ITLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, 10, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) nop
-#else /* PAGE_SHIFT */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif /* PAGE_SHIFT */
-
-
-/* Ways we can get here:
- *
- * 1) Nucleus instruction misses from module code.
- * 2) All user instruction misses.
- *
- * All real page faults merge their code paths to the
- * sparc64_realfault_common label below.
- */
-
-/* ITLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_IMMU, %g4	! Get TAG_ACCESS
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	ldxa		[%g3 + %g6] ASI_P, %g5		! Load VPTE
-1:	brgez,pn	%g5, 3f				! Not valid, branch out
-	 sethi		%hi(_PAGE_EXEC), %g4		! Delay-slot
-	andcc		%g5, %g4, %g0			! Executable?
-
-/* ITLB ** ICACHE line 2: Real faults			*/
-	be,pn		%xcc, 3f			! Nope, branch.
-	 nop						! Delay-slot
-2:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN	! Load PTE into TLB
-	retry						! Trap return
-3:	rdpr		%pstate, %g4			! Move into alt-globals
-	wrpr		%g4, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tpc, %g5			! And load faulting VA
-	mov		FAULT_CODE_ITLB, %g4		! It was read from ITLB
-
-/* ITLB ** ICACHE line 3: Finish faults	*/
-sparc64_realfault_common:				! Called by dtlb_miss
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	ba,pt		%xcc, etrap			! Save state
-1:	 rd		%pc, %g7			! ...
-	call		do_sparc64_fault		! Call fault handler
-	 add		%sp, PTREGS_OFF, %o0! Compute pt_regs arg
-	ba,pt		%xcc, rtrap_clr_l6		! Restore cpu state
-	 nop
-
-/* ITLB ** ICACHE line 4: Window fixups */
-winfix_trampoline:
-	rdpr		%tpc, %g3			! Prepare winfixup TNPC
-	or		%g3, 0x7c, %g3			! Compute branch offset
-	wrpr		%g3, %tnpc			! Write it into TNPC
-	done						! Do it to it
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2
diff --git a/arch/sparc64/kernel/itlb_miss.S b/arch/sparc64/kernel/itlb_miss.S
new file mode 100644
index 0000000000000000000000000000000000000000..6b6c8fee04bd14d0d75a21deffd8374dbdb38311
--- /dev/null
+++ b/arch/sparc64/kernel/itlb_miss.S
@@ -0,0 +1,39 @@
+/* ITLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_IMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_IMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	brz,pn	%g5, kvmap_itlb			! Context 0 processing
+	 nop					! Delay slot (fill me)
+	ldda	[%g1] ASI_NUCLEUS_QUAD_LDD, %g4	! Load TSB entry
+	cmp	%g4, %g6			! Compare TAG
+	sethi	%hi(_PAGE_EXEC), %g4		! Setup exec check
+
+/* ITLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_itlb		! Miss
+	 mov	FAULT_CODE_ITLB, %g3
+	andcc	%g5, %g4, %g0			! Executable?
+	be,pn	%xcc, tsb_do_fault
+	 nop					! Delay slot, fill me
+	stxa	%g5, [%g0] ASI_ITLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+
+/* ITLB ** ICACHE line 3: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* ITLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
diff --git a/arch/sparc64/kernel/ktlb.S b/arch/sparc64/kernel/ktlb.S
index d9244d3c9f73dd9d47a9c08b95190afa9cee0e45..2b5e71b688828b457eec247f3337a3be5902da5b 100644
--- a/arch/sparc64/kernel/ktlb.S
+++ b/arch/sparc64/kernel/ktlb.S
@@ -4,191 +4,170 @@
  * Copyright (C) 1996 Eddie C. Dost        (ecd@brainaid.de)
  * Copyright (C) 1996 Miguel de Icaza      (miguel@nuclecu.unam.mx)
  * Copyright (C) 1996,98,99 Jakub Jelinek  (jj@sunsite.mff.cuni.cz)
-*/
+ */
 
 #include <linux/config.h>
 #include <asm/head.h>
 #include <asm/asi.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/tsb.h>
 
 	.text
 	.align		32
 
-/*
- * On a second level vpte miss, check whether the original fault is to the OBP 
- * range (note that this is only possible for instruction miss, data misses to
- * obp range do not use vpte). If so, go back directly to the faulting address.
- * This is because we want to read the tpc, otherwise we have no way of knowing
- * the 8k aligned faulting address if we are using >8k kernel pagesize. This
- * also ensures no vpte range addresses are dropped into tlb while obp is
- * executing (see inherit_locked_prom_mappings() rant).
- */
-sparc64_vpte_nucleus:
-	/* Note that kvmap below has verified that the address is
-	 * in the range MODULES_VADDR --> VMALLOC_END already.  So
-	 * here we need only check if it is an OBP address or not.
-	 */
+	.globl		kvmap_itlb
+kvmap_itlb:
+	/* g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_IMMU, %g4
+
+kvmap_itlb_nonlinear:
+	/* Catch kernel NULL pointer calls.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
+	 nop
+
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_itlb_load)
+
+kvmap_itlb_tsb_miss:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kern_vpte
+	blu,pn		%xcc, kvmap_itlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, vpte_insn_obp
+	blu,pn		%xcc, kvmap_itlb_obp
 	 nop
 
-	/* These two instructions are patched by paging_init().  */
-kern_vpte:
-	sethi		%hi(swapper_pgd_zero), %g5
-	lduw		[%g5 + %lo(swapper_pgd_zero)], %g5
-
-	/* With kernel PGD in %g5, branch back into dtlb_backend.  */
-	ba,pt		%xcc, sparc64_kpte_continue
-	 andn		%g1, 0x3, %g1	/* Finish PMD offset adjustment.  */
-
-vpte_noent:
-	/* Restore previous TAG_ACCESS, %g5 is zero, and we will
-	 * skip over the trap instruction so that the top level
- * TLB miss handler will think this %g5 value is just an
-	 * invalid PTE, thus branching to full fault processing.
-	 */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_DMMU
-	done
-
-vpte_insn_obp:
-	/* Behave as if we are at TL0.  */
-	wrpr		%g0, 1, %tl
-	rdpr		%tpc, %g4	/* Find original faulting iaddr */
-	srlx		%g4, 13, %g4	/* Throw out context bits */
-	sllx		%g4, 13, %g4	/* g4 has vpn + ctx0 now */
-
-	/* Restore previous TAG_ACCESS.  */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_IMMU
-
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
-	retry
+kvmap_itlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)
+
+	TSB_LOCK_TAG(%g1, %g2, %g4)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	brgez,a,pn	%g5, kvmap_itlb_longpath
+	 stx		%g0, [%g1]
 
-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
-
-kvmap_do_obp:
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-	srlx		%g4, 13, %g4
-	sllx		%g4, 13, %g4
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+	TSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */
+
+kvmap_itlb_load:
+	stxa		%g5, [%g0] ASI_ITLB_DATA_IN	! Reload TLB
 	retry
 
-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
+kvmap_itlb_longpath:
+	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+kvmap_itlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_itlb_longpath)
+
+	TSB_LOCK_TAG(%g1, %g2, %g4)
+
+	TSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_itlb_load
+	 nop
+
+kvmap_dtlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_dtlb_longpath)
+
+	TSB_LOCK_TAG(%g1, %g2, %g4)
+
+	TSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_dtlb_load
+	 nop
 
-/*
- * On a first level data miss, check whether this is to the OBP range (note
- * that such accesses can be made by prom, as well as by kernel using
- * prom_getproperty on "address"), and if so, do not use vpte access ...
- * rather, use information saved during inherit_prom_mappings() using 8k
- * pagesize.
- */
 	.align		32
-kvmap:
-	brgez,pn	%g4, kvmap_nonlinear
+	.globl		kvmap_dtlb
+kvmap_dtlb:
+	/* %g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_DMMU, %g4
+	brgez,pn	%g4, kvmap_dtlb_nonlinear
 	 nop
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#define KERN_HIGHBITS	((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
+#define KERN_LOWBITS	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
+
+	sethi		%uhi(KERN_HIGHBITS), %g2
+	or		%g2, %ulo(KERN_HIGHBITS), %g2
+	sllx		%g2, 32, %g2
+	or		%g2, KERN_LOWBITS, %g2
+
+#undef KERN_HIGHBITS
+#undef KERN_LOWBITS
+
 	.globl		kvmap_linear_patch
 kvmap_linear_patch:
-#endif
-	ba,pt		%xcc, kvmap_load
+	ba,pt		%xcc, kvmap_dtlb_load
 	 xor		%g2, %g4, %g5
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	sethi		%hi(swapper_pg_dir), %g5
-	or		%g5, %lo(swapper_pg_dir), %g5
-	sllx		%g4, 64 - (PGDIR_SHIFT + PGDIR_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	andn		%g6, 0x3, %g6
-	lduw		[%g5 + %g6], %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - (PMD_SHIFT + PMD_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x3, %g6
-	lduwa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - PMD_SHIFT, %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x7, %g6
-	ldxa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
+kvmap_dtlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
+
+	TSB_LOCK_TAG(%g1, %g2, %g4)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	brgez,a,pn	%g5, kvmap_dtlb_longpath
+	 stx		%g0, [%g1]
+
+	TSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */
+
+kvmap_dtlb_load:
+	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
+	retry
+
+kvmap_dtlb_nonlinear:
+	/* Catch kernel NULL pointer derefs.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
-	ba,a,pt		%xcc, kvmap_load
-#endif
 
-kvmap_nonlinear:
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
+
+kvmap_dtlb_tsbmiss:
 	sethi		%hi(MODULES_VADDR), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, longpath
+	blu,pn		%xcc, kvmap_dtlb_longpath
 	 mov		(VMALLOC_END >> 24), %g5
 	sllx		%g5, 24, %g5
 	cmp		%g4, %g5
-	bgeu,pn		%xcc, longpath
+	bgeu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
 
 kvmap_check_obp:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_vmalloc_addr
+	blu,pn		%xcc, kvmap_dtlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_do_obp
+	blu,pn		%xcc, kvmap_dtlb_obp
 	 nop
-
-kvmap_vmalloc_addr:
-	/* If we get here, a vmalloc addr was accessed, load kernel VPTE.  */
-	ldxa		[%g3 + %g6] ASI_N, %g5
-	brgez,pn	%g5, longpath
+	ba,pt		%xcc, kvmap_dtlb_vmalloc_addr
 	 nop
 
-kvmap_load:
-	/* PTE is valid, load into TLB and return from trap.  */
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry
+kvmap_dtlb_longpath:
+	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
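
The rewritten kvmap_dtlb above is essentially an address classifier followed by a kernel TSB probe. The sketch below (not part of the patch; the enum and function name are invented, and an 8K PAGE_SIZE is assumed) writes the range checks out in C using the layout constants quoted in the pgtable.h hunk further down.

#include <stdint.h>

#define PAGE_SIZE_8K	0x2000UL
#define MODULES_VADDR	0x0000000010000000UL	/* from pgtable.h below       */
#define LOW_OBP_ADDRESS	0x00000000f0000000UL	/* PROM range start           */
#define HI_OBP_ADDRESS	0x0000000100000000UL	/* PROM range end             */
#define VMALLOC_END	0x0000000200000000UL	/* per the pgtable.h comment  */

enum kvmap_path { LINEAR_PTE, VMALLOC_WALK, OBP_LOOKUP, LONGPATH };

/* Decision order matches the assembly: linear mapping first, then the
 * NULL-pointer catch, then (after a kernel TSB probe, omitted here) the
 * module/vmalloc window and finally the OBP hole inside it.
 */
static enum kvmap_path kvmap_dtlb_classify(uint64_t vaddr)
{
	if ((int64_t)vaddr < 0)			/* PAGE_OFFSET linear mapping */
		return LINEAR_PTE;
	if (vaddr <= PAGE_SIZE_8K)		/* kernel NULL dereference    */
		return LONGPATH;
	if (vaddr < MODULES_VADDR || vaddr >= VMALLOC_END)
		return LONGPATH;
	if (vaddr >= LOW_OBP_ADDRESS && vaddr < HI_OBP_ADDRESS)
		return OBP_LOOKUP;
	return VMALLOC_WALK;
}
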
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 059b0d0252245800bf415110644a659a9f6c5a8e..2784aab0d3e59d22f0b0a20fcc946f5d61b96855 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -44,6 +44,7 @@
 #include <asm/fpumacro.h>
 #include <asm/head.h>
 #include <asm/cpudata.h>
+#include <asm/mmu_context.h>
 #include <asm/unistd.h>
 
 /* #define VERBOSE_SHOWREGS */
@@ -433,30 +434,16 @@ void exit_thread(void)
 void flush_thread(void)
 {
 	struct thread_info *t = current_thread_info();
+	struct mm_struct *mm;
 
 	if (t->flags & _TIF_ABI_PENDING)
 		t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
 
-	if (t->task->mm) {
-		unsigned long pgd_cache = 0UL;
-		if (test_thread_flag(TIF_32BIT)) {
-			struct mm_struct *mm = t->task->mm;
-			pgd_t *pgd0 = &mm->pgd[0];
-			pud_t *pud0 = pud_offset(pgd0, 0);
+	mm = t->task->mm;
+	if (mm)
+		tsb_context_switch(__pa(mm->pgd),
+				   mm->context.sparc64_tsb);
 
-			if (pud_none(*pud0)) {
-				pmd_t *page = pmd_alloc_one(mm, 0);
-				pud_set(pud0, page);
-			}
-			pgd_cache = get_pgd_cache(pgd0);
-		}
-		__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				     "r" (TSB_REG),
-				     "i" (ASI_DMMU));
-	}
 	set_thread_wsaved(0);
 
 	/* Turn off performance counters if on. */
diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S
index b80eba0081ca8d5b7b382adea960a0bf37239a44..213eb4a9d8a41e4cd4652e1836848123db755582 100644
--- a/arch/sparc64/kernel/rtrap.S
+++ b/arch/sparc64/kernel/rtrap.S
@@ -223,10 +223,14 @@ rt_continue:	ldx			[%sp + PTREGS_OFF + PT_V9_G1], %g1
 		ldx			[%sp + PTREGS_OFF + PT_V9_G3], %g3
 		ldx			[%sp + PTREGS_OFF + PT_V9_G4], %g4
 		ldx			[%sp + PTREGS_OFF + PT_V9_G5], %g5
+#ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 		mov			TSB_REG, %g6
 		brnz,a,pn		%l3, 1f
 		 ldxa			[%g6] ASI_IMMU, %g5
-1:		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
+#endif
+1:
+		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
 		ldx			[%sp + PTREGS_OFF + PT_V9_G7], %g7
 		wrpr			%g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
 		ldx			[%sp + PTREGS_OFF + PT_V9_I0], %i0
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 1f7ad8a690526177bcfc62abe1c7a6f90c4f90b0..d2d3369e7b5dc2a18424959073f8c96099168f24 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -123,6 +123,7 @@ extern void inherit_locked_prom_mappings(int save_p);
 
 static inline void cpu_setup_percpu_base(unsigned long cpu_id)
 {
+#error IMMU TSB usage must be fixed
 	__asm__ __volatile__("mov	%0, %%g5\n\t"
 			     "stxa	%0, [%1] %2\n\t"
 			     "membar	#Sync"
@@ -662,8 +663,6 @@ void smp_call_function_client(int irq, struct pt_regs *regs)
 extern unsigned long xcall_flush_tlb_mm;
 extern unsigned long xcall_flush_tlb_pending;
 extern unsigned long xcall_flush_tlb_kernel_range;
-extern unsigned long xcall_flush_tlb_all_spitfire;
-extern unsigned long xcall_flush_tlb_all_cheetah;
 extern unsigned long xcall_report_regs;
 extern unsigned long xcall_receive_signal;
 
@@ -794,15 +793,6 @@ void smp_report_regs(void)
 	smp_cross_call(&xcall_report_regs, 0, 0, 0);
 }
 
-void smp_flush_tlb_all(void)
-{
-	if (tlb_type == spitfire)
-		smp_cross_call(&xcall_flush_tlb_all_spitfire, 0, 0, 0);
-	else
-		smp_cross_call(&xcall_flush_tlb_all_cheetah, 0, 0, 0);
-	__flush_tlb_all();
-}
-
 /* We know that the window frames of the user have been flushed
  * to the stack before we get here because all callers of us
  * are flush_tlb_*() routines, and these run after flush_cache_*()
diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
index 9478551cb02026b051cc3418ff6ea9d41adf2d80..782d8c4973e4515a28e7c464ef5d344aa10b248c 100644
--- a/arch/sparc64/kernel/trampoline.S
+++ b/arch/sparc64/kernel/trampoline.S
@@ -295,39 +295,6 @@ do_unlock:
 	wrpr		%g5, %tba
 	mov		%o2, %g6
 
-	wrpr		%o1, PSTATE_MG, %pstate
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-
-	mov		TSB_REG, %g1
-	stxa		%g0, [%g1] ASI_DMMU
-	membar		#Sync
-	mov		TLB_SFSR, %g1
-	sethi		%uhi(KERN_HIGHBITS), %g2
-	or		%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx		%g2, 32, %g2
-	or		%g2, KERN_LOWBITS, %g2
-
-	BRANCH_IF_ANY_CHEETAH(g3,g7,9f)
-
-	ba,pt		%xcc, 1f
-	 nop
-
-9:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
-1:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
-
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
-
 	wrpr		%o1, 0x0, %pstate
 	ldx		[%g6 + TI_TASK], %g4
 
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
new file mode 100644
index 0000000000000000000000000000000000000000..44b9e6fed09fc11679bab27e833e8e8e6df37b08
--- /dev/null
+++ b/arch/sparc64/kernel/tsb.S
@@ -0,0 +1,169 @@
+/* tsb.S: Sparc64 TSB table handling.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <asm/tsb.h>
+
+	.text
+	.align	32
+
+	/* Invoked from TLB miss handler, we are in the
+	 * MMU global registers and they are set up like
+	 * this:
+	 *
+	 * %g1: TSB entry pointer
+	 * %g2:	available temporary
+	 * %g3:	FAULT_CODE_{D,I}TLB
+	 * %g4:	available temporary
+	 * %g5:	available temporary
+	 * %g6: TAG TARGET
+	 * %g7:	physical address base of the linux page
+	 *      tables for the current address space
+	 */
+	.globl		tsb_miss_dtlb
+tsb_miss_dtlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_DMMU, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 nop
+
+	.globl		tsb_miss_itlb
+tsb_miss_itlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_IMMU, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 nop
+
+tsb_miss_page_table_walk:
+	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
+
+tsb_reload:
+	TSB_LOCK_TAG(%g1, %g2, %g4)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	brgez,a,pn	%g5, tsb_do_fault
+	 stx		%g0, [%g1]
+
+	TSB_WRITE(%g1, %g5, %g6)
+
+	/* Finally, load TLB and return from trap.  */
+tsb_tlb_reload:
+	cmp		%g3, FAULT_CODE_DTLB
+	bne,pn		%xcc, tsb_itlb_load
+	 nop
+
+tsb_dtlb_load:
+	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+	retry
+
+tsb_itlb_load:
+	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
+	retry
+
+	/* No valid entry in the page tables, do full fault
+	 * processing.
+	 */
+
+	.globl		tsb_do_fault
+tsb_do_fault:
+	cmp		%g3, FAULT_CODE_DTLB
+	rdpr		%pstate, %g5
+	bne,pn		%xcc, tsb_do_itlb_fault
+	 wrpr		%g5, PSTATE_AG | PSTATE_MG, %pstate
+
+tsb_do_dtlb_fault:
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
+
+tsb_do_itlb_fault:
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+	.globl	sparc64_realfault_common
+sparc64_realfault_common:
+	stb	%g4, [%g6 + TI_FAULT_CODE]	! Save fault code
+	stx	%g5, [%g6 + TI_FAULT_ADDR]	! Save fault address
+	ba,pt	%xcc, etrap			! Save trap state
+1:	 rd	%pc, %g7			! ...
+	call	do_sparc64_fault		! Call fault handler
+	 add	%sp, PTREGS_OFF, %o0		! Compute pt_regs arg
+	ba,pt	%xcc, rtrap_clr_l6		! Restore cpu state
+	 nop					! Delay slot (fill me)
+
+	.globl	winfix_trampoline
+winfix_trampoline:
+	rdpr	%tpc, %g3			! Prepare winfixup TNPC
+	or	%g3, 0x7c, %g3			! Compute branch offset
+	wrpr	%g3, %tnpc			! Write it into TNPC
+	done					! Trap return
+
+	/* Reload MMU related context switch state at
+	 * schedule() time.
+	 *
+	 * %o0: page table physical address
+	 * %o1:	TSB address
+	 */
+	.globl	tsb_context_switch
+tsb_context_switch:
+	wrpr	%g0, PSTATE_MG | PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV, %pstate
+
+	/* Set page table base alternate global.  */
+	mov	%o0, %g7
+
+	/* XXX can this happen?  */
+	brz,pn	%o1, 9f
+	 nop
+
+	/* Lock TSB into D-TLB.  */
+	sethi		%hi(PAGE_SIZE), %o3
+	and		%o3, %o1, %o3
+	sethi		%hi(TSBMAP_BASE), %o2
+	add		%o2, %o3, %o2
+
+	/* XXX handle PAGE_SIZE != 8K correctly...  */
+	mov	TSB_REG, %g1
+	stxa	%o2, [%g1] ASI_DMMU
+	membar	#Sync
+
+	stxa	%o2, [%g1] ASI_IMMU
+	membar	#Sync
+
+#define KERN_HIGHBITS	((_PAGE_VALID|_PAGE_SZBITS)^0xfffff80000000000)
+#define KERN_LOWBITS	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W | _PAGE_L)
+	sethi		%uhi(KERN_HIGHBITS), %g2
+	or		%g2, %ulo(KERN_HIGHBITS), %g2
+	sllx		%g2, 32, %g2
+	or		%g2, KERN_LOWBITS, %g2
+#undef KERN_HIGHBITS
+#undef KERN_LOWBITS
+
+	xor		%o1, %g2, %o1	
+
+	/* We use entry 61 for this locked entry.  This is the spitfire
+	 * TLB entry number, and luckily cheetah masks the value with
+	 * 15, leaving us with entry 13, which is what we want in that
+	 * case too.
+	 *
+	 * XXX Interactions with prom_world()...
+	 */
+	mov		TLB_TAG_ACCESS, %g1
+	stxa		%o2, [%g1] ASI_DMMU
+	membar		#Sync
+	mov		(61 << 3), %g1
+	stxa		%o1, [%g1] ASI_DTLB_DATA_ACCESS
+	membar		#Sync
+
+9:
+	wrpr	%g0, PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV | PSTATE_IE, %pstate
+
+	retl
+	 mov	%o2, %o0
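
tsb_context_switch is the new hook the C side calls on every address-space switch: %o0 is __pa(mm->pgd) and %o1 is the TSB page allocated in init_new_context(). The "entry 61" comment near the end hides a little arithmetic; the toy check below (plain C, purely illustrative, not part of the patch) makes it concrete.

#include <assert.h>

/* Spitfire addresses its locked TLB slots directly, while (per the
 * comment above) cheetah masks the slot number with 15; the ASI access
 * address is the slot number shifted left by 3.
 */
int main(void)
{
	assert((61 & 15) == 13);	/* cheetah ends up on entry 13      */
	assert((61 << 3) == 0x1e8);	/* the "mov (61 << 3), %g1" value   */
	return 0;
}
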
diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S
index 8365bc1f81f3105f969ad938c3b3de3cebf221e7..56f060c8fbf0fc65bcd465718a06cd24741130e3 100644
--- a/arch/sparc64/kernel/ttable.S
+++ b/arch/sparc64/kernel/ttable.S
@@ -78,9 +78,9 @@ tl0_vaw:	TRAP(do_vaw)
 tl0_cee:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_cee_trap)
 tl0_iamiss:
-#include	"itlb_base.S"
+#include	"itlb_miss.S"
 tl0_damiss:
-#include	"dtlb_base.S"
+#include	"dtlb_miss.S"
 tl0_daprot:
 #include	"dtlb_prot.S"
 tl0_fecc:	BTRAP(0x70)	/* Fast-ECC on Cheetah */
@@ -241,7 +241,7 @@ tl1_cee:	membar	#Sync
 
 tl1_iamiss:	BTRAPTL1(0x64) BTRAPTL1(0x65) BTRAPTL1(0x66) BTRAPTL1(0x67)
 tl1_damiss:
-#include	"dtlb_backend.S"
+#include	"dtlb_miss.S"
 tl1_daprot:
 #include	"dtlb_prot.S"
 tl1_fecc:	BTRAPTL1(0x70)	/* Fast-ECC on Cheetah */
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index 467d13a0d5c1314fa99c645ac62f5699ee7cb3d3..f018aaf4548601dc777be93914d3c20341135629 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -43,6 +43,9 @@ SECTIONS
   __ex_table : { *(__ex_table) }
   __stop___ex_table = .;
 
+  . = ALIGN(8192);
+  swapper_tsb = .;
+  . += 8192;
   . = ALIGN(8192);
   __init_begin = .;
   .init.text : { 
diff --git a/arch/sparc64/kernel/winfixup.S b/arch/sparc64/kernel/winfixup.S
index 39160926267b61b7ee816a90c25dbd014b467dfa..f5d93aa99cbb1659e2adb7986410a4c0b0c0c81b 100644
--- a/arch/sparc64/kernel/winfixup.S
+++ b/arch/sparc64/kernel/winfixup.S
@@ -85,6 +85,7 @@ fill_fixup:
 	mov		%o7, %g6
 	ldx		[%g6 + TI_TASK], %g4
 #ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 	mov		TSB_REG, %g1
 	ldxa		[%g1] ASI_IMMU, %g5
 #endif
@@ -209,6 +210,7 @@ fill_fixup_mna:
 	mov		%o7, %g6			! Get current back.
 	ldx		[%g6 + TI_TASK], %g4		! Finish it.
 #ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 	mov		TSB_REG, %g1
 	ldxa		[%g1] ASI_IMMU, %g5
 #endif
@@ -278,11 +280,6 @@ window_mna_from_user_common:
 	ba,pt		%xcc, rtrap
 	 clr		%l6
 	
-	/* These are only needed for 64-bit mode processes which
-	 * put their stack pointer into the VPTE area and there
-	 * happens to be a VPTE tlb entry mapped there during
-	 * a spill/fill trap to that stack frame.
-	 */
 	.globl		winfix_dax, fill_fixup_dax, spill_fixup_dax
 winfix_dax:
 	andn		%g3, 0x7f, %g3
@@ -318,6 +315,7 @@ fill_fixup_dax:
 	mov		%o7, %g6			! Get current back.
 	ldx		[%g6 + TI_TASK], %g4		! Finish it.
 #ifdef CONFIG_SMP
+#error IMMU TSB usage must be fixed
 	mov		TSB_REG, %g1
 	ldxa		[%g1] ASI_IMMU, %g5
 #endif
diff --git a/arch/sparc64/mm/Makefile b/arch/sparc64/mm/Makefile
index 9d0960e69f487a94c0f23b53050267f56f35b1e9..e415bf942bcd33ce2c426ae1f7f5f4d723ba8508 100644
--- a/arch/sparc64/mm/Makefile
+++ b/arch/sparc64/mm/Makefile
@@ -5,6 +5,6 @@
 EXTRA_AFLAGS := -ansi
 EXTRA_CFLAGS := -Werror
 
-obj-y    := ultra.o tlb.o fault.o init.o generic.o
+obj-y    := ultra.o tlb.o tsb.o fault.o init.o generic.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 1e44ee26cee8470d8bfc97e0ddb9c29f01383e5c..da068f6b2595b1df5566a1c42ce941f30155bd8f 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -408,8 +408,7 @@ unsigned long prom_virt_to_phys(unsigned long promva, int *error)
 
 /* The obp translations are saved based on 8k pagesize, since obp can
  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
- * HI_OBP_ADDRESS range are handled in ktlb.S and do not use the vpte
- * scheme (also, see rant in inherit_locked_prom_mappings()).
+ * HI_OBP_ADDRESS range are handled in ktlb.S.
  */
 static inline int in_obp_range(unsigned long vaddr)
 {
@@ -539,75 +538,6 @@ static void __init inherit_prom_mappings(void)
 	prom_printf("done.\n");
 }
 
-/* The OBP specifications for sun4u mark 0xfffffffc00000000 and
- * upwards as reserved for use by the firmware (I wonder if this
- * will be the same on Cheetah...).  We use this virtual address
- * range for the VPTE table mappings of the nucleus so we need
- * to zap them when we enter the PROM.  -DaveM
- */
-static void __flush_nucleus_vptes(void)
-{
-	unsigned long prom_reserved_base = 0xfffffffc00000000UL;
-	int i;
-
-	/* Only DTLB must be checked for VPTE entries. */
-	if (tlb_type == spitfire) {
-		for (i = 0; i < 63; i++) {
-			unsigned long tag;
-
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no cheetah+
-			 *       page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			tag = spitfire_get_dtlb_tag(i);
-			if (((tag & ~(PAGE_MASK)) == 0) &&
-			    ((tag &  (PAGE_MASK)) >= prom_reserved_base)) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				spitfire_put_dtlb_data(i, 0x0UL);
-			}
-		}
-	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		for (i = 0; i < 512; i++) {
-			unsigned long tag = cheetah_get_dtlb_tag(i, 2);
-
-			if ((tag & ~PAGE_MASK) == 0 &&
-			    (tag & PAGE_MASK) >= prom_reserved_base) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				cheetah_put_dtlb_data(i, 0x0UL, 2);
-			}
-
-			if (tlb_type != cheetah_plus)
-				continue;
-
-			tag = cheetah_get_dtlb_tag(i, 3);
-
-			if ((tag & ~PAGE_MASK) == 0 &&
-			    (tag & PAGE_MASK) >= prom_reserved_base) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				cheetah_put_dtlb_data(i, 0x0UL, 3);
-			}
-		}
-	} else {
-		/* Implement me :-) */
-		BUG();
-	}
-}
-
 static int prom_ditlb_set;
 struct prom_tlb_entry {
 	int		tlb_ent;
@@ -635,9 +565,6 @@ void prom_world(int enter)
 			     : "i" (PSTATE_IE));
 
 	if (enter) {
-		/* Kick out nucleus VPTEs. */
-		__flush_nucleus_vptes();
-
 		/* Install PROM world. */
 		for (i = 0; i < 16; i++) {
 			if (prom_dtlb[i].tlb_ent != -1) {
@@ -1039,18 +966,7 @@ out:
 struct pgtable_cache_struct pgt_quicklists;
 #endif
 
-/* OK, we have to color these pages. The page tables are accessed
- * by non-Dcache enabled mapping in the VPTE area by the dtlb_backend.S
- * code, as well as by PAGE_OFFSET range direct-mapped addresses by 
- * other parts of the kernel. By coloring, we make sure that the tlbmiss 
- * fast handlers do not get data from old/garbage dcache lines that 
- * correspond to an old/stale virtual address (user/kernel) that 
- * previously mapped the pagetable page while accessing vpte range 
- * addresses. The idea is that if the vpte color and PAGE_OFFSET range 
- * color is the same, then when the kernel initializes the pagetable 
- * using the later address range, accesses with the first address
- * range will see the newly initialized data rather than the garbage.
- */
+/* XXX We don't need to color these things in the D-cache any longer.  */
 #ifdef DCACHE_ALIASING_POSSIBLE
 #define DC_ALIAS_SHIFT	1
 #else
@@ -1419,6 +1335,9 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	kernel_map_range(phys_start, phys_end,
 			 (enable ? PAGE_KERNEL : __pgprot(0)));
 
+	flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
+			       PAGE_OFFSET + phys_end);
+
 	/* we should perform an IPI and flush all tlbs,
 	 * but that can deadlock->flush only current cpu.
 	 */
diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 8b104be4662b3089e3ba5344cb61e13d85608161..78357cc2a0b74e1d1242f38e5259a4452b04a129 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -25,6 +25,8 @@ void flush_tlb_pending(void)
 	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
 
 	if (mp->tlb_nr) {
+		flush_tsb_user(mp);
+
 		if (CTX_VALID(mp->mm->context)) {
 #ifdef CONFIG_SMP
 			smp_flush_tlb_pending(mp->mm, mp->tlb_nr,
@@ -89,62 +91,3 @@ no_cache_flush:
 	if (nr >= TLB_BATCH_NR)
 		flush_tlb_pending();
 }
-
-void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
-{
-	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
-	unsigned long nr = mp->tlb_nr;
-	long s = start, e = end, vpte_base;
-
-	if (mp->fullmm)
-		return;
-
-	/* If start is greater than end, that is a real problem.  */
-	BUG_ON(start > end);
-
-	/* However, straddling the VA space hole is quite normal. */
-	s &= PMD_MASK;
-	e = (e + PMD_SIZE - 1) & PMD_MASK;
-
-	vpte_base = (tlb_type == spitfire ?
-		     VPTE_BASE_SPITFIRE :
-		     VPTE_BASE_CHEETAH);
-
-	if (unlikely(nr != 0 && mm != mp->mm)) {
-		flush_tlb_pending();
-		nr = 0;
-	}
-
-	if (nr == 0)
-		mp->mm = mm;
-
-	start = vpte_base + (s >> (PAGE_SHIFT - 3));
-	end = vpte_base + (e >> (PAGE_SHIFT - 3));
-
-	/* If the request straddles the VA space hole, we
-	 * need to swap start and end.  The reason this
-	 * occurs is that "vpte_base" is the center of
-	 * the linear page table mapping area.  Thus,
-	 * high addresses with the sign bit set map to
-	 * addresses below vpte_base and non-sign bit
-	 * addresses map to addresses above vpte_base.
-	 */
-	if (end < start) {
-		unsigned long tmp = start;
-
-		start = end;
-		end = tmp;
-	}
-
-	while (start < end) {
-		mp->vaddrs[nr] = start;
-		mp->tlb_nr = ++nr;
-		if (nr >= TLB_BATCH_NR) {
-			flush_tlb_pending();
-			nr = 0;
-		}
-		start += PAGE_SIZE;
-	}
-	if (nr)
-		flush_tlb_pending();
-}
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
new file mode 100644
index 0000000000000000000000000000000000000000..15e8af58b1d2f98347eb7d64672c9ff303ab2dcc
--- /dev/null
+++ b/arch/sparc64/mm/tsb.c
@@ -0,0 +1,84 @@
+/* arch/sparc64/mm/tsb.c
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#define TSB_ENTRY_ALIGNMENT	16
+
+struct tsb {
+	unsigned long tag;
+	unsigned long pte;
+} __attribute__((aligned(TSB_ENTRY_ALIGNMENT)));
+
+/* We use an 8K TSB for the whole kernel; this lets us
+ * handle about 4MB of modules and vmalloc mappings without
+ * incurring many hash conflicts.
+ */
+#define KERNEL_TSB_SIZE_BYTES	8192
+#define KERNEL_TSB_NENTRIES \
+	(KERNEL_TSB_SIZE_BYTES / sizeof(struct tsb))
+
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+static inline unsigned long tsb_hash(unsigned long vaddr)
+{
+	vaddr >>= PAGE_SHIFT;
+	return vaddr & (KERNEL_TSB_NENTRIES - 1);
+}
+
+static inline int tag_compare(struct tsb *entry, unsigned long vaddr, unsigned long context)
+{
+	if (context == ~0UL)
+		return 1;
+
+	return (entry->tag == ((vaddr >> 22) | (context << 48)));
+}
+
+/* TSB flushes need only occur on the processor initiating the address
+ * space modification, not on each cpu the address space has run on.
+ * Only the TLB flush needs that treatment.
+ */
+
+void flush_tsb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned long v;
+
+	for (v = start; v < end; v += PAGE_SIZE) {
+		struct tsb *ent = &swapper_tsb[tsb_hash(v)];
+
+		if (tag_compare(ent, v, 0)) {
+			ent->tag = 0UL;
+			membar_storeload_storestore();
+		}
+	}
+}
+
+void flush_tsb_user(struct mmu_gather *mp)
+{
+	struct mm_struct *mm = mp->mm;
+	struct tsb *tsb = (struct tsb *) mm->context.sparc64_tsb;
+	unsigned long ctx = ~0UL;
+	int i;
+
+	if (CTX_VALID(mm->context))
+		ctx = CTX_HWBITS(mm->context);
+
+	for (i = 0; i < mp->tlb_nr; i++) {
+		unsigned long v = mp->vaddrs[i];
+		struct tsb *ent;
+
+		v &= ~0x1UL;
+
+		ent = &tsb[tsb_hash(v)];
+		if (tag_compare(ent, v, ctx)) {
+			ent->tag = 0UL;
+			membar_storeload_storestore();
+		}
+	}
+}
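
Two quick sanity checks on the sizing comment and the hash in this file, as a standalone snippet rather than kernel code: 8 KB of 16-byte entries gives 512 slots, so with 8K pages the kernel TSB covers roughly 4 MB before conflicts start, and any two addresses exactly 4 MB apart collide in tsb_hash(). The numbers below simply restate what the file already defines.

#include <assert.h>

int main(void)
{
	/* 8192-byte TSB, 16 bytes per {tag, pte} entry. */
	assert(8192 / 16 == 512);
	assert(512 * 8192 == 4 << 20);		/* ~4MB of 8K pages         */

	/* Same hash slot for two vaddrs exactly 4MB apart. */
	unsigned long a = 0x600000UL, b = a + (4UL << 20);
	assert(((a >> 13) & 511) == ((b >> 13) & 511));
	return 0;
}
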
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index e4c9151fa116a16d2879a2a2243be64bb1e8e07d..22791f29552e5b3dac954f61ac2156fda113a3aa 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -453,64 +453,6 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
 	nop
 	nop
 
-	.data
-
-errata32_hwbug:
-	.xword	0
-
-	.text
-
-	/* These two are not performance critical... */
-	.globl		xcall_flush_tlb_all_spitfire
-xcall_flush_tlb_all_spitfire:
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-	clr		%g2
-	clr		%g3
-1:	ldxa		[%g3] ASI_DTLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_DMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_DTLB_DATA_ACCESS
-	membar		#Sync
-
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	ldxa		[%g3] ASI_ITLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_IMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_ITLB_DATA_ACCESS
-	membar		#Sync
-
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	add		%g2, 1, %g2
-	cmp		%g2, SPITFIRE_HIGHEST_LOCKED_TLBENT
-	ble,pt		%icc, 1b
-	 sll		%g2, 3, %g3
-	flush		%g6
-	retry
-
-	.globl		xcall_flush_tlb_all_cheetah
-xcall_flush_tlb_all_cheetah:
-	mov		0x80, %g2
-	stxa		%g0, [%g2] ASI_DMMU_DEMAP
-	stxa		%g0, [%g2] ASI_IMMU_DEMAP
-	retry
-
 	/* These just get rescheduled to PIL vectors. */
 	.globl		xcall_call_function
 xcall_call_function:
diff --git a/include/asm-sparc64/mmu.h b/include/asm-sparc64/mmu.h
index 8627eed6e83dfa7a59eba88c64c72ee00b157857..36384cf7faa60c3ce85ddaa1b63b6516da34cca0 100644
--- a/include/asm-sparc64/mmu.h
+++ b/include/asm-sparc64/mmu.h
@@ -92,6 +92,7 @@
 
 typedef struct {
 	unsigned long	sparc64_ctx_val;
+	unsigned long	*sparc64_tsb;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h
index 57ee7b3061897d9810593b53d0c66eabe3b414ce..34640a370ab41fc89b0391baecd5699bc2bbeb5e 100644
--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -25,7 +25,13 @@ extern void get_new_mmu_context(struct mm_struct *mm);
  * This just needs to set mm->context to an invalid context.
  */
 #define init_new_context(__tsk, __mm)	\
-	(((__mm)->context.sparc64_ctx_val = 0UL), 0)
+({	unsigned long __pg = get_zeroed_page(GFP_KERNEL); \
+	(__mm)->context.sparc64_ctx_val = 0UL; \
+	(__mm)->context.sparc64_tsb = \
+	  (unsigned long *) __pg; \
+	(__pg ? 0 : -ENOMEM); \
+})
+
 
 /* Destroy a dead context.  This occurs when mmput drops the
  * mm_users count to zero, the mmaps have been released, and
@@ -35,7 +41,8 @@ extern void get_new_mmu_context(struct mm_struct *mm);
  * this task if valid.
  */
 #define destroy_context(__mm)					\
-do {	spin_lock(&ctx_alloc_lock);				\
+do {	free_page((unsigned long)(__mm)->context.sparc64_tsb);	\
+	spin_lock(&ctx_alloc_lock);				\
 	if (CTX_VALID((__mm)->context)) {			\
 		unsigned long nr = CTX_NRBITS((__mm)->context);	\
 		mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63));	\
@@ -43,35 +50,7 @@ do {	spin_lock(&ctx_alloc_lock);				\
 	spin_unlock(&ctx_alloc_lock);				\
 } while(0)
 
-/* Reload the two core values used by TLB miss handler
- * processing on sparc64.  They are:
- * 1) The physical address of mm->pgd, when full page
- *    table walks are necessary, this is where the
- *    search begins.
- * 2) A "PGD cache".  For 32-bit tasks only pgd[0] is
- *    ever used since that maps the entire low 4GB
- *    completely.  To speed up TLB miss processing we
- *    make this value available to the handlers.  This
- *    decreases the amount of memory traffic incurred.
- */
-#define reload_tlbmiss_state(__tsk, __mm) \
-do { \
-	register unsigned long paddr asm("o5"); \
-	register unsigned long pgd_cache asm("o4"); \
-	paddr = __pa((__mm)->pgd); \
-	pgd_cache = 0UL; \
-	if (task_thread_info(__tsk)->flags & _TIF_32BIT) \
-		pgd_cache = get_pgd_cache((__mm)->pgd); \
-	__asm__ __volatile__("wrpr	%%g0, 0x494, %%pstate\n\t" \
-			     "mov	%3, %%g4\n\t" \
-			     "mov	%0, %%g7\n\t" \
-			     "stxa	%1, [%%g4] %2\n\t" \
-			     "membar	#Sync\n\t" \
-			     "wrpr	%%g0, 0x096, %%pstate" \
-			     : /* no outputs */ \
-			     : "r" (paddr), "r" (pgd_cache),\
-			       "i" (ASI_DMMU), "i" (TSB_REG)); \
-} while(0)
+extern unsigned long tsb_context_switch(unsigned long pgd_pa, unsigned long *tsb);
 
 /* Set MMU context in the actual hardware. */
 #define load_secondary_context(__mm) \
@@ -101,7 +80,8 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 
 	if (!ctx_valid || (old_mm != mm)) {
 		load_secondary_context(mm);
-		reload_tlbmiss_state(tsk, mm);
+		tsb_context_switch(__pa(mm->pgd),
+				   mm->context.sparc64_tsb);
 	}
 
 	/* Even if (mm == old_mm) we _must_ check
@@ -139,7 +119,7 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 
 	load_secondary_context(mm);
 	__flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
-	reload_tlbmiss_state(current, mm);
+	tsb_context_switch(__pa(mm->pgd), mm->context.sparc64_tsb);
 }
 
 #endif /* !(__ASSEMBLY__) */
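
Note that the pre-existing comment above init_new_context() still only talks about setting an invalid context value; after this patch the macro also allocates the per-mm TSB page that tsb_context_switch() and the miss handlers consume. Written out as a function for readability (same logic as the macro, hypothetical name, assuming the usual kernel headers):

static int init_new_context_sketch(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	unsigned long page = get_zeroed_page(GFP_KERNEL);

	mm->context.sparc64_ctx_val = 0UL;
	mm->context.sparc64_tsb = (unsigned long *) page;

	return page ? 0 : -ENOMEM;	/* one zeroed page = empty TSB */
}
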
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index a96067cca96394f8697f5b9d1114e61f0caad328..baf59c00ea4713410e610948d0508b54bdfbd4e0 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -61,6 +61,7 @@ static __inline__ void free_pgd_slow(pgd_t *pgd)
 	free_page((unsigned long)pgd);
 }
 
+/* XXX This crap can die, no longer using virtual page tables... */
 #ifdef DCACHE_ALIASING_POSSIBLE
 #define VPTE_COLOR(address)		(((address) >> (PAGE_SHIFT + 10)) & 1UL)
 #define DCACHE_COLOR(address)		(((address) >> PAGE_SHIFT) & 1UL)
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index f0a9b44d3eb5068c1c70b14520d25c4f5226974b..f3ba1e05819517a24ba50bc8353616e4e3039dfe 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -25,7 +25,8 @@
 #include <asm/const.h>
 
 /* The kernel image occupies 0x4000000 to 0x1000000 (4MB --> 32MB).
- * The page copy blockops can use 0x2000000 to 0x10000000.
+ * The page copy blockops can use 0x2000000 to 0x4000000.
+ * The TSB is mapped in the 0x4000000 to 0x6000000 range.
  * The PROM resides in an area spanning 0xf0000000 to 0x100000000.
  * The vmalloc area spans 0x100000000 to 0x200000000.
  * Since modules need to be in the lowest 32-bits of the address space,
@@ -34,6 +35,7 @@
  * 0x400000000.
  */
 #define	TLBTEMP_BASE		_AC(0x0000000002000000,UL)
+#define	TSBMAP_BASE		_AC(0x0000000004000000,UL)
 #define MODULES_VADDR		_AC(0x0000000010000000,UL)
 #define MODULES_LEN		_AC(0x00000000e0000000,UL)
 #define MODULES_END		_AC(0x00000000f0000000,UL)
@@ -296,11 +298,6 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot)
 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-/* extract the pgd cache used for optimizing the tlb miss
- * slow path when executing 32-bit compat processes
- */
-#define get_pgd_cache(pgd)	((unsigned long) pgd_val(*pgd) << 11)
-
 /* Find an entry in the second-level page table.. */
 #define pmd_offset(pudp, address)	\
 	((pmd_t *) pud_page(*(pudp)) + \
diff --git a/include/asm-sparc64/processor.h b/include/asm-sparc64/processor.h
index cd8d9b4c86587073821ab5144cec09803a5fcaa5..b3889f3f943a2bda36514154c6ba76565dce6ba9 100644
--- a/include/asm-sparc64/processor.h
+++ b/include/asm-sparc64/processor.h
@@ -28,6 +28,8 @@
  * User lives in his very own context, and cannot reference us. Note
  * that TASK_SIZE is a misnomer, it really gives maximum user virtual 
  * address that the kernel will allocate out.
+ *
+ * XXX No longer using virtual page tables, kill this upper limit...
  */
 #define VA_BITS		44
 #ifndef __ASSEMBLY__
@@ -37,18 +39,6 @@
 #endif
 #define TASK_SIZE	((unsigned long)-VPTE_SIZE)
 
-/*
- * The vpte base must be able to hold the entire vpte, half
- * of which lives above, and half below, the base. And it
- * is placed as close to the highest address range as possible.
- */
-#define VPTE_BASE_SPITFIRE	(-(VPTE_SIZE/2))
-#if 1
-#define VPTE_BASE_CHEETAH	VPTE_BASE_SPITFIRE
-#else
-#define VPTE_BASE_CHEETAH	0xffe0000000000000
-#endif
-
 #ifndef __ASSEMBLY__
 
 typedef struct {
diff --git a/include/asm-sparc64/tlbflush.h b/include/asm-sparc64/tlbflush.h
index 3ef9909ac3ac28110ca0b9c62c26397baff41402..9ad5d9c51d42b90f0f82dd338d25aa076fd99fe9 100644
--- a/include/asm-sparc64/tlbflush.h
+++ b/include/asm-sparc64/tlbflush.h
@@ -5,6 +5,11 @@
 #include <linux/mm.h>
 #include <asm/mmu_context.h>
 
+/* TSB flush operations. */
+struct mmu_gather;
+extern void flush_tsb_kernel_range(unsigned long start, unsigned long end);
+extern void flush_tsb_user(struct mmu_gather *mp);
+
 /* TLB flush operations. */
 
 extern void flush_tlb_pending(void);
@@ -14,28 +19,36 @@ extern void flush_tlb_pending(void);
 #define flush_tlb_page(vma,addr)	flush_tlb_pending()
 #define flush_tlb_mm(mm)		flush_tlb_pending()
 
+/* Local cpu only.  */
 extern void __flush_tlb_all(void);
+
 extern void __flush_tlb_page(unsigned long context, unsigned long page, unsigned long r);
 
 extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 #ifndef CONFIG_SMP
 
-#define flush_tlb_all()		__flush_tlb_all()
 #define flush_tlb_kernel_range(start,end) \
-	__flush_tlb_kernel_range(start,end)
+do {	flush_tsb_kernel_range(start,end); \
+	__flush_tlb_kernel_range(start,end); \
+} while (0)
 
 #else /* CONFIG_SMP */
 
-extern void smp_flush_tlb_all(void);
 extern void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
-#define flush_tlb_all()		smp_flush_tlb_all()
 #define flush_tlb_kernel_range(start, end) \
-	smp_flush_tlb_kernel_range(start, end)
+do {	flush_tsb_kernel_range(start,end); \
+	smp_flush_tlb_kernel_range(start, end); \
+} while (0)
 
 #endif /* ! CONFIG_SMP */
 
-extern void flush_tlb_pgtables(struct mm_struct *, unsigned long, unsigned long);
+static inline void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	/* We don't use virtual page tables for TLB miss processing
+	 * any more.  Nowadays we use the TSB.
+	 */
+}
 
 #endif /* _SPARC64_TLBFLUSH_H */
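
flush_tlb_kernel_range() now pairs a TSB invalidation with the TLB flush, but flush_tsb_kernel_range() itself is implemented outside these headers and is not shown in this patch. The sketch below is only a guess at its shape, derived from the kernel-TSB geometry that KERN_TSB_LOOKUP_TL1 in the new tsb.h assumes (512 slots of 16 bytes, indexed by vaddr >> 13, an 8K base page size, tag == vaddr >> 22 for the nucleus context, and a zero tag meaning "invalid"); the struct and symbol layout here is an assumption.

/* Hypothetical sketch, not the patch's implementation. */
struct tsb_ent {
	unsigned long tag;	/* context | VA bits 63:22; 0 == invalid */
	unsigned long pte;
};

extern struct tsb_ent swapper_tsb[512];

void flush_tsb_kernel_range(unsigned long start, unsigned long end)
{
	unsigned long v;

	for (v = start; v < end; v += 8192UL) {
		struct tsb_ent *ent = &swapper_tsb[(v >> 13) & (512UL - 1)];

		/* Kernel mappings live in context 0, so the stored tag is
		 * just the VA shifted down; clearing it forces the miss
		 * handler's TSB lookup to fail and take the slow path.
		 * Real code would follow this with a store barrier.
		 */
		if (ent->tag == (v >> 22))
			ent->tag = 0UL;
	}
}
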
diff --git a/include/asm-sparc64/tsb.h b/include/asm-sparc64/tsb.h
new file mode 100644
index 0000000000000000000000000000000000000000..03d272e0e477467fd15362ea49658c5554ba8c9c
--- /dev/null
+++ b/include/asm-sparc64/tsb.h
@@ -0,0 +1,165 @@
+#ifndef _SPARC64_TSB_H
+#define _SPARC64_TSB_H
+
+/* The sparc64 TSB is similar to the powerpc hashtables.  It's a
+ * power-of-2 sized table of TAG/PTE pairs.  The cpu precomputes
+ * pointers into this table for 8K and 64K page sizes, and also a
+ * comparison TAG based upon the virtual address and context which
+ * faults.
+ *
+ * TLB miss trap handler software does the actual lookup via something
+ * of the form:
+ *
+ * 	ldxa		[%g0] ASI_{D,I}MMU_TSB_8KB_PTR, %g1
+ * 	ldxa		[%g0] ASI_{D,I}MMU, %g6
+ * 	ldda		[%g1] ASI_NUCLEUS_QUAD_LDD, %g4
+ * 	cmp		%g4, %g6
+ * 	bne,pn	%xcc, tsb_miss_{d,i}tlb
+ * 	 mov		FAULT_CODE_{D,I}TLB, %g3
+ * 	stxa		%g5, [%g0] ASI_{D,I}TLB_DATA_IN
+ * 	retry
+ *
+ *
+ * Each 16-byte slot of the TSB is the 8-byte tag and then the 8-byte
+ * PTE.  The TAG is of the same layout as the TLB TAG TARGET mmu
+ * register which is:
+ *
+ * -------------------------------------------------
+ * |  -  |  CONTEXT |  -  |    VADDR bits 63:22    |
+ * -------------------------------------------------
+ *  63 61 60      48 47 42 41                     0
+ *
+ * Like the powerpc hashtables we need to use locking in order to
+ * synchronize while we update the entries.  PTE updates need locking
+ * as well.
+ *
+ * We need to carefully choose a lock bit for the TSB entry.  We
+ * choose to use bit 47 in the tag.  Also, since we never map anything
+ * at page zero in context zero, we use zero as an invalid tag entry.
+ * When the lock bit is set, this forces a tag comparison failure.
+ *
+ * Currently, we allocate an 8K TSB per-process and we use it for both
+ * I-TLB and D-TLB misses.  Perhaps at some point we'll add code that
+ * monitors the number of active pages in the process as we get
+ * major/minor faults, and grow the TSB in response.  The only trick
+ * in implementing that is synchronizing the freeing of the old TSB
+ * wrt. parallel TSB updates occurring on other processors.  One
+ * possible solution is to use RCU for the freeing of the TSB.
+ */
+
+#define TSB_TAG_LOCK	(1 << (47 - 32))
+
+#define TSB_MEMBAR	membar	#StoreStore
+
+#define TSB_LOCK_TAG(TSB, REG1, REG2)	\
+99:	lduwa	[TSB] ASI_N, REG1;	\
+	sethi	%hi(TSB_TAG_LOCK), REG2;\
+	andcc	REG1, REG2, %g0;	\
+	bne,pn	%icc, 99b;		\
+	 nop;				\
+	casa	[TSB] ASI_N, REG1, REG2;\
+	cmp	REG1, REG2;		\
+	bne,pn	%icc, 99b;		\
+	 nop;				\
+	TSB_MEMBAR
+
+#define TSB_WRITE(TSB, TTE, TAG)	   \
+	stx		TTE, [TSB + 0x08]; \
+	TSB_MEMBAR;			   \
+	stx		TAG, [TSB + 0x00];
+
+	/* Do a kernel page table walk.  Leaves physical PTE pointer in
+	 * REG1.  Jumps to FAIL_LABEL on early page table walk termination.
+	 * VADDR will not be clobbered, but REG2 will.
+	 */
+#define KERN_PGTABLE_WALK(VADDR, REG1, REG2, FAIL_LABEL)	\
+	sethi		%hi(swapper_pg_dir), REG1; \
+	or		REG1, %lo(swapper_pg_dir), REG1; \
+	sllx		VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	andn		REG2, 0x3, REG2; \
+	lduw		[REG1 + REG2], REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	sllx		REG1, 11, REG1; \
+	andn		REG2, 0x3, REG2; \
+	lduwa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - PMD_SHIFT, REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	sllx		REG1, 11, REG1; \
+	andn		REG2, 0x7, REG2; \
+	add		REG1, REG2, REG1;
+
+	/* Do a user page table walk in MMU globals.  Leaves physical PTE
+	 * pointer in REG1.  Jumps to FAIL_LABEL on early page table walk
+	 * termination.  Physical base of page tables is in PHYS_PGD which
+	 * will not be modified.
+	 *
+	 * VADDR will not be clobbered, but REG1 and REG2 will.
+	 */
+#define USER_PGTABLE_WALK_TL1(VADDR, PHYS_PGD, REG1, REG2, FAIL_LABEL)	\
+	sllx		VADDR, 64 - (PGDIR_SHIFT + PGDIR_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	andn		REG2, 0x3, REG2; \
+	lduwa		[PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	sllx		REG1, 11, REG1; \
+	andn		REG2, 0x3, REG2; \
+	lduwa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - PMD_SHIFT, REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	sllx		REG1, 11, REG1; \
+	andn		REG2, 0x7, REG2; \
+	add		REG1, REG2, REG1;
+
+/* Look up an OBP mapping on VADDR in the prom_trans[] table at TL>0.
+ * If no entry is found, FAIL_LABEL will be branched to.  On success
+ * the resulting PTE value will be left in REG1.  VADDR is preserved
+ * by this routine.
+ */
+#define OBP_TRANS_LOOKUP(VADDR, REG1, REG2, REG3, FAIL_LABEL) \
+	sethi		%hi(prom_trans), REG1; \
+	or		REG1, %lo(prom_trans), REG1; \
+97:	ldx		[REG1 + 0x00], REG2; \
+	brz,pn		REG2, FAIL_LABEL; \
+	 nop; \
+	ldx		[REG1 + 0x08], REG3; \
+	add		REG2, REG3, REG3; \
+	cmp		REG2, VADDR; \
+	bgu,pt		%xcc, 98f; \
+	 cmp		VADDR, REG3; \
+	bgeu,pt		%xcc, 98f; \
+	 ldx		[REG1 + 0x10], REG3; \
+	sub		VADDR, REG2, REG2; \
+	ba,pt		%xcc, 99f; \
+	 add		REG3, REG2, REG1; \
+98:	ba,pt		%xcc, 97b; \
+	 add		REG1, (3 * 8), REG1; \
+99:
+
+	/* Do a kernel TSB lookup at tl>0 on VADDR+TAG, branch to OK_LABEL
+	 * on TSB hit.  REG1, REG2, REG3, and REG4 are used as temporaries
+	 * and the found TTE will be left in REG1.  REG3 and REG4 must
+	 * be an even/odd pair of registers.
+	 *
+	 * VADDR and TAG will be preserved and not clobbered by this macro.
+	 */
+	/* XXX non-8K base page size support... */
+#define KERN_TSB_LOOKUP_TL1(VADDR, TAG, REG1, REG2, REG3, REG4, OK_LABEL) \
+	sethi		%hi(swapper_tsb), REG1; \
+	or		REG1, %lo(swapper_tsb), REG1; \
+	srlx		VADDR, 13, REG2; \
+	and		REG2, (512 - 1), REG2; \
+	sllx		REG2, 4, REG2; \
+	add		REG1, REG2, REG2; \
+	ldda		[REG2] ASI_NUCLEUS_QUAD_LDD, REG3; \
+	cmp		REG3, TAG; \
+	be,a,pt		%xcc, OK_LABEL; \
+	 mov		REG4, REG1;
+
+#endif /* !(_SPARC64_TSB_H) */
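
For readers who do not speak sparc64 assembler, the TSB_LOCK_TAG/TSB_WRITE protocol above can be paraphrased in C roughly as follows. This is a sketch only: the real macros run at trap level, spin on the upper 32-bit word of the tag with lduwa/casa via ASI_N, and order stores with membar #StoreStore; the tsb_insert() helper, the 64-bit compare-and-swap, and the GCC __atomic builtins are illustrative substitutions.

#define TSB_TAG_LOCK_BIT	(1UL << 47)	/* bit 47 of the tag */

struct tsb_entry {
	unsigned long tag;	/* doubles as the per-slot lock word */
	unsigned long pte;
};

static void tsb_insert(struct tsb_entry *ent, unsigned long tag,
		       unsigned long pte)
{
	unsigned long old;

	/* TSB_LOCK_TAG: spin until bit 47 is clear, then atomically
	 * replace the tag with the bare lock value.  A locked (or zero)
	 * tag can never match a real virtual address, so concurrent TLB
	 * misses on this slot simply fall through to the slow path.
	 */
	for (;;) {
		old = __atomic_load_n(&ent->tag, __ATOMIC_RELAXED);
		if (old & TSB_TAG_LOCK_BIT)
			continue;			/* 99b: still held */
		if (__atomic_compare_exchange_n(&ent->tag, &old,
						TSB_TAG_LOCK_BIT, 0,
						__ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			break;				/* cas won the slot */
	}

	/* TSB_WRITE: store the PTE first, then the real tag.  The release
	 * ordering on the tag store stands in for TSB_MEMBAR
	 * (membar #StoreStore); since a valid tag has bit 47 clear, this
	 * final store also drops the lock.
	 */
	__atomic_store_n(&ent->pte, pte, __ATOMIC_RELAXED);
	__atomic_store_n(&ent->tag, tag, __ATOMIC_RELEASE);
}
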