diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c
index 973a4126213902c1924f225f3854e05bc65fc546..dd8d4352ed998e7f09d36a981098edae84830808 100644
--- a/drivers/gpu/drm/nouveau/nv50_display.c
+++ b/drivers/gpu/drm/nouveau/nv50_display.c
@@ -137,8 +137,10 @@ struct nv50_head_atom {
 	} mode;
 
 	struct {
+		bool visible;
 		u32 handle;
 		u64 offset:40;
+		u8  mode:4;
 	} lut;
 
 	struct {
@@ -192,6 +194,7 @@ struct nv50_head_atom {
 
 	union {
 		struct {
+			bool ilut:1;
 			bool core:1;
 			bool curs:1;
 		};
@@ -200,6 +203,7 @@ struct nv50_head_atom {
 
 	union {
 		struct {
+			bool ilut:1;
 			bool core:1;
 			bool curs:1;
 			bool view:1;
@@ -661,7 +665,8 @@ nv50_ovly_create(struct nvif_device *device, struct nvif_object *disp,
 struct nv50_head {
 	struct nouveau_crtc base;
 	struct {
-		struct nouveau_bo *nvbo[1];
+		struct nouveau_bo *nvbo[2];
+		int next;
 	} lut;
 	struct nv50_ovly ovly;
 	struct nv50_oimm oimm;
@@ -1797,6 +1802,54 @@ nv50_head_lut_clr(struct nv50_head *head)
 	}
 }
 
+static void
+nv50_head_lut_load(struct drm_property_blob *blob, int mode,
+		   struct nouveau_bo *nvbo)
+{
+	struct drm_color_lut *src = (struct drm_color_lut *)blob->data;
+	void __iomem *dst = (u8 *)nvbo_kmap_obj_iovirtual(nvbo);
+	const int count = blob->length / sizeof(*src);
+	int bits, shift, i;
+	u16 bias, r, g, b;
+
+	/* Anything but a 256-entry table is rejected; bailing out here
+	 * also guarantees r/g/b are assigned before the final writes.
+	 */
+	if (WARN_ON(count != 256))
+		return;
+
+	if (mode == 0 || mode == 1) { /* LORES, HIRES. */
+		bits = 11;
+		shift = 3;
+		bias = 0x0000;
+	} else if (mode == 7) { /* INTERPOLATE_257_UNITY_RANGE. */
+		bits = 14;
+		shift = 0;
+		bias = 0x6000;
+	} else {
+		WARN_ON(1);
+		return;
+	}
+
+	/* Entries sit 8 bytes apart: 16-bit R, G, B at +0, +2, +4. */
+	for (i = 0; i < count; i++, dst += 0x08) {
+		r = (drm_color_lut_extract(src[i].red, bits) + bias) << shift;
+		g = (drm_color_lut_extract(src[i].green, bits) + bias) << shift;
+		b = (drm_color_lut_extract(src[i].blue, bits) + bias) << shift;
+		writew(r, dst + 0);
+		writew(g, dst + 2);
+		writew(b, dst + 4);
+	}
+
+	/* INTERPOLATE modes require a "next" entry to interpolate with,
+	 * so the last entry is replicated at index 256 for now; dst
+	 * already points there once the loop has finished.
+	 */
+	writew(r, dst + 0);
+	writew(g, dst + 2);
+	writew(b, dst + 4);
+}
+
 static void
 nv50_head_lut_set(struct nv50_head *head, struct nv50_head_atom *asyh)
 {
@@ -1805,20 +1858,18 @@ nv50_head_lut_set(struct nv50_head *head, struct nv50_head_atom *asyh)
 	if ((push = evo_wait(core, 7))) {
 		if (core->base.user.oclass < G82_DISP_CORE_CHANNEL_DMA) {
 			evo_mthd(push, 0x0840 + (head->base.index * 0x400), 2);
-			evo_data(push, asyh->base.depth == 8 ?
-				 0x80000000 : 0xc0000000);
+			evo_data(push, 0x80000000 | asyh->lut.mode << 30);
 			evo_data(push, asyh->lut.offset >> 8);
 		} else
 		if (core->base.user.oclass < GF110_DISP_CORE_CHANNEL_DMA) {
 			evo_mthd(push, 0x0840 + (head->base.index * 0x400), 2);
-			evo_data(push, asyh->base.depth == 8 ?
-				 0x80000000 : 0xc0000000);
+			evo_data(push, 0x80000000 | asyh->lut.mode << 30);
 			evo_data(push, asyh->lut.offset >> 8);
 			evo_mthd(push, 0x085c + (head->base.index * 0x400), 1);
 			evo_data(push, asyh->lut.handle);
 		} else {
 			evo_mthd(push, 0x0440 + (head->base.index * 0x300), 4);
-			evo_data(push, 0x87000000);
+			evo_data(push, 0x80000000 | asyh->lut.mode << 24);
 			evo_data(push, asyh->lut.offset >> 8);
 			evo_data(push, 0x00000000);
 			evo_data(push, 0x00000000);
@@ -1901,7 +1952,7 @@ nv50_head_view(struct nv50_head *head, struct nv50_head_atom *asyh)
 static void
 nv50_head_flush_clr(struct nv50_head *head, struct nv50_head_atom *asyh, bool y)
 {
-	if (asyh->clr.core && (!asyh->set.core || y))
+	if (asyh->clr.ilut && (!asyh->set.ilut || y))
 		nv50_head_lut_clr(head);
 	if (asyh->clr.core && (!asyh->set.core || y))
 		nv50_head_core_clr(head);
@@ -1914,7 +1965,15 @@ nv50_head_flush_set(struct nv50_head *head, struct nv50_head_atom *asyh)
 {
 	if (asyh->set.view   ) nv50_head_view    (head, asyh);
 	if (asyh->set.mode   ) nv50_head_mode    (head, asyh);
-	if (asyh->set.core   ) nv50_head_lut_set (head, asyh);
+	if (asyh->set.ilut   ) {
+		struct nouveau_bo *nvbo = head->lut.nvbo[head->lut.next];
+		struct drm_property_blob *blob = asyh->state.gamma_lut;
+		if (blob)
+			nv50_head_lut_load(blob, asyh->lut.mode, nvbo);
+		asyh->lut.offset = nvbo->bo.offset;
+		head->lut.next ^= 1;
+		nv50_head_lut_set(head, asyh);
+	}
 	if (asyh->set.core   ) nv50_head_core_set(head, asyh);
 	if (asyh->set.curs   ) nv50_head_curs_set(head, asyh);
 	if (asyh->set.base   ) nv50_head_base    (head, asyh);
@@ -2048,6 +2107,37 @@ nv50_head_atomic_check_view(struct nv50_head_atom *armh,
 	asyh->set.view = true;
 }
 
+static void
+nv50_head_atomic_check_lut(struct nv50_head *head,
+			   struct nv50_head_atom *armh,
+			   struct nv50_head_atom *asyh)
+{
+	struct nv50_disp *disp = nv50_disp(head->base.base.dev);
+	const bool indexed = asyh->base.cpp == 1;
+
+	/* No gamma LUT normally means the input LUT gets disabled.
+	 * An I8 surface is the exception: it makes no sense without an
+	 * input LUT, and EVO will throw an error if you try.  Legacy
+	 * clients actually cause this due to the order in which they
+	 * call ioctls, so the LUT stays enabled with whatever contents
+	 * the buffer already contains to avoid triggering the error.
+	 */
+	if (!asyh->state.gamma_lut && !indexed) {
+		asyh->lut.handle = 0;
+		asyh->clr.ilut = armh->lut.visible;
+		return;
+	}
+
+	asyh->lut.handle = disp->mast.base.vram.handle;
+	if (disp->disp->oclass >= GF110_DISP) {
+		asyh->lut.mode = 7; /* INTERPOLATE_257_UNITY_RANGE. */
+		asyh->set.ilut = asyh->state.color_mgmt_changed;
+		return;
+	}
+	asyh->lut.mode = indexed ? 0 : 1; /* LORES : HIRES. */
+	asyh->set.ilut = true;
+}
+
 static void
 nv50_head_atomic_check_mode(struct nv50_head *head, struct nv50_head_atom *asyh)
 {
@@ -2133,6 +2223,11 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state)
 		if (asyh->state.mode_changed)
 			nv50_head_atomic_check_mode(head, asyh);
 
+		if (asyh->state.color_mgmt_changed ||
+		    asyh->base.cpp != armh->base.cpp)
+			nv50_head_atomic_check_lut(head, armh, asyh);
+		asyh->lut.visible = asyh->lut.handle != 0;
+
 		if (asyc) {
 			if (asyc->set.scaler)
 				nv50_head_atomic_check_view(armh, asyh, asyc);
@@ -2148,7 +2243,8 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state)
 			asyh->core.w = asyh->base.w;
 			asyh->core.h = asyh->base.h;
 		} else
-		if ((asyh->core.visible = asyh->curs.visible)) {
+		if ((asyh->core.visible = asyh->curs.visible) ||
+		    (asyh->core.visible = asyh->lut.visible)) {
 			/*XXX: We need to either find some way of having the
 			 *     primary base layer appear black, while still
 			 *     being able to display the other layers, or we
@@ -2166,11 +2262,10 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state)
 		asyh->core.layout = 1;
 		asyh->core.block = 0;
 		asyh->core.pitch = ALIGN(asyh->core.w, 64) * 4;
-		asyh->lut.handle = disp->mast.base.vram.handle;
-		asyh->lut.offset = head->lut.nvbo[0]->bo.offset;
 		asyh->set.base = armh->base.cpp != asyh->base.cpp;
 		asyh->set.ovly = armh->ovly.cpp != asyh->ovly.cpp;
 	} else {
+		asyh->lut.visible = false;
 		asyh->core.visible = false;
 		asyh->curs.visible = false;
 		asyh->base.cpp = 0;
@@ -2194,8 +2289,10 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state)
 			asyh->clr.curs = true;
 		}
 	} else {
+		asyh->clr.ilut = armh->lut.visible;
 		asyh->clr.core = armh->core.visible;
 		asyh->clr.curs = armh->curs.visible;
+		asyh->set.ilut = asyh->lut.visible;
 		asyh->set.core = asyh->core.visible;
 		asyh->set.curs = asyh->curs.visible;
 	}
@@ -2205,47 +2302,11 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state)
 	return 0;
 }
 
-static void
-nv50_head_lut_load(struct drm_crtc *crtc)
-{
-	struct nv50_disp *disp = nv50_disp(crtc->dev);
-	struct nv50_head *head = nv50_head(crtc);
-	void __iomem *lut = nvbo_kmap_obj_iovirtual(head->lut.nvbo[0]);
-	u16 *r, *g, *b;
-	int i;
-
-	r = crtc->gamma_store;
-	g = r + crtc->gamma_size;
-	b = g + crtc->gamma_size;
-
-	for (i = 0; i < 256; i++) {
-		if (disp->disp->oclass < GF110_DISP) {
-			writew((*r++ >> 2) + 0x0000, lut + (i * 0x08) + 0);
-			writew((*g++ >> 2) + 0x0000, lut + (i * 0x08) + 2);
-			writew((*b++ >> 2) + 0x0000, lut + (i * 0x08) + 4);
-		} else {
-			/* 0x6000 interferes with the 14-bit color??? */
-			writew((*r++ >> 2) + 0x6000, lut + (i * 0x08) + 0);
-			writew((*g++ >> 2) + 0x6000, lut + (i * 0x08) + 2);
-			writew((*b++ >> 2) + 0x6000, lut + (i * 0x08) + 4);
-		}
-	}
-}
-
 static const struct drm_crtc_helper_funcs
 nv50_head_help = {
 	.atomic_check = nv50_head_atomic_check,
 };
 
-static int
-nv50_head_gamma_set(struct drm_crtc *crtc, u16 *r, u16 *g, u16 *b,
-		    uint32_t size,
-		    struct drm_modeset_acquire_ctx *ctx)
-{
-	nv50_head_lut_load(crtc);
-	return 0;
-}
-
 static void
 nv50_head_atomic_destroy_state(struct drm_crtc *crtc,
 			       struct drm_crtc_state *state)
@@ -2318,7 +2379,7 @@ nv50_head_destroy(struct drm_crtc *crtc)
 static const struct drm_crtc_funcs
 nv50_head_func = {
 	.reset = nv50_head_reset,
-	.gamma_set = nv50_head_gamma_set,
+	.gamma_set = drm_atomic_helper_legacy_gamma_set,
 	.destroy = nv50_head_destroy,
 	.set_config = drm_atomic_helper_set_config,
 	.page_flip = drm_atomic_helper_page_flip,
@@ -4345,7 +4406,6 @@ nv50_display_init(struct drm_device *dev)
 {
 	struct drm_encoder *encoder;
 	struct drm_plane *plane;
-	struct drm_crtc *crtc;
 	u32 *push;
 
 	push = evo_wait(nv50_mast(dev), 32);
@@ -4364,10 +4424,6 @@ nv50_display_init(struct drm_device *dev)
 		}
 	}
 
-	drm_for_each_crtc(crtc, dev) {
-		nv50_head_lut_load(crtc);
-	}
-
 	drm_for_each_plane(plane, dev) {
 		struct nv50_wndw *wndw = nv50_wndw(plane);
 		if (plane->funcs != &nv50_wndw)