From 2d5d16b95db8b8ed266d103e4f2289e2115273f1 Mon Sep 17 00:00:00 2001
From: Jerome Glisse <jglisse@redhat.com>
Date: Wed, 8 Feb 2012 13:40:04 -0500
Subject: [PATCH] r600g: add htile support v2

htile is used for HiZ and HiS support and for fast Z/S clears.
This commit just adds the htile setup. Fast Z/S clear is
enabled, but we don't take full advantage of HiS with this
patch. The following registers need more tweaking:
DB_SRESULTS_COMPARE_STATE0
DB_SRESULTS_COMPARE_STATE1
DB_PRELOAD_CONTROL

v2: really use fast clear; there are still random issues with some
    tiles (more flush combinations need to be tried); fix
    depth/stencil texture decompression

Signed-off-by: Pierre-Eric Pelloux-Prayer <pelloux@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 src/gallium/drivers/r600/evergreen_state.c   |   61 ++++++++----
 src/gallium/drivers/r600/r600.h              |    1 +
 src/gallium/drivers/r600/r600_blit.c         |   60 ++++++++++-
 src/gallium/drivers/r600/r600_hw_context.c   |   27 +++++
 src/gallium/drivers/r600/r600_pipe.h         |    7 +-
 src/gallium/drivers/r600/r600_resource.h     |    7 +-
 src/gallium/drivers/r600/r600_state.c        |   86 +++++++++--------
 src/gallium/drivers/r600/r600_state_common.c |   36 +++++++-
 src/gallium/drivers/r600/r600_texture.c      |  134 ++++++++++++++++++++++++++
 src/gallium/drivers/r600/r600d.h             |   14 +++
 10 files changed, 363 insertions(+), 70 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index dd67e4b..083a795 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -854,7 +854,6 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_dsa *dsa = CALLOC_STRUCT(r600_pipe_dsa);
 	unsigned db_depth_control, alpha_test_control, alpha_ref;
-	unsigned db_render_override, db_render_control;
 	struct r600_pipe_state *rstate;
 
 	if (dsa == NULL) {
@@ -900,27 +899,20 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 	}
 	dsa->alpha_ref = alpha_ref;
 
-	/* misc */
-	db_render_control = 0;
-	db_render_override = S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE) |
-		S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
-		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
-	/* TODO db_render_override depends on query */
-	r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, 0x00000000, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, 0x3F800000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286DC_SPI_FOG_CNTL, 0x00000000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control, NULL, 0);
 	/* The DB_SHADER_CONTROL mask is 0xFFFFFFBC since Z_EXPORT_ENABLE,
 	 * STENCIL_EXPORT_ENABLE and KILL_ENABLE are controlled by
 	 * evergreen_pipe_shader_ps().*/
-	r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_02800C_DB_RENDER_OVERRIDE, db_render_override, NULL, 0);
+	/* There are 2 sets of HiS states the hw updates when HiS is enabled.  Each state has an associated
+	 * func/ref/mask.  The results of these 2 states are stored per htile. There are two sets so that
+	 * the driver can update one while the other is in use.  The ENABLE0/1 bits select which one(s)
+	 * are active.
+	 */
 	r600_pipe_state_add_reg(rstate, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_028AC8_DB_PRELOAD_CONTROL, 0x0, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028B70_DB_ALPHA_TO_MASK, 0x0000AA00, NULL, 0);
-	dsa->db_render_override = db_render_override;
 
 	return rstate;
 }
@@ -1642,6 +1634,7 @@ static void evergreen_db(struct r600_context *rctx, struct r600_pipe_state *rsta
 	uint64_t offset;
 	unsigned level, first_layer, pitch, slice, format, array_mode;
 	unsigned macro_aspect, tile_split, bankh, bankw, z_info, nbanks;
+	unsigned db_render_override, db_render_control;
 
 	if (state->zsbuf == NULL)
 		return;
@@ -1769,6 +1762,39 @@ static void evergreen_db(struct r600_context *rctx, struct r600_pipe_state *rsta
 		}
 	}
 
+	db_render_override = S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE) |
+				S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
+				S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
+	db_render_control = rctx->db_render_control;
+	if (rtex->hyperz) {
+		uint64_t htile_offset = rtex->hyperz->surface.level[0].offset;
+
+		htile_offset += r600_resource_va(rctx->context.screen, (void*)rtex->hyperz);
+		r600_pipe_state_add_reg(rstate, R_028014_DB_HTILE_DATA_BASE,
+					htile_offset >> 8, &rtex->hyperz->resource,
+					RADEON_USAGE_READWRITE);
+		r600_pipe_state_add_reg(rstate, R_028ABC_DB_HTILE_SURFACE,
+					rtex->htile_surface, NULL, 0);
+		z_info |= S_028040_TILE_SURFACE_ENABLE(1);
+		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
+		db_render_override = S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_ENABLE) |
+				S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_OFF) |
+				S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_OFF);
+		r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, rtex->sclear_value, NULL, 0);
+		r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, rtex->dclear_value, NULL, 0);
+		r600_pipe_state_add_reg(rstate, R_028AC8_DB_PRELOAD_CONTROL, rtex->db_preload_control, NULL, 0);
+	}
+
+	r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control, NULL, 0);
+	r600_pipe_state_add_reg(rstate, R_02800C_DB_RENDER_OVERRIDE, db_render_override, NULL, 0);
+	rctx->db_render_override = db_render_override;
+	/* clear depth decompress bit, this is needed because we can't save
+	 * state prior to calling blit util function when doing depth decompress.
+	 * So don't remove.
+	 */
+	rctx->db_render_control = 0;
+
+
 	r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO, z_info,
 				&rtex->resource, RADEON_USAGE_READWRITE);
 	r600_pipe_state_add_reg(rstate, R_028058_DB_DEPTH_SIZE,
@@ -2498,6 +2524,9 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 
 	rstate->nregs = 0;
 
+	/* Z order is a driver provided hint to the hw.  In most cases leave it to early then late.
+	 * short shaders -> late Z, medium shaders -> early Z, and long shaders -> ReZ
+	 */
 	db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
 	for (i = 0; i < rshader->ninput; i++) {
 		/* evergreen NUM_INTERP only contains values interpolated into the LDS,
@@ -2726,12 +2755,6 @@ void *evergreen_create_db_flush_dsa(struct r600_context *rctx)
 	memset(&dsa, 0, sizeof(dsa));
 
 	rstate = rctx->context.create_depth_stencil_alpha_state(&rctx->context, &dsa);
-	r600_pipe_state_add_reg(rstate,
-				R_028000_DB_RENDER_CONTROL,
-				S_028000_DEPTH_COPY_ENABLE(1) |
-				S_028000_STENCIL_COPY_ENABLE(1) |
-				S_028000_COPY_CENTROID(1),
-				NULL, 0);
 	return rstate;
 }
 
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 229fa70..29ae8e9 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -238,6 +238,7 @@ void r600_inval_shader_cache(struct r600_context *ctx);
 void r600_inval_texture_cache(struct r600_context *ctx);
 void r600_inval_vertex_cache(struct r600_context *ctx);
 void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now);
+void r600_flush_hyperz(struct r600_context *ctx);
 
 void r600_context_streamout_begin(struct r600_context *ctx);
 void r600_context_streamout_end(struct r600_context *ctx);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 2ec39e8..814183b 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -24,6 +24,7 @@
 #include "util/u_blitter.h"
 #include "util/u_format.h"
 #include "r600_pipe.h"
+#include "r600d.h"
 
 enum r600_blitter_op /* bitmask */
 {
@@ -147,6 +148,10 @@ void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_t
 					(struct pipe_resource*)texture->flushed_depth_texture, &surf_tmpl);
 
 			r600_blitter_begin(ctx, R600_DECOMPRESS);
+			rctx->db_render_control |=
+					S_028D0C_DEPTH_COPY_ENABLE(1) |
+					S_028D0C_STENCIL_COPY_ENABLE(1) |
+					S_028D0C_COPY_CENTROID(1);
 			util_blitter_custom_depth_stencil(rctx->blitter, zsurf, cbsurf, rctx->custom_dsa_flush, depth);
 			r600_blitter_end(ctx);
 
@@ -163,7 +168,6 @@ void r600_flush_depth_textures(struct r600_context *rctx)
 	unsigned int i;
 
 	/* FIXME: This handles fragment shader textures only. */
-
 	for (i = 0; i < rctx->ps_samplers.n_views; ++i) {
 		struct r600_pipe_sampler_view *view;
 		struct r600_resource_texture *tex;
@@ -202,12 +206,47 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer;
+	struct r600_resource_texture *rtex;
+
+	/* if hyperz enabled just clear hyperz */
+	if (fb->zsbuf && (buffers & PIPE_CLEAR_DEPTHSTENCIL)) {
+		rtex = (struct r600_resource_texture*)fb->zsbuf->texture;
+		if (rtex->hyperz) {
+			/* set clear value, as we use R600_CLEAR_SURFACE
+			 * the framebuffer state will be reset with proper
+			 * depth clear value
+			 */
+			rtex->dclear_value = depth;
+			rtex->sclear_value = stencil;
+
+			r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
+			rctx->db_render_control = 0;
+			if (buffers & PIPE_CLEAR_DEPTH) {
+				rctx->db_render_control |= S_028D0C_DEPTH_CLEAR_ENABLE(1);
+			}
+			if (buffers & PIPE_CLEAR_STENCIL) {
+				rctx->db_render_control |= S_028D0C_STENCIL_CLEAR_ENABLE(1);
+			}
+			util_blitter_clear_depth_stencil(rctx->blitter,
+							 fb->zsbuf,
+							 buffers & PIPE_CLEAR_DEPTHSTENCIL,
+							 depth, stencil,
+							 0, 0,
+							 fb->zsbuf->texture->width0,
+							 fb->zsbuf->texture->height0);
+			r600_flush_hyperz(ctx);
+			r600_blitter_end(ctx);
+			buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+		}
+	}
 
-	r600_blitter_begin(ctx, R600_CLEAR);
-	util_blitter_clear(rctx->blitter, fb->width, fb->height,
-			   fb->nr_cbufs, buffers, fb->nr_cbufs ? fb->cbufs[0]->format : PIPE_FORMAT_NONE,
-			   color, depth, stencil);
-	r600_blitter_end(ctx);
+	if (buffers) {
+		r600_blitter_begin(ctx, R600_CLEAR);
+		util_blitter_clear(rctx->blitter, fb->width, fb->height,
+				   fb->nr_cbufs, buffers, fb->nr_cbufs ? fb->cbufs[0]->format : PIPE_FORMAT_NONE,
+				   color, depth, stencil);
+		r600_blitter_end(ctx);
+	}
 }
 
 static void r600_clear_render_target(struct pipe_context *ctx,
@@ -233,11 +272,20 @@ static void r600_clear_depth_stencil(struct pipe_context *ctx,
 				     unsigned width, unsigned height)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
+	struct r600_resource_texture *rtex;
+	float dclear_value = depth;
+
+fprintf(stderr, "%s %d ---------------------------------------------------\n", __func__, __LINE__);
+	/* update depth clear value */
+	rtex = (struct r600_resource_texture*)dst->texture;
+	rtex->dclear_value = dclear_value;
+	rtex->sclear_value = stencil;
 
 	r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
 	util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil,
 					 dstx, dsty, width, height);
 	r600_blitter_end(ctx);
+fprintf(stderr, "%s %d ___________________________________________________\n", __func__, __LINE__);
 }
 
 
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index ab51b3e..59530de 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -454,6 +454,8 @@ static const struct r600_reg r600_context_reg_list[] = {
 	{R_028010_DB_DEPTH_INFO, REG_FLAG_NEED_BO, 0},
 	{R_028D0C_DB_RENDER_CONTROL, 0, 0},
 	{R_028D10_DB_RENDER_OVERRIDE, 0, 0},
+	{GROUP_FORCE_NEW_BLOCK, 0, 0},
+	{R_028014_DB_HTILE_DATA_BASE, REG_FLAG_NEED_BO, 0},
 	{R_028D24_DB_HTILE_SURFACE, 0, 0},
 	{R_028D30_DB_PRELOAD_CONTROL, 0, 0},
 	{R_028D34_DB_PREFETCH_LIMIT, 0, 0},
@@ -944,6 +946,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	/* Save 16 dwords for the fence mechanism. */
 	num_dw += 16;
 
+num_dw += 16;
+
 	/* Flush if there's not enough space. */
 	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
 		r600_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
@@ -1412,6 +1416,12 @@ void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
 	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
 }
 
+void r600_flush_hyperz(struct r600_context *ctx)
+{
+	r600_emit_atom(ctx, &ctx->atom_hyperz_flush);
+	r600_atom_dirty(ctx, &ctx->atom_hyperz_flush);
+}
+
 void r600_context_flush(struct r600_context *ctx, unsigned flags)
 {
 	struct radeon_winsys_cs *cs = ctx->cs;
@@ -1433,11 +1443,28 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
 		streamout_suspended = true;
 	}
 
+
 	r600_flush_framebuffer(ctx, true);
 
 	/* partial flush is needed to avoid lockups on some chips with user fences */
 	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+cs->buf[cs->cdw++] = EVENT_TYPE(42) | EVENT_INDEX(0);
+cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+cs->buf[cs->cdw++] = EVENT_TYPE(44) | EVENT_INDEX(0);
+cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+cs->buf[cs->cdw++] = EVENT_TYPE(43) | EVENT_INDEX(0);
+
+	cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+	cs->buf[cs->cdw++] = 0xFFFFFFFF;
+	cs->buf[cs->cdw++] = 0xffffffff;      /* CP_COHER_SIZE */
+	cs->buf[cs->cdw++] = 0;               /* CP_COHER_BASE */
+	cs->buf[cs->cdw++] = 0x0000000A;      /* POLL_INTERVAL */
+
+cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+cs->buf[cs->cdw++] = EVENT_TYPE(22) | EVENT_INDEX(0);
+
 
 	/* force to keep tiling flags */
 	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index f130617..b3f3256 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -118,6 +118,7 @@ struct r600_screen {
 
 	unsigned			num_contexts;
 	unsigned			use_surface;
+	unsigned			hyperz_enabled;
 
 	/* for thread-safe write accessing to num_contexts */
 	pipe_mutex			mutex_num_contexts;
@@ -150,8 +151,6 @@ struct r600_pipe_blend {
 struct r600_pipe_dsa {
 	struct r600_pipe_state		rstate;
 	unsigned			alpha_ref;
-	unsigned			db_render_override;
-	unsigned			db_render_control;
 	ubyte				valuemask[2];
 	ubyte				writemask[2];
 };
@@ -237,6 +236,8 @@ struct r600_context {
 	struct r600_vertex_element	*vertex_elements;
 	struct r600_pipe_resource_state	fs_resource[PIPE_MAX_ATTRIBS];
 	struct pipe_framebuffer_state	framebuffer;
+	unsigned			db_render_override;
+	unsigned			db_render_control;
 	unsigned			cb_target_mask;
 	unsigned			cb_color_control;
 	unsigned			pa_sc_line_stipple;
@@ -280,6 +281,7 @@ struct r600_context {
 	struct list_head		dirty_states;
 	struct r600_atom_surface_sync	atom_surface_sync;
 	struct r600_atom		atom_r6xx_flush_and_inv;
+	struct r600_atom		atom_hyperz_flush;
 
 	/* Below are variables from the old r600_context.
 	 */
@@ -497,4 +499,5 @@ static INLINE unsigned r600_pack_float_12p4(float x)
 	       x >= 4096 ? 0xffff : x * 16;
 }
 
+void rtexdump(struct r600_screen *rscreen, struct r600_resource_texture *rtex);
 #endif
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
index 8b90b12..c488a24 100644
--- a/src/gallium/drivers/r600/r600_resource.h
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -57,10 +57,15 @@ struct r600_resource_texture {
 	unsigned			tile_type;
 	unsigned			depth;
 	unsigned			dirty_db;
-	struct r600_resource_texture    *stencil; /* Stencil is in a separate buffer on Evergreen. */
+	struct r600_resource_texture	*stencil; /* Stencil is in a separate buffer on Evergreen. */
 	struct r600_resource_texture	*flushed_depth_texture;
 	boolean				is_flushing_texture;
 	struct radeon_surface		surface;
+	struct r600_resource_texture	*hyperz; /* hyperz */
+	uint32_t			htile_surface;
+	uint32_t			db_preload_control;
+	uint32_t			sclear_value;
+	float				dclear_value;
 };
 
 #define R600_TEX_IS_TILED(tex, level) ((tex)->array_mode[level] != V_038000_ARRAY_LINEAR_GENERAL && (tex)->array_mode[level] != V_038000_ARRAY_LINEAR_ALIGNED)
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 0d83fa6..044ea91 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -854,7 +854,6 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_dsa *dsa = CALLOC_STRUCT(r600_pipe_dsa);
 	unsigned db_depth_control, alpha_test_control, alpha_ref;
-	unsigned db_render_override, db_render_control;
 	struct r600_pipe_state *rstate;
 
 	if (dsa == NULL) {
@@ -900,28 +899,14 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 	}
 	dsa->alpha_ref = alpha_ref;
 
-	/* misc */
-	db_render_control = 0;
-	db_render_override = S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_DISABLE) |
-		S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) |
-		S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE);
-	/* TODO db_render_override depends on query */
-	r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, 0x00000000, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, 0x3F800000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286E0_SPI_FOG_FUNC_SCALE, 0x00000000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286E4_SPI_FOG_FUNC_BIAS, 0x00000000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_0286DC_SPI_FOG_CNTL, 0x00000000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_028D0C_DB_RENDER_CONTROL, db_render_control, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_028D10_DB_RENDER_OVERRIDE, db_render_override, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028D2C_DB_SRESULTS_COMPARE_STATE1, 0x00000000, NULL, 0);
-	r600_pipe_state_add_reg(rstate, R_028D30_DB_PRELOAD_CONTROL, 0x00000000, NULL, 0);
 	r600_pipe_state_add_reg(rstate, R_028D44_DB_ALPHA_TO_MASK, 0x0000AA00, NULL, 0);
 
-	dsa->db_render_override = db_render_override;
-	dsa->db_render_control = db_render_control;
-
 	return rstate;
 }
 
@@ -1677,6 +1662,7 @@ static void r600_db(struct r600_context *rctx, struct r600_pipe_state *rstate,
 	struct r600_resource_texture *rtex;
 	struct r600_surface *surf;
 	unsigned level, pitch, slice, format, offset, array_mode;
+	unsigned db_render_override, db_depth_info, db_render_control, db_preload_control;
 
 	if (state->zsbuf == NULL)
 		return;
@@ -1720,12 +1706,47 @@ static void r600_db(struct r600_context *rctx, struct r600_pipe_state *rstate,
 	}
 
 	format = r600_translate_dbformat(state->zsbuf->texture->format);
+	db_depth_info = S_028010_ARRAY_MODE(array_mode) | S_028010_FORMAT(format);
+	db_render_control = rctx->db_render_control;
+	db_render_override = S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_DISABLE) |
+		S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) |
+		S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE);
+	db_preload_control = 0;
+
+	if (rtex->hyperz) {
+		uint64_t htile_offset = rtex->hyperz->surface.level[0].offset;
+
+		r600_pipe_state_add_reg(rstate, R_028014_DB_HTILE_DATA_BASE,
+					htile_offset >> 8, &rtex->hyperz->resource,
+					RADEON_USAGE_READWRITE);
+		r600_pipe_state_add_reg(rstate, R_028D24_DB_HTILE_SURFACE,
+					rtex->htile_surface, NULL, 0);
+		db_depth_info |= S_028010_TILE_SURFACE_ENABLE(1);
+		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
+		db_render_override = S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_OFF) |
+				S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_OFF) |
+				S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_OFF);
+		db_preload_control = rtex->db_preload_control;
+
+		r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, rtex->sclear_value, NULL, 0);
+		r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, fui(rtex->dclear_value), NULL, 0);
+	}
+	r600_pipe_state_add_reg(rstate, R_028D0C_DB_RENDER_CONTROL, db_render_control, NULL, 0);
+	r600_pipe_state_add_reg(rstate, R_028D10_DB_RENDER_OVERRIDE, db_render_override, NULL, 0);
+	r600_pipe_state_add_reg(rstate, R_028D30_DB_PRELOAD_CONTROL, db_preload_control, NULL, 0);
+	rctx->db_render_override = db_render_override;
+	/* clear depth decompress bit, this is needed because we can't save
+	 * state prior to calling blit util function when doing depth decompress.
+	 * So don't remove.
+	 */
+	rctx->db_render_control = 0;
+
+	r600_pipe_state_add_reg(rstate, R_028010_DB_DEPTH_INFO,
+				db_depth_info,
+				&rtex->resource, RADEON_USAGE_READWRITE);
+	r600_pipe_state_add_reg(rstate, R_028D34_DB_PREFETCH_LIMIT,
+				(surf->aligned_height / 8) - 1, NULL, 0);
 
-	r600_pipe_state_add_reg(rstate, R_02800C_DB_DEPTH_BASE,
-				offset >> 8, &rtex->resource, RADEON_USAGE_READWRITE);
-	r600_pipe_state_add_reg(rstate, R_028000_DB_DEPTH_SIZE,
-				S_028000_PITCH_TILE_MAX(pitch) | S_028000_SLICE_TILE_MAX(slice),
-				NULL, 0);
 	if (!rscreen->use_surface) {
 		r600_pipe_state_add_reg(rstate, R_028004_DB_DEPTH_VIEW, 0x00000000, NULL, 0);
 	} else {
@@ -1734,11 +1755,11 @@ static void r600_db(struct r600_context *rctx, struct r600_pipe_state *rstate,
 					S_028004_SLICE_MAX(state->zsbuf->u.tex.last_layer),
 					NULL, 0);
 	}
-	r600_pipe_state_add_reg(rstate, R_028010_DB_DEPTH_INFO,
-				S_028010_ARRAY_MODE(array_mode) | S_028010_FORMAT(format),
-				&rtex->resource, RADEON_USAGE_READWRITE);
-	r600_pipe_state_add_reg(rstate, R_028D34_DB_PREFETCH_LIMIT,
-				(surf->aligned_height / 8) - 1, NULL, 0);
+	r600_pipe_state_add_reg(rstate, R_02800C_DB_DEPTH_BASE,
+				offset >> 8, &rtex->resource, RADEON_USAGE_READWRITE);
+	r600_pipe_state_add_reg(rstate, R_028000_DB_DEPTH_SIZE,
+				S_028000_PITCH_TILE_MAX(pitch) | S_028000_SLICE_TILE_MAX(slice),
+				NULL, 0);
 }
 
 static void r600_set_framebuffer_state(struct pipe_context *ctx,
@@ -2131,7 +2152,7 @@ void r600_init_config(struct r600_context *rctx)
 	if (rctx->chip_class >= R700) {
 		r600_pipe_state_add_reg(rstate, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0x00004000, NULL, 0);
 		r600_pipe_state_add_reg(rstate, R_009830_DB_DEBUG, 0x00000000, NULL, 0);
-		r600_pipe_state_add_reg(rstate, R_009838_DB_WATERMARKS, 0x00420204, NULL, 0);
+		r600_pipe_state_add_reg(rstate, R_009838_DB_WATERMARKS, 0x00f2ffff, NULL, 0);
 		r600_pipe_state_add_reg(rstate, R_0286C8_SPI_THREAD_GROUPING, 0x00000000, NULL, 0);
 	} else {
 		r600_pipe_state_add_reg(rstate, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0x00000000, NULL, 0);
@@ -2382,8 +2403,6 @@ void *r600_create_db_flush_dsa(struct r600_context *rctx)
 {
 	struct pipe_depth_stencil_alpha_state dsa;
 	struct r600_pipe_state *rstate;
-	struct r600_pipe_dsa *dsa_state;
-	unsigned db_render_control;
 	boolean quirk = false;
 
 	if (rctx->family == CHIP_RV610 || rctx->family == CHIP_RV630 ||
@@ -2403,17 +2422,6 @@ void *r600_create_db_flush_dsa(struct r600_context *rctx)
 	}
 
 	rstate = rctx->context.create_depth_stencil_alpha_state(&rctx->context, &dsa);
-	dsa_state = (struct r600_pipe_dsa*)rstate;
-
-	db_render_control =
-		S_028D0C_DEPTH_COPY_ENABLE(1) |
-		S_028D0C_STENCIL_COPY_ENABLE(1) |
-		S_028D0C_COPY_CENTROID(1);
-
-	r600_pipe_state_add_reg(rstate, R_028D0C_DB_RENDER_CONTROL, db_render_control, NULL, 0);
-
-	dsa_state->db_render_control = db_render_control;
-
 	return rstate;
 }
 
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 21e4bd1..61d1c12 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -55,6 +55,36 @@ static void r600_emit_r6xx_flush_and_inv(struct r600_context *rctx, struct r600_
 	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
 }
 
+static void r600_emit_hyperz_flush(struct r600_context *rctx, struct r600_atom *atom)
+{
+	struct radeon_winsys_cs *cs = rctx->cs;
+
+#if 0
+	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+	cs->buf[cs->cdw++] = EVENT_TYPE(22) | EVENT_INDEX(0);
+	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+	cs->buf[cs->cdw++] = EVENT_TYPE(16) | EVENT_INDEX(0);
+	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+	cs->buf[cs->cdw++] = EVENT_TYPE(42) | EVENT_INDEX(0);
+#endif
+	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+	cs->buf[cs->cdw++] = EVENT_TYPE(43) | EVENT_INDEX(0);
+#if 0
+	cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+//	cs->buf[cs->cdw++] = S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);  /* CP_COHER_CNTL */
+	cs->buf[cs->cdw++] = 0xFFFFFFFF;
+	cs->buf[cs->cdw++] = 0xffffffff;      /* CP_COHER_SIZE */
+	cs->buf[cs->cdw++] = 0;               /* CP_COHER_BASE */
+	cs->buf[cs->cdw++] = 0x0000000A;      /* POLL_INTERVAL */
+#else
+	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 3, 0);
+	cs->buf[cs->cdw++] = 0;
+	cs->buf[cs->cdw++] = 0;
+	cs->buf[cs->cdw++] = 0;
+	cs->buf[cs->cdw++] = 0;
+#endif
+}
+
 static void r600_init_atom(struct r600_atom *atom,
 			   void (*emit)(struct r600_context *ctx, struct r600_atom *state),
 			   unsigned num_dw,
@@ -69,6 +99,7 @@ void r600_init_common_atoms(struct r600_context *rctx)
 {
 	r600_init_atom(&rctx->atom_surface_sync.atom,	r600_emit_surface_sync,		5, EMIT_EARLY);
 	r600_init_atom(&rctx->atom_r6xx_flush_and_inv,	r600_emit_r6xx_flush_and_inv,	2, EMIT_EARLY);
+	r600_init_atom(&rctx->atom_hyperz_flush,	r600_emit_hyperz_flush,		11, EMIT_EARLY);
 }
 
 unsigned r600_get_cb_flush_flags(struct r600_context *rctx)
@@ -736,7 +767,6 @@ static void r600_update_derived_state(struct r600_context *rctx)
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_pipe_dsa *dsa = (struct r600_pipe_dsa*)rctx->states[R600_PIPE_STATE_DSA];
 	struct pipe_draw_info info = *dinfo;
 	struct r600_draw rdraw = {};
 	struct pipe_index_buffer ib = {};
@@ -854,8 +884,8 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 
 	r600_context_pipe_state_set(rctx, &rctx->vgt);
 
-	rdraw.db_render_override = dsa->db_render_override;
-	rdraw.db_render_control = dsa->db_render_control;
+	rdraw.db_render_override = rctx->db_render_override;
+	rdraw.db_render_control = rctx->db_render_control;
 
 	/* Emit states. */
 	r600_need_cs_space(rctx, 0, TRUE);
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index 4e2e600..6beefe7 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -413,6 +413,64 @@ static void r600_setup_miptree(struct pipe_screen *screen,
 	rtex->size = offset;
 }
 
+static uint32_t r600_htile_settings(struct r600_screen *rscreen, struct r600_resource_texture *zbuf)
+{
+	unsigned num_tile_pipes;
+	unsigned tile_pipes_per_DB;
+	unsigned max_pixels_per_DB;
+	unsigned width_per_DB;
+	const unsigned k = 1024;
+	uint32_t htile_settings = 0;
+
+	num_tile_pipes = rscreen->info.r600_num_tile_pipes;
+	tile_pipes_per_DB =  num_tile_pipes / rscreen->info.r600_num_backends;
+	max_pixels_per_DB = (zbuf->surface.npix_x * zbuf->surface.npix_y * tile_pipes_per_DB) / num_tile_pipes;
+
+	/* eg is always 8x8 */
+	if (rscreen->family >= CHIP_CEDAR)
+		htile_settings |= S_028D24_HTILE_WIDTH(1) | S_028D24_HTILE_HEIGHT(1);
+	htile_settings |= S_028D24_PRELOAD(1);
+
+	if (max_pixels_per_DB <= 64 * k) {
+		htile_settings |= S_028D24_LINEAR(1);
+	} else if (max_pixels_per_DB <= 128 * k) {
+		htile_settings |= S_028D24_FULL_CACHE(1) | S_028D24_LINEAR(1);
+	} else if (max_pixels_per_DB <= 256 * k) {
+		htile_settings |= S_028D24_HTILE_WIDTH(1) |
+				S_028D24_FULL_CACHE(1) |
+				S_028D24_LINEAR(1);
+	} else if (max_pixels_per_DB <= 512 * k) {
+		htile_settings |= S_028D24_HTILE_WIDTH(1) |
+				S_028D24_HTILE_HEIGHT(1) |
+				S_028D24_FULL_CACHE(1) |
+				S_028D24_LINEAR(1);
+	} else {
+		width_per_DB = (zbuf->surface.npix_x * tile_pipes_per_DB) / num_tile_pipes;
+		if (width_per_DB <= 512) {
+			htile_settings |= S_028D24_HTILE_WIDTH(1) |
+					S_028D24_HTILE_HEIGHT(1) |
+					S_028D24_FULL_CACHE(1) |
+					S_028D24_PREFETCH_WIDTH(16) |
+					S_028D24_PREFETCH_HEIGHT(4) |
+					S_028D24_HTILE_USES_PRELOAD_WIN(1);
+		} else if (width_per_DB <= 1024) {
+				htile_settings |= S_028D24_HTILE_WIDTH(1) |
+						S_028D24_HTILE_HEIGHT(1) |
+						S_028D24_FULL_CACHE(1) |
+						S_028D24_PREFETCH_WIDTH(16) |
+						S_028D24_PREFETCH_HEIGHT(2) |
+						S_028D24_HTILE_USES_PRELOAD_WIN(1);
+		} else {
+			htile_settings |= S_028D24_HTILE_WIDTH(1) |
+					S_028D24_HTILE_HEIGHT(1) |
+					S_028D24_FULL_CACHE(1) |
+					S_028D24_PREFETCH_WIDTH(16) |
+					S_028D24_HTILE_USES_PRELOAD_WIN(1);
+		}
+	}
+	return htile_settings;
+}
+
 /* Figure out whether u_blitter will fallback to a transfer operation.
  * If so, don't use a staging resource.
  */
@@ -495,6 +553,7 @@ static const struct u_resource_vtbl r600_texture_vtbl =
 };
 
 DEBUG_GET_ONCE_BOOL_OPTION(use_surface, "R600_SURF", TRUE);
+DEBUG_GET_ONCE_BOOL_OPTION(hyperz_enabled, "R600_HYPERZ", TRUE);
 
 static struct r600_resource_texture *
 r600_texture_create_object(struct pipe_screen *screen,
@@ -514,8 +573,13 @@ r600_texture_create_object(struct pipe_screen *screen,
 	/* FIXME ugly temporary hack to allow to switch btw current code
 	 * and common surface allocator code
 	 */
+	if (debug_get_option_hyperz_enabled()) {
+		rscreen->hyperz_enabled = 1;
+	}
 	if (debug_get_option_use_surface()) {
 		rscreen->use_surface = 1;
+	} else {
+		rscreen->hyperz_enabled = 0;
 	}
 
 	rtex = CALLOC_STRUCT(r600_resource_texture);
@@ -574,6 +638,76 @@ r600_texture_create_object(struct pipe_screen *screen,
 		/* Proceed in creating the depth buffer. */
 	}
 
+	rtex->hyperz = NULL;
+	if (!(base->flags & R600_RESOURCE_FLAG_TRANSFER) &&
+	    util_format_is_depth_or_stencil(base->format) &&
+	    rscreen->use_surface &&
+	    rscreen->hyperz_enabled &&
+	    rscreen->info.drm_minor >= 14) {
+		struct pipe_resource hyperz;
+		struct radeon_surface hsurface;
+		unsigned max_x, max_y, psize, htile_max;
+
+		/* Allocate the hyperz buffer. */
+		hyperz = *base;
+		hyperz.format = PIPE_FORMAT_A8R8G8B8_UNORM;
+		hsurface = *surface;
+		/* FIXME compute htile width & height depending on buffer size
+		 * and number of pipes
+		 */
+		hsurface.npix_x = hsurface.npix_x * hsurface.blk_w;
+		hsurface.npix_y = hsurface.npix_y * hsurface.blk_h;
+		hsurface.npix_x = align(hsurface.npix_x, 8);
+		hsurface.npix_y = align(hsurface.npix_y, 8);
+		hsurface.blk_w = 4;
+		hsurface.blk_h = 4;
+		hsurface.bpe = 4;
+		hsurface.flags = RADEON_SURF_CLR(hsurface.flags, MODE);
+		rtex->htile_surface = r600_htile_settings(rscreen, rtex);
+		if (G_028D24_HTILE_WIDTH(rtex->htile_surface)) {
+			hsurface.blk_w = 8;
+		}
+		if (G_028D24_HTILE_HEIGHT(rtex->htile_surface)) {
+			hsurface.blk_h = 8;
+		}
+		hyperz.width0 = hsurface.npix_x / hsurface.blk_w;
+		hyperz.height0 = hsurface.npix_y / hsurface.blk_h;
+		/* force small surface to different size so we don't get
+		 * pitch issue in htile clear path
+		 */
+		if (hyperz.width0 < 128) {
+			htile_max = MAX2(hyperz.width0 * hyperz.height0, 128);
+			hyperz.width0 = 128;
+			hyperz.height0 = htile_max / 128;
+			hsurface.npix_x = hyperz.width0 * hsurface.blk_w;
+			hsurface.npix_y = hyperz.height0 * hsurface.blk_h;
+		}
+		hyperz.last_level = 0;
+		hyperz.nr_samples = 1;
+		hyperz.bind = PIPE_BIND_RENDER_TARGET;
+		hyperz.flags = 0;
+
+		psize = (rscreen->family >= CHIP_CEDAR) ? 64 : 32;
+		max_x = (hsurface.npix_x + hsurface.blk_w - 1) / hsurface.blk_w;
+		max_y = (hsurface.npix_y + hsurface.blk_h - 1) / hsurface.blk_h;
+		max_x = (max_x + psize - 1) / psize;
+		max_y = (max_y + psize - 1) / psize;
+		htile_max = 8192 / ((psize / hsurface.blk_w) * (psize / hsurface.blk_h));
+		while ((max_x * max_y) > htile_max) {
+			max_x -= 1;
+			max_y -= 1;
+		}
+		rtex->db_preload_control = S_028D30_MAX_X(max_x) | S_028D30_MAX_Y(max_y);
+
+		rtex->hyperz = r600_texture_create_object(screen, &hyperz, array_mode, 0,
+							  max_buffer_size, NULL, TRUE, &hsurface);
+		if (!rtex->hyperz) {
+			FREE(rtex);
+			return NULL;
+		}
+		/* Proceed in creating the depth buffer. */
+	}
+
 	/* only mark depth textures the HW can hit as depth textures */
 	if (util_format_is_depth_or_stencil(rtex->real_format) && permit_hardware_blit(screen, base))
 		rtex->depth = 1;
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 3c3238a..d917ccc 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -120,6 +120,7 @@
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
 #define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH	0x1f
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
+#define EVENT_TYPE_FLUSH_AND_INV_DB_META        0x2c    /* R7XX and newer only */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
                 /* 0 - any non-TS event
@@ -663,6 +664,7 @@
 #define   S_028004_SLICE_MAX(x)                        (((x) & 0x7FF) << 13)
 #define   G_028004_SLICE_MAX(x)                        (((x) >> 13) & 0x7FF)
 #define   C_028004_SLICE_MAX                           0xFF001FFF
+#define R_028014_DB_HTILE_DATA_BASE                  0x00028014
 #define R_028D24_DB_HTILE_SURFACE                    0x028D24
 #define   S_028D24_HTILE_WIDTH(x)                      (((x) & 0x1) << 0)
 #define   G_028D24_HTILE_WIDTH(x)                      (((x) >> 0) & 0x1)
@@ -2248,6 +2250,18 @@
 #define R_028D10_DB_RENDER_OVERRIDE                  0x028D10
 #define R_028D2C_DB_SRESULTS_COMPARE_STATE1          0x028D2C
 #define R_028D30_DB_PRELOAD_CONTROL                  0x028D30
+#define   S_028D30_START_X(x)                          (((x) & 0xFF) << 0)
+#define   G_028D30_START_X(x)                          (((x) >> 0) & 0xFF)
+#define   C_028D30_START_X                             0xFFFFFF00
+#define   S_028D30_START_Y(x)                          (((x) & 0xFF) << 8)
+#define   G_028D30_START_Y(x)                          (((x) >> 8) & 0xFF)
+#define   C_028D30_START_Y                             0xFFFF00FF
+#define   S_028D30_MAX_X(x)                            (((x) & 0xFF) << 16)
+#define   G_028D30_MAX_X(x)                            (((x) >> 16) & 0xFF)
+#define   C_028D30_MAX_X                               0xFF00FFFF
+#define   S_028D30_MAX_Y(x)                            (((x) & 0xFF) << 24)
+#define   G_028D30_MAX_Y(x)                            (((x) >> 24) & 0xFF)
+#define   C_028D30_MAX_Y                               0x00FFFFFF
 #define R_028D44_DB_ALPHA_TO_MASK                    0x028D44
 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
 #define R_0286CC_SPI_PS_IN_CONTROL_0                 0x0286CC
-- 
1.7.7.1

