perf: 优化 Anime4K_Denoise_Bilateral_Mode 的性能

2026-06-24 02:04:10 +00:00 · 2022-03-06 16:56:36 +08:00 · 2022-03-06 16:56:36 +08:00 · 4708cc812e
commit 4708cc812e
parent 88383e0e6a
1 changed files with 78 additions and 37 deletions
--- a/Effects/Anime4K_Denoise_Bilateral_Mode.hlsl
+++ b/Effects/Anime4K_Denoise_Bilateral_Mode.hlsl
@ -1,3 +1,4 @@
+// Anime4K_Denoise_Bilateral_Mode
 // 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Denoise/Anime4K_Denoise_Bilateral_Mode.glsl


@ -21,9 +22,9 @@ SamplerState sam;


 //!PASS 1
-//!STYLE PS
 //!IN INPUT
-
+//!BLOCK_SIZE 16,16
+//!NUM_THREADS 64,1,1

 #define INTENSITY_SIGMA intensitySigma //Intensity window size, higher is stronger denoise, must be a positive real number
 #define SPATIAL_SIGMA 1.0 //Spatial window size, higher is stronger denoise, must be a positive real number.
@ -46,49 +47,89 @@ float gaussian(float x, float s, float m) {
 	return exp(-0.5 * scaled * scaled);
 }

-float4 Pass1(float2 pos) {
-	float3 histogram_v[KERNELLEN];
-	float histogram_l[KERNELLEN];
-	float histogram_w[KERNELLEN];
-	float histogram_wn[KERNELLEN];
+void Pass1(uint2 blockStart, uint3 threadId) { // Each invocation produces the 2x2 output quad whose top-left pixel is gxy.
+	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart; // top-left pixel of this invocation's 2x2 quad
+	if (!CheckViewport(gxy)) {
+		return;
+	}

-	float vc = get_luma(INPUT.SampleLevel(sam, pos, 0).rgb);
-
-	float is = pow(vc + 0.0001, INTENSITY_POWER_CURVE) * INTENSITY_SIGMA;
-	float ss = SPATIAL_SIGMA;
-
-	uint i;
 	float2 inputPt = GetInputPt();
+	uint i, j, k, m;

+	float4 src[KERNELSIZE + 1][KERNELSIZE + 1]; // shared neighborhood for all four quad pixels: rgb = color, a = luma
 	[unroll]
-	for (i = 0; i < KERNELLEN; i++) {
-		float2 ipos = GETOFFSET(i);
-		histogram_v[i] = INPUT.SampleLevel(sam, pos + ipos * inputPt, 0).rgb;
-		histogram_l[i] = get_luma(histogram_v[i]);
-		histogram_w[i] = gaussian(histogram_l[i], is, vc) * gaussian(length(ipos), ss, 0.0);
-		histogram_wn[i] = 0.0;
-	}
+	for (i = 0; i <= KERNELSIZE - 1; i += 2) {
+		[unroll]
+		for (j = 0; j <= KERNELSIZE - 1; j += 2) {
+			float2 tpos = (gxy + int2(i, j) - KERNELHALFSIZE + 1) * inputPt;
+			const float4 sr = INPUT.GatherRed(sam, tpos);
+			const float4 sg = INPUT.GatherGreen(sam, tpos);
+			const float4 sb = INPUT.GatherBlue(sam, tpos);

-	[unroll]
-	for (i = 0; i < KERNELLEN; i++) {
-		histogram_wn[i] += gaussian(0.0, HISTOGRAM_REGULARIZATION, 0.0) * histogram_w[i];
-		for (uint j = (i + 1); j < KERNELLEN; j++) {
-			float d = gaussian(histogram_l[j], HISTOGRAM_REGULARIZATION, histogram_l[i]);
-			histogram_wn[j] += d * histogram_w[i];
-			histogram_wn[i] += d * histogram_w[j];
+			// Component layout of each gathered 2x2 footprint:  w z
+			//                                                   x y
+			src[i][j] = float4(sr.w, sg.w, sb.w, get_luma(float3(sr.w, sg.w, sb.w)));
+			src[i][j + 1] = float4(sr.x, sg.x, sb.x, get_luma(float3(sr.x, sg.x, sb.x)));
+			src[i + 1][j] = float4(sr.z, sg.z, sb.z, get_luma(float3(sr.z, sg.z, sb.z)));
+			src[i + 1][j + 1] = float4(sr.y, sg.y, sb.y, get_luma(float3(sr.y, sg.y, sb.y)));
 		}
 	}

-	float3 maxv = 0;
-	float maxw = 0;
-
 	[unroll]
-	for (i = 0; i < KERNELLEN; ++i) {
-		if (histogram_wn[i] >= maxw) {
-			maxw = histogram_wn[i];
-			maxv = histogram_v[i];
+	for (i = 0; i <= 1; ++i) {
+		[unroll]
+		for (j = 0; j <= 1; ++j) {
+			const uint2 destPos = gxy + uint2(i, j);
+			// gxy itself was validated at the top; each of the other three quad pixels needs its own viewport test.
+			if (i != 0 || j != 0) {
+				if (!CheckViewport(destPos)) {
+					continue;
+				}
+			}
+
+			float3 histogram_v[KERNELLEN];
+			float histogram_l[KERNELLEN];
+			float histogram_w[KERNELLEN];
+			float histogram_wn[KERNELLEN];
+
+			float vc = src[KERNELHALFSIZE + i][KERNELHALFSIZE + j].a; // luma of the center texel for this output pixel
+
+			float is = pow(vc + 0.0001, INTENSITY_POWER_CURVE) * INTENSITY_SIGMA;
+			float ss = SPATIAL_SIGMA;
+
+			[unroll]
+			for (k = 0; k < KERNELLEN; k++) {
+				const int2 ipos = GETOFFSET(k);
+				const uint2 idx = uint2(i, j) + ipos.yx + KERNELHALFSIZE;
+				histogram_v[k] = src[idx.x][idx.y].rgb;
+				histogram_l[k] = src[idx.x][idx.y].a;
+				histogram_w[k] = gaussian(histogram_l[k], is, vc) * gaussian(length(ipos), ss, 0.0);
+				histogram_wn[k] = 0.0;
+			}
+			// Symmetric pairwise pass: every sample adds its weight to each sample of similar luma.
+			[unroll]
+			for (k = 0; k < KERNELLEN; k++) {
+				histogram_wn[k] += gaussian(0.0, HISTOGRAM_REGULARIZATION, 0.0) * histogram_w[k];
+				[unroll]
+				for (m = (k + 1); m < KERNELLEN; m++) { // reuse the m declared above instead of shadowing it
+					float d = gaussian(histogram_l[m], HISTOGRAM_REGULARIZATION, histogram_l[k]);
+					histogram_wn[m] += d * histogram_w[k];
+					histogram_wn[k] += d * histogram_w[m];
+				}
+			}
+
+			float3 maxv = 0;
+			float maxw = 0;
+			// Output the color with the largest accumulated weight; on ties the later sample wins (>=).
+			[unroll]
+			for (k = 0; k < KERNELLEN; ++k) {
+				if (histogram_wn[k] >= maxw) {
+					maxw = histogram_wn[k];
+					maxv = histogram_v[k];
+				}
+			}
+
+			WriteToOutput(destPos, maxv);
 		}
 	}
-
-	return float4(maxv, 1);
 }