perf: 优化 Anime4K_Denoise_Bilateral_Mode 的性能

This commit is contained in:
Xu Liu 2022-03-06 16:56:36 +08:00
commit 4708cc812e

View file

@ -1,3 +1,4 @@
// Anime4K_Denoise_Bilateral_Mode
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Denoise/Anime4K_Denoise_Bilateral_Mode.glsl
@ -21,9 +22,9 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!BLOCK_SIZE 16,16
//!NUM_THREADS 64,1,1
#define INTENSITY_SIGMA intensitySigma //Intensity window size, higher is stronger denoise, must be a positive real number
#define SPATIAL_SIGMA 1.0 //Spatial window size, higher is stronger denoise, must be a positive real number.
@ -46,49 +47,89 @@ float gaussian(float x, float s, float m) {
return exp(-0.5 * scaled * scaled);
}
float4 Pass1(float2 pos) {
float3 histogram_v[KERNELLEN];
float histogram_l[KERNELLEN];
float histogram_w[KERNELLEN];
float histogram_wn[KERNELLEN];
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
return;
}
float vc = get_luma(INPUT.SampleLevel(sam, pos, 0).rgb);
float is = pow(vc + 0.0001, INTENSITY_POWER_CURVE) * INTENSITY_SIGMA;
float ss = SPATIAL_SIGMA;
uint i;
float2 inputPt = GetInputPt();
uint i, j, k, m;
float4 src[KERNELSIZE + 1][KERNELSIZE + 1];
[unroll]
for (i = 0; i < KERNELLEN; i++) {
float2 ipos = GETOFFSET(i);
histogram_v[i] = INPUT.SampleLevel(sam, pos + ipos * inputPt, 0).rgb;
histogram_l[i] = get_luma(histogram_v[i]);
histogram_w[i] = gaussian(histogram_l[i], is, vc) * gaussian(length(ipos), ss, 0.0);
histogram_wn[i] = 0.0;
}
for (i = 0; i <= KERNELSIZE - 1; i += 2) {
[unroll]
for (j = 0; j <= KERNELSIZE - 1; j += 2) {
float2 tpos = (gxy + int2(i, j) - KERNELHALFSIZE + 1) * inputPt;
const float4 sr = INPUT.GatherRed(sam, tpos);
const float4 sg = INPUT.GatherGreen(sam, tpos);
const float4 sb = INPUT.GatherBlue(sam, tpos);
[unroll]
for (i = 0; i < KERNELLEN; i++) {
histogram_wn[i] += gaussian(0.0, HISTOGRAM_REGULARIZATION, 0.0) * histogram_w[i];
for (uint j = (i + 1); j < KERNELLEN; j++) {
float d = gaussian(histogram_l[j], HISTOGRAM_REGULARIZATION, histogram_l[i]);
histogram_wn[j] += d * histogram_w[i];
histogram_wn[i] += d * histogram_w[j];
// w z
// x y
src[i][j] = float4(sr.w, sg.w, sb.w, get_luma(float3(sr.w, sg.w, sb.w)));
src[i][j + 1] = float4(sr.x, sg.x, sb.x, get_luma(float3(sr.x, sg.x, sb.x)));
src[i + 1][j] = float4(sr.z, sg.z, sb.z, get_luma(float3(sr.z, sg.z, sb.z)));
src[i + 1][j + 1] = float4(sr.y, sg.y, sb.y, get_luma(float3(sr.y, sg.y, sb.y)));
}
}
float3 maxv = 0;
float maxw = 0;
[unroll]
for (i = 0; i < KERNELLEN; ++i) {
if (histogram_wn[i] >= maxw) {
maxw = histogram_wn[i];
maxv = histogram_v[i];
for (i = 0; i <= 1; ++i) {
[unroll]
for (j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);
if (i != 0 && j != 0) {
if (!CheckViewport(gxy)) {
continue;
}
}
float3 histogram_v[KERNELLEN];
float histogram_l[KERNELLEN];
float histogram_w[KERNELLEN];
float histogram_wn[KERNELLEN];
float vc = src[KERNELHALFSIZE + i][KERNELHALFSIZE + j].a;
float is = pow(vc + 0.0001, INTENSITY_POWER_CURVE) * INTENSITY_SIGMA;
float ss = SPATIAL_SIGMA;
[unroll]
for (k = 0; k < KERNELLEN; k++) {
const int2 ipos = GETOFFSET(k);
const uint2 idx = uint2(i, j) + ipos.yx + KERNELHALFSIZE;
histogram_v[k] = src[idx.x][idx.y].rgb;
histogram_l[k] = src[idx.x][idx.y].a;
histogram_w[k] = gaussian(histogram_l[k], is, vc) * gaussian(length(ipos), ss, 0.0);
histogram_wn[k] = 0.0;
}
[unroll]
for (k = 0; k < KERNELLEN; k++) {
histogram_wn[k] += gaussian(0.0, HISTOGRAM_REGULARIZATION, 0.0) * histogram_w[k];
[unroll]
for (uint m = (k + 1); m < KERNELLEN; m++) {
float d = gaussian(histogram_l[m], HISTOGRAM_REGULARIZATION, histogram_l[k]);
histogram_wn[m] += d * histogram_w[k];
histogram_wn[k] += d * histogram_w[m];
}
}
float3 maxv = 0;
float maxw = 0;
[unroll]
for (k = 0; k < KERNELLEN; ++k) {
if (histogram_wn[k] >= maxw) {
maxw = histogram_wn[k];
maxv = histogram_v[k];
}
}
WriteToOutput(destPos, maxv);
}
}
return float4(maxv, 1);
}