Magpie/Effects/NIS.hlsl

// Ported from https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h

//!MAGPIE EFFECT
//!VERSION 2


// Sharpening amount. The directives below declare a default of 0.5 and a
// valid range of [0, 1]; the sharpening macros derive their values from it.
//!PARAMETER
//!DEFAULT 0.5
//!MIN 0
//!MAX 1
float sharpness;

// Image that the pass reads its source pixels from.
//!TEXTURE
Texture2D INPUT;

// Scaler filter coefficients, loaded from NIS_Coef_Scale.dds.
// LoadFilterBanksSh reads texels (0..1, 0..kPhaseCount-1) from it.
//!TEXTURE
//!SOURCE NIS_Coef_Scale.dds
Texture2D coef_scaler;

// Unsharp-mask (USM) filter coefficients, loaded from NIS_Coef_USM.dds.
// Read with the same texel layout as coef_scaler.
//!TEXTURE
//!SOURCE NIS_Coef_USM.dds
Texture2D coef_usm;

// Sampler used for the luma gathers and the final colour tap in Pass1.
// Declared with linear filtering; the name suggests clamp addressing
// (NOTE(review): addressing mode is not set here - confirm the default).
//!SAMPLER
//!FILTER LINEAR
SamplerState samplerLinearClamp;


//!PASS 1
//!IN INPUT, coef_scaler, coef_usm
//!BLOCK_SIZE 32,24
//!NUM_THREADS 256,1,1


// All tuning values below are textual macros. Those that mention
// `sharpness` (directly or through sharpen_slider) expand to expressions that
// are evaluated where they are used, and macros may refer to macros that are
// defined further down because expansion happens at the point of use.

// Edge detection (GetEdgeMap): the stronger gradient of a direction pair must
// exceed kDetectRatio times the weaker one and also exceed kDetectThres.
#define kDetectRatio (2 * 1127.f / 1024.f)
#define kDetectThres (64.0f / 1024.0f)
// Added to the denominator of the contrast ratio in CalcLTI so it is never zero.
#define kEps (1.0f / 255.0f)
// CalcLTI maps a contrast ratio of kMinContrastRatio..kMaxContrastRatio onto
// an attenuation factor of 1..0; kRatioNorm is the reciprocal of that span.
#define kMinContrastRatio 2.0f
#define kMaxContrastRatio 10.0f
#define kRatioNorm (1.0f / (kMaxContrastRatio - kMinContrastRatio))
// Final multiplier applied to the CalcLTI result.
#define kContrastBoost 1.0f
// Luma ramp used by EvalPoly6: the ramp value is 1 at or below kSharpStartY
// and falls linearly to 0 at kSharpEndY.
#define kSharpStartY 0.45f
#define kSharpEndY 0.9f
#define kSharpScaleY (1.0f / (kSharpEndY - kSharpStartY))
// Span between the minimum and maximum sharpening strength (defined below).
#define kSharpStrengthScale (kSharpStrengthMax - kSharpStrengthMin)
// Re-centres the [0, 1] sharpness parameter to [-0.5, 0.5].
#define sharpen_slider (sharpness - 0.5f)
// Slopes applied to sharpen_slider; they differ for the lower half
// (sharpen_slider < 0) and the upper half (sharpen_slider >= 0) of the range.
#define MinScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.0f)
#define MaxScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.75f)
// USM strength at ramp value 0 (Min, clamped to >= 0) and ramp value 1 (Max).
#define kSharpStrengthMin (max(0.0f, 0.4f + sharpen_slider * MinScale * 1.2f))
#define kSharpStrengthMax (1.6f + sharpen_slider * MaxScale * 1.8f)
#define LimitScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.0f)
// Bounds of the factor that, multiplied by the scaled luma, limits the
// magnitude of the USM term in EvalPoly6 (Min is clamped to >= 0.1).
#define kSharpLimitMin (max(0.1f, 0.14f + sharpen_slider * LimitScale * 0.32f))
#define kSharpLimitMax (0.5f + sharpen_slider * LimitScale * 0.6f)
#define kSharpLimitScale (kSharpLimitMax - kSharpLimitMin)

// Unit scale factors applied to the luma value and to the edge weights
// (both are 1 in this file).
#define NIS_SCALE_FLOAT 1.0f
#define NIS_SCALE_INT 1

// Size of the destination block handled by one thread group and the number
// of threads in the group. These mirror the values given in the
// //!BLOCK_SIZE 32,24 and //!NUM_THREADS 256,1,1 directives of this pass.
#define NIS_BLOCK_WIDTH 32
#define NIS_BLOCK_HEIGHT 24
#define NIS_THREAD_GROUP_SIZE 256
// Number of filter phases (rows of the coefficient tables) and taps per phase.
#define kPhaseCount  64
#define kFilterSize  6
// Width of the square luma neighbourhood read per output pixel (6x6).
#define kSupportSize 6
#define kPadSize     kSupportSize
// 'Tile' is the region of source luminance values that we load into shPixelsY.
// It is the area of source pixels covered by the destination 'Block' plus a
// 3 pixel border of support pixels.
#define kTilePitch              (NIS_BLOCK_WIDTH + kPadSize)
#define kTileSize               (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
// 'EdgeMap' is the region of source pixels for which edge map vectors are derived.
// It is the area of source pixels covered by the destination 'Block' plus a
// 1 pixel border.
#define kEdgeMapPitch           (NIS_BLOCK_WIDTH + 2)
#define kEdgeMapSize            (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))

// Source luma tile, row-major with a row stride of kTilePitch.
groupshared float shPixelsY[kTileSize];
// Copies of the two coefficient textures, indexed as [phase][tap].
groupshared float shCoefScaler[kPhaseCount][kFilterSize];
groupshared float shCoefUSM[kPhaseCount][kFilterSize];
// Per-source-pixel direction weights (0, 90, 45, 135 degrees), row-major with
// a row stride of kEdgeMapPitch.
groupshared float4 shEdgeMap[kEdgeMapSize];

// Returns the luma of an RGB colour: a weighted sum of the three channels
// using the Rec. 709 coefficients (0.2126, 0.7152, 0.0722).
float getY(float3 rgba) {
	const float3 weighted = rgba * float3(0.2126f, 0.7152f, 0.0722f);
	return weighted.x + weighted.y + weighted.z;
}

// Computes the directional filter weights for one source pixel.
//
// p    : 4x4 window of luma values
// i, j : row / column offset of the 3x3 sub-window inside p that is examined
// Returns (weight_0, weight_90, weight_45, weight_135). At most one weight of
// the 0/90 pair and one of the 45/135 pair is non-zero; all four are zero when
// the window is flat or no direction passes the detection tests.
float4 GetEdgeMap(float p[4][4], int i, int j) {
	// The eight border samples of the 3x3 sub-window (the centre is not used).
	const float tl = p[i][j];
	const float tc = p[i][j + 1];
	const float tr = p[i][j + 2];
	const float ml = p[i + 1][j];
	const float mr = p[i + 1][j + 2];
	const float bl = p[i + 2][j];
	const float bc = p[i + 2][j + 1];
	const float br = p[i + 2][j + 2];

	// Absolute luma differences across the four tested orientations.
	const float g_0 = abs(tl + tc + tr - bl - bc - br);
	const float g_45 = abs(ml + tl + tc - bc - br - mr);
	const float g_90 = abs(tl + ml + bl - tr - mr - br);
	const float g_135 = abs(ml + bl + bc - tc - tr - mr);

	const float axisMax = max(g_0, g_90);
	const float axisMin = min(g_0, g_90);
	const float diagMax = max(g_45, g_135);
	const float diagMin = min(g_45, g_135);

	// Flat neighbourhood: nothing to detect (also avoids dividing by zero).
	const float total = axisMax + diagMax;
	if (total == 0) {
		return float4(0, 0, 0, 0);
	}

	// Relative share of each pair, used only when both pairs report an edge.
	const float axisShare = min(axisMax / total, 1.0f);
	const float diagShare = 1.0f - axisShare;

	// A pair reports an edge when its stronger gradient clearly dominates its
	// weaker one, exceeds the absolute threshold, and exceeds the weaker
	// gradient of the other pair.
	const bool axisEdge = (axisMax > (axisMin * kDetectRatio)) && (axisMax > kDetectThres) && (axisMax > diagMin);
	const bool diagEdge = (diagMax > (diagMin * kDetectRatio)) && (diagMax > kDetectThres) && (diagMax > axisMin);

	float axisWeight = 1.0f;
	float diagWeight = 1.0f;
	if (axisEdge && diagEdge) {
		axisWeight = axisShare;
		diagWeight = diagShare;
	}

	float4 weights = float4(0, 0, 0, 0);
	if (axisEdge) {
		if (axisMax == g_0) {
			weights.x = axisWeight;
		} else {
			weights.y = axisWeight;
		}
	}
	if (diagEdge) {
		if (diagMax == g_45) {
			weights.z = diagWeight;
		} else {
			weights.w = diagWeight;
		}
	}
	return weights;
}

// Copies both coefficient textures into group-shared memory.
//
// i0 : first work item handled by the calling thread
// di : distance between consecutive work items of the same thread
//
// There are kPhaseCount * 2 work items. Item n reads texel (n & 1, n >> 1):
// the texel in column 0 supplies taps 0..3 of a phase, the texel in column 1
// supplies taps 4 and 5 in its x and y components.
void LoadFilterBanksSh(int i0, int di) {
	for (int item = i0; item < 2 * kPhaseCount; item += di) {
		const int row = item >> 1;
		const int column = item & 1;
		const int2 texel = int2(column, row);

		const float4 scalerTaps = float4(coef_scaler[texel]);
		const float4 usmTaps = float4(coef_usm[texel]);

		if (column == 0) {
			shCoefScaler[row][0] = scalerTaps.x;
			shCoefScaler[row][1] = scalerTaps.y;
			shCoefScaler[row][2] = scalerTaps.z;
			shCoefScaler[row][3] = scalerTaps.w;

			shCoefUSM[row][0] = usmTaps.x;
			shCoefUSM[row][1] = usmTaps.y;
			shCoefUSM[row][2] = usmTaps.z;
			shCoefUSM[row][3] = usmTaps.w;
		} else {
			shCoefScaler[row][4] = scalerTaps.x;
			shCoefScaler[row][5] = scalerTaps.y;

			shCoefUSM[row][4] = usmTaps.x;
			shCoefUSM[row][5] = usmTaps.y;
		}
	}
}


// Returns an attenuation factor for the sharpening term of a 6-sample line
// (the call site labels it "reduce ringing").
//
// The samples are split into two groups of three: {p1, p2, extra} and
// {p3, p4, extra}, where the extra samples are (p0, p2) for phases up to
// kPhaseCount / 2 and (p3, p5) otherwise. The contrast (max - min) of each
// group is computed; the larger the ratio between the two contrasts, the
// smaller the returned factor.
float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index) {
	float extraA;
	float extraB;
	if (phase_index <= kPhaseCount / 2) {
		extraA = p0;
		extraB = p2;
	} else {
		extraA = p3;
		extraB = p5;
	}

	const float contrastA = max(max(p1, p2), extraA) - min(min(p1, p2), extraA);
	const float contrastB = max(max(p3, p4), extraB) - min(min(p3, p4), extraB);

	// kEps keeps the denominator non-zero when one group is perfectly flat.
	const float ratio = max(contrastA, contrastB) / (min(contrastA, contrastB) + kEps);
	const float attenuation = saturate((ratio - kMinContrastRatio) * kRatioNorm);
	return (1.0f - attenuation) * kContrastBoost;
}

// Bilinearly interpolates a 2x2 block of edge-weight vectors: first along the
// second index using phase_frac_x, then along the first index using
// phase_frac_y.
float4 GetInterpEdgeMap(const float4 edge[2][2], float phase_frac_x, float phase_frac_y) {
	return lerp(
		lerp(edge[0][0], edge[0][1], phase_frac_x),
		lerp(edge[1][0], edge[1][1], phase_frac_x),
		phase_frac_y);
}

// Filters six luma samples with the scaler and USM coefficients of one phase
// and returns the scaled luma plus a limited sharpening term.
//
// pxl       : the six samples along the filter direction
// phase_int : row of shCoefScaler / shCoefUSM to use
float EvalPoly6(const float pxl[6], int phase_int) {
	float scaled = 0.f;
	float usm = 0.f;
	[unroll]
	for (int tap = 0; tap < 6; ++tap) {
		scaled += shCoefScaler[phase_int][tap] * pxl[tap];
		usm += shCoefUSM[phase_int][tap] * pxl[tap];
	}

	// Piece-wise linear ramp over luma: 1 for dark values, 0 for bright ones.
	const float lumaRamp = 1.0f - saturate((scaled * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY);

	// Sharpening strength and the allowed USM magnitude both follow the ramp;
	// the limit is additionally proportional to the scaled luma itself.
	const float strength = lumaRamp * kSharpStrengthScale + kSharpStrengthMin;
	const float limit = (lumaRamp * kSharpLimitScale + kSharpLimitMin) * scaled;

	const float limited = min(limit, max(-limit, usm * strength));

	// Attenuate the sharpening term where the contrast is one-sided
	// (reduces ringing).
	const float attenuated = limited * CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);

	return scaled + attenuated;
}

// Non-directional separable 6x6 filter. Each column of p is reduced with the
// scaler coefficients of the y phase, and the six column results are then
// combined with the scaler coefficients of the x phase.
// The result is not yet weighted; the caller multiplies it by the base weight.
float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int) {
	float total = 0.0f;
	[unroll]
	for (int col = 0; col < 6; ++col) {
		float columnSum = 0.0f;
		[unroll]
		for (int row = 0; row < 6; ++row) {
			columnSum += p[row][col] * shCoefScaler[phase_y_frac_int][row];
		}
		total += columnSum * shCoefScaler[phase_x_frac_int][col];
	}
	return total;
}

// Evaluates the directional filters and returns their weighted sum.
//
// p                 : 6x6 luma neighbourhood, indexed [row][column]
// phase_x_frac      : fractional source position along x
// phase_y_frac      : fractional source position along y
// phase_x_frac_int  : phase_x_frac quantised to a coefficient-table row
// phase_y_frac_int  : phase_y_frac quantised to a coefficient-table row
// w                 : direction weights (x = 0 deg, y = 90 deg, z = 45 deg,
//                     w = 135 deg); directions with a weight <= 0 are skipped
//
// For every active direction six samples are interpolated along a line through
// the neighbourhood and passed to EvalPoly6; the result is multiplied by the
// direction's weight and accumulated.
float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, float4 w) {
	float f = 0;
	if (w.x > 0.0f) {
		// 0 deg filter: interpolate between columns 2 and 3 in every row, then
		// filter the six rows with the y phase.
		float interp0Deg[6];
		{
			[unroll]
			for (int i = 0; i < 6; ++i) {
				interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
			}
		}
		f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
	}
	if (w.y > 0.0f) {
		// 90 deg filter: interpolate between rows 2 and 3 in every column, then
		// filter the six columns with the x phase.
		float interp90Deg[6];
		{
			[unroll]
			for (int i = 0; i < 6; ++i) {
				interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
			}
		}

		f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
	}
	if (w.z > 0.0f) {
		// 45 deg filter
		// Position across the diagonal, in [0, 1].
		float pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);

		// Seven candidate samples along the diagonal. Odd entries lie between
		// two anti-diagonal neighbours; even entries start from a sample on the
		// main diagonal and lean towards the neighbour on the side the position
		// falls on.
		float temp_interp45Deg[7];
		temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
		temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
		temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
		{
			pphase_b45 = pphase_b45 - 0.5f;
			float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
			float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
			float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
			float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
			temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
			temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
			temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
			temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
		}

		// Pick six consecutive candidates: shift by one sample once the position
		// along the diagonal passes 1, and wrap the phase back into [0, 1).
		float interp45Deg[6];
		float pphase_p45 = phase_x_frac + phase_y_frac;
		if (pphase_p45 >= 1) {
			[unroll]
			for (int i = 0; i < 6; i++) {
				interp45Deg[i] = temp_interp45Deg[i + 1];
			}
			pphase_p45 = pphase_p45 - 1;
		} else {
			[unroll]
			for (int i = 0; i < 6; i++) {
				interp45Deg[i] = temp_interp45Deg[i];
			}
		}

		// Quantise with kPhaseCount so the index always matches the number of
		// rows in the coefficient tables.
		f += EvalPoly6(interp45Deg, int(pphase_p45 * kPhaseCount)) * w.z;
	}
	if (w.w > 0.0f) {
		// 135 deg filter
		// Position across the anti-diagonal, in [0, 1].
		float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);

		// Seven candidate samples, built like the 45 deg case but mirrored.
		float temp_interp135Deg[7];
		temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
		temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
		temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
		{
			pphase_b135 = pphase_b135 - 0.5f;
			float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
			float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
			float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
			float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
			temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
			temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
			temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
			temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
		}

		float interp135Deg[6];
		float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
		if (pphase_p135 >= 1) {
			[unroll]
			for (int i = 0; i < 6; ++i) {
				interp135Deg[i] = temp_interp135Deg[i + 1];
			}
			pphase_p135 = pphase_p135 - 1;
		} else {
			[unroll]
			for (int i = 0; i < 6; ++i) {
				interp135Deg[i] = temp_interp135Deg[i];
			}
		}

		f += EvalPoly6(interp135Deg, int(pphase_p135 * kPhaseCount)) * w.w;
	}
	return f;
}

// Entry point of pass 1: scales one NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT block
// of the output image.
//
// blockStart : output coordinate of the block's first pixel
// threadId   : id of the calling thread; only .x is used
//
// Steps, separated by group barriers:
//   1. load the luma of the source region covered by the block (plus filter
//      support) into shPixelsY;
//   2. derive edge-direction weights for that region into shEdgeMap and copy
//      the coefficient tables into shared memory;
//   3. every thread produces NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT /
//      NIS_THREAD_GROUP_SIZE output pixels in one column of the block.
void Pass1(uint2 blockStart, uint3 threadId) {
	const float2 scale = GetScale();
	const float2 inputPt = GetInputPt();

	// Source pixels advanced per destination pixel.
	const float kScaleX = 1.0f / scale.x;
	const float kScaleY = 1.0f / scale.y;
	// Factors that turn a source pixel coordinate into a texture coordinate
	// (presumably the size of one input texel - confirm against GetInputPt).
	const float kSrcNormX = inputPt.x;
	const float kSrcNormY = inputPt.y;
	// Kept as an unsigned integer: it only ever takes part in index arithmetic.
	const uint threadIdx = threadId.x;

	// Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
	const int dstBlockX = blockStart.x;
	const int dstBlockY = blockStart.y;

	const int srcBlockStartX = int(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
	const int srcBlockStartY = int(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
	const int srcBlockEndX = int(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
	const int srcBlockEndY = int(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));

	int numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
	int numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;

	// round-up load region to even size since we're loading in 2x2 batches
	numTilePixelsX += numTilePixelsX & 0x1;
	numTilePixelsY += numTilePixelsY & 0x1;
	const int numTilePixels = numTilePixelsX * numTilePixelsY;

	// calculate the equivalent values for the edge map
	const int numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
	const int numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
	const int numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;

	// fill in input luma tile (shPixelsY) in batches of 2x2 pixels
	// we use texture gather to get extra support necessary
	// to compute 2x2 edge map outputs too
	{
		// 0.5 to be in the center of texel
		// - (kSupportSize - 1) / 2 to shift by the kernel support size
		// (the division is an integer division, so the shift is 0.5 - 2 = -1.5)
		const float kShift = 0.5f - (kSupportSize - 1) / 2;

		for (uint i = threadIdx * 2; i < uint(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
			// Top-left corner of the 2x2 quad inside the tile.
			uint py = (i / numTilePixelsX) * 2;
			uint px = i % numTilePixelsX;

			// srcBlockStartX/Y can be negative for blocks on the top/left image
			// border. px/py are unsigned, so convert them to int first: a mixed
			// int + uint sum is evaluated as unsigned and a negative coordinate
			// would wrap to a huge value and sample the opposite image border.
			const float tx = (srcBlockStartX + int(px) + kShift) * kSrcNormX;
			const float ty = (srcBlockStartY + int(py) + kShift) * kSrcNormY;

			float p[2][2];
			{
				const float4 sr = INPUT.GatherRed(samplerLinearClamp, float2(tx, ty));
				const float4 sg = INPUT.GatherGreen(samplerLinearClamp, float2(tx, ty));
				const float4 sb = INPUT.GatherBlue(samplerLinearClamp, float2(tx, ty));

				// Gather component order: w = top-left, z = top-right,
				// x = bottom-left, y = bottom-right.
				p[0][0] = getY(float3(sr.w, sg.w, sb.w));
				p[0][1] = getY(float3(sr.z, sg.z, sb.z));
				p[1][0] = getY(float3(sr.x, sg.x, sb.x));
				p[1][1] = getY(float3(sr.y, sg.y, sb.y));
			}

			const uint idx = py * kTilePitch + px;
			shPixelsY[idx] = p[0][0];
			shPixelsY[idx + 1] = p[0][1];
			shPixelsY[idx + kTilePitch] = p[1][0];
			shPixelsY[idx + kTilePitch + 1] = p[1][1];
		}
	}
	// The edge map below reads luma values written by other threads.
	GroupMemoryBarrierWithGroupSync();
	{
		// fill in the edge map of 2x2 pixels
		for (uint i = threadIdx * 2; i < uint(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
			uint py = (i / numEdgeMapPixelsX) * 2;
			uint px = i % numEdgeMapPixelsX;

			const uint edgeMapIdx = py * kEdgeMapPitch + px;

			// The edge map is inset by one pixel relative to the luma tile.
			uint tileCornerIdx = (py + 1) * kTilePitch + px + 1;
			float p[4][4];
			[unroll]
			for (int row = 0; row < 4; row++) {
				[unroll]
				for (int col = 0; col < 4; col++) {
					p[row][col] = shPixelsY[tileCornerIdx + row * kTilePitch + col];
				}
			}

			// One 4x4 window yields the weights of a 2x2 quad of edge-map entries.
			shEdgeMap[edgeMapIdx] = float4(GetEdgeMap(p, 0, 0));
			shEdgeMap[edgeMapIdx + 1] = float4(GetEdgeMap(p, 0, 1));
			shEdgeMap[edgeMapIdx + kEdgeMapPitch] = float4(GetEdgeMap(p, 1, 0));
			shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = float4(GetEdgeMap(p, 1, 1));
		}
	}
	LoadFilterBanksSh(int(threadIdx), NIS_THREAD_GROUP_SIZE);
	// Edge map and coefficient tables must be complete before filtering.
	GroupMemoryBarrierWithGroupSync();

	// output coord within a tile
	const int2 pos = int2(threadIdx % uint(NIS_BLOCK_WIDTH), threadIdx / uint(NIS_BLOCK_WIDTH));
	// x coord inside the output image
	const int dstX = dstBlockX + pos.x;
	// x coord inside the input image
	const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
	// nearest integer part
	const int px = int(floor(srcX) - srcBlockStartX);
	// fractional part
	const float fx = srcX - floor(srcX);
	// discretized phase
	const int fx_int = int(fx * kPhaseCount);

	for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
		// y coord inside the output image
		const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
		// dstX is fixed and dstY only grows with k, so once a pixel is rejected
		// the remaining iterations of this thread are skipped as well. All group
		// barriers have already been passed at this point.
		if (!CheckViewport(int2(dstX, dstY))) {
			return;
		}
		// y coord inside the input image
		const float srcY = (0.5f + dstY) * kScaleY - 0.5f;

		// nearest integer part
		const int py = int(floor(srcY) - srcBlockStartY);
		// fractional part
		const float fy = srcY - floor(srcY);
		// discretized phase
		const int fy_int = int(fy * kPhaseCount);

		// generate weights for directional filters
		const int startEdgeMapIdx = py * kEdgeMapPitch + px;
		float4 edge[2][2];
		{
			[unroll]
			for (int er = 0; er < 2; er++) {
				[unroll]
				for (int ec = 0; ec < 2; ec++) {
					// need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
					edge[er][ec] = shEdgeMap[startEdgeMapIdx + (er * kEdgeMapPitch) + ec];
				}
			}
		}
		const float4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;

		// load 6x6 support to regs
		const int startTileIdx = py * kTilePitch + px;
		float p[6][6];
		{
			[unroll]
			for (int tr = 0; tr < 6; ++tr) {
				[unroll]
				for (int tc = 0; tc < 6; ++tc) {
					p[tr][tc] = shPixelsY[startTileIdx + tr * kTilePitch + tc];
				}
			}
		}

		// weight of the non-directional filter: whatever the directional
		// weights leave over
		const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;

		// final luma is a weighted sum of directional & normal filters
		float opY = 0;

		// get traditional scaler filter output
		opY += FilterNormal(p, fx_int, fy_int) * baseWeight;

		// get directional filter bank output
		opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);

		// do bilinear tap for chroma upscaling
		float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;

		// Shift all three channels by the difference between the filtered luma
		// and the luma of the bilinear sample, so the result carries the
		// filtered luma.
		const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
		op.x += corr;
		op.y += corr;
		op.z += corr;

		WriteToOutput(uint2(dstX, dstY), op);
	}
}