feat: 移植 NIS v1.0.2

2026-06-24 02:04:10 +00:00 · 2022-02-25 22:43:37 +08:00 · 2022-02-25 22:43:37 +08:00 · 1b613e89c7
commit 1b613e89c7
parent cb356916e9
4 changed files with 358 additions and 252 deletions
--- a/Effects/NIS.hlsl
+++ b/Effects/NIS.hlsl
@ -1,64 +1,40 @@
 // 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h

 //!MAGPIE EFFECT
-//!VERSION 1
+//!VERSION 2


-//!CONSTANT
-//!VALUE INPUT_PT_X
-float inputPtX;
-
-//!CONSTANT
-//!VALUE INPUT_PT_Y
-float inputPtY;
-
-//!CONSTANT
-//!VALUE SCALE_X
-float scaleX;
-
-//!CONSTANT
-//!VALUE SCALE_Y
-float scaleY;
-
-//!TEXTURE
-Texture2D INPUT;
-
-//!TEXTURE
-//!WIDTH INPUT_WIDTH
-//!HEIGHT INPUT_HEIGHT
-//!FORMAT R16G16B16A16_FLOAT
-Texture2D shEdgeMap;
-
-//!TEXTURE
-//!SOURCE NIS_Coef_Scale.dds
-Texture2D coefScale;
-
-//!TEXTURE
-//!SOURCE NIS_Coef_USM.dds
-Texture2D coefUSM;
-
-//!SAMPLER
-//!FILTER LINEAR
-SamplerState samLinear;
-
-//!SAMPLER
-//!FILTER POINT
-SamplerState samPoint;
-
-//!CONSTANT
+//!PARAMETER
 //!DEFAULT 0.5
 //!MIN 0
 //!MAX 1
 float sharpness;

+//!TEXTURE
+Texture2D INPUT;

-//!COMMON
+//!TEXTURE
+//!SOURCE NIS_Coef_Scale.dds
+Texture2D coef_scaler;

-#define kDetectRatio (1127.f / 1024.f)
+//!TEXTURE
+//!SOURCE NIS_Coef_USM.dds
+Texture2D coef_usm;
+
+//!SAMPLER
+//!FILTER LINEAR
+SamplerState samplerLinearClamp;
+
+
+//!PASS 1
+//!IN INPUT, coef_scaler, coef_usm
+//!BLOCK_SIZE 32,24
+//!NUM_THREADS 256,1,1
+
+
+#define kDetectRatio (2 * 1127.f / 1024.f)
 #define kDetectThres (64.0f / 1024.0f)
-#define kPhaseCount 64
-#define kFilterSize 8
-#define kEps 1.0f
+#define kEps (1.0f / 255.0f)
 #define kMinContrastRatio 2.0f
 #define kMaxContrastRatio 10.0f
 #define kRatioNorm (1.0f / (kMaxContrastRatio - kMinContrastRatio))
@ -79,21 +55,38 @@ float sharpness;
 #define NIS_SCALE_FLOAT 1.0f
 #define NIS_SCALE_INT 1

+#define NIS_BLOCK_WIDTH 32
+#define NIS_BLOCK_HEIGHT 24
+#define NIS_THREAD_GROUP_SIZE 256
+#define kPhaseCount  64
+#define kFilterSize  6
+#define kSupportSize 6
+#define kPadSize     kSupportSize
+// 'Tile' is the region of source luminance values that we load into shPixelsY.
+// It is the area of source pixels covered by the destination 'Block' plus a
+// 3 pixel border of support pixels.
+#define kTilePitch              (NIS_BLOCK_WIDTH + kPadSize)
+#define kTileSize               (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
+// 'EdgeMap' is the region of source pixels for which edge map vectors are derived.
+// It is the area of source pixels covered by the destination 'Block' plus a
+// 1 pixel border.
+#define kEdgeMapPitch           (NIS_BLOCK_WIDTH + 2)
+#define kEdgeMapSize            (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))
+
+groupshared float shPixelsY[kTileSize];
+groupshared float shCoefScaler[kPhaseCount][kFilterSize];
+groupshared float shCoefUSM[kPhaseCount][kFilterSize];
+groupshared float4 shEdgeMap[kEdgeMapSize];

 float getY(float3 rgba) {
 	return 0.2126f * rgba.x + 0.7152f * rgba.y + 0.0722f * rgba.z;
 }

-
-//!PASS 1
-//!BIND INPUT
-//!SAVE shEdgeMap
-
-float4 GetEdgeMap(float p[3][3]) {
-	const float g_0 = abs(p[0][0] + p[0][1] + p[0][2] - p[2][0] - p[2][1] - p[2][2]);
-	const float g_45 = abs(p[1][0] + p[0][0] + p[0][1] - p[2][1] - p[2][2] - p[1][2]);
-	const float g_90 = abs(p[0][0] + p[1][0] + p[2][0] - p[0][2] - p[1][2] - p[2][2]);
-	const float g_135 = abs(p[1][0] + p[2][0] + p[2][1] - p[0][1] - p[0][2] - p[1][2]);
+float4 GetEdgeMap(float p[4][4], int i, int j) {
+	const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
+	const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
+	const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
+	const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);

 	const float g_0_90_max = max(g_0, g_90);
 	const float g_0_90_min = min(g_0, g_90);
@ -103,80 +96,76 @@ float4 GetEdgeMap(float p[3][3]) {
 	float e_0_90 = 0;
 	float e_45_135 = 0;

-	if ((g_0_90_max + g_45_135_max) != 0) {
-		e_0_90 = g_0_90_max / (g_0_90_max + g_45_135_max);
-		e_0_90 = min(e_0_90, 1.0f);
-		e_45_135 = 1.0f - e_0_90;
+	if (g_0_90_max + g_45_135_max == 0) {
+		return float4(0, 0, 0, 0);
 	}

-	float e = ((g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min)) ? 1.f : 0.f;
-	float edge_0 = (g_0_90_max == g_0) ? e : 0.f;
-	float edge_90 = (g_0_90_max == g_0) ? 0.f : e;
+	e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
+	e_45_135 = 1.0f - e_0_90;

-	e = ((g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min)) ? 1.f : 0.f;
-	float edge_45 = (g_45_135_max == g_45) ? e : 0.f;
-	float edge_135 = (g_45_135_max == g_45) ? 0.f : e;
+	bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
+	bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);
+	bool c_g_0_90 = g_0_90_max == g_0;
+	bool c_g_45_135 = g_45_135_max == g_45;

-	float weight_0 = 0.f;
-	float weight_90 = 0.f;
-	float weight_45 = 0.f;
-	float weight_135 = 0.f;
-	if ((edge_0 + edge_90 + edge_45 + edge_135) >= 2.0f) {
-		weight_0 = (edge_0 == 1.0f) ? e_0_90 : 0.f;
-		weight_90 = (edge_0 == 1.0f) ? 0.f : e_0_90;
+	float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
+	float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;

-		weight_45 = (edge_45 == 1.0f) ? e_45_135 : 0.f;
-		weight_135 = (edge_45 == 1.0f) ? 0.f : e_45_135;
-	} else if ((edge_0 + edge_90 + edge_45 + edge_135) >= 1.0f) {
-		weight_0 = edge_0;
-		weight_90 = edge_90;
-		weight_45 = edge_45;
-		weight_135 = edge_135;
-	}
+	float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
+	float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
+	float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
+	float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;

 	return float4(weight_0, weight_90, weight_45, weight_135);
 }

-float4 Pass1(float2 pos) {
-	float p[3][3];
+void LoadFilterBanksSh(int i0, int di) {
+	// Load up filter banks to shared memory
+	// The work is spread over (kPhaseCount * 2) threads
+	for (int i = i0; i < kPhaseCount * 2; i += di) {
+		int phase = i >> 1;
+		int vIdx = i & 1;

-	[unroll]
-	for (int j = 0; j < 3; j++) {
-		[unroll]
-		for (int k = 0; k < 3; k++) {
-			const float3 px = INPUT.Sample(samPoint, pos + float2(k - 1, j - 1) * float2(inputPtX, inputPtY)).xyz;
-			p[j][k] = getY(px);
+		float4 v = float4(coef_scaler[int2(vIdx, phase)]);
+		int filterOffset = vIdx * 4;
+		shCoefScaler[phase][filterOffset + 0] = v.x;
+		shCoefScaler[phase][filterOffset + 1] = v.y;
+		if (vIdx == 0) {
+			shCoefScaler[phase][2] = v.z;
+			shCoefScaler[phase][3] = v.w;
+		}
+
+		v = float4(coef_usm[int2(vIdx, phase)]);
+		shCoefUSM[phase][filterOffset + 0] = v.x;
+		shCoefUSM[phase][filterOffset + 1] = v.y;
+		if (vIdx == 0) {
+			shCoefUSM[phase][2] = v.z;
+			shCoefUSM[phase][3] = v.w;
 		}
 	}
-
-	return GetEdgeMap(p);
 }


-//!PASS 2
-//!BIND INPUT, shEdgeMap, coefScale, coefUSM
-
 float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index) {
-	const bool selector = (phase_index <= kPhaseCount / 2);
-	float sel = selector ? p0 : p3;
-	const float a_min = min(min(p1, p2), sel);
-	const float a_max = max(max(p1, p2), sel);
-	sel = selector ? p2 : p5;
-	const float b_min = min(min(p3, p4), sel);
-	const float b_max = max(max(p3, p4), sel);
+    const bool selector = (phase_index <= kPhaseCount / 2);
+    float sel = selector ? p0 : p3;
+    const float a_min = min(min(p1, p2), sel);
+    const float a_max = max(max(p1, p2), sel);
+    sel = selector ? p2 : p5;
+    const float b_min = min(min(p3, p4), sel);
+    const float b_max = max(max(p3, p4), sel);

-	const float a_cont = a_max - a_min;
-	const float b_cont = b_max - b_min;
+    const float a_cont = a_max - a_min;
+    const float b_cont = b_max - b_min;

-	const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
-	return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
+    const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
+    return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
 }

-
 float4 GetInterpEdgeMap(const float4 edge[2][2], float phase_frac_x, float phase_frac_y) {
-	float4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
-	float4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
-	return lerp(h0, h1, phase_frac_y);
+    float4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
+    float4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
+    return lerp(h0, h1, phase_frac_y);
 }

 float EvalPoly6(const float pxl[6], int phase_int) {
@ -184,14 +173,14 @@ float EvalPoly6(const float pxl[6], int phase_int) {
 	{
 		[unroll]
 		for (int i = 0; i < 6; ++i) {
-			y += coefScale.Sample(samPoint, float2((phase_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x * pxl[i];
+			y += shCoefScaler[phase_int][i] * pxl[i];
 		}
 	}
 	float y_usm = 0.f;
 	{
 		[unroll]
 		for (int i = 0; i < 6; ++i) {
-			y_usm += coefUSM.Sample(samPoint, float2((phase_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x * pxl[i];
+			y_usm += shCoefUSM[phase_int][i] * pxl[i];
 		}
 	}

@ -213,109 +202,6 @@ float EvalPoly6(const float pxl[6], int phase_int) {
 	return y + y_usm;
 }

-
-float4 GetDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int) {
-	float4 f;
-	// 0 deg filter
-	float interp0Deg[6];
-	{
-		[unroll]
-		for (int i = 0; i < 6; ++i) {
-			interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
-		}
-	}
-
-	f.x = EvalPoly6(interp0Deg, phase_y_frac_int);
-
-	// 90 deg filter
-	float interp90Deg[6];
-	{
-		[unroll]
-		for (int i = 0; i < 6; ++i) {
-			interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
-		}
-	}
-
-	f.y = EvalPoly6(interp90Deg, phase_x_frac_int);
-
-	//45 deg filter
-	float pphase_b45;
-	pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
-
-	float temp_interp45Deg[7];
-	temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
-	temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
-	temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
-	{
-		pphase_b45 = pphase_b45 - 0.5f;
-		float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
-		float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
-		float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
-		float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
-		temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
-		temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
-		temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
-		temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
-	}
-
-	float interp45Deg[6];
-	float pphase_p45 = phase_x_frac + phase_y_frac;
-	if (pphase_p45 >= 1) {
-		[unroll]
-		for (int i = 0; i < 6; i++) {
-			interp45Deg[i] = temp_interp45Deg[i + 1];
-		}
-		pphase_p45 = pphase_p45 - 1;
-	} else {
-		[unroll]
-		for (int i = 0; i < 6; i++) {
-			interp45Deg[i] = temp_interp45Deg[i];
-		}
-	}
-
-	f.z = EvalPoly6(interp45Deg, (int)(pphase_p45 * 64));
-
-	//135 deg filter
-	float pphase_b135;
-	pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
-
-	float temp_interp135Deg[7];
-
-	temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
-	temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
-	temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
-
-	{
-		pphase_b135 = pphase_b135 - 0.5f;
-		float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
-		float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
-		float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
-		float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
-		temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
-		temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
-		temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
-		temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
-	}
-
-	float interp135Deg[6];
-	float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
-	if (pphase_p135 >= 1) {
-		[unroll]
-		for (int i = 0; i < 6; ++i) {
-			interp135Deg[i] = temp_interp135Deg[i + 1];
-		}
-		pphase_p135 = pphase_p135 - 1;
-	} else {
-		[unroll]
-		for (int i = 0; i < 6; ++i) {
-			interp135Deg[i] = temp_interp135Deg[i];
-		}
-	}
-
-	f.w = EvalPoly6(interp135Deg, (int)(pphase_p135 * 64));
-	return f;
-}
-
 float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int) {
 	float h_acc = 0.0f;
 	[unroll]
@ -323,64 +209,284 @@ float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_i
 		float v_acc = 0.0f;
 		[unroll]
 		for (int i = 0; i < 6; ++i) {
-			v_acc += p[i][j] * coefScale.Sample(samPoint, float2((phase_y_frac_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x;
+			v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
 		}
-		h_acc += v_acc * coefScale.Sample(samPoint, float2((phase_x_frac_int + 0.5f) / kPhaseCount, (j + 0.5f) / 6.0f)).x;
+		h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
 	}

 	// let's return the sum unpacked -> we can accumulate it later
 	return h_acc;
 }

-float4 Pass2(float2 pos) {
-	float2 srcPos = pos / float2(inputPtX, inputPtY) + 0.5f;
-	float2 srcPosB = floor(srcPos);
-
-	// load 6x6 support to regs
-	float p[6][6];
-	{
-		[unroll]
-		for (int i = 0; i < 6; ++i) {
+float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, float4 w) {
+	float f = 0;
+	if (w.x > 0.0f) {
+		// 0 deg filter
+		float interp0Deg[6];
+		{
 			[unroll]
-			for (int j = 0; j < 6; ++j) {
-				p[i][j] = getY(INPUT.Sample(samPoint, (srcPosB + float2(j - 2, i - 2) - 0.5f) * float2(inputPtX, inputPtY)).rgb);
+			for (int i = 0; i < 6; ++i) {
+				interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
 			}
 		}
+		f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
 	}
+	if (w.y > 0.0f) {
+		// 90 deg filter
+		float interp90Deg[6];
+		{
+			[unroll]
+			for (int i = 0; i < 6; ++i) {
+				interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
+			}
+		}

-	// compute discretized filter phase
-	const float2 f = srcPos - srcPosB;
-	const int fx_int = (int)(f.x * kPhaseCount);
-	const int fy_int = (int)(f.y * kPhaseCount);
-	
-	// get traditional scaler filter output
-	const float pixel_n = FilterNormal(p, fx_int, fy_int);
-	
-	// get directional filter bank output
-	float4 opDirYU = GetDirFilters(p, f.x, f.y, fx_int, fy_int);
+		f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
+	}
+	if (w.z > 0.0f) {
+		//45 deg filter
+		float pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);

-	// final luma is a weighted product of directional & normal filters
+		float temp_interp45Deg[7];
+		temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
+		temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
+		temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
+		{
+			pphase_b45 = pphase_b45 - 0.5f;
+			float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
+			float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
+			float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
+			float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
+			temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
+			temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
+			temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
+			temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
+		}

-	// generate weights for directional filters
-	float4 edge[2][2];
-	edge[0][0] = shEdgeMap.Sample(samPoint, (srcPosB + float2(0, 0)) * float2(inputPtX, inputPtY));
-	edge[0][1] = shEdgeMap.Sample(samPoint, (srcPosB + float2(1, 0)) * float2(inputPtX, inputPtY));
-	edge[1][0] = shEdgeMap.Sample(samPoint, (srcPosB + float2(0, 1)) * float2(inputPtX, inputPtY));
-	edge[1][1] = shEdgeMap.Sample(samPoint, (srcPosB + float2(1, 1)) * float2(inputPtX, inputPtY));
+		float interp45Deg[6];
+		float pphase_p45 = phase_x_frac + phase_y_frac;
+		if (pphase_p45 >= 1) {
+			[unroll]
+			for (int i = 0; i < 6; i++) {
+				interp45Deg[i] = temp_interp45Deg[i + 1];
+			}
+			pphase_p45 = pphase_p45 - 1;
+		} else {
+			[unroll]
+			for (int i = 0; i < 6; i++) {
+				interp45Deg[i] = temp_interp45Deg[i];
+			}
+		}

-	const float4 w = GetInterpEdgeMap(edge, f.x, f.y) * NIS_SCALE_INT;
-	
-	// final pixel is a weighted sum filter outputs
-	const float opY = (opDirYU.x * w.x + opDirYU.y * w.y + opDirYU.z * w.z + opDirYU.w * w.w +
-		pixel_n * (NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w)) * (1.0f / NIS_SCALE_FLOAT);
-	// do bilinear tap for chroma upscaling
+		f += EvalPoly6(interp45Deg, int(pphase_p45 * 64)) * w.z;
+	}
+	if (w.w > 0.0f) {
+		//135 deg filter
+		float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);

-	float4 op = INPUT.Sample(samLinear, pos);
+		float temp_interp135Deg[7];
+		temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
+		temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
+		temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
+		{
+			pphase_b135 = pphase_b135 - 0.5f;
+			float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
+			float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
+			float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
+			float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
+			temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
+			temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
+			temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
+			temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
+		}

-	const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
-	op.x += corr;
-	op.y += corr;
-	op.z += corr;
+		float interp135Deg[6];
+		float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
+		if (pphase_p135 >= 1) {
+			[unroll]
+			for (int i = 0; i < 6; ++i) {
+				interp135Deg[i] = temp_interp135Deg[i + 1];
+			}
+			pphase_p135 = pphase_p135 - 1;
+		} else {
+			[unroll]
+			for (int i = 0; i < 6; ++i) {
+				interp135Deg[i] = temp_interp135Deg[i];
+			}
+		}

-	return op;
+		f += EvalPoly6(interp135Deg, int(pphase_p135 * 64)) * w.w;
+	}
+	return f;
+}
+
+void Main(uint2 blockStart, uint3 threadId) {
+	float2 scale = GetScale();
+	float2 inputPt = GetInputPt();
+
+	float kScaleX = 1.0f / scale.x;
+	float kScaleY = 1.0f / scale.y;
+	float threadIdx = threadId.x;
+	float kSrcNormX = inputPt.x;
+	float kSrcNormY = inputPt.y;
+
+	// Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
+	int dstBlockX = blockStart.x;
+	int dstBlockY = blockStart.y;
+
+	const int srcBlockStartX = int(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
+	const int srcBlockStartY = int(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
+	const int srcBlockEndX = int(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
+	const int srcBlockEndY = int(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));
+
+	int numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
+	int numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;
+
+	// round-up load region to even size since we're loading in 2x2 batches
+	numTilePixelsX += numTilePixelsX & 0x1;
+	numTilePixelsY += numTilePixelsY & 0x1;
+	const int numTilePixels = numTilePixelsX * numTilePixelsY;
+
+	// calculate the equivalent values for the edge map
+	const int numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
+	const int numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
+	const int numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;
+
+	// fill in input luma tile (shPixelsY) in batches of 2x2 pixels
+	// we use texture gather to get extra support necessary
+	// to compute 2x2 edge map outputs too
+	{
+		for (uint i = threadIdx * 2; i < uint(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
+			uint py = (i / numTilePixelsX) * 2;
+			uint px = i % numTilePixelsX;
+
+			// 0.5 to be in the center of texel
+			// - (kSupportSize - 1) / 2 to shift by the kernel support size
+			float kShift = 0.5f - (kSupportSize - 1) / 2;
+
+			const float tx = (srcBlockStartX + px + kShift) * kSrcNormX;
+			const float ty = (srcBlockStartY + py + kShift) * kSrcNormY;
+
+			float p[2][2];
+			{
+				const float4 sr = INPUT.GatherRed(samplerLinearClamp, float2(tx, ty));
+				const float4 sg = INPUT.GatherGreen(samplerLinearClamp, float2(tx, ty));
+				const float4 sb = INPUT.GatherBlue(samplerLinearClamp, float2(tx, ty));
+
+				p[0][0] = getY(float3(sr.w, sg.w, sb.w));
+				p[0][1] = getY(float3(sr.z, sg.z, sb.z));
+				p[1][0] = getY(float3(sr.x, sg.x, sb.x));
+				p[1][1] = getY(float3(sr.y, sg.y, sb.y));
+			}
+
+			const uint idx = py * kTilePitch + px;
+			shPixelsY[idx] = float(p[0][0]);
+			shPixelsY[idx + 1] = float(p[0][1]);
+			shPixelsY[idx + kTilePitch] = float(p[1][0]);
+			shPixelsY[idx + kTilePitch + 1] = float(p[1][1]);
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	{
+		// fill in the edge map of 2x2 pixels
+		for (uint i = threadIdx * 2; i < uint(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
+			uint py = (i / numEdgeMapPixelsX) * 2;
+			uint px = i % numEdgeMapPixelsX;
+
+			const uint edgeMapIdx = py * kEdgeMapPitch + px;
+
+			uint tileCornerIdx = (py + 1) * kTilePitch + px + 1;
+			float p[4][4];
+			[unroll]
+			for (int j = 0; j < 4; j++) {
+				[unroll]
+				for (int k = 0; k < 4; k++) {
+					p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
+				}
+			}
+
+			shEdgeMap[edgeMapIdx] = float4(GetEdgeMap(p, 0, 0));
+			shEdgeMap[edgeMapIdx + 1] = float4(GetEdgeMap(p, 0, 1));
+			shEdgeMap[edgeMapIdx + kEdgeMapPitch] = float4(GetEdgeMap(p, 1, 0));
+			shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = float4(GetEdgeMap(p, 1, 1));
+		}
+	}
+	LoadFilterBanksSh(int(threadIdx), NIS_THREAD_GROUP_SIZE);
+	GroupMemoryBarrierWithGroupSync();
+
+	// output coord within a tile
+	const int2 pos = int2(uint(threadIdx) % uint(NIS_BLOCK_WIDTH), uint(threadIdx) / uint(NIS_BLOCK_WIDTH));
+	// x coord inside the output image
+	const int dstX = dstBlockX + pos.x;
+	// x coord inside the input image
+	const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
+	// integer part (floor), relative to the start of the source tile
+	const int px = int(floor(srcX) - srcBlockStartX);
+	// fractional part
+	const float fx = srcX - floor(srcX);
+	// discretized phase
+	const int fx_int = int(fx * kPhaseCount);
+
+	for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
+		// y coord inside the output image
+		const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
+		// y coord inside the input image
+		const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
+
+		// integer part (floor), relative to the start of the source tile
+		const int py = int(floor(srcY) - srcBlockStartY);
+		// fractional part
+		const float fy = srcY - floor(srcY);
+		// discretized phase
+		const int fy_int = int(fy * kPhaseCount);
+
+		// generate weights for directional filters
+		const int startEdgeMapIdx = py * kEdgeMapPitch + px;
+		float4 edge[2][2];
+		[unroll]
+		for (int i = 0; i < 2; i++) {
+			[unroll]
+			for (int j = 0; j < 2; j++) {
+				// need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
+				edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
+			}
+		}
+		const float4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;
+
+		// load 6x6 support to regs
+		const int startTileIdx = py * kTilePitch + px;
+		float p[6][6];
+		{
+			[unroll]
+			for (int i = 0; i < 6; ++i) {
+				[unroll]
+				for (int j = 0; j < 6; ++j) {
+					p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
+				}
+			}
+		}
+
+		// weight for the normal (non-directional) filter luma
+		const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;
+
+		// final luma is a weighted sum of directional & normal filters
+		float opY = 0;
+
+		// get traditional scaler filter output
+		opY += FilterNormal(p, fx_int, fy_int) * baseWeight;
+
+		// get directional filter bank output
+		opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);
+
+		// do bilinear tap for chroma upscaling
+
+		float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;
+
+		const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
+		op.x += corr;
+		op.y += corr;
+		op.z += corr;
+
+		WriteToOutput(uint2(dstX, dstY), op);
+	}
 }
--- a/Effects/NIS_Coef_Scale.dds
+++ b/Effects/NIS_Coef_Scale.dds
--- a/Effects/NIS_Coef_USM.dds
+++ b/Effects/NIS_Coef_USM.dds
--- a/tools/MPVHookTextureParser/MPVHookTextureParser.vcxproj
+++ b/tools/MPVHookTextureParser/MPVHookTextureParser.vcxproj
@ -29,20 +29,20 @@
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">