feat: 移植 NIS v1.0.2

This commit is contained in:
Xu Liu 2022-02-25 22:43:37 +08:00
commit 1b613e89c7
4 changed files with 358 additions and 252 deletions

View file

@ -1,64 +1,40 @@
// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
//!MAGPIE EFFECT
//!VERSION 1
//!VERSION 2
//!CONSTANT
//!VALUE INPUT_PT_X
float inputPtX;
//!CONSTANT
//!VALUE INPUT_PT_Y
float inputPtY;
//!CONSTANT
//!VALUE SCALE_X
float scaleX;
//!CONSTANT
//!VALUE SCALE_Y
float scaleY;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D shEdgeMap;
//!TEXTURE
//!SOURCE NIS_Coef_Scale.dds
Texture2D coefScale;
//!TEXTURE
//!SOURCE NIS_Coef_USM.dds
Texture2D coefUSM;
//!SAMPLER
//!FILTER LINEAR
SamplerState samLinear;
//!SAMPLER
//!FILTER POINT
SamplerState samPoint;
//!CONSTANT
//!PARAMETER
//!DEFAULT 0.5
//!MIN 0
//!MAX 1
float sharpness;
//!TEXTURE
Texture2D INPUT;
//!COMMON
//!TEXTURE
//!SOURCE NIS_Coef_Scale.dds
Texture2D coef_scaler;
#define kDetectRatio (1127.f / 1024.f)
//!TEXTURE
//!SOURCE NIS_Coef_USM.dds
Texture2D coef_usm;
//!SAMPLER
//!FILTER LINEAR
SamplerState samplerLinearClamp;
//!PASS 1
//!IN INPUT, coef_scaler, coef_usm
//!BLOCK_SIZE 32,24
//!NUM_THREADS 256,1,1
#define kDetectRatio (2 * 1127.f / 1024.f)
#define kDetectThres (64.0f / 1024.0f)
#define kPhaseCount 64
#define kFilterSize 8
#define kEps 1.0f
#define kEps (1.0f / 255.0f)
#define kMinContrastRatio 2.0f
#define kMaxContrastRatio 10.0f
#define kRatioNorm (1.0f / (kMaxContrastRatio - kMinContrastRatio))
@ -79,21 +55,38 @@ float sharpness;
#define NIS_SCALE_FLOAT 1.0f
#define NIS_SCALE_INT 1
#define NIS_BLOCK_WIDTH 32
#define NIS_BLOCK_HEIGHT 24
#define NIS_THREAD_GROUP_SIZE 256
#define kPhaseCount 64
#define kFilterSize 6
#define kSupportSize 6
#define kPadSize kSupportSize
// 'Tile' is the region of source luminance values that we load into shPixelsY.
// It is the area of source pixels covered by the destination 'Block' plus a
// 3 pixel border of support pixels.
#define kTilePitch (NIS_BLOCK_WIDTH + kPadSize)
#define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
// 'EdgeMap' is the region of source pixels for which edge map vectors are derived.
// It is the area of source pixels covered by the destination 'Block' plus a
// 1 pixel border.
#define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2)
#define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))
groupshared float shPixelsY[kTileSize];
groupshared float shCoefScaler[kPhaseCount][kFilterSize];
groupshared float shCoefUSM[kPhaseCount][kFilterSize];
groupshared float4 shEdgeMap[kEdgeMapSize];
// Returns the luma of an RGB colour as a weighted sum of its channels
// (0.2126 R + 0.7152 G + 0.0722 B, i.e. the Rec. 709 luma weights).
// Only the x/y/z components of the argument are read.
float getY(float3 rgba) {
	const float weightedR = 0.2126f * rgba.x;
	const float weightedG = 0.7152f * rgba.y;
	const float weightedB = 0.0722f * rgba.z;
	return weightedR + weightedG + weightedB;
}
//!PASS 1
//!BIND INPUT
//!SAVE shEdgeMap
float4 GetEdgeMap(float p[3][3]) {
const float g_0 = abs(p[0][0] + p[0][1] + p[0][2] - p[2][0] - p[2][1] - p[2][2]);
const float g_45 = abs(p[1][0] + p[0][0] + p[0][1] - p[2][1] - p[2][2] - p[1][2]);
const float g_90 = abs(p[0][0] + p[1][0] + p[2][0] - p[0][2] - p[1][2] - p[2][2]);
const float g_135 = abs(p[1][0] + p[2][0] + p[2][1] - p[0][1] - p[0][2] - p[1][2]);
float4 GetEdgeMap(float p[4][4], int i, int j) {
const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);
const float g_0_90_max = max(g_0, g_90);
const float g_0_90_min = min(g_0, g_90);
@ -103,80 +96,76 @@ float4 GetEdgeMap(float p[3][3]) {
float e_0_90 = 0;
float e_45_135 = 0;
if ((g_0_90_max + g_45_135_max) != 0) {
e_0_90 = g_0_90_max / (g_0_90_max + g_45_135_max);
e_0_90 = min(e_0_90, 1.0f);
e_45_135 = 1.0f - e_0_90;
if (g_0_90_max + g_45_135_max == 0) {
return float4(0, 0, 0, 0);
}
float e = ((g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min)) ? 1.f : 0.f;
float edge_0 = (g_0_90_max == g_0) ? e : 0.f;
float edge_90 = (g_0_90_max == g_0) ? 0.f : e;
e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
e_45_135 = 1.0f - e_0_90;
e = ((g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min)) ? 1.f : 0.f;
float edge_45 = (g_45_135_max == g_45) ? e : 0.f;
float edge_135 = (g_45_135_max == g_45) ? 0.f : e;
bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);
bool c_g_0_90 = g_0_90_max == g_0;
bool c_g_45_135 = g_45_135_max == g_45;
float weight_0 = 0.f;
float weight_90 = 0.f;
float weight_45 = 0.f;
float weight_135 = 0.f;
if ((edge_0 + edge_90 + edge_45 + edge_135) >= 2.0f) {
weight_0 = (edge_0 == 1.0f) ? e_0_90 : 0.f;
weight_90 = (edge_0 == 1.0f) ? 0.f : e_0_90;
float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;
weight_45 = (edge_45 == 1.0f) ? e_45_135 : 0.f;
weight_135 = (edge_45 == 1.0f) ? 0.f : e_45_135;
} else if ((edge_0 + edge_90 + edge_45 + edge_135) >= 1.0f) {
weight_0 = edge_0;
weight_90 = edge_90;
weight_45 = edge_45;
weight_135 = edge_135;
}
float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;
return float4(weight_0, weight_90, weight_45, weight_135);
}
float4 Pass1(float2 pos) {
float p[3][3];
void LoadFilterBanksSh(int i0, int di) {
// Load up filter banks to shared memory
// The work is spread over (kPhaseCount * 2) threads
for (int i = i0; i < kPhaseCount * 2; i += di) {
int phase = i >> 1;
int vIdx = i & 1;
[unroll]
for (int j = 0; j < 3; j++) {
[unroll]
for (int k = 0; k < 3; k++) {
const float3 px = INPUT.Sample(samPoint, pos + float2(k - 1, j - 1) * float2(inputPtX, inputPtY)).xyz;
p[j][k] = getY(px);
float4 v = float4(coef_scaler[int2(vIdx, phase)]);
int filterOffset = vIdx * 4;
shCoefScaler[phase][filterOffset + 0] = v.x;
shCoefScaler[phase][filterOffset + 1] = v.y;
if (vIdx == 0) {
shCoefScaler[phase][2] = v.z;
shCoefScaler[phase][3] = v.w;
}
v = float4(coef_usm[int2(vIdx, phase)]);
shCoefUSM[phase][filterOffset + 0] = v.x;
shCoefUSM[phase][filterOffset + 1] = v.y;
if (vIdx == 0) {
shCoefUSM[phase][2] = v.z;
shCoefUSM[phase][3] = v.w;
}
}
return GetEdgeMap(p);
}
//!PASS 2
//!BIND INPUT, shEdgeMap, coefScale, coefUSM
// Computes a contrast-dependent gain from six consecutive samples p0..p5.
//
// Two overlapping 3-sample windows are examined; which samples form them
// depends on the filter phase:
//   phase_index <= kPhaseCount / 2 : A = {p0, p1, p2}, B = {p2, p3, p4}
//   otherwise                      : A = {p1, p2, p3}, B = {p3, p4, p5}
// The contrast of a window is (max - min) of its samples. The ratio of the
// larger contrast to the smaller one (kEps keeps the divisor non-zero) is
// mapped linearly from [kMinContrastRatio, kMaxContrastRatio] to [1, 0] via
// kRatioNorm and saturate(), then scaled by kContrastBoost.
//
// Returns a value in [0, kContrastBoost]: kContrastBoost when both windows
// have similar contrast, falling to 0 as the contrasts diverge.
float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index) {
	// true: the windows lean towards the first samples, false: towards the last.
	const bool selector = (phase_index <= kPhaseCount / 2);

	// Window A: p1, p2 plus the phase-selected third sample.
	float sel = selector ? p0 : p3;
	const float a_min = min(min(p1, p2), sel);
	const float a_max = max(max(p1, p2), sel);

	// Window B: p3, p4 plus the phase-selected third sample.
	sel = selector ? p2 : p5;
	const float b_min = min(min(p3, p4), sel);
	const float b_max = max(max(p3, p4), sel);

	const float a_cont = a_max - a_min;
	const float b_cont = b_max - b_min;

	const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
	return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
// Bilinearly interpolates a 2x2 neighbourhood of edge-map vectors.
//
// edge:          edge[row][col]; row 0 / col 0 is the sample that receives
//                full weight when both fractions are 0.
// phase_frac_x:  horizontal blend factor between col 0 and col 1.
// phase_frac_y:  vertical blend factor between row 0 and row 1.
//
// Returns the interpolated float4 (one component per edge direction).
float4 GetInterpEdgeMap(const float4 edge[2][2], float phase_frac_x, float phase_frac_y) {
	// Blend horizontally within each row first, then vertically between rows.
	const float4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
	const float4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
	return lerp(h0, h1, phase_frac_y);
}
float EvalPoly6(const float pxl[6], int phase_int) {
@ -184,14 +173,14 @@ float EvalPoly6(const float pxl[6], int phase_int) {
{
[unroll]
for (int i = 0; i < 6; ++i) {
y += coefScale.Sample(samPoint, float2((phase_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x * pxl[i];
y += shCoefScaler[phase_int][i] * pxl[i];
}
}
float y_usm = 0.f;
{
[unroll]
for (int i = 0; i < 6; ++i) {
y_usm += coefUSM.Sample(samPoint, float2((phase_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x * pxl[i];
y_usm += shCoefUSM[phase_int][i] * pxl[i];
}
}
@ -213,109 +202,6 @@ float EvalPoly6(const float pxl[6], int phase_int) {
return y + y_usm;
}
// Evaluates the four directional filters on a 6x6 luma neighbourhood.
//
// p:                 6x6 support, indexed p[row][column].
// phase_x_frac:      fractional horizontal position used as a lerp factor.
// phase_y_frac:      fractional vertical position used as a lerp factor.
// phase_x_frac_int:  discretized horizontal phase passed to EvalPoly6.
// phase_y_frac_int:  discretized vertical phase passed to EvalPoly6.
//
// Returns float4(0 deg, 90 deg, 45 deg, 135 deg) filter outputs. Each output is
// produced by building a 6-tap line of interpolated samples and handing it to
// EvalPoly6 together with a phase index.
float4 GetDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int) {
float4 f;
// 0 deg filter: collapse each row to one sample by blending columns 2 and 3
// horizontally, then filter the resulting column with the vertical phase.
float interp0Deg[6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
}
}
f.x = EvalPoly6(interp0Deg, phase_y_frac_int);
// 90 deg filter: collapse each column to one sample by blending rows 2 and 3
// vertically, then filter the resulting row with the horizontal phase.
float interp90Deg[6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
}
}
f.y = EvalPoly6(interp90Deg, phase_x_frac_int);
// 45 deg filter
// Blend factor in [0, 1] derived from the difference of the two fractions.
float pphase_b45;
pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
// Seven candidate taps are built; six consecutive ones are selected below.
// Odd taps blend two off-diagonal neighbours with pphase_b45.
float temp_interp45Deg[7];
temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
{
// Re-centre the factor to [-0.5, 0.5]; its sign picks which side of the main
// diagonal the even taps blend towards, its magnitude is the blend weight.
pphase_b45 = pphase_b45 - 0.5f;
float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
}
// Phase along the filter line. When it reaches 1 the 6-tap window is shifted
// by one tap (taps 1..6 instead of 0..5) and the phase is wrapped into [0, 1).
float interp45Deg[6];
float pphase_p45 = phase_x_frac + phase_y_frac;
if (pphase_p45 >= 1) {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i + 1];
}
pphase_p45 = pphase_p45 - 1;
} else {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i];
}
}
// 64 is a literal here; it equals the value kPhaseCount is defined to.
f.z = EvalPoly6(interp45Deg, (int)(pphase_p45 * 64));
// 135 deg filter
// Same construction as the 45 deg case with the roles of sum and difference
// of the fractions swapped and the taps taken along the other diagonal.
float pphase_b135;
pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
float temp_interp135Deg[7];
temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
{
// Signed offset in [-0.5, 0.5]: sign selects the neighbour, magnitude the weight.
pphase_b135 = pphase_b135 - 0.5f;
float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
}
// Phase along the filter line, offset by 1 so it is never negative; values
// >= 1 (phase_x_frac >= phase_y_frac) use taps 1..6 and wrap the phase.
float interp135Deg[6];
float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
if (pphase_p135 >= 1) {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i + 1];
}
pphase_p135 = pphase_p135 - 1;
} else {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i];
}
}
// 64 is a literal here; it equals the value kPhaseCount is defined to.
f.w = EvalPoly6(interp135Deg, (int)(pphase_p135 * 64));
return f;
}
float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int) {
float h_acc = 0.0f;
[unroll]
@ -323,64 +209,284 @@ float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_i
float v_acc = 0.0f;
[unroll]
for (int i = 0; i < 6; ++i) {
v_acc += p[i][j] * coefScale.Sample(samPoint, float2((phase_y_frac_int + 0.5f) / kPhaseCount, (i + 0.5f) / 6.0f)).x;
v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
}
h_acc += v_acc * coefScale.Sample(samPoint, float2((phase_x_frac_int + 0.5f) / kPhaseCount, (j + 0.5f) / 6.0f)).x;
h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
}
// let's return the sum unpacked -> we can accumulate it later
return h_acc;
}
float4 Pass2(float2 pos) {
float2 srcPos = pos / float2(inputPtX, inputPtY) + 0.5f;
float2 srcPosB = floor(srcPos);
// load 6x6 support to regs
float p[6][6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, float4 w) {
float f = 0;
if (w.x > 0.0f) {
// 0 deg filter
float interp0Deg[6];
{
[unroll]
for (int j = 0; j < 6; ++j) {
p[i][j] = getY(INPUT.Sample(samPoint, (srcPosB + float2(j - 2, i - 2) - 0.5f) * float2(inputPtX, inputPtY)).rgb);
for (int i = 0; i < 6; ++i) {
interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
}
}
f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
}
if (w.y > 0.0f) {
// 90 deg filter
float interp90Deg[6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
}
}
// compute discretized filter phase
const float2 f = srcPos - srcPosB;
const int fx_int = (int)(f.x * kPhaseCount);
const int fy_int = (int)(f.y * kPhaseCount);
// get traditional scaler filter output
const float pixel_n = FilterNormal(p, fx_int, fy_int);
// get directional filter bank output
float4 opDirYU = GetDirFilters(p, f.x, f.y, fx_int, fy_int);
f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
}
if (w.z > 0.0f) {
//45 deg filter
float pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
// final luma is a weighted product of directional & normal filters
float temp_interp45Deg[7];
temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
{
pphase_b45 = pphase_b45 - 0.5f;
float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
}
// generate weights for directional filters
float4 edge[2][2];
edge[0][0] = shEdgeMap.Sample(samPoint, (srcPosB + float2(0, 0)) * float2(inputPtX, inputPtY));
edge[0][1] = shEdgeMap.Sample(samPoint, (srcPosB + float2(1, 0)) * float2(inputPtX, inputPtY));
edge[1][0] = shEdgeMap.Sample(samPoint, (srcPosB + float2(0, 1)) * float2(inputPtX, inputPtY));
edge[1][1] = shEdgeMap.Sample(samPoint, (srcPosB + float2(1, 1)) * float2(inputPtX, inputPtY));
float interp45Deg[6];
float pphase_p45 = phase_x_frac + phase_y_frac;
if (pphase_p45 >= 1) {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i + 1];
}
pphase_p45 = pphase_p45 - 1;
} else {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i];
}
}
const float4 w = GetInterpEdgeMap(edge, f.x, f.y) * NIS_SCALE_INT;
// final pixel is a weighted sum filter outputs
const float opY = (opDirYU.x * w.x + opDirYU.y * w.y + opDirYU.z * w.z + opDirYU.w * w.w +
pixel_n * (NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w)) * (1.0f / NIS_SCALE_FLOAT);
// do bilinear tap for chroma upscaling
f += EvalPoly6(interp45Deg, int(pphase_p45 * 64)) * w.z;
}
if (w.w > 0.0f) {
//135 deg filter
float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
float4 op = INPUT.Sample(samLinear, pos);
float temp_interp135Deg[7];
temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
{
pphase_b135 = pphase_b135 - 0.5f;
float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
}
const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
op.x += corr;
op.y += corr;
op.z += corr;
float interp135Deg[6];
float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
if (pphase_p135 >= 1) {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i + 1];
}
pphase_p135 = pphase_p135 - 1;
} else {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i];
}
}
return op;
f += EvalPoly6(interp135Deg, int(pphase_p135 * 64)) * w.w;
}
return f;
}
// Body of the scaling pass. One thread group produces one
// NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT block of output pixels.
//
// blockStart: output-pixel coordinate of the top-left corner of this block.
// threadId:   thread id inside the group; only .x is used (the group is a
//             flat list of NIS_THREAD_GROUP_SIZE threads).
//
// Stages:
//   1. Work out which source pixels the block needs and load their luma into
//      shPixelsY (2x2 quads per iteration, via texture gather).
//   2. Derive edge-map vectors for those pixels into shEdgeMap.
//   3. Copy the filter coefficient tables into shared memory.
//   4. For each output pixel owned by this thread, blend the separable filter
//      with the directional filters according to the interpolated edge map,
//      then apply the resulting luma correction to a bilinear colour tap.
//
// GetScale(), GetInputPt() and WriteToOutput() are provided outside this
// block. NOTE(review): from their use here, GetScale() is the output/input
// size ratio and GetInputPt() the size of one input texel in UV space -
// confirm against the effect framework.
void Main(uint2 blockStart, uint3 threadId) {
	const float2 scale = GetScale();
	const float2 inputPt = GetInputPt();

	// Multiplying an output coordinate by kScale* yields a source coordinate.
	const float kScaleX = 1.0f / scale.x;
	const float kScaleY = 1.0f / scale.y;
	// Kept as an unsigned integer: it is only ever used for index arithmetic.
	const uint threadIdx = threadId.x;
	// Multiplying a source texel coordinate by kSrcNorm* yields a UV coordinate.
	const float kSrcNormX = inputPt.x;
	const float kSrcNormY = inputPt.y;

	// Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
	const int dstBlockX = blockStart.x;
	const int dstBlockY = blockStart.y;

	const int srcBlockStartX = int(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
	const int srcBlockStartY = int(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
	const int srcBlockEndX = int(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
	const int srcBlockEndY = int(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));

	int numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
	int numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;

	// round-up load region to even size since we're loading in 2x2 batches
	numTilePixelsX += numTilePixelsX & 0x1;
	numTilePixelsY += numTilePixelsY & 0x1;
	const int numTilePixels = numTilePixelsX * numTilePixelsY;

	// calculate the equivalent values for the edge map
	const int numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
	const int numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
	const int numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;

	// fill in input luma tile (shPixelsY) in batches of 2x2 pixels
	// we use texture gather to get extra support necessary
	// to compute 2x2 edge map outputs too
	{
		// i walks the even columns of every second row: each iteration covers
		// the 2x2 quad whose top-left tile coordinate is (px, py).
		for (uint i = threadIdx * 2; i < uint(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
			const uint py = (i / numTilePixelsX) * 2;
			const uint px = i % numTilePixelsX;

			// 0.5 to be in the center of texel
			// - (kSupportSize - 1) / 2 to shift by the kernel support size
			// (integer division: (kSupportSize - 1) / 2 is evaluated before the subtraction)
			const float kShift = 0.5f - (kSupportSize - 1) / 2;

			const float tx = (srcBlockStartX + px + kShift) * kSrcNormX;
			const float ty = (srcBlockStartY + py + kShift) * kSrcNormY;

			float p[2][2];
			{
				const float4 sr = INPUT.GatherRed(samplerLinearClamp, float2(tx, ty));
				const float4 sg = INPUT.GatherGreen(samplerLinearClamp, float2(tx, ty));
				const float4 sb = INPUT.GatherBlue(samplerLinearClamp, float2(tx, ty));

				// Gather component order: w = top-left, z = top-right,
				// x = bottom-left, y = bottom-right.
				p[0][0] = getY(float3(sr.w, sg.w, sb.w));
				p[0][1] = getY(float3(sr.z, sg.z, sb.z));
				p[1][0] = getY(float3(sr.x, sg.x, sb.x));
				p[1][1] = getY(float3(sr.y, sg.y, sb.y));
			}

			const uint idx = py * kTilePitch + px;
			shPixelsY[idx] = p[0][0];
			shPixelsY[idx + 1] = p[0][1];
			shPixelsY[idx + kTilePitch] = p[1][0];
			shPixelsY[idx + kTilePitch + 1] = p[1][1];
		}
	}

	// The edge map below reads luma written by other threads.
	GroupMemoryBarrierWithGroupSync();

	{
		// fill in the edge map of 2x2 pixels
		for (uint i = threadIdx * 2; i < uint(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
			const uint py = (i / numEdgeMapPixelsX) * 2;
			const uint px = i % numEdgeMapPixelsX;

			const uint edgeMapIdx = py * kEdgeMapPitch + px;

			// The edge map starts one pixel inside the luma tile in both axes.
			const uint tileCornerIdx = (py + 1) * kTilePitch + px + 1;

			// 4x4 luma window: holds the 3x3 neighbourhoods of all four
			// edge-map entries of this 2x2 quad.
			float p[4][4];
			[unroll]
			for (int j = 0; j < 4; j++) {
				[unroll]
				for (int k = 0; k < 4; k++) {
					p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
				}
			}

			shEdgeMap[edgeMapIdx] = GetEdgeMap(p, 0, 0);
			shEdgeMap[edgeMapIdx + 1] = GetEdgeMap(p, 0, 1);
			shEdgeMap[edgeMapIdx + kEdgeMapPitch] = GetEdgeMap(p, 1, 0);
			shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = GetEdgeMap(p, 1, 1);
		}
	}

	LoadFilterBanksSh(int(threadIdx), NIS_THREAD_GROUP_SIZE);

	// Edge map and coefficient tables must be complete before filtering.
	GroupMemoryBarrierWithGroupSync();

	// output coord within a tile
	const int2 pos = int2(threadIdx % uint(NIS_BLOCK_WIDTH), threadIdx / uint(NIS_BLOCK_WIDTH));

	// x coord inside the output image
	const int dstX = dstBlockX + pos.x;
	// x coord inside the input image
	const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
	// integer part, relative to the first source column of the block
	const int px = int(floor(srcX) - srcBlockStartX);
	// fractional part
	const float fx = srcX - floor(srcX);
	// discretized phase
	const int fx_int = int(fx * kPhaseCount);

	// Each thread writes NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE
	// pixels of one output column, NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH rows apart.
	for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
		// y coord inside the output image
		const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
		// y coord inside the input image
		const float srcY = (0.5f + dstY) * kScaleY - 0.5f;

		// integer part, relative to the first source row of the block
		const int py = int(floor(srcY) - srcBlockStartY);
		// fractional part
		const float fy = srcY - floor(srcY);
		// discretized phase
		const int fy_int = int(fy * kPhaseCount);

		// generate weights for directional filters
		const int startEdgeMapIdx = py * kEdgeMapPitch + px;
		float4 edge[2][2];
		[unroll]
		for (int i = 0; i < 2; i++) {
			[unroll]
			for (int j = 0; j < 2; j++) {
				// need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
				edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
			}
		}
		const float4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;

		// load 6x6 support to regs
		const int startTileIdx = py * kTilePitch + px;
		float p[6][6];
		{
			[unroll]
			for (int i = 0; i < 6; ++i) {
				[unroll]
				for (int j = 0; j < 6; ++j) {
					p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
				}
			}
		}

		// weight for luma: whatever the four directional weights leave over
		const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;

		// final luma is a weighted product of directional & normal filters
		float opY = 0;

		// get traditional scaler filter output
		opY += FilterNormal(p, fx_int, fy_int) * baseWeight;

		// get directional filter bank output
		opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);

		// do bilinear tap for chroma upscaling
		float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;

		// Shift all three channels by the difference between the filtered luma
		// and the luma of the bilinear tap.
		const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
		op.x += corr;
		op.y += corr;
		op.z += corr;

		WriteToOutput(uint2(dstX, dstY), op);
	}
}

Binary file not shown.

Binary file not shown.

View file

@ -29,20 +29,20 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v142</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v142</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v142</PlatformToolset>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">