Magpie/Effects/NIS.hlsl
刘旭 5c05330d1f feat: 每个通道的入口改为 PassN
虽然Main更统一,但不易阅读
2022-03-04 15:34:22 +08:00

496 lines
16 KiB
HLSL

// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
//!MAGPIE EFFECT
//!VERSION 2
//!PARAMETER
//!DEFAULT 0.5
//!MIN 0
//!MAX 1
float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!SOURCE NIS_Coef_Scale.dds
Texture2D coef_scaler;
//!TEXTURE
//!SOURCE NIS_Coef_USM.dds
Texture2D coef_usm;
//!SAMPLER
//!FILTER LINEAR
SamplerState samplerLinearClamp;
//!PASS 1
//!IN INPUT, coef_scaler, coef_usm
//!BLOCK_SIZE 32,24
//!NUM_THREADS 256,1,1
#define kDetectRatio (2 * 1127.f / 1024.f)
#define kDetectThres (64.0f / 1024.0f)
#define kEps (1.0f / 255.0f)
#define kMinContrastRatio 2.0f
#define kMaxContrastRatio 10.0f
#define kRatioNorm (1.0f / (kMaxContrastRatio - kMinContrastRatio))
#define kContrastBoost 1.0f
#define kSharpStartY 0.45f
#define kSharpEndY 0.9f
#define kSharpScaleY (1.0f / (kSharpEndY - kSharpStartY))
#define kSharpStrengthScale (kSharpStrengthMax - kSharpStrengthMin)
#define sharpen_slider (sharpness - 0.5f)
#define MinScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.0f)
#define MaxScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.75f)
#define kSharpStrengthMin (max(0.0f, 0.4f + sharpen_slider * MinScale * 1.2f))
#define kSharpStrengthMax (1.6f + sharpen_slider * MaxScale * 1.8f)
#define LimitScale ((sharpen_slider >= 0.0f) ? 1.25f : 1.0f)
#define kSharpLimitMin (max(0.1f, 0.14f + sharpen_slider * LimitScale * 0.32f))
#define kSharpLimitMax (0.5f + sharpen_slider * LimitScale * 0.6f)
#define kSharpLimitScale (kSharpLimitMax - kSharpLimitMin)
#define NIS_SCALE_FLOAT 1.0f
#define NIS_SCALE_INT 1
#define NIS_BLOCK_WIDTH 32
#define NIS_BLOCK_HEIGHT 24
#define NIS_THREAD_GROUP_SIZE 256
#define kPhaseCount 64
#define kFilterSize 6
#define kSupportSize 6
#define kPadSize kSupportSize
// 'Tile' is the region of source luminance values that we load into shPixelsY.
// It is the area of source pixels covered by the destination 'Block' plus a
// 3 pixel border of support pixels.
#define kTilePitch (NIS_BLOCK_WIDTH + kPadSize)
#define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
// 'EdgeMap' is the region of source pixels for which edge map vectors are derived.
// It is the area of source pixels covered by the destination 'Block' plus a
// 1 pixel border.
#define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2)
#define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))
groupshared float shPixelsY[kTileSize];
groupshared float shCoefScaler[kPhaseCount][kFilterSize];
groupshared float shCoefUSM[kPhaseCount][kFilterSize];
groupshared float4 shEdgeMap[kEdgeMapSize];
float getY(float3 rgba) {
return 0.2126f * rgba.x + 0.7152f * rgba.y + 0.0722f * rgba.z;
}
float4 GetEdgeMap(float p[4][4], int i, int j) {
const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);
const float g_0_90_max = max(g_0, g_90);
const float g_0_90_min = min(g_0, g_90);
const float g_45_135_max = max(g_45, g_135);
const float g_45_135_min = min(g_45, g_135);
float e_0_90 = 0;
float e_45_135 = 0;
if (g_0_90_max + g_45_135_max == 0) {
return float4(0, 0, 0, 0);
}
e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
e_45_135 = 1.0f - e_0_90;
bool c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
bool c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);
bool c_g_0_90 = g_0_90_max == g_0;
bool c_g_45_135 = g_45_135_max == g_45;
float f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
float f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;
float weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
float weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
float weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
float weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;
return float4(weight_0, weight_90, weight_45, weight_135);
}
void LoadFilterBanksSh(int i0, int di) {
// Load up filter banks to shared memory
// The work is spread over (kPhaseCount * 2) threads
for (int i = i0; i < kPhaseCount * 2; i += di) {
int phase = i >> 1;
int vIdx = i & 1;
float4 v = float4(coef_scaler[int2(vIdx, phase)]);
int filterOffset = vIdx * 4;
shCoefScaler[phase][filterOffset + 0] = v.x;
shCoefScaler[phase][filterOffset + 1] = v.y;
if (vIdx == 0) {
shCoefScaler[phase][2] = v.z;
shCoefScaler[phase][3] = v.w;
}
v = float4(coef_usm[int2(vIdx, phase)]);
shCoefUSM[phase][filterOffset + 0] = v.x;
shCoefUSM[phase][filterOffset + 1] = v.y;
if (vIdx == 0) {
shCoefUSM[phase][2] = v.z;
shCoefUSM[phase][3] = v.w;
}
}
}
float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index) {
const bool selector = (phase_index <= kPhaseCount / 2);
float sel = selector ? p0 : p3;
const float a_min = min(min(p1, p2), sel);
const float a_max = max(max(p1, p2), sel);
sel = selector ? p2 : p5;
const float b_min = min(min(p3, p4), sel);
const float b_max = max(max(p3, p4), sel);
const float a_cont = a_max - a_min;
const float b_cont = b_max - b_min;
const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
float4 GetInterpEdgeMap(const float4 edge[2][2], float phase_frac_x, float phase_frac_y) {
float4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
float4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
return lerp(h0, h1, phase_frac_y);
}
float EvalPoly6(const float pxl[6], int phase_int) {
float y = 0.f;
{
[unroll]
for (int i = 0; i < 6; ++i) {
y += shCoefScaler[phase_int][i] * pxl[i];
}
}
float y_usm = 0.f;
{
[unroll]
for (int i = 0; i < 6; ++i) {
y_usm += shCoefUSM[phase_int][i] * pxl[i];
}
}
// let's compute a piece-wise ramp based on luma
const float y_scale = 1.0f - saturate((y * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY);
// scale the ramp to sharpen as a function of luma
const float y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin;
y_usm *= y_sharpness;
// scale the ramp to limit USM as a function of luma
const float y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y;
y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm));
// reduce ringing
y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);
return y + y_usm;
}
float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int) {
float h_acc = 0.0f;
[unroll]
for (int j = 0; j < 6; ++j) {
float v_acc = 0.0f;
[unroll]
for (int i = 0; i < 6; ++i) {
v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
}
h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
}
// let's return the sum unpacked -> we can accumulate it later
return h_acc;
}
float AddDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int, float4 w) {
float f = 0;
if (w.x > 0.0f) {
// 0 deg filter
float interp0Deg[6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
}
}
f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
}
if (w.y > 0.0f) {
// 90 deg filter
float interp90Deg[6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
}
}
f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
}
if (w.z > 0.0f) {
//45 deg filter
float pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
float temp_interp45Deg[7];
temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
{
pphase_b45 = pphase_b45 - 0.5f;
float a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
float b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
float c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
float d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
}
float interp45Deg[6];
float pphase_p45 = phase_x_frac + phase_y_frac;
if (pphase_p45 >= 1) {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i + 1];
}
pphase_p45 = pphase_p45 - 1;
} else {
[unroll]
for (int i = 0; i < 6; i++) {
interp45Deg[i] = temp_interp45Deg[i];
}
}
f += EvalPoly6(interp45Deg, int(pphase_p45 * 64)) * w.z;
}
if (w.w > 0.0f) {
//135 deg filter
float pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
float temp_interp135Deg[7];
temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
{
pphase_b135 = pphase_b135 - 0.5f;
float a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
float b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
float c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
float d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
}
float interp135Deg[6];
float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
if (pphase_p135 >= 1) {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i + 1];
}
pphase_p135 = pphase_p135 - 1;
} else {
[unroll]
for (int i = 0; i < 6; ++i) {
interp135Deg[i] = temp_interp135Deg[i];
}
}
f += EvalPoly6(interp135Deg, int(pphase_p135 * 64)) * w.w;
}
return f;
}
void Pass1(uint2 blockStart, uint3 threadId) {
float2 scale = GetScale();
float2 inputPt = GetInputPt();
float kScaleX = 1.0f / scale.x;
float kScaleY = 1.0f / scale.y;
float threadIdx = threadId.x;
float kSrcNormX = inputPt.x;
float kSrcNormY = inputPt.y;
// Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
int dstBlockX = blockStart.x;
int dstBlockY = blockStart.y;
const int srcBlockStartX = int(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
const int srcBlockStartY = int(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
const int srcBlockEndX = int(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
const int srcBlockEndY = int(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));
int numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
int numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;
// round-up load region to even size since we're loading in 2x2 batches
numTilePixelsX += numTilePixelsX & 0x1;
numTilePixelsY += numTilePixelsY & 0x1;
const int numTilePixels = numTilePixelsX * numTilePixelsY;
// calculate the equivalent values for the edge map
const int numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
const int numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
const int numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;
// fill in input luma tile (shPixelsY) in batches of 2x2 pixels
// we use texture gather to get extra support necessary
// to compute 2x2 edge map outputs too
{
for (uint i = threadIdx * 2; i < uint(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
uint py = (i / numTilePixelsX) * 2;
uint px = i % numTilePixelsX;
// 0.5 to be in the center of texel
// - (kSupportSize - 1) / 2 to shift by the kernel support size
float kShift = 0.5f - (kSupportSize - 1) / 2;
const float tx = (srcBlockStartX + px + kShift) * kSrcNormX;
const float ty = (srcBlockStartY + py + kShift) * kSrcNormY;
float p[2][2];
{
const float4 sr = INPUT.GatherRed(samplerLinearClamp, float2(tx, ty));
const float4 sg = INPUT.GatherGreen(samplerLinearClamp, float2(tx, ty));
const float4 sb = INPUT.GatherBlue(samplerLinearClamp, float2(tx, ty));
p[0][0] = getY(float3(sr.w, sg.w, sb.w));
p[0][1] = getY(float3(sr.z, sg.z, sb.z));
p[1][0] = getY(float3(sr.x, sg.x, sb.x));
p[1][1] = getY(float3(sr.y, sg.y, sb.y));
}
const uint idx = py * kTilePitch + px;
shPixelsY[idx] = float(p[0][0]);
shPixelsY[idx + 1] = float(p[0][1]);
shPixelsY[idx + kTilePitch] = float(p[1][0]);
shPixelsY[idx + kTilePitch + 1] = float(p[1][1]);
}
}
GroupMemoryBarrierWithGroupSync();
{
// fill in the edge map of 2x2 pixels
for (uint i = threadIdx * 2; i < uint(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2) {
uint py = (i / numEdgeMapPixelsX) * 2;
uint px = i % numEdgeMapPixelsX;
const uint edgeMapIdx = py * kEdgeMapPitch + px;
uint tileCornerIdx = (py + 1) * kTilePitch + px + 1;
float p[4][4];
[unroll]
for (int j = 0; j < 4; j++) {
[unroll]
for (int k = 0; k < 4; k++) {
p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
}
}
shEdgeMap[edgeMapIdx] = float4(GetEdgeMap(p, 0, 0));
shEdgeMap[edgeMapIdx + 1] = float4(GetEdgeMap(p, 0, 1));
shEdgeMap[edgeMapIdx + kEdgeMapPitch] = float4(GetEdgeMap(p, 1, 0));
shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = float4(GetEdgeMap(p, 1, 1));
}
}
LoadFilterBanksSh(int(threadIdx), NIS_THREAD_GROUP_SIZE);
GroupMemoryBarrierWithGroupSync();
// output coord within a tile
const int2 pos = int2(uint(threadIdx) % uint(NIS_BLOCK_WIDTH), uint(threadIdx) / uint(NIS_BLOCK_WIDTH));
// x coord inside the output image
const int dstX = dstBlockX + pos.x;
// x coord inside the input image
const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
// nearest integer part
const int px = int(floor(srcX) - srcBlockStartX);
// fractional part
const float fx = srcX - floor(srcX);
// discretized phase
const int fx_int = int(fx * kPhaseCount);
for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
// y coord inside the output image
const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
if (!CheckViewport(int2(dstX, dstY))) {
return;
}
// y coord inside the input image
const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
// nearest integer part
const int py = int(floor(srcY) - srcBlockStartY);
// fractional part
const float fy = srcY - floor(srcY);
// discretized phase
const int fy_int = int(fy * kPhaseCount);
// generate weights for directional filters
const int startEdgeMapIdx = py * kEdgeMapPitch + px;
float4 edge[2][2];
[unroll]
for (int i = 0; i < 2; i++) {
[unroll]
for (int j = 0; j < 2; j++) {
// need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
}
}
const float4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;
// load 6x6 support to regs
const int startTileIdx = py * kTilePitch + px;
float p[6][6];
{
[unroll]
for (int i = 0; i < 6; ++i) {
[unroll]
for (int j = 0; j < 6; ++j) {
p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
}
}
}
// weigth for luma
const float baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;
// final luma is a weighted product of directional & normal filters
float opY = 0;
// get traditional scaler filter output
opY += FilterNormal(p, fx_int, fy_int) * baseWeight;
// get directional filter bank output
opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);
// do bilinear tap for chroma upscaling
float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;
const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
op.x += corr;
op.y += corr;
op.z += corr;
WriteToOutput(uint2(dstX, dstY), op);
}
}