mirror of
https://github.com/Blinue/Magpie.git
synced 2026-06-24 02:04:10 +00:00
Compare commits
4 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2980a69156 | ||
|
|
1840bded7b |
||
|
|
8daceb51f9 |
||
|
|
e73483a272 |
278 changed files with 54061 additions and 17414 deletions
|
|
@ -333,6 +333,15 @@
|
||||||
"contributions": [
|
"contributions": [
|
||||||
"translation"
|
"translation"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"login": "rezorrand",
|
||||||
|
"name": "Pate L",
|
||||||
|
"avatar_url": "https://avatars.githubusercontent.com/u/7170353?v=4",
|
||||||
|
"profile": "https://github.com/rezorrand",
|
||||||
|
"contributions": [
|
||||||
|
"translation"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"contributorsPerLine": 7,
|
"contributorsPerLine": 7,
|
||||||
|
|
|
||||||
|
|
@ -112,6 +112,7 @@ Thanks go to these wonderful people:
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td align="center" valign="top" width="14.28%"><a href="https://github.com/Androidlate"><img src="https://avatars.githubusercontent.com/u/194900061?v=4?s=100" width="100px;" alt="Raphael"/><br /><sub><b>Raphael</b></sub></a><br /><a href="#translation-Androidlate" title="Translation">🌍</a></td>
|
<td align="center" valign="top" width="14.28%"><a href="https://github.com/Androidlate"><img src="https://avatars.githubusercontent.com/u/194900061?v=4?s=100" width="100px;" alt="Raphael"/><br /><sub><b>Raphael</b></sub></a><br /><a href="#translation-Androidlate" title="Translation">🌍</a></td>
|
||||||
|
<td align="center" valign="top" width="14.28%"><a href="https://github.com/rezorrand"><img src="https://avatars.githubusercontent.com/u/7170353?v=4?s=100" width="100px;" alt="Pate L"/><br /><sub><b>Pate L</b></sub></a><br /><a href="#translation-rezorrand" title="Translation">🌍</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
|
||||||
|
|
@ -111,6 +111,7 @@ Magpie 是一个轻量级的窗口超分辨率工具,内置众多高效的算
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td align="center" valign="top" width="14.28%"><a href="https://github.com/Androidlate"><img src="https://avatars.githubusercontent.com/u/194900061?v=4?s=100" width="100px;" alt="Raphael"/><br /><sub><b>Raphael</b></sub></a><br /><a href="#translation-Androidlate" title="Translation">🌍</a></td>
|
<td align="center" valign="top" width="14.28%"><a href="https://github.com/Androidlate"><img src="https://avatars.githubusercontent.com/u/194900061?v=4?s=100" width="100px;" alt="Raphael"/><br /><sub><b>Raphael</b></sub></a><br /><a href="#translation-Androidlate" title="Translation">🌍</a></td>
|
||||||
|
<td align="center" valign="top" width="14.28%"><a href="https://github.com/rezorrand"><img src="https://avatars.githubusercontent.com/u/7170353?v=4?s=100" width="100px;" alt="Pate L"/><br /><sub><b>Pate L</b></sub></a><br /><a href="#translation-rezorrand" title="Translation">🌍</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
|
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
|
||||||
<Type Name="Magpie::SmallVectorImpl<*>">
|
<Type Name="SmallVectorImpl<*>">
|
||||||
<DisplayString IncludeView ="elt0" Condition="Size == 0"></DisplayString>
|
<DisplayString IncludeView ="elt0" Condition="Size == 0"></DisplayString>
|
||||||
<DisplayString IncludeView ="elt0">{(($T1*)BeginX)[0]}{*this,view(elt1)}</DisplayString>
|
<DisplayString IncludeView ="elt0">{(($T1*)BeginX)[0]}{*this,view(elt1)}</DisplayString>
|
||||||
<DisplayString IncludeView ="elt1" Condition="Size == 1"></DisplayString>
|
<DisplayString IncludeView ="elt1" Condition="Size == 1"></DisplayString>
|
||||||
|
|
|
||||||
|
|
@ -60,7 +60,7 @@ versionNumProps = f";MajorVersion={args.version_major};MinorVersion={args.versio
|
||||||
versionStrProp = "" if args.version_string == "" else f";VersionString={args.version_string}"
|
versionStrProp = "" if args.version_string == "" else f";VersionString={args.version_string}"
|
||||||
|
|
||||||
p = subprocess.run(
|
p = subprocess.run(
|
||||||
f'"{msbuildPath}" Magpie.slnx -m -t:Rebuild -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={args.platform};DisablePDB=true;UseClangCL={args.compiler == "ClangCL"};UseNativeMicroArch={args.use_native_march};OutBaseDir={os.getcwd()}\\publish\\{args.platform}\\;CommitId={commitId}{versionNumProps}{versionStrProp}'
|
f'"{msbuildPath}" Magpie.slnx -m -t:Rebuild -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={args.platform};DisablePDB=true;UseClangCL={args.compiler == "ClangCL"};UseNativeMicroArch={args.use_native_march};OutDir={os.getcwd()}\\publish\\{args.platform}\\;CommitId={commitId}{versionNumProps}{versionStrProp}'
|
||||||
)
|
)
|
||||||
if p.returncode != 0:
|
if p.returncode != 0:
|
||||||
raise Exception("编译失败")
|
raise Exception("编译失败")
|
||||||
|
|
@ -82,9 +82,8 @@ def remove_file(file):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
for pattern in ["*.lib", "*.exp"]:
|
for file in glob.glob("*.lib"):
|
||||||
for file in glob.glob(pattern):
|
remove_file(file)
|
||||||
remove_file(file)
|
|
||||||
|
|
||||||
print("清理完毕", flush=True)
|
print("清理完毕", flush=True)
|
||||||
|
|
||||||
|
|
@ -103,7 +102,7 @@ if args.pfx_path != "":
|
||||||
)
|
)
|
||||||
passwordOption = "" if args.pfx_password == "" else f'/p "{args.pfx_password}"'
|
passwordOption = "" if args.pfx_password == "" else f'/p "{args.pfx_password}"'
|
||||||
p = subprocess.run(
|
p = subprocess.run(
|
||||||
f'"{windowsSdkDir}\\x64\\signtool.exe" sign /fd SHA256 /a /f "{pfxPath}" {passwordOption} app\\TouchHelper.exe'
|
f'"{windowsSdkDir}\\x64\\signtool.exe" sign /fd SHA256 /a /f "{pfxPath}" {passwordOption} TouchHelper.exe'
|
||||||
)
|
)
|
||||||
if p.returncode != 0:
|
if p.returncode != 0:
|
||||||
raise Exception("签名失败")
|
raise Exception("签名失败")
|
||||||
|
|
|
||||||
|
|
@ -8,14 +8,13 @@
|
||||||
<UseNativeMicroArch>false</UseNativeMicroArch>
|
<UseNativeMicroArch>false</UseNativeMicroArch>
|
||||||
<!-- 编译为打包应用 (暂不支持) -->
|
<!-- 编译为打包应用 (暂不支持) -->
|
||||||
<IsPackaged>false</IsPackaged>
|
<IsPackaged>false</IsPackaged>
|
||||||
<!-- 启用调试信息 -->
|
<!-- 窗口模式缩放时把用于调整窗口尺寸的辅助窗口标示出来 -->
|
||||||
<DebugInfo>false</DebugInfo>
|
<DebugBorder>false</DebugBorder>
|
||||||
|
<!-- 在性能分析器上显示调试信息 -->
|
||||||
|
<DebugInfoOnOverlay>false</DebugInfoOnOverlay>
|
||||||
<!-- 使用 composition swapchain 呈现 -->
|
<!-- 使用 composition swapchain 呈现 -->
|
||||||
<UseCompSwapchain>false</UseCompSwapchain>
|
<UseCompSwapchain>false</UseCompSwapchain>
|
||||||
<!-- 禁止生成 PDB -->
|
|
||||||
<DisablePDB>false</DisablePDB>
|
|
||||||
|
|
||||||
<OutBaseDir></OutBaseDir>
|
|
||||||
<MajorVersion></MajorVersion>
|
<MajorVersion></MajorVersion>
|
||||||
<MinorVersion></MinorVersion>
|
<MinorVersion></MinorVersion>
|
||||||
<PatchVersion></PatchVersion>
|
<PatchVersion></PatchVersion>
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,8 @@
|
||||||
<PreprocessorDefinitions>MP_MAJOR_VERSION=$(MajorVersion);MP_MINOR_VERSION=$(MinorVersion);MP_PATCH_VERSION=$(PatchVersion);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions>MP_MAJOR_VERSION=$(MajorVersion);MP_MINOR_VERSION=$(MinorVersion);MP_PATCH_VERSION=$(PatchVersion);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<PreprocessorDefinitions Condition="'$(VersionString)' != ''">MP_VERSION_STRING=$(VersionString);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions Condition="'$(VersionString)' != ''">MP_VERSION_STRING=$(VersionString);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<PreprocessorDefinitions Condition="'$(CommitId)' != ''">MP_COMMIT_ID=$(CommitId);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions Condition="'$(CommitId)' != ''">MP_COMMIT_ID=$(CommitId);%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<PreprocessorDefinitions Condition="$(DebugInfo)">MP_DEBUG_INFO;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions Condition="$(DebugBorder)">MP_DEBUG_BORDER;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<PreprocessorDefinitions Condition="$(DebugInfoOnOverlay)">MP_DEBUG_INFO_ON_OVERLAY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<PreprocessorDefinitions Condition="$(UseCompSwapchain)">MP_USE_COMPSWAPCHAIN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
<PreprocessorDefinitions Condition="$(UseCompSwapchain)">MP_USE_COMPSWAPCHAIN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
<DebugInformationFormat Condition="'$(DisablePDB)' == 'true'">None</DebugInformationFormat>
|
<DebugInformationFormat Condition="'$(DisablePDB)' == 'true'">None</DebugInformationFormat>
|
||||||
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
|
@ -67,7 +68,7 @@
|
||||||
<!-- /Zc:checkGwOdr: 防止 /Gw 导致某些 ODR 违规被忽略 -->
|
<!-- /Zc:checkGwOdr: 防止 /Gw 导致某些 ODR 违规被忽略 -->
|
||||||
<AdditionalOptions>/Gw %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>/Gw %(AdditionalOptions)</AdditionalOptions>
|
||||||
<AdditionalOptions Condition="!$(UseClangCL)">/Zc:checkGwOdr %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions Condition="!$(UseClangCL)">/Zc:checkGwOdr %(AdditionalOptions)</AdditionalOptions>
|
||||||
<!-- clang-cl 不支持 LTCG,应使用 LTO -->
|
<!-- clang-cl 不支持 /LTCG,应使用 LTO -->
|
||||||
<AdditionalOptions Condition="$(UseClangCL)">/clang:-flto %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions Condition="$(UseClangCL)">/clang:-flto %(AdditionalOptions)</AdditionalOptions>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
<Link>
|
<Link>
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,6 @@
|
||||||
<VCProjectVersion Condition="$(VS17)">17.0</VCProjectVersion>
|
<VCProjectVersion Condition="$(VS17)">17.0</VCProjectVersion>
|
||||||
<VCProjectVersion Condition="!$(VS17)">18.0</VCProjectVersion>
|
<VCProjectVersion Condition="!$(VS17)">18.0</VCProjectVersion>
|
||||||
<DefaultLanguage>en-US</DefaultLanguage>
|
<DefaultLanguage>en-US</DefaultLanguage>
|
||||||
<IntDir>$(SolutionDir)\obj\$(Platform)\$(Configuration)\$(MSBuildProjectName)\</IntDir>
|
|
||||||
<OutBaseDir Condition="'$(OutBaseDir)' == ''">$(SolutionDir)\bin\$(Platform)\$(Configuration)\</OutBaseDir>
|
|
||||||
<MajorVersion Condition="'$(MajorVersion)' == ''">0</MajorVersion>
|
<MajorVersion Condition="'$(MajorVersion)' == ''">0</MajorVersion>
|
||||||
<MinorVersion Condition="'$(MinorVersion)' == ''">0</MinorVersion>
|
<MinorVersion Condition="'$(MinorVersion)' == ''">0</MinorVersion>
|
||||||
<PatchVersion Condition="'$(PatchVersion)' == ''">0</PatchVersion>
|
<PatchVersion Condition="'$(PatchVersion)' == ''">0</PatchVersion>
|
||||||
|
|
|
||||||
|
|
@ -2,16 +2,19 @@
|
||||||
// 移植自 https://github.com/TianZerL/ACNetGLSL/blob/f20a6b6b7327f4caf588b06c6b21f18e40dae1ce/glsl/ACNet.glsl
|
// 移植自 https://github.com/TianZerL/ACNetGLSL/blob/f20a6b6b7327f4caf588b06c6b21f18e40dae1ce/glsl/ACNet.glsl
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
|
//!USE MulAdd
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
//!SCALE_FACTOR 2
|
|
||||||
|
|
||||||
#include "StubDefs.hlsli"
|
#include "StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -46,6 +49,7 @@ SamplerState sam;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState sam1;
|
SamplerState sam1;
|
||||||
|
|
||||||
|
|
||||||
//!COMMON
|
//!COMMON
|
||||||
|
|
||||||
#ifdef MP_DEBUG
|
#ifdef MP_DEBUG
|
||||||
|
|
@ -54,6 +58,7 @@ SamplerState sam1;
|
||||||
|
|
||||||
#define RELU(x) max(x, 0)
|
#define RELU(x) max(x, 0)
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!DESC L1
|
//!DESC L1
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
|
|
@ -61,11 +66,8 @@ SamplerState sam1;
|
||||||
//!BLOCK_SIZE 16
|
//!BLOCK_SIZE 16
|
||||||
//!NUM_THREADS 64
|
//!NUM_THREADS 64
|
||||||
|
|
||||||
// ACNet 工作在 YUV 颜色空间,原作者是这么做的,见
|
MF GetLuma(MF3 color) {
|
||||||
// https://github.com/TianZerL/Anime4KCPP/blob/b8b3a09fd50b1bb15751eb9aa90b7e7f55b8e51e/Anime4KCore/src/Anime4KGPUCNN.cpp
|
return dot(MF3(0.299, 0.587, 0.114), color);
|
||||||
// sRGB 和 YUV 的转换见 https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
|
|
||||||
MF GetY(MF3 color) {
|
|
||||||
return dot(MF3(0.2126, 0.7152, 0.0722), EncodeSrgb(color));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const static MF kernelsL1A[9 * 4] = {
|
const static MF kernelsL1A[9 * 4] = {
|
||||||
|
|
@ -102,6 +104,7 @@ const static MF kernelsL1B[9 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasL1B = { 0.0223, 0.0340, 0.0150, -0.0044 };
|
const static MF4 biasL1B = { 0.0223, 0.0340, 0.0150, -0.0044 };
|
||||||
|
|
||||||
|
|
||||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -124,10 +127,10 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
|
|
||||||
// w z
|
// w z
|
||||||
// x y
|
// x y
|
||||||
src[i][j] = GetY(MF3(sr.w, sg.w, sb.w));
|
src[i][j] = GetLuma(MF3(sr.w, sg.w, sb.w));
|
||||||
src[i][j + 1] = GetY(MF3(sr.x, sg.x, sb.x));
|
src[i][j + 1] = GetLuma(MF3(sr.x, sg.x, sb.x));
|
||||||
src[i + 1][j] = GetY(MF3(sr.z, sg.z, sb.z));
|
src[i + 1][j] = GetLuma(MF3(sr.z, sg.z, sb.z));
|
||||||
src[i + 1][j + 1] = GetY(MF3(sr.y, sg.y, sb.y));
|
src[i + 1][j + 1] = GetLuma(MF3(sr.y, sg.y, sb.y));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -173,6 +176,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 2
|
//!PASS 2
|
||||||
//!DESC L2
|
//!DESC L2
|
||||||
//!IN tex1, tex2
|
//!IN tex1, tex2
|
||||||
|
|
@ -461,6 +465,7 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
||||||
tex4[gxy] = target2;
|
tex4[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 3
|
//!PASS 3
|
||||||
//!DESC L3
|
//!DESC L3
|
||||||
//!IN tex3, tex4
|
//!IN tex3, tex4
|
||||||
|
|
@ -670,6 +675,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { -0.0225, 0.0082, -0.0191, -0.0185 };
|
const static MF4 biasLB = { -0.0225, 0.0082, -0.0191, -0.0185 };
|
||||||
|
|
||||||
|
|
||||||
void Pass3(uint2 blockStart, uint3 threadId) {
|
void Pass3(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -749,6 +755,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
||||||
tex2[gxy] = target2;
|
tex2[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 4
|
//!PASS 4
|
||||||
//!DESC L4
|
//!DESC L4
|
||||||
//!IN tex1, tex2
|
//!IN tex1, tex2
|
||||||
|
|
@ -958,6 +965,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { -8.1892e-04, 3.3171e-03, -1.1582e-02, -4.1205e-40 };
|
const static MF4 biasLB = { -8.1892e-04, 3.3171e-03, -1.1582e-02, -4.1205e-40 };
|
||||||
|
|
||||||
|
|
||||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -1037,6 +1045,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
tex4[gxy] = target2;
|
tex4[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 5
|
//!PASS 5
|
||||||
//!DESC L5
|
//!DESC L5
|
||||||
//!IN tex3, tex4
|
//!IN tex3, tex4
|
||||||
|
|
@ -1246,6 +1255,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { -0.0039, -0.0426, 0.0053, -0.0017 };
|
const static MF4 biasLB = { -0.0039, -0.0426, 0.0053, -0.0017 };
|
||||||
|
|
||||||
|
|
||||||
void Pass5(uint2 blockStart, uint3 threadId) {
|
void Pass5(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -1325,6 +1335,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
||||||
tex2[gxy] = target2;
|
tex2[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 6
|
//!PASS 6
|
||||||
//!DESC L6
|
//!DESC L6
|
||||||
//!IN tex1, tex2
|
//!IN tex1, tex2
|
||||||
|
|
@ -1534,6 +1545,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { 0.1077, 0.0347, -0.0165, 0.7296 };
|
const static MF4 biasLB = { 0.1077, 0.0347, -0.0165, 0.7296 };
|
||||||
|
|
||||||
|
|
||||||
void Pass6(uint2 blockStart, uint3 threadId) {
|
void Pass6(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -1613,6 +1625,7 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
||||||
tex4[gxy] = target2;
|
tex4[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 7
|
//!PASS 7
|
||||||
//!DESC L7
|
//!DESC L7
|
||||||
//!IN tex3, tex4
|
//!IN tex3, tex4
|
||||||
|
|
@ -1822,6 +1835,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { 2.3381e-02, -1.2136e-40, -5.6040e-39, 3.7100e-02 };
|
const static MF4 biasLB = { 2.3381e-02, -1.2136e-40, -5.6040e-39, 3.7100e-02 };
|
||||||
|
|
||||||
|
|
||||||
void Pass7(uint2 blockStart, uint3 threadId) {
|
void Pass7(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -1901,6 +1915,7 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
||||||
tex2[gxy] = target2;
|
tex2[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 8
|
//!PASS 8
|
||||||
//!DESC L8
|
//!DESC L8
|
||||||
//!IN tex1, tex2
|
//!IN tex1, tex2
|
||||||
|
|
@ -2110,6 +2125,7 @@ const static MF kernelsLB[9 * 8 * 4] = {
|
||||||
|
|
||||||
const static MF4 biasLB = { 7.9956e-02, 3.0679e-04, -1.0257e-02, -1.2037e-02 };
|
const static MF4 biasLB = { 7.9956e-02, 3.0679e-04, -1.0257e-02, -1.2037e-02 };
|
||||||
|
|
||||||
|
|
||||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||||
uint2 inputSize = GetInputSize();
|
uint2 inputSize = GetInputSize();
|
||||||
|
|
@ -2189,6 +2205,7 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
||||||
tex4[gxy] = target2;
|
tex4[gxy] = target2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 9
|
//!PASS 9
|
||||||
//!DESC L9, L10
|
//!DESC L9, L10
|
||||||
//!IN INPUT, tex3, tex4
|
//!IN INPUT, tex3, tex4
|
||||||
|
|
@ -2417,16 +2434,15 @@ const static MF kernelsL10[4 * 8] = {
|
||||||
0.0415, -0.1858
|
0.0415, -0.1858
|
||||||
};
|
};
|
||||||
|
|
||||||
// 来自 https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
|
const static MF2x3 rgb2uv = {
|
||||||
const static MF2x3 srgb2uv = {
|
-0.169, -0.331, 0.5,
|
||||||
-0.1146, -0.3854, 0.5,
|
0.5, -0.419, -0.081
|
||||||
0.5, -0.4542, -0.0458
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const static MF3x3 yuv2srgb = {
|
const static MF3x3 yuv2rgb = {
|
||||||
1, 0, 1.5748,
|
1, -0.00093, 1.401687,
|
||||||
1, -0.1873, -0.4681,
|
1, -0.3437, -0.71417,
|
||||||
1, 1.8556, 0
|
1, 1.77216, 0.00099
|
||||||
};
|
};
|
||||||
|
|
||||||
void Pass9(uint2 blockStart, uint3 threadId) {
|
void Pass9(uint2 blockStart, uint3 threadId) {
|
||||||
|
|
@ -2514,7 +2530,7 @@ void Pass9(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 destPos = gxy + uint2(i, j);
|
uint2 destPos = gxy + uint2(i, j);
|
||||||
|
|
||||||
uint index = j * 2 + i;
|
uint index = j * 2 + i;
|
||||||
MF newY = saturate(
|
MF luma = saturate(
|
||||||
target1.x * kernelsL10[0 + index] +
|
target1.x * kernelsL10[0 + index] +
|
||||||
target1.y * kernelsL10[4 + index] +
|
target1.y * kernelsL10[4 + index] +
|
||||||
target1.z * kernelsL10[8 + index] +
|
target1.z * kernelsL10[8 + index] +
|
||||||
|
|
@ -2524,10 +2540,8 @@ void Pass9(uint2 blockStart, uint3 threadId) {
|
||||||
target2.z * kernelsL10[24 + index] +
|
target2.z * kernelsL10[24 + index] +
|
||||||
target2.w * kernelsL10[28 + index]);
|
target2.w * kernelsL10[28 + index]);
|
||||||
|
|
||||||
// ACNet 工作在 YUV 颜色空间
|
MF2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||||
float3 originC = INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb;
|
OUTPUT[destPos] = MF4(mul(yuv2rgb, MF3(luma, originUV)), 1);
|
||||||
MF2 originUV = mul(srgb2uv, EncodeSrgb(originC));
|
|
||||||
OUTPUT[destPos] = MF4(DecodeSrgb(mul(yuv2srgb, MF3(newY, originUV))), 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,17 +2,20 @@
|
||||||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_S.glsl
|
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_S.glsl
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME Anime4K_Upscale_0
|
//!SORT_NAME Anime4K_Upscale_0
|
||||||
|
//!USE MulAdd
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
//!SCALE_FACTOR 2
|
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -35,6 +38,7 @@ SamplerState sam;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState sam1;
|
SamplerState sam1;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!DESC Conv-4x3x3x3
|
//!DESC Conv-4x3x3x3
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
|
|
@ -93,6 +97,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
tex1[gxy] = A4KS1(src, 1, 2);
|
tex1[gxy] = A4KS1(src, 1, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 2
|
//!PASS 2
|
||||||
//!DESC Conv-4x3x3x8
|
//!DESC Conv-4x3x3x8
|
||||||
//!IN tex1
|
//!IN tex1
|
||||||
|
|
@ -235,6 +240,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
||||||
tex1[gxy] = A4KS3(src, 1, 2);
|
tex1[gxy] = A4KS3(src, 1, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 4
|
//!PASS 4
|
||||||
//!DESC Conv-4x3x3x8, Depth-to-Space
|
//!DESC Conv-4x3x3x8, Depth-to-Space
|
||||||
//!IN INPUT, tex1
|
//!IN INPUT, tex1
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@
|
||||||
// 移植自 https://github.com/ActualMandM/cemu_graphic_packs/blob/468d165cf27dae13a06e8bdc3d588d0af775ad91/Filters/Bicubic/output.glsl
|
// 移植自 https://github.com/ActualMandM/cemu_graphic_packs/blob/468d165cf27dae13a06e8bdc3d588d0af775ad91/Filters/Bicubic/output.glsl
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY AdvancedColor
|
|
||||||
|
|
||||||
#include "StubDefs.hlsli"
|
#include "StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
@ -35,6 +34,7 @@ Texture2D OUTPUT;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!STYLE PS
|
//!STYLE PS
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
|
|
@ -48,7 +48,7 @@ float weight(float x) {
|
||||||
|
|
||||||
if (ax < 1.0) {
|
if (ax < 1.0) {
|
||||||
return (x * x * ((12.0 - 9.0 * B - 6.0 * C) * ax + (-18.0 + 12.0 * B + 6.0 * C)) + (6.0 - 2.0 * B)) / 6.0;
|
return (x * x * ((12.0 - 9.0 * B - 6.0 * C) * ax + (-18.0 + 12.0 * B + 6.0 * C)) + (6.0 - 2.0 * B)) / 6.0;
|
||||||
} else if (ax < 2.0) {
|
} else if (ax >= 1.0 && ax < 2.0) {
|
||||||
return (x * x * ((-B - 6.0 * C) * ax + (6.0 * B + 30.0 * C)) + (-12.0 * B - 48.0 * C) * ax + (8.0 * B + 24.0 * C)) / 6.0;
|
return (x * x * ((-B - 6.0 * C) * ax + (6.0 * B + 30.0 * C)) + (-12.0 * B - 48.0 * C) * ax + (8.0 * B + 24.0 * C)) / 6.0;
|
||||||
} else {
|
} else {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
@ -64,6 +64,7 @@ float4 weight4(float x) {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
float4 Pass1(float2 pos) {
|
float4 Pass1(float2 pos) {
|
||||||
const float2 inputPt = GetInputPt();
|
const float2 inputPt = GetInputPt();
|
||||||
const float2 inputSize = GetInputSize();
|
const float2 inputSize = GetInputSize();
|
||||||
|
|
@ -75,6 +76,10 @@ float4 Pass1(float2 pos) {
|
||||||
float4 rowtaps = weight4(1 - f.x);
|
float4 rowtaps = weight4(1 - f.x);
|
||||||
float4 coltaps = weight4(1 - f.y);
|
float4 coltaps = weight4(1 - f.y);
|
||||||
|
|
||||||
|
// make sure all taps added together is exactly 1.0, otherwise some (very small) distortion can occur
|
||||||
|
rowtaps /= rowtaps.r + rowtaps.g + rowtaps.b + rowtaps.a;
|
||||||
|
coltaps /= coltaps.r + coltaps.g + coltaps.b + coltaps.a;
|
||||||
|
|
||||||
float2 uv1 = pos1 * inputPt;
|
float2 uv1 = pos1 * inputPt;
|
||||||
float2 uv0 = uv1 - inputPt;
|
float2 uv0 = uv1 - inputPt;
|
||||||
float2 uv2 = uv1 + inputPt;
|
float2 uv2 = uv1 + inputPt;
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY AdvancedColor
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
|
||||||
7570
src/Effects/CuNNy/CuNNy-16x16C-NVL-DN.hlsl
Normal file
7570
src/Effects/CuNNy/CuNNy-16x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
7571
src/Effects/CuNNy/CuNNy-16x16C-NVL.hlsl
Normal file
7571
src/Effects/CuNNy/CuNNy-16x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
340
src/Effects/CuNNy/CuNNy-2x4C-NVL-DN.hlsl
Normal file
340
src/Effects/CuNNy/CuNNy-2x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,340 @@
|
||||||
|
// CuNNy 2x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-DN-D04N02
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) (dot(MF3(-3.725e-01, -7.046e-01, -1.734e-01), O(INPUT, float2(x, y)).rgb) + MF(1.169e-01))
|
||||||
|
|
||||||
|
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
|
||||||
|
V4 r = { 1.492e-02, -1.961e-02, -7.539e-03, -3.574e-03 };
|
||||||
|
r = mad(s0_0, V4(-2.745e-03, -2.925e-03, 1.135e-01, 3.162e-02), r);
|
||||||
|
r = mad(s0_1, V4(4.049e-03, -3.428e-01, -7.641e-02, 2.484e-02), r);
|
||||||
|
r = mad(s0_2, V4(-8.372e-03, 3.398e-01, 1.072e-01, -5.449e-02), r);
|
||||||
|
r = mad(s0_3, V4(1.592e-02, 1.884e-02, -3.160e-02, -7.727e-02), r);
|
||||||
|
r = mad(s0_4, V4(4.429e-01, -3.936e-01, -4.134e-01, -4.287e-01), r);
|
||||||
|
r = mad(s0_5, V4(4.556e-02, 3.754e-01, -2.300e-02, 4.971e-01), r);
|
||||||
|
r = mad(s0_6, V4(-2.031e-02, -6.662e-03, 8.906e-02, 4.602e-02), r);
|
||||||
|
r = mad(s0_7, V4(-4.365e-01, 2.183e-03, 8.609e-02, 9.402e-03), r);
|
||||||
|
r = mad(s0_8, V4(-3.845e-02, 5.695e-03, 9.645e-02, -5.310e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Pass1(uint2 blockStart, uint3 tid) {
|
||||||
|
float2 pt = float2(GetInputPt());
|
||||||
|
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
MF s0_0 = l0(-1.0, -1.0);
|
||||||
|
MF s0_1 = l0(0.0, -1.0);
|
||||||
|
MF s0_2 = l0(1.0, -1.0);
|
||||||
|
MF s0_3 = l0(-1.0, 0.0);
|
||||||
|
MF s0_4 = l0(0.0, 0.0);
|
||||||
|
MF s0_5 = l0(1.0, 0.0);
|
||||||
|
MF s0_6 = l0(-1.0, 1.0);
|
||||||
|
MF s0_7 = l0(0.0, 1.0);
|
||||||
|
MF s0_8 = l0(1.0, 1.0);
|
||||||
|
|
||||||
|
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y): the O() sample of t0 at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 2): starts from a constant V4 and folds in the 18 V4 inputs (s0_0..s0_8, then
// s1_0..s1_8) one at a time through MulAdd with a per-input constant M4; returns the result.
// MulAdd is not defined in this excerpt - presumably it computes mul(vector, matrix) + r; confirm.
// The constants are presumably generated network weights - keep values and order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 4.789e-02, 4.713e-03, -2.854e-02, 9.967e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(1.218e-02, -1.208e-01, -1.955e-01, -1.217e-01, 3.123e-02, -2.317e-02, 1.961e-01, -9.984e-02, 3.038e-03, 2.863e-02, -1.042e-01, -5.529e-02, 1.266e-01, -3.877e-01, 2.315e-01, -1.334e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.774e-02, 1.636e-01, 1.379e-01, 7.499e-03, -7.890e-02, -3.970e-02, -6.053e-02, -1.431e-02, 4.167e-02, 9.728e-02, 3.825e-02, -2.704e-02, -2.303e-01, -3.348e-01, 2.940e-01, 4.825e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.239e-02, 1.613e-02, -2.280e-01, 8.985e-02, 2.106e-03, 3.847e-02, -2.539e-02, -3.326e-02, -6.327e-02, -1.427e-01, 4.218e-02, 8.995e-02, -6.045e-02, -1.073e-01, -1.329e-01, -2.085e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.601e-01, -2.448e-01, -3.950e-01, 9.169e-03, -3.694e-02, 2.018e-01, -2.524e-01, 1.719e+00, 3.009e-02, 4.927e-02, 1.564e-01, 3.509e-02, -2.630e-02, -3.986e-01, 1.326e-01, -1.037e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(-1.074e+00, -1.654e-01, 4.163e-01, 3.816e-02, 4.580e-01, 4.350e-01, -3.490e-01, -1.257e-02, 1.159e-02, -2.083e-01, -2.744e-01, -2.667e-02, 2.826e-03, 1.986e-01, -2.723e-01, 9.612e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-3.195e-01, -1.450e-01, -1.523e-01, -2.999e-03, 1.166e-01, 1.304e-01, 1.475e-01, 7.286e-02, -4.077e-02, -3.477e-02, 1.496e-01, -1.199e-02, 7.881e-02, 8.911e-02, -1.082e-01, -6.762e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(2.020e-02, 1.556e-01, -9.837e-03, 1.537e-02, -1.047e-01, 2.095e-01, 2.025e-01, -3.522e-02, -3.407e-02, -8.949e-02, -7.721e-02, -8.910e-03, 9.305e-02, 2.231e-01, 2.178e-01, 1.502e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-7.936e-02, 3.096e-01, 1.869e-01, -1.950e-03, -2.452e-01, -5.098e-01, 5.304e-01, -4.921e-02, -1.073e-01, 1.062e-01, 2.527e-01, 5.909e-04, 3.797e-02, 3.291e-01, -2.395e-01, 2.768e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-5.559e-02, 1.090e-01, -1.757e-01, 1.261e-02, -1.632e-01, -2.476e-01, -5.674e-02, -4.843e-03, 1.064e-02, 1.023e-01, 2.540e-02, -1.336e-02, 1.362e-01, 1.833e-01, 3.772e-03, 5.118e-04), r);
|
||||||
|
r = MulAdd(s1_0, M4(1.383e-01, 3.469e-01, 3.568e-02, -1.958e-01, -3.170e-02, -1.076e-02, -2.012e-02, -2.104e-04, 2.046e-02, -1.268e-02, -1.618e-01, -6.370e-02, 2.615e-02, 1.494e-01, -1.523e-01, 3.702e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.140e-02, 6.811e-01, 5.722e-02, 1.514e-01, -6.311e-02, -3.541e-02, -1.150e-01, 3.625e-02, 1.146e-01, -1.395e-03, 5.059e-01, -7.835e-02, -3.907e-01, 6.172e-02, -9.656e-02, -2.727e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(1.239e-01, 1.206e-01, 7.519e-01, 2.106e-02, 8.647e-03, 1.082e-02, 5.931e-02, -4.215e-02, -2.216e-02, -4.829e-02, -1.927e-01, 1.159e-01, -1.789e-01, -9.596e-02, 1.395e-01, -6.395e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(1.194e-01, -5.786e-01, -1.761e-03, -1.126e-02, -5.311e-02, -2.325e-01, 1.733e-01, 2.842e-01, -1.080e-01, -1.012e-01, 1.851e-01, 4.253e-02, 1.212e-01, 2.435e-02, -3.061e-01, -9.579e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-4.651e-02, -1.299e+00, -5.020e-01, 5.830e-02, 5.098e-01, 7.344e-02, -1.358e-01, 1.725e-02, -2.980e-01, -6.077e-01, 6.308e-01, -4.014e-02, 3.497e-01, 3.700e-01, -6.035e-01, 8.026e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.851e-02, -2.057e-01, 5.081e-01, -5.262e-02, 1.715e-01, 1.387e-01, -1.123e-01, 9.022e-02, -1.532e-01, -3.749e-02, -1.930e-01, 6.423e-02, 2.763e-02, 5.993e-02, 4.141e-01, -8.825e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-6.324e-03, -9.461e-02, 3.044e-02, -4.139e-03, -2.925e-02, 3.975e-01, 1.161e-01, 9.726e-03, 1.353e-01, 2.762e-01, 3.297e-03, 1.076e-02, -8.503e-02, -7.010e-01, -1.967e-01, -1.360e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.873e-02, 1.099e-01, 1.229e-01, -1.232e-02, -5.723e-01, -4.599e-02, -1.236e-01, -2.003e-02, -4.268e-01, 5.929e-01, 2.942e-01, 3.485e-02, 4.326e-01, -9.250e-02, 3.736e-01, -2.393e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-5.991e-02, 1.199e-03, -1.349e-02, -1.321e-03, -2.036e-01, -1.937e-01, -7.888e-02, -9.144e-03, 1.557e-01, 7.018e-02, -2.646e-01, -3.360e-06, 1.742e-01, 1.814e-01, 1.385e-01, -1.030e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 2 entry point. For the texel selected by blockStart and tid.x, gathers the nine
// l0() taps at offsets -1..1 in x and y, splits each tap into max(v, 0) and -max(-v, 0),
// and stores f0() of those 18 vectors into t1.
// Texels at or beyond GetInputSize() in either axis are skipped.
void Pass2(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are deliberately not renamed - the O() macro behind l0()
	// is defined outside this function and presumably refers to them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Taps in row-major order: y = -1, 0, 1; within each row x = -1, 0, 1.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0). Same tap order in both halves.
	t1[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): the O() sample of t1 at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 3): starts from a constant V4 and folds in the 18 V4 inputs (s0_0..s0_8, then
// s1_0..s1_8) one at a time through MulAdd with a per-input constant M4; returns the result.
// MulAdd is not defined in this excerpt - presumably it computes mul(vector, matrix) + r; confirm.
// The constants are presumably generated network weights - keep values and order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 7.359e-03, -1.132e-02, 1.248e-02, 7.243e-04 };
|
||||||
|
r = MulAdd(s0_0, M4(-1.565e-01, 1.307e-02, -5.269e-02, 5.465e-02, 2.936e-01, 1.626e-01, 4.589e-02, 2.478e-02, 3.520e-01, -5.445e-02, -2.480e-01, 2.838e-02, 1.841e-04, 1.264e-02, -1.370e-02, 2.588e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(2.350e-01, 2.116e-01, 2.167e-02, -1.559e-01, 2.502e-01, 4.320e-01, -7.152e-01, 2.270e-01, -2.668e-01, -2.117e-01, 5.598e-01, 2.261e-01, 4.101e-02, -4.860e-02, 3.530e-02, 8.932e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-4.398e-02, -4.486e-02, -5.040e-02, 9.803e-02, 7.515e-02, 1.203e-01, -5.357e-02, -2.803e-01, -1.435e-01, 7.150e-03, -3.118e-02, -2.636e-01, -2.969e-02, -2.011e-02, 2.658e-02, -2.572e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(9.140e-02, -1.875e-01, 9.757e-02, 2.976e-02, -8.325e-02, 6.109e-02, -4.304e-02, 7.057e-02, 7.324e-01, -1.528e-01, 2.930e-01, 7.503e-02, -3.901e-02, 1.109e-03, -2.693e-02, -3.330e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(-9.944e-02, 1.858e-01, -2.436e-01, 3.822e-02, 6.685e-02, -1.758e-01, 1.382e-01, -1.715e-01, 3.252e-01, 5.176e-01, -2.939e-01, 4.311e-01, -6.125e-02, 1.905e-01, 8.140e-02, 2.095e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(3.193e-02, 6.029e-02, 1.869e-03, 8.627e-04, -1.402e-02, 4.288e-02, -5.756e-02, 8.813e-02, -2.758e-02, -5.267e-02, 1.702e-03, -6.676e-01, 6.373e-02, 5.766e-02, -6.325e-02, -2.744e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(4.918e-02, 5.420e-04, 3.692e-02, 7.796e-03, -1.163e-02, -4.074e-02, 2.057e-02, -2.837e-02, 1.083e-01, 1.958e-01, -5.078e-02, 2.750e-02, 5.323e-02, 5.953e-03, 4.766e-02, -2.265e-03), r);
|
||||||
|
r = MulAdd(s0_7, M4(-3.968e-02, -1.535e-01, 6.564e-02, -2.620e-02, 3.742e-02, 8.659e-02, -4.440e-02, 6.007e-03, -9.585e-02, -9.425e-02, -1.517e-01, 3.701e-01, -1.332e-01, -1.860e-01, -5.436e-02, 3.781e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.145e-02, 6.045e-02, -4.676e-02, -5.604e-02, -1.576e-02, -3.528e-02, 2.252e-02, 1.997e-02, -2.546e-02, -6.894e-02, 7.238e-02, -3.495e-01, -6.323e-02, -1.042e-01, 1.091e-01, -4.170e-01), r);
|
||||||
|
r = MulAdd(s1_0, M4(-5.215e-01, 6.255e-01, 5.587e-02, -5.362e-02, 9.895e-02, -8.743e-03, 1.058e-01, -3.585e-02, -1.594e-02, -1.034e-01, 3.848e-02, -5.432e-02, -1.796e-02, 5.838e-02, 1.304e-01, -2.122e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-6.987e-02, 8.696e-01, -1.130e+00, 5.558e-03, -1.080e-01, 4.195e-02, -1.323e-01, 2.270e-01, 3.451e-02, -1.616e-02, 4.251e-03, 1.470e-01, 2.442e-01, -5.904e-02, -3.467e-01, -2.056e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(4.884e-02, -1.034e-01, 5.823e-02, 1.131e-01, -4.126e-02, 6.519e-02, -1.532e-02, -2.420e-01, 1.092e-02, 1.869e-02, 1.913e-03, -1.787e-02, 1.122e-01, -1.481e-01, 1.843e-01, 3.454e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(-2.906e-01, -9.847e-01, 4.092e-01, 1.655e-01, 4.092e-02, 2.913e-01, 1.306e-01, -4.682e-02, 2.568e-01, -4.528e-02, 3.207e-02, 9.888e-02, -3.928e-01, -3.546e-01, -2.367e-01, -3.239e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(4.463e-01, -1.594e-01, 8.418e-01, -3.525e-01, 5.957e-01, 1.082e+00, -9.245e-01, 2.726e-01, 1.210e-01, 2.024e-01, -8.063e-03, -2.433e-01, -1.512e+00, 9.316e-01, 2.305e-01, -5.109e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-2.393e-02, 1.286e-02, -9.453e-02, 3.071e-01, -1.402e-01, -2.436e-01, 1.202e-01, -1.409e-01, -1.857e-02, 2.421e-02, -2.642e-02, -7.415e-02, 8.786e-01, 5.260e-04, -9.212e-02, 1.849e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(8.958e-02, 9.057e-02, 1.712e-02, -2.838e-02, -1.405e-01, -6.455e-02, -2.695e-02, -1.110e-02, 8.731e-03, 6.531e-02, -3.752e-02, 1.194e-01, 4.585e-01, 6.270e-01, -1.367e-01, -2.529e-01), r);
|
||||||
|
r = MulAdd(s1_7, M4(-4.381e-02, -1.595e-02, -4.601e-02, 7.257e-02, -8.036e-02, -1.360e-01, 1.154e-01, -7.942e-02, -4.653e-02, -7.121e-02, 2.720e-02, 8.346e-02, -1.871e+00, -8.300e-01, -6.760e-01, 7.402e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(1.359e-02, -2.489e-02, 3.529e-02, -1.121e-01, -6.190e-02, -2.628e-02, -2.090e-03, 2.359e-01, -2.412e-02, -2.463e-02, 8.317e-03, -5.330e-02, 2.105e+00, 1.550e-01, 1.457e+00, -1.129e+00), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 3 entry point. For the texel selected by blockStart and tid.x, gathers the nine
// l0() taps at offsets -1..1 in x and y, splits each tap into max(v, 0) and -max(-v, 0),
// and stores f0() of those 18 vectors into t0.
// Texels at or beyond GetInputSize() in either axis are skipped.
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are deliberately not renamed - the O() macro behind l0()
	// is defined outside this function and presumably refers to them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Taps in row-major order: y = -1, 0, 1; within each row x = -1, 0, 1.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0). Same tap order in both halves.
	t0[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// l0(x, y): the O() sample of t0 at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 4): starts from a constant V4 and folds in the 18 V4 inputs (s0_0..s0_8, then
// s1_0..s1_8) one at a time through MulAdd with a per-input constant M4, then returns
// tanh() of the accumulated vector (so every returned component lies in [-1, 1]).
// MulAdd is not defined in this excerpt - presumably it computes mul(vector, matrix) + r; confirm.
// The constants are presumably generated network weights - keep values and order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { -7.528e-04, -8.388e-04, -1.247e-03, -1.205e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(8.642e-03, -1.295e-02, 1.998e-02, -1.289e-03, -4.147e-02, -4.021e-03, 1.491e-04, -7.275e-03, 1.574e-02, -4.122e-03, 1.126e-02, 8.962e-03, 5.174e-02, 3.405e-02, 4.993e-02, 4.529e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.028e-01, -2.764e-02, -2.777e-02, -7.170e-03, -8.365e-02, 3.550e-02, 1.288e-01, 2.475e-02, 5.017e-02, 5.917e-02, 3.473e-02, 8.510e-03, 2.332e-02, 8.047e-02, 9.838e-02, 4.234e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-2.319e-02, -4.432e-02, -1.679e-02, 8.855e-03, 3.259e-02, -1.974e-01, 5.938e-02, 1.616e-01, -5.605e-04, 3.183e-02, -3.356e-03, 3.138e-02, 9.572e-03, -3.887e-02, -2.632e-02, -1.161e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.947e-02, -4.358e-02, 1.208e-03, -2.705e-02, -1.037e-02, -6.812e-02, -5.436e-02, -3.840e-02, 3.684e-02, 2.560e-02, 1.715e-02, -3.670e-02, -5.930e-02, -2.310e-02, -6.163e-02, -3.562e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(5.520e-01, 1.213e-01, 1.753e-01, 5.436e-02, 5.879e-01, 2.281e-01, -2.703e-01, 1.519e-01, 5.739e-01, 2.959e-01, 9.449e-02, 2.473e-02, -5.998e-01, -9.548e-02, -6.035e-01, -9.663e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-9.740e-02, 2.744e-01, -1.522e-01, -7.204e-02, 1.178e-01, 6.112e-01, -4.801e-02, -5.176e-01, 1.480e-02, 8.323e-02, -6.764e-02, 4.138e-02, 1.121e-01, -8.141e-02, 1.211e-01, -8.737e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(6.315e-02, 6.323e-02, 1.146e-02, 3.378e-02, -9.598e-02, -1.089e-01, 2.780e-02, -6.091e-02, -1.194e-01, -1.038e-01, -2.147e-02, -4.236e-02, -2.300e-02, -3.184e-02, -1.560e-02, -2.206e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.772e-01, -1.304e-01, 1.265e-01, -7.871e-02, 1.978e-01, 1.074e-01, 1.240e-02, 4.600e-02, 1.558e-02, -3.196e-02, 2.018e-01, 1.496e-01, 1.421e-01, 8.472e-02, 7.432e-02, 9.935e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(1.132e-02, -2.296e-03, 1.274e-01, 3.428e-01, -5.796e-02, -6.156e-02, -2.549e-01, -2.231e-01, -8.762e-02, -9.318e-02, -2.378e-01, -3.018e-01, 5.601e-03, -2.670e-02, 2.896e-02, -3.910e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(4.603e-02, -2.582e-02, -9.045e-03, 1.446e-02, -1.835e-02, -2.533e-02, 3.681e-03, -9.420e-03, -5.802e-02, 2.310e-02, 3.059e-02, 1.313e-03, 9.639e-02, 8.284e-02, 1.071e-01, -3.287e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-2.480e-02, 2.321e-03, -3.594e-02, -1.101e-01, 2.850e-02, 2.912e-02, 2.597e-02, 2.777e-02, 5.701e-02, 9.536e-04, 2.533e-02, 1.102e-02, -3.714e-03, 7.838e-02, -1.716e-02, 1.723e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-4.473e-03, 1.521e-02, -1.887e-02, 6.731e-03, 2.199e-03, 2.965e-02, -3.709e-03, 1.671e-02, 1.376e-02, -4.819e-02, -8.832e-04, 3.531e-02, -8.453e-03, -1.276e-02, -1.461e-02, 4.460e-03), r);
|
||||||
|
r = MulAdd(s1_3, M4(6.139e-02, -1.511e-01, 1.102e-01, -1.428e-01, -5.114e-02, -6.594e-02, -1.693e-02, -4.651e-02, 2.440e-01, 2.010e-02, -1.900e-01, -1.243e-03, -2.397e-01, 2.002e-01, -3.506e-01, 2.171e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(-6.189e-02, 5.137e-01, -8.132e-02, 4.526e-01, 3.263e-01, 2.134e-01, 1.027e-01, 2.067e-02, 2.407e-01, 2.591e-01, 4.489e-01, 2.042e-01, 1.932e-02, -4.463e-01, -1.479e-01, -6.843e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-7.571e-03, -7.787e-02, 9.918e-03, -8.469e-02, 4.056e-02, -1.926e-02, -4.968e-02, 2.416e-02, 2.699e-02, 2.783e-01, -7.854e-02, -6.549e-02, 6.835e-03, 2.288e-02, 1.048e-02, -3.273e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(7.034e-02, 4.236e-02, 7.905e-02, -2.283e-03, -8.423e-02, -7.784e-02, -7.540e-03, -3.373e-02, -1.019e-01, -1.421e-01, 6.713e-02, -8.716e-02, -6.980e-02, -4.731e-02, -3.086e-02, -6.210e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(-1.597e-01, -2.036e-01, 5.194e-02, 8.457e-02, 1.387e-01, 7.910e-02, 2.030e-02, 5.848e-02, 2.154e-01, 1.382e-01, -8.617e-02, 7.552e-02, 3.127e-02, 5.899e-02, 1.733e-01, 1.657e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(3.595e-02, 3.243e-02, 1.450e-01, 2.046e-01, -2.939e-02, -1.306e-02, -1.587e-01, -2.607e-01, -8.980e-02, -5.350e-02, -2.627e-01, -2.861e-01, -1.585e-02, -2.032e-02, -1.662e-02, 1.560e-02), r);
|
||||||
|
return tanh(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 4 entry point. Each invocation owns a 2x2 block of OUTPUT whose top-left pixel is
// (Rmp8x8(tid.x) << 1) + blockStart; blocks starting at or beyond GetOutputSize() are skipped.
// It evaluates f0() on the nine l0() taps around the matching t0 texel (split into
// max(v, 0) and -max(-v, 0) halves) and, for each of the four output pixels, samples INPUT
// with SL at that pixel's centre, converts to YUV, adds one component of the f0() result to
// the first channel (saturated), converts back to RGB and writes it with alpha 1.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are deliberately not renamed - the O() macro behind l0()
	// is defined outside this function and presumably refers to them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 bounds = GetOutputSize();
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = ((texel >> 1) + 0.5) * pt;

	// Taps in row-major order: y = -1, 0, 1; within each row x = -1, 0, 1.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0). Same tap order in both halves.
	const V4 delta = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 opt = float2(GetOutputPt());

	// Walk the 2x2 block as (0,0) -> (1,0) -> (1,1) -> (0,1), stepping the destination pixel
	// and the sampling position together. The steps are kept incremental on purpose so the
	// floating-point positions match the original sequence exactly.
	uint2 dst = texel;
	float2 uv = pos - 0.5f * opt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	dst.x += 1;
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	dst.y += 1;
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	dst.x -= 1;
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||||
340
src/Effects/CuNNy/CuNNy-2x4C-NVL.hlsl
Normal file
340
src/Effects/CuNNy/CuNNy-2x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,340 @@
|
||||||
|
// CuNNy 2x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-D04N02
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// O(t, p): samples texture t at mip level 0 with sampler SP at pos + p * pt.
// `pos` and `pt` are not macro parameters: they must be in scope, under exactly those
// names, wherever the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// V4 / M4: short aliases for MF4 and MF4x4.
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): dot product of a constant MF3 with the .rgb of O(INPUT, float2(x, y)), plus a constant MF bias.
// The constants are presumably trained values - do not hand-edit.
#define l0(x, y) (dot(MF3(-6.049e-01, -1.145e+00, -2.540e-01), O(INPUT, float2(x, y)).rgb) + MF(1.794e+00))
|
||||||
|
|
||||||
|
// f0 (pass 1): starts from a constant V4 and accumulates, via mad(), each of the nine
// scalar inputs s0_0..s0_8 multiplied by its own constant V4; returns the accumulated V4.
// The constants are presumably generated network weights/biases - keep them and the
// accumulation order exactly as they are.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
|
||||||
|
V4 r = { 4.440e-03, -1.956e-04, 1.215e-03, 1.790e-03 };
|
||||||
|
r = mad(s0_0, V4(1.411e-01, -9.763e-03, -1.361e-01, -9.610e-04), r);
|
||||||
|
r = mad(s0_1, V4(6.068e-02, 7.238e-03, -1.182e-01, -1.535e-02), r);
|
||||||
|
r = mad(s0_2, V4(-8.549e-02, -2.876e-03, -8.740e-03, 1.652e-02), r);
|
||||||
|
r = mad(s0_3, V4(-3.249e-01, 5.392e-02, -8.518e-02, -7.437e-03), r);
|
||||||
|
r = mad(s0_4, V4(2.435e-02, -6.191e-01, 7.147e-01, 5.862e-01), r);
|
||||||
|
r = mad(s0_5, V4(1.968e-01, 1.868e-02, -1.723e-01, -5.801e-01), r);
|
||||||
|
r = mad(s0_6, V4(1.528e-01, -4.489e-02, 5.871e-03, 4.528e-03), r);
|
||||||
|
r = mad(s0_7, V4(-4.619e-01, 6.152e-01, -1.313e-01, -5.326e-02), r);
|
||||||
|
r = mad(s0_8, V4(2.902e-01, -1.801e-02, -6.907e-02, 5.105e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 1 entry point. For the texel selected by blockStart and tid.x, gathers the nine
// l0() taps at offsets -1..1 in x and y and stores f0() of them into t0.
// Texels at or beyond GetInputSize() in either axis are skipped.
void Pass1(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are deliberately not renamed - the O() macro behind l0()
	// expands to an expression that refers to them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Taps in row-major order: y = -1, 0, 1; within each row x = -1, 0, 1.
	const MF tap0 = l0(-1.0, -1.0);
	const MF tap1 = l0(0.0, -1.0);
	const MF tap2 = l0(1.0, -1.0);
	const MF tap3 = l0(-1.0, 0.0);
	const MF tap4 = l0(0.0, 0.0);
	const MF tap5 = l0(1.0, 0.0);
	const MF tap6 = l0(-1.0, 1.0);
	const MF tap7 = l0(0.0, 1.0);
	const MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y): the O() sample of t0 at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 2): starts from a constant V4 and folds in the 18 V4 inputs (s0_0..s0_8, then
// s1_0..s1_8) one at a time through MulAdd with a per-input constant M4; returns the result.
// MulAdd comes from the "//!USE MulAdd" directive - presumably mul(vector, matrix) + r; confirm.
// The constants are presumably generated network weights - keep values and order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 3.566e-03, 2.403e-03, -1.451e-03, 4.304e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(1.120e-01, 8.150e-03, 7.146e-02, -4.942e-02, 3.623e-01, -1.678e-01, 1.189e-01, 1.372e-01, 1.225e-01, -2.568e-02, 6.959e-02, 1.788e-02, 1.962e-01, -1.870e-01, -6.548e-03, -4.334e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(1.805e-01, 4.881e-02, -2.342e-03, 2.035e-02, -2.427e-01, -2.197e-02, -2.036e-02, 3.919e-01, -3.037e-01, 7.047e-02, 3.426e-02, -8.694e-02, 2.144e-01, 1.431e-01, -7.851e-02, 2.247e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(6.328e-02, -4.140e-02, 3.362e-02, 5.204e-02, -1.052e-01, 1.698e-01, -2.727e-03, 1.110e-01, 7.156e-02, -1.108e-02, -2.717e-02, 5.680e-02, -6.118e-02, 2.435e-02, 1.743e-02, 8.179e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.557e-01, 1.189e-01, 8.836e-02, 2.178e-02, -3.954e-01, 2.466e-01, -2.166e-01, -7.051e-02, -2.857e-01, -1.611e-02, -8.667e-02, 1.895e-04, 2.744e-01, 1.499e-01, 8.228e-02, 2.938e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(2.441e-01, -3.694e-01, 1.751e-01, 6.833e-01, -1.087e-01, -2.065e-01, -1.557e-01, -6.945e-02, -1.403e-02, 2.171e-02, 3.748e-02, 2.646e-01, -3.718e-01, -1.188e-01, 1.569e-01, 8.554e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-5.069e-02, 2.646e-01, -5.754e-02, -3.545e-01, 1.404e-01, 1.123e-01, 4.577e-02, -1.465e-01, -2.119e-02, -1.115e-02, 1.661e-01, -4.029e-01, -2.123e-01, 2.774e-01, -1.905e-02, -1.093e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(2.593e-02, -1.801e-02, 9.053e-02, -2.721e-02, 6.658e-03, 3.802e-02, -3.282e-02, -1.116e-01, 1.201e-01, 2.095e-02, -2.061e-02, 2.498e-03, -1.831e-01, -1.743e-01, 1.062e-01, -6.113e-01), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.172e-01, -1.130e-02, -6.727e-02, 7.753e-02, -3.958e-03, -9.790e-02, -1.635e-01, 1.049e-01, 2.862e-01, -2.733e-02, -1.566e-01, -2.900e-01, -1.050e-01, -3.441e-01, -8.690e-02, 8.659e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(2.145e-01, 4.613e-02, 1.590e-02, -4.749e-02, 3.291e-01, 1.012e-01, 8.647e-03, -2.282e-01, 2.215e-01, 1.713e-01, 1.414e-01, -3.916e-01, -2.488e-01, 1.458e-01, 2.518e-02, -9.979e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-2.127e-02, 3.575e-02, 9.372e-02, -2.662e-02, 4.467e-02, 1.304e-02, 3.849e-02, 5.186e-02, 7.417e-02, 3.647e-02, 4.960e-02, -3.988e-02, -3.998e-02, 1.173e-01, 7.752e-03, -2.263e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.283e-01, -1.460e-01, 1.963e-02, -1.108e-01, -4.171e-01, 2.397e-01, -5.886e-02, 7.788e-02, -2.820e-02, -1.719e-01, 9.334e-03, -1.255e-01, 1.392e-01, 9.532e-03, -5.163e-02, 8.641e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-1.889e-01, 1.933e-01, 5.574e-02, 6.723e-02, -1.015e-01, -3.316e-01, -1.460e-02, -1.606e-01, 1.052e-01, 1.027e-02, -4.626e-02, 5.368e-02, -9.160e-03, -9.514e-02, 2.577e-02, 7.122e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-1.958e-01, 1.276e-01, 7.303e-02, -1.135e-01, -2.277e-01, 2.017e-01, -5.223e-02, 1.379e-01, -1.737e-01, 4.871e-02, -8.142e-02, 1.392e-01, 8.113e-02, 4.415e-01, -1.174e-01, 1.910e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-3.233e-01, -4.158e-01, 8.391e-02, 2.017e-01, 9.790e-02, -4.865e-02, -2.172e-01, 2.607e-01, -2.458e-01, -4.931e-01, 3.016e-01, 2.198e-01, -7.173e-02, -5.683e-01, -7.447e-02, -1.264e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-4.189e-01, 3.271e-01, 8.844e-02, -5.295e-01, 6.365e-02, -1.513e-01, 1.246e-02, -2.005e-01, 1.764e-01, 5.796e-01, 7.286e-02, -1.428e-01, -1.130e-01, -6.883e-02, -1.303e-02, -1.091e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-6.621e-02, 9.901e-03, 9.472e-02, -3.568e-02, 1.067e-01, -3.318e-02, 3.152e-01, -5.261e-02, 1.108e-01, 7.081e-02, -1.289e-01, 6.477e-03, 1.036e-01, -1.477e-03, 1.035e+00, -9.204e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-2.721e-01, -5.458e-02, -1.707e-01, -1.096e-02, -1.302e-01, -9.074e-02, 1.694e-01, 6.307e-02, 4.233e-01, -5.112e-02, -3.545e-01, -2.589e-01, 8.276e-02, -3.975e-01, 7.705e-02, 4.482e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(1.175e-01, 2.212e-03, 5.751e-02, -8.666e-02, 2.532e-01, 1.303e-01, 7.291e-02, -2.126e-01, 4.815e-01, 1.649e-01, -4.748e-02, -3.330e-01, -1.252e-01, -8.987e-03, -4.285e-03, -1.106e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 2 entry point. For the texel selected by blockStart and tid.x, gathers the nine
// l0() taps at offsets -1..1 in x and y, splits each tap into max(v, 0) and -max(-v, 0),
// and stores f0() of those 18 vectors into t1.
// Texels at or beyond GetInputSize() in either axis are skipped.
void Pass2(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are deliberately not renamed - the O() macro behind l0()
	// expands to an expression that refers to them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Taps in row-major order: y = -1, 0, 1; within each row x = -1, 0, 1.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0). Same tap order in both halves.
	t1[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): the O() sample of t1 at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 3): starts from a constant V4 and folds in the 18 V4 inputs (s0_0..s0_8, then
// s1_0..s1_8) one at a time through MulAdd with a per-input constant M4; returns the result.
// MulAdd comes from the "//!USE MulAdd" directive - presumably mul(vector, matrix) + r; confirm.
// The constants are presumably generated network weights - keep values and order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 5.508e-03, 4.690e-03, -5.708e-04, -7.674e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(-1.173e-02, 2.762e-03, -2.225e-03, -6.814e-03, 8.328e-02, -1.275e-02, 6.091e-02, -6.470e-02, -6.067e-02, -1.086e-01, 7.501e-02, 1.227e-01, -1.551e-02, -1.728e-02, -2.694e-02, 7.490e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(5.326e-02, 1.003e-02, 3.989e-02, -1.908e-03, -4.580e-02, -4.303e-03, 4.333e-02, 8.324e-02, 8.170e-01, 8.040e-01, -3.975e-01, -1.034e+00, 1.362e-01, 3.776e-04, -1.102e-02, -5.030e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-6.068e-02, 6.212e-02, -4.979e-02, 9.626e-03, 1.301e-02, -2.045e-02, 1.798e-02, 2.091e-02, -2.290e-01, 3.612e-01, -7.014e-02, 1.669e-01, -5.191e-03, 1.304e-02, 9.444e-05, -2.137e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-3.235e-02, -6.238e-02, 3.894e-02, 5.893e-02, -3.530e-02, -1.063e-01, 8.668e-02, 1.232e-02, -3.851e-02, 2.952e-02, 6.132e-02, -5.755e-02, 8.317e-02, 8.340e-02, -8.227e-02, 6.481e-03), r);
|
||||||
|
r = MulAdd(s0_4, M4(2.118e-02, 2.725e-01, -1.393e-01, -2.377e-01, 4.872e-01, 2.235e-01, -1.746e-02, -3.662e-01, -3.945e-01, -1.862e-01, -9.132e-02, 8.777e-02, -5.084e-01, -3.300e-01, -3.443e-02, 4.203e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(1.165e-01, -1.743e-01, 4.169e-03, -1.518e-01, 1.174e-01, -3.314e-02, 2.295e-02, -9.160e-02, -1.854e-01, -6.999e-02, -6.985e-02, 4.875e-04, -1.147e-01, 1.722e-01, -2.588e-02, 1.185e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-8.881e-03, 1.907e-03, 9.002e-03, 8.085e-03, -8.728e-03, -1.074e-01, 7.035e-02, 6.519e-02, 4.323e-02, -4.675e-02, 4.382e-02, 1.091e-02, 3.357e-02, 4.384e-02, -8.031e-03, -1.945e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-7.981e-02, 1.492e-02, -9.399e-02, -3.750e-02, -1.274e-01, -3.235e-02, -3.169e-02, 6.420e-02, 4.304e-02, 9.302e-02, 1.250e-02, 3.906e-03, 1.752e-01, -1.211e-02, 9.058e-02, -6.273e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.290e-02, -4.309e-02, 3.384e-02, 3.819e-02, -3.309e-02, 3.986e-02, 3.783e-03, 5.361e-02, 5.473e-02, 1.574e-02, -2.385e-02, -7.630e-02, -1.778e-02, 1.375e-02, -2.936e-02, -1.778e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(1.219e-01, 1.166e-02, -5.932e-02, 1.191e-02, -2.487e-03, -5.945e-02, 6.637e-02, 5.775e-02, -1.705e-02, 5.538e-02, -5.130e-02, -3.602e-02, 5.461e-02, -1.253e-01, 6.953e-02, 1.066e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(6.504e-01, -9.638e-01, 1.371e+00, 5.682e-02, 1.583e-02, -2.371e-02, 5.201e-02, 3.845e-02, 3.478e-02, -1.477e-01, 1.763e-01, 5.129e-02, 2.992e-01, -3.335e-01, 2.490e-02, 4.873e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(2.415e-02, 8.838e-02, -1.519e-01, 9.012e-02, -6.676e-02, 3.422e-02, -2.380e-02, 5.608e-02, -1.744e-01, -9.595e-02, -7.627e-02, -5.823e-02, -9.466e-02, 5.554e-02, -1.024e-01, -1.763e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(8.380e-02, -7.972e-02, 8.813e-02, 3.371e-02, 5.392e-03, 4.385e-02, 1.207e-02, -5.728e-02, -3.427e-03, -2.027e-03, 1.211e-03, -7.897e-03, 3.360e-02, 4.603e-02, -1.240e-02, -2.219e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-6.699e-01, -3.512e-01, -2.153e-01, 3.218e-01, -5.100e-01, 4.324e-03, 2.713e-01, -2.073e-01, 1.547e-01, -2.123e-03, 7.928e-02, -5.698e-02, 2.450e-02, -4.866e-02, 9.436e-02, 7.900e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.609e-01, -7.910e-02, 1.112e-01, -2.959e-02, -3.877e-01, -2.803e-01, -1.071e-01, -6.881e-03, 1.922e-02, 2.433e-02, -3.581e-02, -5.264e-02, -3.287e-01, -1.037e-02, -6.159e-02, 8.219e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-4.263e-02, -6.372e-02, 2.607e-02, 5.285e-02, -6.156e-02, -7.837e-02, 7.299e-03, 8.959e-02, -8.706e-03, -1.642e-02, 1.825e-02, 1.850e-02, 2.735e-02, 2.413e-02, -3.236e-02, -9.612e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(-5.849e-02, 1.530e-01, -6.767e-02, -1.392e-02, -3.430e-01, -1.851e-01, -1.013e-01, 2.465e-01, -1.715e-02, 4.970e-03, -1.850e-02, -4.214e-03, 1.889e-02, -5.787e-02, 7.154e-02, 9.237e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-2.084e-02, -2.484e-01, 5.767e-02, -2.550e-02, -9.126e-02, 4.292e-01, 1.983e-02, 2.979e-01, -3.807e-03, -3.367e-03, 1.835e-03, 8.694e-03, -9.074e-02, 4.820e-02, -2.886e-02, 5.975e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 3 entry point: each thread produces one texel of t0.
// It reads the 3x3 neighbourhood through the pass-local l0 macro, separates
// every tap into its non-negative and non-positive parts, and stores the value
// returned by f0 for those 18 vectors.
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` appear to be read by name inside the l0/O
	// macros, so these two locals must keep their names — confirm.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Nothing to do for threads at or beyond the size reported by GetInputSize().
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps: offsets run left to right, then top to bottom.
	const V4 raw0 = l0(-1.0, -1.0);
	const V4 raw1 = l0(0.0, -1.0);
	const V4 raw2 = l0(1.0, -1.0);
	const V4 raw3 = l0(-1.0, 0.0);
	const V4 raw4 = l0(0.0, 0.0);
	const V4 raw5 = l0(1.0, 0.0);
	const V4 raw6 = l0(-1.0, 1.0);
	const V4 raw7 = l0(0.0, 1.0);
	const V4 raw8 = l0(1.0, 1.0);

	// Non-positive part of every tap: -max(-v, 0).
	const V4 lo0 = -max(-raw0, 0.0);
	const V4 lo1 = -max(-raw1, 0.0);
	const V4 lo2 = -max(-raw2, 0.0);
	const V4 lo3 = -max(-raw3, 0.0);
	const V4 lo4 = -max(-raw4, 0.0);
	const V4 lo5 = -max(-raw5, 0.0);
	const V4 lo6 = -max(-raw6, 0.0);
	const V4 lo7 = -max(-raw7, 0.0);
	const V4 lo8 = -max(-raw8, 0.0);

	// Non-negative part of every tap: max(v, 0).
	const V4 hi0 = max(raw0, 0.0);
	const V4 hi1 = max(raw1, 0.0);
	const V4 hi2 = max(raw2, 0.0);
	const V4 hi3 = max(raw3, 0.0);
	const V4 hi4 = max(raw4, 0.0);
	const V4 hi5 = max(raw5, 0.0);
	const V4 hi6 = max(raw6, 0.0);
	const V4 hi7 = max(raw7, 0.0);
	const V4 hi8 = max(raw8, 0.0);

	t0[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// Tap fetch for this pass: reads t0 at tap offset (x, y) through the O macro
// (defined outside this pass) and converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Output kernel of the out-shuffle pass.
// s0_0..s0_8 and s1_0..s1_8 are the two sets of nine 3x3 taps prepared by the
// caller. Each tap is folded into the accumulator with its own 4x4 weight
// matrix via MulAdd (defined outside this block), starting from a constant
// bias, and the sum is passed through tanh.
// The accumulation order is kept as s0_0..s0_8 followed by s1_0..s1_8.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	const V4 bias = { -1.734e-03, -1.825e-03, -1.635e-03, -1.665e-03 };
	V4 acc = bias;

	// Weights for the s0 tap set.
	acc = MulAdd(s0_0, M4(-1.841e-04, -5.677e-02, 9.249e-03, -8.726e-03, 4.041e-02, -1.295e-01, 1.154e-01, 2.765e-02, 1.833e-01, -8.427e-02, 1.078e-01, -1.432e-01, 1.068e-01, -1.222e-01, 2.535e-02, 5.316e-02), acc);
	acc = MulAdd(s0_1, M4(-3.609e-03, 5.812e-02, -4.650e-02, -2.093e-02, -3.442e-02, 7.643e-02, 1.424e-02, 7.195e-02, 1.552e-01, -8.291e-01, 1.547e-01, 4.354e-01, -2.851e-02, 1.023e-01, -8.481e-03, -6.567e-02), acc);
	acc = MulAdd(s0_2, M4(1.724e-02, -1.165e-02, 1.007e-02, -3.008e-02, -9.814e-04, -2.007e-02, -5.905e-03, 6.714e-03, -1.736e-01, 2.035e-01, -1.333e-01, 1.250e-01, -9.118e-03, -4.989e-02, 2.142e-02, -4.038e-03), acc);
	acc = MulAdd(s0_3, M4(7.885e-02, -8.350e-02, -6.025e-03, -1.139e-01, -8.380e-02, -6.836e-02, -5.589e-01, -4.614e-01, -6.742e-01, 2.118e-01, -4.442e-01, 2.197e-01, -5.873e-02, 1.902e-01, -4.687e-01, -4.712e-01), acc);
	acc = MulAdd(s0_4, M4(-4.506e-01, 2.396e-01, -1.350e-02, 4.072e-01, 3.249e-01, 9.930e-02, 1.576e-02, -2.456e-01, 1.506e+00, 6.047e-02, 8.841e-01, -1.927e+00, -4.337e-01, -5.801e-01, 3.334e-01, 8.276e-02), acc);
	acc = MulAdd(s0_5, M4(5.049e-02, -1.870e-01, 7.413e-02, -2.569e-02, -2.152e-02, 1.139e-01, -3.874e-02, 1.634e-02, -1.325e-01, 4.002e-02, -1.874e-01, 1.204e-01, 2.267e-02, 1.380e-02, -1.055e-02, 5.504e-02), acc);
	acc = MulAdd(s0_6, M4(-2.855e-02, 1.255e-02, 3.941e-02, 4.466e-03, 4.814e-05, -9.003e-03, 1.231e-01, 5.676e-02, 5.020e-02, -5.407e-02, -1.951e-01, 4.240e-02, 3.525e-02, -1.021e-01, 4.517e-01, 2.399e-01), acc);
	acc = MulAdd(s0_7, M4(-5.781e-02, -4.964e-02, -3.981e-01, -1.716e-01, 3.430e-02, -1.644e-02, 2.352e-01, 1.938e-01, 1.266e-01, -1.061e-01, 7.754e-01, 5.337e-01, 2.664e-01, 3.669e-01, -1.113e+00, -1.742e-01), acc);
	acc = MulAdd(s0_8, M4(2.948e-02, 3.723e-02, 2.739e-02, -5.215e-02, -1.542e-02, -2.173e-02, -1.944e-02, 1.856e-02, -4.535e-02, 1.163e-02, -5.014e-02, 8.660e-02, 1.421e-01, 2.314e-01, 1.171e-02, -4.975e-01), acc);

	// Weights for the s1 tap set.
	acc = MulAdd(s1_0, M4(-4.408e-02, -3.573e-02, 3.842e-02, 2.571e-02, 2.872e-01, -4.960e-01, 2.569e-01, -6.254e-02, 2.158e-02, -6.452e-02, 7.495e-02, 1.997e-02, 4.094e-02, -9.741e-02, 3.542e-02, -8.115e-03), acc);
	acc = MulAdd(s1_1, M4(3.480e-02, 1.949e-04, 1.780e-02, 4.483e-02, -2.814e-01, 4.229e-01, -5.482e-02, 1.512e-02, -3.120e-02, 3.945e-02, 4.626e-02, 7.013e-02, -6.686e-03, 5.832e-02, -4.408e-02, -1.262e-02), acc);
	acc = MulAdd(s1_2, M4(-9.847e-03, 1.973e-03, 1.457e-02, 2.290e-02, 4.741e-02, 2.270e-02, 8.902e-04, 1.152e-02, -2.473e-02, -1.948e-02, -3.475e-03, 4.431e-02, 2.044e-02, 1.571e-04, 9.470e-03, -2.825e-02), acc);
	acc = MulAdd(s1_3, M4(5.918e-02, -1.939e-02, -4.628e-02, -7.774e-02, -3.040e-01, 8.634e-02, -5.254e-01, -6.906e-01, -1.218e-01, -6.178e-02, -3.115e-01, -2.697e-01, -2.402e-02, -2.149e-02, -3.878e-01, -3.453e-01), acc);
	acc = MulAdd(s1_4, M4(2.920e-01, 3.711e-01, -2.753e-01, -4.654e-02, 1.379e-01, 3.908e-01, -4.798e-01, 6.668e-01, 4.870e-01, -1.634e-01, -7.790e-02, -2.683e-01, -4.834e-01, -1.822e-02, -8.492e-03, 7.620e-02), acc);
	acc = MulAdd(s1_5, M4(-4.786e-02, 2.412e-02, 4.992e-02, -1.913e-01, 9.058e-02, -4.485e-02, 8.249e-02, -9.418e-02, 3.555e-02, 3.543e-01, -1.140e-01, -1.358e-01, 5.079e-02, -2.007e-01, 6.132e-02, -2.373e-03), acc);
	acc = MulAdd(s1_6, M4(6.553e-03, -7.804e-03, 8.569e-02, 4.875e-02, 5.085e-02, 1.728e-02, 6.949e-02, 1.313e-01, 1.825e-02, -5.557e-02, -7.548e-03, -5.534e-02, 7.059e-02, 4.382e-02, 2.807e-01, 1.919e-01), acc);
	acc = MulAdd(s1_7, M4(-1.071e-01, -3.709e-02, -4.757e-01, -1.943e-01, 8.182e-02, -3.334e-02, 4.170e-01, 6.716e-02, 1.563e-01, 1.382e-01, 7.441e-01, 4.082e-01, -9.101e-02, -3.943e-02, -5.142e-01, -1.910e-01), acc);
	acc = MulAdd(s1_8, M4(4.255e-03, 4.204e-02, 5.834e-02, -6.508e-02, -3.675e-02, 1.165e-02, -2.694e-02, -2.212e-02, -3.036e-02, -4.393e-02, 1.855e-03, 1.909e-01, 3.812e-02, 3.309e-02, 3.942e-02, -7.422e-02), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Pass 4 entry point (out-shuffle): each thread writes a 2x2 quad of OUTPUT.
// f0 yields one V4 per thread; each component is added to the luma of a
// colour sampled from INPUT with the SL sampler, chroma is kept as sampled, and
// the result is converted back to RGB with alpha 1.
// Component-to-pixel mapping inside the quad: x -> (0,0), y -> (1,0),
// z -> (0,1), w -> (1,1).
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` appear to be read by name inside the l0/O
	// macros, so these two locals must keep their names — confirm.
	const float2 pt = float2(GetInputPt());
	// Top-left OUTPUT pixel of this thread's quad (thread coordinate doubled).
	const uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}
	// Halving gxy gives the pixel index the 3x3 taps are centred on.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw taps: offsets run left to right, then top to bottom.
	const V4 raw0 = l0(-1.0, -1.0);
	const V4 raw1 = l0(0.0, -1.0);
	const V4 raw2 = l0(1.0, -1.0);
	const V4 raw3 = l0(-1.0, 0.0);
	const V4 raw4 = l0(0.0, 0.0);
	const V4 raw5 = l0(1.0, 0.0);
	const V4 raw6 = l0(-1.0, 1.0);
	const V4 raw7 = l0(0.0, 1.0);
	const V4 raw8 = l0(1.0, 1.0);

	// Non-positive part of every tap: -max(-v, 0).
	const V4 lo0 = -max(-raw0, 0.0);
	const V4 lo1 = -max(-raw1, 0.0);
	const V4 lo2 = -max(-raw2, 0.0);
	const V4 lo3 = -max(-raw3, 0.0);
	const V4 lo4 = -max(-raw4, 0.0);
	const V4 lo5 = -max(-raw5, 0.0);
	const V4 lo6 = -max(-raw6, 0.0);
	const V4 lo7 = -max(-raw7, 0.0);
	const V4 lo8 = -max(-raw8, 0.0);

	// Non-negative part of every tap: max(v, 0).
	const V4 hi0 = max(raw0, 0.0);
	const V4 hi1 = max(raw1, 0.0);
	const V4 hi2 = max(raw2, 0.0);
	const V4 hi3 = max(raw3, 0.0);
	const V4 hi4 = max(raw4, 0.0);
	const V4 hi5 = max(raw5, 0.0);
	const V4 hi6 = max(raw6, 0.0);
	const V4 hi7 = max(raw7, 0.0);
	const V4 hi8 = max(raw8, 0.0);

	const V4 delta = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 opt = float2(GetOutputPt());

	// The sampling position is walked around the quad in place, in the same
	// order as the writes, so every coordinate is derived from the previous one.

	// Quad pixel (0, 0): half an output step back from pos on both axes.
	pos -= 0.5f * opt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Quad pixel (1, 0).
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy + uint2(1, 0)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Quad pixel (1, 1).
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy + uint2(1, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Quad pixel (0, 1).
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy + uint2(0, 1)] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||||
412
src/Effects/CuNNy/CuNNy-3x4C-NVL-DN.hlsl
Normal file
412
src/Effects/CuNNy/CuNNy-3x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,412 @@
|
||||||
|
// CuNNy 3x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-DN-D04N03
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// Samples texture `t` at mip level 0 with the SP sampler at `pos + p * pt`.
// `pos` and `pt` are not macro parameters: they must be variables in scope at
// every place the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// Short alias for the 4-component vector type MF4 (declared outside this file).
#define V4 MF4
|
||||||
|
// Short alias for the 4x4 matrix type MF4x4 (declared outside this file).
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap fetch for pass 1: samples INPUT at tap offset (x, y) through O and
// reduces its RGB to one scalar, a fixed 3-weight dot product plus a constant.
#define l0(x, y) (dot(MF3(-2.683e-01, -5.217e-01, -1.382e-01), O(INPUT, float2(x, y)).rgb) + MF(7.973e-01))
|
||||||
|
|
||||||
|
// Pass 1 ("in") kernel: turns nine scalar taps into one 4-component vector.
// Starting from a constant bias, every tap s0_k scales its own V4 weight and
// is accumulated with mad, in the order s0_0..s0_8.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	const V4 bias = { 3.156e-02, 7.379e-02, 1.078e-02, -5.510e-04 };
	V4 acc = bias;

	acc = mad(s0_0, V4(1.850e-01, -2.860e-02, -5.321e-01, 2.390e-03), acc);
	acc = mad(s0_1, V4(-4.299e-01, -2.946e-02, -1.180e-01, -5.652e-02), acc);
	acc = mad(s0_2, V4(-4.798e-01, -2.276e-02, 3.201e-02, 4.870e-02), acc);
	acc = mad(s0_3, V4(2.783e-01, -2.262e-03, -1.864e-01, 1.793e-01), acc);
	acc = mad(s0_4, V4(9.435e-04, 8.115e-01, 7.806e-01, -7.793e-01), acc);
	acc = mad(s0_5, V4(2.180e-01, -2.564e-05, 2.774e-03, -7.015e-02), acc);
	acc = mad(s0_6, V4(1.479e-03, -4.675e-02, 3.323e-02, 3.392e-01), acc);
	acc = mad(s0_7, V4(1.203e-01, 1.509e-02, 5.239e-02, 3.194e-01), acc);
	acc = mad(s0_8, V4(7.680e-02, -4.310e-02, -7.203e-02, 1.255e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 entry point ("in"): each thread produces one texel of t0.
// Nine scalar taps are read around the thread's pixel with the pass-local l0
// macro and handed to f0; no activation split happens in this pass.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O macro, so these two locals
	// must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Nothing to do for threads at or beyond the size reported by GetInputSize().
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Row y = -1.
	const MF topLeft = l0(-1.0, -1.0);
	const MF topMid = l0(0.0, -1.0);
	const MF topRight = l0(1.0, -1.0);
	// Row y = 0.
	const MF midLeft = l0(-1.0, 0.0);
	const MF centre = l0(0.0, 0.0);
	const MF midRight = l0(1.0, 0.0);
	// Row y = +1.
	const MF botLeft = l0(-1.0, 1.0);
	const MF botMid = l0(0.0, 1.0);
	const MF botRight = l0(1.0, 1.0);

	t0[gxy] = f0(topLeft, topMid, topRight, midLeft, centre, midRight, botLeft, botMid, botRight);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap fetch for pass 2: reads t0 at tap offset (x, y) through O and converts
// the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 ("conv1") kernel.
// s0_0..s0_8 and s1_0..s1_8 are the two sets of nine 3x3 taps prepared by the
// caller. Starting from a constant bias, each tap is folded into the
// accumulator with its own 4x4 weight matrix through MulAdd (pulled in by the
// effect's `//!USE MulAdd` directive), s0 taps first and s1 taps second.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	const V4 bias = { 1.427e-02, -1.982e-02, 4.114e-03, -2.883e-02 };
	V4 acc = bias;

	// Weights for the s0 tap set.
	acc = MulAdd(s0_0, M4(1.949e-01, -1.247e-01, -7.307e-02, 8.783e-02, -4.773e-02, 6.012e-02, 8.043e-02, -8.489e-02, 6.760e-02, -7.809e-02, -4.745e-02, -1.304e-02, -1.402e-01, -1.248e-01, 3.334e-01, -1.498e-01), acc);
	acc = MulAdd(s0_1, M4(7.053e-02, 9.895e-02, 1.655e-01, 2.251e-01, 3.511e-02, -1.010e-01, -2.736e-01, 1.174e-01, -2.551e-01, 1.100e-01, 1.518e-01, -4.343e-02, -9.293e-01, 5.327e-01, -2.723e-01, 4.006e-01), acc);
	acc = MulAdd(s0_2, M4(-2.390e-02, 8.154e-03, -2.332e-02, -3.708e-02, 2.814e-02, 5.506e-02, -2.627e-01, -8.081e-02, -1.062e-01, -6.819e-02, -9.498e-02, -2.749e-01, -2.457e-01, 6.868e-01, 6.527e-03, 7.676e-01), acc);
	acc = MulAdd(s0_3, M4(2.704e-01, 4.055e-02, -4.756e-01, 2.506e-01, -9.498e-02, 5.838e-02, 1.733e-01, 3.420e-03, -7.051e-02, -8.233e-02, -3.006e-01, 6.824e-02, -1.308e-01, 1.196e-01, 2.560e-01, 8.304e-02), acc);
	acc = MulAdd(s0_4, M4(4.190e-01, -1.207e-01, 2.708e-01, -6.375e-01, 1.740e-01, 1.955e-03, -1.816e-01, -7.933e-02, -9.308e-01, 1.333e-01, -1.335e-01, -1.401e-01, 3.447e-01, 3.389e-01, 6.660e-01, -3.387e-01), acc);
	acc = MulAdd(s0_5, M4(7.310e-02, 1.403e-02, 8.114e-02, 7.400e-02, -2.552e-02, -1.607e-01, -1.208e-01, -3.943e-02, -2.743e-02, -7.229e-03, -1.749e-03, 3.062e-01, 1.429e-01, 8.105e-01, 3.562e-01, 4.580e-01), acc);
	acc = MulAdd(s0_6, M4(2.115e-01, -1.686e-01, -1.948e-01, -1.191e-01, -5.798e-02, 3.493e-02, 8.264e-02, 1.579e-01, -1.081e-01, -1.775e-01, -8.196e-02, -2.085e-01, 6.791e-02, 1.652e-02, -4.933e-03, 2.833e-02), acc);
	acc = MulAdd(s0_7, M4(-2.160e-01, -3.858e-01, -8.407e-01, -1.091e-01, 8.415e-03, 8.626e-02, 2.340e-01, 9.177e-02, -4.697e-01, -6.623e-02, -5.176e-01, 6.762e-02, -3.437e-03, 6.570e-02, 7.630e-02, 8.988e-02), acc);
	acc = MulAdd(s0_8, M4(6.527e-02, -6.320e-02, 1.192e-02, -1.196e-01, -1.605e-02, -9.294e-03, 1.955e-01, -2.356e-02, -3.582e-02, 1.377e-02, 9.253e-02, -2.362e-02, 3.578e-02, 1.822e-01, 3.329e-01, 1.489e-01), acc);

	// Weights for the s1 tap set.
	acc = MulAdd(s1_0, M4(1.154e-01, -1.822e-01, -2.122e-01, 3.031e-02, 6.550e-01, -4.855e-02, 6.554e-02, 4.432e-02, 1.671e-02, -4.477e-02, -9.428e-03, 4.413e-03, -3.185e-02, -1.529e-01, -1.222e-01, 6.523e-02), acc);
	acc = MulAdd(s1_1, M4(-4.920e-02, -1.697e-02, 4.141e-02, 1.997e-01, 6.972e-01, -5.157e-01, 2.031e-01, 2.829e-02, -5.005e-02, 2.335e-01, 2.985e-01, 6.871e-02, -5.232e-01, 2.146e-02, -1.418e+00, 2.193e-01), acc);
	acc = MulAdd(s1_2, M4(-6.472e-02, 2.595e-02, -2.610e-02, -2.279e-02, 4.165e-01, -7.745e-01, 1.261e-01, -3.845e-01, 3.279e-02, 2.445e-02, 1.796e-01, -2.581e-01, -3.838e-01, 6.280e-02, -4.893e-01, -1.475e-01), acc);
	acc = MulAdd(s1_3, M4(9.330e-02, 1.742e-01, -1.685e-01, 2.376e-02, -9.586e-01, -1.236e+00, -7.271e-01, -7.674e-01, 2.500e-01, -3.709e-02, -1.303e-01, 1.490e-01, -2.746e-01, -1.376e-01, -2.321e-02, -1.967e-02), acc);
	acc = MulAdd(s1_4, M4(3.660e-01, 4.772e-02, 5.524e-01, -2.804e-01, -2.756e+00, -1.336e+00, 2.038e-01, 2.593e+00, 2.156e-01, 3.281e-01, 3.152e-01, 8.064e-01, 3.970e-01, -1.379e-01, -7.518e-02, -2.723e-01), acc);
	acc = MulAdd(s1_5, M4(5.214e-03, 1.695e-02, 1.024e-01, 1.333e-01, -2.250e-01, -1.298e+00, 4.673e-01, 1.317e+00, 3.036e-01, -1.273e-01, 2.900e-01, 2.249e-02, -1.870e-01, -1.124e-01, -5.879e-01, 6.314e-02), acc);
	acc = MulAdd(s1_6, M4(-8.225e-02, -1.149e-01, 1.598e-04, -3.662e-01, -8.572e-02, -8.909e-01, 9.891e-02, 1.818e-01, 1.715e-01, -2.348e-01, 1.178e-01, -6.289e-02, 1.522e-02, 1.973e-02, 3.707e-02, 2.911e-02), acc);
	acc = MulAdd(s1_7, M4(-6.380e-02, 8.661e-02, -2.666e-01, 9.586e-02, -1.257e+00, -2.231e+00, -1.232e+00, 5.642e-01, 5.730e-02, -3.294e-01, -1.151e-01, 2.382e-01, 4.529e-02, 4.927e-02, 9.893e-02, 8.365e-02), acc);
	acc = MulAdd(s1_8, M4(1.906e-02, -8.920e-02, 8.931e-02, -6.752e-02, -3.680e-01, -1.282e+00, -1.388e-01, -7.545e-02, 6.262e-02, -1.695e-01, 2.278e-01, -3.066e-01, -7.412e-02, 1.145e-02, 4.667e-02, -4.205e-04), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 entry point ("conv1"): each thread produces one texel of t1.
// The 3x3 neighbourhood is read from t0 through l0, every tap is separated
// into its non-negative and non-positive parts, and f0 combines the 18 vectors.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O macro, so these two locals
	// must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Nothing to do for threads at or beyond the size reported by GetInputSize().
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps: offsets run left to right, then top to bottom.
	const V4 raw0 = l0(-1.0, -1.0);
	const V4 raw1 = l0(0.0, -1.0);
	const V4 raw2 = l0(1.0, -1.0);
	const V4 raw3 = l0(-1.0, 0.0);
	const V4 raw4 = l0(0.0, 0.0);
	const V4 raw5 = l0(1.0, 0.0);
	const V4 raw6 = l0(-1.0, 1.0);
	const V4 raw7 = l0(0.0, 1.0);
	const V4 raw8 = l0(1.0, 1.0);

	// Non-positive part of every tap: -max(-v, 0).
	const V4 lo0 = -max(-raw0, 0.0);
	const V4 lo1 = -max(-raw1, 0.0);
	const V4 lo2 = -max(-raw2, 0.0);
	const V4 lo3 = -max(-raw3, 0.0);
	const V4 lo4 = -max(-raw4, 0.0);
	const V4 lo5 = -max(-raw5, 0.0);
	const V4 lo6 = -max(-raw6, 0.0);
	const V4 lo7 = -max(-raw7, 0.0);
	const V4 lo8 = -max(-raw8, 0.0);

	// Non-negative part of every tap: max(v, 0).
	const V4 hi0 = max(raw0, 0.0);
	const V4 hi1 = max(raw1, 0.0);
	const V4 hi2 = max(raw2, 0.0);
	const V4 hi3 = max(raw3, 0.0);
	const V4 hi4 = max(raw4, 0.0);
	const V4 hi5 = max(raw5, 0.0);
	const V4 hi6 = max(raw6, 0.0);
	const V4 hi7 = max(raw7, 0.0);
	const V4 hi8 = max(raw8, 0.0);

	t1[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap fetch for pass 3: reads t1 at tap offset (x, y) through O and converts
// the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 3 ("conv2") kernel.
// s0_0..s0_8 and s1_0..s1_8 are the two sets of nine 3x3 taps prepared by the
// caller. Starting from a constant bias, each tap is folded into the
// accumulator with its own 4x4 weight matrix through MulAdd (pulled in by the
// effect's `//!USE MulAdd` directive), s0 taps first and s1 taps second.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	const V4 bias = { -1.292e-02, 8.156e-04, -2.055e-03, -3.100e-03 };
	V4 acc = bias;

	// Weights for the s0 tap set.
	acc = MulAdd(s0_0, M4(2.965e-01, -1.919e-01, 9.202e-02, 8.775e-03, -4.948e-02, 1.061e-01, -3.754e-02, -1.900e-01, -2.114e-01, 1.267e-01, 1.989e-02, 2.570e-02, 4.634e-03, -2.718e-01, 2.171e-01, 1.512e-01), acc);
	acc = MulAdd(s0_1, M4(-5.527e-01, -4.825e-01, 4.325e-01, 4.447e-01, -6.577e-02, 5.161e-01, 3.286e-02, -3.800e-01, 2.625e-02, 3.835e-01, -7.794e-02, -5.489e-02, -2.647e-01, -4.952e-01, 1.587e-01, 1.471e-01), acc);
	acc = MulAdd(s0_2, M4(-3.687e-01, -1.096e-01, 1.849e-01, -6.915e-02, 2.257e-01, 2.760e-01, -8.875e-02, -8.871e-02, -8.394e-02, -6.714e-02, 5.322e-03, -3.252e-01, -7.885e-02, -2.723e-01, 6.149e-02, 2.998e-01), acc);
	acc = MulAdd(s0_3, M4(1.606e-01, -1.199e-01, 3.573e-01, 2.833e-02, 6.514e-03, -2.242e-02, -6.231e-02, 6.702e-02, -8.717e-02, -2.227e-01, -1.626e-01, 5.313e-02, -1.411e-01, -2.445e-02, 1.194e-01, -1.101e-01), acc);
	acc = MulAdd(s0_4, M4(-1.127e+00, 1.823e-01, 1.358e-01, -1.618e-01, -4.171e-04, -7.771e-02, 2.147e-01, 6.493e-01, 4.989e-01, 3.955e-01, -1.017e-01, -2.861e-01, 3.878e-01, -6.653e-01, -4.968e-01, -5.063e-01), acc);
	acc = MulAdd(s0_5, M4(-2.270e-01, -3.965e-01, -2.794e-02, 1.487e-01, -2.667e-01, -1.410e-02, 1.475e-01, -4.992e-01, -1.071e-01, 2.096e-01, 1.159e-01, -6.073e-02, -7.157e-02, -2.446e-01, -4.807e-02, 1.968e-01), acc);
	acc = MulAdd(s0_6, M4(8.199e-02, 8.336e-02, -3.090e-02, -1.287e-02, -6.954e-02, -7.544e-02, 1.272e-01, 7.930e-02, -3.647e-02, -2.685e-02, -4.235e-02, 3.214e-02, -4.526e-02, 1.479e-01, -4.963e-02, -3.035e-02), acc);
	acc = MulAdd(s0_7, M4(-2.012e-02, -1.497e-02, -2.952e-01, -6.026e-02, 2.135e-03, 2.979e-02, -2.713e-02, 7.951e-03, -8.069e-02, -2.374e-01, 1.865e-01, 1.048e-01, -9.076e-02, 6.683e-02, 9.576e-02, -2.432e-02), acc);
	acc = MulAdd(s0_8, M4(1.455e-01, 2.613e-01, -1.616e-01, -3.564e-01, 1.229e-01, -3.778e-02, 3.316e-02, 5.927e-02, -1.831e-01, -1.388e-01, 5.986e-02, 2.083e-02, -1.368e-03, 2.394e-01, -1.623e-01, -2.768e-02), acc);

	// Weights for the s1 tap set.
	acc = MulAdd(s1_0, M4(7.711e-03, -6.696e-04, -3.229e-02, 1.549e-02, -1.596e-01, 2.068e-01, -6.162e-02, -9.571e-02, -1.500e-01, 1.743e-01, 2.746e-02, -5.845e-02, -7.649e-03, -4.265e-03, 4.154e-03, 3.950e-03), acc);
	acc = MulAdd(s1_1, M4(2.764e-01, -4.505e-02, 4.280e-02, 6.044e-02, 3.396e-02, 2.750e-01, -1.910e-01, -2.153e-01, 9.633e-02, -2.194e-02, -2.131e-01, -1.181e-01, -1.343e-01, 6.123e-02, 1.904e-02, -6.568e-02), acc);
	acc = MulAdd(s1_2, M4(-3.643e-01, -1.709e-02, 1.528e-01, -1.405e-01, 3.307e-01, -1.979e-03, -1.819e-01, 7.635e-02, 1.266e-01, 2.162e-01, -7.492e-02, -9.075e-02, 4.120e-02, 1.521e-01, -2.790e-03, -4.330e-02), acc);
	acc = MulAdd(s1_3, M4(1.913e-02, -5.373e-02, 5.748e-02, -1.443e-02, -2.776e-01, -1.162e-01, -1.994e-01, 1.430e-01, 9.058e-02, -3.720e-02, -3.585e-02, -8.516e-02, -2.228e-02, 7.507e-02, -9.620e-02, -1.013e-01), acc);
	acc = MulAdd(s1_4, M4(-3.592e-01, 1.415e-01, 1.018e+00, -1.555e-01, 5.378e-01, 8.818e-02, 2.190e-01, 1.997e-01, -1.128e-01, 3.331e-02, -1.410e-01, 2.844e-01, 4.756e-01, -5.850e-02, -3.757e-01, -1.716e-01), acc);
	acc = MulAdd(s1_5, M4(2.636e-02, -3.596e-01, -3.280e-01, 2.027e-01, 3.000e-01, -2.297e-01, 4.282e-02, 1.776e-01, 5.222e-02, 1.751e-01, 4.529e-02, -8.347e-02, -3.409e-01, -2.640e-01, 1.753e-01, -5.672e-01), acc);
	acc = MulAdd(s1_6, M4(-1.699e-02, 4.941e-02, -2.642e-02, -1.406e-04, -1.655e-01, -1.464e-02, -4.353e-02, 1.946e-01, 6.067e-02, -1.429e-01, 1.170e-01, -4.644e-02, -6.567e-02, -2.264e-02, 6.666e-02, 9.009e-02), acc);
	acc = MulAdd(s1_7, M4(7.805e-02, 2.173e-02, -3.276e-01, 2.004e-03, -7.789e-02, -1.466e-02, -1.560e-01, -1.126e-01, -3.823e-02, -2.446e-03, 1.465e-01, -2.744e-01, -2.129e-01, -2.141e-02, 4.456e-01, 1.240e-01), acc);
	acc = MulAdd(s1_8, M4(1.315e-02, 2.686e-01, -1.987e-01, -2.093e-01, 3.184e-02, -8.723e-02, 3.012e-01, 3.580e-01, 1.198e-02, -2.655e-01, 1.455e-01, 7.602e-02, -4.605e-02, 3.276e-01, -2.036e-01, -2.590e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 3 entry point ("conv2"): each thread produces one texel of t0.
// The 3x3 neighbourhood is read from t1 through l0, every tap is separated
// into its non-negative and non-positive parts, and f0 combines the 18 vectors.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O macro, so these two locals
	// must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Nothing to do for threads at or beyond the size reported by GetInputSize().
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps: offsets run left to right, then top to bottom.
	const V4 raw0 = l0(-1.0, -1.0);
	const V4 raw1 = l0(0.0, -1.0);
	const V4 raw2 = l0(1.0, -1.0);
	const V4 raw3 = l0(-1.0, 0.0);
	const V4 raw4 = l0(0.0, 0.0);
	const V4 raw5 = l0(1.0, 0.0);
	const V4 raw6 = l0(-1.0, 1.0);
	const V4 raw7 = l0(0.0, 1.0);
	const V4 raw8 = l0(1.0, 1.0);

	// Non-positive part of every tap: -max(-v, 0).
	const V4 lo0 = -max(-raw0, 0.0);
	const V4 lo1 = -max(-raw1, 0.0);
	const V4 lo2 = -max(-raw2, 0.0);
	const V4 lo3 = -max(-raw3, 0.0);
	const V4 lo4 = -max(-raw4, 0.0);
	const V4 lo5 = -max(-raw5, 0.0);
	const V4 lo6 = -max(-raw6, 0.0);
	const V4 lo7 = -max(-raw7, 0.0);
	const V4 lo8 = -max(-raw8, 0.0);

	// Non-negative part of every tap: max(v, 0).
	const V4 hi0 = max(raw0, 0.0);
	const V4 hi1 = max(raw1, 0.0);
	const V4 hi2 = max(raw2, 0.0);
	const V4 hi3 = max(raw3, 0.0);
	const V4 hi4 = max(raw4, 0.0);
	const V4 hi5 = max(raw5, 0.0);
	const V4 hi6 = max(raw6, 0.0);
	const V4 hi7 = max(raw7, 0.0);
	const V4 hi8 = max(raw8, 0.0);

	t0[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap fetch for pass 4: reads t0 at tap offset (x, y) through O and converts
// the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 4 ("conv3") kernel.
// s0_0..s0_8 and s1_0..s1_8 are the two sets of nine 3x3 taps prepared by the
// caller. Starting from a constant bias, each tap is folded into the
// accumulator with its own 4x4 weight matrix through MulAdd (pulled in by the
// effect's `//!USE MulAdd` directive), s0 taps first and s1 taps second.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	const V4 bias = { 3.045e-03, 3.707e-03, -6.011e-03, -5.162e-03 };
	V4 acc = bias;

	// Weights for the s0 tap set.
	acc = MulAdd(s0_0, M4(2.151e-02, -4.754e-02, 3.454e-02, -1.338e-03, -4.337e-02, 4.608e-02, -1.116e-01, -2.296e-02, -2.839e-02, -3.878e-01, -2.317e-02, 5.774e-02, 4.317e-03, 6.680e-02, 6.325e-02, -1.449e-01), acc);
	acc = MulAdd(s0_1, M4(-1.173e-01, -8.942e-02, -1.017e-01, 6.496e-02, 5.558e-02, 2.788e-02, 2.184e-02, -2.837e-03, -1.057e-01, -2.075e-01, -3.255e-02, -1.297e-02, -2.643e-02, -1.695e-02, -9.425e-02, 3.942e-02), acc);
	acc = MulAdd(s0_2, M4(-1.773e-02, -4.118e-02, -2.141e-02, 4.282e-02, 4.234e-02, -1.221e-02, -3.375e-03, 4.469e-02, -2.586e-01, -1.112e-01, -7.688e-02, 3.426e-02, 8.170e-02, -2.355e-02, -3.737e-02, 3.004e-02), acc);
	acc = MulAdd(s0_3, M4(2.192e-01, 1.955e+00, 2.012e-01, -2.598e-02, -7.453e-02, 5.510e-02, -1.517e-01, -2.571e-01, -2.182e-02, -2.345e-02, -5.767e-02, -5.534e-02, -1.996e-02, 2.329e-01, 4.447e-04, -1.111e-01), acc);
	acc = MulAdd(s0_4, M4(3.476e-01, -4.368e-01, -1.180e-01, 5.371e-01, 5.294e-01, 1.509e-01, 2.456e-01, -7.875e-02, 2.055e-01, 9.732e-02, 1.285e-01, 5.178e-01, 3.256e-01, -2.842e-01, 4.421e-02, 3.426e-01), acc);
	acc = MulAdd(s0_5, M4(6.119e-01, -1.393e-01, -1.144e-02, 2.438e-01, -5.126e-02, -1.049e-01, -7.847e-02, 9.942e-02, 5.371e-01, 9.985e-02, 9.193e-02, -3.067e-02, -1.962e-01, -4.272e-02, -7.821e-03, 2.557e-02), acc);
	acc = MulAdd(s0_6, M4(1.224e-02, -5.098e-01, 3.052e-01, 5.332e-01, 2.249e-01, 4.201e-02, 5.423e-01, 1.106e-01, -1.056e-02, -4.091e-03, -1.267e-02, -5.280e-02, 1.898e-02, 9.430e-03, 1.470e-02, 7.235e-02), acc);
	acc = MulAdd(s0_7, M4(-4.342e-01, 2.385e-01, -3.834e-02, -7.654e-02, -9.043e-01, -3.139e-01, -1.511e-01, 3.800e-01, -8.848e-02, -3.911e-02, -7.025e-03, -1.196e-02, -3.322e-03, -1.455e-01, 2.084e-02, 1.106e-01), acc);
	acc = MulAdd(s0_8, M4(1.382e-01, -1.894e-01, -8.814e-02, 1.373e-01, 1.362e-01, -1.298e-01, -1.007e-01, 1.166e-01, -1.553e-02, 8.530e-02, 2.744e-02, -1.083e-01, -5.606e-02, 5.965e-02, 1.406e-02, -4.496e-02), acc);

	// Weights for the s1 tap set.
	acc = MulAdd(s1_0, M4(-4.828e-03, -1.035e-01, -5.021e-02, 1.972e-02, -9.942e-03, -3.057e-01, -7.373e-03, 4.274e-02, -3.475e-03, 4.653e-02, 9.115e-03, -5.794e-02, 1.170e-02, 1.322e-01, 1.195e-01, -2.535e-02), acc);
	acc = MulAdd(s1_1, M4(-5.424e-02, -1.541e-01, -9.945e-02, 8.862e-02, -1.198e-01, -3.591e-05, 4.305e-02, -1.079e-01, 1.605e-02, -3.377e-02, -5.398e-02, 1.201e-02, 3.432e-02, 1.090e-02, 8.871e-02, 3.186e-02), acc);
	acc = MulAdd(s1_2, M4(-1.108e-01, -3.481e-02, -1.616e-02, -4.136e-03, -3.382e-02, 1.836e-02, -3.071e-02, -3.186e-02, -1.014e-01, -1.412e-01, -7.790e-02, 9.763e-02, -1.624e-02, -2.520e-02, -2.152e-02, 2.524e-02), acc);
	acc = MulAdd(s1_3, M4(3.337e-03, -1.439e-02, 2.317e-03, 2.097e-01, 5.091e-03, 4.138e-02, -5.988e-02, -2.348e-02, -5.626e-03, 1.695e-02, 2.371e-02, -1.652e-02, 8.541e-02, -1.851e-01, 1.130e+00, -1.181e-01), acc);
	acc = MulAdd(s1_4, M4(1.184e-01, -3.385e-02, 2.659e-02, 3.233e-01, 2.333e-01, 1.694e-01, 1.915e-01, 1.162e-01, 4.309e-02, -3.793e-02, 1.412e-01, -1.345e-02, -6.074e-01, -2.408e-01, -1.306e-01, 1.033e-01), acc);
	acc = MulAdd(s1_5, M4(3.452e-01, 1.401e-01, 3.650e-02, -4.950e-02, 1.755e-01, -1.210e-01, -1.041e-02, 1.281e-01, 4.262e-01, 2.166e-02, 3.851e-02, 1.295e-01, -1.910e-01, -2.029e-02, -2.151e-02, -1.537e-02), acc);
	acc = MulAdd(s1_6, M4(4.989e-03, -5.730e-02, 5.803e-02, 2.946e-02, 1.825e-02, 2.660e-02, -4.900e-03, 3.848e-03, 1.078e-02, 1.823e-02, -4.751e-03, 4.219e-02, -1.024e-01, 7.721e-02, -6.709e-01, 8.423e-02), acc);
	acc = MulAdd(s1_7, M4(-1.567e-01, 4.125e-02, -2.721e-02, -1.831e-01, 9.470e-03, -1.205e-01, 1.793e-02, 1.160e-01, -4.874e-02, -4.902e-02, -1.479e-01, 7.102e-02, 6.699e-01, -1.383e-01, 1.314e-01, 2.999e-01), acc);
	acc = MulAdd(s1_8, M4(-2.625e-01, -9.735e-02, -6.038e-02, 3.588e-03, 2.247e-02, 4.993e-02, 1.171e-02, -2.071e-02, 2.066e-01, 2.852e-01, -5.781e-02, -3.231e-01, 6.922e-02, 8.960e-02, 9.107e-02, -2.880e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 4 entry point ("conv3"): each thread produces one texel of t1.
// The 3x3 neighbourhood is read from t0 through l0, every tap is separated
// into its non-negative and non-positive parts, and f0 combines the 18 vectors.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O macro, so these two locals
	// must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Nothing to do for threads at or beyond the size reported by GetInputSize().
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps: offsets run left to right, then top to bottom.
	const V4 raw0 = l0(-1.0, -1.0);
	const V4 raw1 = l0(0.0, -1.0);
	const V4 raw2 = l0(1.0, -1.0);
	const V4 raw3 = l0(-1.0, 0.0);
	const V4 raw4 = l0(0.0, 0.0);
	const V4 raw5 = l0(1.0, 0.0);
	const V4 raw6 = l0(-1.0, 1.0);
	const V4 raw7 = l0(0.0, 1.0);
	const V4 raw8 = l0(1.0, 1.0);

	// Non-positive part of every tap: -max(-v, 0).
	const V4 lo0 = -max(-raw0, 0.0);
	const V4 lo1 = -max(-raw1, 0.0);
	const V4 lo2 = -max(-raw2, 0.0);
	const V4 lo3 = -max(-raw3, 0.0);
	const V4 lo4 = -max(-raw4, 0.0);
	const V4 lo5 = -max(-raw5, 0.0);
	const V4 lo6 = -max(-raw6, 0.0);
	const V4 lo7 = -max(-raw7, 0.0);
	const V4 lo8 = -max(-raw8, 0.0);

	// Non-negative part of every tap: max(v, 0).
	const V4 hi0 = max(raw0, 0.0);
	const V4 hi1 = max(raw1, 0.0);
	const V4 hi2 = max(raw2, 0.0);
	const V4 hi3 = max(raw3, 0.0);
	const V4 hi4 = max(raw4, 0.0);
	const V4 hi5 = max(raw5, 0.0);
	const V4 hi6 = max(raw6, 0.0);
	const V4 hi7 = max(raw7, 0.0);
	const V4 hi8 = max(raw8, 0.0);

	t1[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t1
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 5 kernel. Starts from a constant 4-component vector, folds in the
// eighteen inputs one at a time through MulAdd (each with its own 16-value M4
// constant) and returns tanh of the accumulated vector.
// MulAdd is defined outside this block; presumably a vector-by-matrix
// multiply-accumulate -- confirm against its definition.
// Pass5 supplies max(x, 0) in s0_* and -max(-x, 0) in s1_* for the same nine taps.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Constant starting value of the accumulator.
	V4 acc = { 1.708e-05, 2.435e-04, 1.267e-03, 1.926e-03 };

	// Contributions of the s0_* inputs.
	acc = MulAdd(s0_0, M4(
		1.116e-01, 1.402e-01, 1.439e-02, 5.091e-02,
		-1.526e-02, -2.562e-02, -1.193e-02, -1.365e-02,
		-6.156e-02, -3.463e-02, 2.155e-02, -2.192e-02,
		-2.937e-02, -1.072e-01, -4.538e-02, -3.302e-02), acc);
	acc = MulAdd(s0_1, M4(
		-1.192e-02, -1.724e-02, 9.899e-03, -5.861e-03,
		-1.552e-02, 2.422e-02, 4.929e-03, 7.339e-03,
		4.700e-02, 1.993e-01, -6.323e-02, 5.778e-02,
		1.499e-01, 3.916e-01, -4.578e-02, -2.026e-02), acc);
	acc = MulAdd(s0_2, M4(
		5.431e-03, 1.916e-03, -2.064e-03, -6.545e-04,
		-1.731e-02, -8.081e-02, 1.391e-02, -7.036e-03,
		7.739e-02, -1.588e-01, 2.970e-02, 3.357e-02,
		3.869e-02, -7.824e-02, 1.813e-02, -6.252e-02), acc);
	acc = MulAdd(s0_3, M4(
		5.283e-01, 8.076e-02, 3.430e-01, 2.332e-01,
		-3.540e-02, 1.903e-02, -1.354e-02, -1.415e-02,
		-1.644e-01, -1.319e-02, -9.781e-02, -3.256e-02,
		2.768e-02, -3.914e-02, 1.596e-01, -1.067e-01), acc);
	acc = MulAdd(s0_4, M4(
		-1.638e-02, 4.385e-01, -1.479e-01, -1.789e-02,
		-1.399e-01, -5.884e-02, -7.306e-02, -2.036e-03,
		5.196e-01, -1.849e-01, 8.771e-01, 3.595e-01,
		-7.094e-01, 2.485e-02, -3.977e-02, 7.246e-01), acc);
	acc = MulAdd(s0_5, M4(
		-1.647e-03, -6.027e-03, -3.787e-03, -1.975e-02,
		-4.810e-02, -4.557e-01, 4.921e-02, -1.313e-01,
		-2.044e-02, 3.533e-01, -7.591e-02, 1.249e-02,
		2.648e-02, -5.215e-01, 1.204e-01, -2.254e-01), acc);
	acc = MulAdd(s0_6, M4(
		-2.852e-02, -1.630e-02, 1.249e-01, -1.758e-02,
		4.285e-02, 1.425e-02, -1.595e-02, 2.618e-02,
		4.460e-03, 1.266e-02, -3.914e-02, 1.111e-02,
		5.378e-02, 2.199e-02, 2.561e-03, 2.125e-02), acc);
	acc = MulAdd(s0_7, M4(
		-6.567e-02, -4.333e-02, -4.153e-03, 1.692e-01,
		5.376e-02, 5.736e-02, -1.860e-01, -9.094e-02,
		3.357e-02, -3.186e-02, 1.244e-01, -9.606e-02,
		6.227e-02, 6.827e-02, -2.086e-01, -6.625e-02), acc);
	acc = MulAdd(s0_8, M4(
		4.553e-05, -3.116e-02, 1.023e-02, 2.322e-02,
		8.623e-02, 1.125e-01, 2.802e-02, -2.768e-01,
		-1.003e-01, -2.143e-02, -2.413e-02, 1.460e-01,
		5.421e-02, 5.798e-02, 3.478e-03, -1.421e-01), acc);

	// Contributions of the s1_* inputs.
	acc = MulAdd(s1_0, M4(
		2.165e-01, 1.123e-01, -3.653e-02, -6.070e-03,
		-1.021e-01, -6.901e-04, 6.256e-03, -3.182e-03,
		-4.285e-02, -6.763e-02, 2.278e-02, -1.860e-02,
		-2.689e-02, 2.567e-02, 2.634e-03, 3.600e-02), acc);
	acc = MulAdd(s1_1, M4(
		-1.159e-01, -1.198e-01, 2.991e-02, -6.143e-02,
		1.038e-01, -5.076e-02, -1.785e-02, -3.611e-02,
		6.860e-02, 9.302e-02, -1.125e-02, 3.332e-02,
		6.457e-02, -3.919e-02, 4.158e-03, -1.201e-02), acc);
	acc = MulAdd(s1_2, M4(
		-6.554e-03, 3.359e-02, -2.003e-02, -2.227e-04,
		3.354e-02, -3.700e-02, -9.588e-03, -3.740e-02,
		-1.336e-02, -2.556e-04, -4.733e-03, -1.636e-02,
		1.127e-02, 1.421e-02, -1.019e-02, -2.731e-02), acc);
	acc = MulAdd(s1_3, M4(
		3.642e-01, -3.756e-03, 6.584e-01, 1.773e-01,
		-1.638e-02, 1.109e-02, -7.427e-02, -1.572e-02,
		-1.869e-01, -3.059e-02, -8.088e-02, -5.092e-02,
		-5.794e-02, -4.431e-02, -7.912e-02, -9.767e-02), acc);
	acc = MulAdd(s1_4, M4(
		-3.255e-02, 3.115e-01, -2.109e-01, 2.804e-01,
		-6.504e-01, -1.342e-02, 1.355e-01, 3.623e-01,
		5.142e-01, 2.124e-01, 1.866e-01, 2.268e-01,
		-2.470e-02, 1.629e-01, 1.163e-01, 1.663e-01), acc);
	acc = MulAdd(s1_5, M4(
		-1.093e-02, -1.640e-04, -3.502e-02, -3.746e-02,
		1.836e-02, -5.959e-01, 1.323e-01, -2.388e-01,
		3.482e-02, 1.823e-01, -3.895e-02, 5.164e-03,
		-7.314e-02, -3.897e-01, 6.275e-02, -3.974e-02), acc);
	acc = MulAdd(s1_6, M4(
		7.922e-03, -3.284e-02, 1.274e-01, -2.930e-02,
		6.307e-02, 2.548e-02, -4.094e-02, 2.130e-02,
		-1.123e-02, 1.824e-03, -9.595e-02, 1.808e-02,
		7.955e-02, 3.285e-02, 4.592e-02, 7.153e-02), acc);
	acc = MulAdd(s1_7, M4(
		-6.410e-02, -1.423e-02, -4.912e-02, 1.461e-01,
		6.612e-02, 9.838e-02, -2.153e-01, -1.067e-01,
		-1.108e-02, -1.048e-01, 2.778e-01, -1.116e-01,
		4.569e-02, 2.955e-02, -1.440e-01, -3.364e-02), acc);
	acc = MulAdd(s1_8, M4(
		1.721e-02, 1.171e-02, 1.096e-02, -2.832e-02,
		7.446e-02, 4.785e-02, 8.270e-03, -1.640e-01,
		-8.912e-02, -6.617e-02, 3.225e-03, 9.894e-02,
		4.367e-02, 8.102e-02, -1.779e-02, -2.410e-01), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Pass 5 entry point ("out-shuffle"). Each thread owns a 2x2 group of output
// pixels: it evaluates f0 once from nine t1 taps and then writes four output
// pixels, each being the linearly sampled INPUT colour with one component of
// the f0 result added to the first component of its rgb2yuv transform.
//   blockStart - pixel coordinate added to this thread's doubled in-block offset.
//   tid        - thread id; only tid.x is used, mapped through Rmp8x8.
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE(review): l0 expands to O(t1, ...); O is defined outside this block
	// and appears to read `pt` and `pos` by name, so both keep their names.
	float2 pt = float2(GetInputPt());
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 bounds = GetOutputSize();
	// Only the group's first pixel is tested against the output size.
	if (!(dst.x < bounds.x && dst.y < bounds.y)) {
		return;
	}
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Nine t1 reads at offsets -1/0/+1 in x and y, in row order.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Second input set for f0: -max(-x, 0) of every tap.
	V4 lo0 = -max(-tap0, 0.0);
	V4 lo1 = -max(-tap1, 0.0);
	V4 lo2 = -max(-tap2, 0.0);
	V4 lo3 = -max(-tap3, 0.0);
	V4 lo4 = -max(-tap4, 0.0);
	V4 lo5 = -max(-tap5, 0.0);
	V4 lo6 = -max(-tap6, 0.0);
	V4 lo7 = -max(-tap7, 0.0);
	V4 lo8 = -max(-tap8, 0.0);

	// First input set for f0: max(x, 0) of every tap.
	V4 hi0 = max(tap0, 0.0);
	V4 hi1 = max(tap1, 0.0);
	V4 hi2 = max(tap2, 0.0);
	V4 hi3 = max(tap3, 0.0);
	V4 hi4 = max(tap4, 0.0);
	V4 hi5 = max(tap5, 0.0);
	V4 hi6 = max(tap6, 0.0);
	V4 hi7 = max(tap7, 0.0);
	V4 hi8 = max(tap8, 0.0);

	// One 4-component result; x, y, w, z are consumed below in that order.
	V4 delta = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);

	static const MF3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099 };
	float2 outPt = float2(GetOutputPt());

	// Move the sample position back by half an output texel, then step it
	// together with dst through the four pixels of the group. The position is
	// updated incrementally on purpose to keep the arithmetic unchanged.

	// dst + (0, 0) uses delta.x
	pos -= 0.5f * outPt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// dst + (1, 0) uses delta.y
	dst.x += 1;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// dst + (1, 1) uses delta.w
	dst.y += 1;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// dst + (0, 1) uses delta.z
	dst.x -= 1;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||||
412
src/Effects/CuNNy/CuNNy-3x4C-NVL.hlsl
Normal file
412
src/Effects/CuNNy/CuNNy-3x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,412 @@
|
||||||
|
// CuNNy 3x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-D04N03
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) MF((dot(float3(6.094e-01, 1.148e+00, 2.568e-01), O(INPUT, float2(x, y)).rgb) + -1.542e+00))
|
||||||
|
|
||||||
|
// Pass 1 kernel. Combines nine scalar inputs into one 4-component vector:
// starting from a constant vector, each input is multiplied by its own
// constant V4 and added to the accumulator with mad().
// Pass1 supplies the nine l0 samples of the 3x3 neighbourhood in row order.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Constant starting value of the accumulator.
	V4 acc = { -4.952e-03, -2.750e-03, -9.137e-04, 6.736e-02 };

	acc = mad(s0_0, V4(-6.372e-02, 1.685e-01, -2.573e-02, -2.185e-02), acc);
	acc = mad(s0_1, V4(-3.502e-02, -2.984e-03, 5.048e-02, -2.445e-01), acc);
	acc = mad(s0_2, V4(9.644e-02, -7.557e-03, -1.770e-02, 3.162e-02), acc);
	acc = mad(s0_3, V4(7.199e-02, -6.233e-01, -4.180e-01, 1.392e-01), acc);
	acc = mad(s0_4, V4(-5.683e-01, 1.451e-01, -8.148e-02, 9.768e-02), acc);
	acc = mad(s0_5, V4(4.702e-01, -1.319e-03, 3.745e-03, -4.204e-02), acc);
	acc = mad(s0_6, V4(9.855e-03, 3.213e-01, 5.098e-01, 4.001e-02), acc);
	acc = mad(s0_7, V4(8.216e-02, -1.219e-02, -3.347e-02, 5.017e-02), acc);
	acc = mad(s0_8, V4(-6.691e-02, 5.417e-03, 1.235e-02, -9.640e-03), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 entry point ("in"). Each thread handles one pixel of the input-sized
// grid: it evaluates the l0 macro (a weighted sum of the INPUT rgb channels
// plus a constant) at the 3x3 neighbourhood of the pixel and stores f0 of
// those nine scalars in t0.
//   blockStart - pixel coordinate added to this thread's in-block offset.
//   tid        - thread id; only tid.x is used, mapped through Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	// The O() macro used by l0 reads `pt` and `pos` by name, so these two
	// locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 px = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input size have nothing to write.
	if (!(px.x < bounds.x && px.y < bounds.y)) {
		return;
	}
	// Texel centre of this pixel in normalised coordinates.
	float2 pos = (px + 0.5) * pt;

	// Row above, same row, row below; left to right within each row.
	MF tap0 = l0(-1.0, -1.0);
	MF tap1 = l0(0.0, -1.0);
	MF tap2 = l0(1.0, -1.0);
	MF tap3 = l0(-1.0, 0.0);
	MF tap4 = l0(0.0, 0.0);
	MF tap5 = l0(1.0, 0.0);
	MF tap6 = l0(-1.0, 1.0);
	MF tap7 = l0(0.0, 1.0);
	MF tap8 = l0(1.0, 1.0);

	t0[px] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 kernel (conv1). Starts from a constant 4-component vector and folds
// in the eighteen inputs one at a time through MulAdd, each with its own
// 16-value M4 constant; the accumulated vector is returned unchanged.
// MulAdd is pulled in by the file's "USE MulAdd" directive and is not defined
// here; presumably a vector-by-matrix multiply-accumulate -- confirm.
// Pass2 supplies max(x, 0) in s0_* and -max(-x, 0) in s1_* for the same nine taps.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Constant starting value of the accumulator.
	V4 acc = { -1.511e-02, -2.848e-03, 7.160e-03, -2.555e-03 };

	// Contributions of the s0_* inputs.
	acc = MulAdd(s0_0, M4(
		3.169e-01, 3.467e-01, -2.365e-01, 2.253e-01,
		6.307e-02, 1.727e-01, -1.053e-01, 9.324e-02,
		-4.901e-02, -2.112e-01, 8.983e-02, -1.851e-01,
		-1.987e-01, 6.645e-02, 2.188e-02, 1.988e-02), acc);
	acc = MulAdd(s0_1, M4(
		4.393e-02, 2.078e-01, -1.967e-01, 4.673e-02,
		-7.991e-02, 2.461e-01, -6.028e-02, 9.252e-02,
		3.871e-01, 6.138e-02, -3.603e-01, -1.485e-01,
		2.466e-01, 5.251e-02, -6.181e-02, 8.932e-02), acc);
	acc = MulAdd(s0_2, M4(
		-1.707e-02, 2.598e-02, 1.641e-02, 2.780e-02,
		2.425e-02, 1.769e-01, -8.461e-02, 1.067e-01,
		-2.503e-01, 6.051e-01, -2.782e-01, 1.311e-01,
		-8.456e-03, -1.370e-02, -6.391e-02, 6.935e-02), acc);
	acc = MulAdd(s0_3, M4(
		-8.251e-01, -4.981e-01, -1.726e-01, -1.815e-01,
		1.411e-01, 2.889e-02, -3.115e-01, -3.255e-01,
		1.812e-03, -4.529e-02, 2.350e-01, 1.999e-01,
		-1.993e-01, -1.868e-02, 4.249e-02, -1.117e-01), acc);
	acc = MulAdd(s0_4, M4(
		-4.732e-02, -5.673e-02, 1.274e-01, 4.894e-02,
		9.126e-02, 1.717e-01, -3.294e-01, -2.378e-01,
		-7.089e-02, -8.116e-02, 2.510e-01, 7.381e-02,
		1.275e-01, 8.030e-02, -1.671e-01, -1.824e-02), acc);
	acc = MulAdd(s0_5, M4(
		3.373e-02, -4.163e-02, -4.077e-02, -2.085e-02,
		1.265e-01, -4.133e-01, 7.433e-02, 7.763e-02,
		-1.466e-01, 3.291e-01, -7.784e-02, 9.472e-02,
		2.725e-01, -2.393e-01, -6.913e-02, -9.445e-02), acc);
	acc = MulAdd(s0_6, M4(
		3.043e-02, -9.985e-02, 1.538e-01, -2.529e-01,
		2.379e-01, 1.079e-01, -1.517e-01, -9.289e-02,
		-1.396e-01, -4.354e-02, 8.463e-02, 7.052e-02,
		5.629e-02, 3.293e-03, 5.342e-02, -1.606e-01), acc);
	acc = MulAdd(s0_7, M4(
		3.626e-02, -1.421e-01, 4.017e-02, -3.963e-02,
		2.148e-03, 5.522e-02, 3.174e-01, 2.270e-02,
		-5.590e-02, -9.875e-02, -1.683e-01, 5.415e-02,
		1.509e-01, 7.709e-02, -1.161e-01, 1.440e-01), acc);
	acc = MulAdd(s0_8, M4(
		-1.132e-02, 2.337e-02, 1.264e-02, 2.638e-03,
		-6.582e-02, -1.965e-01, 2.803e-01, 1.333e-01,
		9.171e-02, 1.567e-01, -2.419e-01, -1.602e-01,
		-2.271e-01, 3.614e-02, 2.179e-01, 4.826e-02), acc);

	// Contributions of the s1_* inputs.
	acc = MulAdd(s1_0, M4(
		1.452e-01, 1.313e-01, -6.140e-02, 2.412e-01,
		-3.691e-02, 7.355e-02, -4.209e-02, 1.343e-01,
		-2.509e-02, -1.266e-01, 9.017e-02, -1.854e-02,
		-4.280e-01, -1.004e-01, 2.319e-01, 4.211e-02), acc);
	acc = MulAdd(s1_1, M4(
		4.894e-02, 7.564e-02, -9.350e-02, 5.422e-02,
		-6.111e-02, 6.969e-02, -4.398e-02, 6.622e-02,
		7.113e-01, 3.461e-01, -5.254e-01, -8.808e-02,
		4.481e-01, 3.171e-01, -2.198e-01, 1.048e-01), acc);
	acc = MulAdd(s1_2, M4(
		-3.483e-02, 3.150e-03, 2.215e-02, 2.616e-02,
		1.468e-01, -1.295e-01, -1.470e-01, 3.371e-02,
		-4.514e-02, 4.677e-02, -1.313e-01, -1.176e-01,
		1.507e-03, 2.290e-01, -2.163e-01, 3.895e-02), acc);
	acc = MulAdd(s1_3, M4(
		-2.258e-01, -1.353e-01, -4.873e-01, -1.236e+00,
		1.660e-01, -1.803e-02, -2.797e-01, -4.092e-01,
		-1.525e-01, -8.178e-02, 2.665e-01, 3.652e-01,
		-1.853e-01, -3.819e-02, 1.627e-01, -3.896e-01), acc);
	acc = MulAdd(s1_4, M4(
		-1.005e-01, -3.821e-02, 9.917e-02, -1.324e-01,
		-2.040e-01, -3.586e-01, 9.776e-02, -1.376e-01,
		2.065e-01, 2.017e-01, -1.320e-01, -2.225e-02,
		2.944e-01, 5.393e-02, -4.301e-01, -7.240e-02), acc);
	acc = MulAdd(s1_5, M4(
		5.353e-02, -4.257e-02, -4.131e-02, -3.943e-02,
		-6.151e-02, 3.059e-01, -1.481e-02, 3.662e-01,
		3.098e-02, -8.774e-02, 1.790e-02, -1.332e-01,
		8.670e-02, -6.985e-02, -1.359e-01, 2.063e-01), acc);
	acc = MulAdd(s1_6, M4(
		-9.271e-02, 2.259e-01, 2.200e-02, -2.390e-01,
		3.258e-01, 1.082e-01, -1.499e-01, -3.063e-02,
		-2.775e-01, -9.008e-02, 1.294e-01, 3.533e-02,
		1.011e-02, 4.294e-02, 4.935e-02, -1.005e-01), acc);
	acc = MulAdd(s1_7, M4(
		1.321e-02, -7.160e-02, 7.229e-02, -3.050e-02,
		4.303e-02, -1.518e-01, 5.137e-01, 4.029e-02,
		4.896e-02, 5.334e-02, -3.545e-01, 2.370e-02,
		1.645e-01, 3.433e-02, -9.552e-03, 1.032e-01), acc);
	acc = MulAdd(s1_8, M4(
		8.370e-03, -2.408e-02, 2.693e-02, -8.183e-03,
		-2.375e-02, -2.973e-01, 1.889e-01, 1.096e-01,
		1.093e-02, 2.310e-01, -1.613e-01, -1.343e-01,
		-1.718e-01, -2.165e-02, 1.384e-01, 9.956e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 entry point (conv1). Each thread handles one pixel of the
// input-sized grid: it point-samples t0 at the 3x3 neighbourhood of the pixel,
// splits every sample into max(x, 0) and -max(-x, 0), feeds all eighteen
// vectors to f0 and stores the result in t1.
//   blockStart - pixel coordinate added to this thread's in-block offset.
//   tid        - thread id; only tid.x is used, mapped through Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// The O() macro used by l0 reads `pt` and `pos` by name, so these two
	// locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 px = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input size have nothing to write.
	if (!(px.x < bounds.x && px.y < bounds.y)) {
		return;
	}
	// Texel centre of this pixel in normalised coordinates.
	float2 pos = (px + 0.5) * pt;

	// Row above, same row, row below; left to right within each row.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Second input set for f0: -max(-x, 0) of every tap.
	V4 lo0 = -max(-tap0, 0.0);
	V4 lo1 = -max(-tap1, 0.0);
	V4 lo2 = -max(-tap2, 0.0);
	V4 lo3 = -max(-tap3, 0.0);
	V4 lo4 = -max(-tap4, 0.0);
	V4 lo5 = -max(-tap5, 0.0);
	V4 lo6 = -max(-tap6, 0.0);
	V4 lo7 = -max(-tap7, 0.0);
	V4 lo8 = -max(-tap8, 0.0);

	// First input set for f0: max(x, 0) of every tap.
	V4 hi0 = max(tap0, 0.0);
	V4 hi1 = max(tap1, 0.0);
	V4 hi2 = max(tap2, 0.0);
	V4 hi3 = max(tap3, 0.0);
	V4 hi4 = max(tap4, 0.0);
	V4 hi5 = max(tap5, 0.0);
	V4 hi6 = max(tap6, 0.0);
	V4 hi7 = max(tap7, 0.0);
	V4 hi8 = max(tap8, 0.0);

	t1[px] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 3 kernel (conv2). Starts from a constant 4-component vector and folds
// in the eighteen inputs one at a time through MulAdd, each with its own
// 16-value M4 constant; the accumulated vector is returned unchanged.
// MulAdd is pulled in by the file's "USE MulAdd" directive and is not defined
// here; presumably a vector-by-matrix multiply-accumulate -- confirm.
// Pass3 supplies max(x, 0) in s0_* and -max(-x, 0) in s1_* for the same nine taps.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Constant starting value of the accumulator.
	V4 acc = { -3.514e-03, 2.350e-03, 2.221e-03, 1.089e-03 };

	// Contributions of the s0_* inputs.
	acc = MulAdd(s0_0, M4(
		6.983e-02, 8.935e-03, -1.644e-01, -4.232e-04,
		-1.981e-01, 9.265e-02, 1.769e-01, 1.705e-01,
		-2.300e-02, -7.408e-03, -4.221e-02, -1.617e-02,
		-6.026e-02, -9.185e-03, -7.420e-02, -4.238e-02), acc);
	acc = MulAdd(s0_1, M4(
		1.832e-01, -1.117e-01, 1.784e-02, 6.345e-02,
		-9.651e-02, 5.753e-02, 1.480e-01, 1.284e-01,
		3.957e-01, -2.684e-01, 2.853e-02, -5.823e-02,
		-8.184e-02, 1.062e-01, -2.604e-02, -7.579e-02), acc);
	acc = MulAdd(s0_2, M4(
		-1.753e-01, 5.019e-03, -1.285e-01, 8.470e-02,
		-2.566e-01, 6.556e-02, -9.751e-02, 7.653e-03,
		-9.466e-02, 3.098e-02, -9.617e-02, -4.826e-02,
		3.951e-02, -5.446e-02, 1.297e-01, 1.076e-01), acc);
	acc = MulAdd(s0_3, M4(
		-7.377e-02, -2.183e-01, 9.806e-02, 1.735e-01,
		2.795e-01, 3.730e-01, 1.906e-01, 1.313e-01,
		2.115e-01, 2.222e-01, 1.880e-01, 2.427e-01,
		-1.177e-01, 2.587e-02, -1.928e-01, -1.489e-01), acc);
	acc = MulAdd(s0_4, M4(
		-3.487e-01, -3.194e-01, 7.963e-01, -1.044e-01,
		3.136e-01, -5.467e-02, 5.059e-01, -4.801e-02,
		-4.943e-01, -1.466e-01, -5.938e-02, -9.473e-01,
		2.661e-01, -1.545e-01, 1.986e-01, -2.172e-02), acc);
	acc = MulAdd(s0_5, M4(
		-3.450e-01, 1.931e-01, -2.303e-01, -1.880e-01,
		-1.323e-01, 1.839e-01, -1.130e-01, -5.181e-02,
		3.049e-02, 9.834e-02, -1.342e-01, -1.072e-01,
		1.925e-02, -9.652e-02, 1.169e-01, 2.084e-01), acc);
	acc = MulAdd(s0_6, M4(
		1.543e-02, 2.202e-01, 4.809e-02, 1.085e-01,
		3.076e-02, -4.127e-01, 4.606e-02, 9.444e-02,
		7.886e-02, -1.314e-01, -1.638e-02, 4.353e-02,
		9.790e-02, -6.783e-02, -1.008e-01, -1.558e-01), acc);
	acc = MulAdd(s0_7, M4(
		-4.453e-02, 3.133e-01, -2.217e-01, -5.271e-02,
		-2.055e-01, -1.000e-01, 8.374e-02, 6.141e-02,
		2.147e-02, -3.844e-01, -2.203e-01, -1.105e-01,
		-3.596e-02, 2.026e-01, 3.174e-01, 1.519e-01), acc);
	acc = MulAdd(s0_8, M4(
		-5.107e-03, 2.380e-01, 2.147e-02, -8.032e-02,
		-9.743e-02, 6.943e-02, 9.403e-02, 3.742e-02,
		-1.822e-02, -4.950e-02, 7.963e-02, -1.338e-01,
		-1.491e-01, 1.655e-02, -5.817e-02, 1.164e-01), acc);

	// Contributions of the s1_* inputs.
	acc = MulAdd(s1_0, M4(
		8.679e-02, -7.335e-02, -5.999e-02, -4.504e-02,
		-3.329e-02, 4.349e-03, -4.883e-02, 3.159e-02,
		-7.948e-02, 3.308e-02, 6.579e-02, 1.607e-01,
		1.336e-01, -1.042e-01, -2.368e-01, -1.546e-01), acc);
	acc = MulAdd(s1_1, M4(
		2.764e-01, -6.665e-02, 1.661e-02, -4.103e-02,
		1.095e-01, -1.159e-01, -1.142e-01, -1.412e-01,
		4.033e-01, -8.697e-02, 2.387e-01, 1.762e-01,
		4.948e-01, -1.533e-01, 7.816e-02, 5.700e-02), acc);
	acc = MulAdd(s1_2, M4(
		1.187e-01, -6.571e-02, 4.698e-02, 4.931e-02,
		-5.523e-02, 3.925e-02, -7.453e-02, -8.429e-02,
		-2.202e-01, 6.090e-02, -1.460e-01, 2.777e-02,
		4.405e-01, 6.445e-03, 3.494e-01, 3.311e-01), acc);
	acc = MulAdd(s1_3, M4(
		-4.333e-02, -8.517e-02, 1.372e-01, 2.066e-01,
		4.728e-01, 1.195e-01, -2.627e-01, -2.280e-01,
		1.606e-01, 2.216e-01, 2.269e-01, 3.505e-01,
		-2.499e-01, -3.977e-01, -3.659e-02, 1.460e-02), acc);
	acc = MulAdd(s1_4, M4(
		-4.640e-01, -7.221e-01, -2.524e-01, -6.513e-01,
		6.699e-01, -1.727e-01, 4.444e-01, -3.115e-01,
		-6.748e-01, 1.063e-01, 6.487e-01, -3.195e-01,
		-5.136e-01, -8.272e-01, 4.014e-01, 4.914e-01), acc);
	acc = MulAdd(s1_5, M4(
		-1.112e-03, -1.293e-02, 1.567e-02, -1.266e-01,
		1.185e-01, 4.940e-02, -9.925e-02, -1.034e-01,
		-1.041e-01, 1.822e-01, -4.277e-02, 1.313e-01,
		-6.459e-01, -1.562e-01, -3.961e-01, -7.262e-02), acc);
	acc = MulAdd(s1_6, M4(
		1.499e-02, 3.135e-01, 2.187e-01, 2.386e-01,
		1.171e-01, -4.899e-01, -1.987e-01, -1.717e-01,
		5.232e-02, -1.984e-01, 9.338e-04, 1.092e-01,
		1.545e-01, 4.183e-01, 1.180e-01, 1.102e-01), acc);
	acc = MulAdd(s1_7, M4(
		-1.411e-01, 2.619e-01, -2.549e-01, -2.113e-01,
		-1.109e-01, -3.038e-01, 7.579e-02, -3.585e-02,
		-1.373e-03, -2.713e-01, -5.527e-02, 7.052e-02,
		-1.648e-01, 7.324e-01, 3.974e-01, 2.306e-01), acc);
	acc = MulAdd(s1_8, M4(
		-1.861e-02, 9.414e-02, -6.739e-02, -8.921e-02,
		-2.337e-02, -2.657e-02, -3.376e-03, -7.209e-02,
		-1.042e-01, -2.504e-02, 1.287e-01, -1.459e-02,
		-1.617e-01, 2.384e-01, -6.969e-01, -3.760e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 3 entry point (conv2). Each thread handles one pixel of the
// input-sized grid: it point-samples t1 at the 3x3 neighbourhood of the pixel,
// splits every sample into max(x, 0) and -max(-x, 0), feeds all eighteen
// vectors to f0 and stores the result in t0.
//   blockStart - pixel coordinate added to this thread's in-block offset.
//   tid        - thread id; only tid.x is used, mapped through Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// The O() macro used by l0 reads `pt` and `pos` by name, so these two
	// locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 px = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input size have nothing to write.
	if (!(px.x < bounds.x && px.y < bounds.y)) {
		return;
	}
	// Texel centre of this pixel in normalised coordinates.
	float2 pos = (px + 0.5) * pt;

	// Row above, same row, row below; left to right within each row.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Second input set for f0: -max(-x, 0) of every tap.
	V4 lo0 = -max(-tap0, 0.0);
	V4 lo1 = -max(-tap1, 0.0);
	V4 lo2 = -max(-tap2, 0.0);
	V4 lo3 = -max(-tap3, 0.0);
	V4 lo4 = -max(-tap4, 0.0);
	V4 lo5 = -max(-tap5, 0.0);
	V4 lo6 = -max(-tap6, 0.0);
	V4 lo7 = -max(-tap7, 0.0);
	V4 lo8 = -max(-tap8, 0.0);

	// First input set for f0: max(x, 0) of every tap.
	V4 hi0 = max(tap0, 0.0);
	V4 hi1 = max(tap1, 0.0);
	V4 hi2 = max(tap2, 0.0);
	V4 hi3 = max(tap3, 0.0);
	V4 hi4 = max(tap4, 0.0);
	V4 hi5 = max(tap5, 0.0);
	V4 hi6 = max(tap6, 0.0);
	V4 hi7 = max(tap7, 0.0);
	V4 hi8 = max(tap8, 0.0);

	t0[px] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 4 kernel (conv3). Starts from a constant 4-component vector and folds
// in the eighteen inputs one at a time through MulAdd, each with its own
// 16-value M4 constant; the accumulated vector is returned unchanged.
// MulAdd is pulled in by the file's "USE MulAdd" directive and is not defined
// here; presumably a vector-by-matrix multiply-accumulate -- confirm.
// Pass4 supplies max(x, 0) in s0_* and -max(-x, 0) in s1_* for the same nine taps.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Constant starting value of the accumulator.
	V4 acc = { -4.349e-03, -3.760e-03, 4.684e-03, 4.745e-03 };

	// Contributions of the s0_* inputs.
	acc = MulAdd(s0_0, M4(
		1.869e-01, 8.774e-02, -6.451e-02, 6.682e-02,
		8.374e-02, 1.313e-02, -2.649e-02, 2.741e-02,
		-3.609e-02, -9.330e-02, -8.233e-02, 1.117e-01,
		-1.203e-01, 1.719e-02, 1.288e-01, -9.851e-02), acc);
	acc = MulAdd(s0_1, M4(
		3.100e-01, 5.063e-02, 1.169e-01, -3.828e-02,
		3.428e-01, 4.869e-02, -1.232e-02, -1.003e-02,
		2.756e-01, 3.916e-01, 1.450e-01, 1.078e-01,
		-2.568e-01, -2.157e-01, -1.057e-01, -1.338e-01), acc);
	acc = MulAdd(s0_2, M4(
		1.199e-01, -1.890e-01, 5.870e-03, -5.995e-03,
		2.255e-01, -2.325e-03, 7.916e-03, -2.038e-02,
		1.353e-01, -9.590e-02, -2.119e-02, -5.860e-02,
		-7.698e-02, -3.608e-02, -3.571e-02, 2.010e-02), acc);
	acc = MulAdd(s0_3, M4(
		9.889e-02, -2.665e-02, -2.627e-01, 3.583e-01,
		7.891e-02, 8.737e-02, 5.322e-02, 5.246e-04,
		-5.188e-02, -8.491e-02, -4.991e-02, -3.735e-02,
		5.711e-02, 4.482e-02, 5.660e-02, -1.322e-01), acc);
	acc = MulAdd(s0_4, M4(
		-5.488e-01, 2.898e-01, 1.046e+00, 6.036e-01,
		-3.180e-01, -6.309e-01, -2.627e-01, 1.734e-01,
		-2.067e-01, 3.775e-02, -2.881e-01, -9.242e-02,
		3.369e-01, 2.554e-02, -1.645e-01, 4.973e-01), acc);
	acc = MulAdd(s0_5, M4(
		6.976e-03, -1.830e-01, 2.842e-01, 2.570e-02,
		-2.902e-01, 5.059e-01, 1.944e-01, 1.794e-02,
		-1.333e-01, 2.341e-01, 4.161e-01, -5.179e-02,
		8.176e-02, -2.435e-02, -1.598e-02, 6.211e-02), acc);
	acc = MulAdd(s0_6, M4(
		-2.668e-02, -6.958e-02, -5.015e-02, 8.035e-02,
		4.451e-02, -1.290e-03, -7.688e-02, 1.708e-01,
		-5.133e-02, -2.768e-02, -1.780e-02, -6.317e-02,
		-9.692e-03, -2.748e-03, 9.070e-03, -1.314e-01), acc);
	acc = MulAdd(s0_7, M4(
		1.402e-01, 4.997e-02, -4.973e-02, 6.839e-01,
		2.079e-02, -2.511e-02, 3.403e-01, -3.077e-01,
		-2.831e-02, 4.816e-02, -9.142e-02, -8.176e-02,
		-2.999e-02, -5.749e-03, -5.579e-02, -2.355e-01), acc);
	acc = MulAdd(s0_8, M4(
		-1.783e-02, -2.882e-02, 9.841e-02, 4.473e-02,
		4.128e-02, -3.071e-02, -2.378e-01, 1.347e-01,
		-2.285e-02, 1.317e-02, -1.632e-02, 1.058e-01,
		-3.696e-02, -6.864e-03, -8.989e-02, -7.315e-02), acc);

	// Contributions of the s1_* inputs.
	acc = MulAdd(s1_0, M4(
		8.857e-02, 3.169e-02, -1.896e-02, 1.258e-02,
		7.086e-02, 5.699e-02, 1.550e-02, -1.836e-02,
		1.209e-01, 5.334e-02, -1.557e-02, -2.374e-02,
		-1.411e-02, 1.543e-02, 1.769e-02, -4.332e-02), acc);
	acc = MulAdd(s1_1, M4(
		1.199e-01, -8.203e-03, -1.695e-02, -3.214e-02,
		5.918e-01, 3.458e-01, 7.684e-02, -5.137e-01,
		2.827e-01, -2.008e-02, -1.848e-01, 2.147e-01,
		7.212e-02, -3.906e-03, -2.220e-01, -1.918e-01), acc);
	acc = MulAdd(s1_2, M4(
		4.464e-02, 4.035e-02, 4.265e-03, 1.350e-02,
		-4.623e-01, -1.882e-01, 9.929e-02, -2.295e-01,
		2.010e-01, 6.059e-01, 3.648e-01, -1.670e-02,
		-6.763e-02, -2.588e-01, -1.741e-01, 3.358e-02), acc);
	acc = MulAdd(s1_3, M4(
		1.003e-01, -2.961e-02, -1.715e-01, 1.057e-01,
		3.275e-03, 1.877e-02, -4.995e-02, 1.181e-01,
		3.600e-02, 2.101e-02, -1.050e-01, 8.035e-02,
		-8.107e-02, -1.067e-01, -5.457e-02, 5.339e-02), acc);
	acc = MulAdd(s1_4, M4(
		3.875e-01, 3.638e-01, 1.178e-01, -4.404e-02,
		6.128e-02, -1.193e-01, -3.161e-01, 3.510e-01,
		-3.482e-02, -2.842e-01, -3.917e-01, 4.525e-01,
		1.969e-01, 5.299e-01, 4.720e-01, -2.266e-01), acc);
	acc = MulAdd(s1_5, M4(
		-1.420e-02, 2.325e-02, -8.697e-02, -4.296e-03,
		8.697e-02, 7.490e-02, 1.773e-01, 4.010e-01,
		2.380e-01, -1.182e-01, 9.121e-01, 2.252e-01,
		1.348e-01, -7.448e-02, -8.496e-01, -3.335e-01), acc);
	acc = MulAdd(s1_6, M4(
		-7.923e-02, -2.533e-02, -4.896e-02, -5.473e-02,
		-5.329e-03, 1.285e-02, -1.763e-02, 7.009e-02,
		9.670e-04, -1.889e-02, -1.008e-01, 1.149e-01,
		7.259e-03, 4.080e-02, 1.042e-01, -2.627e-01), acc);
	acc = MulAdd(s1_7, M4(
		-9.746e-02, 6.679e-02, -1.421e-01, -2.202e-01,
		-9.918e-03, -2.413e-02, -1.554e-02, 7.011e-03,
		-3.226e-02, -3.024e-02, -5.431e-02, 7.446e-02,
		5.860e-02, 2.851e-02, -2.367e-01, 2.562e-02), acc);
	acc = MulAdd(s1_8, M4(
		-4.627e-02, 4.226e-02, -8.654e-02, -3.312e-02,
		1.600e-02, 2.983e-02, 8.834e-03, -3.871e-02,
		-4.137e-03, 1.767e-02, 2.492e-02, -5.391e-02,
		8.133e-03, 1.430e-02, -2.428e-02, -1.132e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 4 entry point (conv3). Each thread handles one pixel of the
// input-sized grid: it point-samples t0 at the 3x3 neighbourhood of the pixel,
// splits every sample into max(x, 0) and -max(-x, 0), feeds all eighteen
// vectors to f0 and stores the result in t1.
//   blockStart - pixel coordinate added to this thread's in-block offset.
//   tid        - thread id; only tid.x is used, mapped through Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// The O() macro used by l0 reads `pt` and `pos` by name, so these two
	// locals must keep exactly these names.
	float2 pt = float2(GetInputPt());
	uint2 px = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input size have nothing to write.
	if (!(px.x < bounds.x && px.y < bounds.y)) {
		return;
	}
	// Texel centre of this pixel in normalised coordinates.
	float2 pos = (px + 0.5) * pt;

	// Row above, same row, row below; left to right within each row.
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Second input set for f0: -max(-x, 0) of every tap.
	V4 lo0 = -max(-tap0, 0.0);
	V4 lo1 = -max(-tap1, 0.0);
	V4 lo2 = -max(-tap2, 0.0);
	V4 lo3 = -max(-tap3, 0.0);
	V4 lo4 = -max(-tap4, 0.0);
	V4 lo5 = -max(-tap5, 0.0);
	V4 lo6 = -max(-tap6, 0.0);
	V4 lo7 = -max(-tap7, 0.0);
	V4 lo8 = -max(-tap8, 0.0);

	// First input set for f0: max(x, 0) of every tap.
	V4 hi0 = max(tap0, 0.0);
	V4 hi1 = max(tap1, 0.0);
	V4 hi2 = max(tap2, 0.0);
	V4 hi3 = max(tap3, 0.0);
	V4 hi4 = max(tap4, 0.0);
	V4 hi5 = max(tap5, 0.0);
	V4 hi6 = max(tap6, 0.0);
	V4 hi7 = max(tap7, 0.0);
	V4 hi8 = max(tap8, 0.0);

	t1[px] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t1
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 5 kernel (out-shuffle). Starts from a constant 4-component vector,
// folds in the eighteen inputs one at a time through MulAdd (each with its
// own 16-value M4 constant) and returns tanh of the accumulated vector.
// MulAdd is pulled in by the file's "USE MulAdd" directive and is not defined
// here; presumably a vector-by-matrix multiply-accumulate -- confirm.
// Pass5 supplies max(x, 0) in s0_* and -max(-x, 0) in s1_* for the same nine taps.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Constant starting value of the accumulator.
	V4 acc = { 7.204e-05, -6.226e-05, 2.867e-04, -3.251e-05 };

	// Contributions of the s0_* inputs.
	acc = MulAdd(s0_0, M4(
		-4.783e-03, 7.235e-03, -7.275e-03, -2.802e-03,
		4.921e-02, 7.543e-02, -3.357e-02, 1.213e-02,
		2.900e-02, 2.380e-03, -9.028e-03, -2.594e-02,
		1.576e-03, 3.334e-04, -2.460e-02, -1.285e-02), acc);
	acc = MulAdd(s0_1, M4(
		4.582e-02, 9.378e-04, 2.217e-02, 5.083e-02,
		-1.054e-02, 8.518e-02, -1.884e-02, -5.149e-02,
		1.983e-02, -1.106e-02, -4.317e-03, 5.384e-02,
		-5.193e-02, 1.089e-02, -9.384e-03, 3.137e-02), acc);
	acc = MulAdd(s0_2, M4(
		-5.241e-03, 3.821e-02, -1.136e-02, -3.033e-02,
		3.186e-02, -3.270e-03, 1.422e-02, 2.401e-02,
		-1.360e-02, 1.024e-01, -6.042e-02, -2.325e-02,
		-1.248e-01, -1.377e-01, 1.654e-02, -1.347e-02), acc);
	acc = MulAdd(s0_3, M4(
		-3.552e-02, -3.211e-02, -2.282e-03, 1.775e-02,
		1.360e-01, 2.808e-02, 1.082e-01, -1.311e-02,
		-1.699e-02, -2.628e-02, 3.430e-02, -3.880e-03,
		2.514e-02, -3.171e-02, 4.675e-02, -2.711e-02), acc);
	acc = MulAdd(s0_4, M4(
		4.756e-01, 2.686e-01, 4.514e-02, -8.813e-02,
		2.636e-01, -4.893e-01, 1.301e-01, 1.304e-01,
		3.778e-01, 2.765e-01, 3.369e-01, 8.811e-02,
		5.080e-02, 2.783e-01, -1.131e-01, 2.487e-01), acc);
	acc = MulAdd(s0_5, M4(
		-2.961e-02, 7.757e-02, -8.471e-02, -4.636e-02,
		-6.862e-02, 1.733e-01, -7.301e-02, -1.408e-02,
		1.636e-02, 9.982e-02, 5.704e-02, 2.568e-01,
		-2.224e-02, -2.588e-01, -2.202e-01, -4.898e-01), acc);
	acc = MulAdd(s0_6, M4(
		1.058e-01, -2.810e-02, -2.960e-02, -8.398e-02,
		-9.106e-02, 6.642e-02, -2.574e-02, 7.841e-02,
		-1.978e-02, -3.700e-02, -1.504e-02, -3.186e-02,
		2.438e-03, 6.191e-03, -1.155e-02, -1.161e-02), acc);
	acc = MulAdd(s0_7, M4(
		-6.316e-01, -7.748e-02, 8.006e-01, 3.936e-01,
		1.300e-01, -1.999e-01, 2.351e-01, -7.485e-01,
		-7.151e-02, -4.285e-02, -2.277e-02, 2.849e-02,
		-2.207e-02, -2.585e-02, -2.498e-02, -3.308e-02), acc);
	acc = MulAdd(s0_8, M4(
		-2.002e-01, -6.934e-01, -1.093e-01, 3.325e-01,
		-5.778e-02, 2.138e-02, -2.930e-02, 1.794e-01,
		-3.028e-03, 2.300e-03, 5.845e-03, -1.959e-02,
		1.403e-02, 1.565e-02, 1.840e-02, -6.027e-04), acc);

	// Contributions of the s1_* inputs.
	acc = MulAdd(s1_0, M4(
		2.228e-02, -8.352e-03, -1.007e-02, -1.911e-02,
		-1.489e-02, 2.785e-03, -9.190e-03, 5.858e-03,
		2.420e-02, -7.701e-03, -2.327e-02, -2.494e-02,
		-8.526e-03, -2.384e-02, -2.601e-02, -4.833e-02), acc);
	acc = MulAdd(s1_1, M4(
		5.671e-02, 3.666e-02, 3.309e-02, 1.011e-02,
		-8.053e-03, 4.673e-02, -5.358e-02, -2.451e-02,
		3.779e-01, 5.642e-02, -2.324e-01, -3.499e-02,
		-3.479e-01, 1.179e-01, -4.630e-02, 1.118e-01), acc);
	acc = MulAdd(s1_2, M4(
		-1.650e-02, 6.203e-04, -1.322e-02, -1.996e-02,
		2.118e-02, -9.244e-03, 2.813e-02, 9.773e-03,
		-2.654e-02, -8.373e-02, 6.663e-04, -6.860e-02,
		-3.436e-02, -7.207e-01, 2.389e-01, 1.903e-01), acc);
	acc = MulAdd(s1_3, M4(
		-8.045e-02, -2.073e-02, 3.380e-02, 1.327e-02,
		1.247e-01, 1.129e-02, 6.421e-02, -8.326e-03,
		-4.675e-02, 4.920e-02, -3.699e-02, 4.601e-02,
		3.389e-02, -4.151e-02, 3.012e-02, -2.241e-02), acc);
	acc = MulAdd(s1_4, M4(
		5.223e-01, 1.394e-01, 1.222e-01, -7.687e-03,
		-3.115e-01, 3.989e-02, -1.679e-01, 2.607e-01,
		4.393e-01, -1.821e-01, 1.006e+00, -2.920e-01,
		8.062e-02, 2.231e-01, -1.282e-02, 2.495e-01), acc);
	acc = MulAdd(s1_5, M4(
		-1.146e-01, 6.738e-02, -1.655e-02, 1.178e-02,
		-3.058e-02, 1.093e-01, 9.367e-03, 1.382e-02,
		-7.397e-02, 2.300e-01, -4.202e-02, 1.765e-01,
		-4.671e-02, -1.375e-02, -3.662e-01, -5.254e-01), acc);
	acc = MulAdd(s1_6, M4(
		5.090e-02, 8.633e-03, -1.128e-02, -3.186e-02,
		-6.263e-02, 4.143e-02, -2.214e-02, 5.270e-02,
		-1.370e-02, -1.692e-02, -2.644e-02, -9.847e-03,
		-2.147e-03, -7.941e-03, -1.323e-04, -5.173e-03), acc);
	acc = MulAdd(s1_7, M4(
		-9.353e-02, 6.696e-02, 2.744e-01, 2.743e-01,
		9.809e-02, -1.439e-01, -2.583e-02, -3.717e-01,
		-5.135e-02, -1.889e-02, -1.775e-02, 9.383e-03,
		-2.496e-02, -2.936e-02, -2.578e-02, -1.586e-02), acc);
	acc = MulAdd(s1_8, M4(
		-1.565e-02, -1.635e-01, -1.800e-01, -2.607e-01,
		1.975e-02, 1.594e-02, -4.568e-02, 1.218e-01,
		-6.668e-03, 7.923e-03, -4.625e-02, 1.324e-02,
		-6.838e-03, 2.045e-02, 1.141e-02, 2.717e-02), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Output pass: runs f0 on the 3x3 neighbourhood read through l0 and writes a
// 2x2 block of OUTPUT pixels, adding one component of f0's result to the Y
// channel of the linearly sampled INPUT colour for each pixel.
void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	// Top-left pixel of the 2x2 output block handled by this thread.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 outSize = GetOutputSize();
	if (any(dst >= outSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw 3x3 neighbourhood, row by row.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// Negative part of every tap (components above zero collapse to zero).
	V4 n0 = -max(-c0, 0.0);
	V4 n1 = -max(-c1, 0.0);
	V4 n2 = -max(-c2, 0.0);
	V4 n3 = -max(-c3, 0.0);
	V4 n4 = -max(-c4, 0.0);
	V4 n5 = -max(-c5, 0.0);
	V4 n6 = -max(-c6, 0.0);
	V4 n7 = -max(-c7, 0.0);
	V4 n8 = -max(-c8, 0.0);

	// Positive part of every tap.
	V4 p0 = max(c0, 0.0);
	V4 p1 = max(c1, 0.0);
	V4 p2 = max(c2, 0.0);
	V4 p3 = max(c3, 0.0);
	V4 p4 = max(c4, 0.0);
	V4 p5 = max(c5, 0.0);
	V4 p6 = max(c6, 0.0);
	V4 p7 = max(c7, 0.0);
	V4 p8 = max(c8, 0.0);

	V4 res = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 opt = float2(GetOutputPt());

	// The block is walked clockwise, moving dst and pos in lock-step; the
	// component of res used at each stop is x, y, w, z in that order.

	// Top-left pixel.
	pos -= 0.5f * opt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.x), yuv.yz)), 1);

	// Top-right pixel.
	++dst.x;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.y), yuv.yz)), 1);

	// Bottom-right pixel.
	++dst.y;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.w), yuv.yz)), 1);

	// Bottom-left pixel.
	--dst.x;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.z), yuv.yz)), 1);
}
|
||||||
2206
src/Effects/CuNNy/CuNNy-4x16C-NVL-DN.hlsl
Normal file
2206
src/Effects/CuNNy/CuNNy-4x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
2206
src/Effects/CuNNy/CuNNy-4x16C-NVL.hlsl
Normal file
2206
src/Effects/CuNNy/CuNNy-4x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
484
src/Effects/CuNNy/CuNNy-4x4C-NVL-DN.hlsl
Normal file
484
src/Effects/CuNNy/CuNNy-4x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,484 @@
|
||||||
|
// CuNNy 4x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-DN-D04N04
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// Samples texture `t` with the point sampler SP at `pos + p * pt`, mip level 0.
// Relies on `pos` and `pt` being in scope wherever the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// Short alias for the 4-component vector type MF4 used by every pass.
#define V4 MF4
|
||||||
|
// Short alias for the 4x4 matrix type MF4x4 that holds the layer weights.
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// One scalar tap for pass 1: a fixed weighted sum of the RGB channels of INPUT
// sampled at offset (x, y) via O, plus a constant bias.
#define l0(x, y) (dot(MF3(2.428e-01, 4.714e-01, 1.229e-01), O(INPUT, float2(x, y)).rgb) + MF(-7.696e-02))
|
||||||
|
|
||||||
|
// Pass 1 layer: starts from a constant bias vector and adds each of the nine
// scalar taps scaled by its own constant 4-vector, in tap order s0_0..s0_8.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(1.690e-02, 8.856e-03, -9.136e-04, 2.267e-02);
	acc = mad(s0_0, V4(9.154e-02, 3.758e-01, 2.353e-02, -5.798e-02), acc);
	acc = mad(s0_1, V4(-5.382e-01, 1.688e-01, -1.190e-01, 4.082e-02), acc);
	acc = mad(s0_2, V4(2.460e-02, -5.810e-02, 7.788e-02, 3.018e-02), acc);
	acc = mad(s0_3, V4(1.211e-01, -1.552e-01, -9.990e-02, 3.963e-02), acc);
	acc = mad(s0_4, V4(-2.611e-01, -4.835e-01, -6.965e-01, -4.893e-01), acc);
	acc = mad(s0_5, V4(-3.017e-01, -4.435e-02, 1.836e-01, 4.600e-01), acc);
	acc = mad(s0_6, V4(1.275e-01, 2.485e-01, 7.354e-02, -4.648e-02), acc);
	acc = mad(s0_7, V4(2.527e-01, 1.279e-01, 3.053e-01, 3.957e-02), acc);
	acc = mad(s0_8, V4(1.003e-02, 1.193e-01, 2.476e-01, -2.051e-02), acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 ("in"): gathers the nine l0 taps around this thread's texel and
// stores f0's 4-channel result in t0.
void Pass1(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();
	if (any(texel >= inSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = (texel + 0.5) * pt;

	// 3x3 neighbourhood, row by row.
	MF a0 = l0(-1.0, -1.0);
	MF a1 = l0(0.0, -1.0);
	MF a2 = l0(1.0, -1.0);
	MF a3 = l0(-1.0, 0.0);
	MF a4 = l0(0.0, 0.0);
	MF a5 = l0(1.0, 0.0);
	MF a6 = l0(-1.0, 1.0);
	MF a7 = l0(0.0, 1.0);
	MF a8 = l0(1.0, 1.0);

	t0[texel] = f0(a0, a1, a2, a3, a4, a5, a6, a7, a8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap for pass 2: the four channels of t0 sampled at offset (x, y) via O.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 layer: starts from a constant bias and folds in eighteen taps, each
// paired with its own constant 4x4 matrix. Pass2 supplies the positive parts of
// the 3x3 neighbourhood as s0_* and the negative parts as s1_*.
// MulAdd is provided through the `//!USE MulAdd` directive; presumably it is
// mul(tap, matrix) + accumulator - confirm against Magpie's definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-4.697e-03, -2.213e-02, 3.898e-01, -1.481e-02);
	acc = MulAdd(s0_0, M4(-4.540e-03, -2.499e-01, 4.202e-02, 1.132e-02, 2.910e-02, -3.788e-02, 3.330e-02, -2.254e-02, -1.953e-01, 1.226e-01, -1.907e-01, -1.378e-01, 9.555e-02, -2.443e-01, 6.124e-02, -7.256e-03), acc);
	acc = MulAdd(s0_1, M4(-1.225e-01, -1.812e-01, -1.238e-02, 4.088e-01, -9.977e-02, 4.395e-02, -2.394e-02, -5.584e-03, 2.939e-01, 4.102e-01, 6.228e-02, 3.822e-01, 8.618e-02, -1.109e-01, 1.776e-01, -7.505e-02), acc);
	acc = MulAdd(s0_2, M4(2.047e-01, -6.853e-02, 1.880e-02, -9.030e-03, 1.505e-01, 7.782e-02, 1.347e-02, 5.566e-01, -6.951e-02, -1.352e-01, 1.941e-03, 3.975e-02, 1.637e-01, 6.708e-02, 1.501e-02, 1.373e-01), acc);
	acc = MulAdd(s0_3, M4(-1.974e-01, 1.068e-01, -1.102e-01, 5.909e-02, 2.355e-03, 1.275e-01, -5.986e-02, -5.288e-02, 8.785e-04, -1.440e-01, -3.369e-01, -9.128e-02, 2.030e-01, 4.937e-01, -1.637e-01, 4.814e-02), acc);
	acc = MulAdd(s0_4, M4(-3.954e-01, 4.772e-01, -5.841e-01, -8.070e-02, -2.056e-01, -2.335e-01, -2.091e-01, 1.223e-01, -2.686e-01, 1.240e+00, 7.095e-02, 6.502e-01, 1.044e-01, -3.071e-01, -2.892e-01, 4.861e-01), acc);
	acc = MulAdd(s0_5, M4(5.943e-02, 2.245e-01, 4.014e-01, -1.063e-01, -1.869e-01, 1.384e-01, 2.996e-01, -1.928e-01, 1.212e-01, 2.849e-01, 2.093e-01, -3.821e-01, -8.705e-02, 1.976e-01, 5.176e-01, -7.461e-02), acc);
	acc = MulAdd(s0_6, M4(1.048e-01, 2.374e-02, 2.730e-01, 1.446e-01, -5.406e-02, -1.587e-02, -2.014e-01, -3.422e-02, -2.114e-01, -5.198e-01, 2.674e-02, -6.078e-02, -2.293e-01, -9.914e-02, -2.110e-01, 7.008e-02), acc);
	acc = MulAdd(s0_7, M4(5.799e-02, 4.932e-01, 4.559e-01, -3.118e-02, 4.706e-02, -2.242e-01, -3.165e-01, -9.912e-02, 4.041e-01, 7.241e-01, -1.696e-01, 1.990e-01, 4.697e-01, 9.965e-03, -1.141e-02, -1.365e-02), acc);
	acc = MulAdd(s0_8, M4(-1.744e-01, -7.119e-02, 3.632e-01, -2.802e-01, -3.155e-01, 4.455e-01, -1.866e-02, -2.667e-02, 1.255e-01, -5.762e-01, -2.226e-02, 2.812e-02, -2.349e-01, 1.552e-01, -6.424e-03, 7.450e-02), acc);
	acc = MulAdd(s1_0, M4(6.159e-02, -4.426e-02, 2.277e-02, 1.040e-01, -6.306e-04, -1.704e-01, 3.807e-02, -8.670e-02, -1.403e-01, 1.644e-01, -9.679e-02, -1.055e-01, 2.394e-01, -5.504e-02, 8.006e-02, 6.312e-02), acc);
	acc = MulAdd(s1_1, M4(-1.134e-01, -1.030e-01, -2.777e-02, 2.955e-01, -1.225e-01, -4.096e-02, -2.748e-02, 9.404e-02, 2.890e-01, -2.441e-01, 1.560e-01, 1.694e-01, 1.853e-01, 3.311e-01, 3.408e-01, -8.678e-02), acc);
	acc = MulAdd(s1_2, M4(1.821e-01, 3.898e-02, -2.560e-02, 1.160e-01, 2.382e-01, -1.638e-01, -1.345e-01, 3.193e-01, -1.839e-01, -2.638e-01, 5.265e-02, 2.415e-01, 2.803e-01, 1.919e-01, -7.340e-02, 1.762e-02), acc);
	acc = MulAdd(s1_3, M4(-2.606e-01, -1.263e-01, -3.067e-02, -1.695e-02, 4.665e-03, 2.947e-02, -1.965e-02, -2.658e-02, -7.935e-02, -1.566e-01, -3.246e-01, -1.075e-03, 1.896e-01, -2.937e-01, -1.020e-01, -1.513e-01), acc);
	acc = MulAdd(s1_4, M4(-3.696e-01, 8.901e-02, -1.890e-01, -2.804e-02, -2.998e-01, -6.597e-02, -2.613e-01, 3.877e-01, -1.032e+00, -2.328e-01, 7.941e-02, 5.733e-01, 8.618e-02, 4.213e-02, -1.242e+00, 5.861e-01), acc);
	acc = MulAdd(s1_5, M4(1.919e-02, -5.609e-02, 3.295e-01, -2.364e-01, -4.238e-01, -6.041e-01, 3.389e-01, -4.460e-01, 4.482e-02, 1.077e-03, 8.990e-02, -2.725e-01, -4.829e-02, 1.184e-01, 1.941e-01, -3.646e-01), acc);
	acc = MulAdd(s1_6, M4(2.968e-01, 2.018e-01, 2.695e-01, 8.891e-02, -5.857e-02, 6.005e-02, -2.440e-01, -1.349e-02, -7.572e-02, -3.213e-01, 6.274e-02, -1.229e-02, -7.589e-01, -2.313e-01, -1.627e-01, 2.538e-01), acc);
	acc = MulAdd(s1_7, M4(-5.728e-02, 1.333e-01, 2.492e-01, -3.609e-02, 1.936e-01, -1.276e-01, -3.034e-01, -1.091e-01, 1.390e-01, 3.356e-01, -1.183e-01, 2.047e-01, 3.779e-01, -3.353e-01, 2.019e-01, 4.337e-02), acc);
	acc = MulAdd(s1_8, M4(-1.386e-01, 1.179e-01, 2.340e-01, -1.604e-01, -4.890e-01, -5.407e-01, -1.546e-01, -1.826e-01, 1.596e-01, -1.784e-01, 5.777e-02, 3.961e-02, -2.290e-01, 2.752e-01, -4.260e-02, 9.649e-02), acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 ("conv1"): reads the 3x3 neighbourhood of t0 around this thread's
// texel, splits every tap into its positive and negative parts, and stores
// f0's result in t1.
void Pass2(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();
	if (any(texel >= inSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood, row by row.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// Negative part of every tap (components above zero collapse to zero).
	V4 n0 = -max(-c0, 0.0);
	V4 n1 = -max(-c1, 0.0);
	V4 n2 = -max(-c2, 0.0);
	V4 n3 = -max(-c3, 0.0);
	V4 n4 = -max(-c4, 0.0);
	V4 n5 = -max(-c5, 0.0);
	V4 n6 = -max(-c6, 0.0);
	V4 n7 = -max(-c7, 0.0);
	V4 n8 = -max(-c8, 0.0);

	// Positive part of every tap.
	V4 p0 = max(c0, 0.0);
	V4 p1 = max(c1, 0.0);
	V4 p2 = max(c2, 0.0);
	V4 p3 = max(c3, 0.0);
	V4 p4 = max(c4, 0.0);
	V4 p5 = max(c5, 0.0);
	V4 p6 = max(c6, 0.0);
	V4 p7 = max(c7, 0.0);
	V4 p8 = max(c8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap for pass 3: the four channels of t1 sampled at offset (x, y) via O.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 3 layer: starts from a constant bias and folds in eighteen taps, each
// paired with its own constant 4x4 matrix. Pass3 supplies the positive parts of
// the 3x3 neighbourhood as s0_* and the negative parts as s1_*.
// MulAdd is provided through the `//!USE MulAdd` directive; presumably it is
// mul(tap, matrix) + accumulator - confirm against Magpie's definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-8.341e-03, 1.434e-02, 5.791e-03, -1.033e-02);
	acc = MulAdd(s0_0, M4(-1.362e-01, -5.847e-02, 2.766e-02, 2.969e-02, 9.796e-02, 6.555e-02, -3.067e-02, -5.139e-02, 1.512e-01, 1.401e-01, -3.820e-03, 2.649e-02, -1.802e-01, -2.099e-02, -6.604e-02, 4.042e-02), acc);
	acc = MulAdd(s0_1, M4(-2.144e-01, -1.437e-01, 4.670e-02, -2.348e-01, 9.990e-02, -5.186e-02, 1.658e-01, 9.557e-02, -1.353e-01, -1.146e-01, -9.837e-02, -8.956e-02, 1.229e-01, 2.354e-01, -2.342e-01, -1.343e-01), acc);
	acc = MulAdd(s0_2, M4(5.918e-01, 2.130e-02, 5.753e-01, -6.941e-02, -3.156e-02, -4.438e-02, -6.348e-02, 2.682e-02, -1.078e-02, 9.727e-03, 8.472e-02, 1.460e-01, -1.921e-01, 1.872e-01, 6.067e-02, 3.762e-02), acc);
	acc = MulAdd(s0_3, M4(1.341e-01, 1.082e-01, -4.460e-02, -1.008e-02, -1.262e-01, -7.942e-02, 5.610e-02, 4.418e-02, -1.725e-01, -1.158e-01, 6.377e-03, -1.171e-01, -3.447e-02, 4.459e-02, 2.822e-04, -7.623e-02), acc);
	acc = MulAdd(s0_4, M4(1.994e-01, -2.251e-01, -2.432e-01, 2.467e-02, 3.717e-02, 3.275e-01, 2.005e-01, 1.427e-01, 1.122e-01, 2.864e-01, 1.478e-01, 3.701e-01, 3.111e-01, -1.704e-01, -1.410e-01, -7.490e-01), acc);
	acc = MulAdd(s0_5, M4(-1.392e-01, -2.284e-02, 2.819e-01, -5.560e-02, -2.624e-01, 7.282e-02, -2.417e-01, -5.534e-02, -6.351e-03, -1.714e-01, -1.505e-01, -3.035e-01, -3.580e-02, 4.429e-02, 1.628e-01, -1.101e-01), acc);
	acc = MulAdd(s0_6, M4(8.306e-04, 3.258e-02, -2.746e-02, -3.143e-02, -1.301e-02, -5.828e-02, 2.411e-03, 1.395e-02, 3.728e-02, -8.319e-02, 3.326e-02, 1.294e-01, -6.226e-02, 5.103e-02, -1.218e-02, 2.411e-01), acc);
	acc = MulAdd(s0_7, M4(-6.323e-02, -1.343e-02, 3.400e-02, -1.727e-02, 3.683e-02, 6.325e-02, 4.834e-04, 3.849e-02, 9.424e-03, -2.010e-02, -3.447e-02, -1.330e-01, -4.107e-01, -7.682e-02, 4.138e-01, 5.994e-02), acc);
	acc = MulAdd(s0_8, M4(7.556e-02, 1.846e-02, 1.847e-02, 1.057e-01, -1.140e-01, -2.834e-02, -3.141e-02, -1.045e-01, -2.025e-02, 4.729e-02, -2.822e-02, -4.072e-02, 3.368e-01, 6.871e-02, 1.184e-01, 1.536e-01), acc);
	acc = MulAdd(s1_0, M4(-6.688e-02, 2.483e-02, 1.598e-01, -4.834e-02, 2.141e-01, -4.911e-02, -4.452e-02, -4.879e-02, -9.473e-01, 6.527e-01, -6.118e-01, -2.436e-01, -3.017e-02, -3.402e-01, 1.343e-01, 9.397e-02), acc);
	acc = MulAdd(s1_1, M4(-1.330e-01, 2.557e-01, 6.838e-02, -3.936e-01, 4.806e-01, 1.828e-01, 5.073e-01, 4.502e-01, -1.404e+00, -2.954e-01, -6.745e-02, 5.594e-02, 2.640e-01, 2.330e-02, 1.331e-02, -2.700e-02), acc);
	acc = MulAdd(s1_2, M4(2.695e-01, -1.004e-01, 9.104e-02, -4.919e-01, 3.357e-01, 4.895e-02, 4.062e-01, -3.494e-02, -4.352e-01, -1.232e-01, 8.889e-03, 3.472e-01, -1.174e-01, 7.690e-02, 6.341e-02, 9.255e-02), acc);
	acc = MulAdd(s1_3, M4(1.805e-01, 2.494e-01, 3.474e-02, 3.930e-02, 2.671e-02, -1.438e-02, 7.294e-02, 4.854e-02, -2.864e+00, -5.832e-01, 4.350e-01, -4.265e-01, -2.643e-02, -6.234e-01, 1.283e-01, 5.168e-02), acc);
	acc = MulAdd(s1_4, M4(-2.192e-01, 2.982e-01, -2.860e-01, -4.050e-01, 8.612e-02, 5.008e-02, 5.366e-01, 5.256e-01, -6.222e-01, 1.169e+00, 1.897e+00, 3.009e+00, 9.105e-02, -2.369e-01, -4.718e-01, -2.725e-01), acc);
	acc = MulAdd(s1_5, M4(-7.441e-01, -1.820e-01, -5.828e-02, -6.348e-01, 5.721e-01, 1.143e-01, 2.871e-01, 3.254e-01, -1.446e-01, 1.446e-01, -8.526e-02, 7.228e-01, -9.749e-02, -1.665e-01, -1.116e-01, -2.705e-01), acc);
	acc = MulAdd(s1_6, M4(-6.357e-02, -2.576e-02, 1.277e-02, -3.956e-02, 2.724e-02, -2.141e-02, 9.778e-02, 7.199e-03, -1.153e+00, -6.945e-01, -4.788e-01, -1.246e+00, 1.909e-01, 1.315e-01, 4.454e-02, 2.678e-01), acc);
	acc = MulAdd(s1_7, M4(-1.022e-01, 1.572e-01, 9.404e-02, 6.768e-02, 2.191e-01, -3.163e-02, 1.257e-01, 1.058e-01, -6.394e-01, 7.223e-03, -6.930e-01, -2.963e-01, -2.666e-01, 3.461e-03, 2.203e-01, -1.212e-01), acc);
	acc = MulAdd(s1_8, M4(-1.179e-01, 7.311e-02, 1.371e-01, -4.039e-02, 2.171e-01, 3.131e-02, 2.219e-01, 1.564e-02, -4.895e-01, -5.067e-03, -4.528e-01, 5.694e-02, 6.858e-02, 6.808e-03, -1.017e-01, 6.675e-03), acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 3 ("conv2"): reads the 3x3 neighbourhood of t1 around this thread's
// texel, splits every tap into its positive and negative parts, and stores
// f0's result in t0.
void Pass3(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();
	if (any(texel >= inSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood, row by row.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// Negative part of every tap (components above zero collapse to zero).
	V4 n0 = -max(-c0, 0.0);
	V4 n1 = -max(-c1, 0.0);
	V4 n2 = -max(-c2, 0.0);
	V4 n3 = -max(-c3, 0.0);
	V4 n4 = -max(-c4, 0.0);
	V4 n5 = -max(-c5, 0.0);
	V4 n6 = -max(-c6, 0.0);
	V4 n7 = -max(-c7, 0.0);
	V4 n8 = -max(-c8, 0.0);

	// Positive part of every tap.
	V4 p0 = max(c0, 0.0);
	V4 p1 = max(c1, 0.0);
	V4 p2 = max(c2, 0.0);
	V4 p3 = max(c3, 0.0);
	V4 p4 = max(c4, 0.0);
	V4 p5 = max(c5, 0.0);
	V4 p6 = max(c6, 0.0);
	V4 p7 = max(c7, 0.0);
	V4 p8 = max(c8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap for pass 4: the four channels of t0 sampled at offset (x, y) via O.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 4 layer: starts from a constant bias and folds in eighteen taps, each
// paired with its own constant 4x4 matrix. Pass4 supplies the positive parts of
// the 3x3 neighbourhood as s0_* and the negative parts as s1_*.
// MulAdd is provided through the `//!USE MulAdd` directive; presumably it is
// mul(tap, matrix) + accumulator - confirm against Magpie's definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(1.370e-02, 1.151e-02, 2.567e-03, -1.881e-03);
	acc = MulAdd(s0_0, M4(-6.123e-02, 9.666e-03, 4.969e-02, 3.030e-02, 1.714e-02, -3.117e-02, -9.470e-02, 2.078e-03, 4.109e-02, -5.560e-02, 3.757e-02, -3.667e-03, -3.500e-02, -8.151e-02, 1.104e-01, -1.219e-01), acc);
	acc = MulAdd(s0_1, M4(9.596e-02, -6.361e-02, 1.162e-02, -3.138e-02, -1.277e-02, -4.005e-02, 1.805e-02, -1.459e-02, -7.903e-03, 1.138e-02, 1.542e-02, -2.357e-02, -1.421e-01, -2.953e-01, 1.322e-01, 6.480e-03), acc);
	acc = MulAdd(s0_2, M4(1.571e-01, -1.081e-01, 1.345e-01, -5.616e-02, -1.211e-02, 4.515e-02, 1.797e-02, 6.143e-02, -9.605e-02, 7.782e-02, -1.421e-01, 3.195e-02, 1.841e-01, -7.735e-02, 1.082e-01, 1.785e-02), acc);
	acc = MulAdd(s0_3, M4(1.739e-03, -4.187e-02, 1.093e-01, 1.042e-01, -6.538e-03, 5.025e-02, -7.052e-03, -1.033e-01, -1.394e-01, -4.638e-01, 4.354e-02, -1.188e-02, 7.809e-04, 2.484e-01, -8.330e-01, -2.787e-01), acc);
	acc = MulAdd(s0_4, M4(-6.489e-03, -6.309e-01, 7.169e-01, 1.557e-01, 1.478e-01, 2.977e-01, -2.818e-01, 5.129e-02, 7.598e-01, 8.124e-01, -1.262e-02, -1.325e-01, -2.764e-01, 3.485e-01, 4.717e-01, -2.467e-01), acc);
	acc = MulAdd(s0_5, M4(2.022e-02, -1.396e-01, 1.865e-01, 1.568e-02, 3.924e-01, -2.466e-01, 4.990e-01, 3.971e-02, -1.176e-01, 1.792e-01, -2.861e-01, 3.555e-02, -1.428e-01, 2.528e-01, -2.085e-01, -1.311e-01), acc);
	acc = MulAdd(s0_6, M4(3.340e-02, -1.203e-01, 1.014e-01, 1.154e-01, -9.031e-03, -5.586e-02, -5.700e-03, 2.391e-02, -3.509e-01, 6.729e-02, 1.004e-01, -3.277e-01, 1.026e-01, 3.286e-03, -6.603e-02, -3.238e-03), acc);
	acc = MulAdd(s0_7, M4(-6.854e-01, 1.013e-01, -6.298e-02, -5.464e-01, 2.486e-01, -2.186e-01, 3.986e-02, 3.800e-01, -1.267e-01, 1.037e-01, 1.538e-01, -2.069e-01, 9.431e-02, 5.337e-02, -8.507e-02, 2.015e-01), acc);
	acc = MulAdd(s0_8, M4(-5.009e-03, 1.493e-01, -3.010e-02, -2.429e-02, -3.137e-01, -2.276e-01, 1.556e-01, 1.452e-02, 2.063e-01, 3.699e-02, -1.675e-03, 8.221e-02, -6.732e-02, 8.296e-02, -8.474e-02, -1.458e-01), acc);
	acc = MulAdd(s1_0, M4(-3.003e-02, -9.777e-03, 1.239e-02, -3.907e-02, 1.841e-01, -8.959e-02, 9.257e-02, 1.333e-01, 5.703e-04, -1.367e-01, -1.026e-01, 6.398e-02, 1.262e-02, 1.101e-02, 4.291e-02, -4.238e-02), acc);
	acc = MulAdd(s1_1, M4(5.516e-02, 9.884e-04, -5.383e-02, -1.048e-02, 2.529e-01, 9.819e-02, 1.255e-01, 3.149e-02, -8.249e-02, -1.386e-02, 6.214e-02, 2.957e-02, 1.001e-01, 1.590e-01, 1.159e-02, 5.273e-02), acc);
	acc = MulAdd(s1_2, M4(4.571e-02, -6.277e-03, 1.496e-01, -4.044e-02, 4.089e-02, -3.801e-02, -3.690e-02, -1.037e-01, -6.031e-02, 2.117e-03, -9.644e-02, 6.392e-02, 5.093e-02, -2.512e-02, 1.131e-01, 1.304e-01), acc);
	acc = MulAdd(s1_3, M4(-3.118e-02, 2.185e-02, 1.763e-01, 8.327e-02, 6.337e-02, 8.724e-02, 6.808e-02, -4.070e-01, -6.922e-02, -2.417e-01, -1.175e-01, -1.845e-01, -3.773e-03, -1.869e-01, -9.345e-02, -2.340e-01), acc);
	acc = MulAdd(s1_4, M4(-1.159e-01, -4.476e-01, 2.989e-01, 2.794e-01, 5.756e-01, -4.803e-01, -5.979e-02, -1.959e-01, 5.261e-02, -2.399e-01, -6.616e-02, -9.243e-01, 4.622e-01, 1.139e-01, 2.482e-01, 2.254e-01), acc);
	acc = MulAdd(s1_5, M4(1.064e-01, -1.989e-02, 8.581e-02, 3.218e-02, 3.344e-01, -5.684e-01, 4.009e-01, 4.482e-01, 7.737e-02, 8.716e-02, -1.382e-01, -7.145e-02, -1.225e-01, 1.471e-01, -1.866e-01, 3.674e-02), acc);
	acc = MulAdd(s1_6, M4(5.376e-02, -6.192e-03, -1.760e-01, 7.590e-02, -3.279e-02, -1.888e-01, 2.057e-01, 2.114e-01, -3.941e-01, 5.584e-03, 9.400e-03, -4.289e-01, -2.289e-01, 1.880e-01, 3.184e-02, -4.442e-01), acc);
	acc = MulAdd(s1_7, M4(-4.174e-01, -1.344e-01, 3.866e-02, 4.521e-02, -4.215e-01, 1.479e-01, 2.476e-01, -7.051e-01, -4.153e-01, 3.373e-01, 8.098e-02, -6.680e-01, 3.920e-01, -1.023e-01, -2.166e-02, 3.816e-01), acc);
	acc = MulAdd(s1_8, M4(-3.441e-02, 3.404e-03, -4.958e-02, 9.652e-03, -1.930e-02, -2.470e-01, 1.610e-01, 1.112e-01, 2.574e-02, 2.310e-01, 3.643e-02, -5.044e-02, 7.788e-02, 1.923e-03, -7.115e-02, -6.575e-03), acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 4 ("conv3"): reads the 3x3 neighbourhood of t0 around this thread's
// texel, splits every tap into its positive and negative parts, and stores
// f0's result in t1.
void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();
	if (any(texel >= inSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood, row by row.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// Negative part of every tap (components above zero collapse to zero).
	V4 n0 = -max(-c0, 0.0);
	V4 n1 = -max(-c1, 0.0);
	V4 n2 = -max(-c2, 0.0);
	V4 n3 = -max(-c3, 0.0);
	V4 n4 = -max(-c4, 0.0);
	V4 n5 = -max(-c5, 0.0);
	V4 n6 = -max(-c6, 0.0);
	V4 n7 = -max(-c7, 0.0);
	V4 n8 = -max(-c8, 0.0);

	// Positive part of every tap.
	V4 p0 = max(c0, 0.0);
	V4 p1 = max(c1, 0.0);
	V4 p2 = max(c2, 0.0);
	V4 p3 = max(c3, 0.0);
	V4 p4 = max(c4, 0.0);
	V4 p5 = max(c5, 0.0);
	V4 p6 = max(c6, 0.0);
	V4 p7 = max(c7, 0.0);
	V4 p8 = max(c8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap for pass 5: the four channels of t1 sampled at offset (x, y) via O.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 5 layer: starts from a constant bias and folds in eighteen taps, each
// paired with its own constant 4x4 matrix. Pass5 supplies the positive parts of
// the 3x3 neighbourhood as s0_* and the negative parts as s1_*.
// MulAdd is provided through the `//!USE MulAdd` directive; presumably it is
// mul(tap, matrix) + accumulator - confirm against Magpie's definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(7.249e-03, 2.949e-03, 5.297e-03, 3.693e-03);
	acc = MulAdd(s0_0, M4(2.376e-02, 2.931e-02, 7.304e-02, -5.238e-02, -6.500e-03, -3.887e-02, 2.506e-02, 5.201e-03, 5.599e-02, -1.951e-01, -3.847e-01, 8.685e-02, -1.106e-01, -3.954e-02, 1.571e-01, 2.293e-02), acc);
	acc = MulAdd(s0_1, M4(-2.738e-02, 1.554e-01, 1.120e-01, 1.856e-02, 9.513e-03, -2.222e-01, -2.174e-01, -1.065e-02, 3.001e-02, 7.638e-02, -7.497e-02, -2.727e-02, -1.521e-02, 1.843e-01, 3.547e-01, -1.642e-02), acc);
	acc = MulAdd(s0_2, M4(-2.533e-02, -1.959e-02, -6.274e-02, 8.121e-03, -8.703e-03, 5.091e-02, 6.548e-02, 1.988e-02, 4.089e-02, -4.827e-02, -4.089e-02, -4.361e-02, -1.112e-02, -1.101e-02, 2.968e-02, -2.196e-03), acc);
	acc = MulAdd(s0_3, M4(1.813e-02, -2.087e-01, -2.474e-01, -1.066e-01, 2.549e-01, 6.466e-01, 3.169e-01, -1.109e-01, -1.551e-02, -3.119e-01, -3.959e-01, 2.141e-01, 1.121e-01, 3.268e-01, 1.038e-01, -5.818e-02), acc);
	acc = MulAdd(s0_4, M4(-3.147e-01, 2.716e-01, 1.304e-01, 3.887e-01, 9.396e-02, -9.787e-02, -1.596e-01, -7.138e-02, -2.462e-01, -3.027e-01, 6.980e-01, -1.546e-01, 3.730e-02, -7.502e-02, -4.408e-02, 3.814e-02), acc);
	acc = MulAdd(s0_5, M4(-4.177e-02, -1.326e-02, -7.497e-02, 1.168e-03, 5.595e-03, 3.603e-02, 2.589e-02, -2.179e-02, 1.998e-02, -3.544e-03, 1.125e-01, 2.648e-03, -2.417e-02, -1.876e-02, 4.009e-02, 5.481e-02), acc);
	acc = MulAdd(s0_6, M4(-7.181e-02, -2.968e-02, -3.169e-02, -1.899e-02, -3.692e-02, -2.156e-02, 9.595e-02, 1.055e-01, -1.274e-01, -2.576e-02, 8.706e-02, 1.895e-01, 6.316e-04, -4.574e-02, 2.201e-02, 1.199e-01), acc);
	acc = MulAdd(s0_7, M4(-2.193e-01, 1.563e-02, 1.287e-01, 2.403e-01, 2.222e-01, -1.748e-02, 1.486e-02, -7.685e-02, 4.971e-01, 2.920e-01, -2.253e-01, -8.145e-01, 3.018e-01, -4.559e-02, -1.509e-01, -3.003e-01), acc);
	acc = MulAdd(s0_8, M4(1.685e-02, -1.082e-02, 3.539e-03, -2.765e-02, -5.968e-03, -4.628e-03, 3.847e-02, 6.426e-02, -6.284e-02, 5.455e-02, -3.291e-02, 1.636e-01, 5.828e-02, -5.613e-02, -4.404e-02, -1.715e-02), acc);
	acc = MulAdd(s1_0, M4(1.875e-02, 7.150e-02, 3.015e-02, -4.917e-02, 9.333e-03, -1.519e-01, -1.153e-01, 4.344e-02, -1.603e-02, -4.775e-02, -4.484e-02, 6.567e-02, -6.714e-02, 2.569e-01, 4.638e-01, 3.038e-02), acc);
	acc = MulAdd(s1_1, M4(-4.046e-02, 1.372e-01, 2.476e-01, 6.565e-02, 6.481e-04, -1.529e-02, 1.376e-02, 1.367e-02, 2.941e-04, 1.423e-01, 2.311e-01, 7.538e-03, -6.762e-02, -3.992e-01, -1.160e-02, 3.123e-02), acc);
	acc = MulAdd(s1_2, M4(-3.926e-02, 1.709e-04, -4.761e-02, -8.731e-03, 5.123e-03, 7.039e-02, 1.061e-01, -1.322e-03, 4.069e-02, -1.182e-01, -3.698e-04, -7.746e-02, -3.827e-02, 9.957e-02, 9.991e-02, 5.215e-02), acc);
	acc = MulAdd(s1_3, M4(-1.865e-01, -9.784e-01, -5.871e-01, 1.384e-01, 2.097e-01, -1.229e-01, -4.912e-01, -4.254e-02, 3.395e-04, -8.968e-02, -6.923e-02, -4.916e-02, 2.424e-01, 7.730e-01, 2.573e-01, -2.380e-01), acc);
	acc = MulAdd(s1_4, M4(-9.293e-01, 6.176e-01, 1.970e-01, 3.467e-01, 4.341e-01, 9.866e-01, 3.035e-01, -1.062e-01, -1.501e-01, 2.709e-01, 1.991e-01, -2.164e-01, 2.881e-01, -1.696e-01, -4.141e-01, -1.004e+00), acc);
	acc = MulAdd(s1_5, M4(-8.323e-02, -1.285e-02, -3.468e-02, 1.551e-01, 1.330e-01, -1.238e-01, -1.675e-03, 5.588e-02, 2.128e-01, -2.327e-01, -2.891e-02, 1.567e-01, -1.448e-01, 8.781e-02, 3.254e-02, 7.142e-02), acc);
	acc = MulAdd(s1_6, M4(1.231e-01, 5.139e-02, -9.426e-02, -2.822e-01, 1.761e-03, 6.853e-03, 1.165e-01, 7.861e-02, -9.715e-03, 5.489e-03, -1.066e-02, -8.332e-03, -9.111e-02, 3.911e-02, 1.757e-01, 2.222e-01), acc);
	acc = MulAdd(s1_7, M4(2.275e-02, 1.199e-01, 5.904e-02, -2.051e-01, 6.950e-01, 1.592e-02, -9.888e-02, -6.701e-01, -9.096e-02, 3.203e-02, 1.204e-01, 2.153e-01, 1.448e-01, -5.225e-03, 6.786e-02, 2.005e-02), acc);
	acc = MulAdd(s1_8, M4(-3.290e-02, -3.758e-02, -3.158e-02, 8.713e-02, 3.917e-02, 4.275e-02, -2.450e-02, 3.970e-02, 1.928e-01, 5.498e-02, -5.673e-02, -3.743e-01, 4.981e-02, -1.785e-02, 1.958e-02, 3.487e-02), acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 5 ("conv4"): reads the 3x3 neighbourhood of t1 around this thread's
// texel, splits every tap into its positive and negative parts, and stores
// f0's result in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();
	if (any(texel >= inSize)) {
		return;
	}
	// `pos` and `pt` are read implicitly by the l0 sampling macro.
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood, row by row.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// Negative part of every tap (components above zero collapse to zero).
	V4 n0 = -max(-c0, 0.0);
	V4 n1 = -max(-c1, 0.0);
	V4 n2 = -max(-c2, 0.0);
	V4 n3 = -max(-c3, 0.0);
	V4 n4 = -max(-c4, 0.0);
	V4 n5 = -max(-c5, 0.0);
	V4 n6 = -max(-c6, 0.0);
	V4 n7 = -max(-c7, 0.0);
	V4 n8 = -max(-c8, 0.0);

	// Positive part of every tap.
	V4 p0 = max(c0, 0.0);
	V4 p1 = max(c1, 0.0);
	V4 p2 = max(c2, 0.0);
	V4 p3 = max(c3, 0.0);
	V4 p4 = max(c4, 0.0);
	V4 p5 = max(c5, 0.0);
	V4 p6 = max(c6, 0.0);
	V4 p7 = max(c7, 0.0);
	V4 p8 = max(c8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Output-layer kernel for pass 6 (out-shuffle).
// Starts from a constant bias, folds in the nine s0_* taps and then the nine
// s1_* taps with one MulAdd per tap (each with its own 4x4 constant matrix),
// and returns tanh of the accumulated vector.
// NOTE(review): MulAdd is supplied by the effect framework; presumably it
// computes mul(vector, matrix) + accumulator - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(2.510e-03, 4.409e-03, 2.891e-03, 4.977e-03);

	// s0_* taps.
	acc = MulAdd(s0_0, M4(
		2.340e-02, 8.171e-02, -1.124e-01, -5.065e-02,
		-5.505e-02, -5.540e-02, -3.000e-03, -1.346e-02,
		3.800e-02, 4.944e-02, -2.084e-02, 6.388e-03,
		8.566e-02, 2.480e-02, 1.184e-01, -1.075e-04), acc);
	acc = MulAdd(s0_1, M4(
		-2.188e-02, -2.056e-01, 1.480e-02, -7.451e-02,
		5.240e-02, 4.098e-02, -4.668e-03, 1.810e-02,
		-2.533e-02, -6.403e-02, 1.984e-02, -5.716e-02,
		-3.356e-03, -2.173e-01, 1.218e-01, 1.179e-01), acc);
	acc = MulAdd(s0_2, M4(
		7.330e-03, 2.521e-02, 1.372e-02, 3.411e-02,
		-1.438e-02, -1.009e-02, 7.676e-03, -1.712e-02,
		5.980e-03, 2.040e-02, -8.766e-03, 3.442e-02,
		-1.623e-02, -2.557e-02, -6.086e-03, 5.413e-04), acc);
	acc = MulAdd(s0_3, M4(
		1.754e-01, 6.364e-02, 2.842e-01, 2.378e-01,
		-1.684e-01, -1.911e-02, -3.838e-01, -2.622e-02,
		2.065e-01, 3.951e-02, 4.217e-01, 4.374e-02,
		-1.028e-02, 2.417e-02, -1.595e-02, 6.305e-02), acc);
	acc = MulAdd(s0_4, M4(
		-5.620e-02, -8.609e-02, -1.256e-01, -3.166e-01,
		-1.712e-01, -1.602e-01, -1.577e-01, -4.901e-01,
		-5.012e-02, 1.082e-01, -7.271e-02, 4.072e-01,
		-7.789e-02, -1.725e-01, -1.397e-01, -4.507e-01), acc);
	acc = MulAdd(s0_5, M4(
		1.401e-02, 4.716e-02, 1.486e-02, 4.642e-02,
		1.131e-02, 3.865e-02, -9.865e-03, 9.301e-02,
		3.441e-03, -8.098e-03, -6.012e-03, -1.549e-01,
		1.486e-02, 1.872e-02, -2.469e-03, 1.294e-02), acc);
	acc = MulAdd(s0_6, M4(
		-3.894e-02, -4.136e-05, -3.022e-02, 1.045e-03,
		-3.730e-02, -1.838e-02, -5.573e-02, -2.760e-02,
		3.516e-02, 1.602e-02, 6.358e-02, 3.111e-02,
		-3.045e-02, -7.728e-03, -4.189e-02, -1.102e-02), acc);
	acc = MulAdd(s0_7, M4(
		-1.184e-02, 1.728e-02, 7.925e-03, 6.763e-02,
		2.590e-03, -9.456e-03, -4.407e-02, -2.044e-02,
		4.472e-02, 2.228e-02, 7.233e-02, 4.863e-02,
		-1.814e-02, -2.034e-03, -4.994e-02, -2.460e-02), acc);
	acc = MulAdd(s0_8, M4(
		-3.292e-03, -9.015e-03, -3.171e-03, -2.504e-02,
		2.120e-03, 3.064e-02, 2.108e-02, 4.592e-02,
		2.258e-03, -2.192e-04, -3.576e-03, 3.733e-02,
		-1.931e-03, -5.083e-03, 5.877e-03, -1.764e-02), acc);

	// s1_* taps.
	acc = MulAdd(s1_0, M4(
		4.321e-02, -8.135e-02, -1.567e-01, -6.888e-03,
		-6.542e-02, -1.656e-02, 1.236e-02, -7.563e-03,
		4.657e-02, 9.222e-03, -6.696e-03, -3.545e-03,
		-6.401e-01, 1.189e-01, 1.509e-01, 2.417e-01), acc);
	acc = MulAdd(s1_1, M4(
		-2.058e-02, 1.174e-01, -2.482e-02, -8.423e-02,
		-1.692e-02, -1.094e-02, 3.530e-02, 1.780e-02,
		-9.937e-02, -9.030e-02, 2.304e-02, 1.294e-02,
		7.976e-02, -3.096e-01, 1.382e-01, 2.456e-01), acc);
	acc = MulAdd(s1_2, M4(
		4.491e-02, -1.336e-02, 3.593e-02, -3.503e-02,
		-8.630e-03, -4.295e-03, -1.356e-02, 3.843e-02,
		9.887e-03, 1.913e-03, 2.247e-03, 1.113e-02,
		-7.234e-04, -3.058e-02, 2.833e-03, -1.707e-02), acc);
	acc = MulAdd(s1_3, M4(
		2.007e-01, 6.756e-02, 9.393e-01, 9.057e-02,
		-3.701e-01, -1.729e-02, -4.136e-01, 2.233e-02,
		2.783e-01, 3.590e-02, 3.564e-01, 8.342e-03,
		1.333e-01, 7.944e-02, -2.312e-01, 8.354e-02), acc);
	acc = MulAdd(s1_4, M4(
		-3.334e-01, -2.705e-01, -4.072e-01, 3.946e-01,
		5.159e-03, -5.860e-01, 1.578e-01, -3.614e-01,
		5.366e-01, 4.699e-01, -3.700e-01, 9.463e-02,
		-4.090e-02, -9.767e-02, -7.999e-02, -4.859e-01), acc);
	acc = MulAdd(s1_5, M4(
		5.700e-02, 6.092e-02, 4.114e-02, -1.564e-02,
		-1.345e-02, 9.692e-02, 1.456e-03, 9.371e-02,
		-3.845e-02, -4.751e-02, -2.509e-02, -2.842e-01,
		2.938e-03, 2.387e-02, -6.191e-04, -3.120e-04), acc);
	acc = MulAdd(s1_6, M4(
		3.888e-02, 4.969e-02, -1.851e-01, -9.866e-03,
		-3.527e-02, -1.377e-02, -7.594e-02, -2.619e-02,
		3.259e-02, 9.636e-03, 8.622e-03, 1.788e-02,
		-3.505e-02, -1.048e-03, -1.329e-02, 1.425e-02), acc);
	acc = MulAdd(s1_7, M4(
		6.891e-03, 8.118e-02, -6.443e-02, -1.487e-01,
		2.183e-02, 1.106e-03, 6.656e-02, -9.506e-02,
		7.418e-04, -6.015e-02, 3.594e-01, 1.039e-02,
		-3.600e-02, -7.771e-03, -3.406e-02, 2.935e-02), acc);
	acc = MulAdd(s1_8, M4(
		-4.598e-03, -4.678e-03, 1.595e-02, -8.273e-03,
		6.740e-03, 1.175e-02, -2.997e-02, -6.116e-03,
		-3.788e-02, -9.471e-02, -2.149e-02, 4.139e-02,
		-9.614e-03, -5.573e-03, -1.643e-02, -1.712e-02), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Pass 6 entry point (out-shuffle). Each thread owns one 2x2 quad of OUTPUT:
// it evaluates f0() once on the 3x3 neighbourhood read through l0(), then for
// each of the four output pixels samples INPUT with the SL sampler, converts
// to YUV, adds one component of the f0() result to the first YUV channel
// (saturated), and converts back to RGB with alpha 1.
//
// blockStart: origin of the block handled by this thread group.
// tid:        thread id; only tid.x is used (remapped by Rmp8x8).
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE(review): the sampling macro behind l0() presumably reads `pos` and
	// `pt` by name, so those two identifiers must not be renamed.
	const float2 pt = float2(GetInputPt());
	// Top-left pixel of this thread's 2x2 output quad.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 outDims = GetOutputSize();

	const bool inside = dst.x < outDims.x && dst.y < outDims.y;
	if (!inside) {
		return;
	}

	// Centre of the source texel that this quad is derived from.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps, ordered y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// One component per pixel of the quad: x, y, w, z are consumed below.
	const V4 delta = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));

	static const MF3x3 kRgbToYuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 kYuvToRgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 opt = float2(GetOutputPt());

	// Move from the source-texel centre back by half an output pixel.
	pos -= 0.5f * opt;
	MF3 yuv = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Step right: pixel (+1, 0) of the quad.
	dst.x += 1;
	pos.x += opt.x;
	yuv = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Step down: pixel (+1, +1) of the quad.
	dst.y += 1;
	pos.y += opt.y;
	yuv = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Step left: pixel (0, +1) of the quad.
	dst.x -= 1;
	pos.x -= opt.x;
	yuv = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||||
484
src/Effects/CuNNy/CuNNy-4x4C-NVL.hlsl
Normal file
484
src/Effects/CuNNy/CuNNy-4x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,484 @@
|
||||||
|
// CuNNy 4x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-D04N04
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) (dot(MF3(-4.174e-01, -7.873e-01, -1.763e-01), O(INPUT, float2(x, y)).rgb) + MF(1.011e+00))
|
||||||
|
|
||||||
|
// Input-layer kernel for pass 1 ("in").
// Takes nine scalar taps and expands them to a 4-component vector: a constant
// bias plus, for each tap in order, tap * per-tap weight vector (via mad).
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Per-tap weight vectors, one per argument.
	const V4 w0 = V4(1.222e-01, 7.038e-03, 1.179e-01, 1.876e-01);
	const V4 w1 = V4(1.025e-01, -2.993e-01, 3.154e-01, -1.050e-01);
	const V4 w2 = V4(5.656e-02, -3.117e-03, -6.665e-02, -2.044e-01);
	const V4 w3 = V4(-5.045e-01, -4.189e-01, -3.076e-01, -3.691e-01);
	const V4 w4 = V4(1.365e-01, 6.699e-01, 3.389e-01, 4.561e-01);
	const V4 w5 = V4(-7.690e-02, 2.655e-02, -1.044e-02, 7.271e-02);
	const V4 w6 = V4(1.358e-02, 3.378e-03, -1.802e-01, -1.936e-01);
	const V4 w7 = V4(8.227e-02, 1.550e-02, -1.820e-01, -1.670e-01);
	const V4 w8 = V4(9.988e-03, 1.413e-03, -2.486e-02, 3.258e-01);

	// Bias, then accumulate in tap order (order kept fixed for identical rounding).
	V4 acc = V4(3.566e-02, -1.308e-03, -5.595e-03, -5.246e-03);
	acc = mad(s0_0, w0, acc);
	acc = mad(s0_1, w1, acc);
	acc = mad(s0_2, w2, acc);
	acc = mad(s0_3, w3, acc);
	acc = mad(s0_4, w4, acc);
	acc = mad(s0_5, w5, acc);
	acc = mad(s0_6, w6, acc);
	acc = mad(s0_7, w7, acc);
	acc = mad(s0_8, w8, acc);
	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 entry point ("in"). For the texel assigned to this thread it reads
// nine scalar taps from INPUT through l0() (a weighted sum of the sample's
// rgb plus a constant) and stores f0() of them into t0.
//
// blockStart: origin of the block handled by this thread group.
// tid:        thread id; only tid.x is used (remapped by Rmp8x8).
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O() sampling macro that l0()
	// expands to, so these two identifiers must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();

	// Threads that land outside the input extent write nothing.
	const bool inside = texel.x < bounds.x && texel.y < bounds.y;
	if (!inside) {
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Taps ordered y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const MF k0 = l0(-1.0, -1.0);
	const MF k1 = l0(0.0, -1.0);
	const MF k2 = l0(1.0, -1.0);
	const MF k3 = l0(-1.0, 0.0);
	const MF k4 = l0(0.0, 0.0);
	const MF k5 = l0(1.0, 0.0);
	const MF k6 = l0(-1.0, 1.0);
	const MF k7 = l0(0.0, 1.0);
	const MF k8 = l0(1.0, 1.0);

	t0[texel] = f0(k0, k1, k2, k3, k4, k5, k6, k7, k8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Kernel for pass 2 (conv1).
// Starts from a constant bias and folds in the nine s0_* taps followed by the
// nine s1_* taps, one MulAdd per tap with its own 4x4 constant matrix.
// NOTE(review): MulAdd comes from the framework (//!USE MulAdd); presumably it
// computes mul(vector, matrix) + accumulator - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-4.317e-03, 2.687e-03, -1.530e-03, 4.681e-04);

	// s0_* taps.
	acc = MulAdd(s0_0, M4(
		1.282e-01, 1.199e-01, 1.156e-01, -4.091e-02,
		-1.771e-02, -1.431e-01, -1.478e-02, 4.041e-02,
		-1.559e-01, 1.231e-02, -8.571e-02, 2.159e-02,
		-6.484e-02, 3.819e-02, -3.386e-02, -3.344e-02), acc);
	acc = MulAdd(s0_1, M4(
		6.131e-02, 1.493e-01, 1.954e-01, -2.565e-01,
		1.570e-01, -3.852e-01, -2.313e-01, 9.262e-02,
		1.038e-01, -4.169e-01, -2.446e-01, 9.953e-02,
		-1.830e-01, -9.774e-02, -1.498e-01, 8.626e-02), acc);
	acc = MulAdd(s0_2, M4(
		9.908e-02, 1.372e-01, -1.254e-02, 4.486e-03,
		1.023e-01, 6.484e-02, 1.645e-01, -4.932e-02,
		-4.221e-02, -1.919e-01, -2.135e-02, 6.955e-02,
		-1.406e-01, 8.082e-02, -7.935e-02, 3.010e-02), acc);
	acc = MulAdd(s0_3, M4(
		-7.203e-02, -1.210e-01, 1.084e-01, -6.958e-03,
		1.303e-01, 1.030e-01, -2.392e-01, -1.084e-01,
		2.173e-01, -7.864e-02, -2.983e-01, -3.510e-01,
		-3.076e-01, 4.533e-02, 1.940e-01, 4.051e-01), acc);
	acc = MulAdd(s0_4, M4(
		9.270e-02, -4.072e-01, 2.338e-01, 4.098e-01,
		-1.440e-01, 6.971e-01, 5.515e-01, 2.682e-01,
		-1.401e-01, 3.504e-02, 1.366e-01, 6.149e-01,
		-3.330e-01, 1.880e-01, -4.170e-01, 3.244e-01), acc);
	acc = MulAdd(s0_5, M4(
		-5.380e-01, -7.843e-02, -1.293e-01, -9.225e-02,
		1.393e-01, -2.588e-01, 4.618e-01, -2.264e-02,
		-5.369e-02, 1.321e-01, -3.029e-02, 7.983e-02,
		-1.048e-01, 3.279e-02, -5.969e-02, -3.766e-03), acc);
	acc = MulAdd(s0_6, M4(
		3.432e-02, 1.518e-02, 1.940e-02, -1.086e-01,
		1.052e-01, -5.430e-02, -3.343e-02, 1.824e-01,
		-9.831e-02, 1.097e-02, 6.281e-02, 1.194e-01,
		3.253e-02, 4.046e-02, -2.183e-02, -1.328e-01), acc);
	acc = MulAdd(s0_7, M4(
		1.538e-01, 6.796e-02, -4.870e-01, 7.139e-02,
		-2.497e-01, 2.916e-02, 6.191e-01, -2.650e-01,
		-4.194e-02, 1.782e-01, -3.431e-01, -9.707e-02,
		2.173e-02, -1.150e-01, -8.162e-03, 4.551e-02), acc);
	acc = MulAdd(s0_8, M4(
		5.804e-02, 5.436e-02, -1.604e-01, 8.077e-02,
		2.685e-01, 4.741e-02, 1.225e-01, -1.033e-01,
		-4.358e-02, -1.091e-01, 8.815e-02, -3.121e-02,
		-2.569e-02, -1.093e-02, -2.550e-02, -1.571e-02), acc);

	// s1_* taps.
	acc = MulAdd(s1_0, M4(
		8.760e-02, 1.254e-01, 9.299e-02, -1.140e-02,
		4.179e-02, -1.333e-01, 3.048e-03, -3.111e-02,
		-6.091e-02, 6.563e-03, 4.609e-03, -4.717e-02,
		-6.470e-02, -5.791e-02, -5.529e-03, 8.697e-02), acc);
	acc = MulAdd(s1_1, M4(
		6.935e-02, 9.805e-02, 1.851e-01, -2.726e-01,
		1.731e-01, -2.863e-01, -2.267e-01, -3.813e-02,
		1.104e-01, -3.193e-01, -1.958e-01, 9.567e-02,
		1.819e-01, -2.054e-01, 1.228e-01, 3.906e-02), acc);
	acc = MulAdd(s1_2, M4(
		-1.957e-01, 7.733e-02, -2.023e-01, 1.297e-01,
		-1.646e-01, 1.304e-01, -1.728e-02, -4.396e-02,
		7.828e-02, -2.639e-01, 3.389e-02, 1.101e-01,
		1.388e-01, -4.075e-03, 1.023e-01, -7.785e-03), acc);
	acc = MulAdd(s1_3, M4(
		-2.828e-02, -7.018e-02, 4.269e-02, -1.386e-01,
		2.143e-02, 2.504e-01, -2.134e-01, -2.483e-01,
		1.075e-01, -2.671e-02, -2.588e-01, -3.271e-01,
		1.173e-01, -6.103e-02, 5.539e-01, 5.341e-01), acc);
	acc = MulAdd(s1_4, M4(
		-2.415e-01, -2.975e-01, -6.622e-02, 4.027e-01,
		-5.871e-01, 7.506e-01, 1.939e-02, -1.680e-01,
		4.796e-01, -2.840e-01, 5.077e-01, 9.122e-02,
		1.463e-01, 2.124e-01, 6.358e-02, 2.993e-01), acc);
	acc = MulAdd(s1_5, M4(
		4.298e-01, -1.754e-01, 5.357e-01, -1.440e-01,
		-4.439e-01, -3.819e-01, -1.009e-01, 2.113e-02,
		-2.275e-02, -1.842e-02, 1.441e-01, 6.590e-03,
		2.627e-02, 3.381e-02, 9.956e-02, -1.935e-02), acc);
	acc = MulAdd(s1_6, M4(
		-5.557e-02, 3.378e-02, -2.451e-02, -1.718e-01,
		-2.037e-01, 1.631e-02, -2.822e-01, -7.724e-02,
		-6.657e-02, -2.282e-02, 2.673e-02, 8.716e-02,
		1.291e-01, 9.472e-03, 3.810e-02, -1.134e-01), acc);
	acc = MulAdd(s1_7, M4(
		1.441e-01, 4.331e-02, -4.741e-01, 2.165e-01,
		-5.974e-01, -2.669e-02, -4.949e-02, -3.179e-01,
		1.007e-01, 1.512e-01, -4.138e-02, -7.470e-02,
		8.828e-02, -1.400e-01, 5.797e-02, -4.988e-03), acc);
	acc = MulAdd(s1_8, M4(
		-2.478e-01, 1.392e-01, -8.663e-02, -3.629e-02,
		1.823e-01, 7.573e-03, -2.445e-01, -1.641e-02,
		-5.197e-02, -8.804e-02, 1.244e-01, 2.095e-02,
		1.683e-02, -4.073e-02, -5.207e-03, -3.854e-03), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 entry point (conv1): reads t0 through l0(), writes t1.
// For the texel assigned to this thread it gathers nine taps at offsets
// (-1..1, -1..1), splits each into max(x, 0) and -max(-x, 0), and stores
// f0() of those 18 vectors.
//
// blockStart: origin of the block handled by this thread group.
// tid:        thread id; only tid.x is used (remapped by Rmp8x8).
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O() sampling macro that l0()
	// expands to, so these two identifiers must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();

	// Threads that land outside the input extent write nothing.
	const bool inside = texel.x < bounds.x && texel.y < bounds.y;
	if (!inside) {
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, ordered y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: positive parts; last nine: negative parts.
	t1[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Kernel for pass 3 (conv2).
// Starts from a constant bias and folds in the nine s0_* taps followed by the
// nine s1_* taps, one MulAdd per tap with its own 4x4 constant matrix.
// NOTE(review): MulAdd comes from the framework (//!USE MulAdd); presumably it
// computes mul(vector, matrix) + accumulator - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-1.043e-03, 3.601e-03, 5.622e-03, -7.848e-04);

	// s0_* taps.
	acc = MulAdd(s0_0, M4(
		1.921e-01, -2.132e-02, -5.460e-03, -6.681e-02,
		9.988e-02, -2.228e-02, 4.719e-02, 9.124e-03,
		-1.072e-01, 1.506e-01, 2.070e-02, -4.671e-02,
		2.244e-01, -4.895e-02, -8.150e-03, -9.520e-02), acc);
	acc = MulAdd(s0_1, M4(
		8.226e-02, 4.651e-02, -1.842e-01, -3.376e-02,
		1.349e-01, 2.148e-02, -1.746e-01, 1.671e-02,
		9.761e-02, 7.581e-02, 1.470e-01, -8.582e-02,
		-1.149e-01, 2.143e-02, -1.597e-01, 1.626e-01), acc);
	acc = MulAdd(s0_2, M4(
		-5.810e-04, -3.566e-02, 4.708e-02, -3.068e-02,
		1.578e-02, 5.503e-03, 3.081e-02, -4.174e-02,
		3.394e-01, 7.398e-02, -9.467e-02, -1.127e-01,
		-1.314e-01, 1.511e-02, 1.538e-01, -5.695e-03), acc);
	acc = MulAdd(s0_3, M4(
		2.959e-01, 3.316e-02, -5.716e-02, -2.233e-01,
		5.020e-01, -1.416e-01, -6.082e-02, -3.393e-01,
		3.292e-01, -6.813e-02, 9.009e-02, -1.638e-01,
		1.190e-01, -2.728e-02, -6.042e-02, -1.360e-01), acc);
	acc = MulAdd(s0_4, M4(
		5.902e-01, 3.040e-01, -2.870e-01, 2.228e-02,
		-1.646e-01, 2.078e-02, -1.480e-01, 2.083e-01,
		-4.397e-01, -2.549e-01, -1.168e-01, -4.199e-01,
		2.199e-01, 2.596e-02, 2.598e-02, -1.313e-01), acc);
	acc = MulAdd(s0_5, M4(
		1.043e-01, 1.050e-02, -5.654e-02, -1.265e-01,
		-1.978e-01, 3.772e-02, 2.474e-01, 1.395e-01,
		2.041e-01, 6.617e-02, -2.602e-01, -1.601e-01,
		-5.577e-02, -1.591e-02, 2.096e-01, 2.594e-02), acc);
	acc = MulAdd(s0_6, M4(
		7.245e-02, 6.156e-02, 5.317e-02, -3.912e-01,
		1.871e-01, -2.079e-02, -2.552e-02, -6.961e-02,
		2.686e-01, 8.518e-02, -1.026e-01, -4.040e-01,
		-6.324e-02, 7.999e-03, 1.317e-02, 1.619e-02), acc);
	acc = MulAdd(s0_7, M4(
		1.240e-01, -8.349e-02, -1.258e-01, -3.269e-01,
		6.624e-01, -1.357e-01, -6.738e-01, -5.998e-01,
		-8.375e-04, 2.226e-01, -1.880e-01, 5.678e-02,
		-8.383e-02, -3.455e-02, -1.399e-02, 4.540e-02), acc);
	acc = MulAdd(s0_8, M4(
		-3.130e-02, 9.691e-02, 1.763e-01, -1.847e-02,
		-1.193e-01, -7.494e-03, 1.485e-02, 1.244e-02,
		9.559e-02, 3.116e-02, 8.046e-03, -1.264e-01,
		-2.403e-01, 6.389e-02, 2.999e-01, 1.484e-01), acc);

	// s1_* taps.
	acc = MulAdd(s1_0, M4(
		2.569e-01, -8.689e-03, -1.806e-02, -3.993e-02,
		9.155e-02, -2.022e-02, 1.034e-02, -3.455e-02,
		-1.534e-01, 1.836e-02, -1.176e-03, 3.593e-03,
		2.642e-01, -6.587e-02, -4.169e-02, -2.237e-01), acc);
	acc = MulAdd(s1_1, M4(
		1.398e-01, 1.020e-02, -2.478e-01, 2.747e-02,
		7.152e-02, 1.835e-02, -2.013e-01, 1.151e-02,
		-2.586e-01, -3.622e-02, 2.529e-01, 1.465e-01,
		-3.973e-01, 5.907e-02, -9.450e-02, 3.761e-02), acc);
	acc = MulAdd(s1_2, M4(
		3.157e-02, 7.847e-03, 8.109e-03, -3.333e-02,
		-3.333e-02, -6.401e-03, -6.632e-03, 3.296e-02,
		-1.433e-02, 2.167e-02, 1.194e-01, -1.028e-01,
		-2.104e-01, 1.352e-02, -6.835e-02, 1.901e-01), acc);
	acc = MulAdd(s1_3, M4(
		3.443e-01, -1.004e-01, -6.176e-02, -3.047e-01,
		4.779e-01, -7.928e-02, -8.134e-02, -4.873e-01,
		-1.421e-01, 3.972e-02, 7.459e-02, 2.099e-01,
		1.118e-01, -1.022e-02, -8.584e-02, -1.657e-01), acc);
	acc = MulAdd(s1_4, M4(
		-1.721e-01, 2.625e-02, -7.292e-03, 2.646e-01,
		2.505e-02, 1.479e-01, -3.357e-01, 1.088e-01,
		1.016e-01, -1.902e-01, -1.622e-01, -6.326e-02,
		-4.305e-01, 4.763e-01, -1.357e-03, -5.685e-01), acc);
	acc = MulAdd(s1_5, M4(
		3.324e-03, 1.692e-02, -5.726e-02, 2.853e-02,
		-3.135e-01, -4.534e-03, 2.549e-01, 1.183e-01,
		-1.277e-01, -5.030e-02, 9.190e-02, 1.145e-01,
		3.445e-01, 6.425e-02, -2.707e-01, -1.701e-01), acc);
	acc = MulAdd(s1_6, M4(
		2.164e-02, 1.998e-02, 1.667e-02, -6.126e-02,
		2.400e-01, -9.253e-02, -4.525e-02, 8.615e-03,
		5.148e-02, -1.803e-02, -7.495e-02, -7.102e-02,
		-2.646e-02, 6.819e-02, 1.465e-01, 1.904e-01), acc);
	acc = MulAdd(s1_7, M4(
		-2.339e-02, 3.350e-02, -1.274e-01, 5.525e-02,
		9.120e-01, -9.074e-01, -6.856e-01, -7.422e-02,
		4.849e-02, -1.377e-02, -1.409e-01, -5.792e-02,
		-1.044e-01, 9.079e-02, 2.520e-01, 2.053e-01), acc);
	acc = MulAdd(s1_8, M4(
		1.891e-02, -1.562e-02, -1.024e-02, -2.686e-02,
		-1.038e-01, -3.210e-02, 4.222e-01, -2.084e-01,
		-1.841e-01, 3.231e-02, 7.320e-02, 1.727e-01,
		2.861e-01, 2.506e-02, -2.266e-01, -3.940e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 3 entry point (conv2): reads t1 through l0(), writes t0.
// For the texel assigned to this thread it gathers nine taps at offsets
// (-1..1, -1..1), splits each into max(x, 0) and -max(-x, 0), and stores
// f0() of those 18 vectors.
//
// blockStart: origin of the block handled by this thread group.
// tid:        thread id; only tid.x is used (remapped by Rmp8x8).
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O() sampling macro that l0()
	// expands to, so these two identifiers must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();

	// Threads that land outside the input extent write nothing.
	const bool inside = texel.x < bounds.x && texel.y < bounds.y;
	if (!inside) {
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, ordered y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: positive parts; last nine: negative parts.
	t0[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Kernel for pass 4 (conv3).
// Starts from a constant bias and folds in the nine s0_* taps followed by the
// nine s1_* taps, one MulAdd per tap with its own 4x4 constant matrix.
// NOTE(review): MulAdd comes from the framework (//!USE MulAdd); presumably it
// computes mul(vector, matrix) + accumulator - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-6.562e-04, 7.371e-04, -4.319e-03, -8.757e-04);

	// s0_* taps.
	acc = MulAdd(s0_0, M4(
		-7.801e-03, 7.517e-03, 5.348e-02, 7.686e-02,
		-8.770e-03, 1.144e-02, -2.398e-02, 1.355e-02,
		-4.642e-02, 5.880e-02, 3.263e-02, 1.860e-01,
		-4.443e-02, -2.732e-02, -2.133e-02, -1.166e-01), acc);
	acc = MulAdd(s0_1, M4(
		-1.751e-02, -1.230e-02, -1.218e-01, -1.231e-01,
		4.092e-03, -8.769e-03, -2.251e-03, 5.142e-02,
		4.354e-03, -4.445e-02, -2.369e-01, -1.616e-01,
		4.495e-03, -1.326e-01, -5.371e-01, -5.119e-01), acc);
	acc = MulAdd(s0_2, M4(
		3.143e-02, 2.366e-02, 8.884e-02, -1.819e-02,
		2.358e-03, 3.812e-04, -4.972e-02, -5.311e-02,
		1.729e-02, 1.523e-02, 7.798e-02, -1.705e-05,
		-2.295e-02, 6.567e-02, 1.422e-01, 1.890e-01), acc);
	acc = MulAdd(s0_3, M4(
		2.363e-02, 1.555e-02, -1.307e-01, -8.190e-02,
		1.026e-02, 9.724e-03, 5.358e-02, -2.783e-01,
		7.268e-03, 1.659e-01, -5.801e-02, 3.076e-01,
		-1.575e-01, -9.567e-02, 3.294e-02, -7.694e-01), acc);
	acc = MulAdd(s0_4, M4(
		1.677e-02, -1.324e-01, 4.019e-01, -2.902e-01,
		-6.051e-02, -4.625e-02, 8.409e-01, 4.756e-01,
		-1.135e-01, -3.213e-01, 6.389e-02, -2.083e-01,
		-1.219e+00, 2.280e-01, 9.667e-01, -3.604e-01), acc);
	acc = MulAdd(s0_5, M4(
		-5.948e-02, 1.567e-01, 3.883e-02, -4.843e-03,
		-2.153e-02, 3.439e-02, -1.160e-01, -1.325e-02,
		-5.312e-02, 1.136e-01, -5.260e-02, -3.524e-02,
		7.315e-02, 3.527e-01, 6.186e-01, -7.505e-02), acc);
	acc = MulAdd(s0_6, M4(
		-3.841e-02, 1.620e-03, 9.449e-02, -8.648e-02,
		-2.656e-02, -1.676e-03, 2.364e-03, -7.221e-02,
		-9.590e-02, 4.160e-02, -1.278e-02, -3.171e-02,
		6.213e-02, 2.673e-02, -7.931e-02, 2.588e-01), acc);
	acc = MulAdd(s0_7, M4(
		-3.636e-02, -1.558e-01, 2.151e-01, 1.188e-01,
		1.275e-01, -8.114e-02, -8.376e-02, -3.690e-02,
		-1.968e-02, -1.038e-01, 8.994e-02, 3.846e-02,
		-1.499e-01, 6.457e-01, -8.201e-02, -3.935e-01), acc);
	acc = MulAdd(s0_8, M4(
		-2.833e-03, 2.529e-01, -3.350e-03, -3.433e-02,
		1.943e-02, -2.796e-02, 3.313e-02, 1.582e-02,
		1.702e-02, 5.663e-02, -1.647e-02, -2.229e-02,
		-4.865e-01, 3.285e-01, -4.462e-01, -4.307e-01), acc);

	// s1_* taps.
	acc = MulAdd(s1_0, M4(
		-6.004e-02, 4.898e-03, 3.591e-02, 1.900e-01,
		-3.816e-02, -3.269e-02, 1.459e-01, -3.464e-03,
		-1.235e-02, -3.737e-02, 1.569e-02, 2.559e-01,
		-3.173e-04, 1.268e-02, 8.886e-03, 2.960e-02), acc);
	acc = MulAdd(s1_1, M4(
		-1.582e-02, -7.507e-02, -2.026e-01, 2.027e-01,
		-6.107e-02, 2.055e-02, -5.811e-02, 5.420e-03,
		1.028e-02, -1.374e-02, -6.152e-01, -2.259e-01,
		-3.408e-03, -1.800e-02, 4.574e-02, -9.590e-02), acc);
	acc = MulAdd(s1_2, M4(
		4.210e-02, 2.126e-02, 8.277e-02, 2.079e-02,
		-1.733e-01, -2.483e-02, 2.686e-01, 1.498e-01,
		7.352e-02, -2.511e-02, 3.159e-02, 5.775e-02,
		5.942e-02, 3.383e-02, 1.274e-01, -5.928e-02), acc);
	acc = MulAdd(s1_3, M4(
		5.614e-02, 7.561e-02, -8.328e-02, 2.427e-01,
		7.214e-02, -1.122e-01, 9.434e-02, -2.602e-01,
		-1.052e-02, -6.944e-02, -3.023e-02, -1.655e-01,
		1.236e-03, 4.025e-03, -3.082e-02, -1.533e-01), acc);
	acc = MulAdd(s1_4, M4(
		6.675e-01, -2.254e-01, 1.173e+00, -8.261e-02,
		5.655e-01, -2.000e-01, 8.301e-01, 1.458e+00,
		-2.497e-01, -1.091e+00, -4.698e-01, -1.876e-01,
		-3.358e-02, -2.854e-01, 5.032e-01, -1.558e-01), acc);
	acc = MulAdd(s1_5, M4(
		-1.444e-02, 1.502e-01, -4.221e-02, -4.864e-02,
		3.236e-01, -2.572e-01, 1.344e-01, 8.562e-02,
		-1.030e-01, 2.690e-01, 1.238e-01, 3.309e-02,
		-3.849e-02, 1.860e-01, 6.528e-03, 2.840e-02), acc);
	acc = MulAdd(s1_6, M4(
		-1.161e-01, 5.405e-02, -3.101e-02, -1.009e-01,
		-9.594e-02, -1.207e-02, -3.836e-02, -6.894e-02,
		-1.770e-02, -2.958e-02, 8.484e-02, -2.284e-02,
		2.585e-04, -2.764e-02, 4.972e-02, -5.968e-02), acc);
	acc = MulAdd(s1_7, M4(
		-4.113e-02, -1.948e-01, -2.728e-02, -3.142e-02,
		-2.894e-01, -1.111e-01, 7.492e-02, -2.892e-02,
		9.054e-02, 4.350e-02, 2.183e-01, 1.489e-01,
		1.167e-02, -6.678e-02, 3.696e-02, -1.315e-02), acc);
	acc = MulAdd(s1_8, M4(
		2.532e-02, 4.585e-02, -3.694e-02, -6.244e-02,
		-1.673e-01, 6.180e-02, -4.475e-02, 1.028e-02,
		-1.658e-02, 8.923e-02, 1.711e-02, 3.037e-03,
		4.651e-02, 1.652e-01, 7.863e-03, -3.387e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 4 entry point (conv3): reads t0 through l0(), writes t1.
// For the texel assigned to this thread it gathers nine taps at offsets
// (-1..1, -1..1), splits each into max(x, 0) and -max(-x, 0), and stores
// f0() of those 18 vectors.
//
// blockStart: origin of the block handled by this thread group.
// tid:        thread id; only tid.x is used (remapped by Rmp8x8).
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O() sampling macro that l0()
	// expands to, so these two identifiers must keep their names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 bounds = GetInputSize();

	// Threads that land outside the input extent write nothing.
	const bool inside = texel.x < bounds.x && texel.y < bounds.y;
	if (!inside) {
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, ordered y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: positive parts; last nine: negative parts.
	t1[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Kernel for pass 5 (conv4).
// Starts from a constant bias and folds in the nine s0_* taps followed by the
// nine s1_* taps, one MulAdd per tap with its own 4x4 constant matrix.
// NOTE(review): MulAdd comes from the framework (//!USE MulAdd); presumably it
// computes mul(vector, matrix) + accumulator - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = V4(-3.709e-04, 2.029e-04, -3.042e-03, -2.970e-04);

	// s0_* taps.
	acc = MulAdd(s0_0, M4(
		-1.087e-01, -5.083e-02, 3.146e-01, -4.241e-02,
		4.462e-02, -4.358e-02, -1.562e-01, -2.609e-03,
		5.918e-02, -2.526e-02, -3.132e-02, -1.150e-02,
		-8.799e-03, 3.070e-02, -1.680e-02, -1.046e-02), acc);
	acc = MulAdd(s0_1, M4(
		1.762e-01, 8.784e-01, -2.704e+00, -1.565e+00,
		-1.473e-01, -5.723e-01, 7.838e-02, -7.420e-03,
		-1.769e-01, -2.041e-01, -1.783e-03, -4.944e-03,
		1.304e-02, 2.646e-01, -1.708e-01, 7.483e-03), acc);
	acc = MulAdd(s0_2, M4(
		-1.907e-01, 1.514e-01, -3.657e-01, -5.840e-01,
		-4.943e-02, -1.014e-02, -2.869e-03, 6.488e-03,
		2.266e-02, -3.850e-02, 6.125e-03, 1.899e-02,
		-3.541e-02, -2.011e-01, 1.567e-01, 1.008e-02), acc);
	acc = MulAdd(s0_3, M4(
		-3.061e-01, -1.768e-01, 9.163e-02, -2.243e-01,
		4.945e-02, 1.106e-01, -1.137e-01, 1.755e-02,
		2.640e-01, -9.298e-02, -1.704e-01, 3.935e-02,
		1.506e-01, -3.284e-02, 4.719e-02, 5.543e-02), acc);
	acc = MulAdd(s0_4, M4(
		-4.579e-01, -6.198e-02, -9.889e-01, -4.446e-01,
		-1.612e-01, 1.518e-01, 2.588e-01, 1.075e-02,
		-1.527e+00, -7.923e-01, 8.120e-02, -1.116e-01,
		-2.079e-01, -1.206e-01, -4.422e-01, -1.951e-01), acc);
	acc = MulAdd(s0_5, M4(
		1.064e-01, -1.684e-01, 2.316e-01, 4.211e-01,
		-9.153e-02, 9.155e-02, -7.649e-02, -1.385e-01,
		9.422e-02, -1.631e-01, 8.278e-02, 3.318e-01,
		7.284e-02, 3.489e-01, -2.303e-02, -6.554e-01), acc);
	acc = MulAdd(s0_6, M4(
		-6.320e-02, -4.390e-02, 1.453e-02, 3.187e-02,
		2.166e-02, 2.423e-03, 1.573e-03, -2.226e-02,
		1.401e-01, 2.026e-01, -2.249e-01, 6.471e-02,
		3.593e-02, -1.575e-02, -3.186e-02, 1.339e-02), acc);
	acc = MulAdd(s0_7, M4(
		2.778e-02, 7.495e-02, -1.086e-01, 8.862e-02,
		-2.352e-02, 1.477e-02, 2.741e-02, 4.345e-02,
		-2.865e-01, 9.405e-02, 1.880e-01, -3.610e-01,
		-7.797e-02, -5.710e-03, 3.386e-02, 2.830e-02), acc);
	acc = MulAdd(s0_8, M4(
		-3.734e-02, 3.357e-02, 5.657e-03, -1.596e-01,
		-7.661e-03, 1.603e-02, -3.137e-02, -7.023e-03,
		6.522e-03, -2.715e-02, 2.765e-02, 4.724e-02,
		1.922e-02, 3.944e-02, -8.276e-02, -1.915e-02), acc);

	// s1_* taps.
	acc = MulAdd(s1_0, M4(
		-7.121e-02, -2.276e-02, 7.266e-02, -4.411e-03,
		-5.600e-01, 4.502e-01, -1.817e-01, -2.906e-01,
		-5.675e-02, 2.653e-02, 3.284e-02, -1.925e-03,
		-4.729e-03, -1.554e-03, -6.081e-03, -2.195e-02), acc);
	acc = MulAdd(s1_1, M4(
		2.212e-01, 3.154e-01, -2.765e-01, 4.432e-02,
		1.402e+00, 2.159e-01, 4.402e-01, 2.537e-01,
		6.697e-02, 1.207e-01, -5.192e-02, 2.638e-02,
		5.366e-02, 5.855e-02, -3.687e-02, 4.389e-03), acc);
	acc = MulAdd(s1_2, M4(
		3.137e-02, -1.157e-01, 9.497e-02, -3.724e-02,
		5.241e-02, 7.793e-02, 2.277e-04, -4.033e-01,
		1.432e-02, 4.622e-02, -1.636e-02, -5.840e-03,
		-1.593e-02, -7.447e-02, 3.943e-02, -3.517e-03), acc);
	acc = MulAdd(s1_3, M4(
		-1.209e-02, -1.350e-01, 3.018e-01, 1.233e-01,
		-1.262e-03, 2.194e-01, -2.919e-01, -8.031e-03,
		4.620e-03, 5.318e-02, 1.247e-02, -4.260e-02,
		7.155e-02, 3.256e-02, -9.839e-02, -6.741e-04), acc);
	acc = MulAdd(s1_4, M4(
		3.291e-01, 2.397e-01, -2.820e-01, 5.703e-01,
		7.831e-03, 5.816e-02, -1.696e-02, -1.957e-01,
		-1.851e-01, 3.696e-02, -2.611e-01, 7.039e-03,
		-1.562e-01, -7.676e-01, 9.080e-01, 7.823e-02), acc);
	acc = MulAdd(s1_5, M4(
		9.918e-03, 6.364e-02, 3.364e-02, -3.291e-01,
		1.393e-02, 3.139e-02, 1.701e-02, -5.675e-02,
		5.085e-02, -2.050e-01, 1.160e-01, 4.875e-02,
		-1.189e-01, 2.310e-01, -1.353e-01, 2.046e-02), acc);
	acc = MulAdd(s1_6, M4(
		-5.477e-03, -1.704e-02, 9.510e-03, -1.701e-02,
		1.391e-02, -8.760e-03, -3.355e-02, -6.898e-03,
		-9.203e-03, -2.442e-02, 7.547e-03, 1.817e-02,
		1.871e-02, -1.149e-02, 6.458e-02, 1.403e-02), acc);
	acc = MulAdd(s1_7, M4(
		-5.073e-03, -5.454e-02, -2.710e-02, 1.292e-02,
		2.458e-02, 1.739e-02, -2.319e-03, 3.865e-02,
		5.399e-02, -1.176e-02, -1.315e-01, 1.489e-01,
		-7.903e-02, 8.120e-02, 4.749e-02, 1.961e-01), acc);
	acc = MulAdd(s1_8, M4(
		4.163e-02, -1.603e-02, 8.659e-03, 1.023e-01,
		5.233e-03, -2.900e-03, -5.293e-03, -5.829e-03,
		-1.453e-02, 2.467e-02, 7.198e-02, -2.407e-01,
		-4.023e-02, 1.009e-01, -1.560e-01, -1.567e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 5 entry point: computes one texel of this layer and stores it in t0.
//
// blockStart: added to the per-thread offset to form the texel coordinate.
// tid:        only tid.x is read; Rmp8x8 turns it into the per-thread offset
//             (presumably an (x, y) position inside an 8x8 block - confirm
//             against the Magpie helper).
//
// NOTE: `gxy`, `pos` and `pt` keep their names on purpose: the l0() sampling
// macro used below is defined outside this function and may refer to locals
// by name.
void Pass5(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood around the texel, row by row starting top-left.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 receives the positive parts of the nine samples first, followed by
	// their non-positive parts.
	t0[gxy] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// l0(x, y): fetches t0 through the O() macro with the offset (x, y) and
// converts the result to V4. O() is defined elsewhere in this effect file;
// NOTE(review): it is expected to read the caller's `pos`/`pt` locals - confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 6 kernel. Starting from a constant 4-component vector, every one of the
// 18 inputs is folded into the accumulator through MulAdd together with its own
// constant 4x4 matrix; the accumulator is passed through tanh before returning.
// In Pass6 the s0_* arguments are max(x, 0) and the s1_* arguments are
// -max(-x, 0) of the same nine t0 samples, and the four result components are
// added to the luma of the four pixels of a 2x2 output quad.
// NOTE(review): MulAdd is not defined in this view; presumably it returns
// r plus the product of the vector and the matrix - confirm.
// The literals are generated network weights: do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { -3.259e-04, -3.197e-04, 4.954e-04, 4.568e-04 };
|
||||||
|
r = MulAdd(s0_0, M4(-6.857e-02, -6.042e-02, 3.293e-03, -2.389e-03, -1.606e-01, -1.556e-02, -5.115e-02, -4.602e-02, -3.762e-02, 1.994e-02, -2.370e-02, 3.558e-02, -7.142e-01, 8.184e-01, -1.361e-01, 1.228e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.887e-01, -2.260e-01, 1.293e-02, -1.757e-02, 1.257e-01, 1.304e-01, -4.525e-02, 4.471e-02, 6.895e-01, -4.096e-01, 4.096e-02, 1.817e-02, -1.343e-01, -4.170e-01, 3.991e-03, 1.516e-03), r);
|
||||||
|
r = MulAdd(s0_2, M4(-2.667e-01, -8.692e-02, 1.481e-01, -1.466e-01, 6.142e-02, -2.084e-02, 1.942e-02, 6.700e-04, 3.942e-02, 3.109e-01, -1.323e-02, 2.240e-02, -2.306e-02, -4.749e-02, -1.155e-02, 1.843e-03), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.004e-01, -1.184e-02, -8.590e-02, -1.018e-01, 6.862e-02, -4.700e-02, -1.537e-01, -1.096e-01, -1.228e-01, 1.462e-02, -1.715e-01, 1.862e-02, 3.668e-01, -1.138e-01, 8.494e-04, 6.113e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(4.389e-01, -5.527e-01, -4.972e-01, -7.620e-01, 1.684e-01, 5.375e-02, 1.032e+00, 5.723e-01, 4.427e-02, -2.447e-01, 1.132e+00, -5.297e-01, 1.150e-01, 3.877e-01, 1.224e-01, 1.294e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-1.023e+00, 1.567e+00, -9.747e-01, 1.051e+00, 1.537e-02, 1.993e-01, -1.679e-01, 1.139e-01, -7.358e-02, -1.782e-01, -1.938e-01, 4.419e-02, 2.001e-02, 5.881e-02, 8.971e-03, 3.368e-03), r);
|
||||||
|
r = MulAdd(s0_6, M4(-5.126e-03, 1.449e-02, -7.018e-02, 2.929e-02, 4.748e-02, -4.443e-03, -5.791e-02, -3.490e-02, 3.817e-02, 1.007e-02, -5.501e-02, -1.488e-02, -8.848e-03, 4.884e-02, -6.548e-02, 3.392e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-4.449e-02, 7.313e-02, 3.311e-01, 3.138e-02, -6.466e-02, 5.666e-02, 1.929e-01, 8.274e-02, 3.994e-02, 2.105e-02, -1.821e-01, -1.539e-02, -9.333e-03, -4.728e-02, 6.975e-03, -3.292e-03), r);
|
||||||
|
r = MulAdd(s0_8, M4(2.038e-01, -2.356e-01, -1.987e-01, -3.746e-02, -1.499e-02, -7.007e-02, -9.546e-02, 1.905e-02, -9.802e-03, 1.990e-02, 2.140e-02, -8.164e-03, 5.109e-03, -2.081e-02, -2.386e-02, 1.183e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-7.067e-02, -4.613e-02, -5.433e-04, -2.191e-02, -1.125e-01, -3.650e-02, -1.298e-02, -3.479e-02, -1.118e-01, -1.521e-02, -4.731e-03, -7.478e-03, 1.802e-01, 4.872e-02, -1.599e-03, -1.452e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-2.920e-01, -1.831e-01, -1.305e-02, 4.031e-02, 1.989e-01, 3.120e-03, 2.025e-02, 5.432e-02, 2.607e-01, 2.403e-02, 1.863e-02, 8.423e-02, -3.372e-01, -1.327e-01, -1.248e-01, -1.247e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-9.286e-02, -1.948e-01, -8.532e-03, 7.416e-03, 4.578e-02, 1.581e-01, 1.473e-03, -3.796e-02, 1.011e-01, 2.393e-01, 2.742e-02, -4.224e-02, -9.579e-03, -9.888e-02, -2.065e-03, 7.685e-03), r);
|
||||||
|
r = MulAdd(s1_3, M4(-2.056e-01, -3.479e-02, -2.666e-01, -5.344e-02, 1.579e-01, -6.091e-02, -1.655e-01, -1.575e-01, -8.230e-02, -4.748e-02, -1.304e-01, -7.186e-02, 2.953e-01, 6.950e-02, 1.865e-01, 7.567e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(3.408e-01, -1.054e-01, -2.613e-01, -6.084e-01, 3.193e-01, 6.366e-01, 4.251e-01, 4.066e-01, -3.742e-01, -8.521e-02, 5.906e-01, 1.870e-01, 2.044e-02, 2.495e-01, 1.046e-01, 3.018e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(4.748e-03, 2.086e-01, 4.231e-03, -7.764e-03, 3.933e-02, 3.446e-03, -3.431e-02, 8.415e-02, -3.798e-02, -3.428e-01, -7.206e-02, 2.392e-01, 2.157e-02, 2.692e-02, 3.313e-02, 1.841e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(1.813e-02, 2.306e-03, -3.402e-02, 1.009e-03, 4.408e-02, -2.307e-02, -3.394e-02, -3.912e-02, 3.822e-02, -1.051e-02, -1.023e-01, -4.626e-02, -4.871e-02, 6.250e-03, 1.367e-01, 3.674e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-1.170e-02, 3.747e-02, 1.548e-01, 1.243e-01, -1.074e-01, -9.848e-03, 2.627e-01, 1.132e-01, 4.550e-02, 5.050e-02, -1.194e-01, -6.091e-02, -2.180e-02, -6.381e-02, -5.949e-02, 1.580e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.146e-04, -1.852e-02, -1.515e-02, 2.488e-02, -1.877e-02, -7.739e-02, -6.812e-02, 7.656e-03, 2.688e-02, 5.650e-02, 4.285e-02, -3.270e-02, 1.163e-03, 8.328e-04, -1.998e-02, -2.282e-02), r);
|
||||||
|
// tanh bounds every component of the result to (-1, 1).
return tanh(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 6 entry point ("out-shuffle"): each thread evaluates f0 once on the 3x3
// t0 neighbourhood and uses its four components to write a 2x2 quad of OUTPUT.
// For every pixel of the quad, INPUT is sampled with the linear sampler SL,
// converted with rgb2yuv, the matching f0 component is added to the first
// (luma) channel and saturated, and the result is converted back with yuv2rgb.
//
// blockStart: added to the per-thread offset to form the quad's coordinate.
// tid:        only tid.x is read; Rmp8x8 turns it into the per-thread offset,
//             which is doubled because every thread covers two pixels per axis.
//
// NOTE: `gxy`, `pos` and `pt` keep their names on purpose: the l0() sampling
// macro refers to locals of the calling function by name.
void Pass6(uint2 blockStart, uint3 tid) {
	// Top-left pixel of this thread's output quad.
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 outputSize = GetOutputSize();
	if (any(gxy >= outputSize)) {
		return;
	}

	const float2 pt = float2(GetInputPt());
	// Halving the output coordinate selects the source texel for the quad.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw 3x3 neighbourhood of t0, row by row starting top-left.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// Positive parts first, then the non-positive parts of the same samples.
	const V4 r = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 opt = float2(GetOutputPt());
	MF3 yuv;

	// The quad is walked top-left -> top-right -> bottom-right -> bottom-left.
	// `pos` is stepped incrementally (not recomputed) so the sampling
	// coordinates stay bit-identical to the generated code.

	// Top-left pixel: r.x
	pos -= 0.5f * opt;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + r.x);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// Top-right pixel: r.y
	gxy.x += 1;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + r.y);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// Bottom-right pixel: r.w
	gxy.y += 1;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + r.w);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// Bottom-left pixel: r.z
	gxy.x -= 1;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + r.z);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);
}
|
||||||
914
src/Effects/CuNNy/CuNNy-4x8C-NVL-DN.hlsl
Normal file
914
src/Effects/CuNNy/CuNNy-4x8C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,914 @@
|
||||||
|
// CuNNy 4x8C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-DN-D08N04
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
// Source image of the effect.
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
// Result image: twice the input width and height.
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
// Point-filtered sampler; used by the O() macro for neighbourhood fetches.
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
// Linear-filtered sampler.
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// O(t, p): samples texture t with the point sampler SP at mip level 0, at
// `pos + p * pt`. `pos` and `pt` are NOT macro parameters: they must be locals
// of the function in which the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// Short aliases for the 4-component vector and 4x4 matrix types used by the
// generated kernels (MF4 / MF4x4 are declared outside this view).
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
// Intermediate textures t0..t3: all four are declared at input resolution with
// the R8G8B8A8_SNORM format. Pass 1 writes t0/t1; pass 2 reads t0/t1 and
// writes t2/t3.
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t2;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t3;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
// l0(x, y): fetches INPUT at offset (x, y) through O(), reduces its RGB to a
// single scalar with a fixed weighted sum (dot product), then adds a constant
// offset. Like O(), it relies on `pos` and `pt` being in scope at the call site.
#define l0(x, y) (dot(MF3(2.214e-01, 4.385e-01, 1.006e-01), O(INPUT, float2(x, y)).rgb) + MF(-6.858e-01))
|
||||||
|
|
||||||
|
// Pass 1 kernel whose result Pass1 stores in t0. Takes the nine scalar samples
// of the 3x3 neighbourhood (s0_0 = top-left ... s0_8 = bottom-right, in the
// order Pass1 fetches them) and returns
//   r = initial constant + sum over i of s0_i * (constant 4-vector i),
// accumulated with mad() in the order written below.
// The literals are generated network weights: do not edit by hand.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
|
||||||
|
V4 r = { 2.880e-04, 1.418e-02, 1.413e-02, -1.036e-01 };
|
||||||
|
r = mad(s0_0, V4(-2.401e-02, 1.817e-03, -1.218e-01, 2.796e-02), r);
|
||||||
|
r = mad(s0_1, V4(3.256e-02, 3.929e-03, -5.850e-02, -5.602e-02), r);
|
||||||
|
r = mad(s0_2, V4(4.497e-04, -1.812e-02, 5.241e-02, 3.698e-02), r);
|
||||||
|
r = mad(s0_3, V4(5.371e-01, -2.302e-01, -1.373e-01, -4.038e-03), r);
|
||||||
|
r = mad(s0_4, V4(1.565e-01, -6.067e-02, 3.397e-01, -3.741e-01), r);
|
||||||
|
r = mad(s0_5, V4(-2.095e-03, 4.044e-02, -3.770e-02, 5.665e-02), r);
|
||||||
|
r = mad(s0_6, V4(-1.993e-01, -2.645e-01, -8.892e-02, 1.948e-02), r);
|
||||||
|
r = mad(s0_7, V4(-4.865e-01, 5.400e-01, -1.396e-01, 1.270e-01), r);
|
||||||
|
r = mad(s0_8, V4(-1.667e-02, -9.433e-03, -1.324e-02, -1.803e-03), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 1 kernel whose result Pass1 stores in t1. Same structure as the pass's
// f0 with a different set of constants: starting from an initial constant,
// each of the nine scalar samples scales its own constant 4-vector and is
// accumulated with mad() in the order written below.
// The literals are generated network weights: do not edit by hand.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
|
||||||
|
V4 r = { -4.485e-04, -2.620e-04, 2.449e-02, -7.403e-04 };
|
||||||
|
r = mad(s0_0, V4(-4.610e-02, -6.199e-01, 8.493e-03, -1.532e-02), r);
|
||||||
|
r = mad(s0_1, V4(-7.178e-02, 5.957e-01, 1.575e-03, 1.807e-02), r);
|
||||||
|
r = mad(s0_2, V4(1.106e-01, 3.625e-03, 3.713e-02, -4.124e-03), r);
|
||||||
|
r = mad(s0_3, V4(1.288e-01, -5.582e-02, 5.082e-02, 1.674e-02), r);
|
||||||
|
r = mad(s0_4, V4(-6.074e-01, 8.818e-02, -3.371e-01, -6.663e-01), r);
|
||||||
|
r = mad(s0_5, V4(-8.030e-02, -4.780e-03, -3.421e-01, 5.358e-02), r);
|
||||||
|
r = mad(s0_6, V4(4.990e-01, 7.623e-03, 1.778e-03, 2.401e-02), r);
|
||||||
|
r = mad(s0_7, V4(9.546e-02, -1.656e-02, 6.935e-04, 6.387e-01), r);
|
||||||
|
r = mad(s0_8, V4(-2.302e-02, 5.209e-03, 5.835e-02, -6.361e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 1 entry point ("in"): reduces the 3x3 INPUT neighbourhood of one texel
// to nine scalars via l0() and writes the two first-layer kernels to t0 and t1.
//
// blockStart: added to the per-thread offset to form the texel coordinate.
// tid:        only tid.x is read; Rmp8x8 turns it into the per-thread offset
//             (presumably an (x, y) position inside the 8x8 block - confirm
//             against the Magpie helper).
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(texel >= inputSize)) {
		return;
	}

	// `pt` and `pos` must keep these exact names: the O() macro behind l0()
	// reads them from the enclosing scope.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 neighbourhood, row by row: top, middle, bottom.
	const MF topLeft = l0(-1.0, -1.0);
	const MF top = l0(0.0, -1.0);
	const MF topRight = l0(1.0, -1.0);
	const MF left = l0(-1.0, 0.0);
	const MF centre = l0(0.0, 0.0);
	const MF right = l0(1.0, 0.0);
	const MF bottomLeft = l0(-1.0, 1.0);
	const MF bottom = l0(0.0, 1.0);
	const MF bottomRight = l0(1.0, 1.0);

	t0[texel] = f0(topLeft, top, topRight, left, centre, right, bottomLeft, bottom, bottomRight);
	t1[texel] = f1(topLeft, top, topRight, left, centre, right, bottomLeft, bottom, bottomRight);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0, t1
|
||||||
|
//!OUT t2, t3
|
||||||
|
|
||||||
|
// l0 / l1: fetch t0 / t1 at offset (x, y) through O() and convert to V4.
// Both rely on `pos` and `pt` being in scope where they are expanded.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 ("conv1") kernel, first of two. Starting from a constant 4-component
// vector, each of the 36 inputs is folded into the accumulator through MulAdd
// together with its own constant 4x4 matrix; no activation is applied here.
// In Pass2, s0_*/s1_* are max(x, 0) and -max(-x, 0) of the nine t0 samples and
// s2_*/s3_* are the same split of the nine t1 samples.
// NOTE(review): the pass declares `//!OUT t2, t3`; presumably this kernel feeds
// t2 - the store is outside this view, confirm. MulAdd comes from
// `//!USE MulAdd`; presumably it returns r plus the vector-matrix product.
// The literals are generated network weights: do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
V4 r = { -3.046e-02, 3.515e-02, 4.880e-02, 4.740e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(7.103e-02, 1.495e-01, -1.731e-02, -9.952e-02, -1.539e-01, -1.103e-01, 7.099e-02, 2.023e-01, 2.681e-02, 5.202e-03, 1.954e-02, -6.822e-02, -1.650e-01, 3.710e-01, -6.020e-01, 4.879e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.168e-02, 2.587e-01, -4.670e-01, -3.986e-02, -1.268e-01, 3.619e-02, 5.712e-02, 1.722e-01, 4.473e-02, -1.224e-01, 8.228e-02, -3.981e-02, 4.044e-01, -3.039e-01, -3.390e-01, 5.925e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(4.083e-02, 7.140e-02, -5.864e-01, 1.188e-01, 2.214e-01, -2.826e-01, 2.294e-01, -2.199e-01, -9.048e-02, 1.787e-01, -6.887e-02, -6.645e-02, -1.285e-01, -8.261e-02, -1.975e-01, 2.428e-01), r);
|
||||||
|
r = MulAdd(s0_3, M4(-5.801e-02, -3.381e-02, -2.285e-01, 9.377e-02, 1.878e-01, 9.285e-02, -1.001e-01, -5.059e-02, -2.155e-02, -9.098e-02, -1.279e-02, 9.801e-02, 1.178e-01, -1.967e-01, -4.792e-02, -1.106e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.048e-01, 2.731e-01, -2.351e-01, -1.516e-01, -1.382e-02, 1.296e-01, -9.530e-02, 2.975e-02, 2.411e-01, 2.343e-02, 1.731e-02, -2.331e-01, -2.161e-01, 4.114e-01, 4.417e-01, 1.225e+00), r);
|
||||||
|
r = MulAdd(s0_5, M4(3.337e-01, 2.844e-01, 1.065e-01, -2.391e-01, -1.265e-01, -3.625e-02, -7.062e-02, 3.529e-02, 2.208e-02, -8.459e-03, -1.366e-01, -1.563e-02, -1.648e-01, -5.919e-01, 4.061e-01, -4.975e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(6.213e-03, -2.020e-02, 2.520e-03, 2.167e-02, -2.361e-01, -1.421e-01, -4.579e-02, -1.353e-01, -2.883e-01, -5.900e-04, 2.720e-02, 1.591e-01, -5.120e-01, -4.253e-01, -3.397e-02, -4.633e-01), r);
|
||||||
|
r = MulAdd(s0_7, M4(2.456e-01, -6.978e-02, 5.668e-02, -9.795e-03, -1.925e-01, -4.841e-02, -1.273e-02, 1.282e-02, -1.223e-01, -4.080e-02, 2.975e-02, 1.595e-01, -3.345e-01, -1.504e-01, 1.080e-01, 8.549e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(8.700e-02, 1.611e-02, 8.589e-02, -3.284e-02, -1.637e-01, 2.627e-01, 1.851e-02, 2.843e-02, 1.224e-01, 6.163e-02, 4.991e-02, -1.510e-01, 1.885e-01, -5.951e-02, -3.463e-02, 2.172e-01), r);
|
||||||
|
r = MulAdd(s1_0, M4(1.856e-01, -1.041e-01, 1.900e-01, 8.420e-02, -3.223e-01, 6.258e-02, -9.766e-02, -6.517e-01, 3.066e-02, -7.562e-02, 1.015e-02, -1.139e-01, 1.569e-02, -3.684e-02, -2.813e-02, 8.835e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-7.107e-02, -1.146e-01, 5.488e-01, -2.960e-01, 3.743e-01, -5.368e-01, -2.219e-01, -3.122e-01, 2.468e-02, -7.477e-01, 1.858e-01, 3.498e-01, 1.771e-03, 4.215e-03, 8.478e-02, 9.318e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-2.350e-03, -3.382e-01, 5.964e-01, -2.321e-01, 2.011e-01, 1.890e-01, -2.062e-01, -3.725e-02, -1.003e-01, -1.464e-01, 1.040e-01, 9.994e-02, -7.113e-02, -3.827e-02, -1.258e-01, -1.584e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(-1.609e-01, -1.460e-01, -4.804e-03, 5.503e-02, 2.784e-01, -1.475e-02, 9.395e-02, -1.128e-01, 1.032e-02, -1.969e-01, 2.170e-01, 2.335e-01, -1.371e-01, 4.853e-02, 8.945e-03, -2.698e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(7.739e-02, -1.105e-01, 3.348e-01, 1.093e-01, -7.745e-02, -1.642e-01, -2.191e-01, -2.674e-02, 4.199e-01, -3.302e-01, 1.445e-01, -2.815e-01, -3.154e-01, 6.646e-02, 8.520e-02, -1.053e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-4.165e-01, -8.545e-02, 2.291e-01, -1.042e-01, 3.791e-01, -7.209e-02, -6.332e-02, -3.174e-01, 1.038e-01, 8.122e-03, -9.715e-02, 6.808e-01, -9.362e-02, -4.634e-02, 5.184e-03, 1.295e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-8.179e-02, -8.513e-02, 4.470e-02, -7.799e-02, -1.092e-01, -1.851e-01, -1.025e-01, -4.220e-02, -3.853e-01, 3.040e-02, -9.081e-02, 1.439e-01, -2.730e-02, -5.086e-02, 5.352e-03, -5.102e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(7.601e-02, -1.423e-01, 3.421e-01, 2.574e-03, 1.165e-01, 6.863e-03, 1.250e-02, -4.862e-02, -3.859e-01, -1.108e-01, 2.515e-02, 5.564e-01, 2.485e-01, 2.230e-01, -3.839e-02, 3.605e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-9.424e-02, 1.248e-01, 1.980e-01, -1.671e-01, 1.098e-01, 6.555e-02, -7.194e-02, -1.626e-01, -1.439e-01, -2.086e-01, -1.925e-02, 1.520e-01, 2.139e-01, -7.764e-02, 6.469e-02, 7.875e-03), r);
|
||||||
|
r = MulAdd(s2_0, M4(4.572e-02, 3.661e-02, -3.845e-01, -1.383e-01, 1.729e-02, 1.780e-02, 3.664e-02, -6.961e-02, -9.001e-03, -1.853e-02, -6.735e-02, -1.864e-02, 1.695e-01, -1.420e-01, 2.679e-01, -1.525e-01), r);
|
||||||
|
r = MulAdd(s2_1, M4(9.967e-02, -2.869e-01, -2.251e-01, 8.470e-02, 3.178e-02, -9.701e-03, 9.260e-02, 4.087e-04, -8.081e-02, 1.341e-01, 5.882e-03, 1.043e-02, 8.559e-03, 6.534e-02, -4.619e-01, -3.010e-01), r);
|
||||||
|
r = MulAdd(s2_2, M4(-1.676e-02, -3.339e-01, 1.848e-01, -2.562e-01, -8.563e-02, 2.487e-02, 2.495e-01, 9.448e-02, 2.189e-02, -3.018e-02, 5.698e-02, 6.041e-02, -4.869e-02, -2.627e-02, 1.602e-01, 1.092e-01), r);
|
||||||
|
r = MulAdd(s2_3, M4(6.867e-02, -1.693e-01, -1.614e-01, -1.944e-01, 1.992e-01, 1.720e-01, 2.393e-01, 1.219e-02, 4.866e-02, -1.165e-01, -1.285e-01, 2.929e-01, 2.043e-01, -1.399e-02, 1.595e-02, -2.746e-01), r);
|
||||||
|
r = MulAdd(s2_4, M4(-4.477e-01, -5.696e-01, -1.760e-02, 1.362e-01, 1.472e-01, 3.113e-01, -2.419e-01, 8.650e-02, -8.358e-02, 1.081e-01, 3.881e-02, -1.400e-01, -2.071e-01, 3.977e-02, -3.149e-01, 2.525e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(5.496e-02, -9.963e-02, -1.227e-01, -1.892e-01, 4.361e-02, -3.776e-01, -6.576e-01, 2.628e-01, -8.215e-02, -8.123e-02, 2.248e-03, 1.261e-01, 1.193e-01, 2.608e-01, 2.567e-01, 8.120e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(-1.587e-01, -9.849e-02, 1.122e-01, -5.963e-02, -9.176e-02, 7.341e-03, 1.164e-03, -5.660e-02, 1.567e-01, -6.958e-02, -3.780e-02, 4.238e-04, -6.186e-02, 1.777e-01, 2.398e-01, 6.853e-03), r);
|
||||||
|
r = MulAdd(s2_7, M4(1.062e-01, -1.498e-01, 5.492e-02, 1.108e-01, -3.248e-01, -2.901e-01, -4.360e-01, 1.128e-01, 7.346e-02, 8.659e-02, 9.740e-02, -1.434e-01, 1.538e-01, 1.349e-01, 1.408e-01, -1.367e-01), r);
|
||||||
|
r = MulAdd(s2_8, M4(1.412e-01, -8.889e-02, 2.029e-02, -1.523e-01, 4.847e-01, -7.432e-01, -1.181e-01, 4.132e-01, 3.119e-02, -5.840e-02, -2.292e-02, -3.125e-02, 2.440e-02, 2.815e-02, 2.759e-01, -8.781e-02), r);
|
||||||
|
r = MulAdd(s3_0, M4(2.147e-02, 2.192e-01, 2.489e-01, -3.436e-02, 1.086e-02, -2.680e-02, -9.925e-02, 3.978e-02, 1.239e-01, 3.645e-02, 5.463e-01, 5.005e-01, 1.039e-01, -1.694e-01, -3.816e-02, 3.834e-01), r);
|
||||||
|
r = MulAdd(s3_1, M4(1.418e-01, 5.806e-02, 1.317e-01, 2.227e-01, 1.486e-02, -4.235e-03, -5.750e-02, -1.548e-01, -7.700e-01, 3.263e-01, -1.193e-02, 3.537e-01, -2.841e-01, 4.657e-01, -1.576e-01, -9.526e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(7.641e-02, 8.195e-01, 1.080e-01, 1.814e-01, -5.471e-02, 2.211e-02, -4.212e-02, -1.249e-02, 2.469e-02, 5.436e-01, 3.805e-01, -9.622e-02, -6.358e-02, -3.739e-01, -3.504e-01, -2.627e-01), r);
|
||||||
|
r = MulAdd(s3_3, M4(-9.359e-02, -1.830e-02, -7.015e-02, -7.774e-02, 2.286e-01, -6.321e-02, -5.124e-02, -2.799e-03, -5.063e-01, -1.835e-01, 3.716e-01, 1.130e+00, 3.259e-01, -2.045e-01, -1.792e-01, 4.892e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(-7.478e-01, -1.192e-01, 1.022e-01, 8.111e-01, 7.253e-02, 2.280e-01, -1.116e-01, -2.828e-01, -2.364e-01, -1.233e+00, -1.125e+00, 1.750e+00, -1.215e+00, 4.973e-02, 2.070e-01, 6.996e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-4.115e-02, 3.613e-01, 2.694e-01, 4.126e-02, 7.046e-02, 6.242e-02, 9.300e-02, -1.965e-01, -3.211e-01, 8.504e-01, 2.518e-01, -5.622e-01, 5.663e-02, -1.139e-01, 1.150e-01, -1.954e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(-1.870e-01, -9.168e-02, -8.947e-02, 6.127e-03, 1.163e-02, 3.733e-04, -3.330e-01, 1.935e-01, 3.424e-01, 1.313e-01, -6.732e-01, 8.256e-02, 6.713e-02, 2.980e-02, -6.912e-02, 1.715e-01), r);
|
||||||
|
r = MulAdd(s3_7, M4(1.636e-01, 1.212e-01, 2.280e-02, 1.552e-01, -4.955e-01, 8.376e-01, 1.476e-01, 2.192e-01, 9.746e-01, -3.148e-01, 8.206e-01, -8.104e-01, -7.918e-02, -1.604e-01, 5.505e-02, 7.640e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(1.248e-01, 2.878e-01, -4.182e-02, -9.214e-02, -1.210e-01, 4.382e-01, 8.062e-02, -3.051e-01, -1.803e-01, -3.041e-01, 1.368e-01, -1.030e-01, 2.941e-02, -2.724e-01, 3.480e-02, 1.396e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 2 ("conv1") kernel, second of two. Same structure as the pass's f0 with
// a different set of constants: starting from a constant 4-component vector,
// each of the 36 inputs is folded into the accumulator through MulAdd with its
// own constant 4x4 matrix; no activation is applied here.
// In Pass2, s0_*/s1_* are max(x, 0) and -max(-x, 0) of the nine t0 samples and
// s2_*/s3_* are the same split of the nine t1 samples.
// NOTE(review): the pass declares `//!OUT t2, t3`; presumably this kernel feeds
// t3 - the store is outside this view, confirm.
// The literals are generated network weights: do not edit by hand.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
V4 r = { 3.015e-03, -4.690e-02, 3.573e-02, -1.486e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(1.574e-01, -6.104e-03, -2.288e-01, 5.024e-03, -2.149e-03, -8.674e-02, 1.209e-01, 7.107e-02, 1.242e-01, 2.312e-03, -5.300e-02, -2.285e-01, -8.824e-02, 7.402e-02, -4.447e-01, 1.117e+00), r);
|
||||||
|
r = MulAdd(s0_1, M4(-5.617e-02, 3.613e-01, -4.666e-01, 1.795e-01, 1.718e-01, -1.005e-01, -2.593e-01, 4.103e-01, 2.477e-01, 1.883e-01, 3.928e-02, -3.635e-01, -7.353e-01, 3.209e-01, 2.171e-01, 3.924e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-3.304e-01, 6.332e-01, -3.898e-01, 2.704e-01, 4.110e-02, -2.786e-01, -2.513e-01, 1.800e-01, 9.402e-03, -1.975e-01, -4.040e-02, -2.047e-01, -3.239e-01, -1.623e-01, 1.001e-01, 3.053e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(3.935e-01, 5.218e-02, 4.630e-02, 2.202e-02, -2.172e-01, -1.530e-02, -1.782e-01, -9.327e-02, -6.425e-02, -2.402e-02, -2.919e-02, -4.034e-02, 6.589e-01, -4.900e-02, 7.783e-02, 6.334e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(-5.475e-02, 1.543e-01, -1.597e-01, -2.500e-01, 4.990e-02, 5.780e-02, 1.162e-01, 1.140e-01, -2.980e-01, -2.524e-02, -2.103e-01, 4.297e-01, 4.528e-01, -3.098e-01, -1.415e-01, 7.565e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-1.097e-01, 3.376e-01, -5.685e-01, 1.347e-01, 1.155e-01, -1.396e-01, -2.840e-01, -1.373e-01, 1.442e-01, 8.711e-02, 1.357e-01, -1.110e-01, 2.095e-01, -2.901e-01, -1.007e-01, -2.473e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-7.405e-02, 9.320e-02, -5.870e-02, -2.569e-01, 6.017e-03, -8.078e-02, -3.798e-02, 2.334e-01, 1.440e-01, -1.852e-01, -6.627e-03, 3.514e-03, -1.499e-02, -6.237e-02, 3.665e-01, 3.270e-01), r);
|
||||||
|
r = MulAdd(s0_7, M4(2.443e-01, 8.076e-02, -2.143e-01, 1.120e-01, 1.187e-01, 1.317e-01, 1.811e-01, 1.918e-01, -2.164e-02, -1.829e-01, 2.105e-01, 3.085e-01, 3.155e-01, 2.801e-01, -6.834e-01, 2.861e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(9.974e-03, 9.704e-02, -2.363e-01, 1.829e-01, 1.844e-02, 9.298e-02, -5.319e-02, -5.899e-02, -2.154e-01, 2.555e-02, -8.374e-02, 1.254e-01, -2.736e-01, -4.065e-02, 4.838e-02, 3.338e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-1.239e-02, -1.316e-01, 8.694e-02, -8.443e-02, -1.143e-01, -6.018e-02, -9.054e-02, 7.381e-02, 2.722e-01, 1.030e-01, -8.583e-02, -4.433e-01, -1.339e-01, 1.264e-01, 8.581e-02, -1.947e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(3.030e-01, -3.527e-02, 4.665e-01, -3.372e-02, -2.301e-02, 7.308e-01, 5.938e-01, -5.901e-01, 4.766e-01, 1.081e-01, 8.809e-02, 3.482e-01, -1.938e-01, -8.091e-02, 3.649e-02, 9.321e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(1.376e-01, -4.460e-01, 4.298e-01, -4.809e-02, -3.819e-01, 5.216e-01, 2.687e-01, 1.359e-01, 2.936e-01, 1.222e-02, 3.706e-01, 2.481e-01, -4.716e-02, -1.798e-02, 2.731e-02, -7.140e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(1.657e-01, -3.624e-02, 1.541e-01, -5.006e-03, -4.051e-01, -9.782e-02, 3.008e-02, 1.962e-01, -6.146e-02, 1.866e-03, -3.052e-01, -2.202e-01, 1.057e-01, -1.151e-01, -6.310e-02, 3.914e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(-2.629e-01, 1.029e-01, 1.812e-02, -2.950e-01, -1.191e-01, 2.580e-01, -4.833e-01, 1.095e-01, 2.309e-02, 4.519e-02, 1.086e-01, 5.362e-01, -1.349e-01, -1.278e-01, 7.109e-02, -1.992e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.815e-01, 2.898e-01, 3.446e-01, -1.587e-01, -6.360e-02, 1.662e-01, 5.187e-01, 1.701e-01, -2.770e-02, -5.932e-01, 2.467e-01, 3.940e-01, 1.022e-01, 1.033e-01, -5.084e-02, -6.520e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-1.494e-01, 3.180e-02, 9.864e-02, -3.409e-01, 1.397e-02, 9.932e-03, -2.110e-01, 2.636e-01, 1.353e-01, -8.495e-02, -2.680e-03, -2.287e-01, 1.136e-01, -1.047e-01, 2.910e-02, 9.922e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.533e-01, 4.819e-04, 1.735e-01, 2.027e-01, 1.316e-01, 1.029e-01, 1.446e-01, 1.737e-01, 4.855e-02, 4.781e-02, 2.025e-01, 1.587e-01, 1.661e-01, 7.134e-02, 5.853e-02, -1.530e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.476e-01, -4.916e-02, 1.989e-01, 1.159e-01, 4.753e-02, 1.694e-01, 4.343e-02, -6.974e-03, 3.382e-02, 2.275e-01, 3.466e-01, -7.178e-03, -1.104e-01, 2.059e-03, -7.101e-02, 8.934e-02), r);
|
||||||
|
r = MulAdd(s2_0, M4(-3.467e-01, 8.471e-04, 1.580e-01, 2.685e-01, -2.680e-02, -6.444e-02, 8.843e-02, 5.232e-03, 2.576e-02, -3.756e-02, -7.913e-03, -3.871e-02, -5.374e-02, -6.060e-02, -7.688e-02, 6.738e-01), r);
|
||||||
|
r = MulAdd(s2_1, M4(-3.963e-01, 1.295e-01, 2.623e-01, 2.565e-01, -1.831e-01, -6.054e-02, 1.817e-01, -8.944e-02, 1.974e-01, -2.800e-04, -3.964e-02, 1.232e-01, -3.477e-01, 3.791e-01, 1.438e-01, -7.862e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(2.540e-02, 1.123e-01, 6.461e-01, -3.856e-03, 3.373e-02, -5.719e-02, 1.556e-01, -1.100e-01, -3.499e-02, 9.146e-02, -4.624e-02, 9.774e-02, -1.148e-01, -2.280e-01, 4.977e-01, -1.568e-01), r);
|
||||||
|
r = MulAdd(s2_3, M4(5.352e-02, -1.293e-01, -6.991e-03, 4.190e-01, -2.334e-03, -4.433e-02, -8.470e-02, 1.162e-01, -1.045e-01, -7.444e-02, 8.951e-02, -1.124e-01, 4.295e-01, 1.086e-01, 1.336e-01, 2.645e-01), r);
|
||||||
|
r = MulAdd(s2_4, M4(-4.062e-01, -6.781e-02, 4.629e-01, -4.931e-01, -1.875e-01, 1.958e-01, -4.560e-01, -2.286e-02, -2.066e-01, 1.151e-01, -5.924e-02, 1.350e-01, -1.752e-01, 2.244e-01, -3.564e-02, -6.129e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(6.644e-02, 4.611e-01, 9.200e-02, 6.845e-03, -1.628e-02, 8.352e-02, -1.119e-01, -4.386e-02, -5.822e-02, -4.769e-02, -3.224e-02, -1.235e-01, -3.296e-01, 5.835e-03, 2.231e-01, 5.535e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(-2.961e-02, -5.230e-02, 5.124e-02, 6.542e-02, 2.004e-01, 1.189e-01, -1.797e-01, -1.535e-02, 6.469e-02, 1.134e-01, -1.204e-04, -7.606e-02, 2.436e-02, -1.630e-02, 1.841e-01, -2.529e-01), r);
|
||||||
|
r = MulAdd(s2_7, M4(-1.147e-02, 3.246e-02, 7.626e-02, -1.013e-01, 1.075e-01, 5.871e-01, -5.227e-01, -3.076e-01, 1.609e-01, 5.768e-02, -1.912e-02, 5.898e-02, -7.530e-02, -1.307e-01, 5.828e-02, -1.456e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(-7.053e-02, 8.728e-02, 1.211e-01, 1.410e-01, -2.160e-01, 9.970e-02, -5.345e-01, 1.141e-01, 8.112e-04, -4.348e-02, 9.858e-02, 2.780e-02, -1.116e-01, -2.331e-01, 1.545e-01, 7.984e-02), r);
|
||||||
|
r = MulAdd(s3_0, M4(-5.412e-02, 6.012e-03, -2.395e-01, -1.209e-02, -5.734e-02, 3.058e-02, -7.202e-02, -7.514e-02, 7.241e-03, -1.702e-01, 1.020e+00, 2.997e-01, -2.173e-01, 4.518e-02, -2.703e-02, -4.087e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(-5.670e-02, -9.713e-03, -2.091e-01, -1.621e-01, -5.370e-03, -5.579e-02, 1.042e-01, 2.220e-02, 4.788e-01, -6.623e-01, 5.548e-01, 8.186e-01, 2.462e-01, -7.624e-01, -9.065e-02, -1.105e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(4.043e-02, -1.577e-01, -3.166e-01, -1.256e-01, -9.515e-02, -8.852e-02, -4.960e-02, 1.129e-01, 1.690e-01, 2.314e-01, -5.134e-01, 9.584e-02, -3.085e-02, 2.399e-01, -3.381e-01, -7.233e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(1.750e-01, -9.450e-02, -2.230e-01, 4.190e-01, 8.900e-02, 2.306e-02, 2.783e-01, -3.295e-01, 2.697e+00, 8.855e-02, 5.728e-01, -8.682e-01, 6.085e-02, 5.010e-02, 1.343e-01, 1.137e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(9.857e-02, 3.310e-01, -3.584e-01, -5.586e-01, 5.751e-01, -4.023e-01, 3.838e-01, 1.240e-01, -1.482e-01, -1.233e-01, -5.953e-01, 1.534e+00, 3.390e-01, -2.022e-02, 1.619e-01, -2.959e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(1.528e-01, 1.593e-01, -1.886e-01, 2.281e-02, 2.174e-01, -8.846e-01, 5.726e-02, 7.369e-03, -1.490e-01, 3.377e-01, -4.669e-02, 1.206e-01, -1.251e-01, 2.600e-01, -2.439e-01, 2.067e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(4.090e-02, -2.118e-02, -9.012e-02, -8.624e-03, 1.464e-01, 6.929e-02, 1.492e-01, -4.039e-01, 6.123e-01, 2.679e-01, -2.284e-01, -3.609e-01, -6.598e-02, 1.341e-01, -2.371e-02, -2.899e-01), r);
|
||||||
|
r = MulAdd(s3_7, M4(5.189e-02, -3.928e-02, 1.670e-01, -1.536e-01, 5.066e-01, -3.768e-01, 6.577e-01, 1.140e-01, -1.537e-01, -1.941e-01, -9.152e-02, -3.571e-02, 1.068e-01, 4.803e-02, -3.180e-01, 4.361e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(-8.453e-02, -1.454e-02, 3.613e-02, 8.974e-03, -1.258e-01, -5.842e-01, 3.264e-01, 2.910e-01, 1.306e-01, 4.552e-01, 4.524e-01, 1.065e-02, -1.792e-02, 1.875e-02, -2.206e-01, 2.028e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass2: produces one texel of t2 and one texel of t3 for the thread's pixel.
//   blockStart - added to the per-thread offset to obtain the pixel coordinate.
//   tid        - thread id; only tid.x is read.
// Each of the nine l0 taps and nine l1 taps is split into max(x, 0) and
// -max(-x, 0), giving the 36 vectors that f0 and f1 consume.
// NOTE(review): pt, gxy, size and pos keep their original names because the
// l0/l1 macros (via O, defined outside this chunk) presumably capture some of
// them from the enclosing scope — confirm before renaming.
void Pass2(uint2 blockStart, uint3 tid) {
    float2 pt = float2(GetInputPt());
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();

    // Threads that fall outside the input do nothing.
    const bool insideInput = gxy.x < size.x && gxy.y < size.y;
    if (!insideInput) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // First input (l0): fetch each tap, keep its negative part in s1_k,
    // then clamp s0_k to its positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);

    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);

    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);

    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);

    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);

    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);

    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // Second input (l1): same split, into s2_k (positive) and s3_k (negative).
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);

    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);

    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);

    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);

    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);

    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);

    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);

    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);

    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Evaluate both output vectors, then write them to the pass outputs.
    const V4 firstOut = f0(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    const V4 secondOut = f1(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

    t2[gxy] = firstOut;
    t3[gxy] = secondOut;
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t2, t3
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
// Tap helpers for pass 3: l0 reads from t2 and l1 reads from t3 (the two
// textures listed in this pass's //!IN directive), each converting the result to V4.
// NOTE(review): O is defined outside this chunk; presumably it fetches the
// texture at an offset of (x, y) texels from the current pixel — confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 3, conv2): computes the V4 value that Pass3 stores in t0.
// Starts from a constant vector r and folds in 36 MulAdd terms in a fixed
// order, one per input vector, each paired with its own 16-value M4 constant.
// As called from Pass3, s0_k / s1_k are the max(x, 0) and -max(-x, 0) parts of
// the nine l0 taps, and s2_k / s3_k the same two parts of the nine l1 taps;
// k = 0..8 runs over offsets (-1,-1), (0,-1), (1,-1), (-1,0), ..., (1,1).
// NOTE(review): MulAdd and M4 are defined outside this chunk; presumably
// MulAdd(v, m, r) yields v * m + r — confirm against their definitions.
// NOTE(review): the constants look like trained weights — presumably generated;
// do not reorder the statements, since floating-point accumulation order matters.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Initial value of the accumulator.
V4 r = { 3.575e-03, 3.041e-03, 1.241e-02, -2.230e-03 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(8.130e-02, -1.243e-01, -7.648e-02, -2.424e-01, -4.742e-02, -5.420e-02, 4.117e-02, 1.568e-01, -3.621e-02, 2.032e-01, 4.484e-02, 1.249e-02, -1.505e-01, 7.294e-02, 4.943e-02, -6.336e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.474e-01, -3.366e-01, -5.670e-01, 4.113e-02, -1.260e-01, -1.539e-01, -5.421e-02, 1.779e-01, -1.072e-01, 1.209e-01, 4.423e-02, 2.454e-01, -5.430e-02, -1.442e-01, -1.501e-02, -4.731e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(6.444e-02, 1.509e-01, 1.452e-01, -2.840e-02, 9.365e-02, 2.016e-01, 1.002e-01, -3.226e-02, -1.186e-01, 1.535e-01, -1.652e-01, -1.104e-02, 4.170e-02, -4.404e-02, 1.189e-01, 1.007e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-4.618e-02, 1.024e-01, -1.723e-01, -1.354e-01, 1.981e-01, -1.992e-01, 1.670e-01, 3.857e-01, -6.927e-03, 9.087e-02, 1.176e-01, 3.314e-01, 9.860e-02, 4.009e-04, 1.061e-01, -6.930e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(4.923e-01, -9.248e-02, 8.616e-03, 4.541e-02, -1.148e-01, 3.990e-03, -3.218e-02, 8.942e-02, 3.219e-02, -9.786e-02, 6.813e-02, 2.492e-01, -3.165e-01, 6.925e-02, -9.826e-02, 4.518e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(2.222e-02, 1.046e-01, -2.327e-02, -7.823e-02, 3.540e-01, 3.363e-01, -4.089e-02, 1.292e-02, -2.530e-01, 4.606e-01, -6.191e-02, -3.673e-02, -2.764e-01, -1.360e-01, -2.947e-03, 2.534e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-2.067e-02, -1.566e-01, -8.968e-02, 1.386e-03, -8.841e-02, -1.077e-01, 1.646e-01, 1.987e-01, -3.098e-01, 2.764e-01, 1.935e-01, 1.847e-01, 1.116e-01, -1.514e-01, -5.175e-02, 8.710e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(1.266e-02, -2.119e-01, -1.610e-01, -6.512e-02, -1.679e-01, 2.247e-01, -5.854e-02, 1.200e-02, -1.406e-01, 4.393e-01, -8.517e-02, 3.281e-02, -1.177e-01, -1.861e-01, -3.241e-01, -2.918e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-3.015e-02, -1.605e-01, -1.001e-01, 7.795e-03, -5.873e-02, -7.686e-02, -1.448e-01, -1.851e-02, -2.172e-01, 1.977e-01, -1.333e-01, -8.894e-02, -8.939e-03, 1.675e-01, -7.976e-03, 4.020e-02), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(1.165e-01, -4.833e-02, 4.750e-02, -4.032e-02, -2.287e-02, -4.825e-02, 9.058e-02, 2.136e-01, 1.009e-01, -2.133e-02, 4.162e-02, -6.816e-02, -9.863e-02, -4.160e-03, -2.467e-02, -9.096e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(8.597e-02, -2.205e-01, 1.515e-01, -2.918e-02, -1.099e-01, -4.171e-02, 3.893e-04, -5.273e-03, -2.046e-02, -3.905e-03, 7.793e-04, 5.930e-02, 2.653e-02, -2.546e-01, -8.456e-02, -6.554e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-1.058e-01, 3.302e-01, 1.812e-01, 6.427e-02, -4.601e-02, -1.589e-02, 4.405e-02, -1.366e-02, -5.996e-03, -5.402e-04, 3.237e-02, -5.725e-02, -7.486e-02, 1.358e-01, 4.739e-02, -2.432e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(3.333e-02, 5.179e-01, -1.939e-03, 7.798e-02, 2.011e-02, -2.959e-01, 1.135e-01, 3.122e-01, 8.651e-02, -2.708e-02, 7.183e-03, 4.554e-02, -3.342e-02, 9.136e-03, -7.067e-02, -1.867e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(6.231e-01, 9.512e-01, 3.523e-01, 3.744e-01, 2.388e-01, -2.827e-01, 9.968e-02, -5.306e-02, -4.498e-02, -2.222e-01, -5.865e-02, 2.967e-02, -3.029e-01, -2.137e-01, -5.363e-01, 8.872e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(-4.862e-02, 7.326e-01, 1.354e-01, 5.607e-02, 1.667e-01, -1.184e-01, -1.304e-01, 6.817e-02, 3.287e-02, 3.310e-01, 1.521e-01, -3.212e-02, -8.947e-02, 4.250e-02, -9.770e-02, -8.344e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-9.242e-04, 4.835e-03, 1.322e-01, 3.745e-02, 9.613e-02, -8.310e-03, 4.718e-02, 2.763e-02, -1.616e-02, 6.167e-02, -3.382e-02, 3.624e-02, 1.213e-02, -2.014e-01, -2.776e-03, 4.360e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-6.861e-02, 4.772e-02, -3.779e-02, 7.567e-02, -8.548e-02, -1.028e-02, 1.881e-02, 2.421e-03, 1.378e-01, 1.305e-01, 2.177e-02, -1.118e-03, 5.861e-02, -1.416e-01, -3.140e-01, -9.031e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-4.147e-02, 1.546e-01, 5.650e-02, 4.098e-02, -1.460e-01, -5.779e-02, -1.959e-02, -2.318e-02, 3.538e-02, -5.044e-02, 3.304e-02, -3.517e-03, -1.176e-01, -3.185e-01, -1.738e-01, -4.349e-02), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(-3.428e-03, 6.059e-02, 7.024e-02, 2.739e-02, 1.313e-02, -5.748e-02, 9.005e-03, -7.139e-03, 1.165e-01, -1.541e-01, 1.493e-01, 2.725e-01, 3.254e-02, -2.934e-02, 1.115e-02, -2.844e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(-8.601e-03, -3.177e-03, 1.878e-01, 1.106e-01, 1.951e-02, 8.194e-02, 4.971e-02, 5.805e-02, 2.515e-02, -2.529e-01, -2.250e-01, 3.498e-02, 7.183e-02, -8.617e-02, -8.616e-02, 1.623e-01), r);
|
||||||
|
r = MulAdd(s2_2, M4(-8.072e-02, -1.234e-01, 3.482e-02, -2.873e-02, -4.049e-02, 4.828e-03, 1.940e-02, 3.828e-02, -5.156e-03, 4.585e-03, 2.326e-02, 2.346e-02, -8.908e-02, -1.384e-03, -2.366e-02, 1.290e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(4.921e-02, 1.726e-01, 3.832e-02, -2.490e-01, -1.152e-01, -1.722e-01, -1.705e-01, 4.228e-01, -8.215e-02, -1.478e-02, 1.554e-01, 3.701e-01, -8.863e-02, 1.068e-01, 8.890e-03, 6.324e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(1.307e-01, 2.312e-01, -1.734e-01, 2.083e-02, -1.966e-01, -3.991e-01, -8.681e-02, 1.976e-03, -3.177e-01, 1.528e-01, -2.329e-01, 2.569e-01, -6.230e-03, 6.020e-02, 4.969e-02, -2.039e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(1.660e-01, 1.642e-02, 7.203e-02, -1.613e-01, 6.225e-02, 6.470e-02, 3.305e-03, 2.230e-02, -2.455e-02, 6.599e-02, -1.740e-01, 7.887e-02, 3.463e-03, 1.003e-01, -1.850e-01, 7.885e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(-2.170e-02, 1.372e-01, 7.445e-02, -9.419e-02, -1.851e-01, 4.957e-02, -2.454e-01, 5.879e-02, -5.800e-02, -1.122e-01, 7.445e-02, 1.190e-01, 2.695e-02, -5.701e-02, -5.166e-02, -5.058e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(5.390e-01, 1.674e-01, 1.213e-01, -1.147e-01, -6.939e-02, -1.218e-01, -2.891e-01, 2.682e-02, -2.636e-01, -1.104e-01, -1.556e-01, 3.774e-02, -4.121e-02, -2.431e-01, -1.248e-01, 1.275e-01), r);
|
||||||
|
r = MulAdd(s2_8, M4(1.053e-01, 2.238e-01, -1.104e-01, 5.372e-02, 6.179e-02, -2.431e-03, -4.843e-02, 3.820e-02, -7.539e-02, 7.898e-02, 7.562e-03, 1.596e-02, 7.298e-02, -1.553e-01, -3.545e-01, 1.990e-02), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(8.232e-02, -6.815e-02, -7.421e-02, -3.191e-02, -1.592e-01, 2.814e-01, 5.009e-02, 3.669e-02, -5.908e-02, -5.445e-02, 4.873e-02, 1.538e-01, 1.065e-01, -2.194e-01, -2.612e-02, -2.297e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(1.431e-02, -7.835e-02, -2.790e-03, 9.305e-02, -2.975e-01, 1.527e-01, 1.888e-01, -1.279e-02, -1.938e-02, -1.022e-01, -2.197e-02, -2.919e-02, 2.192e-01, -8.056e-02, 1.328e-03, 3.478e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(4.920e-03, -6.286e-02, -7.779e-02, 1.075e-01, -1.092e-01, 2.909e-01, 3.056e-01, -9.017e-02, -3.625e-02, 1.079e-01, 1.107e-01, 6.613e-02, 1.696e-01, -1.852e-01, -1.253e-01, -9.675e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(-6.350e-02, 1.137e-01, -3.559e-02, -1.684e-01, -2.044e-01, -9.368e-02, 2.283e-01, 8.052e-01, 4.476e-03, -1.599e-01, 2.594e-02, 1.582e-01, -2.483e-02, 9.216e-02, 5.719e-02, 2.237e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(-1.694e-01, 1.597e-01, -3.311e-01, 1.880e-01, 2.614e-01, -2.584e-01, 5.296e-02, 9.726e-02, -3.932e-02, -7.518e-02, -1.749e-01, 1.604e-01, 1.008e-01, 2.920e-01, 5.358e-01, -6.383e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-2.706e-01, -2.716e-01, -4.196e-01, 1.023e-01, 2.201e-01, -1.412e-01, 1.003e-01, -6.972e-02, 3.727e-02, -8.424e-02, -7.870e-02, 2.294e-02, 2.836e-01, -4.165e-01, -2.974e-01, -3.567e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(-3.434e-02, 6.420e-02, -8.729e-02, -8.600e-02, -2.041e-01, 1.646e-02, 9.025e-02, 1.724e-01, -4.951e-02, -3.894e-02, -7.985e-02, 1.580e-02, 2.554e-01, -3.100e-01, -2.769e-01, 8.336e-05), r);
|
||||||
|
r = MulAdd(s3_7, M4(-6.557e-02, 3.865e-02, -3.263e-02, 4.621e-02, -2.077e-01, 2.705e-02, -3.354e-01, 1.480e-01, 4.155e-02, -2.143e-01, -2.626e-01, 1.091e-02, 1.382e-01, -1.706e-01, -1.355e-01, -7.700e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(-2.004e-01, 4.575e-01, -1.812e-01, 6.102e-02, 3.469e-01, -6.634e-02, 1.302e-01, -9.621e-02, 4.023e-02, 1.048e-01, -9.194e-02, 5.130e-03, 4.272e-01, -5.971e-01, -2.025e-01, -1.364e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// f1 (pass 3, conv2): computes the V4 value that Pass3 stores in t1.
// Same structure as the f0 of this pass: a constant starting vector r followed
// by 36 MulAdd terms in a fixed order, one per input vector, each with its own
// 16-value M4 constant. Only the constants differ.
// As called from Pass3, s0_k / s1_k are the max(x, 0) and -max(-x, 0) parts of
// the nine l0 taps, and s2_k / s3_k the same two parts of the nine l1 taps;
// k = 0..8 runs over offsets (-1,-1), (0,-1), (1,-1), (-1,0), ..., (1,1).
// NOTE(review): MulAdd and M4 are defined outside this chunk; presumably
// MulAdd(v, m, r) yields v * m + r — confirm against their definitions.
// NOTE(review): the constants look like trained weights — presumably generated;
// do not reorder the statements, since floating-point accumulation order matters.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Initial value of the accumulator.
V4 r = { -8.611e-03, -6.529e-03, -1.098e-03, 4.669e-03 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(-1.437e-01, -9.784e-02, 2.649e-01, -8.638e-02, -1.746e-01, 2.031e-01, 1.203e-01, -8.812e-02, -2.317e-01, 2.311e-01, 3.171e-02, -3.619e-02, -7.798e-02, -2.507e-02, 1.902e-01, 5.780e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(2.504e-01, 1.577e-01, -5.397e-02, 4.599e-01, -1.392e-01, 2.560e-01, 1.018e-01, 7.968e-02, 2.247e-01, -2.962e-03, 1.421e-03, -1.201e-01, -3.622e-01, 1.378e-01, 1.392e-01, 1.641e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-6.143e-02, -6.336e-02, 1.131e-01, 6.811e-02, -5.817e-02, 7.362e-02, 1.407e-01, 1.823e-02, 4.880e-01, -2.282e-01, -2.704e-01, -4.287e-01, -2.741e-01, 3.163e-02, 1.098e-01, 1.514e-01), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.794e-01, 1.720e-01, -4.092e-01, 1.277e-01, -1.938e-01, 3.107e-01, 2.915e-01, 2.279e-01, 2.259e-01, 2.136e-01, 5.867e-02, 2.359e-01, -1.589e-01, 1.132e-01, 6.871e-02, 2.837e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(-3.070e-01, -4.494e-01, 5.817e-02, 5.153e-01, 5.215e-01, 5.410e-01, 1.286e-01, -5.596e-01, 4.287e-01, 1.821e-01, 1.542e-01, 3.755e-01, 3.820e-01, 2.953e-01, -2.768e-01, -6.977e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-4.881e-02, 2.327e-02, 9.209e-02, -2.102e-02, -1.394e-01, -8.093e-03, 2.263e-01, -4.307e-01, 1.998e-01, -8.793e-02, -1.057e-01, -1.899e-01, 1.577e-01, 3.435e-01, 6.721e-02, 3.093e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(7.516e-02, -1.224e-01, 1.257e-02, -6.769e-02, -8.618e-02, 1.283e-01, 2.060e-01, -1.966e-01, 8.166e-02, -1.263e-01, -2.269e-01, -3.272e-01, -3.439e-02, -2.849e-01, 2.105e-01, -3.015e-03), r);
|
||||||
|
r = MulAdd(s0_7, M4(7.447e-02, -8.731e-02, 2.804e-02, -4.819e-02, -3.311e-01, 3.824e-01, 7.766e-02, 5.672e-02, 4.014e-01, -4.037e-03, 2.287e-01, 5.626e-02, 3.481e-01, -1.010e-01, -1.156e-01, -2.865e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(5.454e-02, -5.590e-02, 3.408e-02, 3.551e-03, 1.262e-02, 8.638e-02, 1.222e-01, 3.418e-01, -2.154e-01, 1.868e-01, 1.210e-01, -2.330e-01, -4.810e-02, -5.190e-02, -8.587e-02, -2.145e-01), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(-3.063e-01, -1.830e-02, 5.167e-01, 4.813e-02, -7.310e-02, 1.443e-01, 1.654e-01, 1.158e-01, 4.789e-02, -3.030e-02, -1.358e-01, 2.986e-02, -4.855e-02, -7.736e-02, 4.514e-01, -1.797e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(4.322e-01, -1.369e-01, 9.431e-02, 3.921e-01, 2.708e-02, -1.218e-02, -9.091e-02, 1.871e-01, 3.763e-02, -9.213e-02, -1.209e-01, -1.587e-01, 3.014e-03, 1.816e-01, 3.099e-01, 3.210e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-7.234e-02, 1.685e-02, 4.444e-01, -1.886e-01, -9.543e-03, 3.966e-02, 1.105e-01, 4.870e-02, 9.471e-02, -5.263e-02, -1.085e-01, 4.226e-02, -1.565e-01, -3.812e-02, 1.708e-01, 1.457e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(2.370e-01, -3.354e-02, -9.648e-02, 1.531e-01, -3.468e-01, -3.957e-02, 3.152e-01, 3.402e-02, 3.762e-02, 9.507e-02, 7.836e-02, 9.088e-03, -1.614e-01, 4.377e-02, 4.748e-02, 1.055e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.342e-01, -5.059e-01, 2.781e-01, 2.906e-01, 1.656e-01, 1.268e-01, 1.183e-01, -2.458e-02, 2.290e-01, 1.779e-01, -8.310e-02, 1.389e-01, 7.282e-02, 1.050e-01, -3.525e-01, 6.810e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.078e-01, -4.451e-02, 7.031e-02, -2.977e-01, 3.596e-02, 3.359e-02, 9.589e-03, 9.070e-02, -1.862e-01, -1.863e-01, -9.652e-02, -5.039e-02, 1.004e-01, 1.598e-01, 1.466e-01, 2.349e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(1.109e-02, -1.607e-01, 1.578e-02, -1.971e-01, 5.020e-02, -7.597e-02, 7.238e-02, 7.241e-02, 2.025e-02, -2.246e-02, 4.652e-02, -8.760e-02, -1.111e-02, 1.890e-02, 1.046e-01, -2.233e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.252e-01, -8.046e-02, -1.321e-01, -3.724e-01, -1.383e-01, 1.151e-01, 5.397e-02, -1.422e-01, 8.319e-02, 9.089e-02, -2.620e-02, 1.662e-01, 2.847e-02, -1.255e-01, 6.933e-02, -1.636e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.517e-01, 3.661e-02, -3.135e-01, -3.395e-01, -1.139e-01, 1.973e-01, 8.547e-03, -3.118e-02, -8.869e-02, -1.209e-01, 1.867e-02, -4.531e-02, 1.016e-01, -6.909e-02, 1.436e-01, 1.663e-01), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(-9.314e-02, 1.395e-02, -1.741e-02, -7.208e-02, -5.164e-02, -5.743e-02, 5.702e-02, 1.342e-01, 6.011e-03, 1.626e-01, 1.101e-01, -1.130e-01, 6.127e-02, -8.956e-03, -7.149e-02, -6.488e-03), r);
|
||||||
|
r = MulAdd(s2_1, M4(-2.534e-01, 1.086e-01, -1.007e-01, -3.067e-02, -1.074e-01, 7.219e-03, 6.768e-02, -1.012e-01, 2.019e-01, 4.263e-03, -7.411e-02, -1.173e-01, 1.961e-01, -5.619e-02, -2.390e-01, -1.323e-01), r);
|
||||||
|
r = MulAdd(s2_2, M4(-1.039e-01, -9.899e-02, -2.206e-01, -2.187e-01, -8.739e-03, 6.607e-02, 4.125e-02, 5.363e-02, -6.572e-03, 3.014e-02, 1.314e-01, -9.560e-02, 2.106e-01, 1.237e-02, -8.354e-02, -4.939e-03), r);
|
||||||
|
r = MulAdd(s2_3, M4(-4.682e-02, -1.357e-01, 3.481e-02, -2.187e-01, 1.113e-01, 8.812e-02, -1.211e-01, -2.011e-02, 1.567e-01, -2.216e-02, -4.920e-03, -2.458e-01, 2.263e-02, 6.741e-02, -1.234e-02, 2.338e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(5.105e-02, -3.845e-01, 1.812e-01, -1.927e-01, 2.840e-01, -2.094e-01, 5.673e-02, 4.405e-02, 5.957e-01, 1.734e-02, -1.158e-01, -6.956e-01, -2.077e-01, 5.130e-03, 4.744e-01, -1.540e-02), r);
|
||||||
|
r = MulAdd(s2_5, M4(1.601e-01, -2.680e-01, -1.678e-01, -1.207e-01, -4.648e-02, -6.454e-02, 1.122e-01, -6.567e-02, 1.638e-01, -1.259e-01, -2.470e-02, -3.547e-01, -1.333e-01, -1.219e-02, -7.710e-02, -3.881e-01), r);
|
||||||
|
r = MulAdd(s2_6, M4(-6.060e-02, 1.662e-01, -2.082e-01, 3.193e-01, -1.317e-01, 1.395e-04, 2.436e-01, -1.480e-01, 6.104e-03, -2.009e-01, -6.729e-02, -2.207e-01, -7.784e-02, -7.589e-02, 7.569e-02, 3.261e-03), r);
|
||||||
|
r = MulAdd(s2_7, M4(-2.951e-01, -2.050e-01, 2.827e-02, 3.739e-01, 1.947e-01, 5.411e-01, -2.262e-01, -8.808e-03, 2.262e-01, -9.010e-02, -1.476e-01, -3.582e-01, -1.718e-01, 2.844e-02, 7.832e-02, 1.414e-03), r);
|
||||||
|
r = MulAdd(s2_8, M4(3.534e-01, 1.695e-01, -1.247e-01, 4.750e-01, 4.171e-02, 2.338e-02, -4.525e-02, -4.955e-02, 2.934e-01, -3.865e-02, -1.125e-01, -2.127e-01, 1.326e-01, 5.967e-02, 6.215e-02, 1.048e-01), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(4.186e-02, -5.378e-02, 7.641e-02, -3.524e-02, -2.447e-01, -5.374e-02, -1.380e-01, -4.221e-01, -3.797e-02, -7.623e-03, -4.826e-02, 1.791e-02, -1.390e-01, 1.115e-01, 2.252e-01, -9.103e-03), r);
|
||||||
|
r = MulAdd(s3_1, M4(1.339e-01, 3.093e-01, -3.615e-02, 8.684e-02, -4.098e-01, -1.216e-01, 2.372e-01, -1.247e-01, -5.358e-02, -1.660e-01, -8.435e-02, 3.871e-02, 2.722e-01, -1.145e-01, -3.944e-01, -5.003e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(-4.430e-02, -3.135e-02, 1.019e-01, -1.129e-01, -2.647e-01, -1.317e-01, 8.715e-02, -5.466e-02, -3.946e-02, 7.216e-02, 1.677e-01, 9.349e-02, 8.069e-02, -1.097e-01, -9.659e-03, -8.460e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(-5.036e-03, 4.992e-02, 1.086e-01, -1.339e-02, 2.792e-01, 3.294e-01, -1.578e-01, 4.592e-01, -7.749e-02, 4.384e-02, -4.212e-02, 2.287e-02, 1.456e-01, 4.774e-02, -1.264e-01, 7.437e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(3.022e-01, -2.197e-01, -4.347e-02, -2.198e-01, 3.922e-02, 8.609e-02, 8.862e-02, 3.418e-01, 8.117e-02, -2.026e-02, -3.236e-01, -2.539e-01, -6.030e-02, -2.409e-01, 7.879e-02, -8.457e-02), r);
|
||||||
|
r = MulAdd(s3_5, M4(3.525e-01, 2.622e-01, -4.994e-02, -1.932e-01, -1.508e-01, 1.229e-01, 1.359e-01, 1.613e-01, 1.830e-01, -4.473e-02, -5.438e-02, -1.041e-01, 4.534e-01, -4.660e-01, -7.405e-02, -1.001e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(-1.224e-02, -5.840e-03, 8.031e-02, -2.279e-02, -2.128e-01, 1.477e-01, -9.937e-03, 4.142e-02, -3.726e-02, -1.013e-01, -2.940e-03, -1.333e-01, 1.353e-01, -2.192e-01, -3.858e-01, -1.100e-01), r);
|
||||||
|
r = MulAdd(s3_7, M4(-8.882e-02, 1.341e-01, 2.707e-01, 2.212e-01, 2.628e-01, 3.454e-01, -3.703e-01, 4.902e-01, 1.527e-01, 8.567e-03, -1.742e-01, -1.884e-01, -7.710e-01, 1.028e-01, 3.233e-01, -3.897e-01), r);
|
||||||
|
r = MulAdd(s3_8, M4(3.715e-02, 2.936e-01, -1.195e-01, -1.295e-01, -1.313e-01, -1.222e-01, -2.876e-01, 5.694e-02, 6.813e-02, -1.738e-02, -1.154e-01, 1.649e-02, 1.755e-01, -1.639e-01, 3.212e-02, 3.504e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass3 (conv2): reads nine taps from each of t2 and t3 through the l0/l1
// macros and writes one texel of t0 and one texel of t1 for the thread's pixel.
//   blockStart - added to the per-thread offset to obtain the pixel coordinate.
//   tid        - thread id; only tid.x is read.
// Each tap is split into max(x, 0) and -max(-x, 0), giving the 36 vectors that
// f0 and f1 consume.
// NOTE(review): pt, gxy, size and pos keep their original names because the
// l0/l1 macros (via O, defined outside this chunk) presumably capture some of
// them from the enclosing scope — confirm before renaming.
void Pass3(uint2 blockStart, uint3 tid) {
    float2 pt = float2(GetInputPt());
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();

    // Threads that fall outside the input do nothing.
    const bool insideInput = gxy.x < size.x && gxy.y < size.y;
    if (!insideInput) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // First input (l0 -> t2): fetch each tap, keep its negative part in s1_k,
    // then clamp s0_k to its positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);

    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);

    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);

    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);

    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);

    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);

    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // Second input (l1 -> t3): same split, into s2_k (positive) and s3_k (negative).
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);

    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);

    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);

    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);

    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);

    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);

    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);

    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);

    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Evaluate both output vectors, then write them to the pass outputs.
    const V4 firstOut = f0(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    const V4 secondOut = f1(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

    t0[gxy] = firstOut;
    t1[gxy] = secondOut;
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0, t1
|
||||||
|
//!OUT t2, t3
|
||||||
|
|
||||||
|
// Tap helpers for pass 4: l0 reads from t0 and l1 reads from t1 (the two
// textures listed in this pass's //!IN directive), each converting the result to V4.
// NOTE(review): O is defined outside this chunk; presumably it fetches the
// texture at an offset of (x, y) texels from the current pixel — confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 (pass 4, conv3): returns a V4 built from a constant starting vector r and
// 36 MulAdd terms applied in a fixed order, one per input vector, each paired
// with its own 16-value M4 constant.
// NOTE(review): the caller (Pass4) is not visible in this chunk; presumably the
// s0_k..s3_k inputs follow the same convention as the earlier passes (positive
// and negative parts of nine l0 taps and nine l1 taps) and the result goes to
// the first //!OUT texture (t2) — confirm against Pass4.
// NOTE(review): MulAdd and M4 are defined outside this chunk; presumably
// MulAdd(v, m, r) yields v * m + r — confirm against their definitions.
// NOTE(review): the constants look like trained weights — presumably generated;
// do not reorder the statements, since floating-point accumulation order matters.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Initial value of the accumulator.
V4 r = { -1.057e-02, -1.114e-02, 1.597e-04, 1.132e-02 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(7.356e-02, 8.402e-03, 1.287e-01, 6.762e-02, 2.134e-01, -6.620e-02, -2.788e-01, -5.744e-02, -3.896e-02, -3.993e-02, -7.161e-02, -1.982e-01, -6.734e-02, 8.804e-03, -4.739e-02, 6.502e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(2.249e-01, 4.958e-02, 1.138e-01, 3.152e-01, 2.008e-01, 1.703e-01, 5.817e-02, -9.482e-02, -2.371e-01, 3.975e-02, -1.755e-01, -2.666e-01, 2.819e-01, -2.640e-02, 1.405e-01, -6.009e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(2.065e-01, -3.027e-02, -3.447e-02, 3.226e-03, -1.252e-02, -7.589e-03, 2.344e-03, -1.704e-02, -8.894e-02, 3.136e-02, -1.517e-01, -2.176e-02, 8.920e-02, -5.322e-02, -9.529e-02, 8.355e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.136e-01, 1.015e-01, -2.730e-02, -2.144e-01, -9.526e-02, -2.857e-01, 2.711e-01, -1.991e-01, 2.596e-01, 1.602e-01, -2.169e-01, -1.097e-01, 3.353e-02, 6.231e-02, 8.753e-03, 3.707e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(-1.945e-01, 3.081e-01, -2.270e-01, -5.963e-02, -1.666e-01, -3.408e-01, 1.161e-01, -6.384e-02, -6.823e-01, -4.014e-01, -6.276e-01, -1.672e-01, 2.986e-03, -1.351e-01, 1.668e-01, -3.133e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(4.802e-02, -4.275e-02, 1.978e-03, -7.602e-02, -4.082e-03, 5.572e-02, -3.341e-02, 9.101e-03, -1.038e-01, 1.622e-01, 2.334e-02, 1.768e-01, 9.416e-03, -2.287e-01, 1.048e-01, -2.926e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(5.333e-04, 3.089e-02, 2.721e-02, -3.601e-02, -5.081e-02, -1.152e-01, 6.752e-02, 1.701e-01, -2.951e-02, 2.450e-01, -1.684e-01, -4.702e-02, -1.580e-02, 1.200e-02, -1.266e-02, 4.937e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.351e-02, -6.248e-02, -3.060e-03, 4.140e-02, -2.090e-01, -6.831e-01, -8.857e-02, 2.536e-01, -2.333e-02, 1.521e-01, -8.033e-02, 2.124e-01, -6.615e-02, 1.317e-01, 1.847e-01, -2.150e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(4.605e-02, 1.013e-01, 6.834e-03, -6.411e-02, -1.476e-02, -2.845e-01, -4.312e-02, -1.171e-02, 6.985e-02, -6.859e-02, -2.785e-02, -3.226e-02, 5.186e-02, 1.102e-01, -2.071e-02, -1.250e-01), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(1.952e-01, -3.342e-02, -3.770e-02, -2.026e-01, 4.850e-02, -3.174e-02, -1.987e-01, -2.886e-02, -1.298e-01, 1.994e-02, 1.131e-01, 2.950e-02, -1.791e-02, -4.533e-02, 4.695e-02, -6.907e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.401e-01, 1.809e-01, -5.151e-02, -6.271e-02, -1.409e-01, 9.215e-03, 1.176e-01, 2.717e-02, 1.130e-01, -3.228e-02, -9.086e-02, -1.202e-03, 1.642e-03, -7.943e-03, 1.097e-01, 1.842e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(8.774e-02, -1.486e-02, -4.808e-02, 4.089e-02, 6.244e-02, -7.645e-02, 5.614e-02, -5.706e-02, -2.386e-02, 4.407e-02, -1.378e-01, -5.880e-02, 2.936e-02, 2.285e-02, -3.924e-02, 5.724e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(2.603e-01, -1.455e-01, 1.429e-01, -2.992e-02, -6.288e-02, -5.216e-02, -1.802e-01, 1.060e-01, -2.473e-02, -6.795e-03, 2.843e-02, 7.745e-02, -4.868e-03, -9.998e-02, -7.961e-02, 5.068e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.018e-01, -1.293e-01, -5.291e-02, -4.763e-02, 3.484e-02, -1.648e-01, 8.786e-02, -6.101e-02, -1.083e-01, 5.522e-02, -1.814e-01, -2.392e-01, 6.427e-02, -1.908e-02, 2.643e-01, 1.294e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-7.897e-02, -5.967e-02, -2.620e-01, 1.274e-02, -2.583e-02, 5.654e-02, -7.639e-02, -7.534e-03, -5.812e-02, -7.887e-02, -3.738e-03, 7.664e-02, 1.753e-02, -2.842e-01, -3.237e-01, 2.077e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(6.558e-02, -9.890e-02, 1.849e-02, 3.242e-04, 1.021e-02, 1.234e-01, 1.224e-02, -4.322e-02, -2.778e-02, 3.860e-02, -5.257e-02, -1.466e-02, -1.001e-02, -1.291e-03, 1.724e-01, -9.167e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-5.291e-02, -2.764e-01, -6.402e-02, 4.327e-02, 1.921e-02, -1.484e-01, 3.286e-02, 4.051e-02, 1.636e-02, 3.932e-01, -5.432e-02, 4.540e-02, 3.947e-02, -1.385e-01, -1.065e-01, 1.569e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.729e-02, 8.177e-02, -4.479e-02, -1.275e-01, -3.302e-03, -1.265e-01, -2.922e-02, 3.720e-02, 1.560e-02, 5.266e-02, -1.572e-02, -4.840e-02, 3.991e-03, 1.003e-01, -1.423e-01, 7.414e-02), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(-1.207e-02, -2.418e-02, -7.769e-03, -1.401e-01, 1.660e-01, -6.347e-03, -1.092e-02, -1.830e-02, -1.252e-01, -5.217e-02, 9.898e-03, 1.461e-02, 2.654e-02, 1.219e-02, -3.769e-02, 1.897e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(-3.650e-02, 1.317e-01, 1.299e-02, -5.512e-02, -1.287e-01, 2.438e-02, -1.609e-03, 1.759e-01, 1.824e-02, 6.477e-03, 2.905e-02, -8.644e-02, 7.496e-02, -9.920e-02, 1.147e-02, 1.889e-01), r);
|
||||||
|
r = MulAdd(s2_2, M4(-7.005e-03, -4.482e-02, -1.853e-02, 3.441e-02, 1.251e-01, -3.162e-02, -1.701e-01, -5.231e-02, -1.647e-01, 2.261e-02, 8.255e-02, -3.730e-02, 1.811e-01, -9.052e-02, 1.728e-02, 1.911e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(2.359e-02, -1.334e-01, 2.761e-02, -1.251e-01, 1.455e-01, 4.076e-02, -3.260e-02, -1.782e-01, -3.575e-02, 1.411e-02, 1.322e-01, -9.592e-02, 5.423e-02, 7.989e-03, -1.460e-01, 8.895e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(1.304e-01, 1.296e-01, -7.250e-02, -6.647e-02, 8.382e-02, 1.111e-01, 8.976e-02, -5.914e-02, -2.228e-01, -4.772e-02, -1.931e-03, 8.499e-02, 4.483e-01, 1.327e-01, 5.086e-02, -4.795e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(4.674e-02, 7.104e-02, -5.312e-02, -7.730e-02, 2.647e-03, 8.893e-03, -8.889e-02, -5.714e-02, -4.546e-02, -4.002e-02, -1.514e-01, -2.989e-02, -8.669e-02, -5.441e-03, 1.460e-02, -2.327e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(1.146e-01, -1.154e-01, 8.289e-03, 7.655e-02, -2.194e-02, -3.908e-02, -2.191e-02, 2.363e-03, 4.527e-02, -7.852e-02, -4.728e-02, 1.066e-01, 4.023e-02, -5.192e-02, -4.180e-02, -3.879e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(2.446e-01, -2.295e-01, -5.819e-02, -2.646e-02, 8.106e-02, -8.799e-02, -3.455e-02, 6.900e-02, 5.579e-02, -1.551e-01, 1.609e-01, 9.954e-02, -1.499e-01, 8.628e-02, 1.114e-01, 1.313e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(1.028e-02, 9.150e-02, -6.161e-02, 5.124e-03, 3.822e-02, 1.533e-02, 2.329e-02, -1.106e-01, -1.541e-03, -1.818e-01, -9.577e-02, -3.402e-02, 1.784e-02, -1.152e-01, 6.896e-02, -1.111e-01), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(-7.349e-02, -4.782e-02, 3.080e-02, -1.668e-01, 9.572e-02, 5.307e-02, 5.573e-03, 6.483e-02, 1.104e-01, -5.707e-02, -8.579e-02, -1.754e-02, 1.038e-01, 1.706e-02, -1.185e-01, 5.863e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(-1.639e-01, -6.808e-03, 1.836e-02, -1.482e-01, 1.032e-01, 2.612e-02, -1.751e-01, -1.527e-01, 3.169e-03, 5.272e-02, 7.983e-02, 5.066e-02, 1.191e-01, 3.658e-02, 3.275e-02, -1.122e-01), r);
|
||||||
|
r = MulAdd(s3_2, M4(-8.279e-02, -1.068e-02, 3.848e-02, -8.857e-03, -3.783e-02, 9.934e-02, -7.181e-02, 2.801e-02, -1.524e-01, -7.166e-02, 1.038e-01, -9.840e-04, -7.254e-03, -3.252e-02, -1.435e-02, 6.052e-03), r);
|
||||||
|
r = MulAdd(s3_3, M4(-3.534e-02, -2.891e-02, 3.778e-01, -2.472e-01, -4.015e-02, -5.651e-02, 2.006e-01, 1.249e-02, -8.408e-02, -1.160e-02, 2.881e-01, -6.805e-03, 1.340e-02, -1.237e-01, -1.617e-01, 1.894e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(-1.512e-02, 3.232e-01, -1.441e-01, -3.778e-01, -1.475e-01, -2.644e-03, -3.149e-01, 3.225e-02, 1.227e-01, -3.620e-02, -1.175e-01, -3.857e-01, 4.834e-02, -1.567e-01, 1.632e-01, -1.292e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-1.592e-01, 3.426e-02, -1.506e-01, 1.215e-01, 1.314e-01, -7.432e-02, -8.767e-02, 1.685e-01, 6.875e-02, 2.804e-01, -3.279e-02, -1.870e-01, 1.049e-01, -9.061e-02, 8.573e-02, -9.407e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(5.310e-02, -1.089e-01, -1.496e-01, 2.134e-01, 5.599e-02, -1.565e-01, -6.842e-02, -1.362e-02, 6.861e-02, -2.548e-02, -1.614e-01, -3.698e-02, -2.731e-02, 1.138e-02, 1.288e-02, -1.789e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(-7.967e-02, -2.461e-01, -2.139e-01, 3.193e-01, 1.377e-01, -1.213e-01, 8.415e-02, 1.224e-02, 1.192e-01, 1.785e-01, 1.978e-01, 1.008e-01, 3.016e-02, 9.868e-02, 3.118e-03, -3.294e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(1.121e-01, -4.625e-02, 3.331e-02, -7.687e-02, 5.520e-02, 6.326e-02, 1.369e-02, 1.850e-02, 4.062e-02, -1.561e-01, -8.640e-02, 1.105e-01, 8.446e-03, -1.746e-03, 4.572e-02, -1.015e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
V4 r = { -4.712e-03, -1.187e-02, 1.287e-02, -6.625e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(2.399e-01, 1.190e-01, 9.941e-02, -5.908e-03, 2.176e-01, -3.861e-02, -4.997e-02, -3.036e-02, -6.079e-02, 2.294e-02, -1.260e-01, 6.001e-02, -7.690e-02, -4.805e-02, 6.117e-03, 4.358e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-4.669e-02, -1.150e-01, 9.700e-03, 2.351e-02, 3.215e-01, -1.737e-03, 2.091e-01, -1.245e-01, -8.592e-02, 1.866e-01, 2.826e-01, -6.728e-01, 1.528e-01, 5.511e-02, -4.930e-02, -1.959e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-2.182e-02, -4.512e-02, 6.864e-02, 8.299e-02, 8.483e-03, -4.855e-02, -1.500e-01, 1.325e-02, 6.098e-02, -1.867e-02, 1.276e-02, 1.721e-02, 5.918e-03, -1.130e-01, 7.066e-04, -1.824e-03), r);
|
||||||
|
r = MulAdd(s0_3, M4(4.594e-02, 1.518e-01, -2.067e-01, 1.546e-02, -1.548e-02, 1.126e-01, -4.502e-03, -2.014e-02, 2.417e-01, -1.530e-01, -1.095e-01, -4.966e-02, 2.291e-01, -4.598e-03, 2.836e-01, 5.562e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(5.432e-02, -3.003e-01, 7.389e-01, -1.497e-01, -2.439e-01, -3.298e-01, 4.081e-01, -2.105e-01, -4.267e-01, 3.913e-01, 5.470e-01, 5.594e-01, -1.221e-01, -5.444e-02, -4.180e-01, 1.515e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(2.205e-01, -5.813e-03, 7.451e-03, 8.130e-02, -3.312e-02, -9.387e-02, -9.824e-02, 4.493e-02, 8.187e-02, -2.042e-01, 1.644e-01, 1.562e-01, -8.427e-02, 2.057e-01, -1.668e-02, -2.356e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(1.150e-02, 1.442e-02, -1.973e-02, -4.599e-02, -9.680e-02, 3.962e-02, 1.731e-02, -2.402e-02, 3.936e-02, 6.512e-03, 2.103e-02, 2.025e-03, -1.308e-02, -5.259e-02, 5.631e-02, 3.037e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.306e-02, -3.164e-02, 1.196e-01, 2.798e-02, -2.533e-01, -1.204e-01, 1.860e-01, 1.564e-01, -4.731e-02, -7.323e-02, 1.441e-03, -9.049e-02, -3.371e-02, -2.801e-04, 2.952e-02, -2.632e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(3.024e-02, -1.034e-02, -7.595e-02, -7.550e-02, 3.562e-02, -4.589e-02, -3.066e-02, 7.995e-02, -1.866e-02, 1.022e-01, -2.624e-02, -1.074e-01, 2.176e-02, 1.434e-01, -5.664e-02, -3.473e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(2.252e-01, 9.801e-02, -5.786e-02, -6.661e-02, 7.599e-02, -9.244e-02, 4.437e-02, -1.203e-01, -1.577e-01, -3.797e-02, -1.335e-02, 4.540e-02, -3.540e-03, -9.094e-03, -4.076e-02, -8.099e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.557e-01, -2.549e-01, 2.306e-01, -4.389e-02, -3.677e-02, 5.796e-02, 4.505e-02, -1.209e-01, -4.484e-02, 1.229e-01, -5.686e-02, 2.778e-02, 9.876e-02, -6.893e-04, 9.771e-02, 1.264e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-5.324e-02, -9.632e-02, -1.092e-02, -1.426e-02, 3.082e-02, 9.196e-02, -1.381e-01, -1.013e-01, 7.758e-03, -3.290e-02, 1.630e-02, -4.979e-03, -7.297e-02, -7.534e-02, 2.040e-02, -1.983e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(1.951e-01, 3.566e-02, 4.220e-02, 8.086e-02, -5.114e-02, -5.626e-02, -6.912e-02, 1.462e-01, 2.268e-03, -2.592e-02, 3.527e-02, -3.832e-02, 4.756e-02, 1.234e-01, -5.494e-03, 4.695e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(4.147e-01, -2.431e-01, 2.372e-01, 2.574e-04, -4.485e-02, 5.014e-02, 3.928e-02, -2.817e-02, 3.512e-01, 2.983e-01, -1.260e-01, 4.326e-01, -2.366e-01, -6.912e-02, 2.259e-01, -4.534e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.323e-01, 5.260e-03, 2.693e-02, 1.841e-01, -1.105e-01, 6.002e-02, -1.233e-01, 1.012e-02, -9.410e-02, -1.260e-01, 1.264e-02, -3.910e-02, 3.656e-01, -1.103e-01, 5.059e-01, 4.280e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-7.537e-02, -2.153e-02, -4.511e-02, -5.184e-02, -1.745e-02, -1.165e-02, 1.352e-02, -1.951e-02, -4.888e-02, 2.249e-02, -3.915e-02, -4.557e-03, -9.946e-03, -1.633e-04, -3.200e-02, -1.356e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-1.509e-01, -2.227e-02, 1.640e-01, 2.693e-02, 4.846e-02, 3.303e-02, -5.390e-02, 3.607e-02, -2.818e-02, -7.170e-02, 3.311e-02, -9.203e-02, -1.946e-03, -8.577e-02, -2.925e-02, 1.238e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(-3.295e-02, 1.995e-02, -1.689e-01, -4.353e-02, -4.138e-02, -7.439e-03, -2.343e-02, 6.997e-02, 8.031e-02, 1.117e-01, 4.894e-02, -6.214e-02, -1.960e-01, -1.630e-01, 8.586e-02, -8.213e-02), r);
|
||||||
|
r = MulAdd(s2_0, M4(-9.883e-02, -1.168e-02, -1.110e-01, -2.148e-01, 1.452e-01, 3.417e-03, -4.513e-02, 8.845e-02, -7.791e-02, 2.326e-02, -4.188e-02, -3.659e-02, 3.105e-02, -1.318e-02, -4.552e-03, 7.109e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(1.958e-02, -6.995e-02, 2.588e-01, -6.431e-02, -2.211e-01, 5.281e-02, 5.399e-02, 8.884e-02, -5.135e-02, -4.768e-02, 1.363e-01, -2.064e-01, -1.391e-01, 1.106e-01, -2.611e-01, 2.038e-01), r);
|
||||||
|
r = MulAdd(s2_2, M4(-6.883e-02, -1.360e-03, -1.628e-01, 7.301e-02, 1.213e-01, -5.159e-03, 1.194e-01, -1.148e-02, -1.285e-01, -1.448e-01, 1.776e-02, -1.414e-01, -3.022e-02, 1.382e-01, 6.695e-02, -4.201e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(-1.194e-01, 1.524e-03, -1.945e-01, -1.496e-01, 1.413e-03, -8.697e-04, -1.542e-01, -1.798e-03, -4.991e-02, -7.944e-03, -1.094e-01, -5.578e-02, 1.526e-01, -6.170e-02, 1.598e-01, 1.306e-01), r);
|
||||||
|
r = MulAdd(s2_4, M4(3.583e-02, -1.213e-01, 2.087e-01, -4.616e-02, 2.125e-01, -1.242e-01, 2.776e-01, -8.100e-02, -1.733e-01, 1.016e-01, 2.949e-01, 1.489e-01, 5.059e-01, 3.526e-01, -4.764e-01, -1.105e-02), r);
|
||||||
|
r = MulAdd(s2_5, M4(7.240e-02, 1.034e-01, -1.103e-01, 2.351e-02, -2.711e-02, 1.506e-02, -1.534e-01, 1.093e-01, 5.065e-02, -2.686e-01, 1.423e-01, -4.993e-02, 7.167e-02, 1.084e-01, -8.139e-03, 4.460e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(1.243e-01, 1.281e-02, 7.048e-02, 1.117e-01, -1.145e-01, -1.703e-02, -1.470e-02, -3.647e-02, 3.796e-03, 2.441e-02, -8.422e-02, 1.955e-02, -2.861e-02, -6.963e-02, 6.894e-02, -4.071e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(2.315e-01, 7.446e-02, -7.632e-02, 1.319e-01, -2.392e-02, 2.525e-02, 4.687e-02, 7.645e-02, 4.250e-02, -4.733e-02, 2.179e-01, -3.843e-02, -3.526e-01, 9.675e-02, -1.837e-01, -1.563e-01), r);
|
||||||
|
r = MulAdd(s2_8, M4(5.933e-02, 1.490e-01, -5.844e-02, 9.363e-02, 7.616e-04, -1.075e-02, -1.365e-01, -6.094e-02, 7.094e-03, -1.218e-01, 7.021e-02, 3.101e-02, -4.184e-02, 3.989e-02, -7.167e-02, -1.179e-01), r);
|
||||||
|
r = MulAdd(s3_0, M4(-7.835e-02, 6.392e-02, -5.802e-02, -1.483e-01, 1.374e-01, 3.699e-02, 2.043e-03, 1.554e-01, -6.873e-02, -1.174e-02, -1.518e-01, -1.405e-02, 4.783e-03, -1.131e-01, 4.121e-02, -8.849e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(-1.463e-01, 5.240e-02, -1.651e-02, -2.410e-01, 1.092e-01, -3.146e-02, -1.629e-02, -2.974e-02, -7.838e-02, -7.374e-03, 2.745e-01, -1.408e-01, 1.335e-01, 8.634e-02, 1.073e-02, -1.407e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(-7.340e-02, 2.321e-02, 1.922e-02, -1.112e-01, 2.932e-02, -2.587e-02, 1.333e-01, 4.721e-02, -1.514e-01, -3.395e-02, -1.264e-01, 1.777e-02, -8.692e-02, 1.186e-02, -7.424e-02, -2.402e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(5.052e-02, 2.790e-03, 3.121e-02, -1.839e-01, 3.910e-02, 2.279e-02, 6.041e-02, -8.205e-03, -5.819e-02, 5.701e-04, 5.763e-02, -1.835e-02, -7.273e-02, -1.017e-01, -4.708e-02, 3.331e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(-4.521e-02, -3.700e-02, -1.199e-01, -3.863e-01, -4.641e-01, -2.451e-01, 1.512e-03, -3.424e-01, -1.194e-01, 1.119e-01, -1.183e-01, 1.918e-01, 8.865e-02, 1.866e-01, -4.503e-02, 2.355e-03), r);
|
||||||
|
r = MulAdd(s3_5, M4(5.461e-02, -1.461e-01, 2.827e-01, 2.041e-01, -8.786e-03, 1.079e-02, 1.593e-01, 2.173e-01, 4.916e-01, -1.773e-01, 2.149e-02, -1.461e-01, -5.435e-02, 1.909e-01, -2.171e-01, -7.547e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(2.543e-02, 5.455e-02, -6.107e-02, 5.194e-03, 9.984e-02, 8.664e-02, -7.757e-04, 3.957e-02, 1.432e-01, 3.805e-02, -1.005e-03, 7.600e-02, -4.304e-02, -6.326e-02, 3.996e-02, 3.872e-03), r);
|
||||||
|
r = MulAdd(s3_7, M4(-1.234e-01, -1.276e-01, 1.312e-01, 8.454e-02, 1.539e-01, 6.822e-02, -1.455e-02, 1.223e-01, -1.060e-01, -3.708e-02, -1.480e-01, -7.922e-02, 6.503e-02, 1.105e-01, -1.249e-01, -3.210e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(1.676e-01, -7.072e-03, 4.581e-02, -1.006e-01, -1.056e-02, -8.209e-02, -4.804e-02, 2.427e-02, -1.165e-01, -8.224e-02, 2.940e-01, -9.220e-03, 5.420e-02, 1.802e-01, -1.190e-01, 1.433e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Pass4(uint2 blockStart, uint3 tid) {
|
||||||
|
float2 pt = float2(GetInputPt());
|
||||||
|
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
V4 s2_0 = l1(-1.0, -1.0);
|
||||||
|
V4 s2_1 = l1(0.0, -1.0);
|
||||||
|
V4 s2_2 = l1(1.0, -1.0);
|
||||||
|
V4 s2_3 = l1(-1.0, 0.0);
|
||||||
|
V4 s2_4 = l1(0.0, 0.0);
|
||||||
|
V4 s2_5 = l1(1.0, 0.0);
|
||||||
|
V4 s2_6 = l1(-1.0, 1.0);
|
||||||
|
V4 s2_7 = l1(0.0, 1.0);
|
||||||
|
V4 s2_8 = l1(1.0, 1.0);
|
||||||
|
V4 s3_0 = -max(-s2_0, 0.0);
|
||||||
|
V4 s3_1 = -max(-s2_1, 0.0);
|
||||||
|
V4 s3_2 = -max(-s2_2, 0.0);
|
||||||
|
V4 s3_3 = -max(-s2_3, 0.0);
|
||||||
|
V4 s3_4 = -max(-s2_4, 0.0);
|
||||||
|
V4 s3_5 = -max(-s2_5, 0.0);
|
||||||
|
V4 s3_6 = -max(-s2_6, 0.0);
|
||||||
|
V4 s3_7 = -max(-s2_7, 0.0);
|
||||||
|
V4 s3_8 = -max(-s2_8, 0.0);
|
||||||
|
s2_0 = max(s2_0, 0.0);
|
||||||
|
s2_1 = max(s2_1, 0.0);
|
||||||
|
s2_2 = max(s2_2, 0.0);
|
||||||
|
s2_3 = max(s2_3, 0.0);
|
||||||
|
s2_4 = max(s2_4, 0.0);
|
||||||
|
s2_5 = max(s2_5, 0.0);
|
||||||
|
s2_6 = max(s2_6, 0.0);
|
||||||
|
s2_7 = max(s2_7, 0.0);
|
||||||
|
s2_8 = max(s2_8, 0.0);
|
||||||
|
|
||||||
|
t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t2, t3
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||||
|
|
||||||
|
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
V4 r = { -5.270e-03, 1.390e-02, 8.622e-03, 1.255e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(1.060e-02, 9.173e-03, 9.548e-04, -7.886e-02, -1.324e-02, 4.660e-02, -4.997e-02, -5.676e-02, -3.290e-02, 6.253e-02, -5.777e-02, 1.265e-02, 6.136e-03, 7.179e-02, 3.102e-02, 4.961e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(5.787e-03, -2.090e-03, -1.489e-01, 4.380e-02, 1.259e-01, 5.508e-01, 1.211e-01, 3.385e-01, 2.399e-02, -1.436e-01, 2.987e-03, -2.839e-02, -3.021e-02, -8.641e-03, 1.716e-01, -1.328e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-3.284e-02, -5.196e-02, -2.983e-02, -2.858e-02, 1.729e-02, 7.665e-02, 1.387e-01, 1.037e-01, 4.289e-02, 1.274e-01, 3.348e-02, 1.911e-02, -1.786e-02, -4.888e-02, 6.323e-02, -2.989e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(2.473e-02, -6.550e-02, -1.373e-01, 3.680e-02, 1.575e-01, -8.270e-02, 3.186e-02, -3.836e-02, 4.508e-02, 4.254e-02, 5.656e-03, -9.132e-02, 1.334e-01, -5.076e-02, -2.445e-02, -4.735e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(-5.346e-01, 1.950e-01, 2.121e-01, -3.694e-01, 5.004e-02, 1.610e-02, 2.249e-01, -5.962e-02, -6.243e-02, -3.270e-01, 1.851e-01, 4.051e-02, -2.310e-01, -2.300e-01, -1.314e-01, 3.374e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-4.686e-02, -3.968e-01, 2.772e-02, 2.495e-02, 4.541e-02, 8.724e-02, 4.401e-02, -1.515e-02, -6.453e-02, -7.210e-02, -1.250e-02, 4.044e-02, 3.057e-02, 2.485e-01, 2.228e-02, 6.774e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-1.518e-01, -6.862e-02, -8.148e-02, -2.030e-01, -4.453e-02, -2.133e-03, -6.081e-02, -8.941e-02, -5.417e-02, 1.564e-02, -5.425e-02, 5.875e-02, -8.805e-02, -1.910e-02, 2.099e-02, -1.402e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.730e-02, -6.152e-02, -2.764e-01, -8.728e-02, 9.519e-03, -2.799e-02, -5.662e-02, 3.249e-02, 8.716e-02, 2.809e-02, -7.241e-02, 3.046e-02, 1.368e-01, 2.723e-02, 1.130e-01, -4.615e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-5.021e-02, -3.352e-02, 5.072e-02, -1.434e-02, 6.511e-02, 6.519e-02, -8.987e-02, 2.193e-02, 1.583e-04, 2.714e-02, -2.315e-02, -3.077e-02, 7.792e-03, 2.782e-02, 9.282e-02, 5.011e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-2.541e-02, -9.530e-03, -2.089e-01, -2.421e-02, 1.340e-02, 1.228e-01, 8.861e-02, -1.063e-02, -7.461e-02, 5.226e-02, -7.276e-02, 3.544e-02, -1.591e-02, 1.851e-02, 9.562e-03, 4.559e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.747e-02, -7.982e-02, -1.475e-01, 4.885e-02, -1.175e-02, -9.209e-02, -9.273e-02, -7.428e-02, 3.696e-02, -2.012e-01, 4.627e-02, 3.609e-02, 1.096e-01, -5.087e-02, 2.170e-01, 5.311e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(2.410e-02, 6.970e-02, 2.315e-02, 2.908e-02, 2.961e-05, 1.661e-02, 8.374e-02, 5.064e-02, 2.637e-02, 1.330e-01, 5.175e-02, -5.518e-02, -4.871e-03, 1.162e-01, 8.451e-02, 1.741e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(4.863e-02, -7.095e-02, 3.927e-03, -9.085e-02, 2.639e-02, -8.297e-02, -1.865e-01, -9.647e-02, 6.967e-02, 1.376e-02, 1.222e-01, -2.819e-01, 1.563e-01, -1.399e-02, -4.367e-02, -5.187e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(9.322e-02, 9.848e-02, 1.680e-01, -2.298e-01, -6.183e-02, -4.167e-02, -1.103e-02, -9.856e-03, -2.983e-03, -3.805e-01, -3.115e-01, -4.107e-01, -1.341e-01, -3.703e-01, -3.661e-01, -4.633e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-2.785e-03, -2.188e-02, -2.790e-03, -4.276e-04, 7.082e-02, 1.004e-01, -3.532e-03, 1.740e-03, 6.693e-03, -5.230e-01, 2.119e-01, 2.878e-02, 3.915e-03, 1.842e-01, -1.630e-02, -3.874e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(2.313e-02, -6.545e-02, 1.631e-02, -1.278e-01, -4.216e-02, -4.147e-02, 6.827e-02, -1.725e-02, -5.254e-02, -3.942e-02, -2.400e-02, -8.124e-02, -3.250e-02, -1.806e-03, -3.947e-02, -7.056e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(8.445e-03, 1.147e-01, -7.772e-02, 1.091e-01, 1.842e-02, -6.040e-03, -7.053e-02, 1.824e-02, 2.212e-01, -8.777e-02, -1.003e-01, 6.533e-03, 2.090e-01, 4.588e-02, 9.886e-02, 6.176e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(4.046e-02, 1.872e-02, -5.723e-02, -4.997e-02, 5.232e-03, 1.795e-02, -2.747e-02, -1.507e-02, -1.704e-01, 7.849e-02, -1.475e-01, -4.255e-02, 7.807e-02, 4.185e-02, 3.849e-02, 3.137e-02), r);
|
||||||
|
r = MulAdd(s2_0, M4(-8.062e-03, 6.677e-02, 6.217e-02, 1.833e-01, -1.475e-01, 2.782e-01, 3.524e-02, -6.275e-02, 4.315e-02, 1.484e-02, 3.820e-02, -3.304e-02, 1.659e-03, -9.567e-03, -3.360e-02, -2.623e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(9.928e-02, -2.526e-01, -2.613e-02, 2.043e-01, 1.710e-02, -1.137e-01, 1.798e-01, -1.427e-01, 4.676e-03, 1.728e-01, 8.082e-02, -5.413e-02, -1.710e-02, -3.169e-02, -6.860e-02, 1.496e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(1.785e-02, 1.092e-01, -7.685e-02, 7.691e-02, 5.271e-03, -5.168e-02, 3.395e-02, 1.726e-02, 2.936e-02, -1.321e-02, 5.364e-02, -6.785e-03, 2.429e-02, -4.442e-02, -6.348e-02, 3.035e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(2.676e-01, 4.022e-03, -5.435e-02, -2.723e-01, -1.412e-01, -6.091e-01, 1.576e-02, 6.829e-02, -1.410e-01, 5.578e-03, 3.833e-03, 1.863e-01, -2.274e-02, 6.034e-03, 1.518e-01, -5.434e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(7.884e-02, 5.377e-01, -4.655e-02, -3.752e-01, 1.490e-01, -4.235e-02, -5.390e-02, 2.610e-01, 1.979e-01, -5.718e-02, 1.773e-02, 5.727e-02, 1.703e-02, 7.533e-01, -3.023e-02, 5.456e-02), r);
|
||||||
|
r = MulAdd(s2_5, M4(-4.898e-02, 4.237e-02, 6.311e-02, -4.635e-02, 3.660e-03, 2.139e-01, -3.722e-02, -6.738e-02, -3.009e-02, -6.140e-02, 2.777e-02, 3.917e-02, -1.421e-01, -4.041e-01, -1.524e-01, -9.837e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(6.071e-02, 1.084e-01, -6.370e-02, 1.323e-01, -7.251e-02, -1.079e-01, 1.208e-01, -4.495e-02, -2.115e-03, -4.107e-02, 2.465e-02, -1.230e-01, -6.064e-02, -4.263e-02, -1.388e-01, 6.519e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(9.042e-02, -8.032e-02, 1.186e-01, -1.537e-02, -6.566e-03, -3.216e-02, 3.412e-02, -3.207e-02, -1.586e-01, -2.988e-03, -2.358e-03, 2.172e-02, 6.775e-02, -3.590e-01, -4.123e-01, -3.506e-01), r);
|
||||||
|
r = MulAdd(s2_8, M4(8.486e-02, 4.731e-02, 5.779e-02, 1.000e-01, 9.121e-03, -3.421e-02, 4.891e-02, 4.916e-02, 3.343e-03, 4.437e-03, -2.002e-02, -3.856e-02, -1.319e-01, -4.022e-02, -1.752e-01, -9.250e-02), r);
|
||||||
|
r = MulAdd(s3_0, M4(-6.652e-03, 6.416e-02, 9.292e-03, 6.520e-02, 1.213e-02, 4.177e-02, 7.038e-02, -3.160e-02, 2.146e-02, -9.523e-02, -1.436e-01, -8.325e-02, -1.234e-02, -1.222e-02, -3.877e-02, -4.175e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(-5.171e-02, 1.011e-01, 7.998e-02, -8.804e-02, 1.067e-02, 1.516e-01, 6.508e-02, -7.724e-02, 2.717e-02, -4.901e-02, -6.059e-03, 4.013e-02, -3.833e-02, 1.538e-01, 5.948e-02, -4.945e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(1.023e-02, -1.230e-01, -1.861e-02, -2.570e-02, 2.512e-02, -4.630e-02, 6.354e-02, 3.897e-02, -2.146e-02, 2.446e-01, 1.906e-03, -9.068e-03, 1.754e-02, -7.082e-02, 1.107e-04, -9.604e-03), r);
|
||||||
|
r = MulAdd(s3_3, M4(7.380e-02, 2.216e-02, -2.608e-02, -6.491e-02, 4.018e-02, -6.657e-02, 1.116e-01, 9.405e-02, -7.168e-02, -3.646e-01, 8.387e-02, 1.352e-02, -4.589e-02, 2.235e-02, 1.881e-01, 1.759e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(-7.735e-02, -8.574e-02, -6.380e-02, 1.221e-01, 5.556e-02, -1.281e-01, 1.461e-01, 2.757e-01, 8.144e-01, -1.075e-01, 3.165e-03, -2.036e-01, 1.814e-01, 1.744e-01, -1.745e-01, 3.724e-02), r);
|
||||||
|
r = MulAdd(s3_5, M4(-6.864e-02, 1.273e-02, 7.502e-02, 4.164e-02, 1.301e-02, 1.407e-01, -9.985e-02, -8.079e-02, 1.428e-01, 3.034e-01, -1.564e-02, 6.091e-02, -1.271e-02, -2.153e-01, -7.843e-02, -4.063e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(-5.115e-02, 6.016e-02, -2.719e-02, 4.668e-02, -3.214e-02, -2.274e-02, -6.954e-03, -9.099e-03, 4.861e-02, 1.007e-01, -2.150e-01, -1.607e-01, -3.578e-02, 1.230e-02, -5.095e-02, 1.622e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(9.498e-02, -6.763e-02, 1.451e-01, 3.408e-03, -3.253e-02, 1.145e-01, 8.122e-03, -9.192e-02, 5.071e-02, -6.317e-02, 1.097e-01, 5.913e-02, 8.494e-02, 2.731e-04, -3.736e-01, -6.110e-03), r);
|
||||||
|
r = MulAdd(s3_8, M4(1.881e-02, 1.750e-02, 5.956e-02, 4.179e-02, -4.554e-02, -9.824e-02, 8.917e-03, 3.348e-02, 4.160e-02, 6.525e-02, 1.484e-02, -2.331e-02, -8.092e-02, -2.834e-02, -1.284e-01, -7.521e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
V4 r = { -7.279e-03, 1.016e-02, -7.400e-03, 4.979e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(-4.868e-02, -7.333e-02, -1.029e-02, -7.011e-04, 2.404e-02, -9.301e-02, 1.457e-01, 2.242e-02, 6.850e-02, -1.328e-03, -2.557e-02, -4.854e-04, 1.071e-01, 3.788e-04, 1.408e-01, 5.354e-03), r);
|
||||||
|
r = MulAdd(s0_1, M4(7.892e-03, 5.832e-02, -1.077e-01, -6.140e-02, -1.003e-02, -4.887e-01, 8.263e-01, 2.416e-01, -3.434e-02, 1.089e-02, -1.984e-02, -3.615e-02, -5.692e-03, 1.615e-02, -5.680e-02, 2.041e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(2.136e-02, -2.731e-02, -1.742e-02, 2.592e-02, -4.319e-02, 6.426e-03, 2.110e-02, -6.338e-02, -6.921e-03, -1.288e-03, 4.579e-02, -2.155e-03, 3.041e-02, 1.946e-02, 1.238e-02, 6.906e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.058e-02, 3.187e-02, -1.057e-01, 2.407e-01, -3.813e-02, -2.640e-02, 3.941e-02, 1.362e-01, 2.406e-02, 1.518e-02, -4.224e-02, 3.455e-02, 5.443e-02, -6.617e-02, -8.858e-02, -1.949e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.205e-01, -6.490e-01, -3.962e-01, -1.142e-01, -3.091e-02, 4.755e-01, -2.822e-01, -1.328e-01, -5.487e-01, 5.932e-02, -2.439e-02, -1.689e-01, -3.681e-02, -8.227e-02, 3.967e-02, -8.989e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-5.668e-02, 3.658e-02, 1.227e-02, 8.117e-02, 1.161e-01, 9.350e-02, 9.971e-02, -1.220e-01, 7.876e-02, 5.186e-02, -4.261e-02, 1.436e-01, -2.114e-02, 6.113e-02, 2.251e-02, 2.534e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-5.365e-02, -2.678e-02, -2.565e-02, 7.923e-02, -2.138e-02, -4.932e-02, -6.107e-03, 1.685e-02, 5.425e-02, -1.012e-02, -9.037e-03, 8.218e-04, -1.210e-02, 5.623e-02, -2.094e-02, -2.325e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(2.031e-02, -3.187e-02, 8.229e-02, 1.457e-01, 1.044e-01, -4.475e-02, 2.858e-02, -7.345e-02, -3.919e-02, -5.753e-02, 1.684e-02, -1.669e-01, 9.680e-03, 1.254e-01, 2.022e-03, -9.900e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.164e-02, 5.171e-02, -5.704e-02, -1.643e-01, 2.554e-02, -9.988e-02, 3.699e-02, -3.752e-02, -8.076e-04, -2.527e-02, -2.081e-02, 3.110e-02, 1.484e-03, 4.064e-02, 2.481e-02, 2.225e-01), r);
|
||||||
|
r = MulAdd(s1_0, M4(4.384e-02, -1.401e-01, -4.071e-02, -1.137e-02, -4.979e-03, 6.159e-02, 1.275e-01, 6.544e-02, 1.288e-01, -4.421e-02, -4.471e-02, 2.682e-02, 5.621e-02, -4.062e-02, 1.034e-01, 6.606e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.799e-02, -1.333e-01, 1.521e-01, -5.025e-02, -1.895e-01, -7.913e-02, -2.321e-01, -6.526e-02, -1.330e-02, 1.499e-02, 1.620e-01, 6.936e-02, -6.816e-02, 1.353e-01, 1.107e-01, 5.514e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(5.826e-03, -5.941e-03, -2.338e-02, -2.826e-02, 9.265e-02, 3.608e-02, 1.114e-01, 1.274e-01, -1.291e-01, 1.284e-02, -7.540e-02, -3.458e-02, -1.006e-02, 2.083e-02, -8.393e-02, 8.186e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-3.893e-02, -1.137e-02, 1.243e-01, 1.118e-01, 7.397e-02, -1.316e-01, -1.303e-01, -7.808e-05, 1.468e-02, -4.172e-03, -5.014e-02, -2.610e-02, 8.366e-02, -2.755e-02, 1.646e-04, -2.938e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-1.114e-01, -2.017e-01, 2.898e-03, -7.984e-02, -8.403e-02, 2.626e-02, 1.563e-01, -1.397e-02, 2.724e-02, -6.698e-01, 2.358e-01, -6.466e-01, -9.650e-03, -6.742e-01, 1.411e-01, -3.343e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(7.380e-02, 6.420e-02, 7.990e-02, 6.014e-02, 5.950e-02, 6.212e-02, -7.881e-02, -2.782e-02, 1.087e-01, -3.347e-02, 3.819e-01, 1.988e-01, 5.813e-02, 2.239e-02, 3.012e-01, 1.275e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-9.473e-02, -2.417e-02, -2.870e-02, 7.718e-02, -2.223e-02, 2.306e-02, 8.255e-03, -1.818e-02, -2.983e-02, -3.495e-02, 1.540e-02, 8.013e-02, 1.651e-02, -1.298e-02, 2.377e-02, 5.523e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.414e-01, 1.346e-01, 4.336e-03, -7.594e-02, -2.044e-02, -9.596e-03, -1.087e-03, 5.324e-02, -2.041e-02, -6.328e-02, 7.533e-02, -3.971e-01, 5.408e-04, 1.087e-01, 9.749e-03, -2.047e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(4.656e-02, -4.771e-02, -2.210e-02, -2.060e-02, -6.953e-03, -3.366e-02, -7.290e-03, -3.300e-02, -1.354e-01, -5.015e-02, -2.887e-02, 2.802e-01, 2.605e-02, -1.972e-02, 1.168e-03, 1.422e-01), r);
|
||||||
|
r = MulAdd(s2_0, M4(8.826e-02, -4.751e-02, 2.493e-01, 4.446e-02, 1.752e-01, 5.741e-03, -1.820e-01, 1.371e-02, -6.855e-02, 1.164e-02, -5.215e-02, -7.373e-04, -1.491e-02, 7.033e-03, -5.440e-02, -9.302e-05), r);
|
||||||
|
r = MulAdd(s2_1, M4(-6.871e-02, -9.419e-03, 3.276e-01, 2.826e-02, 5.675e-02, -3.974e-03, 1.104e-01, -2.975e-02, 3.281e-02, 8.429e-03, 1.129e-01, -4.830e-02, -4.374e-02, -6.905e-02, 8.143e-02, 3.180e-03), r);
|
||||||
|
r = MulAdd(s2_2, M4(-7.197e-02, -1.804e-02, -9.024e-02, -1.527e-03, 2.403e-02, 6.062e-02, 3.346e-02, 4.784e-02, -1.462e-02, 4.216e-02, 2.800e-02, 4.034e-04, -4.216e-02, -4.431e-03, -3.496e-02, -3.005e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(2.710e-02, -6.523e-02, 1.559e-01, -6.059e-02, 1.965e-01, -1.608e-01, -9.293e-03, -2.404e-01, -4.061e-02, 8.819e-02, 2.112e-02, 2.398e-01, -1.463e-01, 5.373e-02, -1.346e-02, 3.025e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(1.846e-02, 3.857e-01, -4.128e-01, -2.530e-01, -2.312e-01, 3.354e-02, -3.948e-01, -1.465e-01, 1.072e-01, -8.544e-02, -7.428e-02, 4.751e-02, 2.139e-01, 3.097e-01, -3.761e-01, 5.621e-02), r);
|
||||||
|
r = MulAdd(s2_5, M4(-1.203e-02, -9.598e-02, 4.101e-01, 1.578e-01, -5.394e-02, -6.714e-02, -6.320e-02, 8.249e-03, 5.620e-02, -3.219e-02, 9.398e-03, 6.809e-02, -7.400e-02, -1.431e-01, -1.425e-01, -2.358e-02), r);
|
||||||
|
r = MulAdd(s2_6, M4(4.035e-02, 5.655e-02, -3.307e-03, -3.497e-02, 6.522e-02, 1.103e-01, -9.802e-02, -2.655e-01, -5.802e-02, -4.359e-02, 3.459e-03, 1.592e-01, -2.566e-02, -1.156e-01, 2.646e-02, 3.806e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(-3.211e-02, -1.509e-01, 2.028e-03, -1.702e-01, 4.576e-02, -5.340e-02, 5.503e-02, 1.257e-02, -5.581e-02, 9.818e-02, -2.745e-02, 1.486e-01, 1.063e-01, -3.707e-01, 1.116e-01, -6.709e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(9.062e-04, -7.371e-03, 8.420e-02, 1.629e-01, -2.707e-02, -5.219e-03, 6.567e-02, 1.766e-01, 4.554e-04, 1.178e-02, -1.124e-02, 3.477e-02, -5.473e-02, -7.643e-02, 9.083e-03, -4.250e-02), r);
|
||||||
|
r = MulAdd(s3_0, M4(8.537e-02, 7.246e-02, 5.043e-02, 3.850e-02, -3.951e-02, 9.224e-03, 1.640e-02, 1.906e-02, -1.333e-01, -8.517e-02, -1.410e-01, 4.781e-02, -1.641e-02, 2.463e-03, -7.445e-02, -4.602e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(7.934e-02, 7.380e-03, -1.062e-01, -4.154e-03, -2.611e-02, -3.119e-02, 9.679e-02, -1.394e-02, -1.108e-01, 1.158e-02, 1.850e-01, -6.765e-02, 5.765e-02, 3.392e-02, -1.560e-02, -6.052e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(-3.056e-02, -8.450e-03, 1.524e-02, -1.007e-02, 1.030e-02, 4.032e-02, 9.837e-02, 9.371e-03, 4.740e-02, -4.795e-02, -3.356e-02, 8.602e-03, 2.484e-02, -9.889e-03, 6.734e-02, -2.287e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(6.303e-02, -7.328e-02, -5.082e-02, -5.070e-02, -8.129e-03, 7.948e-03, 7.351e-02, 5.601e-02, -4.169e-01, -8.219e-03, 3.373e-01, 1.781e-01, -5.139e-02, 1.471e-01, 3.415e-02, 7.690e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(1.455e-01, 6.664e-02, 5.792e-02, -1.276e-01, 2.360e-01, 5.978e-02, 5.147e-02, 2.707e-01, -7.581e-01, -2.740e-01, -3.000e-01, -2.017e-02, 2.005e-01, 6.934e-02, 2.996e-02, -3.036e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-1.708e-01, -6.628e-02, 9.170e-02, 3.461e-02, -1.028e-01, -4.244e-02, -1.955e-01, -1.875e-01, 8.215e-02, 2.976e-02, -4.637e-03, 1.512e-01, -4.626e-02, 3.819e-03, -2.222e-01, -1.078e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(9.585e-02, 3.354e-02, -5.140e-03, -2.883e-02, -9.057e-02, 3.950e-02, -5.781e-02, -2.509e-02, -7.106e-02, -1.351e-01, -4.933e-02, 8.332e-02, -2.922e-02, -3.890e-02, 3.291e-03, 7.054e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(-8.208e-02, 1.377e-02, -2.475e-02, -1.353e-01, 9.728e-02, 7.136e-02, -3.984e-02, 1.374e-01, -1.160e-01, 4.362e-02, 6.714e-02, 5.038e-03, 1.866e-01, -2.349e-01, 8.599e-02, -1.510e-01), r);
|
||||||
|
r = MulAdd(s3_8, M4(-3.279e-02, 4.634e-02, 1.698e-02, 1.410e-01, -2.615e-02, 1.178e-02, 5.268e-02, 5.209e-02, 1.624e-02, 2.431e-02, -3.280e-02, 7.160e-02, -6.704e-02, -1.195e-01, -4.136e-02, -2.048e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Pass5(uint2 blockStart, uint3 tid) {
|
||||||
|
float2 pt = float2(GetInputPt());
|
||||||
|
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
V4 s2_0 = l1(-1.0, -1.0);
|
||||||
|
V4 s2_1 = l1(0.0, -1.0);
|
||||||
|
V4 s2_2 = l1(1.0, -1.0);
|
||||||
|
V4 s2_3 = l1(-1.0, 0.0);
|
||||||
|
V4 s2_4 = l1(0.0, 0.0);
|
||||||
|
V4 s2_5 = l1(1.0, 0.0);
|
||||||
|
V4 s2_6 = l1(-1.0, 1.0);
|
||||||
|
V4 s2_7 = l1(0.0, 1.0);
|
||||||
|
V4 s2_8 = l1(1.0, 1.0);
|
||||||
|
V4 s3_0 = -max(-s2_0, 0.0);
|
||||||
|
V4 s3_1 = -max(-s2_1, 0.0);
|
||||||
|
V4 s3_2 = -max(-s2_2, 0.0);
|
||||||
|
V4 s3_3 = -max(-s2_3, 0.0);
|
||||||
|
V4 s3_4 = -max(-s2_4, 0.0);
|
||||||
|
V4 s3_5 = -max(-s2_5, 0.0);
|
||||||
|
V4 s3_6 = -max(-s2_6, 0.0);
|
||||||
|
V4 s3_7 = -max(-s2_7, 0.0);
|
||||||
|
V4 s3_8 = -max(-s2_8, 0.0);
|
||||||
|
s2_0 = max(s2_0, 0.0);
|
||||||
|
s2_1 = max(s2_1, 0.0);
|
||||||
|
s2_2 = max(s2_2, 0.0);
|
||||||
|
s2_3 = max(s2_3, 0.0);
|
||||||
|
s2_4 = max(s2_4, 0.0);
|
||||||
|
s2_5 = max(s2_5, 0.0);
|
||||||
|
s2_6 = max(s2_6, 0.0);
|
||||||
|
s2_7 = max(s2_7, 0.0);
|
||||||
|
s2_8 = max(s2_8, 0.0);
|
||||||
|
|
||||||
|
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0, t1
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// l0 / l1: sample t0 / t1 through the O() helper with the offset float2(x, y) and
// convert the result to V4. O() is defined earlier in this effect file, outside this
// excerpt, so its exact addressing is not restated here.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 6 kernel.
// Starts from a constant bias vector and accumulates 36 MulAdd terms, one per input
// vector, each with its own 4x4 weight matrix. The accumulation order is s0_0..s0_8,
// s1_0..s1_8, s2_0..s2_8, s3_0..s3_8. Returns tanh of the accumulated vector.
// In this file the caller (Pass6) passes max(x, 0) of the t0/t1 taps as s0_*/s2_* and
// -max(-x, 0) of the same taps as s1_*/s3_*.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 acc = { -1.667e-03, -2.914e-03, -1.783e-03, -1.113e-03 };

	// Group s0_*.
	acc = MulAdd(s0_0, M4(1.838e-01, -1.901e-02, 9.627e-03, -5.113e-02, -2.616e-02, -2.850e-02, -1.739e-02, 9.125e-03, 1.563e-02, -1.253e-02, 1.902e-02, -1.512e-02, 3.495e-03, -1.497e-02, 4.974e-03, -1.115e-02), acc);
	acc = MulAdd(s0_1, M4(-4.288e-01, 2.549e-01, -1.017e-01, -1.945e-01, 4.749e-02, 5.258e-02, -8.284e-03, -2.265e-02, -7.010e-02, -1.542e-02, 6.889e-03, 4.028e-02, -1.355e-02, 4.321e-03, 3.330e-02, 8.256e-02), acc);
	acc = MulAdd(s0_2, M4(-1.045e-03, -2.140e-01, -3.928e-02, 1.364e-02, 1.787e-03, -9.427e-03, 1.927e-03, -2.145e-03, -4.617e-03, 2.814e-02, 2.045e-02, 1.776e-02, 3.188e-02, 3.187e-02, -1.498e-02, -2.180e-02), acc);
	acc = MulAdd(s0_3, M4(1.059e-02, -5.522e-02, 6.447e-02, -4.395e-02, -4.846e-02, -2.209e-02, 1.866e-02, -5.920e-02, -2.419e-02, 1.032e-02, 7.736e-04, 1.092e-03, 1.854e-02, -4.388e-03, 3.893e-02, 9.549e-03), acc);
	acc = MulAdd(s0_4, M4(3.358e-02, 7.431e-03, 1.780e-01, 5.353e-01, 2.674e-01, 2.227e-01, 1.450e-01, 2.085e-01, -8.104e-03, -3.561e-02, -1.231e-01, -1.932e-01, -6.538e-02, 3.378e-02, -1.314e-01, -2.862e-02), acc);
	acc = MulAdd(s0_5, M4(-7.887e-02, 4.783e-02, -1.380e-01, -3.877e-01, 3.436e-03, 4.712e-02, -1.250e-02, 2.247e-02, 2.920e-02, 7.190e-02, -2.005e-02, 6.169e-02, 5.594e-03, -4.630e-02, 9.661e-02, 3.625e-02), acc);
	acc = MulAdd(s0_6, M4(1.963e-02, -1.873e-02, 1.489e-02, -1.253e-02, -9.356e-03, 1.334e-02, -3.747e-02, -1.115e-02, 7.741e-04, -6.463e-03, 3.707e-03, -3.598e-03, 1.783e-02, -5.539e-03, 9.899e-03, -9.354e-03), acc);
	acc = MulAdd(s0_7, M4(-2.952e-03, 3.132e-02, -6.679e-02, 2.883e-02, -3.721e-02, -3.573e-02, 1.204e-01, -6.785e-02, -1.208e-02, -1.355e-04, 2.872e-02, 2.196e-02, -1.655e-02, 3.784e-02, -5.921e-03, 2.494e-02), acc);
	acc = MulAdd(s0_8, M4(-1.215e-02, -2.947e-02, -2.454e-03, -6.326e-02, 2.248e-03, 2.302e-02, -2.863e-03, 5.834e-02, 2.187e-02, 9.973e-03, 2.158e-02, 4.902e-02, -2.207e-02, -3.485e-02, -5.118e-02, -6.696e-02), acc);

	// Group s1_*.
	acc = MulAdd(s1_0, M4(8.377e-02, -3.093e-02, 2.280e-02, -2.664e-02, -4.333e-02, -3.292e-02, -8.109e-03, 1.105e-02, 1.507e-02, 9.138e-03, 2.597e-02, -1.926e-02, 4.537e-02, -9.080e-03, -1.629e-02, -1.180e-02), acc);
	acc = MulAdd(s1_1, M4(-7.478e-02, 1.238e-01, -5.092e-02, -3.473e-02, 4.269e-02, 4.444e-02, 7.295e-03, 1.274e-04, -1.646e-01, -1.551e-03, 3.424e-02, 4.906e-02, -2.056e-01, -5.847e-02, 5.262e-02, 1.049e-01), acc);
	acc = MulAdd(s1_2, M4(-6.323e-02, -1.179e-01, -1.982e-02, -4.065e-02, 3.089e-03, -9.469e-03, 2.850e-03, 3.314e-03, -1.819e-03, -1.065e-01, 1.882e-02, 9.349e-03, 1.624e-02, -2.906e-02, -2.029e-02, -5.020e-02), acc);
	acc = MulAdd(s1_3, M4(1.101e-01, -3.490e-02, 1.327e-01, -2.853e-02, -5.027e-03, -5.703e-02, 6.484e-03, -6.473e-02, -4.310e-02, 3.882e-02, -3.100e-02, -7.837e-04, -5.501e-02, -1.261e-02, 7.285e-02, 4.648e-02), acc);
	acc = MulAdd(s1_4, M4(-6.214e-02, 1.841e-01, -9.546e-02, 3.700e-01, 2.824e-01, 3.400e-01, 2.309e-01, 2.237e-01, 3.482e-01, -1.294e-01, -4.546e-01, -3.556e-01, -4.730e-01, -1.392e-01, 4.776e-01, 1.210e-01), acc);
	acc = MulAdd(s1_5, M4(-5.408e-02, -1.286e-01, -6.571e-02, -1.230e-01, 9.991e-03, 6.421e-02, 4.305e-03, 1.780e-02, 2.254e-02, 3.661e-01, 6.275e-02, 8.004e-02, -1.834e-02, -3.465e-01, 9.274e-02, 3.935e-01), acc);
	acc = MulAdd(s1_6, M4(-1.620e-03, -1.423e-02, 2.785e-02, -1.252e-02, -1.218e-02, 2.842e-03, -3.496e-02, -2.927e-02, -2.106e-02, -7.099e-03, 2.545e-02, 2.484e-02, 2.973e-02, 4.563e-04, 2.010e-04, -1.839e-02), acc);
	acc = MulAdd(s1_7, M4(1.400e-04, 3.080e-02, -6.992e-03, 8.032e-02, -2.280e-02, -4.436e-02, 7.600e-02, 1.165e-02, -9.494e-02, -2.207e-02, 2.783e-01, 2.095e-01, 2.645e-02, 5.203e-02, -7.492e-02, -1.303e-02), acc);
	acc = MulAdd(s1_8, M4(-5.360e-03, -2.277e-02, -2.252e-02, -6.191e-02, 1.263e-02, 1.540e-02, 9.566e-03, 3.637e-02, -1.265e-02, -3.092e-02, -1.298e-02, -1.187e-02, 1.286e-02, 1.181e-02, -5.675e-02, -5.487e-02), acc);

	// Group s2_*.
	acc = MulAdd(s2_0, M4(6.275e-02, 3.332e-02, 2.458e-02, -1.910e-02, -1.764e-02, 2.292e-02, -3.220e-02, -1.127e-02, 4.114e-02, 4.303e-02, -3.355e-02, -8.882e-03, 1.881e-02, 1.788e-02, -3.354e-03, -1.345e-02), acc);
	acc = MulAdd(s2_1, M4(-1.562e-01, -7.407e-02, -5.684e-02, -3.194e-03, 1.150e-01, -2.700e-02, -9.666e-03, -3.629e-02, 5.862e-02, 6.747e-02, -1.085e-02, -3.454e-02, 7.263e-05, 2.167e-02, 5.491e-03, -6.472e-02), acc);
	acc = MulAdd(s2_2, M4(5.068e-03, 4.899e-02, 1.480e-02, -2.153e-02, 1.102e-02, 2.831e-02, -5.931e-03, 1.021e-02, -1.267e-02, -1.569e-02, 5.418e-04, 1.030e-02, -3.280e-02, -3.072e-02, -2.688e-02, -2.208e-02), acc);
	acc = MulAdd(s2_3, M4(-7.105e-02, 1.664e-03, -3.108e-02, 6.985e-02, 3.176e-02, 2.312e-02, -3.835e-02, 3.884e-02, -1.038e-01, 6.660e-02, -1.372e-01, 2.432e-02, -2.888e-04, -2.049e-02, 2.271e-02, 9.383e-03), acc);
	acc = MulAdd(s2_4, M4(4.697e-01, -3.721e-01, 1.705e-01, -2.767e-01, -1.791e-02, -7.276e-02, 2.503e-01, 1.040e-01, 1.180e-02, -5.212e-01, 4.014e-01, 1.946e-01, -2.547e-02, -1.567e-02, -5.652e-02, 9.687e-02), acc);
	acc = MulAdd(s2_5, M4(3.024e-02, 1.618e-02, 2.619e-02, 8.868e-02, -5.217e-02, -7.642e-02, -3.704e-02, -2.374e-02, -4.639e-02, 5.743e-02, -3.967e-02, -2.450e-02, 2.091e-02, -1.108e-02, 6.949e-03, -1.502e-02), acc);
	acc = MulAdd(s2_6, M4(5.298e-03, -6.810e-03, -1.982e-02, 1.960e-04, 3.645e-03, 5.483e-03, 3.357e-03, 3.697e-02, -1.339e-02, 3.253e-02, 3.649e-02, 4.492e-03, 2.076e-02, -9.046e-03, 2.043e-02, -7.803e-03), acc);
	acc = MulAdd(s2_7, M4(-2.707e-02, 7.878e-02, 1.816e-01, -1.506e-03, 9.060e-03, -1.418e-02, -6.983e-02, -5.833e-02, 3.309e-02, -2.537e-02, -3.298e-01, -1.735e-01, -2.132e-03, 5.241e-02, 1.155e-02, 4.817e-02), acc);
	acc = MulAdd(s2_8, M4(-1.781e-02, 1.652e-02, -7.188e-03, 2.114e-03, -1.105e-02, -1.137e-02, -9.037e-03, -5.600e-02, -1.220e-02, 8.292e-03, -3.404e-03, -4.211e-02, -1.018e-02, -1.004e-02, 1.505e-03, -4.591e-03), acc);

	// Group s3_*.
	acc = MulAdd(s3_0, M4(7.349e-02, 2.926e-02, 2.398e-02, -1.821e-02, -1.290e-02, 1.201e-02, 5.000e-03, 1.316e-02, -1.567e-02, 2.025e-02, -2.171e-02, -3.941e-04, -7.948e-03, 6.116e-02, -9.445e-03, 1.911e-02), acc);
	acc = MulAdd(s3_1, M4(-1.294e-01, -6.121e-02, -4.576e-02, 9.211e-03, 1.371e-02, -1.964e-02, -3.133e-03, 4.701e-03, 9.544e-02, 6.692e-03, 4.665e-04, -2.056e-02, 3.455e-01, -2.495e-01, 1.027e-02, -7.393e-02), acc);
	acc = MulAdd(s3_2, M4(1.532e-02, 9.402e-03, 6.812e-04, -3.241e-02, 1.245e-03, 6.504e-03, 3.970e-03, 7.168e-03, 9.435e-03, 1.574e-02, -5.118e-03, 5.232e-03, -2.659e-02, -6.011e-02, -2.446e-02, 8.062e-04), acc);
	acc = MulAdd(s3_3, M4(-6.714e-02, 3.454e-03, -1.486e-02, 5.921e-02, 5.177e-02, 3.766e-02, -1.473e-01, 4.371e-02, -7.118e-02, 2.462e-02, -1.810e-02, 3.430e-02, -5.552e-02, 3.047e-02, -5.066e-02, 5.769e-02), acc);
	acc = MulAdd(s3_4, M4(3.191e-02, -1.387e-01, -5.992e-02, -1.554e-01, 4.660e-01, 3.655e-01, 2.406e-02, -3.902e-01, 3.973e-02, -1.333e-01, 1.792e-01, 8.854e-02, 2.477e-01, -3.115e-01, 6.035e-01, -4.717e-01), acc);
	acc = MulAdd(s3_5, M4(7.241e-02, 1.273e-01, 6.810e-02, 1.118e-01, -5.454e-02, -1.728e-02, -1.007e-01, -2.265e-02, -4.534e-02, -5.171e-02, -2.524e-02, -3.337e-02, -2.366e-03, -1.723e-02, 2.300e-02, -9.889e-02), acc);
	acc = MulAdd(s3_6, M4(3.383e-02, -7.898e-03, 1.681e-02, -5.131e-03, 3.687e-02, 1.929e-02, -7.695e-03, 2.145e-03, -2.814e-02, 3.366e-02, -8.788e-02, 3.614e-02, 2.951e-02, -6.964e-03, 2.272e-02, 1.581e-02), acc);
	acc = MulAdd(s3_7, M4(7.020e-03, 6.046e-02, 2.975e-02, 5.663e-02, 2.155e-02, -1.786e-02, -2.588e-01, -1.310e-01, -6.372e-02, -1.218e-01, -7.160e-02, -3.058e-01, 2.297e-03, 3.050e-02, -2.346e-02, 4.674e-02), acc);
	acc = MulAdd(s3_8, M4(-1.071e-02, -1.089e-02, 9.286e-03, 5.202e-02, -2.291e-02, -2.655e-02, 2.386e-02, -3.231e-02, 4.599e-03, 1.114e-02, -1.630e-02, -1.693e-02, -2.194e-03, 1.842e-02, -2.522e-02, 5.265e-02), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Pass 6 ("out-shuffle") entry point.
// Each invocation owns the 2x2 group of OUTPUT pixels whose first pixel is
// (Rmp8x8(tid.x) << 1) + blockStart. It reads a 3x3 neighbourhood from t0 and t1,
// splits every tap into max(x, 0) and -max(-x, 0), runs f0 once, and adds one component
// of the f0 result to the first channel (Y) of each of the four resampled INPUT pixels
// before converting back to RGB.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the l0/l1 macros expand to O(...), which is
	// defined outside this excerpt and presumably refers to both by name.
	float2 pt = float2(GetInputPt());
	uint2 outSize = GetOutputSize();
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	if (!(dst.x < outSize.x && dst.y < outSize.y)) {
		return;
	}
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// 3x3 taps from t0 and t1.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);

	V4 s2_0 = l1(-1.0, -1.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s2_8 = l1(1.0, 1.0);

	// s1_* / s3_* hold -max(-x, 0) of the taps; they must be derived before the taps
	// themselves are overwritten with max(x, 0) below.
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);

	V4 s3_0 = -max(-s2_0, 0.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	V4 s3_8 = -max(-s2_8, 0.0);

	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	s2_0 = max(s2_0, 0.0);
	s2_1 = max(s2_1, 0.0);
	s2_2 = max(s2_2, 0.0);
	s2_3 = max(s2_3, 0.0);
	s2_4 = max(s2_4, 0.0);
	s2_5 = max(s2_5, 0.0);
	s2_6 = max(s2_6, 0.0);
	s2_7 = max(s2_7, 0.0);
	s2_8 = max(s2_8, 0.0);

	// One f0 evaluation yields the four additive terms for the 2x2 group.
	V4 res = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

	static const MF3x3 kRgbToYuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 kYuvToRgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 outPt = float2(GetOutputPt());

	// Pixel (0, 0) of the group: step back half an output texel, add res.x.
	pos -= 0.5f * outPt;
	MF3 px = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(px.r + res.x), px.yz)), 1);

	// Pixel (+1, 0): add res.y.
	++dst.x;
	pos.x += outPt.x;
	px = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(px.r + res.y), px.yz)), 1);

	// Pixel (+1, +1): add res.w.
	++dst.y;
	pos.y += outPt.y;
	px = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(px.r + res.w), px.yz)), 1);

	// Pixel (0, +1): add res.z.
	--dst.x;
	pos.x -= outPt.x;
	px = mul(kRgbToYuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(kYuvToRgb, MF3(saturate(px.r + res.z), px.yz)), 1);
}
|
||||||
914
src/Effects/CuNNy/CuNNy-4x8C-NVL.hlsl
Normal file
914
src/Effects/CuNNy/CuNNy-4x8C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,914 @@
|
||||||
|
// CuNNy 4x8C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-D08N04
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
// Source image of the effect. It is the only IN resource of Pass 1 and is sampled there
// through the O() macro.
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
// Result image of the effect, declared at twice the width and twice the height of INPUT.
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
// Sampler declared with POINT filtering; the O() macro in the COMMON section samples with it.
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
// Sampler declared with LINEAR filtering. The passes of this file visible in this excerpt
// do not reference it.
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// O(t, p): sample texture `t` with the point sampler SP at mip level 0, at `pos + p * pt`.
// `pos` and `pt` are not macro parameters; they must be variables in scope at the point of
// use (each pass function declares them as locals).
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// Short aliases for the 4-component vector and 4x4 matrix types used by the kernels.
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
// Intermediate texture at INPUT resolution in R8G8B8A8_SNORM format.
// Written by Pass 1 (together with t1) and read by Pass 2.
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
// Intermediate texture at INPUT resolution in R8G8B8A8_SNORM format.
// Written by Pass 1 (together with t0) and read by Pass 2.
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
// Intermediate texture at INPUT resolution in R8G8B8A8_SNORM format.
// Listed as an OUT resource of Pass 2 (together with t3).
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t2;
|
||||||
|
|
||||||
|
// Intermediate texture at INPUT resolution in R8G8B8A8_SNORM format.
// Listed as an OUT resource of Pass 2 (together with t2).
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t3;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
// l0(x, y): point-sample INPUT at an offset of (x, y) * pt from `pos` (via O()), reduce
// its RGB to one scalar with a fixed weighted sum, and add a constant offset of -5.664e-01.
#define l0(x, y) (dot(MF3(2.329e-01, 4.438e-01, 9.598e-02), O(INPUT, float2(x, y)).rgb) + MF(-5.664e-01))
|
||||||
|
|
||||||
|
// Pass 1 kernel for t0.
// Takes the nine scalar taps of a 3x3 neighbourhood and returns
// bias + sum(tap_k * weight_k), where every weight_k is a 4-component vector.
// The accumulation runs in tap order 0..8 and no activation is applied.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Bias term.
	V4 acc = { 3.244e-02, 2.492e-04, 8.562e-04, 1.261e-04 };

	acc = mad(s0_0, V4(-1.368e-01, -5.123e-02, -2.270e-01, -9.888e-02), acc);
	acc = mad(s0_1, V4(3.682e-01, 4.625e-02, 1.372e-01, 3.834e-01), acc);
	acc = mad(s0_2, V4(-9.245e-02, 7.555e-03, 3.923e-02, 1.252e-02), acc);
	acc = mad(s0_3, V4(-2.312e-01, 2.012e-02, 1.660e-01, 4.386e-01), acc);
	acc = mad(s0_4, V4(-3.965e-02, -4.834e-01, 3.729e-01, -7.207e-01), acc);
	acc = mad(s0_5, V4(2.190e-01, -9.021e-02, -1.087e-01, -9.632e-03), acc);
	acc = mad(s0_6, V4(4.088e-02, 1.183e-01, 8.976e-02, -1.710e-03), acc);
	acc = mad(s0_7, V4(-5.188e-03, 5.274e-01, -8.856e-02, -6.446e-03), acc);
	acc = mad(s0_8, V4(-7.160e-02, -9.349e-02, -3.823e-01, 1.947e-03), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 kernel for t1.
// Same shape as f0 with a different bias and weight set: returns
// bias + sum(tap_k * weight_k) over the nine scalar taps, accumulated in tap order 0..8.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Bias term.
	V4 acc = { -1.971e-02, 8.202e-02, 4.706e-03, -6.665e-02 };

	acc = mad(s0_0, V4(2.403e-02, 8.569e-03, -8.618e-02, 2.022e-02), acc);
	acc = mad(s0_1, V4(4.893e-01, 2.383e-02, 2.423e-02, -3.486e-01), acc);
	acc = mad(s0_2, V4(-3.682e-02, 2.437e-03, 1.872e-01, 1.135e-01), acc);
	acc = mad(s0_3, V4(-2.361e-02, 2.588e-02, 7.348e-02, -8.229e-03), acc);
	acc = mad(s0_4, V4(-4.433e-01, -5.131e-01, -3.778e-01, 6.107e-02), acc);
	acc = mad(s0_5, V4(-4.423e-02, 2.098e-02, 9.260e-03, 4.444e-02), acc);
	acc = mad(s0_6, V4(-1.370e-02, 1.009e-02, 3.020e-01, 1.159e-02), acc);
	acc = mad(s0_7, V4(-3.030e-03, 8.145e-03, -2.789e-02, -7.085e-03), acc);
	acc = mad(s0_8, V4(2.648e-02, 4.731e-03, -1.067e-01, -4.477e-03), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 ("in") entry point.
// Each invocation handles the texel Rmp8x8(tid.x) + blockStart. It gathers the nine l0()
// values of the surrounding 3x3 neighbourhood of INPUT and stores f0(...) into t0 and
// f1(...) into t1 at that texel. Invocations outside GetInputSize() return early.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by name inside the O() macro, so they cannot be renamed.
	float2 pt = float2(GetInputPt());
	uint2 inSize = GetInputSize();
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	if (!(texel.x < inSize.x && texel.y < inSize.y)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Row y = -1.
	MF tap0 = l0(-1.0, -1.0);
	MF tap1 = l0(0.0, -1.0);
	MF tap2 = l0(1.0, -1.0);
	// Row y = 0.
	MF tap3 = l0(-1.0, 0.0);
	MF tap4 = l0(0.0, 0.0);
	MF tap5 = l0(1.0, 0.0);
	// Row y = +1.
	MF tap6 = l0(-1.0, 1.0);
	MF tap7 = l0(0.0, 1.0);
	MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[texel] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0, t1
|
||||||
|
//!OUT t2, t3
|
||||||
|
|
||||||
|
// l0 / l1: point-sample t0 / t1 at an offset of (x, y) * pt from `pos` (via O()) and
// convert the result to V4. These replace the Pass 1 definition of l0 for this pass.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 kernel for t2.
// Starts from a constant bias vector and accumulates 36 MulAdd terms, one per input
// vector, each with its own 4x4 weight matrix. The accumulation order is s0_0..s0_8,
// s1_0..s1_8, s2_0..s2_8, s3_0..s3_8. The sum is returned without an activation.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 acc = { 3.430e-02, -1.031e-02, -1.631e-02, -3.189e-02 };

	// Group s0_*.
	acc = MulAdd(s0_0, M4(1.205e-01, 8.504e-02, -7.328e-02, 1.539e-01, -9.103e-03, -2.708e-02, -1.401e-01, -2.159e-01, -2.552e-01, 7.462e-02, 5.919e-02, 8.905e-02, 1.169e-01, -4.383e-03, -1.997e-01, -1.379e-01), acc);
	acc = MulAdd(s0_1, M4(2.844e-02, 2.238e-02, 2.143e-01, -1.624e-01, 1.885e-01, 1.316e-01, -1.276e-01, -1.713e-01, 2.553e-03, -1.343e-01, 4.700e-02, 4.762e-01, -2.676e-01, 1.784e-01, -4.065e-02, 1.015e-01), acc);
	acc = MulAdd(s0_2, M4(-4.442e-03, 3.253e-01, 2.650e-02, -2.907e-01, 2.749e-01, -3.510e-01, 8.545e-02, -2.446e-01, -1.579e-01, 9.398e-02, -4.544e-02, -9.123e-02, -2.529e-01, -2.538e-01, -2.686e-01, 2.607e-01), acc);
	acc = MulAdd(s0_3, M4(1.518e-01, -1.515e-01, -1.597e-01, 2.163e-01, -6.933e-02, 7.220e-02, 2.114e-01, -2.227e-01, -3.743e-01, 9.056e-02, 2.612e-02, 3.036e-01, -1.583e-02, -8.293e-02, -1.068e-01, 6.201e-02), acc);
	acc = MulAdd(s0_4, M4(-2.305e-02, 9.029e-02, -1.003e-01, -2.375e-01, -1.891e-01, 3.623e-01, -2.999e-01, -4.511e-01, 1.460e-01, -3.825e-01, 1.231e-01, 6.391e-01, -6.041e-01, 5.588e-01, -3.508e-01, -3.131e-01), acc);
	acc = MulAdd(s0_5, M4(8.812e-02, 2.197e-01, -8.630e-03, 2.287e-02, -1.918e-01, -6.428e-01, 1.496e-01, 2.272e-01, 3.445e-02, -7.188e-03, -8.518e-02, 1.948e-01, 1.606e-01, -8.707e-01, 2.092e-02, -4.993e-01), acc);
	acc = MulAdd(s0_6, M4(9.718e-03, 8.373e-03, 7.436e-02, -1.552e-01, 8.410e-02, -1.728e-02, -1.971e-01, 2.255e-02, -8.645e-02, 1.863e-02, -9.399e-02, -8.424e-02, -1.533e-03, 1.223e-01, 2.715e-01, -1.268e-01), acc);
	acc = MulAdd(s0_7, M4(-4.246e-01, -1.034e-01, 3.236e-01, 5.680e-01, -1.213e-02, 1.577e-01, -9.408e-02, -7.294e-02, -6.410e-02, 4.264e-02, -8.392e-03, 2.192e-01, 1.656e-01, 4.681e-02, 9.146e-01, -6.311e-02), acc);
	acc = MulAdd(s0_8, M4(-1.847e-01, -9.105e-02, -3.260e-02, 2.506e-01, -6.470e-02, 4.430e-02, -1.242e-02, -1.097e-01, 5.488e-02, 9.106e-02, 3.144e-02, -3.367e-05, 2.468e-01, -2.535e-01, 1.409e-01, -5.311e-01), acc);

	// Group s1_*.
	acc = MulAdd(s1_0, M4(1.294e-01, 1.098e-01, 7.497e-03, 1.016e-01, 1.377e-02, -1.480e-02, -2.694e-02, -3.417e-02, -1.083e-01, -2.575e-03, 1.137e-01, -2.616e-01, -1.260e-01, -2.567e-02, -1.958e-01, 6.103e-02), acc);
	acc = MulAdd(s1_1, M4(-1.355e-01, 1.168e-01, 2.368e-01, -2.379e-01, 8.556e-01, 1.401e-01, 3.238e-01, 2.737e-01, 8.041e-02, -1.662e-01, 9.181e-02, -3.488e-01, -1.586e-01, 1.407e-01, -1.126e-01, 1.825e-01), acc);
	acc = MulAdd(s1_2, M4(-1.881e-02, 4.604e-01, -1.712e-02, 3.453e-02, 3.171e-01, -1.126e-01, 6.510e-02, 2.908e-01, -9.125e-02, 7.793e-02, -5.580e-02, -3.603e-01, 9.996e-02, -2.647e-01, -2.114e-01, 2.330e-01), acc);
	acc = MulAdd(s1_3, M4(2.957e-01, -1.252e-01, -2.840e-01, 1.815e-01, -2.900e-01, 1.027e-01, 1.404e-01, -1.123e-01, -1.767e-01, 1.535e-03, -3.568e-03, -2.824e-01, 2.015e-01, -7.712e-02, -6.140e-02, 6.517e-02), acc);
	acc = MulAdd(s1_4, M4(-2.439e-01, 7.096e-02, -2.116e-01, -1.980e-01, -3.221e-01, 2.007e-01, -4.243e-01, -5.013e-01, 1.181e-01, -3.735e-01, 1.812e-01, -5.095e-01, 3.646e-01, 4.013e-01, -8.028e-02, 1.287e-01), acc);
	acc = MulAdd(s1_5, M4(-8.389e-02, -1.091e-01, 6.962e-02, 2.605e-01, -3.435e-03, -5.146e-01, 4.125e-01, 5.487e-01, -1.481e-01, 6.810e-02, -1.450e-01, -9.583e-02, 3.305e-01, -1.238e+00, 2.036e-01, 1.879e-01), acc);
	acc = MulAdd(s1_6, M4(-8.033e-02, 5.944e-03, 2.453e-01, -2.971e-01, -5.652e-02, -1.251e-02, -1.449e-01, -5.344e-02, -1.377e-01, 9.383e-03, -1.862e-01, -2.528e-01, -3.825e-02, 7.296e-02, 2.373e-01, -1.935e-01), acc);
	acc = MulAdd(s1_7, M4(-1.795e-01, 1.597e-01, 2.709e-01, -3.738e-01, 2.604e-02, 1.678e-01, -8.718e-02, -9.483e-03, -3.844e-02, 6.235e-02, -1.344e-01, 1.837e-02, -3.074e-02, 2.568e-02, 1.030e+00, 1.831e-01), acc);
	acc = MulAdd(s1_8, M4(4.299e-02, 6.530e-03, -2.571e-02, 3.382e-01, -1.327e-01, 2.975e-02, -2.861e-02, 1.963e-01, 8.130e-04, 9.743e-02, -1.177e-02, -1.273e-01, -1.265e-01, -3.003e-01, 2.635e-01, 5.426e-02), acc);

	// Group s2_*.
	acc = MulAdd(s2_0, M4(-1.538e-01, 1.580e-01, 1.392e-01, -1.077e-01, -1.228e-01, 1.853e-01, -1.010e-01, 3.144e-02, 2.203e-01, -3.309e-02, 6.819e-02, 2.708e-01, 1.720e-01, 2.635e-01, -1.290e-01, -2.932e-01), acc);
	acc = MulAdd(s2_1, M4(1.615e-01, -1.424e-01, -2.346e-01, -1.008e-01, 1.386e-01, -2.281e-01, -1.313e-01, -5.902e-02, -3.376e-02, 1.925e-01, -1.172e-01, 7.865e-02, 2.112e-01, -7.280e-02, -1.953e-01, -1.198e-02), acc);
	acc = MulAdd(s2_2, M4(1.280e-01, -1.353e-01, 1.251e-01, 3.212e-02, -1.144e-01, -1.492e-01, -1.499e-01, 2.211e-01, 1.307e-01, 1.336e-01, 1.977e-01, -1.429e-02, -5.395e-02, -2.772e-02, -3.214e-01, -1.907e-01), acc);
	acc = MulAdd(s2_3, M4(-2.703e-01, 3.122e-01, 1.951e-01, -2.005e-01, 1.463e-01, 3.000e-01, 1.058e-01, 8.352e-02, 1.567e-01, -1.256e-01, -1.854e-01, -2.018e-01, 3.248e-01, 8.780e-02, 1.586e-01, -9.757e-03), acc);
	acc = MulAdd(s2_4, M4(3.941e-02, -1.430e-01, 1.023e-01, 2.878e-01, 8.414e-02, 1.385e-01, 8.032e-02, -6.330e-02, -1.020e-01, 2.731e-01, -6.877e-02, -3.492e-01, 3.758e-01, -7.526e-02, 4.955e-01, -5.595e-01), acc);
	acc = MulAdd(s2_5, M4(2.684e-01, -1.924e-02, -2.975e-02, 7.205e-01, 6.611e-02, -1.645e-01, 1.267e-01, 6.066e-02, 1.695e-01, -4.367e-01, -1.450e-01, -4.074e-02, 4.469e-01, -7.176e-03, 4.177e-01, -4.565e-01), acc);
	acc = MulAdd(s2_6, M4(-1.843e-01, 2.522e-01, 3.324e-01, -1.821e-01, -1.327e-01, 1.182e-01, 1.158e-01, -2.494e-01, -6.459e-03, -6.606e-03, 1.333e-01, 2.229e-01, 2.481e-01, -2.018e-01, 2.456e-01, 2.351e-01), acc);
	acc = MulAdd(s2_7, M4(-6.894e-03, -2.822e-01, -1.863e-01, -2.252e-01, 6.755e-02, -1.766e-01, 8.884e-02, -2.720e-03, -4.431e-02, -2.119e-02, 2.876e-01, -5.268e-01, -3.635e-01, -1.001e-01, -8.433e-01, 5.160e-01), acc);
	acc = MulAdd(s2_8, M4(-1.786e-01, 2.208e-01, 4.289e-01, 1.663e-01, -2.341e-01, 8.148e-03, -7.557e-02, 7.817e-02, -1.340e-01, -2.341e-01, 3.123e-02, 1.120e-01, -7.753e-01, 2.056e-01, -2.926e-01, -1.222e-01), acc);

	// Group s3_*.
	acc = MulAdd(s3_0, M4(-4.903e-02, 1.377e-01, 6.984e-02, -1.053e-02, -5.115e-01, 2.891e-01, -4.612e-01, -6.693e-01, 4.752e-02, -5.287e-02, -2.183e-02, 4.134e-01, 1.073e-02, 2.383e-01, -2.142e-01, 1.384e-01), acc);
	acc = MulAdd(s3_1, M4(1.680e-01, -1.307e-01, -1.038e-01, -2.130e-02, -1.231e+00, -2.602e-01, -5.456e-01, 3.295e-01, -5.588e-02, 1.505e-01, -4.784e-02, -1.493e-01, 1.202e-01, -2.349e-01, -1.452e-01, -5.111e-02), acc);
	acc = MulAdd(s3_2, M4(-8.858e-02, -1.293e-01, 9.441e-02, -1.295e-01, -3.373e-01, -1.841e-01, -1.818e-01, 1.570e+00, -8.336e-02, 2.012e-01, 1.362e-01, 1.830e-01, -6.053e-02, -1.725e-03, -2.011e-01, -1.021e-01), acc);
	acc = MulAdd(s3_3, M4(-2.017e-01, 3.505e-01, 3.541e-02, 2.044e-01, -3.839e-01, 5.124e-01, 1.104e-01, 1.311e-01, 1.022e-01, -1.111e-01, -2.883e-01, 1.086e-01, 9.932e-02, 1.308e-01, 2.954e-01, -1.416e-02), acc);
	acc = MulAdd(s3_4, M4(6.088e-02, -4.532e-02, -1.302e-01, -1.067e-01, -4.196e+00, 7.383e-01, -2.786e-01, -2.053e+00, -3.758e-01, 2.955e-01, -1.898e-01, 1.875e-01, 1.263e-01, 9.931e-03, 1.016e-01, 5.201e-02), acc);
	acc = MulAdd(s3_5, M4(9.722e-03, -5.478e-02, -1.823e-01, -3.983e-02, -2.434e+00, -4.700e-01, 4.168e-01, 3.938e-01, 1.251e-01, -2.933e-01, -2.054e-02, 8.827e-02, 2.048e-02, 6.212e-02, 1.448e-01, 1.042e-01), acc);
	acc = MulAdd(s3_6, M4(-1.605e-02, 1.851e-01, 2.427e-01, 4.894e-02, -6.032e-01, -3.413e-02, 4.158e-01, 6.903e-01, -1.865e-02, -1.318e-02, 1.003e-01, 3.193e-01, 4.503e-02, 1.880e-01, -4.608e-02, -3.137e-01), acc);
	acc = MulAdd(s3_7, M4(-4.125e-02, -1.494e-01, 8.853e-01, -1.540e-01, -2.445e-01, 2.292e-01, 1.684e+00, 1.098e+00, 5.576e-02, -8.241e-02, 2.507e-01, -1.086e-01, 1.392e-01, -2.115e-01, -2.600e-01, 9.268e-02), acc);
	acc = MulAdd(s3_8, M4(5.677e-02, 9.206e-02, 5.863e-02, 5.663e-02, -2.019e+00, -1.006e-01, -1.769e-01, -3.617e-01, 1.293e-02, -2.766e-01, 2.843e-02, 3.331e-01, -2.316e-01, -1.762e-01, -6.013e-03, -2.482e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 kernel for t3.
// Same shape as the Pass 2 f0 with a different bias and weight set: accumulates 36 MulAdd
// terms (s0_0..s0_8, s1_0..s1_8, s2_0..s2_8, s3_0..s3_8, in that order) on top of a bias
// and returns the sum without an activation.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias term.
	V4 acc = { 8.007e-03, 2.570e-02, 2.487e-03, -2.496e-02 };

	// Group s0_*.
	acc = MulAdd(s0_0, M4(3.260e-02, 1.675e-01, 8.130e-02, -2.153e-01, -1.987e-01, -9.443e-02, 3.512e-01, 2.289e-02, 9.481e-02, -1.921e-01, -3.818e-01, 1.373e-01, -9.032e-02, 7.892e-02, 1.392e-01, -6.033e-02), acc);
	acc = MulAdd(s0_1, M4(-8.203e-02, -1.015e-01, -1.313e-02, -5.337e-02, -2.948e-01, -2.678e-01, -2.321e-01, -5.995e-01, 1.364e-01, 1.030e-01, 1.546e-01, -1.179e-02, 1.996e-01, 2.244e-01, -2.304e-01, -1.304e-02), acc);
	acc = MulAdd(s0_2, M4(-2.319e-02, -2.236e-02, 3.976e-02, 1.804e-01, 6.474e-02, 1.315e-01, -1.456e-02, -1.538e-01, 3.061e-02, -1.998e-02, -1.918e-02, -8.662e-02, -1.980e-01, -1.596e-01, -4.624e-01, -3.728e-01), acc);
	acc = MulAdd(s0_3, M4(-3.171e-03, -2.887e-02, 3.107e-01, -8.532e-02, 1.489e-02, -2.798e-01, -2.458e-02, 2.922e-01, 5.196e-02, 2.333e-02, -4.100e-01, 3.851e-01, 8.566e-02, 1.655e-01, 3.680e-01, -3.572e-01), acc);
	acc = MulAdd(s0_4, M4(4.618e-02, -3.100e-02, -1.849e-01, 2.228e-02, -2.182e-01, -5.806e-01, -6.298e-02, 2.421e-01, 4.266e-01, 7.738e-02, 4.856e-03, -1.191e-01, 3.469e-01, -8.683e-02, -2.397e-01, 6.512e-02), acc);
	acc = MulAdd(s0_5, M4(8.363e-02, -9.745e-02, 2.398e-01, -1.335e-01, -1.585e-01, -1.161e-02, 2.482e-02, 1.319e-03, -4.696e-02, -6.675e-02, -7.519e-02, 1.125e-01, -1.199e-01, -9.094e-03, -2.590e-01, -8.812e-01), acc);
	acc = MulAdd(s0_6, M4(7.745e-02, 3.414e-02, 6.378e-02, -8.388e-02, 4.456e-02, 1.354e-02, -1.138e-02, 1.131e-01, 2.361e-01, 1.828e-01, -2.135e-01, -1.100e-02, 1.683e-01, 2.134e-01, 1.832e-01, 8.420e-02), acc);
	acc = MulAdd(s0_7, M4(-3.223e-01, -4.870e-02, -1.457e-01, 1.996e-01, -1.632e-01, -1.811e-01, -1.625e-01, 4.046e-02, -8.959e-02, 1.432e-01, -2.360e-02, -9.415e-02, -1.547e-01, 1.379e-01, 5.098e-01, -4.069e-01), acc);
	acc = MulAdd(s0_8, M4(1.568e-01, -2.510e-02, -9.894e-02, 1.124e-01, -1.372e-01, 5.952e-03, 4.501e-02, 9.591e-03, 1.430e-01, 6.422e-02, -1.412e-03, 1.042e-02, 4.601e-02, -5.133e-02, -7.936e-02, -1.621e-01), acc);

	// Group s1_*.
	acc = MulAdd(s1_0, M4(1.380e-01, 1.774e-01, 2.958e-01, -2.044e-01, -2.085e-01, 7.192e-03, -7.903e-02, 6.119e-02, -3.542e-02, -1.060e-01, -1.832e-01, 3.603e-01, -3.854e-02, 5.092e-02, -1.092e-01, -2.074e-01), acc);
	acc = MulAdd(s1_1, M4(-5.638e-02, -1.659e-01, -1.006e-02, 5.355e-02, -2.243e-01, 3.533e-01, -2.130e-01, 6.480e-02, 4.462e-02, 1.065e-01, 1.598e-01, 5.025e-03, -3.810e-02, 1.012e-01, 2.123e-02, 2.124e-01), acc);
	acc = MulAdd(s1_2, M4(5.207e-02, -1.428e-01, 1.745e-01, 2.563e-01, 4.058e-01, 5.320e-02, 3.527e-03, -4.664e-02, -1.641e-03, -2.830e-02, 1.453e-02, 1.169e-01, -5.840e-01, -1.545e-01, 3.880e-01, 1.250e-01), acc);
	acc = MulAdd(s1_3, M4(-2.089e-01, 3.070e-02, 3.770e-01, -2.868e-01, -1.965e-01, -2.499e-01, -2.145e-01, 5.348e-02, -1.201e-01, -3.454e-01, -5.723e-01, 4.313e-01, -7.068e-02, -6.358e-02, -2.426e-02, -2.841e-01), acc);
	acc = MulAdd(s1_4, M4(1.315e-01, 2.464e-01, -2.505e-01, -1.589e-01, 4.124e-01, 4.860e-01, -2.493e-01, 1.201e-01, -1.304e-01, -1.620e-01, 2.228e-01, 4.485e-02, 6.945e-02, -2.261e-01, -8.190e-04, 5.678e-01), acc);
	acc = MulAdd(s1_5, M4(3.529e-01, 1.800e-02, -9.794e-02, -1.160e-01, 7.052e-01, 4.176e-01, 5.822e-02, -5.300e-02, -1.144e-01, -1.890e-01, 1.337e-01, 1.163e-01, -5.024e-01, 9.977e-01, 1.831e-01, 2.166e-02), acc);
	acc = MulAdd(s1_6, M4(-1.239e-01, 1.465e-01, 3.700e-01, -1.638e-01, -1.022e-01, -3.216e-02, -2.412e-02, -2.505e-02, 5.450e-02, -1.325e-02, -2.760e-01, 5.219e-02, -5.604e-02, 3.602e-02, -1.026e-01, 4.063e-02), acc);
	acc = MulAdd(s1_7, M4(1.669e-01, 2.580e-01, -2.923e-01, -2.497e-01, 1.135e-01, -1.599e-01, -2.419e-01, -1.202e-01, -3.903e-01, -2.141e-01, 9.642e-02, -6.096e-02, -6.762e-01, 5.614e-01, 3.076e-01, -4.187e-01), acc);
	acc = MulAdd(s1_8, M4(5.456e-02, -6.641e-02, -3.839e-01, 8.629e-02, 1.149e-01, 1.204e-02, -2.509e-02, -1.413e-03, -1.329e-02, -5.670e-02, -6.186e-02, 5.108e-02, 3.592e-02, 4.563e-01, -7.450e-02, -2.259e-01), acc);

	// Group s2_*.
	acc = MulAdd(s2_0, M4(1.013e-01, -2.126e-02, -1.260e-01, 8.480e-03, -3.292e-02, 6.069e-04, 4.154e-02, 5.578e-02, 1.586e-02, 8.252e-02, 1.237e-01, -1.312e-01, 1.489e-01, 2.561e-01, -9.917e-02, -1.060e-01), acc);
	acc = MulAdd(s2_1, M4(-1.285e-01, -8.314e-02, 1.521e-02, 1.037e-01, -1.021e-02, 7.112e-02, -2.319e-02, 7.051e-04, -1.101e-01, -1.896e-01, -2.458e-01, -7.399e-02, -4.133e-02, 1.606e-01, -1.511e-01, -2.425e-01), acc);
	acc = MulAdd(s2_2, M4(7.543e-02, 9.235e-02, 2.139e-01, 2.879e-01, 9.583e-02, 4.372e-02, -8.231e-02, 2.498e-01, 1.241e-01, 1.377e-02, 2.380e-01, 2.586e-02, -1.926e-01, -1.406e-01, -3.627e-01, -8.414e-02), acc);
	acc = MulAdd(s2_3, M4(9.655e-03, -9.581e-02, -6.071e-02, 2.231e-01, -1.148e-01, -3.513e-02, -2.013e-02, -1.094e-01, -1.606e-01, 9.180e-02, 3.498e-01, -2.726e-01, -7.696e-03, -4.007e-01, -8.497e-02, -6.989e-01), acc);
	acc = MulAdd(s2_4, M4(4.965e-03, -1.346e-01, -4.517e-02, 2.043e-01, -1.348e-01, 1.451e-01, 8.113e-02, -8.530e-02, -1.414e-01, 7.261e-02, -2.368e-01, 1.601e-01, -2.438e-02, -2.554e-01, 4.057e-01, -2.224e-01), acc);
	acc = MulAdd(s2_5, M4(-8.716e-02, 1.496e-01, -4.429e-02, 6.451e-01, -9.547e-03, -3.189e-02, -1.096e-01, -5.416e-02, -5.032e-01, 1.331e-01, 2.389e-02, 1.028e-01, -3.186e-01, -2.524e-01, 2.663e-02, -9.995e-03), acc);
	acc = MulAdd(s2_6, M4(-2.465e-01, 1.585e-01, 3.196e-01, -9.098e-02, 2.765e-02, -1.793e-01, 1.519e-01, -9.565e-04, -1.160e-01, -3.035e-02, -1.082e-01, 3.172e-02, 5.502e-01, -6.251e-01, -4.487e-01, 1.932e-01), acc);
	acc = MulAdd(s2_7, M4(-5.017e-01, -5.180e-01, -2.682e-01, -4.715e-01, 1.958e-02, -7.007e-02, -3.332e-02, -8.389e-02, -1.135e-01, -2.956e-02, 1.994e-01, 2.315e-02, -2.553e-01, -3.153e-03, 4.275e-01, 1.669e+00), acc);
	acc = MulAdd(s2_8, M4(1.400e-01, 6.775e-01, 5.287e-02, 2.007e-02, 1.213e-01, -1.460e-03, -2.313e-02, 1.282e-01, -8.355e-02, 2.399e-01, -5.277e-02, -1.499e-01, 7.246e-02, -2.553e-02, 2.185e-01, 8.662e-01), acc);

	// Group s3_*.
	acc = MulAdd(s3_0, M4(3.069e-02, -3.668e-02, -3.646e-02, 1.140e-01, -7.882e-02, 2.759e-01, 9.170e-01, 2.779e-01, 1.459e-01, 3.766e-02, -1.214e-01, 5.718e-03, -3.323e-02, 9.705e-02, -1.282e-02, -1.401e-01), acc);
	acc = MulAdd(s3_1, M4(-1.405e-02, 2.809e-02, 1.466e-01, -1.286e-01, 4.754e-01, 8.076e-01, 5.775e-02, -5.403e-01, 1.919e-01, -2.015e-01, -1.976e-01, -8.544e-02, -8.431e-02, 9.302e-02, 6.560e-02, 2.011e-02), acc);
	acc = MulAdd(s3_2, M4(2.107e-01, 2.334e-02, -2.591e-01, -1.023e-01, 6.461e-01, 1.138e+00, 3.917e-01, 2.270e-01, 4.023e-01, 6.135e-02, 4.125e-02, -5.551e-02, 1.871e-02, -1.344e-01, -1.534e-01, 1.216e-01), acc);
	acc = MulAdd(s3_3, M4(8.077e-02, -1.149e-01, 6.733e-02, -9.044e-03, -6.431e-02, -1.755e-02, 2.617e+00, 5.203e-01, 8.910e-02, 9.642e-02, 3.720e-01, -2.326e-01, -1.142e-01, -4.017e-02, 2.351e-01, -1.062e-01), acc);
	acc = MulAdd(s3_4, M4(-2.427e-01, -4.425e-03, 4.260e-01, -6.273e-02, 4.224e+00, -2.047e+00, -1.911e+00, 2.329e+00, 2.987e-01, -3.286e-01, -1.115e-01, 2.053e-01, -5.309e-02, -8.751e-02, -1.275e-02, -2.105e-01), acc);
	acc = MulAdd(s3_5, M4(-1.413e-02, -4.404e-01, -1.525e-01, -1.703e-01, -9.999e-01, 5.276e-01, 4.779e-01, -5.145e-01, 4.772e-01, 2.730e-02, -7.651e-02, -2.235e-01, -1.122e-01, -1.686e-01, 9.595e-02, -1.169e-01), acc);
	acc = MulAdd(s3_6, M4(-1.162e-01, 3.109e-01, -2.686e-01, -1.492e-01, 2.122e-01, 6.911e-01, 7.412e-01, 3.675e-02, 1.420e-01, -3.979e-02, -3.526e-02, -1.170e-01, 2.192e-01, 6.369e-02, 2.568e-01, 1.606e-02), acc);
	acc = MulAdd(s3_7, M4(-2.482e-02, 6.355e-01, 4.230e-01, -4.331e-01, -1.462e+00, -9.944e-01, 1.154e+00, 8.760e-01, 3.625e-01, 2.127e-01, 3.382e-01, 6.009e-02, 1.431e-01, 9.892e-02, -2.409e-01, 4.223e-02), acc);
	acc = MulAdd(s3_8, M4(-1.832e-02, 7.811e-02, -1.928e-02, 1.448e-01, -1.288e+00, 1.805e-01, 6.324e-01, -2.704e-02, 6.456e-02, -6.364e-02, 4.971e-02, -6.535e-03, 1.766e-01, 5.142e-02, -1.375e-01, 2.532e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass2 entry point: evaluates f0 and f1 for one pixel and stores the results in t2 and t3.
// blockStart is added to Rmp8x8(tid.x) to obtain the pixel coordinate handled by this thread;
// only tid.x is read from tid.
// NOTE(review): the l0/l1 macros used here are defined above this pass, outside this view.
void Pass2(uint2 blockStart, uint3 tid) {
|
||||||
|
// NOTE(review): pt and pos are never named again in this body; presumably the O() sampling
// macro behind l0/l1 picks them up by name -- confirm before renaming or removing them.
float2 pt = float2(GetInputPt());
|
||||||
|
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Threads whose pixel lies outside GetInputSize() write nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 fetches at offsets (-1..1, -1..1), row by row; index 4 is offset (0, 0).
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_* keeps the non-positive part of each fetch. It must be taken here, while s0_* still
// holds the raw value, because s0_* is clamped in place right after.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_* is reduced to its non-negative part.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Same fetch-and-split sequence for l1: s2_* ends up non-negative, s3_* non-positive.
V4 s2_0 = l1(-1.0, -1.0);
|
||||||
|
V4 s2_1 = l1(0.0, -1.0);
|
||||||
|
V4 s2_2 = l1(1.0, -1.0);
|
||||||
|
V4 s2_3 = l1(-1.0, 0.0);
|
||||||
|
V4 s2_4 = l1(0.0, 0.0);
|
||||||
|
V4 s2_5 = l1(1.0, 0.0);
|
||||||
|
V4 s2_6 = l1(-1.0, 1.0);
|
||||||
|
V4 s2_7 = l1(0.0, 1.0);
|
||||||
|
V4 s2_8 = l1(1.0, 1.0);
|
||||||
|
V4 s3_0 = -max(-s2_0, 0.0);
|
||||||
|
V4 s3_1 = -max(-s2_1, 0.0);
|
||||||
|
V4 s3_2 = -max(-s2_2, 0.0);
|
||||||
|
V4 s3_3 = -max(-s2_3, 0.0);
|
||||||
|
V4 s3_4 = -max(-s2_4, 0.0);
|
||||||
|
V4 s3_5 = -max(-s2_5, 0.0);
|
||||||
|
V4 s3_6 = -max(-s2_6, 0.0);
|
||||||
|
V4 s3_7 = -max(-s2_7, 0.0);
|
||||||
|
V4 s3_8 = -max(-s2_8, 0.0);
|
||||||
|
s2_0 = max(s2_0, 0.0);
|
||||||
|
s2_1 = max(s2_1, 0.0);
|
||||||
|
s2_2 = max(s2_2, 0.0);
|
||||||
|
s2_3 = max(s2_3, 0.0);
|
||||||
|
s2_4 = max(s2_4, 0.0);
|
||||||
|
s2_5 = max(s2_5, 0.0);
|
||||||
|
s2_6 = max(s2_6, 0.0);
|
||||||
|
s2_7 = max(s2_7, 0.0);
|
||||||
|
s2_8 = max(s2_8, 0.0);
|
||||||
|
|
||||||
|
// All 36 vectors feed both functions; f0's result goes to t2 and f1's to t3 at this pixel.
t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t2, t3
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
// Fetch helpers for this pass: l0 reads t2 and l1 reads t3, both through O() with the
// offset float2(x, y), converting the result to V4.
// NOTE(review): O and V4 are defined outside this view; the same macro names are defined
// again for the next pass, so presumably each pass is compiled on its own -- confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for PASS 3 ("conv2"): starts from a constant V4 and folds in one MulAdd term per argument,
// in the order s0_0..s0_8, s1_0..s1_8, s2_0..s2_8, s3_0..s3_8, each with its own M4 constant.
// Returns the accumulated V4; Pass3 stores it in t0.
// NOTE(review): V4, M4 and MulAdd are defined outside this view. The initial value is presumably
// a bias and each M4 a 4x4 weight block of a trained 3x3 convolution -- confirm. The constants
// look machine-generated, so regenerate them rather than editing by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Accumulator seed; every following line adds one term to it.
V4 r = { 3.009e-03, -1.445e-03, 8.191e-03, -7.852e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(2.802e-01, -3.301e-02, -1.047e-01, 6.427e-02, 1.357e-02, -8.015e-02, 7.763e-02, -9.646e-02, 1.136e-01, -1.443e-01, -3.950e-02, 2.744e-01, 8.414e-03, -1.005e-01, -1.683e-01, -5.766e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(2.907e-01, 1.339e-01, -7.005e-02, 9.074e-02, -2.491e-03, 6.498e-02, 1.121e-01, -9.272e-02, 3.415e-01, 1.949e-01, -2.613e-01, -2.328e-01, 1.311e-01, 1.285e-01, 1.685e-02, -4.780e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.671e-01, -2.228e-02, -5.777e-02, -5.853e-02, 1.243e-02, -3.269e-02, 8.757e-03, -1.478e-01, -4.190e-02, 3.164e-02, 2.922e-01, -3.017e-01, -6.631e-02, 5.380e-02, -2.750e-02, -7.771e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.454e-02, 2.148e-01, -1.116e-01, -1.125e-01, -1.792e-01, -7.021e-01, -2.183e-01, 2.920e-01, -1.698e-01, 1.827e-01, -6.779e-02, 9.333e-02, -2.153e-01, 2.441e-01, 9.794e-02, -2.729e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(-6.750e-02, 1.324e-01, -5.087e-02, 2.746e-01, 1.579e-01, -1.909e-01, -7.631e-01, -4.744e-01, -1.732e-01, -2.741e-01, 4.145e-02, -2.124e-01, 7.946e-02, -1.579e-01, 2.856e-01, 5.090e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(8.392e-02, -1.504e-01, 2.815e-01, -1.174e-01, 3.942e-02, 1.918e-02, 1.561e-01, -1.457e-01, -5.976e-02, 1.230e-01, -2.539e-01, -1.965e-01, 1.869e-01, -1.795e-01, -1.283e-01, -3.447e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-3.547e-03, -6.576e-03, -5.087e-02, 3.466e-02, -3.130e-03, -3.176e-01, 8.737e-02, 4.018e-02, -6.489e-02, -1.580e-03, -8.784e-03, -4.500e-02, 2.343e-03, 5.945e-02, -5.201e-02, -3.127e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-3.546e-02, 1.145e-01, -4.773e-02, 8.280e-02, 6.746e-03, -1.036e-01, -6.616e-02, -1.224e-01, 7.156e-02, -1.941e-01, 9.307e-02, -3.567e-02, -2.215e-01, 2.437e-01, -5.542e-04, 1.208e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.115e-02, -4.687e-02, -3.210e-02, -1.470e-01, -4.609e-02, 4.657e-02, -6.476e-02, -1.372e-01, -4.956e-03, 1.024e-01, -2.349e-01, -8.472e-02, -2.757e-02, -1.707e-02, 2.065e-01, 1.863e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(3.728e-02, -7.100e-02, -4.937e-02, 6.239e-02, -7.377e-03, -3.033e-02, 1.675e-01, -1.863e-02, -2.631e-02, -9.633e-02, -1.130e-01, -1.201e-01, 1.414e-01, -1.737e-01, -8.031e-02, -6.951e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-3.703e-02, 4.012e-02, -2.289e-02, 3.332e-02, 2.161e-02, 8.828e-02, 5.544e-02, 1.017e-01, 3.684e-01, 3.149e-01, 3.662e-01, 4.298e-02, 1.966e-01, -2.697e-02, 2.216e-02, 7.540e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-4.974e-02, -3.826e-02, -2.810e-02, -8.318e-02, 3.356e-02, -7.605e-02, -1.087e-01, 1.987e-02, -1.153e-01, -1.039e-01, -5.868e-02, -3.313e-02, -1.750e-02, 3.884e-03, -9.170e-02, -1.011e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(2.119e-01, -1.340e-01, -3.650e-02, 2.219e-01, 3.634e-01, 3.474e-01, 2.302e-01, 7.494e-02, -2.253e-01, 1.239e-01, -6.032e-02, 1.293e-01, 9.583e-02, 4.424e-02, -3.920e-02, -1.870e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(-2.664e-01, 8.462e-02, -4.745e-01, 1.985e-01, 2.803e-01, 7.429e-02, 7.814e-01, 4.658e-01, 3.661e-01, -2.319e-02, 3.324e-01, 2.860e-01, 3.178e-01, 9.301e-02, 1.316e-01, 4.547e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(5.369e-02, 6.912e-02, 2.659e-01, -1.491e-01, 4.462e-02, -4.823e-02, 1.130e-01, 1.710e-02, -7.604e-02, -7.003e-02, 3.093e-01, 2.537e-01, 2.466e-01, -1.039e-01, 2.413e-02, -1.256e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-1.188e-01, 1.026e-01, 4.215e-02, -9.677e-02, 2.443e-03, 1.957e-01, 2.961e-02, -5.553e-02, -3.488e-02, 2.515e-02, -4.840e-03, 1.814e-02, 9.644e-02, -8.802e-02, 3.516e-03, -2.940e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(-1.792e-01, 1.391e-01, 1.322e-02, -1.514e-02, -2.173e-01, 1.743e-01, 1.530e-01, 5.286e-02, -8.655e-02, 2.541e-01, 6.282e-02, 1.167e-01, 9.664e-02, 2.304e-01, -1.538e-01, -1.298e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.720e-01, 4.693e-02, 2.790e-01, 2.187e-02, -4.386e-02, 7.714e-03, 9.800e-02, 6.484e-03, -5.497e-02, 1.216e-01, 3.924e-02, 5.162e-02, 1.403e-01, -5.364e-03, -6.795e-03, -6.163e-02), r);
|
||||||
|
r = MulAdd(s2_0, M4(2.905e-01, -3.799e-02, 1.332e-01, 2.496e-02, 7.202e-02, -3.659e-01, -2.940e-02, -1.028e-03, -1.221e-01, 1.147e-01, 3.613e-02, 9.125e-02, -8.760e-03, 1.489e-02, -9.652e-02, 4.452e-03), r);
|
||||||
|
r = MulAdd(s2_1, M4(4.027e-01, -2.178e-01, -8.478e-02, 2.903e-01, 2.463e-02, 9.527e-03, -2.835e-01, 2.066e-01, -6.698e-02, -2.653e-01, -6.667e-02, 4.320e-02, -2.610e-01, -1.351e-01, 7.826e-02, -5.429e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(-1.249e-01, 4.376e-02, -6.245e-02, 1.702e-01, -5.731e-02, 8.022e-02, -1.335e-01, 1.528e-01, -2.969e-02, 1.062e-01, -1.303e-01, 1.226e-01, 2.030e-02, 5.205e-02, -1.877e-01, 4.309e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(-6.329e-02, -1.286e-01, -7.222e-02, 5.592e-03, -3.023e-02, 9.502e-02, -4.077e-02, -2.299e-01, -1.038e-01, -5.742e-02, -5.106e-04, 5.143e-02, 3.098e-02, -1.235e-01, 1.987e-02, 1.477e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(1.113e-01, -1.761e-01, 5.038e-02, -1.304e-01, 3.668e-01, -3.430e-01, 2.169e-01, 3.877e-01, -3.750e-02, 2.473e-01, 3.416e-02, 2.184e-01, 5.168e-01, -7.132e-02, 3.818e-01, -1.508e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(1.479e-01, -8.656e-02, -1.700e-01, 3.874e-01, 2.286e-02, -8.854e-02, 3.305e-02, -4.668e-03, -1.481e-01, 5.115e-02, 2.686e-01, 4.113e-01, -3.740e-01, -2.013e-01, 9.838e-04, 3.008e-01), r);
|
||||||
|
r = MulAdd(s2_6, M4(3.428e-01, -3.200e-01, 7.593e-02, 1.911e-01, 1.219e-01, 1.211e-02, -5.694e-02, -5.767e-02, 3.119e-02, -7.609e-02, 6.471e-02, 1.215e-01, -2.793e-04, 1.650e-02, 7.190e-03, -4.468e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(3.970e-01, -3.192e-01, -5.639e-02, 8.182e-02, -2.831e-02, 4.036e-02, 7.004e-02, 1.095e-01, -3.655e-02, 2.443e-01, 5.606e-02, -4.974e-02, 9.825e-02, 1.158e-01, -5.104e-02, -2.986e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(1.440e-01, 5.504e-02, -2.020e-01, 2.618e-03, -1.098e-02, -3.678e-02, 7.661e-02, 5.652e-02, -7.426e-02, 5.461e-02, 4.239e-01, 2.093e-01, 9.316e-03, -3.679e-02, 6.108e-02, 2.036e-01), r);
|
||||||
|
r = MulAdd(s3_0, M4(1.806e-02, 2.233e-02, 5.056e-02, 1.758e-01, 3.566e-02, -1.383e-01, 5.349e-02, 1.066e-01, 3.314e-02, -1.258e-01, -2.885e-02, -6.648e-02, -6.860e-03, -2.283e-02, -1.052e-01, -1.623e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(7.369e-02, -3.141e-02, 3.877e-03, 8.113e-03, -1.773e-01, 5.122e-03, -3.198e-01, 9.005e-02, 7.291e-02, -1.519e-01, -1.501e-01, -8.202e-02, -4.729e-02, -2.877e-02, -4.056e-02, 7.599e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(1.282e-01, 2.477e-03, 6.185e-02, 3.967e-02, -1.343e-01, 8.884e-02, 5.299e-02, -7.324e-02, 1.842e-01, -3.053e-02, -1.335e-01, -6.790e-03, -8.128e-02, 6.665e-02, 1.583e-03, -5.358e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(1.135e-01, 9.360e-03, 1.646e-01, 1.844e-01, 1.104e-02, 7.072e-02, -9.632e-02, -1.169e-01, -1.458e-01, 2.540e-02, -5.132e-02, -1.627e-01, -1.066e-01, -4.819e-02, -4.340e-02, -5.074e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(-1.198e-01, -7.965e-02, -2.989e-01, -4.946e-01, -1.666e-02, -2.136e-01, -3.575e-02, 1.351e-01, -8.546e-02, 2.553e-02, -7.878e-02, -3.233e-01, -2.955e-01, -7.765e-02, 1.450e-01, -2.114e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-7.593e-02, -1.849e-03, -1.688e-01, 3.626e-02, 4.408e-03, 4.014e-02, -1.401e-01, -2.239e-01, 9.538e-02, -2.310e-01, 2.831e-02, 5.065e-02, 1.135e-01, 2.542e-02, -4.365e-01, 4.393e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(-5.217e-02, -1.327e-02, -1.851e-02, 2.806e-02, 4.648e-02, -9.047e-04, 2.961e-02, -2.922e-02, 6.360e-02, -3.494e-02, 2.573e-02, 1.309e-02, -2.512e-03, -4.086e-02, -2.086e-03, -6.018e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(9.887e-02, -9.515e-03, 1.306e-01, 5.290e-02, 1.832e-01, -2.549e-01, -4.640e-02, -1.256e-01, 4.915e-02, -5.163e-02, 3.044e-02, -9.871e-02, 8.168e-03, -7.112e-02, -5.743e-02, 3.687e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(-6.440e-02, 2.530e-02, -2.166e-03, -4.680e-02, 8.009e-02, -6.634e-02, -1.390e-01, -2.524e-02, 6.524e-02, -1.120e-01, -4.252e-02, -8.413e-03, -2.017e-02, 1.444e-02, -4.483e-02, 4.690e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// f1 for PASS 3 ("conv2"): same shape as this pass's f0 but with its own constants. Starts from
// a constant V4 and folds in one MulAdd term per argument, in the order s0_0..s0_8, s1_0..s1_8,
// s2_0..s2_8, s3_0..s3_8. Returns the accumulated V4; Pass3 stores it in t1.
// NOTE(review): V4, M4 and MulAdd are defined outside this view. The initial value is presumably
// a bias and each M4 a 4x4 weight block of a trained 3x3 convolution -- confirm. The constants
// look machine-generated, so regenerate them rather than editing by hand.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Accumulator seed; every following line adds one term to it.
V4 r = { -6.039e-04, -3.875e-03, -3.020e-03, 2.282e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(-1.005e-01, -6.367e-02, 4.428e-02, 1.687e-02, -9.639e-02, -1.209e-01, -1.374e-02, 4.932e-02, -9.949e-02, -2.569e-01, 1.199e-01, 1.077e-02, 5.110e-02, -1.129e-01, 6.104e-02, -4.656e-03), r);
|
||||||
|
r = MulAdd(s0_1, M4(-2.156e-01, 8.505e-02, 4.815e-04, -1.042e-01, -2.724e-01, -1.870e-01, 3.876e-02, 7.840e-02, -4.018e-01, -8.239e-01, 2.611e-01, -3.623e-01, -6.999e-03, 1.848e-02, 6.095e-02, -2.318e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-2.195e-01, -6.727e-02, 7.111e-02, 5.119e-02, 7.396e-02, 1.116e-02, -1.261e-02, 9.531e-02, -3.892e-01, 1.430e-01, -9.840e-02, -2.423e-01, 2.669e-01, 3.009e-02, -2.478e-02, 1.168e-01), r);
|
||||||
|
r = MulAdd(s0_3, M4(-4.344e-01, 8.202e-02, 9.272e-03, -8.384e-02, -8.136e-02, -4.359e-01, 2.361e-01, -2.183e-01, 4.609e-02, -2.144e-02, 9.525e-03, -7.197e-02, -9.339e-02, 1.927e-01, -1.687e-02, 3.193e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(4.702e-01, 1.415e-04, 1.097e-01, 2.415e-01, 1.899e-01, -7.324e-01, -4.745e-03, -1.237e-01, -2.043e-01, 2.674e-02, 6.899e-01, 8.700e-02, 5.083e-02, 2.271e-01, 4.884e-02, 3.767e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(6.758e-02, -4.638e-02, 9.477e-02, -8.290e-02, -1.994e-01, 1.090e-01, -5.148e-02, -1.470e-01, 7.433e-02, 3.404e-01, 1.020e-01, -8.353e-02, 1.793e-01, -1.368e-01, 6.375e-02, 5.993e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(1.596e-02, 3.589e-02, 1.177e-02, 1.541e-01, -1.159e-01, -1.621e-02, 2.451e-01, 2.767e-01, -3.754e-04, 4.995e-02, -6.760e-02, -9.945e-02, 4.017e-01, 4.413e-02, 2.189e-02, 4.126e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(1.635e-01, -1.853e-01, -1.823e-01, -1.003e-01, -4.884e-02, 1.686e-01, 7.826e-02, 5.419e-01, -1.017e-01, 7.007e-02, 2.084e-01, 2.030e-01, 5.150e-01, -1.861e-01, -3.037e-01, -3.846e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(1.162e-01, 9.675e-02, -9.807e-02, 7.794e-02, 1.154e-01, 7.680e-02, 7.823e-02, 1.665e-01, 1.414e-01, 4.509e-02, -1.327e-02, 1.752e-01, -2.721e-01, -9.636e-04, 2.198e-02, -9.405e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-3.554e-02, 7.673e-02, -1.735e-02, 3.910e-02, -9.934e-02, 1.798e-01, -4.244e-02, -2.008e-02, -1.586e-01, 7.918e-02, 6.812e-02, 1.784e-01, -2.173e-01, 8.736e-02, -3.130e-02, -1.487e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(1.142e-01, 2.330e-02, -7.096e-03, 5.291e-02, -3.702e-01, 2.102e-01, 7.156e-02, -1.416e-01, 1.017e-01, 3.888e-01, -5.335e-02, 9.686e-02, -1.093e-01, -1.631e-02, -2.884e-03, -4.091e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(4.795e-02, 4.423e-03, 1.494e-02, 2.666e-02, 1.261e-01, -7.251e-02, 2.103e-02, 1.095e-01, 2.166e-01, -1.249e-01, 8.981e-03, 1.792e-01, -3.697e-02, 6.864e-03, -1.141e-02, 2.430e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-1.206e-01, 1.584e-03, -1.789e-02, -1.335e-02, 2.398e-01, 8.681e-01, -1.241e-01, -4.454e-02, -7.396e-02, 1.759e-02, -9.138e-02, 1.573e-01, -2.025e-01, 8.569e-02, 2.132e-02, 9.791e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-4.834e-02, -7.974e-01, 2.858e-01, -2.441e-01, 4.163e-01, -1.650e-01, -1.897e-01, 1.309e-01, 4.031e-02, -8.242e-02, 3.338e-01, 3.567e-01, -1.532e-01, 2.807e-01, -7.324e-02, 5.093e-03), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.538e-01, 9.244e-02, -7.570e-02, -4.333e-02, -1.407e-01, -4.201e-02, -4.186e-02, -1.603e-01, -2.031e-01, 6.309e-02, -8.191e-02, 9.121e-02, -8.138e-02, -4.037e-02, 3.793e-02, 4.240e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(1.780e-01, 1.059e-01, -5.233e-03, 1.087e-01, 1.808e-01, -1.409e-01, 1.162e-02, -1.312e-01, 6.866e-02, 1.401e-02, 6.420e-02, 5.614e-02, -6.830e-02, 1.731e-02, 5.889e-02, 2.257e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(2.057e-01, -2.093e-02, -1.741e-01, 9.891e-02, -3.673e-02, 3.314e-02, -2.223e-01, -3.177e-01, 2.374e-01, -5.871e-02, -5.086e-02, -9.418e-02, -1.935e-02, -1.902e-02, -1.255e-01, -2.744e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(1.654e-01, 7.328e-02, 2.874e-02, 1.256e-01, -2.608e-01, 1.926e-03, 4.500e-02, -7.882e-02, -1.035e-02, -3.478e-02, -1.061e-01, -8.474e-02, -2.438e-01, -6.889e-02, -7.579e-02, -1.871e-01), r);
|
||||||
|
r = MulAdd(s2_0, M4(6.493e-02, 1.357e-01, -6.197e-02, -5.055e-02, 2.568e-01, -5.699e-02, -1.266e-01, -1.411e-02, 2.936e-02, -5.234e-02, -5.882e-03, -8.014e-02, -5.334e-02, -8.555e-02, 5.632e-02, 8.296e-03), r);
|
||||||
|
r = MulAdd(s2_1, M4(-3.582e-01, 2.351e-01, -1.636e-01, 2.172e-01, -1.840e-01, 9.838e-02, -7.565e-02, 1.535e-01, 8.151e-02, 3.002e-02, 1.149e-01, 1.180e-01, 1.323e-01, -7.682e-03, 5.013e-02, -2.190e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(-1.957e-01, -5.823e-02, -1.131e-01, -7.025e-02, 3.355e-01, 1.378e-01, -2.046e-01, 2.575e-01, 1.663e-01, 2.567e-02, -3.703e-02, -9.489e-02, -6.431e-02, -6.700e-02, 9.598e-02, 4.460e-03), r);
|
||||||
|
r = MulAdd(s2_3, M4(-1.522e-01, 1.335e-01, -2.140e-01, 3.368e-02, -5.076e-02, 2.412e-01, 6.141e-03, 2.456e-02, -9.105e-03, 1.014e-02, -1.056e-02, 1.368e-01, 8.030e-02, -2.874e-02, -7.499e-02, -2.675e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(2.115e-02, -6.849e-02, -8.528e-02, -3.270e-01, 2.112e-02, 7.309e-02, -3.852e-02, 2.604e-01, 1.772e-01, 4.115e-01, -2.443e-01, 3.100e-01, 3.139e-01, 3.829e-01, -2.701e-01, 1.463e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(2.664e-03, 4.352e-02, -2.378e-01, 5.316e-02, -1.369e-01, -1.293e-01, 1.587e-01, 2.153e-01, 3.820e-01, -1.515e-01, -4.429e-02, 2.391e-01, -3.720e-01, -1.154e-01, -1.196e-01, 3.172e-01), r);
|
||||||
|
r = MulAdd(s2_6, M4(-3.174e-01, -2.340e-01, 1.286e-01, -1.076e-01, 5.834e-02, 6.138e-02, -6.854e-03, 5.658e-02, 5.314e-02, -1.751e-02, 9.115e-03, 8.328e-03, 8.394e-03, 2.608e-02, 1.125e-01, 1.593e-01), r);
|
||||||
|
r = MulAdd(s2_7, M4(-6.600e-01, 1.899e-01, 1.094e-01, 1.665e-02, 1.089e-01, -1.034e-01, -1.811e-01, -3.040e-01, 4.782e-01, 3.160e-02, -4.648e-02, 1.286e-01, 1.070e-01, -1.022e-01, 5.693e-02, -5.195e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(3.748e-03, -4.142e-02, -7.021e-02, -2.596e-01, -2.444e-01, -6.341e-05, 4.125e-02, -7.382e-02, 4.456e-02, 3.144e-02, -5.055e-02, -1.724e-01, -1.835e-01, 4.462e-02, -1.398e-01, -2.631e-02), r);
|
||||||
|
r = MulAdd(s3_0, M4(-1.892e-01, -2.298e-01, 7.045e-02, -6.423e-02, 7.789e-02, -9.540e-02, -3.161e-02, -5.171e-02, -3.656e-02, -6.148e-02, -1.413e-02, -8.995e-02, 2.536e-02, 1.995e-03, 3.317e-02, 1.918e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(1.245e-02, -4.971e-03, 1.026e-02, -7.525e-02, -2.233e-01, -4.502e-01, -4.530e-03, -1.802e-01, -1.799e-01, 1.915e-02, 1.043e-02, 4.008e-02, 1.524e-01, 1.881e-03, -7.387e-02, 1.566e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(-1.750e-01, 3.216e-03, -1.033e-03, -7.055e-02, -1.263e-01, 1.586e-01, 2.603e-02, -1.282e-01, 5.606e-02, -1.498e-02, -3.338e-02, -8.978e-03, -2.218e-02, -5.852e-02, -3.208e-03, -1.352e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(-9.577e-02, -8.859e-02, 7.921e-02, -1.569e-02, -7.962e-02, 2.890e-02, 4.107e-02, -5.870e-02, 2.510e-02, 1.765e-02, 4.458e-02, 1.891e-02, 7.541e-02, 3.492e-02, 3.160e-02, 1.201e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(-6.228e-02, 9.576e-02, -1.743e-01, -1.935e-01, 2.054e-01, 1.479e-01, 8.056e-04, 3.321e-02, -1.362e-01, 5.003e-01, 9.071e-02, 8.153e-02, 2.283e-01, -3.484e-01, 4.509e-02, -4.658e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(2.528e-01, -9.286e-04, -2.468e-02, 1.338e-01, 4.431e-02, 3.503e-02, 1.304e-01, 1.652e-01, 4.628e-01, -2.670e-01, 1.880e-01, 1.516e-01, -1.538e-01, 1.379e-01, -3.334e-02, 2.977e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(1.385e-01, -6.592e-02, -1.225e-01, -1.381e-01, -4.498e-02, -6.343e-03, 4.811e-02, 9.639e-02, 1.635e-02, -3.467e-02, 3.640e-03, -3.186e-02, 6.265e-02, 2.282e-01, 9.661e-02, 1.295e-01), r);
|
||||||
|
r = MulAdd(s3_7, M4(-3.053e-03, 7.999e-02, 2.407e-01, 2.655e-01, -3.969e-01, -9.502e-03, 1.900e-02, 9.557e-02, -6.199e-02, -3.574e-02, 8.350e-02, -7.837e-02, -1.442e-02, -5.281e-03, 4.503e-01, 4.026e-01), r);
|
||||||
|
r = MulAdd(s3_8, M4(-1.313e-01, 4.424e-02, -1.155e-02, 6.769e-02, 2.192e-02, 6.721e-02, 5.694e-03, 7.376e-02, -2.155e-01, -7.512e-02, 6.252e-03, -3.428e-01, 3.324e-01, 2.784e-03, -5.606e-02, 2.108e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass3 entry point: evaluates this pass's f0 and f1 for one pixel and stores the results in
// t0 and t1. The l0/l1 macros defined for this pass fetch from t2 and t3 respectively.
// blockStart is added to Rmp8x8(tid.x) to obtain the pixel coordinate handled by this thread;
// only tid.x is read from tid.
void Pass3(uint2 blockStart, uint3 tid) {
|
||||||
|
// NOTE(review): pt and pos are never named again in this body; presumably the O() sampling
// macro behind l0/l1 picks them up by name -- confirm before renaming or removing them.
float2 pt = float2(GetInputPt());
|
||||||
|
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Threads whose pixel lies outside GetInputSize() write nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 fetches at offsets (-1..1, -1..1), row by row; index 4 is offset (0, 0).
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_* keeps the non-positive part of each fetch. It must be taken here, while s0_* still
// holds the raw value, because s0_* is clamped in place right after.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_* is reduced to its non-negative part.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Same fetch-and-split sequence for l1: s2_* ends up non-negative, s3_* non-positive.
V4 s2_0 = l1(-1.0, -1.0);
|
||||||
|
V4 s2_1 = l1(0.0, -1.0);
|
||||||
|
V4 s2_2 = l1(1.0, -1.0);
|
||||||
|
V4 s2_3 = l1(-1.0, 0.0);
|
||||||
|
V4 s2_4 = l1(0.0, 0.0);
|
||||||
|
V4 s2_5 = l1(1.0, 0.0);
|
||||||
|
V4 s2_6 = l1(-1.0, 1.0);
|
||||||
|
V4 s2_7 = l1(0.0, 1.0);
|
||||||
|
V4 s2_8 = l1(1.0, 1.0);
|
||||||
|
V4 s3_0 = -max(-s2_0, 0.0);
|
||||||
|
V4 s3_1 = -max(-s2_1, 0.0);
|
||||||
|
V4 s3_2 = -max(-s2_2, 0.0);
|
||||||
|
V4 s3_3 = -max(-s2_3, 0.0);
|
||||||
|
V4 s3_4 = -max(-s2_4, 0.0);
|
||||||
|
V4 s3_5 = -max(-s2_5, 0.0);
|
||||||
|
V4 s3_6 = -max(-s2_6, 0.0);
|
||||||
|
V4 s3_7 = -max(-s2_7, 0.0);
|
||||||
|
V4 s3_8 = -max(-s2_8, 0.0);
|
||||||
|
s2_0 = max(s2_0, 0.0);
|
||||||
|
s2_1 = max(s2_1, 0.0);
|
||||||
|
s2_2 = max(s2_2, 0.0);
|
||||||
|
s2_3 = max(s2_3, 0.0);
|
||||||
|
s2_4 = max(s2_4, 0.0);
|
||||||
|
s2_5 = max(s2_5, 0.0);
|
||||||
|
s2_6 = max(s2_6, 0.0);
|
||||||
|
s2_7 = max(s2_7, 0.0);
|
||||||
|
s2_8 = max(s2_8, 0.0);
|
||||||
|
|
||||||
|
// All 36 vectors feed both functions; f0's result goes to t0 and f1's to t1 at this pixel.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0, t1
|
||||||
|
//!OUT t2, t3
|
||||||
|
|
||||||
|
// Fetch helpers for this pass: l0 reads t0 and l1 reads t1, both through O() with the
// offset float2(x, y), converting the result to V4.
// NOTE(review): O and V4 are defined outside this view; the previous pass defines the same
// macro names, so presumably each pass is compiled on its own -- confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for PASS 4 ("conv3"): starts from a constant V4 and folds in one MulAdd term per argument,
// in the order s0_0..s0_8, s1_0..s1_8, s2_0..s2_8, s3_0..s3_8, each with its own M4 constant.
// Returns the accumulated V4.
// NOTE(review): V4, M4 and MulAdd are defined outside this view, as is the caller. The initial
// value is presumably a bias and each M4 a 4x4 weight block of a trained 3x3 convolution --
// confirm. The constants look machine-generated, so regenerate them rather than editing by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Accumulator seed; every following line adds one term to it.
V4 r = { 8.385e-03, 1.035e-02, -6.465e-04, -6.502e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(8.947e-02, -1.234e-01, -3.169e-02, -9.158e-02, -1.406e-01, 6.941e-02, -1.367e-02, -1.406e-02, 9.073e-02, 5.642e-01, -2.007e-02, 9.725e-02, 7.122e-03, -1.956e-03, 6.532e-03, -5.457e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.130e-01, -4.645e-02, 3.624e-02, 3.391e-02, 3.882e-01, 2.453e-01, -2.237e-01, -2.271e-01, 2.803e-01, 1.718e-01, 3.255e-02, -2.046e-01, 1.441e-01, -1.880e-03, 2.335e-02, -1.232e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(2.016e-01, 1.243e-01, -3.895e-02, -1.135e-01, -2.167e-02, 1.465e-02, -7.776e-02, -1.213e-01, -7.195e-03, 4.404e-03, 6.598e-02, -5.135e-02, -2.062e-01, -3.725e-02, -8.296e-03, 8.739e-03), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.068e-02, -3.876e-02, 5.737e-02, 9.886e-02, -9.663e-02, -2.569e-01, 6.761e-02, -1.454e-01, 4.660e-02, 7.810e-01, -2.254e-01, 1.899e-01, -9.628e-02, 8.080e-02, -1.093e-02, 1.451e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.133e-01, 2.759e-01, -9.917e-02, -3.134e-01, 1.137e-01, -5.446e-01, -2.044e-03, -5.215e-01, -6.867e-02, 5.254e-01, -1.466e-01, -3.048e-01, 3.408e-01, 5.791e-01, -2.594e-01, -4.879e-04), r);
|
||||||
|
r = MulAdd(s0_5, M4(6.871e-02, -1.221e-01, -5.702e-02, -2.731e-02, 6.025e-01, 1.350e-01, -3.119e-01, -4.130e-01, 2.091e-01, 1.003e-01, 4.509e-02, -1.541e-01, 1.151e-01, -1.558e-01, 6.309e-03, -2.192e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(2.139e-02, 1.540e-02, -9.451e-02, 8.898e-02, 1.983e-02, -1.259e-01, 2.162e-01, -9.477e-02, -2.253e-01, -1.456e-01, -2.432e-02, 9.649e-02, 2.147e-02, -9.523e-02, 2.042e-02, -7.790e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-3.105e-03, 1.944e-01, -1.808e-01, -3.058e-02, 4.007e-01, 5.645e-01, -2.452e-01, -7.366e-02, 1.279e-02, 3.212e-02, -1.573e-01, -1.267e-01, 1.613e-02, -1.976e-01, -1.519e-01, -2.687e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.906e-04, 8.306e-02, 2.480e-02, 1.696e-02, 1.275e-01, 1.372e-01, 1.205e-01, 1.120e-02, 1.424e-02, -1.526e-01, -6.629e-02, -9.104e-02, 2.042e-02, -1.167e-01, 1.050e-01, 1.560e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(-2.398e-02, -1.009e-01, 2.671e-02, -8.841e-02, -7.277e-03, -4.411e-02, -1.240e-02, -5.367e-04, -1.223e-01, -7.251e-02, 4.941e-02, 7.545e-02, 6.688e-02, 1.727e-02, -1.144e-02, -7.713e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.507e-01, -3.095e-01, 5.017e-02, -1.145e-01, 3.430e-02, -2.241e-01, -9.050e-02, -8.470e-02, -8.624e-02, -1.021e-02, -1.620e-02, 3.932e-03, 7.775e-02, -2.376e-02, 6.270e-02, -7.896e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(3.578e-02, -3.242e-02, 9.400e-03, -2.998e-02, -1.545e-02, -1.481e-01, -6.667e-02, 3.496e-02, 6.722e-02, 7.676e-04, -8.215e-04, 2.142e-03, 4.007e-02, 9.690e-02, -1.652e-03, 3.858e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(6.321e-02, -1.472e-01, 6.571e-02, -1.929e-01, -7.340e-02, -8.067e-02, 1.715e-02, 2.182e-02, -8.623e-02, -2.195e-01, -6.101e-02, 8.246e-02, -4.908e-02, -3.293e-02, -7.341e-02, -1.941e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(5.609e-01, 5.581e-01, -1.143e-01, -1.052e-01, 2.477e-01, 2.387e-01, 1.272e-01, 3.284e-03, -3.135e-01, 8.385e-02, -7.393e-02, -2.270e-01, 4.403e-01, -1.179e-01, -1.620e-01, 2.978e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-3.015e-02, 1.055e-01, 1.072e-01, 1.177e-01, 3.838e-01, 3.206e-02, -4.556e-03, -5.072e-02, 4.250e-02, -1.665e-02, -1.759e-02, 2.822e-02, -2.408e-01, -2.204e-02, -3.440e-02, 6.520e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(9.180e-04, 3.395e-02, -1.211e-02, -5.605e-03, -7.356e-03, -2.439e-02, -2.498e-02, -6.361e-04, -5.167e-02, -1.009e-02, 7.202e-02, 3.652e-02, 3.036e-03, -7.672e-03, -2.822e-02, -9.942e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-7.041e-02, -2.366e-01, -1.556e-01, 1.499e-01, -2.674e-02, 6.601e-03, -1.490e-01, 1.329e-02, -1.127e-01, 8.363e-03, -1.333e-01, 1.038e-02, -1.219e-02, -1.366e-01, 8.814e-02, 4.260e-03), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.397e-02, 2.863e-02, 5.459e-03, -1.166e-02, -1.201e-02, 1.346e-01, 5.461e-02, 1.584e-02, -8.155e-02, 8.451e-03, -3.444e-02, 3.920e-02, 2.082e-02, -4.174e-02, 6.205e-02, 5.646e-02), r);
|
||||||
|
r = MulAdd(s2_0, M4(5.465e-02, 7.303e-02, 1.200e-01, 8.938e-03, -8.960e-02, -2.248e-01, -1.073e-02, 6.882e-02, 4.637e-02, -1.215e-01, -2.319e-02, -2.049e-01, -8.235e-02, -2.689e-02, 8.521e-02, 2.612e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(-1.284e-01, -8.509e-02, 6.859e-02, 2.538e-02, -7.401e-02, 2.860e-01, -2.240e-01, 1.754e-01, -2.073e-01, -9.333e-02, -9.310e-02, -3.311e-01, 2.251e-01, 1.948e-01, -1.091e-01, 2.448e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(4.550e-03, 2.884e-02, -1.023e-02, -1.793e-02, 1.472e-01, 1.728e-02, -5.533e-02, -4.606e-02, -1.128e-01, 1.845e-01, -9.297e-02, 7.245e-02, 2.303e-02, -1.293e-01, -2.277e-02, -1.523e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(5.703e-02, 4.629e-03, -7.495e-02, -7.220e-02, -1.245e-01, 1.142e-01, -1.688e-03, -9.906e-03, 9.714e-02, -2.851e-02, 7.069e-03, -3.250e-01, -5.029e-03, -1.421e-01, -4.162e-02, 1.032e-01), r);
|
||||||
|
r = MulAdd(s2_4, M4(5.200e-02, -3.414e-02, -3.809e-02, -9.742e-02, 8.686e-01, 1.140e+00, 2.062e-01, 8.598e-02, 4.073e-01, -3.313e-01, 2.673e-01, 1.050e-01, -9.355e-02, 1.764e-01, 8.423e-02, 1.156e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(-5.260e-03, 8.804e-02, 3.636e-02, 3.074e-03, 1.724e-01, 2.433e-01, -1.126e-02, -2.652e-01, -1.229e-01, 3.135e-02, 1.187e-02, -6.661e-02, -1.872e-02, -6.508e-02, -7.109e-02, 1.141e-01), r);
|
||||||
|
r = MulAdd(s2_6, M4(6.180e-03, 2.059e-03, -1.768e-02, 4.877e-03, -7.838e-02, 1.366e-01, -7.231e-02, -2.826e-02, 6.251e-02, 7.375e-02, 2.531e-02, 2.038e-02, -4.462e-03, -4.896e-02, -4.376e-02, -7.998e-03), r);
|
||||||
|
r = MulAdd(s2_7, M4(1.011e-01, 8.753e-02, -5.554e-02, 6.949e-04, 4.137e-02, 2.710e-01, -3.203e-01, 6.752e-02, 9.720e-02, 3.447e-02, -5.777e-02, -1.723e-02, -9.154e-03, 5.461e-02, 1.248e-01, -3.906e-04), r);
|
||||||
|
r = MulAdd(s2_8, M4(4.126e-02, 3.442e-02, 9.763e-03, -4.560e-02, -4.233e-04, -1.519e-01, 2.421e-02, -4.043e-02, -1.281e-02, 1.166e-02, 2.489e-04, -3.061e-02, -4.476e-02, 4.493e-03, -4.164e-02, 9.694e-03), r);
|
||||||
|
r = MulAdd(s3_0, M4(-1.352e-01, -1.938e-01, 7.285e-02, -4.706e-02, 1.920e-02, 1.891e-02, 1.233e-02, 3.876e-02, 1.342e-02, 2.020e-01, 3.292e-02, 2.778e-02, -5.017e-02, 3.560e-02, 7.028e-02, 7.562e-03), r);
|
||||||
|
r = MulAdd(s3_1, M4(3.014e-01, 1.243e-01, -2.656e-02, -9.796e-02, 1.585e-01, 2.259e-01, -6.651e-02, 4.080e-02, 1.902e-01, 2.705e-01, -9.774e-02, -1.144e-02, -4.653e-01, -3.536e-01, 2.515e-02, 9.628e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(-7.724e-02, 1.181e-01, 2.182e-02, 1.999e-02, -7.114e-02, -4.414e-02, -5.748e-06, -8.931e-03, 4.985e-03, 6.360e-02, 4.422e-02, 6.005e-02, 1.335e-01, -8.144e-03, -3.979e-02, 6.952e-03), r);
|
||||||
|
r = MulAdd(s3_3, M4(-1.826e-03, 2.390e-02, 4.665e-03, -3.357e-02, 2.088e-02, 1.436e-01, -2.474e-02, 1.100e-02, 2.727e-02, -1.649e-02, -9.539e-02, -1.112e-01, -2.427e-02, 1.811e-01, -4.267e-02, 1.060e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(9.873e-02, -1.417e-01, -1.365e-01, -3.187e-01, -7.583e-03, 3.047e-01, -2.480e-02, 2.623e-01, -3.193e-01, -1.539e-01, -2.986e-01, 2.350e-01, 4.367e-01, 2.441e-01, -3.426e-01, -5.108e-02), r);
|
||||||
|
r = MulAdd(s3_5, M4(-8.384e-02, 1.343e-01, 1.653e-01, 7.978e-02, 6.329e-02, 7.040e-02, 2.203e-02, -2.280e-01, 2.531e-02, -9.408e-02, -5.137e-02, -1.717e-01, -1.577e-01, 4.030e-02, -2.802e-01, 1.155e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(1.895e-02, 1.436e-01, 3.568e-04, -6.075e-02, 3.213e-02, -3.462e-02, -2.821e-02, -8.374e-03, 3.451e-02, -2.349e-02, 9.517e-03, 2.092e-02, -8.229e-02, 6.530e-02, 6.116e-03, -2.414e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(1.530e-01, 2.073e-01, -7.258e-02, -6.975e-02, -6.610e-03, -3.885e-02, -5.636e-03, 1.227e-01, 8.913e-02, 4.336e-02, -1.931e-03, -3.869e-02, -2.019e-02, -1.340e-02, -1.506e-02, 1.591e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(2.536e-02, -3.220e-02, 6.413e-02, -1.835e-02, -9.124e-02, -8.098e-02, -5.479e-02, -1.361e-02, -3.146e-03, 1.204e-01, -4.020e-02, -6.924e-02, -1.030e-01, -1.301e-01, 1.634e-02, 1.029e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second output vector of this pass. Starts from a constant V4 and folds in one
// MulAdd term per argument (36 in total), each paired with its own 16-value M4
// constant. Pass4 stores the returned value in t3.
// In Pass4 the arguments are: s0_k = max(tap, 0) and s1_k = -max(-tap, 0) for the
// nine l0 taps, s2_k / s3_k the same split for the nine l1 taps (k = 0..8).
// MulAdd and M4 are defined outside this chunk; presumably MulAdd(v, m, acc)
// returns acc + v * m -- confirm against their definitions.
// NOTE(review): the literals look like generated (trained) weights; do not hand-edit.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Constant starting value of the accumulator.
V4 r = { -2.994e-04, -3.163e-05, 4.528e-03, -1.285e-02 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(-5.222e-02, -3.069e-02, 2.456e-03, -1.117e-02, 4.933e-02, 5.166e-02, -6.284e-03, -9.151e-02, 1.439e-02, 1.755e-02, -8.848e-02, -9.796e-02, -2.835e-02, 3.699e-02, 2.912e-02, 4.373e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(6.333e-03, -2.767e-02, -7.247e-02, 8.441e-02, -3.433e-02, -4.699e-02, -1.193e-02, -1.729e-01, 2.481e-02, -4.121e-02, -2.861e-01, -1.202e-02, 2.687e-02, -1.313e-01, 1.747e-02, -9.108e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.309e-02, -1.968e-02, -1.246e-01, -3.915e-02, -1.159e-01, -6.491e-03, 3.316e-01, -6.851e-02, -2.940e-02, -1.787e-02, -5.850e-03, -6.207e-02, 5.272e-02, 9.800e-02, 4.709e-02, 7.491e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.127e-01, -3.748e-02, -1.091e-01, 1.788e-01, -7.982e-02, -7.528e-02, 1.898e-01, -1.355e-01, -1.568e-01, 9.648e-02, 2.337e-01, -9.666e-02, -7.316e-02, 2.915e-02, 2.259e-02, -1.310e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(1.689e-02, -1.028e-01, 1.304e-01, -6.012e-02, -8.030e-02, -1.823e-01, 4.179e-01, -3.553e-01, 9.095e-04, 9.972e-02, 3.227e-01, -4.967e-02, -2.329e-01, 1.272e-01, 4.332e-01, -8.456e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(1.815e-02, -5.743e-02, 7.236e-02, -8.782e-02, 1.161e-01, 2.258e-01, 7.053e-01, -2.993e-01, 6.605e-02, -2.666e-03, -4.733e-02, -1.087e-01, -1.101e-01, 1.554e-01, 1.656e-01, 2.530e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-7.750e-02, -6.619e-02, 2.202e-02, 4.186e-02, -1.519e-01, -8.918e-03, -1.919e-01, -7.085e-02, -1.356e-01, -1.363e-01, 1.782e-01, -1.499e-02, 9.670e-02, 1.450e-03, 5.675e-02, -3.337e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-9.267e-02, 1.661e-01, 1.306e-01, -2.387e-01, -2.261e-02, 2.870e-01, -2.711e-01, 6.281e-02, 2.181e-02, 1.010e-01, 2.979e-01, -9.254e-02, 1.307e-01, -2.024e-02, 2.013e-01, -1.862e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-7.233e-02, 8.276e-02, 1.279e-02, -3.778e-02, -3.737e-01, -2.422e-01, -1.352e-01, -1.631e-01, 6.518e-02, 2.511e-01, 1.588e-01, -3.599e-02, 8.821e-02, 3.757e-02, -1.340e-01, 1.006e-01), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(1.034e-02, 8.194e-02, 9.844e-02, -1.052e-01, 4.683e-03, 4.432e-03, 8.420e-03, 7.511e-03, 7.210e-02, -8.697e-03, -9.834e-02, 1.366e-01, 3.221e-04, 1.836e-02, 1.307e-02, -6.823e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-7.232e-02, 1.103e-01, 2.975e-01, 4.747e-02, -1.075e-01, -6.863e-02, 2.378e-01, -2.994e-02, 6.426e-02, 2.459e-02, -1.361e-01, 4.394e-02, 4.558e-02, -5.684e-02, -3.386e-02, 8.075e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-1.568e-02, 6.463e-02, 4.001e-02, 3.549e-02, -3.385e-02, -1.547e-02, 2.510e-01, 3.198e-02, 2.533e-02, -6.612e-02, -5.453e-02, 1.387e-03, 3.071e-02, -5.115e-03, -9.345e-02, 1.790e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(1.723e-01, 2.119e-02, -3.394e-01, -1.101e-01, 7.882e-03, -4.188e-02, -6.882e-02, 5.060e-02, 4.902e-02, 2.919e-02, 7.773e-02, 1.080e-01, 8.944e-02, -2.819e-02, -1.252e-02, -2.744e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.682e-01, 8.840e-03, -3.974e-01, 2.436e-01, 1.156e-02, 3.806e-04, -5.090e-01, -1.339e-02, 1.677e-02, -1.337e-01, -1.050e-01, 2.647e-01, -1.971e-01, -1.145e-02, 1.471e-01, -7.814e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(-5.376e-02, 2.321e-02, -1.908e-01, -1.538e-01, 5.032e-03, 2.979e-02, -3.934e-02, -1.754e-01, 3.674e-02, 8.713e-03, -7.429e-02, -2.768e-03, -1.878e-01, -1.382e-01, 1.114e-01, 4.843e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(4.390e-03, 1.082e-02, 6.300e-03, -2.220e-02, -1.578e-02, -3.883e-02, 6.290e-02, 5.752e-03, 9.478e-02, 5.108e-03, 6.174e-02, 8.270e-02, -5.128e-02, -3.664e-02, 3.095e-02, -1.575e-01), r);
|
||||||
|
r = MulAdd(s1_7, M4(2.131e-01, 8.669e-03, 8.288e-02, 1.767e-01, -8.764e-02, -6.440e-03, 1.179e-01, -9.407e-02, -1.114e-01, -1.384e-01, 7.349e-02, 2.379e-02, 6.264e-02, -6.347e-02, -1.973e-01, 3.150e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(6.920e-02, 2.737e-01, 5.444e-02, -1.065e-01, -8.435e-02, 1.268e-01, -7.219e-03, -4.022e-02, -3.687e-02, -3.873e-02, 5.773e-02, 1.171e-02, 5.552e-02, -2.870e-02, -4.903e-02, 2.162e-02), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(-6.811e-02, 3.915e-02, -1.970e-02, 5.496e-02, -3.225e-02, -5.284e-02, -3.737e-03, -1.864e-03, -1.361e-01, -7.308e-02, -4.948e-02, -1.634e-01, 5.283e-02, 1.746e-02, -8.374e-02, 7.123e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(4.868e-03, 7.851e-02, 1.067e-01, 5.576e-02, 1.276e-01, -7.837e-02, -2.875e-01, 3.754e-02, -1.315e-01, -9.095e-02, 8.041e-02, -1.156e-01, 1.309e-02, 1.086e-01, -1.335e-01, 9.059e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(-1.092e-02, 1.501e-01, -3.542e-02, 2.500e-02, 1.500e-02, -1.832e-01, -3.447e-01, -2.562e-02, -1.110e-01, 1.362e-01, 1.634e-01, -5.146e-02, -1.184e-02, -1.154e-01, 4.862e-02, 1.344e-03), r);
|
||||||
|
r = MulAdd(s2_3, M4(3.103e-02, -2.009e-02, 2.266e-02, 5.094e-02, 5.909e-01, 1.844e-01, -3.418e-02, -1.460e-01, 1.218e-02, -3.631e-02, -2.582e-01, -2.230e-01, 9.666e-02, -6.432e-02, 7.267e-02, 7.577e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(8.062e-02, -3.981e-02, -3.232e-02, -1.032e-01, -9.859e-02, 6.539e-01, 5.533e-01, -1.046e-02, -5.348e-01, 1.009e-02, -3.879e-01, 1.190e-01, -1.151e-01, 1.835e-01, -7.797e-02, 1.418e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(-1.404e-02, -1.730e-01, -4.516e-02, -2.158e-02, 2.544e-01, 4.463e-01, 1.404e-01, -6.854e-02, -9.712e-02, -4.920e-01, -2.485e-02, -6.416e-02, 3.612e-02, 2.451e-01, 2.327e-02, -1.251e-03), r);
|
||||||
|
r = MulAdd(s2_6, M4(6.507e-02, -2.267e-02, -7.660e-02, 3.043e-02, 3.541e-01, 2.804e-01, 2.783e-01, -2.580e-01, -1.185e-01, 8.028e-02, -1.395e-01, -4.988e-03, 4.702e-02, -5.327e-02, 4.580e-02, 3.130e-03), r);
|
||||||
|
r = MulAdd(s2_7, M4(9.806e-02, 6.990e-02, -4.317e-02, -2.415e-02, -2.263e-01, -1.723e-01, 2.669e-02, -3.393e-01, 9.368e-02, -6.775e-02, -1.883e-01, -8.601e-02, -2.278e-01, 1.612e-01, 1.625e-01, 8.821e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(-1.921e-02, 1.119e-01, 3.717e-02, -2.554e-02, 2.852e-02, 8.987e-02, 1.246e-01, 6.463e-03, 2.548e-02, -2.950e-02, 7.289e-02, 1.802e-02, 2.576e-02, 5.798e-02, 6.021e-02, -5.030e-03), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(-1.023e-01, -3.759e-02, -2.437e-02, 1.032e-01, -2.143e-02, -4.189e-02, -6.139e-02, 9.887e-02, -9.094e-03, 3.087e-02, -1.056e-01, 1.376e-01, 1.702e-02, 3.138e-02, -1.243e-01, -5.115e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(3.439e-02, -1.018e-01, -3.260e-01, 6.226e-02, 3.794e-02, -6.747e-02, -1.743e-01, -9.149e-02, 6.116e-02, -3.539e-02, -3.971e-01, -2.458e-02, -1.436e-01, 4.323e-02, 5.595e-01, 1.160e-01), r);
|
||||||
|
r = MulAdd(s3_2, M4(-7.596e-02, -9.502e-02, -1.112e-02, -7.256e-02, -1.625e-02, -1.013e-01, -7.450e-02, 2.969e-03, -1.481e-02, -1.199e-01, -8.230e-02, 2.952e-02, -3.199e-02, 8.852e-02, -1.541e-02, 1.722e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(2.768e-03, -9.600e-02, 1.333e-01, -1.174e-01, -7.190e-02, 1.265e-02, 8.135e-02, -6.909e-03, 9.249e-02, -2.800e-02, 2.029e-01, -1.212e-02, 9.955e-02, -2.791e-02, -1.172e-01, 2.079e-01), r);
|
||||||
|
r = MulAdd(s3_4, M4(-1.948e-01, -1.936e-01, 5.127e-01, -7.970e-02, -1.135e-01, 1.060e-01, 1.226e-01, -3.195e-01, -4.980e-01, -5.665e-03, 3.167e-01, -2.413e-01, 2.036e-01, 1.519e-01, 7.793e-04, -1.316e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-8.284e-02, -1.590e-01, 5.041e-03, -2.936e-02, 1.485e-01, 8.341e-02, -3.804e-02, 3.576e-02, 1.499e-01, -8.989e-02, 7.085e-02, -4.898e-02, 1.070e-01, 5.825e-02, 1.863e-01, -9.850e-03), r);
|
||||||
|
r = MulAdd(s3_6, M4(-3.057e-01, 2.794e-02, -7.737e-02, -4.168e-02, 2.696e-02, 1.279e-02, 2.638e-02, 8.177e-02, 1.217e-01, 2.531e-02, -1.188e-01, 1.018e-01, -5.486e-02, -6.606e-03, 1.868e-01, -1.050e-01), r);
|
||||||
|
r = MulAdd(s3_7, M4(-3.018e-01, -1.795e-01, -1.578e-01, -1.809e-01, 1.241e-01, -4.960e-02, -1.067e-01, -1.004e-02, -8.835e-02, 6.620e-02, 1.309e-01, -1.399e-01, 4.651e-02, 4.837e-02, -9.106e-02, 1.670e-01), r);
|
||||||
|
r = MulAdd(s3_8, M4(1.081e-02, -9.947e-02, 1.643e-02, -2.769e-02, 9.803e-02, -8.389e-02, -2.782e-02, -2.689e-02, 3.693e-02, -3.436e-03, 1.229e-02, -2.929e-02, -1.751e-01, -5.859e-03, 1.543e-01, 8.225e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 4 entry point for one thread.
// Maps the thread to a pixel (Rmp8x8(tid.x) + blockStart), reads nine taps from
// each of the l0 and l1 sources, splits every tap into max(x, 0) and
// -max(-x, 0), and writes f0(...) to t2 and f1(...) to t3 at that pixel.
void Pass4(uint2 blockStart, uint3 tid) {
    // NOTE(review): `pt` and `pos` are not referenced by name below; presumably
    // the l0/l1 sampling macros (defined outside this chunk) capture them, so
    // the names must stay as they are -- confirm against the O() definition.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;
    uint2 size = GetInputSize();

    // Nothing to do for threads whose pixel lies outside the input size.
    if (!(gxy.x < size.x && gxy.y < size.y)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // l0 taps: s0_k keeps the non-negative part, s1_k the non-positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);

    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);

    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);

    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);

    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);

    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);

    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // l1 taps: s2_k keeps the non-negative part, s3_k the non-positive part.
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);

    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);

    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);

    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);

    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);

    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);

    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);

    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);

    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Both outputs are computed from the same 36 inputs.
    t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t2, t3
|
||||||
|
//!OUT t0, t1
|
||||||
|
|
||||||
|
// Tap helpers for this pass: l0 reads t2 and l1 reads t3 through O() at the
// given (x, y) offset and converts the result to V4.
// O() is defined outside this chunk; presumably it samples relative to the
// caller's `pos`/`pt` locals -- confirm against its definition.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||||
|
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||||
|
|
||||||
|
// First output vector of this pass. Starts from a constant V4 and folds in one
// MulAdd term per argument (36 in total), each paired with its own 16-value M4
// constant. Pass5 stores the returned value in t0.
// In Pass5 the arguments are: s0_k = max(tap, 0) and s1_k = -max(-tap, 0) for the
// nine l0 (t2) taps, s2_k / s3_k the same split for the nine l1 (t3) taps (k = 0..8).
// MulAdd and M4 are defined outside this chunk; presumably MulAdd(v, m, acc)
// returns acc + v * m -- confirm against their definitions.
// NOTE(review): the literals look like generated (trained) weights; do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Constant starting value of the accumulator.
V4 r = { -3.261e-03, 1.350e-04, -6.605e-05, 1.307e-04 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(5.681e-02, -7.933e-02, -1.161e-02, -3.257e-02, -1.507e-02, 2.248e-02, -1.351e-02, 2.789e-02, -1.713e-01, 9.482e-02, 2.715e-02, 9.506e-02, 1.714e-01, -1.090e-01, -7.237e-02, -1.563e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(-9.622e-03, -7.774e-04, -4.095e-02, 1.106e-02, -3.592e-02, -4.358e-02, 1.983e-02, -1.134e-02, -1.313e-02, -1.086e-01, 1.102e-01, -3.091e-01, 1.982e-01, 1.438e-01, -6.038e-02, 9.579e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-3.893e-02, 1.554e-02, -7.763e-05, 1.610e-02, 3.470e-03, 9.915e-03, -9.881e-03, 5.331e-02, -9.152e-02, 6.899e-02, -3.615e-02, 1.558e-01, -3.300e-02, 4.493e-02, 2.148e-02, -3.677e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.939e-01, -7.700e-02, -1.449e-01, -1.942e-02, 9.649e-02, -3.580e-03, -1.767e-02, 2.394e-02, -1.299e-01, 1.160e-01, 8.000e-02, 9.737e-02, 2.751e-01, -4.435e-01, 1.013e-01, -1.782e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(-2.745e-01, 2.922e-01, -2.008e-01, 1.636e-01, -4.843e-02, 4.172e-01, 3.097e-02, 3.326e-01, -1.798e-02, -3.860e-01, 3.246e-02, 4.225e-01, -1.057e-01, 2.302e-01, -7.879e-02, 4.832e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(-6.834e-04, -3.372e-02, -9.351e-02, 1.547e-02, 5.621e-02, -1.195e-02, -9.402e-03, 6.439e-02, 8.787e-02, 1.499e-02, 1.928e-01, 6.693e-02, 6.516e-02, -1.145e-01, -6.610e-02, 3.986e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(7.682e-02, -9.222e-02, 1.566e-01, -1.438e-02, 5.080e-02, -2.762e-02, -3.121e-02, -1.242e-02, 2.046e-02, -1.131e-02, 4.555e-02, -3.006e-02, 1.125e-01, -7.883e-02, 1.063e-01, 3.027e-03), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.395e-01, 4.847e-02, 1.605e-01, 1.363e-01, 6.243e-02, -1.464e-02, 3.336e-02, -8.862e-02, 3.286e-02, -2.398e-02, -2.326e-02, -8.408e-02, 1.274e-01, -4.997e-02, 1.548e-01, -8.650e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(4.236e-02, 3.116e-02, 7.690e-02, 3.084e-02, 6.290e-03, 1.016e-02, 7.155e-02, -9.786e-02, -1.453e-02, -4.564e-04, -3.654e-02, 7.179e-03, -2.110e-02, -2.766e-02, 1.022e-01, -6.664e-02), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(-2.814e-02, 6.473e-02, 5.209e-02, 6.202e-02, -1.898e-02, 6.061e-02, -1.557e-02, 3.561e-02, 2.137e-01, -1.913e-01, 2.387e-03, -1.470e-01, 4.553e-02, -3.358e-02, 1.936e-03, -4.798e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(4.947e-03, -8.431e-02, -3.362e-03, -1.057e-01, -6.735e-02, 8.463e-03, -4.622e-02, -2.022e-02, -1.450e-01, -1.687e-03, -1.541e-02, -1.116e-02, 4.447e-02, 5.088e-02, -7.198e-03, 3.279e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(1.202e-03, -2.591e-02, -5.357e-03, -3.844e-02, -7.403e-03, 3.771e-02, -6.171e-02, 8.820e-02, 6.744e-03, -4.156e-02, -1.377e-02, 9.398e-02, -2.643e-02, 4.991e-02, -2.000e-02, 1.056e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(3.923e-01, 3.525e-02, -1.294e-01, 1.478e-02, 9.667e-02, 1.289e-01, 8.960e-02, 1.946e-02, 3.128e-01, -3.315e-01, -3.019e-01, 1.021e-01, 2.095e-01, -1.488e-01, -9.439e-02, -9.635e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-3.641e-01, -9.985e-02, -3.482e-01, -2.646e-01, -5.257e-01, 9.475e-01, 1.714e-01, 5.842e-01, -2.199e-01, -6.131e-02, -4.597e-01, 5.556e-01, 7.933e-02, -2.150e-01, -3.469e-01, -1.978e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(7.883e-05, -2.207e-02, -1.735e-02, 2.167e-02, 4.628e-02, 8.814e-02, -4.837e-02, 6.515e-02, 1.617e-01, -4.460e-02, -1.002e-01, 7.496e-02, -1.180e-01, 5.540e-02, -5.708e-02, 5.715e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(1.680e-01, -5.262e-02, 6.143e-02, -4.758e-02, -5.343e-02, 4.332e-02, 1.191e-01, 8.545e-03, 1.171e-01, -8.169e-02, 1.535e-02, -2.281e-01, 8.009e-02, -9.744e-02, 6.114e-02, 8.379e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(-9.744e-02, 2.573e-02, 6.125e-02, 1.265e-01, 9.253e-02, -1.227e-01, 3.224e-01, -2.402e-01, 1.083e-01, 1.607e-02, 1.155e-01, -4.014e-01, -2.347e-02, -3.821e-02, 2.379e-01, 2.605e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(5.428e-02, -5.434e-02, -2.345e-02, -2.189e-03, 1.274e-02, 7.503e-02, 1.442e-01, -8.839e-02, -3.480e-02, 1.444e-02, -3.859e-02, -1.089e-01, -3.183e-02, 9.172e-02, 1.092e-01, 6.688e-02), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(2.283e-01, 3.872e-02, -5.533e-02, -1.704e-02, -1.533e-02, 1.459e-02, 3.842e-02, 6.367e-02, -4.041e-02, -6.411e-03, -5.052e-03, -8.331e-03, 2.786e-03, -5.502e-02, 6.695e-03, -1.982e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(-4.716e-01, 4.092e-01, -1.581e-01, 4.209e-01, 1.255e-01, -7.138e-02, 7.300e-02, -1.357e-01, -6.908e-02, -1.986e-02, 1.801e-02, -4.505e-02, -1.611e-01, -1.216e-01, -6.522e-02, -9.093e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(1.019e-01, -3.650e-02, 1.353e-02, 2.487e-01, -1.344e-04, 4.653e-02, 1.721e-02, 4.005e-02, 7.572e-03, -4.357e-02, -3.720e-02, 2.091e-02, 6.051e-03, -6.957e-02, -9.009e-02, -1.788e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(2.159e-02, -3.325e-02, 3.084e-02, 1.091e-01, -9.662e-02, 1.040e-01, 1.078e-01, -2.572e-02, 2.237e-04, -2.571e-02, -2.335e-02, -1.554e-02, 1.275e-01, -4.579e-02, -1.772e-02, 3.282e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(4.984e-02, 2.302e-01, 6.568e-02, 1.279e-01, 6.857e-02, -1.499e-01, -4.461e-02, -1.977e-01, -1.903e-01, 1.430e-01, 3.271e-02, 1.978e-01, 2.410e-01, 5.980e-01, -1.394e-01, 2.261e-01), r);
|
||||||
|
r = MulAdd(s2_5, M4(2.188e-02, -8.976e-03, 2.475e-02, 1.340e-02, -4.458e-02, 5.360e-02, 2.628e-02, -1.405e-02, 6.166e-02, -4.895e-02, 1.348e-03, 5.680e-02, -1.123e-01, 7.224e-02, -6.458e-02, 1.314e-01), r);
|
||||||
|
r = MulAdd(s2_6, M4(3.252e-02, -2.389e-02, -2.067e-02, -6.871e-02, -8.327e-02, 7.793e-02, 7.681e-03, 5.095e-02, -1.693e-02, -3.622e-02, 3.065e-02, -1.582e-02, -6.963e-03, 2.835e-02, 6.805e-02, -1.475e-02), r);
|
||||||
|
r = MulAdd(s2_7, M4(4.783e-02, -2.945e-02, 4.732e-02, -9.789e-04, -1.619e-02, -2.603e-02, -1.368e-01, 2.956e-02, 9.844e-02, -1.214e-01, 1.776e-01, -1.461e-01, -5.165e-02, -1.055e-02, 1.793e-01, -4.355e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(2.619e-03, 4.801e-02, 6.393e-02, -2.399e-02, -1.280e-03, -2.210e-02, -4.649e-02, 1.561e-03, -1.789e-02, 5.576e-02, 1.200e-01, 3.338e-03, 4.475e-02, -2.957e-02, 9.300e-02, -7.837e-02), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(-1.536e-01, -3.593e-03, -1.064e-02, 1.740e-02, 9.197e-02, 2.772e-01, 5.258e-01, 5.745e-01, 2.331e-02, 8.995e-02, 2.611e-02, 5.463e-02, 4.872e-02, -8.230e-03, -1.742e-02, 3.405e-03), r);
|
||||||
|
r = MulAdd(s3_1, M4(4.799e-02, 1.088e-01, -7.562e-02, 5.926e-02, 4.190e-01, -4.922e-01, -1.822e-01, -2.309e-01, 1.776e-01, 1.799e-01, 1.213e-01, 3.198e-01, -1.565e-01, 2.118e-02, -5.914e-02, 1.048e-01), r);
|
||||||
|
r = MulAdd(s3_2, M4(-6.867e-02, -2.488e-02, 2.563e-02, -3.161e-02, -4.038e-02, 5.042e-02, 2.474e-02, 3.962e-03, -4.263e-02, 4.382e-02, -6.197e-03, 5.435e-02, 8.477e-02, -7.694e-02, -2.473e-02, -2.000e-02), r);
|
||||||
|
r = MulAdd(s3_3, M4(-6.567e-02, 7.271e-02, -2.275e-02, -4.345e-03, -4.825e-02, -7.541e-01, 5.163e-01, 9.170e-01, -1.040e-01, -9.911e-03, 3.569e-02, 2.347e-01, 2.350e-02, 6.202e-02, 7.421e-03, 2.377e-02), r);
|
||||||
|
r = MulAdd(s3_4, M4(-3.371e-02, -2.738e-02, 1.670e-01, 2.607e-01, -5.009e-02, 5.743e-03, -6.991e-01, -2.858e-02, -6.907e-02, -4.016e-01, 3.462e-01, 9.128e-01, -1.622e-01, 1.392e-01, 2.250e-01, 1.183e-01), r);
|
||||||
|
r = MulAdd(s3_5, M4(-8.330e-03, 1.029e-01, 1.045e-01, 2.013e-01, 2.609e-02, 7.939e-02, -1.054e-01, 6.487e-02, 1.165e-01, -6.250e-02, 1.274e-01, 2.396e-01, 2.390e-01, -2.468e-01, 1.178e-02, 6.794e-02), r);
|
||||||
|
r = MulAdd(s3_6, M4(5.411e-02, -5.669e-02, 2.831e-02, -3.762e-02, 1.186e-01, 1.750e-01, -2.862e-01, -9.876e-02, 5.851e-02, 2.750e-02, 7.348e-03, -2.151e-01, -3.151e-02, 5.225e-02, 3.178e-02, 1.438e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(-2.053e-03, 2.875e-02, -4.633e-02, -7.843e-02, -5.216e-02, -1.497e-04, -2.534e-01, -5.098e-01, 3.092e-02, -4.215e-02, -1.330e-01, -9.137e-02, 5.062e-02, 5.514e-02, -1.958e-01, 6.162e-03), r);
|
||||||
|
r = MulAdd(s3_8, M4(3.627e-02, 1.482e-02, 2.228e-02, -7.151e-02, -1.770e-02, 6.009e-02, 2.013e-01, 2.403e-02, 1.912e-03, -9.001e-03, 1.673e-02, -3.465e-02, 5.222e-02, -3.027e-02, -4.458e-03, -6.391e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second output vector of this pass. Starts from a constant V4 and folds in one
// MulAdd term per argument (36 in total), each paired with its own 16-value M4
// constant. Pass5 stores the returned value in t1.
// In Pass5 the arguments are: s0_k = max(tap, 0) and s1_k = -max(-tap, 0) for the
// nine l0 (t2) taps, s2_k / s3_k the same split for the nine l1 (t3) taps (k = 0..8).
// MulAdd and M4 are defined outside this chunk; presumably MulAdd(v, m, acc)
// returns acc + v * m -- confirm against their definitions.
// NOTE(review): the literals look like generated (trained) weights; do not hand-edit.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||||
|
// Constant starting value of the accumulator.
V4 r = { -1.782e-04, -1.204e-03, 6.004e-04, -1.736e-03 };
|
||||||
|
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(-5.528e-02, 2.435e-02, -2.728e-02, 5.042e-02, -2.357e-02, 1.752e-02, 6.730e-02, -1.869e-02, 5.562e-02, 2.108e-03, -2.535e-02, -7.791e-02, -6.984e-02, 8.842e-02, 7.203e-02, 3.709e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-6.164e-02, -1.824e-02, 8.179e-02, -3.238e-02, 5.338e-02, -5.506e-02, -1.020e-01, 1.520e-02, 1.953e-01, -2.850e-02, 8.323e-02, -8.899e-02, 5.112e-02, 6.369e-02, -5.510e-02, 1.997e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(6.117e-02, -1.311e-02, -9.258e-03, -1.479e-02, -2.710e-02, 2.958e-02, 2.946e-02, -9.472e-03, 4.257e-02, -7.053e-02, -5.896e-02, 5.475e-02, 6.131e-02, -1.827e-02, -2.909e-02, -6.470e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.411e-01, 1.597e-01, 2.142e-01, 6.972e-02, 1.704e-02, 4.423e-02, -8.405e-02, 4.993e-02, 1.176e-02, -8.471e-02, 4.062e-02, -1.001e-01, -3.805e-02, 3.820e-02, -6.258e-01, 2.568e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.384e-01, -2.619e-01, 1.799e-01, -3.175e-01, 3.472e-03, -1.186e-01, 7.886e-02, -1.126e-01, 1.378e-01, -3.772e-02, -1.396e-02, 6.889e-02, -1.383e-01, 1.958e-01, 7.297e-02, -1.066e+00), r);
|
||||||
|
r = MulAdd(s0_5, M4(-4.115e-04, 8.733e-03, 3.432e-02, 5.650e-02, 9.203e-02, 6.899e-02, -9.987e-03, 5.139e-02, 2.075e-01, -1.229e-02, 5.912e-02, -2.866e-02, -1.602e-01, 1.654e-01, 6.957e-02, 5.472e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-1.000e-01, 9.401e-02, -3.864e-02, 1.160e-01, 1.108e-03, 8.814e-02, 6.570e-04, 2.167e-02, 6.762e-05, -1.080e-02, -1.670e-02, -4.178e-03, -9.704e-03, 2.164e-01, 3.748e-02, -1.258e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(7.557e-02, -2.360e-01, -2.727e-02, -7.688e-02, -3.110e-02, 1.671e-02, -4.238e-02, 5.553e-02, 6.518e-02, 3.357e-02, -2.725e-02, -2.524e-02, -1.352e-01, -1.005e-01, -4.108e-02, 2.664e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(9.624e-02, 5.754e-03, 8.412e-02, -2.955e-02, 2.850e-02, 8.830e-03, -4.162e-02, -1.337e-02, -4.374e-02, -2.352e-02, -1.566e-02, 1.822e-02, 7.979e-02, -9.058e-02, -1.071e-01, -3.379e-03), r);
|
||||||
|
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(1.395e-02, 1.801e-02, 1.899e-03, -3.313e-02, 2.251e-02, -3.697e-03, 5.577e-02, -3.001e-02, -6.090e-02, 1.645e-01, -1.047e-01, 1.483e-01, -6.634e-03, 3.917e-04, -1.999e-02, 2.114e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.859e-03, 5.455e-02, 4.336e-02, -2.717e-02, 9.302e-02, -9.807e-02, 7.046e-02, -3.707e-02, -1.275e-01, -3.463e-02, -1.160e-01, -4.227e-02, 3.162e-02, 3.583e-02, 4.579e-02, -1.196e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-7.086e-03, 2.542e-03, 1.500e-03, -6.273e-03, 5.711e-02, -5.317e-02, -5.455e-03, 4.847e-02, 8.830e-02, 5.991e-02, 3.356e-02, 1.214e-03, -5.272e-03, -5.211e-02, -2.142e-02, -1.246e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-4.807e-02, 4.530e-02, 2.719e-01, -1.035e-02, 4.911e-02, -5.824e-03, -6.478e-02, -1.051e-03, -1.348e-02, 6.405e-01, -4.257e-01, 3.690e-01, -9.665e-02, 2.101e-01, 6.571e-02, 9.738e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.423e-01, -2.074e-01, -4.394e-01, -2.830e-02, 5.415e-02, -2.337e-01, 6.080e-01, -1.843e-01, -5.128e-01, 1.559e-01, -2.033e-01, -6.040e-02, -6.726e-02, 2.589e-01, 1.901e-01, -9.598e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.456e-01, 6.484e-02, 1.125e-01, -1.183e-02, 2.186e-01, 2.930e-02, -4.285e-02, 6.272e-02, 1.500e-01, 1.033e-01, 2.173e-01, -3.328e-02, -6.785e-02, -7.882e-02, -1.450e-01, 7.182e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-4.062e-02, 9.988e-02, -5.106e-02, 1.546e-01, 5.122e-02, -7.398e-02, -5.320e-03, -5.669e-02, -4.188e-02, 2.035e-01, -5.253e-02, -7.554e-03, -6.233e-02, 1.285e-01, 1.152e-02, 7.495e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.168e-01, -1.061e-01, -8.798e-02, -2.456e-01, -1.274e-01, -9.338e-02, 6.064e-04, 1.255e-01, 2.944e-02, -9.599e-02, -1.606e-01, 1.477e-01, -5.541e-02, -9.992e-02, -5.652e-02, 1.402e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-8.447e-02, -2.272e-02, 3.291e-02, 1.141e-01, 2.835e-01, 2.747e-02, 9.338e-03, -1.271e-01, 1.118e-03, -3.543e-02, -3.201e-02, 5.803e-02, 1.793e-01, -6.889e-02, -3.139e-02, -1.000e-01), r);
|
||||||
|
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(3.477e-02, 8.152e-03, -8.100e-03, 3.869e-02, 4.675e-02, 8.080e-02, -4.909e-02, 6.764e-03, -2.946e-03, -7.021e-02, -1.191e-02, -1.660e-02, -5.967e-02, -1.872e-02, -3.485e-02, 3.391e-02), r);
|
||||||
|
r = MulAdd(s2_1, M4(1.685e-01, -2.681e-01, -2.340e-01, -1.748e-01, -1.593e-01, 7.496e-02, 3.748e-02, 1.562e-02, 5.150e-02, -3.648e-02, 3.739e-02, -4.384e-02, -1.521e-02, -1.061e-01, -1.381e-01, 1.733e-02), r);
|
||||||
|
r = MulAdd(s2_2, M4(1.573e-01, 1.415e-01, 1.714e-01, -5.175e-02, -2.442e-02, 1.054e-02, 3.047e-03, -5.944e-03, -6.027e-03, 1.034e-02, -3.381e-02, 4.299e-02, -9.763e-02, 4.729e-02, 9.642e-02, -1.450e-02), r);
|
||||||
|
r = MulAdd(s2_3, M4(8.191e-03, 1.353e-01, -6.018e-02, 5.677e-02, -1.725e-02, -1.324e-01, 1.646e-01, -1.154e-01, -9.796e-03, 3.066e-02, -5.975e-02, 2.878e-02, -1.381e-01, 1.550e-01, 3.556e-02, 8.926e-02), r);
|
||||||
|
r = MulAdd(s2_4, M4(1.715e-01, -2.115e-02, 8.179e-02, -2.066e-01, 1.275e-01, 1.599e-01, 2.325e-02, -9.637e-03, 6.565e-02, -1.901e-01, 7.185e-02, -1.559e-01, 1.106e-01, -6.210e-02, -3.672e-01, 6.248e-02), r);
|
||||||
|
r = MulAdd(s2_5, M4(-3.453e-03, 5.284e-02, -1.031e-01, 5.091e-02, 1.538e-02, -9.971e-02, -5.610e-02, -2.585e-02, 6.441e-02, 1.113e-01, 3.085e-02, 6.860e-02, -6.167e-02, -6.774e-02, -6.898e-02, -4.397e-03), r);
|
||||||
|
r = MulAdd(s2_6, M4(-1.561e-02, 5.106e-02, 2.999e-03, -7.663e-03, 6.665e-02, -1.217e-01, -9.529e-03, -2.096e-02, -2.825e-02, 4.854e-02, -2.196e-02, -7.191e-03, 2.274e-03, 1.698e-02, -1.727e-02, 1.967e-03), r);
|
||||||
|
r = MulAdd(s2_7, M4(3.534e-02, -1.077e-02, 1.607e-02, 4.542e-02, -7.989e-02, 1.294e-01, 4.920e-02, -6.332e-02, -9.402e-02, 2.028e-02, -6.305e-03, 9.061e-02, 2.225e-03, 2.352e-02, -4.032e-03, -4.985e-02), r);
|
||||||
|
r = MulAdd(s2_8, M4(7.112e-02, -1.427e-02, -2.352e-02, -2.989e-02, -5.633e-02, -6.039e-03, 3.496e-03, 2.535e-02, 1.265e-01, -4.541e-02, -5.393e-02, -5.355e-02, 1.498e-03, 2.057e-02, 1.278e-02, 5.662e-02), r);
|
||||||
|
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(9.523e-02, -7.183e-02, -2.740e-01, -1.569e-02, 1.008e-01, 3.065e+00, -2.003e-01, 1.938e-01, 7.503e-02, -1.096e-01, -3.177e-02, -4.074e-02, 1.090e-03, -2.250e-02, -4.727e-02, 2.528e-02), r);
|
||||||
|
r = MulAdd(s3_1, M4(-7.789e-02, 7.186e-03, 3.838e-01, -1.314e-01, -4.119e-01, 1.344e-01, 5.252e-02, -4.478e-02, -2.421e-01, 8.221e-02, 1.588e-01, 5.943e-02, -6.960e-02, -7.055e-02, -5.857e-02, -2.367e-02), r);
|
||||||
|
r = MulAdd(s3_2, M4(1.578e-01, -5.477e-02, -1.343e-01, 7.698e-02, 9.761e-02, -2.725e-02, -6.329e-02, -5.552e-02, -6.854e-02, 1.143e-02, -8.043e-02, 1.416e-02, 5.387e-02, 1.371e-01, 1.146e-01, -5.881e-04), r);
|
||||||
|
r = MulAdd(s3_3, M4(7.307e-03, -8.177e-02, 5.634e-02, -1.149e-01, -4.060e-01, 1.613e+00, -3.145e-01, 2.057e-02, -9.555e-02, 2.548e-01, 5.932e-02, 7.789e-02, 7.174e-03, -6.399e-03, -2.315e-02, 8.381e-03), r);
|
||||||
|
r = MulAdd(s3_4, M4(1.200e-01, 1.356e-01, 8.711e-03, 7.537e-02, -1.751e-01, 3.458e-02, 2.391e-01, -1.111e-01, 1.506e-01, -3.165e-01, -4.619e-01, -9.386e-02, -4.377e-02, -1.492e-01, -5.002e-01, 9.821e-02), r);
|
||||||
|
r = MulAdd(s3_5, M4(1.539e-01, 7.309e-02, 4.257e-03, -1.539e-01, -4.757e-01, 1.070e-01, 1.702e-02, 9.709e-02, -1.140e-01, 1.938e-01, 1.982e-01, -3.215e-02, -3.822e-01, 3.408e-01, 1.647e-01, 1.597e-01), r);
|
||||||
|
r = MulAdd(s3_6, M4(-3.320e-02, 4.854e-02, -1.957e-02, 3.353e-02, 1.823e-01, 8.532e-02, 3.236e-02, -1.874e-01, -1.073e-02, -6.598e-03, -2.954e-02, -2.175e-02, 1.184e-02, -3.856e-02, 2.166e-02, -2.608e-02), r);
|
||||||
|
r = MulAdd(s3_7, M4(2.038e-02, -4.606e-02, -3.841e-02, -4.008e-02, -2.542e-01, -1.076e-01, -2.891e-02, 1.837e-01, 3.842e-02, 1.753e-01, 3.043e-02, -3.298e-02, 2.990e-02, 1.215e-01, 9.583e-02, -5.860e-02), r);
|
||||||
|
r = MulAdd(s3_8, M4(6.138e-02, 3.405e-02, 3.364e-04, 6.037e-03, 1.811e-01, 9.691e-04, 3.497e-02, -1.810e-02, -3.940e-02, -1.159e-01, -7.007e-02, 1.170e-01, 1.829e-02, -2.216e-02, -1.689e-02, 1.150e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 5 entry point for one thread.
// Maps the thread to a pixel (Rmp8x8(tid.x) + blockStart), reads nine taps from
// each of t2 (via l0) and t3 (via l1), splits every tap into max(x, 0) and
// -max(-x, 0), and writes f0(...) to t0 and f1(...) to t1 at that pixel.
void Pass5(uint2 blockStart, uint3 tid) {
    // NOTE(review): `pt` and `pos` are not referenced by name below; presumably
    // the O() macro behind l0/l1 (defined outside this chunk) captures them, so
    // the names must stay as they are -- confirm against the O() definition.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;
    uint2 size = GetInputSize();

    // Nothing to do for threads whose pixel lies outside the input size.
    if (!(gxy.x < size.x && gxy.y < size.y)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // l0 taps: s0_k keeps the non-negative part, s1_k the non-positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);

    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);

    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);

    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);

    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);

    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);

    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // l1 taps: s2_k keeps the non-negative part, s3_k the non-positive part.
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);

    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);

    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);

    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);

    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);

    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);

    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);

    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);

    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Both outputs are computed from the same 36 inputs.
    t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0, t1
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
// Pass 6 tap fetch from t1: forwards the offset (x, y) to O() and converts the result to V4.
// NOTE(review): O() is defined outside this excerpt — presumably it samples relative to the caller's `pos`/`pt`; confirm there.
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 6 output kernel.
// Starts from a constant bias vector, folds each of the 36 input vectors into the
// accumulator through MulAdd with its own 16-coefficient M4 weight block, and
// returns tanh of the sum.
// Pass6 supplies the 3x3 taps of t0 as s0_* (clamped at zero from below) and
// s1_* (clamped at zero from above), and the taps of t1 as s2_* / s3_* in the same way.
// NOTE(review): MulAdd and M4 are declared outside this excerpt; MulAdd(v, m, a) is
// presumably mul(v, m) + a — confirm against the effect's built-ins.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	// Bias.
	V4 acc = { -8.480e-04, -1.222e-04, -8.629e-04, -1.828e-04 };

	// t0 taps, lower-clamped half.
	acc = MulAdd(s0_0, M4(-6.910e-02, 1.215e-03, -2.039e-03, -1.079e-04, 8.088e-02, -2.119e-02, -1.929e-02, 1.865e-02, -6.142e-02, 2.499e-02, -4.185e-03, 1.951e-03, -1.099e-02, 1.071e-02, 3.133e-03, -9.539e-03), acc);
	acc = MulAdd(s0_1, M4(-2.129e-02, 6.812e-02, 2.738e-02, -2.965e-02, -1.569e-01, -7.369e-02, 6.714e-02, -2.416e-02, 6.421e-02, -3.329e-02, 4.397e-03, 1.902e-02, 1.426e-01, 7.469e-02, -3.306e-02, 1.260e-02), acc);
	acc = MulAdd(s0_2, M4(-2.521e-02, -1.556e-02, -1.880e-02, 1.813e-02, -2.926e-03, -3.967e-02, -2.562e-02, 1.669e-02, 1.699e-03, 2.545e-02, 9.862e-03, 1.052e-02, -1.392e-02, 1.215e-02, 2.436e-02, 2.113e-04), acc);
	acc = MulAdd(s0_3, M4(1.800e-02, -2.761e-02, 1.145e-02, -6.469e-02, 1.392e-01, 1.033e-02, 1.406e-01, -7.326e-03, -2.077e-02, 2.985e-03, -1.102e-01, 2.804e-02, -1.544e-02, 5.050e-02, 2.915e-02, 2.396e-02), acc);
	acc = MulAdd(s0_4, M4(1.242e-01, -4.463e-01, -3.829e-01, 1.871e-01, -8.392e-02, 6.470e-02, -3.115e-01, -1.970e-01, -1.186e-01, -1.204e-01, -2.296e-02, -1.763e-01, -1.265e-01, -1.919e-01, 6.718e-02, 8.923e-02), acc);
	acc = MulAdd(s0_5, M4(-2.493e-02, 3.014e-02, 2.446e-02, -1.488e-01, 1.299e-02, -5.759e-02, 2.138e-02, -9.211e-02, -8.051e-03, -4.216e-02, -1.327e-02, -9.724e-04, 3.675e-02, 7.968e-03, -3.353e-02, -4.044e-02), acc);
	acc = MulAdd(s0_6, M4(2.027e-02, 3.813e-03, -2.557e-03, -2.670e-02, 2.068e-02, 1.886e-02, 6.014e-02, 3.191e-02, -1.917e-03, -2.659e-03, 1.273e-02, 3.109e-03, 9.881e-03, -4.410e-04, 7.569e-03, 1.276e-02), acc);
	acc = MulAdd(s0_7, M4(-1.802e-03, 4.820e-02, 4.201e-02, 4.574e-02, 2.826e-02, 2.044e-02, 1.196e-01, 9.132e-02, 1.800e-02, 2.670e-02, -3.398e-03, 1.359e-02, 1.247e-02, 1.268e-02, 1.628e-03, -1.067e-02), acc);
	acc = MulAdd(s0_8, M4(5.233e-03, 3.648e-02, 2.719e-02, 2.838e-02, 1.857e-03, -1.999e-03, 1.703e-02, 5.921e-02, 7.925e-03, -2.543e-03, 5.431e-03, -1.102e-02, -1.116e-02, -5.510e-03, -9.183e-03, -8.054e-03), acc);

	// t0 taps, upper-clamped half.
	acc = MulAdd(s1_0, M4(-6.423e-02, -5.758e-03, -8.948e-03, -2.227e-03, 5.802e-02, -2.252e-02, -8.134e-03, 1.448e-02, -3.642e-02, 4.476e-03, 7.865e-03, 3.269e-03, 1.053e-02, 1.269e-02, -1.530e-03, -9.628e-03), acc);
	acc = MulAdd(s1_1, M4(-2.553e-02, 4.747e-02, 4.136e-02, -2.368e-02, -1.401e-01, -4.967e-02, 6.372e-02, -1.788e-04, 3.663e-01, 2.193e-01, -8.228e-02, -8.507e-02, 1.404e-01, 8.229e-02, -5.862e-02, -1.161e-02), acc);
	acc = MulAdd(s1_2, M4(-2.216e-02, -7.521e-03, -2.522e-02, 2.337e-02, -2.651e-03, -3.786e-02, -9.854e-03, 2.033e-02, 9.696e-03, 1.237e-01, 6.173e-03, 2.898e-02, -1.335e-02, 2.948e-02, 9.778e-03, -1.243e-02), acc);
	acc = MulAdd(s1_3, M4(-1.598e-02, -1.677e-02, -4.726e-02, -2.250e-02, 2.076e-01, -2.825e-02, 1.389e-01, -2.552e-02, 3.209e-02, -3.267e-03, -9.876e-02, 3.775e-02, -5.440e-02, 6.367e-02, 8.425e-02, 7.583e-03), acc);
	acc = MulAdd(s1_4, M4(-2.339e-01, -8.617e-02, -3.313e-01, 1.470e-01, -1.249e-01, 3.994e-01, -7.191e-01, -2.121e-01, 2.521e-02, 4.601e-02, -3.584e-01, -4.014e-01, -4.299e-01, -4.828e-01, 4.034e-01, 3.633e-01), acc);
	acc = MulAdd(s1_5, M4(3.413e-02, -4.685e-03, 4.308e-02, -1.211e-01, 3.722e-02, -1.000e-01, 5.938e-02, -1.900e-01, 3.286e-03, 6.076e-03, 2.628e-02, -1.190e-01, 3.968e-02, -3.583e-02, -4.724e-02, 5.713e-02), acc);
	acc = MulAdd(s1_6, M4(3.008e-02, -2.083e-02, 7.970e-03, -2.011e-02, -8.809e-03, 9.741e-03, 7.228e-02, 1.875e-02, -8.374e-03, -2.245e-03, 1.642e-02, -9.996e-03, 2.093e-02, 6.393e-03, 6.227e-03, -6.775e-03), acc);
	acc = MulAdd(s1_7, M4(1.113e-02, 5.783e-02, -1.430e-02, 2.826e-02, -1.250e-02, -3.106e-02, 1.754e-01, 2.001e-01, -1.431e-02, -1.368e-02, 4.329e-02, 4.832e-02, 4.089e-02, 3.702e-02, -5.774e-03, 8.701e-03), acc);
	acc = MulAdd(s1_8, M4(1.395e-03, 3.747e-02, 2.706e-02, 4.675e-02, -1.191e-02, -2.163e-02, 3.137e-02, 7.056e-02, 4.929e-03, -6.465e-03, 1.083e-03, 1.816e-02, -3.896e-03, 1.081e-02, -1.507e-02, -1.412e-02), acc);

	// t1 taps, lower-clamped half.
	acc = MulAdd(s2_0, M4(5.551e-02, 3.061e-02, 2.172e-02, -4.435e-04, 7.341e-02, -4.254e-03, -3.710e-02, 2.005e-02, 3.528e-02, 1.764e-02, 4.547e-03, -6.460e-03, 1.949e-01, 2.466e-02, 7.886e-02, -2.722e-03), acc);
	acc = MulAdd(s2_1, M4(-1.216e-03, 4.895e-02, -2.548e-02, 1.354e-02, 1.184e-01, -2.592e-01, 3.262e-02, 3.213e-02, -7.885e-02, -2.429e-02, -5.811e-02, 1.909e-02, 3.185e-02, -7.057e-02, -2.388e-02, 1.018e-01), acc);
	acc = MulAdd(s2_2, M4(-4.325e-03, 8.278e-03, -7.126e-04, -3.013e-03, -2.277e-02, 6.470e-02, -3.258e-02, 6.558e-03, 2.954e-02, 9.175e-03, -1.066e-03, -1.931e-02, 3.523e-03, 1.347e-03, -1.837e-03, -3.765e-03), acc);
	acc = MulAdd(s2_3, M4(-1.063e-01, 1.364e-02, -1.031e-01, 7.569e-02, -3.770e-02, 3.667e-02, 2.683e-02, 5.980e-02, -1.057e-01, -1.107e-02, -7.272e-02, 5.094e-02, 7.605e-02, 1.566e-02, 1.708e-01, 2.124e-01), acc);
	acc = MulAdd(s2_4, M4(1.344e-02, -6.091e-02, 2.694e-02, -2.727e-02, 2.786e-01, 5.187e-02, 6.738e-01, -9.220e-01, 1.745e-01, -1.468e-02, 1.843e-01, -1.866e-01, -9.396e-02, -1.505e-01, 2.471e-01, -1.138e+00), acc);
	acc = MulAdd(s2_5, M4(6.506e-03, 7.226e-03, 9.650e-03, 3.959e-03, -2.858e-02, -1.124e-01, -5.599e-02, 8.081e-02, -3.923e-02, 6.977e-02, 2.327e-03, 1.164e-01, 1.242e-02, -1.947e-02, -4.582e-02, 2.119e-02), acc);
	acc = MulAdd(s2_6, M4(-1.730e-02, -2.202e-02, -2.408e-02, -6.448e-02, -3.767e-03, 2.506e-02, -4.165e-02, 4.527e-02, 1.431e-02, -2.421e-02, -1.170e-02, -6.665e-02, -1.236e-02, 5.709e-03, -6.345e-03, -3.440e-02), acc);
	acc = MulAdd(s2_7, M4(-4.211e-02, -5.191e-02, -9.762e-02, -1.275e-01, 2.079e-02, -1.004e-01, 7.470e-02, 1.084e-02, -1.789e-02, 8.006e-02, 3.170e-02, 1.111e-01, -4.772e-02, -6.100e-02, 2.375e-02, 2.545e-03), acc);
	acc = MulAdd(s2_8, M4(-7.109e-03, 1.968e-03, -9.159e-03, -1.523e-02, -1.024e-02, -5.787e-04, -4.581e-02, -1.496e-02, 2.302e-02, -1.568e-02, 2.850e-02, 9.731e-03, -1.219e-02, 1.316e-03, -1.859e-02, 8.662e-02), acc);

	// t1 taps, upper-clamped half.
	acc = MulAdd(s3_0, M4(2.241e-01, 1.599e-02, -3.007e-02, -8.278e-02, -2.343e-02, -1.323e-02, 6.153e-03, 8.030e-03, 1.988e-02, 1.870e-02, 7.620e-03, -1.035e-02, 2.443e-01, 4.061e-02, 3.123e-02, -4.152e-03), acc);
	acc = MulAdd(s3_1, M4(-1.500e-02, -2.365e-02, -2.046e-02, 4.369e-02, 7.611e-03, -9.342e-03, 4.413e-03, -1.110e-03, -1.238e-01, -3.394e-02, -4.442e-02, 2.423e-02, -9.742e-02, -2.324e-02, -3.479e-02, 4.742e-02), acc);
	acc = MulAdd(s3_2, M4(5.839e-03, 1.560e-02, -3.631e-03, 6.730e-03, -2.371e-03, -1.011e-02, -3.821e-03, 1.830e-03, 2.255e-02, 1.426e-02, -1.146e-02, -1.650e-02, 9.035e-03, 5.831e-03, 2.660e-03, -4.854e-03), acc);
	acc = MulAdd(s3_3, M4(-1.694e-01, -2.771e-01, 6.449e-01, -2.979e-01, 9.108e-02, -2.277e-02, -5.309e-02, -3.552e-02, -1.626e-01, 2.544e-02, -7.033e-02, 7.145e-02, -1.334e-01, 1.008e-01, 1.121e-01, 1.733e-01), acc);
	acc = MulAdd(s3_4, M4(-1.019e-01, 1.989e-01, -6.682e-02, -7.066e-02, -3.795e-02, 1.362e-01, 4.307e-02, -4.383e-02, 6.286e-01, -3.881e-01, 1.970e-01, -3.421e-01, -5.374e-03, -2.446e-01, -8.874e-02, -4.099e-01), acc);
	acc = MulAdd(s3_5, M4(1.279e-02, -1.406e-02, 7.997e-03, 1.743e-02, 2.251e-02, -4.285e-02, -2.154e-03, -1.441e-02, -2.329e-02, 1.667e-02, 4.333e-02, 1.229e-01, -2.284e-03, -2.450e-02, -8.000e-03, -1.712e-02), acc);
	acc = MulAdd(s3_6, M4(7.251e-02, 9.488e-03, -1.511e-01, -6.947e-02, -2.728e-02, 7.342e-03, 2.289e-02, 1.443e-02, 1.492e-02, -8.903e-03, -5.817e-02, -4.836e-02, -1.677e-03, 1.964e-02, -6.858e-03, -1.328e-02), acc);
	acc = MulAdd(s3_7, M4(-8.618e-02, -5.596e-02, -1.276e-01, -1.230e-01, 4.851e-03, -5.676e-02, 2.939e-02, -4.192e-02, -2.508e-02, 4.430e-02, 1.352e-01, 2.072e-02, -8.584e-03, -3.983e-02, 1.177e-02, -4.721e-02), acc);
	acc = MulAdd(s3_8, M4(6.050e-03, -3.781e-04, -3.124e-03, -1.667e-02, -1.291e-02, -1.315e-02, -2.106e-02, -5.240e-03, 1.412e-02, -2.504e-02, 3.138e-02, -2.989e-02, -6.363e-03, -1.480e-04, 1.157e-03, 1.933e-02), acc);

	return tanh(acc);
}
|
||||||
|
|
||||||
|
// Pass 6 entry point ("out-shuffle").
// Each thread evaluates f0 once for the source texel at (gxy >> 1) and writes a
// 2x2 quad of OUTPUT pixels starting at gxy. For every pixel of the quad the INPUT
// colour is sampled through SL, converted with rgb2yuv, one component of the f0
// result is added to the first (luma) channel and saturated, and the colour is
// converted back with yuv2rgb. Alpha is written as 1.
//   blockStart - origin of this thread group's block in output pixels
//   tid        - thread id; only tid.x is used, as the argument of Rmp8x8
// NOTE(review): Rmp8x8, GetInputPt, GetOutputSize and GetOutputPt are declared
// outside this excerpt; their exact semantics should be confirmed there.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt`, `pos` (and, to be safe, `gxy`) keep their names: the l0()/l1() macros
	// go through O(), which is defined outside this excerpt and may expand to them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 size = GetOutputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Unclamped 3x3 neighbourhood of t0, row by row.
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Part of each t0 tap that is <= 0.
	V4 s1_0 = -max(-a0, 0.0);
	V4 s1_1 = -max(-a1, 0.0);
	V4 s1_2 = -max(-a2, 0.0);
	V4 s1_3 = -max(-a3, 0.0);
	V4 s1_4 = -max(-a4, 0.0);
	V4 s1_5 = -max(-a5, 0.0);
	V4 s1_6 = -max(-a6, 0.0);
	V4 s1_7 = -max(-a7, 0.0);
	V4 s1_8 = -max(-a8, 0.0);

	// Part of each t0 tap that is >= 0.
	V4 s0_0 = max(a0, 0.0);
	V4 s0_1 = max(a1, 0.0);
	V4 s0_2 = max(a2, 0.0);
	V4 s0_3 = max(a3, 0.0);
	V4 s0_4 = max(a4, 0.0);
	V4 s0_5 = max(a5, 0.0);
	V4 s0_6 = max(a6, 0.0);
	V4 s0_7 = max(a7, 0.0);
	V4 s0_8 = max(a8, 0.0);

	// Unclamped 3x3 neighbourhood of t1, row by row.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// Part of each t1 tap that is <= 0.
	V4 s3_0 = -max(-b0, 0.0);
	V4 s3_1 = -max(-b1, 0.0);
	V4 s3_2 = -max(-b2, 0.0);
	V4 s3_3 = -max(-b3, 0.0);
	V4 s3_4 = -max(-b4, 0.0);
	V4 s3_5 = -max(-b5, 0.0);
	V4 s3_6 = -max(-b6, 0.0);
	V4 s3_7 = -max(-b7, 0.0);
	V4 s3_8 = -max(-b8, 0.0);

	// Part of each t1 tap that is >= 0.
	V4 s2_0 = max(b0, 0.0);
	V4 s2_1 = max(b1, 0.0);
	V4 s2_2 = max(b2, 0.0);
	V4 s2_3 = max(b3, 0.0);
	V4 s2_4 = max(b4, 0.0);
	V4 s2_5 = max(b5, 0.0);
	V4 s2_6 = max(b6, 0.0);
	V4 s2_7 = max(b7, 0.0);
	V4 s2_8 = max(b8, 0.0);

	// One value per pixel of the 2x2 output quad: x = (0,0), y = (1,0), z = (0,1), w = (1,1).
	V4 lumaDelta = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	float2 outPt = float2(GetOutputPt());

	// Pixel (0,0) of the quad: step back half an output texel from the source texel centre.
	pos -= 0.5f * outPt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + lumaDelta.x), yuv.yz)), 1);

	// Pixel (1,0).
	++gxy.x;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + lumaDelta.y), yuv.yz)), 1);

	// Pixel (1,1).
	++gxy.y;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + lumaDelta.w), yuv.yz)), 1);

	// Pixel (0,1).
	--gxy.x;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + lumaDelta.z), yuv.yz)), 1);
}
|
||||||
1236
src/Effects/CuNNy/CuNNy-6x8C-NVL-DN.hlsl
Normal file
1236
src/Effects/CuNNy/CuNNy-6x8C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1236
src/Effects/CuNNy/CuNNy-6x8C-NVL.hlsl
Normal file
1236
src/Effects/CuNNy/CuNNy-6x8C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
3994
src/Effects/CuNNy/CuNNy-8x16C-NVL-DN.hlsl
Normal file
3994
src/Effects/CuNNy/CuNNy-8x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
3994
src/Effects/CuNNy/CuNNy-8x16C-NVL.hlsl
Normal file
3994
src/Effects/CuNNy/CuNNy-8x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
772
src/Effects/CuNNy/CuNNy-8x4C-NVL-DN.hlsl
Normal file
772
src/Effects/CuNNy/CuNNy-8x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,772 @@
|
||||||
|
// CuNNy 8x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
//
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-DN-D04N08
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
// Source image of the effect; the size expressions of the other textures refer to it as INPUT_WIDTH / INPUT_HEIGHT.
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
// Result texture, declared at twice the INPUT width and twice the INPUT height.
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
// Point-filtered sampler; the O() tap macro samples through it.
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
// Linear-filtered sampler.
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// Samples texture `t` with the point sampler SP at mip level 0, at `pos + p * pt`.
// The expansion uses the identifiers `pos` and `pt`, so every function that uses O()
// (directly or through an l0()-style macro) must have locals with exactly those names.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
// Shorthand for MF4, the 4-component vector type used for features, biases and accumulators (MF4 is declared outside this file).
#define V4 MF4
|
||||||
|
// Shorthand for MF4x4, used for the 16-coefficient weight blocks passed to MulAdd (MF4x4 is declared outside this file).
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
// Intermediate feature texture at INPUT resolution, R8G8B8A8_SNORM. Passes 1, 3 and 5 write it; passes 2 and 4 read it.
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
// Second intermediate feature texture at INPUT resolution, R8G8B8A8_SNORM. Passes 2 and 4 write it; passes 3 and 5 read it.
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Pass 1 tap fetch: reduces the INPUT texel at tap offset (x, y) to one scalar —
// a fixed weighted sum of its RGB channels plus a constant bias.
#define l0(x, y) (dot(MF3(-1.880e-01, -3.696e-01, -8.936e-02), O(INPUT, float2(x, y)).rgb) + MF(5.137e-01))
|
||||||
|
|
||||||
|
// Pass 1 ("in") kernel: maps nine scalar taps of a 3x3 neighbourhood to four
// feature channels. Each tap scales its own constant V4 weight vector and is added
// onto the running sum, which starts at the bias vector. No activation is applied here.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Bias.
	V4 acc = { 1.324e-02, -9.379e-05, 8.452e-03, 5.165e-02 };

	acc = mad(s0_0, V4(6.049e-03, -3.524e-01, -1.308e-01, -6.691e-02), acc);
	acc = mad(s0_1, V4(1.720e-02, -7.092e-02, -3.030e-01, 1.654e-01), acc);
	acc = mad(s0_2, V4(-6.706e-03, 2.289e-01, 1.982e-03, -5.756e-02), acc);
	acc = mad(s0_3, V4(-2.761e-02, 5.050e-01, -2.036e-01, 1.265e-01), acc);
	acc = mad(s0_4, V4(-8.654e-01, -6.035e-01, -2.119e-01, 5.055e-01), acc);
	acc = mad(s0_5, V4(-7.114e-03, 2.325e-02, 5.721e-02, 4.585e-02), acc);
	acc = mad(s0_6, V4(2.796e-01, 1.680e-01, 1.353e-01, 1.286e-02), acc);
	acc = mad(s0_7, V4(5.684e-01, 3.022e-01, 6.426e-01, 8.931e-02), acc);
	acc = mad(s0_8, V4(3.723e-02, -2.036e-01, 2.732e-02, -4.101e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 1 entry point ("in"): one thread per input texel. Reads the 3x3 neighbourhood
// of INPUT through l0() (one scalar per tap), runs the pass-1 kernel f0 and stores
// the four resulting channels in t0.
//   blockStart - origin of this thread group's block, in texels
//   tid        - thread id; only tid.x is used, as the argument of Rmp8x8
// NOTE(review): Rmp8x8 is declared outside this file — presumably it maps the
// linear thread index to a 2D offset inside the block; confirm there.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: O(), used by l0(), expands to them.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();

	// Threads that fall outside the image have nothing to do.
	if (texel.x >= inSize.x || texel.y >= inSize.y) {
		return;
	}

	// Centre of the texel in the coordinate space expected by O().
	float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row.
	MF tap0 = l0(-1.0, -1.0);
	MF tap1 = l0(0.0, -1.0);
	MF tap2 = l0(1.0, -1.0);
	MF tap3 = l0(-1.0, 0.0);
	MF tap4 = l0(0.0, 0.0);
	MF tap5 = l0(1.0, 0.0);
	MF tap6 = l0(-1.0, 1.0);
	MF tap7 = l0(0.0, 1.0);
	MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Pass 2 tap fetch: point-samples t0 at tap offset (x, y) from `pos` and converts the texel to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 2 ("conv1") kernel.
// Starts from a constant bias vector and folds each of the 18 input vectors into
// the accumulator through MulAdd with its own 16-coefficient M4 weight block.
// Pass2 supplies the 3x3 taps clamped at zero from below as s0_* and clamped at
// zero from above as s1_*. The sum is returned without an activation.
// NOTE(review): MulAdd is provided through the `//!USE MulAdd` directive and is not
// shown here; MulAdd(v, m, a) is presumably mul(v, m) + a — confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 acc = { 1.959e-02, -5.807e-03, 9.415e-02, 7.247e-03 };

	// Lower-clamped taps.
	acc = MulAdd(s0_0, M4(2.216e-02, 1.062e-01, -3.433e-03, -1.923e-01, 6.300e-02, -4.594e-01, 2.025e-01, 8.655e-03, -5.497e-02, 1.694e-01, -1.806e-01, 2.115e-01, -6.176e-02, 1.167e-02, -5.987e-02, 1.167e-01), acc);
	acc = MulAdd(s0_1, M4(-1.646e-01, -5.524e-01, -1.352e-01, 1.704e-01, 3.398e-02, -2.598e-01, 1.616e-01, -1.772e-01, -5.648e-02, 2.755e-01, 2.638e-02, -2.657e-02, 3.774e-02, -6.833e-02, -1.141e-01, -2.438e-01), acc);
	acc = MulAdd(s0_2, M4(-1.459e-01, 9.939e-02, -6.457e-04, 2.352e-02, 5.006e-02, -7.759e-01, -4.862e-02, -3.366e-02, 9.508e-02, 1.537e-01, -6.771e-02, -1.260e-01, 1.067e-01, -5.893e-02, -9.811e-02, -1.060e-02), acc);
	acc = MulAdd(s0_3, M4(-2.901e-01, 2.907e-01, 2.178e-01, -3.877e-01, 9.034e-03, 8.718e-03, -1.213e-01, 9.252e-02, 3.286e-01, -8.247e-02, -5.573e-02, -3.852e-01, -1.371e-01, 1.877e-01, 2.337e-01, 5.324e-01), acc);
	acc = MulAdd(s0_4, M4(-9.182e-01, 1.013e-01, 2.969e-01, 7.117e-01, -2.367e-01, -7.128e-02, 1.828e-01, 5.993e-01, -2.965e-01, 1.323e-01, 3.117e-02, -3.215e-01, -1.410e-01, 5.359e-02, -1.137e-01, -2.603e-01), acc);
	acc = MulAdd(s0_5, M4(-1.071e-01, -8.801e-02, 9.524e-03, -2.937e-02, 7.723e-02, 1.195e-01, -9.056e-02, 6.161e-02, 1.962e-01, -2.740e-01, -9.418e-02, 1.141e-01, 6.203e-02, -1.084e-01, 2.402e-01, -2.066e-01), acc);
	acc = MulAdd(s0_6, M4(2.226e-01, -2.259e-01, -2.499e-02, -9.184e-02, -1.499e-01, -3.737e-02, 1.576e-01, 1.084e-01, -2.221e-01, -1.080e-02, 2.643e-02, -1.023e-01, 1.068e-01, 1.193e-01, -2.781e-01, 3.396e-01), acc);
	acc = MulAdd(s0_7, M4(7.520e-01, -1.043e-01, -4.535e-02, 2.775e-01, 1.577e-01, -1.526e-01, 1.796e-01, 1.085e-01, -1.012e+00, 4.333e-02, 1.270e-02, -1.692e-01, 1.127e-01, -2.847e-01, -1.784e-01, -3.956e-01), acc);
	acc = MulAdd(s0_8, M4(2.206e-01, 1.370e-01, -7.453e-02, 1.050e-01, 8.412e-02, -1.396e-01, 1.707e-02, -1.654e-02, -2.116e-01, -7.944e-02, 1.244e-01, -6.709e-02, -5.577e-02, 1.619e-01, -2.818e-01, 1.460e-01), acc);

	// Upper-clamped taps.
	acc = MulAdd(s1_0, M4(1.180e-01, -2.345e-01, 5.406e-02, -1.102e-01, 1.559e-02, -3.865e-01, -1.077e-01, 1.442e-02, -1.405e-01, 1.578e-01, -3.338e-02, 1.157e-01, -1.676e-01, 4.656e-02, -1.507e-01, 2.590e-02), acc);
	acc = MulAdd(s1_1, M4(-3.112e-02, -5.537e-01, -3.626e-01, -2.915e-01, 7.495e-02, 4.473e-01, -1.847e-01, -8.743e-02, -3.290e-02, 3.660e-02, 1.252e-01, 1.058e-02, 1.193e-01, 6.421e-02, -1.456e-01, -1.693e-01), acc);
	acc = MulAdd(s1_2, M4(-1.047e-01, -4.306e-01, 6.486e-03, 1.137e-01, 2.935e-02, -3.608e-01, 5.242e-02, -2.374e-02, 1.130e-01, -4.864e-02, -7.302e-02, -2.205e-02, 8.227e-02, -8.403e-02, -9.468e-02, 8.095e-02), acc);
	acc = MulAdd(s1_3, M4(-3.759e-02, 2.709e-01, 1.269e-01, -4.994e-01, -1.577e-02, 1.871e-01, -2.532e-01, 8.960e-02, 2.298e-01, -2.462e-01, -1.634e-02, -3.955e-01, 2.750e-02, -4.812e-02, -2.441e-01, 9.926e-01), acc);
	acc = MulAdd(s1_4, M4(-7.288e-01, 5.644e-01, 1.042e+00, 6.160e-01, -4.271e-01, 4.419e-01, 1.437e-01, 3.840e-01, -1.220e-01, -8.627e-01, 6.664e-02, -1.220e-02, 5.260e-02, 1.505e-01, -2.182e-01, -6.116e-01), acc);
	acc = MulAdd(s1_5, M4(1.659e-01, 2.566e-01, -5.954e-02, -9.187e-02, -8.251e-02, 1.091e-01, -1.506e-01, 1.370e-01, 3.056e-01, -3.512e-01, -4.956e-03, 7.008e-02, 1.320e-01, -3.995e-01, -8.603e-03, -3.542e-01), acc);
	acc = MulAdd(s1_6, M4(2.549e-01, -7.946e-02, -1.755e-01, -2.902e-02, -1.912e-01, 2.349e-01, 6.770e-02, 9.683e-02, -2.690e-01, -1.715e-01, 5.692e-02, -1.064e-01, 2.998e-01, 7.619e-02, 8.040e-03, 2.706e-01), acc);
	acc = MulAdd(s1_7, M4(7.320e-01, 1.397e-01, -5.600e-02, 9.609e-02, -1.267e-01, 6.841e-02, 2.429e-01, 3.167e-02, -6.816e-01, -3.313e-03, 5.622e-02, -4.727e-02, -3.420e-01, 4.283e-02, -3.250e-01, -4.118e-01), acc);
	acc = MulAdd(s1_8, M4(1.607e-01, 1.581e-01, -6.049e-02, 9.118e-02, -1.583e-02, 2.918e-01, 1.703e-02, -1.206e-01, -2.114e-01, -1.248e-01, 6.689e-02, -2.131e-02, -7.779e-02, 1.069e-01, -1.181e-01, 2.230e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 2 entry point ("conv1"): one thread per texel. Reads the 3x3 neighbourhood
// of t0, splits every tap into its non-negative and non-positive part, runs the
// pass-2 kernel f0 over both halves and stores the result in t1.
//   blockStart - origin of this thread group's block, in texels
//   tid        - thread id; only tid.x is used, as the argument of Rmp8x8
// NOTE(review): Rmp8x8 is declared outside this file — presumably it maps the
// linear thread index to a 2D offset inside the block; confirm there.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: O(), used by l0(), expands to them.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();

	// Threads that fall outside the image have nothing to do.
	if (texel.x >= inSize.x || texel.y >= inSize.y) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Unclamped 3x3 taps, row by row.
	V4 raw0 = l0(-1.0, -1.0);
	V4 raw1 = l0(0.0, -1.0);
	V4 raw2 = l0(1.0, -1.0);
	V4 raw3 = l0(-1.0, 0.0);
	V4 raw4 = l0(0.0, 0.0);
	V4 raw5 = l0(1.0, 0.0);
	V4 raw6 = l0(-1.0, 1.0);
	V4 raw7 = l0(0.0, 1.0);
	V4 raw8 = l0(1.0, 1.0);

	// Part of each tap that is <= 0.
	V4 lo0 = -max(-raw0, 0.0);
	V4 lo1 = -max(-raw1, 0.0);
	V4 lo2 = -max(-raw2, 0.0);
	V4 lo3 = -max(-raw3, 0.0);
	V4 lo4 = -max(-raw4, 0.0);
	V4 lo5 = -max(-raw5, 0.0);
	V4 lo6 = -max(-raw6, 0.0);
	V4 lo7 = -max(-raw7, 0.0);
	V4 lo8 = -max(-raw8, 0.0);

	// Part of each tap that is >= 0.
	V4 hi0 = max(raw0, 0.0);
	V4 hi1 = max(raw1, 0.0);
	V4 hi2 = max(raw2, 0.0);
	V4 hi3 = max(raw3, 0.0);
	V4 hi4 = max(raw4, 0.0);
	V4 hi5 = max(raw5, 0.0);
	V4 hi6 = max(raw6, 0.0);
	V4 hi7 = max(raw7, 0.0);
	V4 hi8 = max(raw8, 0.0);

	t1[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Pass 3 tap fetch: point-samples t1 at tap offset (x, y) from `pos` and converts the texel to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 3 ("conv2") kernel.
// Starts from a constant bias vector and folds each of the 18 input vectors into
// the accumulator through MulAdd with its own 16-coefficient M4 weight block.
// Pass3 supplies the 3x3 taps clamped at zero from below as s0_* and clamped at
// zero from above as s1_*. The sum is returned without an activation.
// NOTE(review): MulAdd is provided through the `//!USE MulAdd` directive and is not
// shown here; MulAdd(v, m, a) is presumably mul(v, m) + a — confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 acc = { 3.240e-02, -1.989e-01, -2.700e-02, 6.578e-03 };

	// Lower-clamped taps.
	acc = MulAdd(s0_0, M4(9.727e-02, 1.849e-01, 2.125e-02, 1.933e-01, 9.183e-02, 8.307e-03, -9.035e-02, 3.241e-02, 1.141e-01, 8.739e-02, -9.547e-02, 1.616e-01, 2.912e-02, -1.780e-02, 5.433e-02, 2.720e-02), acc);
	acc = MulAdd(s0_1, M4(-1.524e-01, -9.138e-02, 8.798e-02, -1.691e-01, 8.519e-03, 3.597e-02, -1.784e-02, 3.049e-02, 3.078e-02, 1.823e-01, 1.051e-02, -5.317e-02, -1.977e-01, 1.013e-01, 1.215e-01, 4.261e-02), acc);
	acc = MulAdd(s0_2, M4(-1.992e-02, -1.191e-01, -1.365e-03, 3.976e-02, 3.452e-03, 7.503e-03, 4.850e-03, 8.970e-03, -7.652e-03, 1.166e-01, 9.888e-02, 3.423e-03, -3.354e-01, -3.335e-01, -2.226e-02, -1.509e-01), acc);
	acc = MulAdd(s0_3, M4(-7.994e-02, 1.374e-01, -1.701e-02, -2.530e-01, 2.153e-01, -6.957e-03, -1.405e-01, -6.175e-02, 7.274e-03, 1.734e-01, -9.107e-02, -1.303e-01, -1.265e-01, 1.669e-02, 3.494e-02, -8.377e-02), acc);
	acc = MulAdd(s0_4, M4(-1.124e+00, 1.355e-02, -1.979e-01, -4.092e-01, -1.276e-01, -1.096e-01, 5.949e-02, 1.073e-01, -4.780e-02, 1.378e-01, 1.905e-01, -9.525e-02, -5.999e-01, 1.274e-01, 8.416e-01, 2.483e-01), acc);
	acc = MulAdd(s0_5, M4(3.312e-01, 2.036e-01, -5.231e-02, 5.357e-02, 1.666e-03, -2.102e-03, -3.213e-03, 4.747e-02, 1.130e-01, 3.492e-01, -1.263e-01, 4.100e-01, -5.859e-01, 4.875e-02, 2.227e-01, 3.127e-01), acc);
	acc = MulAdd(s0_6, M4(-3.699e-02, 6.066e-02, 3.448e-03, -4.158e-03, -4.048e-03, -3.619e-02, -8.830e-02, -8.917e-03, 2.990e-02, 6.919e-03, 9.803e-02, 2.188e-02, 5.674e-02, -3.122e-02, -6.793e-02, 8.573e-02), acc);
	acc = MulAdd(s0_7, M4(-1.255e-01, 1.754e-01, -1.332e-01, -1.124e-01, -2.163e-01, 1.552e-02, -7.485e-04, 4.194e-02, -1.899e-01, 1.334e-01, -1.721e-01, -3.487e-01, 3.847e-01, -3.823e-02, 1.121e-02, -7.128e-02), acc);
	acc = MulAdd(s0_8, M4(7.152e-02, -1.631e-02, 4.810e-02, 1.435e-01, 3.881e-02, -3.596e-02, -7.544e-03, -1.071e-01, -8.509e-02, 1.110e-01, 8.542e-02, 1.980e-02, -1.134e-01, -7.967e-02, -1.586e-01, 2.511e-01), acc);

	// Upper-clamped taps.
	acc = MulAdd(s1_0, M4(2.326e-01, 4.791e-02, -1.996e-01, 1.352e-02, -9.909e-03, 1.117e-01, 2.198e-02, -6.683e-02, 1.356e-01, 2.830e-01, -8.418e-02, 2.137e-01, -1.401e-02, -7.056e-02, 5.360e-02, 6.243e-02), acc);
	acc = MulAdd(s1_1, M4(7.739e-01, -3.172e-01, -2.031e-01, 2.054e-01, -1.263e-01, -7.571e-03, 8.090e-02, -1.372e-01, 1.053e-01, 2.982e-01, -6.235e-02, 1.452e-02, 1.973e-01, 9.233e-02, -1.067e-01, 1.088e-01), acc);
	acc = MulAdd(s1_2, M4(-1.136e-01, -1.332e-01, -7.369e-02, 2.046e-01, -9.302e-02, 2.722e-02, 9.461e-02, -1.895e-01, 1.216e-02, 2.595e-01, 1.028e-01, 8.413e-02, -1.339e-01, -2.259e-01, -1.047e-01, 5.994e-02), acc);
	acc = MulAdd(s1_3, M4(1.224e-01, -3.713e-02, -2.383e-01, -1.743e-01, -1.876e-01, 1.155e-01, 2.212e-01, -1.375e-01, 1.618e-01, 2.628e-01, -1.161e-01, -1.826e-01, 8.003e-02, -1.961e-02, -6.278e-02, -5.710e-02), acc);
	acc = MulAdd(s1_4, M4(-2.647e-01, -1.603e-01, -7.731e-01, 1.958e-01, -4.093e-01, -1.110e-01, 3.352e-01, -3.093e-02, -6.201e-01, 3.073e-01, 3.779e-01, -2.733e-01, 4.035e-01, 1.230e-01, -1.606e-01, 9.421e-02), acc);
	acc = MulAdd(s1_5, M4(1.981e-01, -8.801e-03, -9.874e-03, -4.003e-02, 2.686e-03, -1.346e-01, -1.813e-02, -1.003e-01, 1.561e-01, 3.252e-01, -1.189e-01, 2.014e-01, 1.343e-01, 4.088e-02, -9.918e-02, 1.025e+00), acc);
	acc = MulAdd(s1_6, M4(-2.323e-02, 3.284e-02, -5.099e-03, -3.025e-02, -1.458e-02, -1.640e-02, 1.268e-01, -3.787e-02, 5.078e-02, 4.529e-02, 1.050e-02, -8.079e-03, -1.530e-02, -6.509e-02, -1.620e-01, 6.662e-02), acc);
	acc = MulAdd(s1_7, M4(3.972e-02, 8.570e-02, -8.723e-02, -3.746e-02, -1.902e-01, 5.121e-02, 1.161e-01, -4.624e-02, -6.268e-02, 1.852e-01, -1.535e-01, -2.023e-01, 2.476e-01, -2.211e-02, -1.590e-01, -3.109e-02), acc);
	acc = MulAdd(s1_8, M4(-8.025e-03, -4.798e-02, 5.162e-02, 6.616e-02, -2.416e-02, -5.815e-02, -1.334e-02, -1.029e-01, 5.381e-02, 1.539e-01, 4.511e-02, 1.426e-01, -5.511e-02, -9.311e-02, -3.072e-02, 1.572e-01), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 3 entry point ("conv2"): one thread per texel. Reads the 3x3 neighbourhood
// of t1, splits every tap into its non-negative and non-positive part, runs the
// pass-3 kernel f0 over both halves and stores the result in t0.
//   blockStart - origin of this thread group's block, in texels
//   tid        - thread id; only tid.x is used, as the argument of Rmp8x8
// NOTE(review): Rmp8x8 is declared outside this file — presumably it maps the
// linear thread index to a 2D offset inside the block; confirm there.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: O(), used by l0(), expands to them.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 inSize = GetInputSize();

	// Threads that fall outside the image have nothing to do.
	if (texel.x >= inSize.x || texel.y >= inSize.y) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Unclamped 3x3 taps, row by row.
	V4 raw0 = l0(-1.0, -1.0);
	V4 raw1 = l0(0.0, -1.0);
	V4 raw2 = l0(1.0, -1.0);
	V4 raw3 = l0(-1.0, 0.0);
	V4 raw4 = l0(0.0, 0.0);
	V4 raw5 = l0(1.0, 0.0);
	V4 raw6 = l0(-1.0, 1.0);
	V4 raw7 = l0(0.0, 1.0);
	V4 raw8 = l0(1.0, 1.0);

	// Part of each tap that is <= 0.
	V4 lo0 = -max(-raw0, 0.0);
	V4 lo1 = -max(-raw1, 0.0);
	V4 lo2 = -max(-raw2, 0.0);
	V4 lo3 = -max(-raw3, 0.0);
	V4 lo4 = -max(-raw4, 0.0);
	V4 lo5 = -max(-raw5, 0.0);
	V4 lo6 = -max(-raw6, 0.0);
	V4 lo7 = -max(-raw7, 0.0);
	V4 lo8 = -max(-raw8, 0.0);

	// Part of each tap that is >= 0.
	V4 hi0 = max(raw0, 0.0);
	V4 hi1 = max(raw1, 0.0);
	V4 hi2 = max(raw2, 0.0);
	V4 hi3 = max(raw3, 0.0);
	V4 hi4 = max(raw4, 0.0);
	V4 hi5 = max(raw5, 0.0);
	V4 hi6 = max(raw6, 0.0);
	V4 hi7 = max(raw7, 0.0);
	V4 hi8 = max(raw8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Pass 4 tap fetch: point-samples t0 at tap offset (x, y) from `pos` and converts the texel to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Pass 4 ("conv3") kernel.
// Starts from a constant bias vector and folds each of the 18 input vectors into
// the accumulator through MulAdd with its own 16-coefficient M4 weight block.
// Pass4 supplies the 3x3 taps clamped at zero from below as s0_* and clamped at
// zero from above as s1_*. The sum is returned without an activation.
// NOTE(review): MulAdd is provided through the `//!USE MulAdd` directive and is not
// shown here; MulAdd(v, m, a) is presumably mul(v, m) + a — confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	// Bias.
	V4 acc = { 4.014e-03, -2.020e-02, 1.560e-02, -2.352e-02 };

	// Lower-clamped taps.
	acc = MulAdd(s0_0, M4(9.384e-02, 1.183e-01, 5.136e-02, -4.583e-01, -1.060e-01, 6.124e-02, -1.479e-01, -2.457e-01, -5.881e-02, 4.756e-03, -2.540e-02, -5.047e-02, -1.897e-01, 4.062e-02, 1.226e-02, 1.465e-01), acc);
	acc = MulAdd(s0_1, M4(-1.890e-01, -9.535e-02, 2.627e-01, 3.224e-01, 1.050e-01, -3.922e-02, -3.551e-01, -2.632e-01, -2.349e-01, -5.605e-02, -2.856e-01, 4.331e-01, -2.614e-02, -6.027e-02, -3.236e-02, 2.873e-01), acc);
	acc = MulAdd(s0_2, M4(-1.702e-01, 7.462e-02, 2.168e-01, 4.212e-01, 8.150e-03, 6.671e-02, -2.781e-01, -1.322e-01, -3.933e-02, 2.698e-02, -3.420e-01, -1.116e-02, -1.788e-02, 8.701e-03, -1.044e-01, 1.264e-01), acc);
	acc = MulAdd(s0_3, M4(3.573e-01, -4.592e-02, 4.539e-01, 2.854e-01, -6.463e-01, -1.763e-01, 6.236e-01, 7.125e-02, 4.126e-01, -1.621e-02, 1.685e-02, 2.328e-01, -5.456e-01, -2.113e-01, 1.424e-01, 1.414e-01), acc);
	acc = MulAdd(s0_4, M4(3.838e-01, -1.008e+00, 4.023e-01, 1.302e+00, -1.503e-01, 4.245e-02, 1.496e+00, -3.479e-01, -3.763e-01, -7.877e-01, 4.081e-01, -2.192e-01, -2.853e-01, 2.123e-01, -3.407e-01, 2.423e-01), acc);
	acc = MulAdd(s0_5, M4(5.073e-03, -2.123e-01, 1.851e-01, 1.482e-01, -2.814e-01, 1.262e-01, 6.890e-01, -2.317e-01, 6.427e-02, -5.801e-02, -3.684e-02, 7.526e-02, 1.309e-02, -2.125e-02, -7.760e-02, 4.795e-02), acc);
	acc = MulAdd(s0_6, M4(1.409e-01, -1.062e-01, 1.665e-01, 5.277e-01, 6.676e-01, -1.872e-01, 1.251e+00, 1.165e-01, -2.287e-02, -5.235e-02, -2.028e-03, -3.305e-02, -1.968e-01, 1.898e-01, -9.538e-02, -1.418e-01), acc);
	acc = MulAdd(s0_7, M4(7.353e-02, -3.073e-01, 1.789e-01, 2.137e-01, -6.435e-01, -6.052e-01, 2.259e+00, 2.884e-02, 7.105e-04, 1.247e-01, -7.393e-02, 2.539e-02, 1.194e-01, 1.870e-01, -1.126e-01, 2.444e-02), acc);
	acc = MulAdd(s0_8, M4(3.853e-02, -2.242e-01, 1.470e-01, 1.701e-02, 4.586e-02, 2.027e-01, 7.448e-01, -4.414e-01, 9.096e-03, 1.277e-01, 4.010e-02, 1.064e-02, 2.401e-02, 1.901e-02, 1.956e-02, 8.744e-02), acc);

	// Upper-clamped taps.
	acc = MulAdd(s1_0, M4(-4.741e-02, 1.819e-03, -8.321e-02, -1.496e-01, -1.801e-02, 4.682e-02, -6.041e-02, -7.243e-02, -1.478e-01, 4.970e-02, 6.424e-02, -5.378e-02, -9.117e-02, 5.496e-02, -2.648e-02, -4.042e-02), acc);
	acc = MulAdd(s1_1, M4(-8.815e-02, 5.938e-02, -2.433e-01, 1.737e-01, 1.095e-01, -5.108e-02, -5.729e-02, 8.334e-03, -2.763e-01, -6.431e-02, -2.454e-02, 4.055e-01, 2.113e-02, -1.298e-01, -3.908e-02, -1.780e-02), acc);
	acc = MulAdd(s1_2, M4(-1.905e-02, 3.894e-02, -1.293e-01, 8.303e-03, -7.800e-03, -5.508e-03, 8.606e-02, -7.501e-02, 1.542e-02, 3.046e-02, -2.920e-01, -4.240e-02, -3.932e-02, -1.813e-02, -8.213e-02, 1.017e-01), acc);
	acc = MulAdd(s1_3, M4(1.965e-01, 3.626e-02, 3.418e-02, 9.779e-02, -6.664e-02, -2.295e-02, -2.736e-02, 1.091e-01, 1.129e-01, -3.896e-02, 1.171e-02, -2.870e-02, -1.382e-01, -1.691e-01, 3.018e-01, -1.186e-01), acc);
	acc = MulAdd(s1_4, M4(1.075e-01, -6.894e-01, 1.714e-01, 5.097e-01, 9.868e-03, 1.087e-01, 2.107e-01, -6.591e-02, -3.233e-01, -9.792e-01, -1.189e-01, -5.480e-01, -1.157e-01, 5.941e-02, -5.770e-01, -1.030e-01), acc);
	acc = MulAdd(s1_5, M4(3.289e-02, 3.941e-02, 1.824e-01, 7.260e-04, -9.787e-03, 3.128e-02, -1.333e-01, 1.352e-01, 5.954e-03, -2.520e-01, -8.536e-02, -3.566e-01, 2.998e-02, -5.941e-02, -8.531e-02, -4.232e-02), acc);
	acc = MulAdd(s1_6, M4(2.592e-02, -7.528e-02, -1.956e-02, 1.002e-01, 2.992e-02, -1.673e-01, 4.413e-02, 1.683e-01, 1.440e-02, -1.047e-02, 1.425e-02, -1.292e-01, -1.777e-01, 1.220e-01, -6.381e-02, 4.174e-02), acc);
	acc = MulAdd(s1_7, M4(-3.107e-02, -8.612e-02, 1.248e-02, -8.544e-02, -1.161e-01, 7.718e-02, -1.150e-01, -1.699e-01, -1.392e-02, 7.590e-02, -5.195e-02, -3.599e-01, 4.872e-02, 1.381e-01, -1.143e-01, -1.473e-03), acc);
	acc = MulAdd(s1_8, M4(1.277e-02, 3.020e-02, 4.658e-02, 8.071e-02, 6.867e-02, -2.693e-02, 7.897e-02, -1.264e-02, -1.035e-03, 1.509e-01, 4.169e-02, -1.716e-01, -4.694e-03, 1.627e-02, 7.171e-03, -4.496e-02), acc);

	return acc;
}
|
||||||
|
|
||||||
|
// Pass 4 entry point. Takes nine l0() taps at offsets -1..1 in x and y,
// splits each tap into a positive part (s0_*) and a negative part (s1_*),
// and stores f0() of those 18 values into t1 at this thread's coordinate.
// l0(), O() and f0() for this pass are defined above this chunk.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() presumably picks them up
	// textually through l0() -- confirm before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): wraps O(t1, float2(x, y)) in a V4 conversion. O is defined outside
// this chunk; presumably it fetches t1 at an (x, y) offset from the current
// pixel -- confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Weighted sum for pass 5 (conv4): `r` starts from a four-value constant and
// each MulAdd call folds one input vector and its 16-value M4 constant into
// it, first for s0_0..s0_8 and then for s1_0..s1_8; the final `r` is returned.
// MulAdd, V4 and M4 are defined outside this chunk -- presumably a
// vector-times-4x4-matrix accumulate; confirm against their definitions.
// NOTE(review): the numeric literals look like generated network weights;
// do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { -5.942e-03, -2.718e-02, -1.234e-02, 3.307e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(3.174e-02, -2.020e-01, -6.843e-03, 1.049e-01, 1.680e-01, -6.387e-01, -1.541e-01, -1.952e-01, -4.586e-02, -1.580e-01, -5.507e-02, 1.065e-01, -5.257e-03, -9.464e-02, -9.788e-02, 1.221e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(1.365e-01, -4.220e-02, -4.186e-02, -1.569e-01, -5.527e-01, -1.180e-01, -2.274e-01, -2.007e-01, 2.207e-02, 1.190e-02, 3.746e-02, -1.565e-01, -2.808e-02, 1.657e-02, -5.376e-02, -1.093e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-7.935e-02, -3.809e-02, -3.727e-02, -4.730e-02, -8.556e-02, 3.451e-04, -8.191e-02, 8.086e-02, 2.051e-02, 7.072e-03, 2.537e-02, 2.793e-02, 9.384e-04, -3.624e-02, -2.171e-02, 7.103e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.261e-02, 2.716e-01, 2.739e-01, -7.349e-02, -2.130e-02, -4.131e-01, -1.851e-01, 1.065e-01, -7.827e-02, 2.868e-01, -1.500e-01, -1.442e-01, -1.842e-02, -2.983e-01, -4.232e-02, 1.395e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(2.733e-01, 4.015e-01, 4.102e-01, -2.027e-01, 4.229e-01, 2.213e-01, 3.628e-01, -1.011e-01, -4.893e-01, 1.333e-01, -4.245e-02, -8.133e-02, -1.086e-02, -1.089e-01, -8.720e-02, 1.513e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(8.521e-02, 1.460e-01, 1.589e-01, -2.075e-01, -5.391e-02, 7.449e-03, -6.763e-02, -2.352e-01, 4.055e-02, -1.812e-02, -1.413e-02, 9.240e-02, -3.070e-02, -4.975e-03, -8.972e-02, -2.225e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(1.880e-01, -1.481e-01, 1.001e-01, 6.339e-02, -6.208e-02, -2.814e-02, -5.944e-03, 1.002e-01, -7.822e-02, 1.010e-01, -2.161e-02, 9.175e-02, 1.495e-02, 1.645e-02, 8.901e-03, -3.865e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(4.449e-01, -1.089e-01, -1.249e-01, -8.911e-01, 3.096e-02, 1.724e-01, 5.605e-02, -7.605e-02, -9.644e-02, -1.191e-01, -1.332e-01, 2.544e-02, 5.659e-02, -2.706e-04, -9.886e-02, 9.218e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(7.394e-02, -2.112e-01, 1.505e-02, -1.236e-01, -1.848e-02, -2.716e-02, -6.663e-02, 2.764e-02, -1.120e-02, 3.440e-03, -1.443e-02, 1.745e-02, -3.847e-02, -4.228e-03, -8.888e-02, 2.134e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(6.588e-03, -6.764e-02, -2.660e-02, -3.967e-02, 6.459e-02, -6.345e-01, -5.784e-01, 9.294e-02, 2.426e-02, -9.858e-02, -9.036e-02, -9.545e-02, 2.094e-02, -1.001e-01, -1.145e-01, -6.470e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-2.633e-03, 5.849e-02, 3.154e-02, -7.386e-02, -6.412e-01, -4.405e-01, -5.885e-01, 1.657e-01, -1.757e-01, -1.882e-02, -1.023e-01, -1.713e-01, -1.047e-01, -1.558e-01, -1.509e-01, -2.815e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(1.880e-01, -3.790e-02, 1.112e-01, 1.672e-02, -1.713e-01, 2.611e-02, -9.008e-02, 9.359e-02, -6.567e-02, 9.399e-02, 3.743e-02, 3.662e-02, 3.190e-02, -1.466e-01, -1.154e-01, 1.692e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-1.733e-02, 1.381e-01, 8.342e-02, -5.893e-02, -1.467e-02, -4.365e-01, -3.057e-01, 1.506e-01, 7.300e-02, 6.777e-01, -5.484e-03, -3.499e-01, 1.978e-01, -6.846e-01, -2.921e-01, -1.173e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(-1.829e-01, -4.506e-01, -5.685e-02, 8.260e-01, 3.056e-01, 1.803e-01, 1.908e-01, -2.029e-01, -1.578e-01, 5.039e-01, 3.016e-01, -4.971e-01, -4.977e-01, 4.537e-01, -4.268e-01, 7.878e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-3.251e-01, -1.229e-01, -1.447e-01, 3.290e-01, -2.134e-01, -6.542e-03, -7.109e-02, -1.004e-01, 3.887e-02, -1.008e-01, -7.490e-02, 6.126e-02, 2.757e-01, -1.980e-01, -1.792e-01, 2.722e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(4.765e-02, -5.401e-02, 4.164e-02, 1.847e-03, -3.178e-02, -4.201e-02, -2.504e-02, 1.350e-02, -1.436e-01, 1.654e-01, -1.099e-02, -3.733e-02, 1.118e-01, -2.529e-01, -1.353e-01, -9.309e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.684e-01, -1.978e-01, 2.645e-02, -9.582e-02, 2.618e-02, 9.350e-02, -2.281e-02, -1.901e-01, 1.176e-02, -1.571e-01, 1.491e-02, -2.105e-01, -1.685e-01, -2.459e-01, -2.166e-01, 1.082e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(2.225e-02, 7.813e-02, -4.112e-02, 6.166e-02, -4.143e-02, -2.160e-02, -7.478e-02, -2.251e-02, -1.306e-02, -6.002e-02, -7.496e-02, -2.538e-03, 7.824e-02, 9.597e-02, -3.546e-03, -1.794e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 5 (conv4) entry point. Takes nine l0() taps from t1 at offsets -1..1
// in x and y, splits each tap into a positive part (s0_*) and a negative part
// (s1_*), and stores f0() of those 18 values into t0 at this thread's
// coordinate.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() (defined outside this
	// chunk) presumably picks them up textually through l0() -- confirm
	// before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC conv5
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y): wraps O(t0, float2(x, y)) in a V4 conversion. O is defined outside
// this chunk; presumably it fetches t0 at an (x, y) offset from the current
// pixel -- confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Weighted sum for pass 6 (conv5): `r` starts from a four-value constant and
// each MulAdd call folds one input vector and its 16-value M4 constant into
// it, first for s0_0..s0_8 and then for s1_0..s1_8; the final `r` is returned.
// MulAdd, V4 and M4 are defined outside this chunk -- presumably a
// vector-times-4x4-matrix accumulate; confirm against their definitions.
// NOTE(review): the numeric literals look like generated network weights;
// do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 1.102e-03, 4.481e-03, 3.096e-03, -9.818e-03 };
|
||||||
|
r = MulAdd(s0_0, M4(-1.069e-01, 1.009e-01, -5.972e-02, -1.732e-02, -9.217e-02, 9.177e-03, -3.127e-02, -5.872e-02, -1.364e-02, -9.990e-04, 1.518e-01, 5.861e-02, -9.835e-02, -1.155e-01, 6.714e-02, -5.142e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(1.404e-02, 1.372e-01, -2.759e-01, -4.361e-02, -1.407e-01, 1.570e-01, -1.216e-01, -7.289e-02, 3.088e-01, -1.285e-01, 1.107e-01, 1.651e-01, 1.596e-01, -1.569e-01, 1.437e-02, -1.455e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-4.001e-02, 1.772e-01, -2.761e-01, 4.916e-02, -1.489e-01, 1.680e-01, -5.244e-02, 1.334e-01, 1.245e-01, -2.321e-01, 5.371e-01, -2.549e-01, -9.624e-02, -1.072e-01, 2.322e-01, -2.261e-01), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.291e-01, 7.774e-04, -1.015e-02, 6.036e-02, -1.133e-01, 7.554e-02, 1.081e-01, 1.704e-01, 2.123e-01, -2.065e-01, 4.928e-02, 2.352e-03, -2.488e-01, -1.765e-01, 2.044e-01, 1.302e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.195e-01, -5.410e-01, -4.771e-01, -1.713e-01, 2.778e-01, -1.028e-01, 8.603e-02, 2.162e-01, 1.466e-02, 2.633e-02, -3.299e-01, -5.183e-02, -3.598e-01, -4.015e-01, 5.674e-02, -1.429e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-1.480e-01, 2.440e-01, -2.189e-01, 1.407e-01, -3.439e-01, 2.624e-01, 4.947e-01, 7.813e-01, 1.067e-01, -6.781e-02, -5.271e-02, -1.331e-02, -2.133e-01, -1.038e-01, 4.267e-01, -4.026e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-1.086e-01, 2.607e-01, -1.897e-01, -1.710e-01, 6.096e-02, -1.121e-01, 8.797e-02, -8.204e-02, 4.825e-02, -9.364e-02, 8.472e-02, -1.923e-02, -1.755e-01, 1.086e-01, -3.987e-02, 1.737e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(5.606e-02, 4.516e-02, -7.352e-02, 7.654e-02, -6.706e-02, 2.674e-01, -2.388e-01, -1.997e-01, 9.871e-02, -9.055e-02, 1.274e-01, 1.854e-01, -1.765e-01, -1.779e-01, 1.114e-01, -1.882e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(-4.811e-02, 2.057e-01, -2.913e-01, 1.265e-01, 1.304e-01, 1.462e-01, -4.432e-03, 4.191e-01, 6.606e-02, -1.382e-01, 1.052e-01, -3.990e-01, 9.737e-02, -9.675e-02, 6.216e-02, -2.130e-01), r);
|
||||||
|
r = MulAdd(s1_0, M4(-1.183e-01, -5.696e-02, 9.372e-02, 3.074e-03, -2.694e-02, -2.272e-02, -3.489e-02, -2.667e-02, 1.635e-01, -5.761e-04, -1.677e-03, -1.076e-01, -5.411e-02, -1.100e-02, 1.742e-02, 6.403e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-4.462e-03, 3.912e-02, -1.208e-01, -9.360e-02, -1.260e-01, 1.602e-02, -1.047e-01, -1.252e-01, 2.940e-01, 1.068e-01, -2.602e-01, 1.692e-01, 1.120e-01, -2.613e-02, -1.083e-02, 1.754e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(2.307e-02, 1.240e-01, -2.024e-01, 1.761e-01, -2.326e-01, 3.209e-02, 5.352e-02, 3.399e-02, 1.754e-01, -3.059e-01, 4.554e-01, -2.412e-01, 4.242e-03, 3.919e-02, 7.769e-02, -1.155e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(-1.946e-01, -9.445e-02, 1.698e-01, 1.165e-01, -1.571e-01, 1.700e-02, 5.682e-02, 4.628e-02, 4.425e-01, -1.872e-01, 3.713e-02, 8.537e-02, 4.211e-02, -6.178e-02, 1.398e-02, 5.929e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(5.957e-01, -6.855e-01, -3.668e-01, -2.565e-01, -4.383e-02, -8.094e-02, -2.101e-02, -2.446e-01, -7.781e-02, 5.879e-01, -5.272e-01, -1.786e-01, -2.396e-01, -4.148e-01, 5.226e-02, 9.011e-02), r);
|
||||||
|
r = MulAdd(s1_5, M4(-4.655e-02, 1.107e-01, -1.109e-01, 3.601e-01, -2.103e-01, 3.712e-01, 1.666e-01, 3.972e-01, -2.227e-02, -2.115e-02, -7.054e-02, -1.216e-01, 4.739e-03, 1.201e-01, 1.335e-01, -1.775e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-7.542e-02, 9.157e-02, 1.143e-02, -7.961e-02, -3.812e-02, 1.722e-02, 1.396e-02, -3.920e-02, -6.220e-03, -6.723e-02, 9.364e-02, -4.804e-02, -8.885e-02, 1.313e-01, -7.872e-02, 2.733e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-3.879e-01, -2.705e-01, 3.305e-01, -1.542e-01, -1.179e-01, 9.695e-02, -1.353e-01, -2.320e-01, 1.433e-02, -2.689e-01, 2.066e-01, 3.704e-01, -5.587e-02, -6.296e-02, 6.326e-02, 1.881e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-4.722e-02, -5.909e-02, 4.089e-02, -8.851e-02, 2.017e-01, -2.652e-02, 9.432e-02, 3.252e-01, -2.219e-01, 2.142e-02, -4.496e-02, 5.456e-02, 2.364e-02, 1.081e-01, -9.898e-02, 9.928e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 6 (conv5) entry point. Takes nine l0() taps from t0 at offsets -1..1
// in x and y, splits each tap into a positive part (s0_*) and a negative part
// (s1_*), and stores f0() of those 18 values into t1 at this thread's
// coordinate.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() (defined outside this
	// chunk) presumably picks them up textually through l0() -- confirm
	// before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 7
|
||||||
|
//!DESC conv6
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): wraps O(t1, float2(x, y)) in a V4 conversion. O is defined outside
// this chunk; presumably it fetches t1 at an (x, y) offset from the current
// pixel -- confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Weighted sum for pass 7 (conv6): `r` starts from a four-value constant and
// each MulAdd call folds one input vector and its 16-value M4 constant into
// it, first for s0_0..s0_8 and then for s1_0..s1_8; the final `r` is returned.
// MulAdd, V4 and M4 are defined outside this chunk -- presumably a
// vector-times-4x4-matrix accumulate; confirm against their definitions.
// NOTE(review): the numeric literals look like generated network weights;
// do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { -6.918e-03, -1.945e-03, -7.751e-03, 1.645e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(-3.488e-02, 3.507e-02, 3.848e-02, -5.906e-02, 9.669e-02, 3.121e-02, -2.182e-02, 1.691e-01, -1.132e-01, -7.602e-02, -5.000e-02, -6.017e-03, 3.962e-02, 1.086e-01, -3.343e-04, 9.002e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(9.453e-02, -1.793e-01, -6.074e-02, 5.317e-03, 1.056e-01, 3.460e-01, 5.291e-02, 7.825e-02, 5.510e-02, 4.818e-02, -1.119e-02, 3.913e-02, -8.177e-02, -1.060e-01, -9.989e-03, -9.245e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-8.190e-02, 1.375e-01, -4.322e-02, -6.721e-02, 1.645e-02, -1.392e-01, 7.103e-02, -1.950e-02, 4.302e-03, -3.213e-02, -7.517e-03, -3.406e-03, -2.132e-02, 1.333e-01, -6.553e-02, 7.300e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-1.102e-01, 3.005e-01, -8.521e-02, 3.002e-01, 1.866e-01, 1.089e-01, -2.968e-02, 1.271e-01, -3.566e-01, 1.224e-01, -7.462e-02, -2.765e-01, 5.175e-02, 1.567e-01, 1.450e-01, -1.948e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(1.558e-01, 3.780e-02, 9.697e-02, -2.485e-01, -3.560e-01, -3.667e-01, 1.396e-01, 1.020e+00, -2.319e-01, -2.878e-01, -2.849e-01, 5.648e-01, 2.094e-01, -5.684e-01, 1.482e-01, -6.172e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-1.276e-01, -1.685e-01, 4.271e-01, -1.489e-01, 2.154e-01, 2.661e-01, -1.093e-01, -7.859e-02, 6.618e-02, 9.795e-02, 2.778e-02, -1.286e-01, -1.527e-01, -3.586e-01, 2.523e-01, 9.196e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-1.354e-01, -6.680e-02, 5.541e-02, -5.314e-02, 1.639e-02, -1.639e-01, -1.856e-01, -1.863e-01, -1.519e-01, -5.459e-02, 1.027e-01, 6.492e-02, 3.482e-02, -9.074e-03, 1.861e-01, 1.393e-01), r);
|
||||||
|
r = MulAdd(s0_7, M4(1.907e-02, 1.189e-02, -5.038e-01, -8.478e-02, 3.643e-01, 1.086e-02, 3.067e-01, 1.071e-01, -6.552e-01, 1.505e-01, -7.394e-01, 1.155e-01, -1.815e-01, -1.739e-02, -2.723e-01, -1.607e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(-8.319e-02, -2.563e-02, -1.127e-01, -7.792e-02, 1.295e-01, 1.091e-01, 2.920e-02, -5.761e-02, -9.443e-02, 7.429e-03, -2.117e-01, -3.670e-02, -7.118e-02, -4.469e-02, -6.460e-02, -1.261e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(2.400e-02, -2.740e-02, -3.394e-02, 5.817e-02, -6.716e-02, -5.672e-02, -7.339e-02, -3.921e-02, -9.506e-02, -3.805e-02, -3.235e-02, -8.145e-02, 1.265e-02, 7.308e-02, -5.707e-02, 1.141e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.565e-01, 1.052e-01, -8.934e-02, -6.945e-02, 3.804e-02, 2.091e-01, -1.102e-01, 2.394e-01, 6.041e-02, -9.942e-02, -6.054e-03, 4.857e-02, -7.265e-02, 1.596e-02, 9.135e-02, -8.397e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-9.449e-02, 1.121e-01, -1.101e-01, -2.980e-02, 5.100e-02, -6.337e-02, 1.692e-01, -5.062e-02, -3.931e-02, 1.083e-01, 3.952e-03, 9.801e-04, -6.425e-02, 8.015e-02, -1.628e-01, 8.317e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(7.400e-02, 8.412e-02, 2.984e-02, 8.693e-02, -1.474e-01, -3.529e-02, -6.134e-02, -1.107e-01, -3.264e-01, 8.009e-02, -2.261e-01, -1.472e-01, -4.683e-02, -1.258e-01, 1.061e-01, -1.125e-01), r);
|
||||||
|
r = MulAdd(s1_4, M4(4.970e-01, -1.211e-01, 2.379e-01, 2.124e-01, -1.003e-01, -5.656e-01, 5.001e-02, 4.959e-01, 1.538e-01, -7.985e-01, -2.085e-01, 2.220e-01, 7.247e-02, 6.581e-02, -9.437e-02, -3.066e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-8.611e-02, -9.199e-02, 2.518e-01, -7.482e-02, -1.208e-01, 1.015e-01, 3.428e-02, -1.354e-01, 1.038e-01, -4.497e-02, 2.744e-01, -4.281e-02, 4.090e-02, -2.726e-01, 1.839e-01, 1.138e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-8.703e-02, -4.776e-02, -1.477e-01, -1.870e-02, -1.072e-01, 3.204e-02, -8.396e-03, 1.175e-01, 1.685e-01, -1.427e-01, 2.152e-01, -2.155e-01, 1.898e-03, 5.924e-02, -1.089e-02, 5.197e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-9.679e-02, 1.961e-02, 1.636e-01, -6.049e-02, -7.071e-03, 1.519e-01, -6.303e-01, 4.739e-02, -3.331e-01, 8.291e-02, -5.944e-01, -7.677e-02, -1.164e-01, -4.580e-02, 1.419e-01, 6.839e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(6.174e-02, -4.004e-02, 1.256e-02, -4.981e-02, 4.659e-03, 8.371e-02, -1.664e-01, -2.897e-02, -1.253e-01, 2.381e-02, -1.147e-01, -8.724e-02, -3.736e-02, 1.140e-02, -1.550e-01, 6.350e-03), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 7 (conv6) entry point. Takes nine l0() taps from t1 at offsets -1..1
// in x and y, splits each tap into a positive part (s0_*) and a negative part
// (s1_*), and stores f0() of those 18 values into t0 at this thread's
// coordinate.
void Pass7(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() (defined outside this
	// chunk) presumably picks them up textually through l0() -- confirm
	// before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 8
|
||||||
|
//!DESC conv7
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y): wraps O(t0, float2(x, y)) in a V4 conversion. O is defined outside
// this chunk; presumably it fetches t0 at an (x, y) offset from the current
// pixel -- confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Weighted sum for pass 8 (conv7): `r` starts from a four-value constant and
// each MulAdd call folds one input vector and its 16-value M4 constant into
// it, first for s0_0..s0_8 and then for s1_0..s1_8; the final `r` is returned.
// MulAdd, V4 and M4 are defined outside this chunk -- presumably a
// vector-times-4x4-matrix accumulate; confirm against their definitions.
// NOTE(review): the numeric literals look like generated network weights;
// do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 1.569e-02, 1.505e-02, 2.765e-02, 1.258e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(-4.116e-02, -3.385e-02, -4.697e-02, 4.650e-02, -3.488e-02, 1.006e-01, -4.538e-03, 4.637e-02, 1.288e-01, 7.769e-03, 1.150e-01, -7.930e-03, 1.045e-02, 4.849e-02, 2.767e-02, 4.909e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-5.332e-01, -5.254e-01, -3.541e-01, -3.525e-01, 1.117e-02, 2.929e-02, 6.817e-02, 9.115e-02, 1.055e+00, 6.141e-02, 3.976e-01, 4.649e-02, 2.561e-01, -1.191e-01, 1.230e-03, -1.047e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-1.450e-01, -2.829e-01, -6.857e-01, -4.294e-01, 3.217e-02, 2.745e-02, 5.242e-02, 3.556e-02, 1.284e-01, 4.292e-01, 7.161e-01, 2.220e-01, -1.508e-02, 1.802e-01, 1.842e-01, 9.827e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.381e-01, 5.690e-02, 5.107e-02, 6.625e-02, -1.173e-01, -7.448e-02, -1.152e-01, -1.808e-01, -1.470e-01, -1.833e-01, -1.653e-01, -1.217e-01, 9.096e-02, 5.579e-02, 1.128e-02, 9.791e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(5.936e-01, -2.579e-01, 5.761e-01, -7.051e-01, -7.023e-01, 2.824e-01, 2.057e-01, 3.628e-01, 1.006e-02, 3.209e-01, 6.969e-02, -3.464e-01, 4.768e-01, -3.194e-01, -4.817e-02, 3.050e-02), r);
|
||||||
|
r = MulAdd(s0_5, M4(2.145e-01, -1.899e-01, 1.446e-01, 2.497e-02, -8.750e-02, -3.154e-01, -5.060e-01, -7.413e-02, -8.542e-02, -4.198e-02, -1.528e-01, -1.812e-01, -2.597e-01, 8.374e-02, -5.592e-01, -2.557e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(5.713e-02, -4.294e-03, 2.388e-02, -7.124e-02, -2.163e-02, -3.642e-03, 3.839e-02, -6.934e-02, -9.052e-02, -1.153e-02, 1.213e-02, 7.120e-02, -3.698e-02, 4.260e-02, -7.245e-02, 7.898e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(2.780e-02, 1.944e-02, 1.415e-01, 1.216e-01, 9.163e-02, -3.069e-02, -1.829e-02, -2.182e-01, 5.815e-02, -1.923e-02, -5.934e-02, -3.487e-02, -1.082e-01, 1.362e-01, 8.120e-02, 2.621e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(9.334e-03, -1.300e-02, 4.936e-02, 1.751e-01, -1.214e-01, 1.629e-02, -1.131e-01, 7.402e-02, 1.134e-02, 1.663e-03, -5.887e-03, -8.862e-02, 1.029e-01, -5.629e-02, 9.127e-02, -6.668e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(1.165e-02, 4.389e-02, 6.299e-03, 7.939e-02, -2.769e-02, 9.353e-02, 6.239e-02, 1.341e-02, 4.713e-02, -2.731e-03, 5.256e-02, -3.515e-02, -8.911e-02, -1.425e-01, -7.889e-02, -1.627e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(-2.869e-01, 2.838e-02, -2.541e-02, 5.216e-02, 2.660e-01, -2.095e-01, 1.375e-01, -2.562e-02, 2.715e-01, 1.694e-01, 9.471e-02, -7.292e-03, 3.257e-01, -2.247e-01, 7.698e-03, -2.076e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(1.832e-02, -1.860e-01, -4.951e-02, -1.392e-03, 9.307e-02, 7.671e-02, 1.043e-01, -3.675e-02, 1.433e-03, 1.219e-01, 1.978e-01, 5.960e-02, 9.624e-02, 1.448e-01, 3.561e-01, 3.054e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-4.647e-02, 4.225e-03, 3.830e-02, -3.233e-02, -1.532e-01, -6.289e-01, -3.037e-01, -4.131e-01, -1.794e-01, -4.090e-02, -9.644e-02, -4.828e-02, 7.978e-02, 6.792e-03, 5.043e-02, 4.905e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.967e-01, -5.750e-02, 1.168e-01, -2.681e-02, 1.232e-01, -2.481e-03, 8.164e-01, 2.468e-01, -3.721e-01, -5.041e-02, -4.796e-01, -2.778e-02, 3.623e-01, -8.387e-01, -5.229e-01, -4.492e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.507e-02, 1.343e-01, 8.567e-02, 8.923e-02, -2.766e-03, -1.548e-01, -2.588e-01, -1.295e-01, 1.777e-02, -9.243e-02, -4.495e-02, -5.528e-02, -1.071e-01, -1.284e-01, -4.142e-01, -1.800e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-4.695e-03, -9.431e-04, -1.256e-02, -4.959e-03, 1.607e-01, -8.763e-02, 2.039e-01, -1.243e-01, 3.725e-02, -5.612e-02, -3.615e-03, -2.475e-02, 4.955e-02, 4.065e-02, -1.879e-02, -1.195e-01), r);
|
||||||
|
r = MulAdd(s1_7, M4(-1.039e-02, 5.631e-02, 2.655e-02, 7.419e-02, 1.286e-01, -6.430e-02, 4.800e-02, -4.480e-02, 5.067e-03, -4.197e-02, -3.342e-02, -7.461e-02, -3.225e-02, 6.062e-03, 5.391e-02, -7.135e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(5.195e-02, 3.799e-02, 1.130e-01, -8.811e-03, -4.285e-02, 1.609e-02, -8.972e-03, 3.530e-02, -6.932e-02, -3.013e-03, -5.208e-02, 5.823e-02, -4.561e-02, -1.068e-01, -1.458e-01, -5.739e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 8 (conv7) entry point. Takes nine l0() taps from t0 at offsets -1..1
// in x and y, splits each tap into a positive part (s0_*) and a negative part
// (s1_*), and stores f0() of those 18 values into t1 at this thread's
// coordinate.
void Pass8(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() (defined outside this
	// chunk) presumably picks them up textually through l0() -- confirm
	// before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 9
|
||||||
|
//!DESC conv8
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y): wraps O(t1, float2(x, y)) in a V4 conversion. O is defined outside
// this chunk; presumably it fetches t1 at an (x, y) offset from the current
// pixel -- confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// Weighted sum for pass 9 (conv8): `r` starts from a four-value constant and
// each MulAdd call folds one input vector and its 16-value M4 constant into
// it, first for s0_0..s0_8 and then for s1_0..s1_8; the final `r` is returned.
// MulAdd, V4 and M4 are defined outside this chunk -- presumably a
// vector-times-4x4-matrix accumulate; confirm against their definitions.
// NOTE(review): the numeric literals look like generated network weights;
// do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
V4 r = { 8.112e-05, 3.290e-03, -6.342e-04, 1.340e-02 };
|
||||||
|
r = MulAdd(s0_0, M4(4.724e-03, 6.987e-03, -3.797e-03, 2.147e-02, -5.616e-03, 1.123e-02, -2.768e-02, 8.185e-03, -4.051e-03, 5.608e-05, -9.522e-02, 2.924e-02, -5.976e-03, 8.331e-03, 7.513e-02, -2.513e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-2.224e-02, 1.093e-04, 5.901e-02, 2.350e-02, 1.167e-01, -7.837e-02, 1.939e-01, 1.987e-01, 5.530e-02, -4.759e-05, 1.221e-01, 4.764e-02, -8.813e-02, 7.695e-02, -4.577e-01, 1.671e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(-5.696e-02, 6.005e-03, -5.620e-02, -8.978e-02, 4.014e-02, -3.822e-02, 1.081e-01, -6.532e-03, 9.444e-03, 7.498e-03, -3.228e-02, 4.908e-02, -2.043e-02, 2.374e-02, 2.163e-02, -4.505e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-8.098e-02, 1.943e-02, -5.744e-02, 3.824e-02, -2.071e-01, 1.036e-01, -6.926e-02, -2.348e-01, 2.378e-01, -1.069e-01, -5.307e-02, 1.161e-01, 1.881e-01, -5.785e-02, -6.570e-02, 2.227e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(7.577e-02, -4.125e-02, 1.714e-01, -6.934e-01, -2.448e-01, 1.146e-01, 2.354e-01, -4.935e-01, -2.321e-01, -8.273e-02, 5.890e-02, 5.704e-01, 4.833e-02, 2.875e-02, 1.163e-01, -1.802e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(2.287e-01, -3.461e-02, -2.542e-02, 2.882e-02, 7.142e-02, -1.556e-01, 4.055e-02, 1.534e-02, -1.647e-01, 3.087e-03, -6.811e-02, -3.896e-02, 1.334e-01, 1.188e-01, -1.847e-01, 4.293e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-3.094e-02, 2.712e-03, 3.387e-03, 1.877e-02, 9.494e-02, -2.863e-02, -4.239e-02, -3.402e-02, 5.541e-03, -1.178e-02, 1.795e-02, -3.515e-02, -3.044e-02, -2.463e-02, -1.320e-02, 8.952e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(1.035e-01, -3.181e-02, 1.902e-02, 3.973e-03, 2.267e-01, -2.620e-01, 1.821e-01, 1.631e-01, 1.494e-02, 6.125e-02, -6.176e-02, -2.497e-02, -1.364e-02, 7.542e-02, -8.480e-02, -4.648e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.466e-01, 3.028e-02, 2.798e-02, -7.887e-02, -4.370e-02, 1.408e-02, -6.161e-02, -3.034e-02, 6.567e-02, 2.071e-02, 3.126e-02, 6.993e-02, -5.556e-02, 1.507e-02, 2.991e-02, -4.924e-02), r);
|
||||||
|
r = MulAdd(s1_0, M4(1.637e-02, -2.767e-02, 8.568e-02, -4.254e-02, 3.215e-02, 1.987e-04, -3.697e-02, 3.787e-02, 2.236e-02, -6.576e-02, 7.400e-02, 1.093e-01, 3.271e-03, 1.809e-03, 1.011e-02, 1.509e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(5.538e-02, -5.865e-02, 4.351e-01, 2.494e-01, 1.101e-01, -1.484e-02, 5.176e-01, 3.999e-02, -4.782e-03, 1.155e-01, -2.099e-01, 5.012e-03, -1.919e-01, 2.292e-01, -5.378e-01, -1.223e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-5.691e-02, 7.653e-02, -2.572e-01, -1.332e-01, -5.652e-02, -5.008e-02, 7.840e-02, -3.729e-02, 6.942e-02, 6.483e-04, -2.243e-05, 8.430e-02, -6.848e-02, -2.096e-02, -3.908e-02, -9.062e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(2.725e-01, -1.841e-01, -6.710e-03, 3.965e-01, -1.298e-01, -4.014e-03, 2.007e-01, -3.700e-01, 5.329e-01, -4.014e-01, 2.619e-02, 1.606e-01, 2.179e-01, -1.403e-01, 4.227e-02, 8.568e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(4.188e-01, -7.320e-01, -5.609e-01, -6.087e-01, -7.521e-01, 7.363e-01, -6.253e-01, -2.011e-01, -1.017e+00, 3.331e-02, -2.135e-02, 2.084e-01, 6.074e-01, -9.824e-01, 5.154e-01, 1.748e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.733e-01, 5.176e-01, -7.335e-02, 1.899e-02, 1.028e-01, -6.330e-02, -1.632e-01, 9.241e-05, -1.357e-01, -1.131e-01, 9.644e-02, -1.424e-02, -1.835e-02, 7.296e-01, -3.204e-01, -2.966e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(4.798e-02, -1.047e-01, 3.646e-02, 6.703e-02, 5.371e-02, 4.759e-02, -2.975e-02, -6.945e-02, 7.985e-02, -1.101e-01, 3.034e-02, -1.472e-02, -3.827e-02, 9.839e-03, -4.922e-03, 4.307e-03), r);
|
||||||
|
r = MulAdd(s1_7, M4(-8.950e-02, -1.253e-02, -1.730e-05, 3.862e-02, 2.692e-01, -4.645e-01, 2.399e-01, 2.744e-01, -4.503e-02, 1.724e-01, -7.935e-02, -5.200e-02, -2.132e-03, -1.926e-02, 2.926e-02, -2.288e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(-1.072e-01, -1.145e-02, 6.605e-03, -1.090e-01, -7.524e-03, 8.598e-02, -7.698e-02, -6.976e-02, 5.869e-02, -5.499e-02, 3.529e-02, 7.813e-02, -1.794e-01, 4.212e-02, -4.479e-03, -7.253e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 9 (conv8) entry point. Takes nine l0() taps from t1 at offsets -1..1
// in x and y, splits each tap into a positive part (s0_*) and a negative part
// (s1_*), and stores f0() of those 18 values into t0 at this thread's
// coordinate.
void Pass9(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their original names: O() (defined outside this
	// chunk) presumably picks them up textually through l0() -- confirm
	// before renaming either one.
	float2 pt = float2(GetInputPt());

	// Coordinate handled by this thread; leave early when it is outside the input.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (!(gxy.x < size.x && gxy.y < size.y)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Row y = -1: negative part is taken before the tap is clamped in place.
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);

	// Row y = 0.
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);

	// Row y = +1.
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 10
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// Tap fetch for this pass: wraps the O() lookup on t0 at offset (x, y) and casts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// Output kernel of the out-shuffle pass. Starts from a constant 4-vector and folds in
// one MulAdd(input, constant 4x4 matrix, accumulator) per input vector: s0_0..s0_8
// first, then s1_0..s1_8. The accumulated value is passed through tanh() on return.
// NOTE(review): MulAdd is not defined in this block; presumably it computes
// input * matrix + accumulator — confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
    V4 acc = { 3.107e-03, 3.655e-03, 5.416e-04, 5.397e-04 };

    // Contributions of the s0_* inputs.
    acc = MulAdd(s0_0, M4(-1.348e-01, -9.107e-02, -4.849e-02, 4.484e-04, -3.384e-02, -6.768e-02, -9.628e-03, -1.766e-02, -9.939e-03, -2.182e-02, -1.288e-02, 8.518e-03, 2.218e-02, -1.184e-03, 1.240e-03, 1.065e-02), acc);
    acc = MulAdd(s0_1, M4(7.301e-02, 1.014e-01, -1.363e-02, -4.850e-02, -2.842e-01, -3.060e-02, -4.154e-02, 4.057e-03, -3.458e-02, -6.335e-02, -2.660e-02, -1.335e-02, -7.944e-03, 8.560e-03, 4.588e-02, 7.580e-03), acc);
    acc = MulAdd(s0_2, M4(-9.349e-05, -2.142e-02, -1.258e-04, -9.330e-03, -5.058e-03, 3.912e-02, -2.976e-02, 2.410e-02, -8.512e-03, 4.954e-02, -2.093e-02, -2.582e-03, 1.648e-02, 7.942e-03, 1.520e-02, 3.414e-02), acc);
    acc = MulAdd(s0_3, M4(-1.021e-01, -1.140e-01, 2.412e-01, 1.289e-02, -1.192e-01, -1.140e-01, 2.395e-01, 6.930e-03, -2.027e-01, -4.824e-02, 1.243e-01, 3.820e-03, 9.280e-03, 2.866e-02, 2.106e-02, 3.644e-03), acc);
    acc = MulAdd(s0_4, M4(8.727e-02, 8.162e-02, 1.478e-01, 5.348e-01, -3.115e-01, -2.605e-02, 1.510e-01, 7.249e-01, -8.110e-02, -6.698e-01, 1.080e-01, -8.090e-02, -3.492e-01, -1.891e-01, -1.877e-01, -1.319e-01), acc);
    acc = MulAdd(s0_5, M4(-5.622e-03, 2.237e-02, -4.008e-03, -1.980e-02, -1.837e-02, -3.311e-02, 4.289e-02, 3.256e-02, -2.178e-02, 2.653e-02, -1.722e-03, 8.373e-02, -8.042e-02, -2.962e-01, -4.643e-03, -6.865e-02), acc);
    acc = MulAdd(s0_6, M4(6.248e-03, -2.320e-02, 1.883e-03, -1.430e-02, 1.224e-02, 5.634e-03, -1.964e-02, -1.627e-02, 2.010e-02, 1.174e-02, -3.919e-02, 9.559e-04, 3.016e-02, -2.836e-03, 7.667e-02, 3.552e-02), acc);
    acc = MulAdd(s0_7, M4(-6.141e-03, 1.380e-02, 1.024e-02, -1.210e-02, 4.548e-02, 3.626e-02, -9.142e-02, -7.666e-02, -3.241e-02, -2.296e-02, -3.244e-02, -2.870e-01, 4.427e-02, 8.899e-02, -1.327e-01, 4.920e-02), acc);
    acc = MulAdd(s0_8, M4(-2.801e-03, -9.930e-04, -2.770e-03, 1.623e-02, 2.158e-03, -1.258e-02, -3.089e-02, 3.211e-02, -9.620e-03, 1.776e-02, -4.337e-03, 4.676e-02, 1.130e-02, -7.436e-03, -3.572e-02, -1.742e-01), acc);

    // Contributions of the s1_* inputs.
    acc = MulAdd(s1_0, M4(-4.306e-02, -6.039e-02, -1.642e-02, -1.966e-02, -5.996e-02, -1.743e-01, -3.128e-02, 1.714e-02, -4.357e-03, -7.720e-03, -4.532e-03, 4.571e-03, 3.988e-02, 2.067e-02, 1.548e-02, -2.964e-04), acc);
    acc = MulAdd(s1_1, M4(-6.070e-02, -9.324e-02, 7.472e-03, 2.173e-02, -7.996e-02, -5.139e-02, -5.545e-02, -1.891e-02, -1.767e-02, -1.527e-02, -2.906e-02, 1.310e-02, 2.594e-02, 7.495e-02, -7.681e-03, -4.678e-03), acc);
    acc = MulAdd(s1_2, M4(4.883e-02, -3.167e-02, 2.862e-02, 3.357e-02, -8.454e-03, 9.265e-03, -1.657e-02, -8.086e-03, -1.170e-02, -3.549e-02, 7.437e-03, 1.425e-02, 1.441e-02, -1.961e-02, 1.560e-02, -1.122e-02), acc);
    acc = MulAdd(s1_3, M4(-6.156e-02, -6.763e-02, 1.987e-01, 2.459e-02, -5.710e-02, -2.009e-01, 4.581e-01, -1.181e-02, -9.054e-02, -5.658e-02, 3.432e-02, 2.004e-02, 6.965e-03, -1.655e-02, 5.178e-03, -1.236e-02), acc);
    acc = MulAdd(s1_4, M4(-1.301e-01, -5.093e-02, 7.676e-01, 6.003e-01, -9.216e-02, -2.228e-03, 7.034e-02, 1.851e-01, -5.318e-01, -1.852e-01, -2.980e-02, 1.919e-02, -8.147e-01, -1.773e-01, -2.675e-01, 2.314e-02), acc);
    acc = MulAdd(s1_5, M4(-9.962e-03, -1.888e-01, -1.877e-02, 1.190e-01, -2.283e-02, -1.241e-02, 2.969e-04, 3.894e-02, -5.077e-02, 2.300e-01, -6.192e-02, 1.793e-01, 5.384e-03, -4.378e-01, 2.970e-02, -1.125e-01), acc);
    acc = MulAdd(s1_6, M4(1.376e-02, -2.891e-03, 9.292e-03, -1.288e-03, 2.615e-02, 2.656e-02, -8.111e-02, -1.779e-02, -7.512e-03, 9.174e-03, -4.553e-02, -1.139e-02, 2.259e-02, 4.351e-03, 4.963e-02, 2.002e-02), acc);
    acc = MulAdd(s1_7, M4(9.993e-03, 1.250e-02, -2.090e-02, 6.839e-03, 3.502e-03, 2.070e-03, -5.530e-02, -2.855e-03, 1.144e-02, -4.191e-02, -8.395e-02, -2.056e-01, 6.909e-02, 7.425e-02, -2.374e-01, 8.636e-02), acc);
    acc = MulAdd(s1_8, M4(-1.762e-02, -3.804e-04, 2.643e-02, 4.383e-02, 1.748e-03, -1.201e-02, -8.452e-03, -1.216e-02, -1.203e-02, -3.454e-02, -1.957e-02, 2.212e-01, -1.375e-02, -1.094e-02, -1.245e-02, -2.124e-01), acc);

    return tanh(acc);
}
|
||||||
|
|
||||||
|
// Entry point for pass 10 (out-shuffle). Each invocation owns a 2x2 quad of OUTPUT.
// It evaluates l0() over the 3x3 neighbourhood of the quad's source texel, runs f0()
// on the split taps, and uses the four components of the result as per-pixel
// corrections: for each quad pixel, INPUT is sampled with SL, converted with rgb2yuv,
// the correction is added to the first (Y) component and saturated, and the result
// is converted back with yuv2rgb. Component order is x = top-left, y = top-right,
// w = bottom-right, z = bottom-left.
void Pass10(uint2 blockStart, uint3 tid) {
    // NOTE(review): the l0()/O() macros presumably read `pos` and `pt` by name; keep both names.
    float2 pt = float2(GetInputPt());
    // Top-left pixel of this invocation's 2x2 output quad.
    uint2 quad = blockStart + (Rmp8x8(tid.x) << 1);
    const uint2 outExtent = GetOutputSize();
    if (any(quad >= outExtent)) {
        return;
    }
    // Position of the source texel that the quad is derived from.
    float2 pos = ((quad >> 1) + 0.5) * pt;

    // For each tap: fetch, keep min(v, 0) in s1_*, then clamp s0_* to max(v, 0).
    // Row y = -1.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    // Row y = 0.
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    // Row y = +1.
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    const V4 delta = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);

    static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
    static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
    const float2 opt = float2(GetOutputPt());

    // The quad is walked clockwise with incremental updates of `quad` and `pos`;
    // the update order is kept exactly so the sampled coordinates do not change.

    // Top-left pixel: step back half an output pixel from the source texel position.
    pos -= 0.5f * opt;
    MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    OUTPUT[quad] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

    // Top-right pixel.
    ++quad.x;
    pos.x += opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    OUTPUT[quad] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

    // Bottom-right pixel.
    ++quad.y;
    pos.y += opt.y;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    OUTPUT[quad] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

    // Bottom-left pixel.
    --quad.x;
    pos.x -= opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    OUTPUT[quad] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||||
772
src/Effects/CuNNy/CuNNy-8x4C-NVL.hlsl
Normal file
772
src/Effects/CuNNy/CuNNy-8x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,772 @@
|
||||||
|
// CuNNy 8x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
|
||||||
|
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//!MAGPIE EFFECT
|
||||||
|
//!VERSION 4
|
||||||
|
//!SORT_NAME CuNNy-D04N08
|
||||||
|
//!USE MulAdd
|
||||||
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
Texture2D INPUT;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER POINT
|
||||||
|
SamplerState SP;
|
||||||
|
|
||||||
|
//!SAMPLER
|
||||||
|
//!FILTER LINEAR
|
||||||
|
SamplerState SL;
|
||||||
|
|
||||||
|
//!COMMON
|
||||||
|
// Samples texture t with sampler SP at pos + p * pt, mip level 0.
// Expands in place, so the call site must have variables named `pos` and `pt` in scope.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||||
|
#define V4 MF4
|
||||||
|
#define M4 MF4x4
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t0;
|
||||||
|
|
||||||
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
|
//!FORMAT R8G8B8A8_SNORM
|
||||||
|
Texture2D t1;
|
||||||
|
|
||||||
|
//!PASS 1
|
||||||
|
//!DESC in
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap fetch for pass 1: samples INPUT at offset (x, y) through O() and reduces its RGB
// to a single scalar — a fixed weighted sum of the three channels plus a constant offset.
#define l0(x, y) (dot(MF3(2.666e-01, 5.050e-01, 1.135e-01), O(INPUT, float2(x, y)).rgb) + MF(-8.258e-01))
|
||||||
|
|
||||||
|
// Input-layer kernel (pass 1). Takes the nine scalar taps of a 3x3 neighbourhood and
// returns a 4-vector: a constant starting value plus, for every tap, the tap multiplied
// by a constant 4-vector (accumulated with mad()).
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
    V4 acc = { -8.495e-04, -1.121e-04, 1.842e-02, 5.844e-02 };

    acc = mad(s0_0, V4(-2.544e-02, -4.130e-01, -2.634e-01, 2.417e-02), acc);
    acc = mad(s0_1, V4(1.256e-02, -8.013e-02, 9.539e-02, -7.111e-02), acc);
    acc = mad(s0_2, V4(1.768e-02, -2.469e-01, -1.627e-01, 8.569e-02), acc);
    acc = mad(s0_3, V4(-1.554e-01, 3.441e-02, -1.508e-01, 2.491e-02), acc);
    acc = mad(s0_4, V4(1.628e-01, 8.679e-01, -1.960e-02, -5.810e-01), acc);
    acc = mad(s0_5, V4(-1.237e-02, -1.704e-01, 2.915e-01, -5.922e-01), acc);
    acc = mad(s0_6, V4(7.925e-01, 5.570e-03, 7.074e-02, 4.442e-04), acc);
    acc = mad(s0_7, V4(-7.910e-01, -1.530e-02, -8.229e-02, 3.149e-03), acc);
    acc = mad(s0_8, V4(-3.973e-03, 2.262e-02, -1.213e-01, 3.843e-02), acc);

    return acc;
}
|
||||||
|
|
||||||
|
// Entry point for pass 1 ("in"). Each invocation evaluates l0() at the nine offsets of
// the 3x3 neighbourhood of its texel and stores f0() of those nine scalars in t0.
void Pass1(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` are read by name inside the O() macro that l0() expands to; do not rename them.
    float2 pt = float2(GetInputPt());
    const uint2 texel = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Nothing to write for invocations mapped beyond the right or bottom edge.
    if (any(texel >= extent)) {
        return;
    }
    float2 pos = (texel + 0.5) * pt;

    // 3x3 taps, one row per line (y = -1, 0, +1), left to right.
    MF s0_0 = l0(-1.0, -1.0), s0_1 = l0(0.0, -1.0), s0_2 = l0(1.0, -1.0);
    MF s0_3 = l0(-1.0, 0.0), s0_4 = l0(0.0, 0.0), s0_5 = l0(1.0, 0.0);
    MF s0_6 = l0(-1.0, 1.0), s0_7 = l0(0.0, 1.0), s0_8 = l0(1.0, 1.0);

    t0[texel] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
}
|
||||||
|
|
||||||
|
//!PASS 2
|
||||||
|
//!DESC conv1
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap fetch for pass 2: reads t0 through O() at offset (x, y) and casts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// conv1 kernel (pass 2). Starts from a constant 4-vector and folds in one
// MulAdd(input, constant 4x4 matrix, accumulator) per input vector: s0_0..s0_8 first,
// then s1_0..s1_8. Returns the accumulated 4-vector unchanged.
// NOTE(review): MulAdd comes from the file's `//!USE MulAdd` directive; presumably it
// computes input * matrix + accumulator — confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
    V4 acc = { 1.016e-03, 5.583e-03, -1.608e-02, -1.996e-04 };

    // Contributions of the s0_* inputs.
    acc = MulAdd(s0_0, M4(4.254e-02, 1.997e-01, 4.636e-02, -4.800e-02, 2.043e-01, -4.096e-02, -7.212e-02, 1.408e-02, -3.916e-01, 2.630e-03, 7.016e-02, 9.613e-02, 1.773e-01, -2.723e-01, -9.458e-02, -1.890e-01), acc);
    acc = MulAdd(s0_1, M4(2.350e-01, -8.474e-01, -4.044e-01, -9.188e-01, 9.560e-03, 5.061e-02, 1.092e-02, 1.781e-01, -2.144e-01, 3.203e-02, 6.349e-02, -8.272e-02, -3.105e-01, -3.917e-02, -1.320e-02, -1.541e-01), acc);
    acc = MulAdd(s0_2, M4(-8.130e-01, -1.003e-01, 8.195e-02, -7.597e-01, 5.207e-02, 3.470e-02, -8.823e-03, -1.131e-01, -4.029e-02, 7.571e-02, -2.010e-01, 2.487e-01, 1.677e-01, -5.118e-02, -1.070e-01, 7.606e-02), acc);
    acc = MulAdd(s0_3, M4(-1.158e-02, 4.898e-02, 1.202e-02, 5.012e-01, -5.343e-02, 4.756e-02, -2.438e-01, 6.399e-02, 2.822e-01, -2.863e-02, 1.996e-01, -7.099e-02, -1.323e-01, -3.797e-01, 5.385e-02, -1.014e-01), acc);
    acc = MulAdd(s0_4, M4(2.812e-01, 7.903e-01, -1.733e-01, 6.668e-01, 4.775e-01, 5.452e-01, 7.089e-01, -1.851e-01, -2.382e-01, -5.180e-02, -3.623e-01, -3.040e-01, -4.313e-01, -1.167e-02, 1.235e-01, 1.436e-01), acc);
    acc = MulAdd(s0_5, M4(-1.291e-01, -3.022e-02, -4.083e-01, -5.939e-02, -4.249e-01, -1.750e-01, 1.094e-01, -1.176e-01, 1.374e-02, 1.342e-01, 2.086e-01, 2.841e-01, 2.347e-01, 1.450e-01, 7.604e-02, 2.176e-01), acc);
    acc = MulAdd(s0_6, M4(8.130e-02, -7.215e-02, -5.249e-02, 9.518e-03, -1.979e-01, -4.441e-02, -1.857e-01, -4.227e-01, 2.149e-01, -1.610e-01, 1.655e-01, -8.841e-02, 1.409e-01, -1.059e-01, 2.037e-01, -2.744e-03), acc);
    acc = MulAdd(s0_7, M4(-7.266e-02, 1.638e-02, -1.639e-01, 1.957e-02, -2.857e-01, 1.936e-01, -1.243e-01, -1.490e-01, 1.525e-01, -8.934e-02, 7.415e-02, -1.779e-01, 1.648e-02, -6.456e-02, 7.053e-02, -9.530e-02), acc);
    acc = MulAdd(s0_8, M4(-6.960e-02, -8.960e-02, -1.757e-02, -1.370e-01, -5.137e-01, -1.179e-01, -4.053e-01, -1.987e-01, 7.100e-02, 2.928e-02, -9.682e-02, 2.403e-01, 1.814e-01, 2.131e-02, 5.579e-02, 5.457e-02), acc);

    // Contributions of the s1_* inputs.
    acc = MulAdd(s1_0, M4(-2.737e-02, 5.272e-02, -1.801e-02, -2.491e-01, 2.871e-01, -3.704e-02, -6.568e-02, 2.905e-02, 1.011e-01, -3.782e-01, -8.696e-02, 4.682e-01, 3.233e-01, -3.060e-01, -3.251e-02, 1.165e+00), acc);
    acc = MulAdd(s1_1, M4(-4.994e-01, 3.049e-02, -8.802e-02, -6.179e-02, 7.133e-02, -1.957e-02, -4.465e-02, 1.130e-01, 7.255e-02, 6.956e-03, -1.204e-01, 3.699e-01, -8.844e-02, 4.624e-01, -9.881e-02, -2.512e-01), acc);
    acc = MulAdd(s1_2, M4(-3.645e-01, 1.274e-01, 2.387e-01, -1.963e-01, -5.995e-02, -5.943e-02, 9.694e-02, -2.518e-01, -2.797e-01, 1.598e-01, -1.371e-02, 4.000e-01, 2.213e-01, 9.692e-02, -3.302e-01, 1.132e+00), acc);
    acc = MulAdd(s1_3, M4(-8.539e-03, -6.535e-02, 5.575e-02, 1.928e-01, 1.156e-01, 5.227e-02, -3.039e-01, 4.794e-01, 1.441e-01, 1.929e-01, -4.689e-02, 2.023e-02, 1.330e-01, -1.358e+00, -5.393e-01, 7.907e-01), acc);
    acc = MulAdd(s1_4, M4(1.701e-01, -3.479e-02, 5.404e-01, -2.491e-01, 4.564e-01, 6.659e-01, 7.009e-01, -2.288e-02, -7.696e-01, -4.959e-01, 2.881e-01, -4.322e-01, -9.013e-01, -4.765e-01, 5.556e-02, -1.805e-01), acc);
    acc = MulAdd(s1_5, M4(-2.424e-01, 8.034e-03, -4.699e-02, -2.628e-01, -4.682e-01, 2.977e-02, 2.258e-01, -1.419e-01, 3.514e-01, 6.860e-03, 2.147e-01, 3.806e-01, 3.747e-01, 1.403e-01, 3.106e-01, 9.680e-01), acc);
    acc = MulAdd(s1_6, M4(1.776e-01, -4.873e-02, -1.403e-01, -1.817e-02, -3.551e-01, 4.838e-04, -2.786e-01, -6.048e-01, 3.082e-01, -4.703e-01, 2.419e-01, -3.002e-01, -4.310e-01, -6.490e-01, 1.343e+00, -1.019e+00), acc);
    acc = MulAdd(s1_7, M4(4.689e-02, -2.927e-02, -7.494e-02, -3.516e-02, -2.217e-01, -3.189e-01, 2.202e-01, -2.936e-01, 4.772e-02, -1.609e-01, 9.853e-02, -4.214e-01, 2.780e-01, -1.073e-01, 1.102e-01, -2.033e-01), acc);
    acc = MulAdd(s1_8, M4(-9.468e-02, 4.428e-02, 1.269e-01, -1.086e-01, -1.106e-01, -1.367e-01, -3.356e-01, 4.656e-03, 4.648e-02, -1.743e-02, -2.074e-01, -3.745e-02, 1.281e-01, -3.233e-01, 6.533e-01, 3.705e-01), acc);

    return acc;
}
|
||||||
|
|
||||||
|
// Entry point for pass 2 (conv1). Each invocation evaluates l0() at the nine offsets of
// the 3x3 neighbourhood of its texel, splits every tap into a non-negative part (s0_*)
// and a non-positive part (s1_*), and stores f0() of those 18 vectors in t1.
void Pass2(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` are read by name inside the O() macro that l0() expands to; do not rename them.
    float2 pt = float2(GetInputPt());
    const uint2 texel = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Nothing to write for invocations mapped beyond the right or bottom edge.
    if (any(texel >= extent)) {
        return;
    }
    float2 pos = (texel + 0.5) * pt;

    // For each tap: fetch, keep min(v, 0) in s1_*, then clamp s0_* to max(v, 0).
    // Row y = -1.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    // Row y = 0.
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    // Row y = +1.
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    t1[texel] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 3
|
||||||
|
//!DESC conv2
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap fetch for pass 3: reads t1 through O() at offset (x, y) and casts the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// conv2 kernel (pass 3). Starts from a constant 4-vector and folds in one
// MulAdd(input, constant 4x4 matrix, accumulator) per input vector: s0_0..s0_8 first,
// then s1_0..s1_8. Returns the accumulated 4-vector unchanged.
// NOTE(review): MulAdd comes from the file's `//!USE MulAdd` directive; presumably it
// computes input * matrix + accumulator — confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
    V4 acc = { -1.437e-02, -2.276e-02, 2.275e-02, 6.547e-04 };

    // Contributions of the s0_* inputs.
    acc = MulAdd(s0_0, M4(-4.810e-02, 2.379e-02, -8.471e-02, 1.305e-01, -5.897e-02, 1.263e-01, -9.639e-02, 9.150e-02, 9.002e-03, -1.763e-01, 8.275e-02, -2.357e-01, 7.181e-02, -7.360e-02, 4.629e-02, -8.259e-02), acc);
    acc = MulAdd(s0_1, M4(6.774e-02, 9.108e-02, -3.750e-01, 8.014e-02, 2.890e-01, 9.986e-02, -1.262e-02, -1.285e-01, -2.789e-01, -1.145e-01, -4.982e-02, -1.101e-01, -2.051e-02, -2.271e-01, 1.343e-01, -8.643e-02), acc);
    acc = MulAdd(s0_2, M4(-5.433e-02, 6.899e-02, -3.350e-01, -7.837e-02, -1.076e-01, 1.912e-02, -9.061e-02, 1.919e-01, 9.387e-02, -4.206e-02, 1.861e-01, -4.416e-03, -1.560e-01, -4.364e-02, 4.364e-01, 8.765e-02), acc);
    acc = MulAdd(s0_3, M4(2.382e-01, 3.032e-01, -1.313e-01, -1.154e-01, 1.008e-01, 3.058e-01, -8.513e-02, 2.713e-01, -9.875e-02, 3.017e-01, 3.203e-02, 5.762e-01, -2.056e-03, -7.698e-02, 8.681e-02, 4.245e-02), acc);
    acc = MulAdd(s0_4, M4(2.643e-01, 1.750e-01, 4.850e-02, 3.131e-03, 2.785e-01, 1.598e-01, 5.772e-01, -4.118e-04, -4.270e-01, -2.447e-01, 4.486e-01, 9.155e-02, -3.428e-01, -2.583e-01, -3.721e-02, 6.278e-02), acc);
    acc = MulAdd(s0_5, M4(-1.080e-01, -5.514e-02, -3.648e-01, -2.319e-02, -2.100e-01, -4.065e-02, 1.126e-01, 3.970e-02, 9.824e-02, 1.377e-02, 1.295e-01, -2.512e-02, 1.115e-01, 7.094e-02, 3.413e-01, -5.245e-02), acc);
    acc = MulAdd(s0_6, M4(1.991e-01, 4.710e-02, -9.305e-02, -1.471e-01, -8.221e-02, 1.134e-01, -1.718e-01, -2.606e-01, -8.167e-02, -1.462e-02, -1.094e-01, -1.569e-01, 2.133e-02, 3.374e-02, 4.583e-02, 1.228e-01), acc);
    acc = MulAdd(s0_7, M4(-2.135e-01, 6.874e-02, -4.993e-02, 1.156e-02, -4.261e-01, 1.366e-01, 4.250e-02, -5.707e-02, -1.966e-01, -6.106e-02, 1.265e-01, -3.076e-03, 2.043e-03, -3.072e-02, 1.043e-01, 3.422e-01), acc);
    acc = MulAdd(s0_8, M4(7.235e-02, -3.542e-04, -1.435e-02, -3.815e-02, -8.855e-02, 8.327e-02, 1.954e-01, 1.462e-01, 1.615e-01, -4.957e-02, 1.596e-02, -8.625e-02, 6.574e-02, -9.799e-02, 5.401e-03, 7.595e-02), acc);

    // Contributions of the s1_* inputs.
    acc = MulAdd(s1_0, M4(1.245e-01, -2.812e-03, 1.486e-02, 1.246e-01, -5.943e-02, 1.170e-01, -1.068e-01, 8.960e-02, 5.354e-03, -2.039e-01, 8.228e-02, -2.530e-01, -2.789e-03, -6.932e-02, -3.187e-02, -5.794e-02), acc);
    acc = MulAdd(s1_1, M4(-2.539e-02, 4.598e-02, -1.205e-01, 1.597e-01, 2.391e-01, 1.269e-01, -1.116e-02, 1.498e-02, -2.388e-01, -1.548e-01, -7.389e-02, -1.083e-02, -1.181e-01, -7.069e-02, 9.383e-03, -2.018e-01), acc);
    acc = MulAdd(s1_2, M4(-1.248e-02, 3.267e-02, -2.761e-01, -2.043e-02, -8.520e-02, 3.937e-02, -1.372e-01, 1.821e-02, 6.915e-02, -4.061e-02, 1.782e-01, -4.619e-02, 6.811e-02, -5.458e-04, 3.193e-01, 8.892e-03), acc);
    acc = MulAdd(s1_3, M4(-1.580e-01, 7.536e-02, -6.680e-02, 1.891e-01, 1.196e-01, 3.476e-01, -6.321e-02, 1.972e-01, -9.851e-02, 4.483e-01, 9.326e-03, 5.272e-01, -1.478e-01, -4.009e-02, -3.561e-02, -2.549e-01), acc);
    acc = MulAdd(s1_4, M4(-1.253e-01, 1.345e-01, 4.994e-01, 2.000e-01, 2.728e-01, 1.672e-01, 5.501e-01, -1.736e-02, -5.782e-01, -2.191e-01, 4.380e-01, 4.346e-02, -3.006e-01, -5.220e-02, -1.613e-01, 6.023e-02), acc);
    acc = MulAdd(s1_5, M4(1.276e-01, -8.319e-02, -2.115e-01, 1.471e-01, -1.669e-01, -2.484e-02, 9.906e-02, 1.836e-02, 1.010e-01, 1.847e-02, 1.027e-01, -1.680e-02, -1.880e-01, 1.377e-01, 3.823e-02, -8.256e-02), acc);
    acc = MulAdd(s1_6, M4(-3.200e-01, -7.023e-02, -1.243e-01, -2.003e-02, -7.863e-02, 6.650e-02, -1.264e-01, -1.862e-01, -9.119e-02, -4.374e-02, -1.195e-01, -6.902e-02, -1.360e-01, 3.356e-02, -3.667e-02, -1.815e-01), acc);
    acc = MulAdd(s1_7, M4(1.462e-02, 1.001e-01, 2.453e-01, -1.298e-02, -4.372e-01, 1.509e-01, 8.011e-02, -1.323e-01, -1.980e-01, -4.785e-02, 1.733e-01, 1.100e-02, -2.153e-01, 6.711e-02, 2.595e-03, 1.213e-01), acc);
    acc = MulAdd(s1_8, M4(-3.794e-03, 2.239e-02, -6.960e-02, 7.342e-02, -1.882e-01, 1.159e-01, 1.876e-01, 3.125e-02, 2.242e-01, -5.956e-02, 1.328e-02, -5.400e-02, 2.205e-02, -6.049e-02, -9.151e-02, -1.137e-01), acc);

    return acc;
}
|
||||||
|
|
||||||
|
// Entry point for pass 3 (conv2). Each invocation evaluates l0() at the nine offsets of
// the 3x3 neighbourhood of its texel, splits every tap into a non-negative part (s0_*)
// and a non-positive part (s1_*), and stores f0() of those 18 vectors in t0.
void Pass3(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` are read by name inside the O() macro that l0() expands to; do not rename them.
    float2 pt = float2(GetInputPt());
    const uint2 texel = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Nothing to write for invocations mapped beyond the right or bottom edge.
    if (any(texel >= extent)) {
        return;
    }
    float2 pos = (texel + 0.5) * pt;

    // For each tap: fetch, keep min(v, 0) in s1_*, then clamp s0_* to max(v, 0).
    // Row y = -1.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    // Row y = 0.
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    // Row y = +1.
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    t0[texel] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 4
|
||||||
|
//!DESC conv3
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// Tap fetch for pass 4: reads t0 through O() at offset (x, y) and casts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// conv3 kernel (pass 4). Starts from a constant 4-vector and folds in one
// MulAdd(input, constant 4x4 matrix, accumulator) per input vector: s0_0..s0_8 first,
// then s1_0..s1_8. Returns the accumulated 4-vector unchanged.
// NOTE(review): MulAdd comes from the file's `//!USE MulAdd` directive; presumably it
// computes input * matrix + accumulator — confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
    V4 acc = { 1.575e-02, -2.007e-01, -3.519e-03, -9.082e-03 };

    // Contributions of the s0_* inputs.
    acc = MulAdd(s0_0, M4(3.886e-03, -1.503e-01, -6.378e-01, 4.214e-02, -1.255e-01, 1.146e-01, -1.917e-01, -6.556e-02, -3.368e-02, 6.874e-02, 2.796e-01, -2.936e-02, -3.239e-02, 3.923e-02, -6.439e-02, 1.313e-02), acc);
    acc = MulAdd(s0_1, M4(4.357e-01, -1.067e-01, 3.330e-01, -8.295e-02, -4.004e-01, 3.113e-01, -4.222e-02, 2.290e-01, -1.861e-01, 9.039e-02, -1.132e-01, 1.077e-01, -1.603e-02, 6.296e-02, 4.907e-01, 3.396e-02), acc);
    acc = MulAdd(s0_2, M4(-3.290e-01, -1.073e-01, 1.064e-02, -2.792e-03, -4.366e-01, 3.239e-01, -1.383e-01, 1.918e-01, 3.058e-02, 1.006e-01, -6.898e-02, -1.451e-02, -1.882e-01, 2.248e-01, 1.744e-02, -3.155e-02), acc);
    acc = MulAdd(s0_3, M4(2.403e-02, -1.353e-01, 1.895e-01, -2.285e-01, -1.211e-01, 1.771e-01, 2.135e-01, 1.900e-01, -4.204e-03, 3.719e-02, -4.772e-01, 2.006e-01, -2.532e-03, 5.872e-02, 2.901e-01, -9.450e-02), acc);
    acc = MulAdd(s0_4, M4(8.054e-02, 1.389e-02, -2.060e-02, -3.042e-01, -2.476e-01, 9.905e-02, -9.248e-01, 3.372e-01, -5.254e-01, 4.455e-01, 5.707e-02, 1.057e-01, -3.525e-01, 3.349e-01, -3.414e-01, 7.090e-02), acc);
    acc = MulAdd(s0_5, M4(-1.889e-01, -2.290e-01, -4.930e-02, -1.824e-01, -2.062e+00, 6.868e-02, 2.552e-01, 3.883e-01, 5.778e-02, 9.141e-02, 9.917e-02, -1.164e-01, 4.359e-02, 2.105e-01, -7.911e-02, -1.916e-01), acc);
    acc = MulAdd(s0_6, M4(-2.267e-02, -6.231e-03, -9.718e-03, 3.770e-04, -6.982e-02, 4.184e-02, -2.296e-01, -9.542e-02, 5.236e-02, -5.412e-02, -1.757e-01, -1.054e-01, 1.414e-02, -7.772e-02, -1.338e-02, 3.928e-02), acc);
    acc = MulAdd(s0_7, M4(5.776e-02, 4.703e-02, 3.914e-02, -1.617e-02, -3.606e-01, 3.037e-01, -3.096e-01, 3.562e-02, 3.108e-01, -3.684e-01, 3.725e-02, -2.050e-01, -1.494e-02, 8.741e-02, 5.992e-02, 2.655e-02), acc);
    acc = MulAdd(s0_8, M4(3.614e-02, -1.212e-01, 2.507e-02, -5.858e-02, -1.121e-01, -3.433e-01, 6.613e-02, -6.943e-01, 2.233e-02, -5.467e-02, -6.900e-03, -2.566e-01, -1.106e-01, 2.016e-02, -3.700e-02, -2.886e-01), acc);

    // Contributions of the s1_* inputs.
    acc = MulAdd(s1_0, M4(-5.136e-02, -2.190e-01, -1.035e+00, -5.722e-02, 2.876e-02, 5.070e-02, 3.532e-01, -6.778e-03, 2.930e-04, -6.219e-02, 2.314e-01, -5.210e-02, 1.508e-02, -4.390e-02, -7.749e-02, -9.658e-03), acc);
    acc = MulAdd(s1_1, M4(3.663e-01, -9.746e-02, -6.582e-01, -3.676e-01, -1.694e-01, 7.883e-02, -1.613e-01, 2.328e-02, 2.595e-04, -3.763e-02, -9.946e-02, -6.137e-02, 1.429e-01, -1.964e-01, 2.439e-01, 4.898e-02), acc);
    acc = MulAdd(s1_2, M4(7.884e-02, 1.842e-01, -1.309e-01, 4.895e-02, 4.820e-02, 8.364e-02, 1.189e-02, -1.438e-02, -7.934e-02, 4.775e-02, -6.137e-02, -1.335e-02, -4.416e-02, 3.584e-02, 1.751e-04, -1.178e-02), acc);
    acc = MulAdd(s1_3, M4(-9.861e-03, -1.277e-01, 2.389e-03, -3.232e-01, -2.782e-03, 1.115e-01, -6.485e-02, 2.093e-01, 2.056e-01, 2.527e-02, -1.772e-01, 1.863e-02, 5.983e-02, -8.103e-02, 3.076e-01, -2.027e-01), acc);
    acc = MulAdd(s1_4, M4(1.001e-01, 3.476e-01, -1.305e-01, -1.653e-01, 8.890e-02, -4.170e-01, -1.530e-01, 7.048e-02, -5.605e-01, 1.093e-01, 2.038e-01, -2.320e-01, -1.287e-01, -2.173e-01, -1.630e-01, -9.691e-02), acc);
    acc = MulAdd(s1_5, M4(-2.778e-01, 1.393e-01, -2.802e-02, -5.375e-02, -4.550e-01, -1.661e-01, 2.293e-03, -5.984e-02, -5.070e-02, -8.852e-02, 7.806e-02, 2.187e-02, 1.901e-01, -3.219e-01, -1.937e-01, -2.336e-01), acc);
    acc = MulAdd(s1_6, M4(-8.489e-02, 1.968e-01, -7.760e-02, 1.388e-01, 4.713e-03, 1.527e-01, 8.535e-02, 1.643e-02, 1.429e-01, -1.558e-01, 2.339e-01, 2.762e-01, 1.694e-02, -4.245e-02, -2.793e-02, -3.332e-02), acc);
    acc = MulAdd(s1_7, M4(-4.377e-02, 3.486e-01, -1.766e-01, -1.065e-01, -1.645e-01, -8.722e-04, -1.147e-01, 1.663e-01, 6.801e-02, -3.539e-01, 1.560e-02, -1.819e-01, 1.440e-02, -1.221e-02, 3.693e-02, 5.886e-03), acc);
    acc = MulAdd(s1_8, M4(5.940e-02, 1.624e-01, 1.526e-02, 6.692e-02, 1.812e-01, -8.647e-02, 3.210e-02, -3.751e-04, 2.884e-02, -4.717e-02, 4.121e-03, 5.144e-02, -1.995e-02, -2.827e-01, 6.148e-03, 7.209e-02), acc);

    return acc;
}
|
||||||
|
|
||||||
|
// Entry point for pass 4 (conv3). Each invocation evaluates l0() at the nine offsets of
// the 3x3 neighbourhood of its texel, splits every tap into a non-negative part (s0_*)
// and a non-positive part (s1_*), and stores f0() of those 18 vectors in t1.
void Pass4(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` are read by name inside the O() macro that l0() expands to; do not rename them.
    float2 pt = float2(GetInputPt());
    const uint2 texel = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Nothing to write for invocations mapped beyond the right or bottom edge.
    if (any(texel >= extent)) {
        return;
    }
    float2 pos = (texel + 0.5) * pt;

    // For each tap: fetch, keep min(v, 0) in s1_*, then clamp s0_* to max(v, 0).
    // Row y = -1.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);

    // Row y = 0.
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);

    // Row y = +1.
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    t1[texel] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}
|
||||||
|
|
||||||
|
//!PASS 5
|
||||||
|
//!DESC conv4
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// Tap fetch for pass 5: reads t1 through O() at offset (x, y) and casts the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// conv4 kernel (pass 5). Starts from a constant 4-vector and folds in one
// MulAdd(input, constant 4x4 matrix, accumulator) per input vector: s0_0..s0_8 first,
// then s1_0..s1_8. Returns the accumulated 4-vector unchanged.
// NOTE(review): MulAdd comes from the file's `//!USE MulAdd` directive; presumably it
// computes input * matrix + accumulator — confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
    V4 acc = { 2.513e-04, -2.994e-02, -5.133e-02, -8.977e-03 };

    // Contributions of the s0_* inputs.
    acc = MulAdd(s0_0, M4(-6.479e-02, -9.976e-02, -1.507e-01, -9.934e-02, -1.046e-02, -1.471e-01, -4.218e-02, -8.348e-04, -5.963e-02, 1.519e-03, 5.897e-03, 5.284e-02, -4.467e-01, 4.779e-01, -1.953e-02, 1.951e-01), acc);
    acc = MulAdd(s0_1, M4(-5.276e-02, -1.201e-01, -1.160e-01, 6.076e-02, -4.798e-02, -3.491e-01, -3.055e-01, -1.607e-01, -8.989e-02, 1.221e-01, -1.561e-01, 6.227e-02, -1.598e-01, -6.666e-01, 6.029e-01, -5.466e-01), acc);
    acc = MulAdd(s0_2, M4(-1.331e-01, -4.988e-02, -2.217e-02, 3.405e-02, 2.261e-02, 1.352e-01, 1.124e-02, 8.259e-02, -3.548e-02, 2.454e-01, 4.417e-02, 2.297e-01, 1.780e-01, -2.203e-01, 5.913e-02, -2.201e-01), acc);
    acc = MulAdd(s0_3, M4(1.348e-01, 5.544e-01, -4.335e-01, -3.619e-01, 1.011e-01, 2.665e-01, -2.627e-01, -1.800e-01, -1.158e-01, -8.543e-02, -7.868e-03, 2.056e-01, 1.988e-01, 1.174e+00, -1.291e-01, 1.131e-01), acc);
    acc = MulAdd(s0_4, M4(4.504e-01, 1.025e-01, -1.449e-01, -3.442e-02, -4.525e-01, -1.513e-01, -8.135e-02, -9.669e-02, -3.287e-01, 5.251e-01, -6.540e-01, 7.386e-02, 2.603e-01, -8.246e-01, -1.378e-01, 2.363e+00), acc);
    acc = MulAdd(s0_5, M4(-7.102e-02, -5.554e-02, -3.489e-02, -6.688e-02, 2.877e-01, -6.258e-02, 8.515e-02, -2.109e-01, -2.723e-01, 1.543e-01, 1.285e-01, 9.366e-02, 3.135e-02, -3.700e-01, -4.111e-01, 1.822e+00), acc);
    acc = MulAdd(s0_6, M4(-4.018e-02, -3.412e-01, 5.388e-02, 4.947e-01, -3.234e-02, -6.778e-02, 3.825e-02, 1.313e-01, -6.083e-02, 3.439e-02, -1.081e-01, 6.456e-02, 2.287e-02, -2.470e-01, 2.026e-02, -1.886e-02), acc);
    acc = MulAdd(s0_7, M4(2.410e-01, 1.529e-01, -1.370e-01, -1.389e-01, 1.549e-01, 8.308e-03, 3.064e-02, 3.925e-02, -9.013e-02, 1.131e-01, -9.240e-02, 3.740e-01, -1.009e-01, -6.576e-02, -1.491e-01, -3.452e-02), acc);
    acc = MulAdd(s0_8, M4(-1.628e-01, -2.480e-02, -6.569e-02, 3.873e-02, 1.604e-02, 1.651e-02, -4.681e-02, -1.647e-02, -1.648e-02, 1.541e-01, 2.284e-02, 6.545e-01, 1.799e-03, 1.193e-03, -1.215e-01, 5.919e-02), acc);

    // Contributions of the s1_* inputs.
    acc = MulAdd(s1_0, M4(-1.115e-02, -5.014e-02, -1.499e-01, -7.414e-04, -6.944e-02, -4.168e-02, -1.254e-01, -6.576e-02, 2.946e-04, -2.669e-02, 4.109e-02, 1.949e-02, 1.242e-01, 1.753e-01, 9.717e-02, 1.446e-01), acc);
    acc = MulAdd(s1_1, M4(-1.327e-02, -1.462e-01, -8.510e-02, -1.228e-02, 1.772e-01, 1.009e-01, -4.342e-02, -8.827e-02, -6.663e-02, -1.245e-01, -4.625e-02, -4.285e-02, 7.586e-02, -1.208e-01, 2.705e-01, -1.558e-01), acc);
    acc = MulAdd(s1_2, M4(-7.024e-02, -3.045e-02, -1.916e-02, 4.979e-02, -9.145e-02, 2.285e-01, 4.612e-02, 2.217e-01, 7.690e-02, -4.332e-02, 6.032e-03, -2.370e-02, 3.802e-01, -8.124e-02, 1.982e-02, -8.310e-02), acc);
    acc = MulAdd(s1_3, M4(1.238e-01, 5.787e-01, -5.332e-01, -2.806e-01, 1.208e-01, 6.549e-02, -2.040e-01, -2.578e-02, -5.878e-02, -1.496e-01, 1.213e-01, 1.489e-02, 9.569e-02, 1.964e-01, 6.477e-02, -2.939e-01), acc);
    acc = MulAdd(s1_4, M4(5.825e-01, 2.257e-01, -1.943e-01, 1.101e-01, -3.240e-01, -2.967e-01, -4.203e-02, -3.636e-01, -1.062e-01, -3.799e-02, -4.444e-01, -7.607e-02, -3.056e-01, -2.926e-01, -4.582e-02, 2.795e-01), acc);
    acc = MulAdd(s1_5, M4(-9.076e-02, -5.130e-02, -3.718e-02, -6.163e-02, 1.831e-01, -1.199e-01, 9.176e-02, -2.456e-01, 2.362e-01, -1.854e-01, -1.394e-01, 3.560e-03, 2.070e-02, -6.903e-02, -5.061e-02, 3.068e-02), acc);
    acc = MulAdd(s1_6, M4(-4.988e-02, -3.880e-01, 3.001e-02, 3.892e-01, -2.827e-02, -2.880e-02, 4.071e-02, 2.861e-01, -4.016e-02, -1.085e-01, 9.207e-03, -7.367e-02, 9.072e-03, 8.960e-02, 5.334e-03, -6.480e-02), acc);
    acc = MulAdd(s1_7, M4(2.900e-01, 1.450e-01, -1.401e-01, -2.809e-01, 1.218e-01, -3.153e-03, -2.544e-02, 1.898e-01, -7.197e-02, -3.721e-01, 4.042e-02, 9.918e-02, -1.132e-01, 3.578e-02, 4.000e-02, 6.991e-02), acc);
    acc = MulAdd(s1_8, M4(-1.493e-01, -2.310e-02, -6.133e-02, 5.322e-02, -4.879e-02, -5.139e-02, -8.058e-02, 4.140e-02, 2.511e-01, 3.669e-02, -1.003e-01, -1.457e-01, 1.528e-01, 1.177e-01, 6.665e-02, -3.084e-02), acc);

    return acc;
}
|
||||||
|
|
||||||
|
// Pass 5 entry point: for one pixel, reads nine l0 values at offsets -1..1 in
// x and y, splits each into its positive and negative parts, and stores
// f0(...) of those 18 vectors in t0. The l0 macro used by this pass is
// defined earlier in the file.
//   blockStart - added to the per-thread offset to form this thread's pixel coordinate
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass5(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt and pos are not referenced again by name in this function; presumably the
// macro behind l0 uses them — TODO confirm before renaming or removing either.
float2 pt = float2(GetInputPt());
|
||||||
|
// Pixel coordinate handled by this thread. Rmp8x8 is defined elsewhere;
// presumably it maps the linear thread index onto an 8x8 tile — TODO confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Skip threads whose coordinate lies outside the size reported by GetInputSize().
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Store the result for this pixel.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 6
|
||||||
|
//!DESC conv5
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y) expands to a V4 built from O(t0, float2(x, y)). O and V4 are defined
// outside this chunk; presumably O reads t0 at an offset of (x, y) from the
// current position — TODO confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for pass 6 (//!DESC conv5): starts from a constant 4-vector and folds in
// the 18 input vectors one at a time, each via MulAdd with its own 16-entry M4
// constant, then returns the accumulated vector.
// As called from Pass6, s0_k holds max(v_k, 0) and s1_k holds -max(-v_k, 0)
// for the same nine l0 samples v_0..v_8.
// MulAdd, V4 and M4 are defined outside this chunk; MulAdd(s, M, r) presumably
// returns r plus the product of s and M — TODO confirm.
// NOTE(review): the numeric constants look machine-generated (likely trained
// weights); keep them and the accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
// Initial accumulator value (presumably the bias term).
V4 r = { 4.102e-03, 1.192e-03, -2.598e-03, -2.812e-03 };
|
||||||
|
// Contributions of the nine positive-part inputs.
r = MulAdd(s0_0, M4(4.575e-01, 2.412e-01, 1.926e-01, 5.873e-02, 2.954e-02, -1.424e-01, 7.881e-03, 2.358e-04, -5.872e-02, -1.007e-01, -3.632e-02, 5.718e-02, 1.389e-01, -4.163e-02, -1.379e-01, 2.160e-03), r);
|
||||||
|
r = MulAdd(s0_1, M4(1.347e-01, -8.074e-01, -1.155e-01, 2.242e-01, -2.673e-01, 4.053e-01, 8.867e-02, -2.840e-02, 9.443e-02, 2.632e-01, 9.207e-02, -1.793e-02, 1.519e-01, 3.302e-03, 2.027e-01, 2.643e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.462e-02, -7.543e-02, -6.080e-02, 7.431e-02, -3.673e-02, -1.665e-01, -2.745e-01, -4.416e-02, -3.270e-01, 7.677e-01, 7.241e-01, -1.157e-01, -8.204e-03, 2.172e-02, 3.183e-01, 3.931e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(1.168e+00, -8.427e-01, -3.237e-03, 5.416e-02, 1.694e-02, -1.042e-01, -2.173e-01, -1.089e-01, -9.881e-02, -1.109e-01, -1.003e-01, -5.080e-02, -9.279e-02, -1.111e-01, -2.699e-02, -2.297e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(-4.884e-01, -4.472e-01, -9.701e-02, 8.789e-01, 1.962e-02, 5.041e-01, 3.221e-01, -4.622e-02, 9.039e-02, -2.531e-01, 6.228e-01, 1.590e-02, 1.804e-02, 7.795e-02, -8.005e-02, -6.310e-03), r);
|
||||||
|
r = MulAdd(s0_5, M4(-6.567e-02, -5.161e-02, 5.550e-02, 5.285e-02, -6.147e-02, -1.840e-01, 2.028e-01, 4.014e-01, 4.070e-01, -1.022e-01, 1.414e+00, -3.126e-01, 7.508e-03, 1.013e-01, -7.300e-02, -4.282e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(1.721e+00, 1.776e-01, -8.690e-02, -1.102e-01, -8.467e-02, -2.165e-02, 6.238e-02, 2.052e-02, 2.763e-01, -3.472e-02, -1.179e-01, 2.993e-02, -6.860e-02, 1.887e-02, 3.140e-02, -6.853e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(1.937e-01, 1.975e-01, -2.456e-01, -1.360e+00, 1.792e-01, -5.969e-02, -7.670e-02, 2.606e-01, 1.355e-01, -9.109e-03, 2.756e-01, 6.674e-02, 1.312e-02, -1.542e-02, 2.236e-02, 1.997e-01), r);
|
||||||
|
r = MulAdd(s0_8, M4(4.255e-02, -1.452e-02, -8.732e-02, -1.084e-01, 1.495e-02, 1.302e-02, -9.151e-02, -2.814e-01, 5.197e-02, 2.866e-02, 5.490e-01, 4.310e-01, 3.666e-02, -3.380e-03, -2.830e-02, -8.223e-02), r);
|
||||||
|
// Contributions of the nine negative-part inputs.
r = MulAdd(s1_0, M4(2.549e-02, 7.469e-02, -5.290e-02, -4.972e-02, -2.340e-01, -1.875e-01, 1.656e-01, 5.697e-02, -8.570e-02, -1.520e-01, -2.622e-02, 1.043e-02, -2.377e-01, -3.927e-02, 1.539e-01, 4.528e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.188e-02, -9.781e-02, 1.606e-01, 5.138e-02, -4.165e-01, 8.262e-01, 1.709e-01, -1.063e-01, 8.393e-03, 7.300e-02, -9.347e-02, -6.226e-02, -3.633e-01, -4.453e-01, 2.190e-01, 2.415e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(-4.011e-02, 3.404e-02, 1.013e-01, 3.551e-02, 9.692e-02, -2.109e-01, 1.897e-01, -2.192e-01, -1.703e-01, 5.317e-01, 1.354e-01, -2.027e-01, -3.658e-01, -1.845e-01, -5.465e-01, 1.436e-01), r);
|
||||||
|
r = MulAdd(s1_3, M4(7.674e-01, 1.677e-01, -7.875e-02, 7.537e-03, -4.911e-01, -1.083e-01, 7.183e-03, -1.107e-01, -2.514e-02, -1.257e-01, -5.070e-02, -3.886e-02, 1.368e-01, -1.991e-02, -1.698e-01, -7.850e-03), r);
|
||||||
|
r = MulAdd(s1_4, M4(-5.096e-02, 7.912e-02, -2.105e-01, 1.149e-01, 9.798e-02, 2.243e-01, -3.434e-01, 3.492e-01, -1.265e-01, -1.839e-01, -1.337e-01, -6.909e-02, -8.552e-01, 1.334e-01, 8.652e-01, -3.408e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-2.933e-02, 1.424e-01, 6.542e-02, -1.710e-01, -1.459e-01, -3.069e-02, -1.275e-01, -9.443e-02, 2.657e-01, -4.784e-04, -6.729e-03, -1.910e-01, -4.628e-01, 3.808e-02, -1.470e-01, 1.480e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(1.512e-01, -1.755e-02, -5.440e-02, 1.317e-02, -7.181e-02, -6.842e-03, -7.375e-02, -8.356e-02, 7.332e-02, -9.437e-02, -1.008e-01, -4.731e-02, -9.102e-02, -8.192e-03, 7.862e-04, 6.417e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(2.457e-01, -1.058e-01, -2.777e-02, -1.532e-03, 7.609e-02, 3.452e-02, 1.774e-01, 3.296e-01, 6.779e-02, -6.683e-02, 1.485e-01, 7.321e-02, -3.082e-02, -4.348e-02, 3.558e-03, 9.111e-03), r);
|
||||||
|
r = MulAdd(s1_8, M4(1.104e-01, 5.040e-03, 9.642e-03, -8.991e-02, -2.134e-01, 3.758e-02, -1.244e-01, -1.987e-01, -7.007e-02, 6.792e-03, 1.369e-01, 5.332e-01, -5.354e-02, -2.024e-02, -1.038e-01, -4.812e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 6 entry point (//!DESC conv5, IN t0, OUT t1): for one pixel, reads nine
// l0 values at offsets -1..1 in x and y, splits each into its positive and
// negative parts, and stores f0(...) of those 18 vectors in t1.
//   blockStart - added to the per-thread offset to form this thread's pixel coordinate
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass6(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt and pos are not referenced again by name in this function; presumably the
// O macro behind l0 uses them — TODO confirm before renaming or removing either.
float2 pt = float2(GetInputPt());
|
||||||
|
// Pixel coordinate handled by this thread. Rmp8x8 is defined elsewhere;
// presumably it maps the linear thread index onto an 8x8 tile — TODO confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Skip threads whose coordinate lies outside the size reported by GetInputSize().
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Store the result for this pixel.
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 7
|
||||||
|
//!DESC conv6
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y) expands to a V4 built from O(t1, float2(x, y)). O and V4 are defined
// outside this chunk; presumably O reads t1 at an offset of (x, y) from the
// current position — TODO confirm.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for pass 7 (//!DESC conv6): starts from a constant 4-vector and folds in
// the 18 input vectors one at a time, each via MulAdd with its own 16-entry M4
// constant, then returns the accumulated vector.
// As called from Pass7, s0_k holds max(v_k, 0) and s1_k holds -max(-v_k, 0)
// for the same nine l0 samples v_0..v_8.
// MulAdd, V4 and M4 are defined outside this chunk; MulAdd(s, M, r) presumably
// returns r plus the product of s and M — TODO confirm.
// NOTE(review): the numeric constants look machine-generated (likely trained
// weights); keep them and the accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
// Initial accumulator value (presumably the bias term).
V4 r = { 1.448e-03, -2.432e-03, -8.004e-04, 5.896e-05 };
|
||||||
|
// Contributions of the nine positive-part inputs.
r = MulAdd(s0_0, M4(6.200e-02, 5.385e-02, -5.478e-02, 3.955e-02, -1.722e-02, -1.194e-01, 8.331e-02, -9.296e-02, -2.161e-02, 8.716e-02, -5.918e-02, 1.032e-01, 4.954e-02, -3.822e-02, 8.472e-02, -2.191e-01), r);
|
||||||
|
r = MulAdd(s0_1, M4(2.503e-01, 5.635e-02, 7.355e-03, -2.025e-01, 7.104e-02, -1.324e-01, -3.051e-02, 2.246e-02, -4.480e-02, 6.693e-03, 4.467e-02, 3.388e-02, 4.262e-01, 1.488e-01, -8.809e-01, 5.350e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-7.511e-03, 1.921e-01, -3.653e-01, 2.096e-02, 2.413e-02, 4.846e-02, -1.538e-01, 3.359e-02, 5.958e-03, -1.033e-02, 2.389e-02, 1.283e-02, -5.270e-02, 2.842e-01, 5.681e-02, -3.578e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.198e-02, -1.674e-02, 3.330e-02, 3.249e-02, -4.430e-02, 9.217e-02, -3.348e-02, -3.546e-01, 1.228e-01, 3.875e-02, 7.220e-03, 6.719e-02, -8.768e-01, -1.165e-02, -3.862e-02, -2.045e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(-6.935e-01, -4.898e-01, 2.252e-01, -1.647e-01, -6.408e-02, 4.562e-01, -6.617e-01, 1.220e-01, 1.053e-02, -9.937e-02, -1.118e-02, 3.272e-01, -9.081e-02, 2.353e-02, 4.776e-01, -1.238e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(2.481e-01, -3.296e-01, -3.372e-02, -2.008e-02, 5.924e-03, 1.762e-02, 3.642e-01, -1.182e-01, -2.219e-02, -4.332e-02, -9.762e-02, 3.537e-02, 2.114e-02, -5.440e-02, 3.124e-01, 5.069e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-5.465e-02, -5.352e-03, -3.419e-03, -6.733e-02, -8.079e-02, -6.569e-02, -1.494e-02, -3.462e-01, -8.125e-03, 2.572e-03, -3.894e-02, -3.246e-02, -1.566e-02, -3.004e-02, 1.145e-01, 6.794e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(4.788e-02, 7.675e-03, -7.030e-02, -2.384e-02, -3.070e-01, -7.080e-01, -2.017e-01, 9.579e-02, 1.259e-01, -1.004e-02, -1.287e-01, 3.334e-02, -9.642e-02, -8.073e-02, 2.546e-02, 5.204e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-6.015e-02, 1.650e-01, -5.471e-02, -1.454e-01, -2.785e-02, -1.831e-01, 1.123e-01, 3.453e-02, -1.179e-02, 1.722e-02, -1.068e-02, -2.608e-02, 1.514e-04, -1.287e-02, -7.741e-03, -9.765e-03), r);
|
||||||
|
// Contributions of the nine negative-part inputs.
r = MulAdd(s1_0, M4(-4.922e-02, -5.675e-03, -2.161e-02, 3.164e-02, -2.003e-02, -3.890e-02, 5.198e-02, -1.811e-03, -3.385e-02, -1.510e-02, -2.289e-02, 1.009e-01, 4.427e-02, -1.763e-01, 1.255e-01, -5.073e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(1.057e-01, -8.124e-02, 1.131e-01, -1.361e-01, 4.740e-02, -6.425e-02, 8.930e-03, 5.318e-02, 5.266e-02, -6.003e-02, 1.320e-01, 4.163e-02, 1.277e-01, -2.404e-01, -1.696e-01, 2.204e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(2.723e-02, 1.918e-01, -2.822e-01, -1.877e-02, -4.599e-03, 7.591e-02, -1.128e-01, -6.519e-03, 2.311e-02, -1.684e-01, 2.293e-01, -1.042e-01, -1.882e-02, 4.970e-02, -1.309e-01, -8.894e-03), r);
|
||||||
|
r = MulAdd(s1_3, M4(4.883e-02, 2.819e-02, 4.318e-02, 3.186e-02, 7.782e-02, 1.741e-01, -8.927e-02, 4.005e-02, 5.888e-02, -1.057e-01, 9.692e-02, 8.032e-02, -1.086e-01, 6.323e-02, -8.520e-02, -1.273e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-1.746e-01, -2.834e-02, -3.694e-02, 3.226e-01, -2.541e-01, 6.860e-01, -1.436e-01, 1.705e-01, 2.614e-01, -6.751e-02, 5.646e-02, 3.666e-01, -2.621e-02, 4.951e-01, -1.090e-01, -3.168e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.513e-01, 5.210e-02, 2.625e-01, -6.303e-02, -2.252e-02, -9.485e-02, 4.776e-01, -1.789e-01, -1.291e-01, -9.714e-02, -1.427e-01, -1.165e-01, 2.415e-02, 9.790e-02, 6.024e-02, -9.622e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(3.751e-02, -2.907e-02, -1.762e-02, -9.545e-02, 2.866e-01, -7.329e-02, -9.787e-03, 4.513e-03, -9.486e-02, -2.446e-02, -2.357e-02, -5.002e-02, 4.973e-02, 6.256e-02, -2.532e-02, -1.817e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-6.855e-02, -6.762e-02, -6.269e-02, -6.947e-02, -1.389e-01, -1.915e-01, -4.806e-02, 1.870e-01, 1.298e-01, 6.268e-03, -5.985e-02, -5.396e-02, -3.048e-02, -5.396e-03, -9.720e-02, 3.289e-03), r);
|
||||||
|
r = MulAdd(s1_8, M4(-2.052e-02, -8.106e-02, -1.721e-02, 9.911e-03, -8.521e-02, 4.832e-02, -1.708e-01, -6.445e-02, -9.788e-02, 8.836e-02, -1.204e-01, -1.123e-01, 1.514e-02, 1.628e-02, -5.003e-02, -6.128e-03), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 7 entry point (//!DESC conv6, IN t1, OUT t0): for one pixel, reads nine
// l0 values at offsets -1..1 in x and y, splits each into its positive and
// negative parts, and stores f0(...) of those 18 vectors in t0.
//   blockStart - added to the per-thread offset to form this thread's pixel coordinate
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass7(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt and pos are not referenced again by name in this function; presumably the
// O macro behind l0 uses them — TODO confirm before renaming or removing either.
float2 pt = float2(GetInputPt());
|
||||||
|
// Pixel coordinate handled by this thread. Rmp8x8 is defined elsewhere;
// presumably it maps the linear thread index onto an 8x8 tile — TODO confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Skip threads whose coordinate lies outside the size reported by GetInputSize().
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Store the result for this pixel.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 8
|
||||||
|
//!DESC conv7
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t0
|
||||||
|
//!OUT t1
|
||||||
|
|
||||||
|
// l0(x, y) expands to a V4 built from O(t0, float2(x, y)). O and V4 are defined
// outside this chunk; presumably O reads t0 at an offset of (x, y) from the
// current position — TODO confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for pass 8 (//!DESC conv7): starts from a constant 4-vector and folds in
// the 18 input vectors one at a time, each via MulAdd with its own 16-entry M4
// constant, then returns the accumulated vector.
// As called from Pass8, s0_k holds max(v_k, 0) and s1_k holds -max(-v_k, 0)
// for the same nine l0 samples v_0..v_8.
// MulAdd, V4 and M4 are defined outside this chunk; MulAdd(s, M, r) presumably
// returns r plus the product of s and M — TODO confirm.
// NOTE(review): the numeric constants look machine-generated (likely trained
// weights); keep them and the accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
// Initial accumulator value (presumably the bias term).
V4 r = { 2.671e-03, -5.536e-03, -4.013e-03, 4.378e-03 };
|
||||||
|
// Contributions of the nine positive-part inputs.
r = MulAdd(s0_0, M4(5.901e-02, -1.033e-01, -1.441e-01, 4.291e-02, 2.355e-02, -1.199e-01, -1.741e-01, -5.263e-03, -6.030e-03, -4.043e-02, 1.910e-01, 8.326e-03, 2.913e-02, 1.969e-02, -1.380e-01, 9.492e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-1.616e-01, 1.649e-01, -1.133e-02, -1.037e-01, -1.060e-02, 2.299e-01, -5.302e-02, -2.329e-01, -8.540e-02, 2.232e-01, 2.647e-01, 3.922e-01, 5.387e-02, 5.841e-01, -1.264e-01, -1.440e-01), r);
|
||||||
|
r = MulAdd(s0_2, M4(-1.944e-02, -7.262e-02, 9.583e-02, 3.448e-02, 4.402e-02, 5.319e-02, -2.384e-02, 4.652e-02, 6.280e-02, -4.195e-02, 1.573e-02, 7.059e-02, 1.029e-01, -1.784e-02, -3.735e-02, -4.952e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(7.393e-02, -1.825e-01, -2.983e-01, -5.798e-02, -2.475e-01, -4.958e-02, 6.660e-01, -2.202e-01, -9.158e-02, 4.280e-04, 2.472e-01, -2.979e-01, -9.887e-02, 6.188e-02, 2.163e-01, -9.358e-03), r);
|
||||||
|
r = MulAdd(s0_4, M4(-8.664e-01, 2.357e-01, 3.390e-01, -5.275e-01, -2.213e-01, -4.992e-01, 5.479e-01, 4.245e-01, -7.542e-02, 4.854e-01, -3.525e-01, 3.950e-01, 3.619e-01, -3.968e-01, -3.447e-01, 5.089e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-9.239e-02, -6.370e-01, -7.252e-02, -3.435e-01, -1.057e-01, 1.616e-01, -4.413e-02, 1.824e-01, 2.001e-02, -1.343e-01, -5.730e-02, 7.302e-02, -2.361e-02, -9.044e-02, -1.041e-01, 2.971e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-2.803e-02, -8.707e-02, -1.407e-01, -2.685e-02, 1.099e-01, 1.721e-01, 1.612e-01, 6.962e-02, -1.659e-02, 7.845e-02, 2.165e-01, -7.067e-02, 1.666e-02, 7.051e-02, 6.373e-02, 4.391e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-1.560e-01, -2.698e-02, -5.684e-01, -1.184e-01, 7.742e-01, -1.023e-03, -8.177e-02, 2.857e-01, 2.253e-02, -1.400e-02, -6.523e-02, 7.644e-02, 1.789e-01, -8.433e-03, 1.041e-01, 7.009e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.491e-01, -2.037e-01, -2.499e-01, -7.730e-02, 1.051e-01, -1.718e-02, -1.762e-01, 4.808e-02, -3.068e-03, 1.737e-02, -3.772e-04, 4.732e-02, 7.205e-02, 7.901e-02, -1.759e-02, 8.476e-02), r);
|
||||||
|
// Contributions of the nine negative-part inputs.
r = MulAdd(s1_0, M4(4.810e-02, -1.822e-02, -1.150e-01, -1.679e-02, -5.481e-02, -7.544e-02, 2.213e-01, 2.615e-02, -2.628e-03, -1.482e-01, -5.570e-02, 5.137e-02, -1.381e-02, -1.878e-03, -3.132e-02, -3.309e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(1.101e-01, 1.003e-01, -4.307e-01, -2.520e-02, 1.138e-02, -1.966e-01, 6.664e-02, 1.114e-01, -1.431e-01, 3.634e-01, 4.274e-02, -8.279e-02, -5.291e-02, 3.540e-01, 8.995e-02, -1.401e-01), r);
|
||||||
|
r = MulAdd(s1_2, M4(7.230e-02, 4.684e-01, -6.542e-02, -2.792e-01, 2.936e-02, 3.476e-03, -1.024e-02, 1.880e-01, 1.898e-02, 2.529e-02, 8.537e-03, -6.073e-03, 1.025e-01, -2.320e-01, -1.804e-02, 5.471e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(-9.258e-03, -7.731e-03, 4.285e-02, -4.725e-02, -3.878e-02, -1.749e-02, -1.681e-02, -1.020e-01, -3.975e-02, 1.609e-02, 8.299e-02, -1.824e-01, -2.500e-02, 3.516e-02, 8.591e-02, 1.714e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-2.210e-01, 1.534e-01, 3.410e-01, -2.552e-01, -5.090e-02, 1.582e-02, 1.802e-01, -1.333e-01, -5.371e-01, 3.751e-01, -1.323e-01, 3.018e-01, 1.756e-01, -9.756e-02, -4.873e-01, 4.985e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-1.073e-02, 2.919e-01, -2.025e-01, 3.240e-01, 4.318e-02, -1.972e-02, -1.612e-01, 3.528e-01, -6.472e-02, -6.212e-02, 3.146e-02, 6.391e-02, 4.950e-02, -6.270e-01, -1.985e-02, 4.680e-02), r);
|
||||||
|
r = MulAdd(s1_6, M4(-2.215e-02, 1.836e-02, 5.021e-02, -3.016e-02, -7.854e-03, 1.135e-02, 3.407e-02, -2.923e-02, -5.384e-03, 6.570e-02, 2.437e-01, -8.712e-02, 2.275e-02, -2.291e-03, -7.378e-02, 5.231e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-4.186e-02, 6.944e-02, 8.353e-02, -1.927e-02, 3.937e-02, 2.105e-02, 7.152e-02, 5.635e-03, 1.114e-01, -3.772e-02, -1.853e-01, 6.636e-02, 4.654e-02, -1.008e-01, -1.625e-01, 7.888e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(5.288e-02, -5.516e-02, -4.014e-02, 8.854e-02, 2.434e-02, 9.192e-02, -1.203e-02, 6.813e-02, 4.626e-02, -4.892e-02, 4.700e-03, 7.578e-02, -5.040e-02, 3.497e-02, 3.176e-02, -9.741e-02), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 8 entry point (//!DESC conv7, IN t0, OUT t1): for one pixel, reads nine
// l0 values at offsets -1..1 in x and y, splits each into its positive and
// negative parts, and stores f0(...) of those 18 vectors in t1.
//   blockStart - added to the per-thread offset to form this thread's pixel coordinate
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass8(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt and pos are not referenced again by name in this function; presumably the
// O macro behind l0 uses them — TODO confirm before renaming or removing either.
float2 pt = float2(GetInputPt());
|
||||||
|
// Pixel coordinate handled by this thread. Rmp8x8 is defined elsewhere;
// presumably it maps the linear thread index onto an 8x8 tile — TODO confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Skip threads whose coordinate lies outside the size reported by GetInputSize().
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Store the result for this pixel.
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 9
|
||||||
|
//!DESC conv8
|
||||||
|
//!BLOCK_SIZE 8
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN t1
|
||||||
|
//!OUT t0
|
||||||
|
|
||||||
|
// l0(x, y) expands to a V4 built from O(t1, float2(x, y)). O and V4 are defined
// outside this chunk; presumably O reads t1 at an offset of (x, y) from the
// current position — TODO confirm.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for pass 9 (//!DESC conv8): starts from a constant 4-vector and folds in
// the 18 input vectors one at a time, each via MulAdd with its own 16-entry M4
// constant, then returns the accumulated vector.
// As called from Pass9, s0_k holds max(v_k, 0) and s1_k holds -max(-v_k, 0)
// for the same nine l0 samples v_0..v_8.
// MulAdd, V4 and M4 are defined outside this chunk; MulAdd(s, M, r) presumably
// returns r plus the product of s and M — TODO confirm.
// NOTE(review): the numeric constants look machine-generated (likely trained
// weights); keep them and the accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
// Initial accumulator value (presumably the bias term).
V4 r = { -5.006e-05, -2.252e-04, -1.752e-03, 4.586e-04 };
|
||||||
|
// Contributions of the nine positive-part inputs.
r = MulAdd(s0_0, M4(8.283e-02, 5.262e-02, 1.580e-02, 4.991e-02, 6.836e-02, -3.234e-02, 5.630e-02, 1.275e-01, 5.398e-03, 9.866e-04, -1.054e-02, 1.601e-02, 1.546e-02, -7.786e-02, -2.630e-02, -3.023e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(9.285e-02, 3.403e-01, -4.572e-02, 1.431e-01, 2.876e-01, -3.271e-01, -8.133e-04, 5.998e-01, 4.515e-02, 9.836e-02, 2.315e-02, 1.724e-01, -8.080e-02, -1.978e-01, -5.366e-02, -4.535e-02), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.708e-02, -8.374e-02, -1.831e-02, 1.744e-02, 4.902e-02, -1.037e-02, -3.508e-02, 3.501e-02, 1.160e-01, 2.529e-01, 4.235e-02, 4.233e-02, -5.953e-03, -1.398e-01, -8.815e-03, 1.053e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(-2.836e-03, -2.496e-01, 2.703e-02, 9.490e-02, 3.985e-01, -9.458e-02, 1.355e-01, 5.917e-01, 5.597e-03, -8.963e-02, 5.238e-02, 4.360e-02, -1.070e-01, 7.593e-02, 6.376e-02, -1.498e-01), r);
|
||||||
|
r = MulAdd(s0_4, M4(3.214e-01, -8.045e-01, 6.621e-01, -1.261e-01, -1.487e+00, 1.086e+00, 3.779e-01, -1.762e+00, 2.721e-01, -3.815e-02, -1.450e-01, 4.063e-01, 2.804e-01, 3.876e-01, 2.607e-01, 2.174e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-3.896e-01, 3.340e-01, -2.529e-01, -6.519e-02, -1.815e-01, 5.542e-02, -1.669e-01, 1.732e-02, 2.995e-01, 4.942e-02, 6.557e-02, -1.386e-01, -1.392e-01, 2.822e-01, 2.016e-02, -1.313e-01), r);
|
||||||
|
r = MulAdd(s0_6, M4(-2.130e-02, 4.137e-02, 7.324e-02, 4.834e-03, 9.333e-02, -2.998e-01, 4.229e-01, 9.535e-02, -2.595e-02, 2.955e-02, 7.491e-02, -3.028e-02, -2.850e-02, 1.582e-02, -1.076e-01, -3.159e-02), r);
|
||||||
|
r = MulAdd(s0_7, M4(-3.601e-02, 5.993e-02, -1.190e-02, -6.800e-02, 6.894e-03, -2.095e-01, -9.548e-02, -2.539e-02, -2.390e-02, 2.947e-02, 1.581e-01, -5.305e-03, 1.029e-01, -1.456e-01, -3.526e-02, 9.251e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-7.206e-02, 9.690e-02, -4.464e-02, -6.999e-03, 3.140e-02, -4.201e-02, -6.364e-03, 5.280e-03, -1.412e-01, 1.696e-01, -1.274e-01, -9.546e-02, 5.285e-02, -1.072e-01, 5.994e-02, 1.293e-02), r);
|
||||||
|
// Contributions of the nine negative-part inputs.
r = MulAdd(s1_0, M4(-1.808e-02, 1.243e-01, -6.814e-02, -4.219e-03, 1.273e-02, 2.752e-02, 3.764e-02, 3.650e-02, 7.663e-04, 6.843e-03, 1.380e-02, -3.235e-02, 5.400e-02, -5.352e-02, 1.190e-02, -1.028e-01), r);
|
||||||
|
r = MulAdd(s1_1, M4(2.568e-01, 2.764e-01, 7.740e-02, 1.273e-01, 7.059e-02, 6.668e-02, 4.211e-02, 6.293e-02, -4.164e-02, 2.210e-01, -1.293e-02, 8.369e-02, 2.046e-01, 1.238e-01, 9.491e-02, 4.614e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(-2.387e-02, 3.174e-01, 8.165e-02, -6.680e-02, -1.516e-02, 1.482e-02, -1.342e-02, 1.692e-02, -2.288e-02, -6.891e-02, -5.559e-02, 4.771e-02, 3.290e-02, 1.234e-01, 4.334e-02, -5.106e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(6.216e-02, -2.114e-01, -1.616e-01, 1.664e-01, 3.796e-02, 6.036e-02, -1.106e-01, 1.398e-01, -3.139e-02, -6.274e-02, 4.988e-02, -6.274e-02, 2.296e-02, -5.131e-02, 5.052e-02, -8.866e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(2.647e-01, -7.858e-01, 1.597e-01, -8.262e-01, -3.213e-01, 2.427e-01, 1.686e-01, -4.251e-01, 1.505e-01, 3.244e-02, 1.023e-01, 1.962e-01, -1.116e-01, 3.525e-01, 8.848e-01, -1.945e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(-2.549e-01, -1.429e-01, -3.696e-02, 3.042e-01, -1.256e-01, 2.760e-02, -3.650e-02, 7.985e-02, -1.958e-01, 3.076e-01, -9.253e-02, -8.512e-02, -1.708e-01, -3.422e-04, -8.181e-02, 2.319e-01), r);
|
||||||
|
r = MulAdd(s1_6, M4(-3.382e-02, 6.627e-02, 1.158e-01, -3.044e-02, -7.983e-03, -7.855e-02, 1.729e-02, 3.219e-04, -1.764e-02, 4.065e-02, -1.400e-02, -2.387e-02, 2.673e-03, 5.460e-03, -4.992e-02, -1.573e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(-2.505e-02, 1.763e-01, -4.433e-01, -1.024e-01, 1.391e-01, -2.435e-01, -5.358e-02, 5.203e-02, 3.157e-02, 2.012e-02, 7.424e-03, 3.723e-02, -2.388e-02, 7.204e-02, -4.522e-01, -1.187e-02), r);
|
||||||
|
r = MulAdd(s1_8, M4(9.737e-02, 7.067e-02, 4.072e-02, 4.303e-02, 2.890e-02, -1.810e-02, 5.156e-03, -1.953e-02, -3.503e-02, 7.492e-02, 1.402e-02, -9.796e-03, 2.320e-01, -2.135e-01, 1.462e-01, 1.194e-01), r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 9 entry point (//!DESC conv8, IN t1, OUT t0): for one pixel, reads nine
// l0 values at offsets -1..1 in x and y, splits each into its positive and
// negative parts, and stores f0(...) of those 18 vectors in t0.
//   blockStart - added to the per-thread offset to form this thread's pixel coordinate
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass9(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt and pos are not referenced again by name in this function; presumably the
// O macro behind l0 uses them — TODO confirm before renaming or removing either.
float2 pt = float2(GetInputPt());
|
||||||
|
// Pixel coordinate handled by this thread. Rmp8x8 is defined elsewhere;
// presumably it maps the linear thread index onto an 8x8 tile — TODO confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||||
|
uint2 size = GetInputSize();
|
||||||
|
// Skip threads whose coordinate lies outside the size reported by GetInputSize().
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
float2 pos = (gxy + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// Store the result for this pixel.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!PASS 10
|
||||||
|
//!DESC out-shuffle
|
||||||
|
//!BLOCK_SIZE 16
|
||||||
|
//!NUM_THREADS 64
|
||||||
|
//!IN INPUT, t0
|
||||||
|
//!OUT OUTPUT
|
||||||
|
|
||||||
|
// l0(x, y) expands to a V4 built from O(t0, float2(x, y)). O and V4 are defined
// outside this chunk; presumably O reads t0 at an offset of (x, y) from the
// current position — TODO confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||||
|
|
||||||
|
// f0 for pass 10 (//!DESC out-shuffle): starts from a constant 4-vector, folds
// in the 18 input vectors one at a time, each via MulAdd with its own 16-entry
// M4 constant, and returns tanh of the accumulated vector.
// As called from Pass10, s0_k holds max(v_k, 0) and s1_k holds -max(-v_k, 0)
// for the same nine l0 samples v_0..v_8; Pass10 uses the four returned
// components for the four pixels of a 2x2 output block.
// MulAdd, V4 and M4 are defined outside this chunk; MulAdd(s, M, r) presumably
// returns r plus the product of s and M — TODO confirm.
// NOTE(review): the numeric constants look machine-generated (likely trained
// weights); keep them and the accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||||
|
// Initial accumulator value (presumably the bias term).
V4 r = { -1.731e-03, -2.098e-03, -1.131e-03, -1.644e-03 };
|
||||||
|
// Contributions of the nine positive-part inputs.
r = MulAdd(s0_0, M4(2.670e-02, -1.964e-03, 2.191e-02, 3.109e-02, 1.911e-02, -2.017e-02, -2.948e-02, -2.237e-02, -3.845e-02, -7.954e-03, -3.472e-02, -2.253e-02, -1.571e-02, -6.613e-03, -1.489e-02, -2.647e-02), r);
|
||||||
|
r = MulAdd(s0_1, M4(-6.714e-02, -2.106e-02, 7.577e-03, 1.788e-02, 8.081e-02, 8.813e-02, -5.510e-02, -2.724e-02, 1.150e-01, 5.284e-02, -8.964e-02, -3.024e-02, 5.215e-02, 5.334e-02, -1.180e-02, 6.927e-03), r);
|
||||||
|
r = MulAdd(s0_2, M4(1.036e-02, 1.826e-02, -8.095e-03, -9.967e-03, 1.368e-03, 3.479e-02, -1.887e-03, -2.161e-02, -3.464e-02, -1.124e-01, -4.623e-03, -5.295e-03, -7.199e-03, -4.285e-02, 8.862e-03, -1.610e-02), r);
|
||||||
|
r = MulAdd(s0_3, M4(2.388e-01, -1.001e-03, 1.699e-01, -4.519e-02, -3.274e-01, 1.550e-01, 3.748e-02, 3.435e-02, -1.655e-01, 1.227e-02, -1.372e-01, 4.700e-02, -1.636e-01, 1.222e-02, -1.323e-01, 3.239e-02), r);
|
||||||
|
r = MulAdd(s0_4, M4(1.698e-01, 4.561e-01, -1.355e-01, 1.831e-01, -3.815e-01, -7.832e-01, 1.738e-01, 4.516e-02, 2.803e-01, -4.239e-01, 8.945e-01, -1.339e-02, -3.701e-01, -3.731e-01, 1.765e-01, -1.343e-01), r);
|
||||||
|
r = MulAdd(s0_5, M4(-4.653e-02, -8.470e-02, -1.076e-03, -7.153e-02, 1.022e-02, -2.560e-02, -1.154e-02, 2.252e-02, -1.053e-01, 4.014e-01, -1.479e-01, 3.667e-01, 9.425e-02, -8.079e-02, 5.594e-03, 4.870e-02), r);
|
||||||
|
r = MulAdd(s0_6, M4(-6.274e-02, -3.430e-02, -5.955e-02, 1.220e-02, -6.075e-02, 1.284e-02, -8.384e-02, 2.143e-01, -2.050e-02, -8.887e-03, -1.445e-02, 1.797e-02, 1.436e-01, -8.067e-04, 1.013e-01, 3.847e-03), r);
|
||||||
|
r = MulAdd(s0_7, M4(6.862e-02, -7.230e-02, -2.461e-01, -3.760e-01, 4.038e-02, -2.634e-02, -2.725e-01, -4.389e-01, 9.088e-03, -1.873e-02, -9.497e-02, -1.860e-01, -1.038e-01, 2.502e-01, -6.194e-01, 4.470e-02), r);
|
||||||
|
r = MulAdd(s0_8, M4(-1.984e-02, 4.173e-02, 5.328e-02, 5.554e-02, 1.241e-03, -2.290e-03, 5.972e-02, 4.381e-02, -3.320e-03, -1.434e-04, -5.754e-02, -6.072e-02, -6.854e-03, 6.781e-02, 1.208e-01, -5.469e-02), r);
|
||||||
|
// Contributions of the nine negative-part inputs.
r = MulAdd(s1_0, M4(7.050e-02, -3.676e-02, 7.009e-03, 1.431e-02, -1.258e-02, -6.854e-03, -9.803e-04, 5.955e-03, -3.077e-03, -2.372e-02, 8.060e-03, -5.992e-02, -7.957e-02, 2.905e-02, 3.914e-04, -1.408e-02), r);
|
||||||
|
r = MulAdd(s1_1, M4(-1.068e-01, 4.589e-02, -1.399e-02, -8.157e-03, 1.811e-02, 7.241e-03, 9.447e-03, 3.242e-03, 5.152e-02, 8.667e-02, -2.512e-02, -2.978e-02, 1.382e-01, 5.481e-02, -2.199e-02, -2.739e-02), r);
|
||||||
|
r = MulAdd(s1_2, M4(3.676e-02, 1.705e-02, -4.520e-03, -6.449e-03, 1.006e-02, 9.807e-03, -6.046e-03, -1.299e-03, -5.035e-02, -4.415e-02, 9.619e-03, -1.059e-02, -6.952e-03, -1.803e-02, -4.042e-03, -1.751e-02), r);
|
||||||
|
r = MulAdd(s1_3, M4(5.123e-02, 4.500e-02, 2.099e-01, -7.254e-03, -7.977e-02, 2.822e-02, -1.546e-01, -3.748e-02, -2.378e-01, -1.836e-02, -3.508e-02, -2.147e-03, 3.371e-02, -4.720e-02, -5.574e-02, -1.592e-02), r);
|
||||||
|
r = MulAdd(s1_4, M4(-5.764e-01, 5.998e-01, -2.288e-01, 7.223e-01, -1.855e-01, -3.467e-01, 5.173e-02, -8.967e-02, 3.308e-01, -8.987e-02, 2.397e-01, 3.701e-01, -7.970e-02, -9.046e-01, 2.397e-01, -1.626e-01), r);
|
||||||
|
r = MulAdd(s1_5, M4(1.177e-02, -1.538e-01, 4.138e-02, -5.198e-02, 3.165e-03, 3.827e-02, -5.913e-03, 8.727e-03, 7.885e-02, 2.979e-01, -6.160e-02, 1.198e-01, 1.186e-02, 9.421e-02, -4.101e-02, 4.185e-03), r);
|
||||||
|
r = MulAdd(s1_6, M4(-7.690e-02, -4.820e-03, -1.106e-01, 4.040e-02, -6.883e-02, -3.284e-02, 1.259e-02, 1.509e-01, 6.378e-03, -5.293e-04, -3.690e-02, 6.274e-02, 1.401e-01, -3.801e-03, 1.489e-01, -1.044e-02), r);
|
||||||
|
r = MulAdd(s1_7, M4(1.140e-01, -1.333e-01, -1.739e-01, -1.739e-01, 4.736e-02, -1.306e-02, -3.673e-01, -6.127e-01, -3.477e-02, -6.090e-02, 2.430e-02, -2.666e-01, -6.599e-02, 2.794e-01, -1.724e-01, -2.744e-01), r);
|
||||||
|
r = MulAdd(s1_8, M4(1.045e-02, 6.106e-02, 3.463e-02, 6.708e-02, -1.028e-02, -2.277e-02, 6.536e-02, 8.227e-02, -5.566e-02, -3.941e-02, -6.862e-03, -1.219e-02, -1.438e-02, -4.651e-02, 5.359e-02, 4.650e-02), r);
|
||||||
|
// Unlike the earlier passes' f0, the result goes through tanh before returning.
return tanh(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass 10 entry point (//!DESC out-shuffle, IN INPUT and t0, OUT OUTPUT): each
// thread evaluates f0 once on the 3x3 l0 neighbourhood and writes a 2x2 block
// of OUTPUT. For each of the four pixels it samples INPUT, converts to YUV
// with rgb2yuv, adds one component of the f0 result to the first YUV
// component, and converts back with yuv2rgb.
//   blockStart - added to the doubled per-thread offset to form the block's first pixel
//   tid        - only tid.x is read; it is passed to Rmp8x8 (defined elsewhere)
void Pass10(uint2 blockStart, uint3 tid) {
|
||||||
|
// pt is used below to build pos; presumably the O macro behind l0 also reads
// pt and pos by name — TODO confirm before renaming either.
float2 pt = float2(GetInputPt());
|
||||||
|
// First pixel of this thread's 2x2 output block: the Rmp8x8 result is doubled
// (<< 1) because four pixels starting at gxy are written below.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
|
||||||
|
uint2 size = GetOutputSize();
|
||||||
|
// Only the first pixel of the block is tested against the output size.
// NOTE(review): the other three writes have no explicit bounds check in this
// function — confirm that out-of-range writes are harmless for OUTPUT.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// gxy >> 1 halves the output coordinate again before scaling by pt.
float2 pos = ((gxy >> 1) + 0.5) * pt;
|
||||||
|
|
||||||
|
// Nine l0 reads, row by row; s0_4 is the zero offset.
V4 s0_0 = l0(-1.0, -1.0);
|
||||||
|
V4 s0_1 = l0(0.0, -1.0);
|
||||||
|
V4 s0_2 = l0(1.0, -1.0);
|
||||||
|
V4 s0_3 = l0(-1.0, 0.0);
|
||||||
|
V4 s0_4 = l0(0.0, 0.0);
|
||||||
|
V4 s0_5 = l0(1.0, 0.0);
|
||||||
|
V4 s0_6 = l0(-1.0, 1.0);
|
||||||
|
V4 s0_7 = l0(0.0, 1.0);
|
||||||
|
V4 s0_8 = l0(1.0, 1.0);
|
||||||
|
// s1_k = -max(-s0_k, 0): the negative part of each sample (min(s0_k, 0)).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||||
|
V4 s1_1 = -max(-s0_1, 0.0);
|
||||||
|
V4 s1_2 = -max(-s0_2, 0.0);
|
||||||
|
V4 s1_3 = -max(-s0_3, 0.0);
|
||||||
|
V4 s1_4 = -max(-s0_4, 0.0);
|
||||||
|
V4 s1_5 = -max(-s0_5, 0.0);
|
||||||
|
V4 s1_6 = -max(-s0_6, 0.0);
|
||||||
|
V4 s1_7 = -max(-s0_7, 0.0);
|
||||||
|
V4 s1_8 = -max(-s0_8, 0.0);
|
||||||
|
// s0_k is then overwritten with its positive part, so s0_k + s1_k equals the
// original sample.
s0_0 = max(s0_0, 0.0);
|
||||||
|
s0_1 = max(s0_1, 0.0);
|
||||||
|
s0_2 = max(s0_2, 0.0);
|
||||||
|
s0_3 = max(s0_3, 0.0);
|
||||||
|
s0_4 = max(s0_4, 0.0);
|
||||||
|
s0_5 = max(s0_5, 0.0);
|
||||||
|
s0_6 = max(s0_6, 0.0);
|
||||||
|
s0_7 = max(s0_7, 0.0);
|
||||||
|
s0_8 = max(s0_8, 0.0);
|
||||||
|
|
||||||
|
// r.x, r.y, r.w and r.z are applied below to the pixels at block offsets
// (0,0), (1,0), (1,1) and (0,1) respectively.
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||||
|
|
||||||
|
// Constant 3x3 matrices; judging by their names they convert between RGB and
// YUV — coefficients not verified here.
static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
|
||||||
|
static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
|
||||||
|
// Presumably the size of one output texel in INPUT sampling coordinates — TODO confirm.
float2 opt = float2(GetOutputPt());
|
||||||
|
|
||||||
|
// Shift pos back by half of opt for the first pixel; it is then stepped by
// opt.x / opt.y in lockstep with gxy so each INPUT sample follows the output
// pixel being written.
pos -= 0.5f * opt;
|
||||||
|
MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||||
|
// Offset (0,0): add r.x to the first YUV component (clamped to [0, 1] by
// saturate), keep the other two, convert back and write with alpha 1.
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);
|
||||||
|
|
||||||
|
// Offset (1,0): uses r.y.
++gxy.x;
|
||||||
|
pos.x += opt.x;
|
||||||
|
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||||
|
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);
|
||||||
|
|
||||||
|
// Offset (1,1): uses r.w.
++gxy.y;
|
||||||
|
pos.y += opt.y;
|
||||||
|
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||||
|
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);
|
||||||
|
|
||||||
|
// Offset (0,1): uses r.z.
--gxy.x;
|
||||||
|
pos.x -= opt.x;
|
||||||
|
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||||
|
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
|
||||||
|
}
|
||||||
1558
src/Effects/CuNNy/CuNNy-8x8C-NVL-DN.hlsl
Normal file
1558
src/Effects/CuNNy/CuNNy-8x8C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1558
src/Effects/CuNNy/CuNNy-8x8C-NVL.hlsl
Normal file
1558
src/Effects/CuNNy/CuNNy-8x8C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -16,11 +16,10 @@
|
||||||
|
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME CuNNy-03x12
|
//!SORT_NAME CuNNy-03x12
|
||||||
//!USE MulAdd
|
//!USE MulAdd
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
//!SCALE_FACTOR 2
|
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
@ -28,6 +27,8 @@
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
|
|
|
||||||
|
|
@ -16,11 +16,10 @@
|
||||||
|
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME CuNNy-08x32
|
//!SORT_NAME CuNNy-08x32
|
||||||
//!USE MulAdd
|
//!USE MulAdd
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
//!SCALE_FACTOR 2
|
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
@ -28,6 +27,8 @@
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH * 2
|
||||||
|
//!HEIGHT INPUT_HEIGHT * 2
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
|
|
@ -146,7 +147,7 @@ Texture2D T15;
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
//!OUT T0, T1, T2, T3, T4, T5, T6, T7
|
//!OUT T0, T1, T2, T3, T4, T5, T6, T7
|
||||||
|
|
||||||
#define L0(x, y) V3(EncodeSrgb(O(INPUT, x, y).rgb))
|
#define L0(x, y) V3(O(INPUT, x, y).rgb)
|
||||||
#define V3 MF3
|
#define V3 MF3
|
||||||
#define M3x4 MF3x4
|
#define M3x4 MF3x4
|
||||||
|
|
||||||
|
|
@ -5670,8 +5671,8 @@ void Pass10(uint2 blockStart, uint3 tid) {
|
||||||
r1 = MulAdd(s1_2_2, M4(3.648e-03, -7.492e-03, 7.566e-03, -6.626e-02, -1.922e-03, -1.418e-03, 8.532e-05, -1.628e-03, -1.875e-03, -7.480e-03, -5.740e-03, -3.978e-02, -8.104e-04, 2.341e-03, 5.188e-04, -7.545e-03), r1);
|
r1 = MulAdd(s1_2_2, M4(3.648e-03, -7.492e-03, 7.566e-03, -6.626e-02, -1.922e-03, -1.418e-03, 8.532e-05, -1.628e-03, -1.875e-03, -7.480e-03, -5.740e-03, -3.978e-02, -8.104e-04, 2.341e-03, 5.188e-04, -7.545e-03), r1);
|
||||||
r2 = MulAdd(s1_2_2, M4(2.797e-03, -3.287e-03, 8.760e-03, -5.046e-02, -1.458e-03, -2.502e-03, 6.034e-04, -3.008e-03, -1.281e-03, 1.262e-03, 3.077e-03, 6.751e-02, -1.200e-04, 1.705e-03, -1.655e-05, -5.620e-03), r2);
|
r2 = MulAdd(s1_2_2, M4(2.797e-03, -3.287e-03, 8.760e-03, -5.046e-02, -1.458e-03, -2.502e-03, 6.034e-04, -3.008e-03, -1.281e-03, 1.262e-03, 3.077e-03, 6.751e-02, -1.200e-04, 1.705e-03, -1.655e-05, -5.620e-03), r2);
|
||||||
float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
|
float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
|
||||||
OUTPUT[gxy + int2(0, 0)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb) + MF3(r0.x, r1.x, r2.x))), 1.0);
|
OUTPUT[gxy + int2(0, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb + MF3(r0.x, r1.x, r2.x)), 1.0);
|
||||||
OUTPUT[gxy + int2(1, 0)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb) + MF3(r0.y, r1.y, r2.y))), 1.0);
|
OUTPUT[gxy + int2(1, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb + MF3(r0.y, r1.y, r2.y)), 1.0);
|
||||||
OUTPUT[gxy + int2(0, 1)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb) + MF3(r0.z, r1.z, r2.z))), 1.0);
|
OUTPUT[gxy + int2(0, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb + MF3(r0.z, r1.z, r2.z)), 1.0);
|
||||||
OUTPUT[gxy + int2(1, 1)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb) + MF3(r0.w, r1.w, r2.w))), 1.0);
|
OUTPUT[gxy + int2(1, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb + MF3(r0.w, r1.w, r2.w)), 1.0);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@
|
||||||
// Port from https://github.com/haasn/gentoo-conf/blob/xor/home/nand/.mpv/shaders/deband.glsl
|
// Port from https://github.com/haasn/gentoo-conf/blob/xor/home/nand/.mpv/shaders/deband.glsl
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Threshold
|
//!LABEL Threshold
|
||||||
|
|
@ -54,6 +53,8 @@ float grain;
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,14 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<Import Project="..\Common.Pre.props" />
|
|
||||||
<PropertyGroup Label="Globals">
|
<PropertyGroup Label="Globals">
|
||||||
<Keyword>Win32Proj</Keyword>
|
<Keyword>Win32Proj</Keyword>
|
||||||
<ProjectGuid>{62503530-b84b-4cc2-80b6-3f89618172b7}</ProjectGuid>
|
<ProjectGuid>{62503530-b84b-4cc2-80b6-3f89618172b7}</ProjectGuid>
|
||||||
<WindowsTargetPlatformVersion>10.0.26100.0</WindowsTargetPlatformVersion>
|
<WindowsTargetPlatformVersion>10.0.26100.0</WindowsTargetPlatformVersion>
|
||||||
<OutDir>$(OutBaseDir)\app\effects\</OutDir>
|
<IntDir>$(SolutionDir)\obj\$(Platform)\$(Configuration)\$(MSBuildProjectName)\</IntDir>
|
||||||
|
<OutDir>$(SolutionDir)\bin\$(Platform)\$(Configuration)\</OutDir>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<Import Project="..\Common.Pre.props" />
|
||||||
<PropertyGroup Label="Configuration">
|
<PropertyGroup Label="Configuration">
|
||||||
<ConfigurationType>Utility</ConfigurationType>
|
<ConfigurationType>Utility</ConfigurationType>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
@ -17,7 +18,7 @@
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ItemDefinitionGroup>
|
<ItemDefinitionGroup>
|
||||||
<CopyFileToFolders>
|
<CopyFileToFolders>
|
||||||
<DestinationFolders>$(OutDir)\shaders\</DestinationFolders>
|
<DestinationFolders>$(OutDir)\effects</DestinationFolders>
|
||||||
<DestinationFileName>%(RelativeDir)%(Filename)%(Extension)</DestinationFileName>
|
<DestinationFileName>%(RelativeDir)%(Filename)%(Extension)</DestinationFileName>
|
||||||
</CopyFileToFolders>
|
</CopyFileToFolders>
|
||||||
</ItemDefinitionGroup>
|
</ItemDefinitionGroup>
|
||||||
|
|
@ -366,6 +367,66 @@
|
||||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
</CopyFileToFolders>
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
|
||||||
|
<FileType>Document</FileType>
|
||||||
|
</CopyFileToFolders>
|
||||||
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
|
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
|
||||||
<FileType>Document</FileType>
|
<FileType>Document</FileType>
|
||||||
</CopyFileToFolders>
|
</CopyFileToFolders>
|
||||||
|
|
|
||||||
|
|
@ -360,6 +360,66 @@
|
||||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
||||||
<Filter>Anime4K</Filter>
|
<Filter>Anime4K</Filter>
|
||||||
</CopyFileToFolders>
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
|
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
|
||||||
|
<Filter>CuNNy</Filter>
|
||||||
|
</CopyFileToFolders>
|
||||||
<CopyFileToFolders Include="Bicubic.hlsl" />
|
<CopyFileToFolders Include="Bicubic.hlsl" />
|
||||||
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
|
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
|
||||||
<Filter>NIS</Filter>
|
<Filter>NIS</Filter>
|
||||||
|
|
@ -434,6 +494,9 @@
|
||||||
<Filter Include="Pixel Art">
|
<Filter Include="Pixel Art">
|
||||||
<UniqueIdentifier>{0b58f073-84cb-4c38-919d-80176ae408bc}</UniqueIdentifier>
|
<UniqueIdentifier>{0b58f073-84cb-4c38-919d-80176ae408bc}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
|
<Filter Include="CuNNy">
|
||||||
|
<UniqueIdentifier>{9157745b-aa96-42ce-bdc6-1230dffa326b}</UniqueIdentifier>
|
||||||
|
</Filter>
|
||||||
<Filter Include="CuNNy2">
|
<Filter Include="CuNNy2">
|
||||||
<UniqueIdentifier>{52055d56-41dc-409a-a878-3c1278082f6d}</UniqueIdentifier>
|
<UniqueIdentifier>{52055d56-41dc-409a-a878-3c1278082f6d}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
|
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
@ -17,6 +17,7 @@ Texture2D OUTPUT;
|
||||||
//!FILTER POINT
|
//!FILTER POINT
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
//!OUT OUTPUT
|
//!OUT OUTPUT
|
||||||
|
|
@ -460,11 +461,17 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
||||||
|
|
||||||
gxy.x += 8u;
|
gxy.x += 8u;
|
||||||
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||||
|
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
||||||
|
}
|
||||||
|
|
||||||
gxy.y += 8u;
|
gxy.y += 8u;
|
||||||
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||||
|
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
||||||
|
}
|
||||||
|
|
||||||
gxy.x -= 8u;
|
gxy.x -= 8u;
|
||||||
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||||
|
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,8 @@
|
||||||
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
|
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
@ -20,12 +19,15 @@ float sharpness;
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
//!FILTER POINT
|
//!FILTER POINT
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
//!OUT OUTPUT
|
//!OUT OUTPUT
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
// 移植自 https://github.com/libretro/slang-shaders/blob/3f67e1870dbd5be74ae2f09eaed0eeadce6abd15/misc/image-adjustment.slang
|
// 移植自 https://github.com/libretro/slang-shaders/blob/3f67e1870dbd5be74ae2f09eaed0eeadce6abd15/misc/image-adjustment.slang
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Target Gamma
|
//!LABEL Target Gamma
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,12 @@
|
||||||
// B = 0.825 to get rid of dithering. Increase B to get a fine sharpness, though dithering returns.
|
// B = 0.825 to get rid of dithering. Increase B to get a fine sharpness, though dithering returns.
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY AdvancedColor
|
//!USE MulAdd
|
||||||
|
|
||||||
#include "StubDefs.hlsli"
|
#include "StubDefs.hlsli"
|
||||||
|
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Window Sinc Param
|
//!LABEL Window Sinc Param
|
||||||
//!DEFAULT 0.5
|
//!DEFAULT 0.5
|
||||||
|
|
@ -49,6 +50,7 @@ Texture2D OUTPUT;
|
||||||
//!FILTER POINT
|
//!FILTER POINT
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
//!OUT OUTPUT
|
//!OUT OUTPUT
|
||||||
|
|
@ -60,6 +62,7 @@ SamplerState sam;
|
||||||
#define min4(a, b, c, d) min(min(a, b), min(c, d))
|
#define min4(a, b, c, d) min(min(a, b), min(c, d))
|
||||||
#define max4(a, b, c, d) max(max(a, b), max(c, d))
|
#define max4(a, b, c, d) max(max(a, b), max(c, d))
|
||||||
|
|
||||||
|
|
||||||
float d(float2 pt1, float2 pt2) {
|
float d(float2 pt1, float2 pt2) {
|
||||||
float2 v = pt2 - pt1;
|
float2 v = pt2 - pt1;
|
||||||
return sqrt(dot(v, v));
|
return sqrt(dot(v, v));
|
||||||
|
|
@ -105,9 +108,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
[unroll]
|
[unroll]
|
||||||
for (uint j = 0; j <= 2; j += 2) {
|
for (uint j = 0; j <= 2; j += 2) {
|
||||||
float2 tpos = (tc + uint2(i, j)) * inputPt;
|
float2 tpos = (tc + uint2(i, j)) * inputPt;
|
||||||
float4 sr = INPUT.GatherRed(sam, tpos);
|
const float4 sr = INPUT.GatherRed(sam, tpos);
|
||||||
float4 sg = INPUT.GatherGreen(sam, tpos);
|
const float4 sg = INPUT.GatherGreen(sam, tpos);
|
||||||
float4 sb = INPUT.GatherBlue(sam, tpos);
|
const float4 sb = INPUT.GatherBlue(sam, tpos);
|
||||||
|
|
||||||
// w z
|
// w z
|
||||||
// x y
|
// x y
|
||||||
|
|
@ -125,9 +128,11 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
color *= rcp(dot(mul(weights, float4(1, 1, 1, 1)), 1));
|
color *= rcp(dot(mul(weights, float4(1, 1, 1, 1)), 1));
|
||||||
|
|
||||||
// 抗振铃
|
// 抗振铃
|
||||||
|
// Get min/max samples
|
||||||
float3 min_sample = min4(src[1][1], src[2][1], src[1][2], src[2][2]);
|
float3 min_sample = min4(src[1][1], src[2][1], src[1][2], src[2][2]);
|
||||||
float3 max_sample = max4(src[1][1], src[2][1], src[1][2], src[2][2]);
|
float3 max_sample = max4(src[1][1], src[2][1], src[1][2], src[2][2]);
|
||||||
color = lerp(color, clamp(color, min_sample, max_sample), ARStrength);
|
color = lerp(color, clamp(color, min_sample, max_sample), ARStrength);
|
||||||
|
|
||||||
|
// final sum and weight normalization
|
||||||
OUTPUT[gxy] = float4(color, 1);
|
OUTPUT[gxy] = float4(color, 1);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,8 @@
|
||||||
// 移植自 https://github.com/libretro/common-shaders/blob/master/windowed/shaders/lanczos6.cg
|
// 移植自 https://github.com/libretro/common-shaders/blob/master/windowed/shaders/lanczos6.cg
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY AdvancedColor
|
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Anti-ringing Strength
|
//!LABEL Anti-ringing Strength
|
||||||
|
|
@ -23,6 +23,7 @@ Texture2D OUTPUT;
|
||||||
//!FILTER POINT
|
//!FILTER POINT
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!STYLE PS
|
//!STYLE PS
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY FP16
|
//!CAPABILITY FP16
|
||||||
|
|
||||||
#include "../StubDefs.hlsli"
|
#include "../StubDefs.hlsli"
|
||||||
|
|
|
||||||
|
|
@ -163,10 +163,17 @@
|
||||||
#define NVU2 uint2
|
#define NVU2 uint2
|
||||||
#define NVB bool
|
#define NVB bool
|
||||||
#if NIS_USE_HALF_PRECISION
|
#if NIS_USE_HALF_PRECISION
|
||||||
#define NVH MF
|
#if NIS_HLSL_6_2
|
||||||
#define NVH2 MF2
|
#define NVH float16_t
|
||||||
#define NVH3 MF3
|
#define NVH2 float16_t2
|
||||||
#define NVH4 MF4
|
#define NVH3 float16_t3
|
||||||
|
#define NVH4 float16_t4
|
||||||
|
#else
|
||||||
|
#define NVH min16float
|
||||||
|
#define NVH2 min16float2
|
||||||
|
#define NVH3 min16float3
|
||||||
|
#define NVH4 min16float4
|
||||||
|
#endif // NIS_HLSL_6_2
|
||||||
#else // FP32 types
|
#else // FP32 types
|
||||||
#define NVH NVF
|
#define NVH NVF
|
||||||
#define NVH2 NVF2
|
#define NVH2 NVF2
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!CAPABILITY AdvancedColor
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
// 移植自 https://github.com/SnapdragonStudios/snapdragon-gsr/blob/main/sgsr/v1/include/hlsl/sgsr1_shader_mobile.hlsl
|
// 移植自 https://github.com/SnapdragonStudios/snapdragon-gsr/blob/main/sgsr/v1/include/hlsl/sgsr1_shader_mobile.hlsl
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Edge Sharpness
|
//!LABEL Edge Sharpness
|
||||||
|
|
@ -154,8 +154,8 @@ float3 SgsrYuvH(float2 uv, float4 con1)
|
||||||
|
|
||||||
float deltaY = finalY - pix_G;
|
float deltaY = finalY - pix_G;
|
||||||
|
|
||||||
pix += deltaY;
|
pix = saturate(pix+deltaY);
|
||||||
}
|
}
|
||||||
return pix;
|
return pix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME SMAA_2
|
//!SORT_NAME SMAA_2
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -39,9 +41,10 @@ SamplerState PointSampler;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState LinearSampler;
|
SamplerState LinearSampler;
|
||||||
|
|
||||||
|
|
||||||
//!COMMON
|
//!COMMON
|
||||||
|
|
||||||
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
|
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
|
||||||
#define SMAA_LINEAR_SAMPLER LinearSampler
|
#define SMAA_LINEAR_SAMPLER LinearSampler
|
||||||
#define SMAA_POINT_SAMPLER PointSampler
|
#define SMAA_POINT_SAMPLER PointSampler
|
||||||
#define SMAA_PRESET_HIGH
|
#define SMAA_PRESET_HIGH
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME SMAA_0
|
//!SORT_NAME SMAA_0
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -39,9 +41,10 @@ SamplerState PointSampler;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState LinearSampler;
|
SamplerState LinearSampler;
|
||||||
|
|
||||||
|
|
||||||
//!COMMON
|
//!COMMON
|
||||||
|
|
||||||
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
|
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
|
||||||
#define SMAA_LINEAR_SAMPLER LinearSampler
|
#define SMAA_LINEAR_SAMPLER LinearSampler
|
||||||
#define SMAA_POINT_SAMPLER PointSampler
|
#define SMAA_POINT_SAMPLER PointSampler
|
||||||
#define SMAA_PRESET_LOW
|
#define SMAA_PRESET_LOW
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME SMAA_1
|
//!SORT_NAME SMAA_1
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -39,9 +41,10 @@ SamplerState PointSampler;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState LinearSampler;
|
SamplerState LinearSampler;
|
||||||
|
|
||||||
|
|
||||||
//!COMMON
|
//!COMMON
|
||||||
|
|
||||||
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
|
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
|
||||||
#define SMAA_LINEAR_SAMPLER LinearSampler
|
#define SMAA_LINEAR_SAMPLER LinearSampler
|
||||||
#define SMAA_POINT_SAMPLER PointSampler
|
#define SMAA_POINT_SAMPLER PointSampler
|
||||||
#define SMAA_PRESET_MEDIUM
|
#define SMAA_PRESET_MEDIUM
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SORT_NAME SMAA_3
|
//!SORT_NAME SMAA_3
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
|
@ -39,9 +41,10 @@ SamplerState PointSampler;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState LinearSampler;
|
SamplerState LinearSampler;
|
||||||
|
|
||||||
|
|
||||||
//!COMMON
|
//!COMMON
|
||||||
|
|
||||||
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
|
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
|
||||||
#define SMAA_LINEAR_SAMPLER LinearSampler
|
#define SMAA_LINEAR_SAMPLER LinearSampler
|
||||||
#define SMAA_POINT_SAMPLER PointSampler
|
#define SMAA_POINT_SAMPLER PointSampler
|
||||||
#define SMAA_PRESET_ULTRA
|
#define SMAA_PRESET_ULTRA
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,9 @@
|
||||||
// 移植自 https://gist.github.com/igv/36508af3ffc84410fe39761d6969be10
|
// 移植自 https://gist.github.com/igv/36508af3ffc84410fe39761d6969be10
|
||||||
// 原始文件使用了大量 mpv 的“特性”,因此可能存在移植错误。如果你熟悉 mpv hook,请帮助我们改进
|
// 原始文件使用了大量 mpv 的“特性”,因此可能存在移植错误。如果你熟悉 mpv hook,请帮助我们改进
|
||||||
|
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Oversharp
|
//!LABEL Oversharp
|
||||||
|
|
@ -40,7 +41,7 @@ Texture2D MR;
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
//!WIDTH OUTPUT_WIDTH
|
//!WIDTH OUTPUT_WIDTH
|
||||||
//!HEIGHT OUTPUT_HEIGHT
|
//!HEIGHT OUTPUT_HEIGHT
|
||||||
//!FORMAT COLOR_SPACE_ADAPTIVE
|
//!FORMAT R8G8B8A8_UNORM
|
||||||
Texture2D POSTKERNEL;
|
Texture2D POSTKERNEL;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
|
|
@ -51,6 +52,7 @@ SamplerState sam;
|
||||||
//!FILTER LINEAR
|
//!FILTER LINEAR
|
||||||
SamplerState sam1;
|
SamplerState sam1;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!DESC CatumllRom
|
//!DESC CatumllRom
|
||||||
//!STYLE PS
|
//!STYLE PS
|
||||||
|
|
@ -58,60 +60,58 @@ SamplerState sam1;
|
||||||
//!OUT POSTKERNEL
|
//!OUT POSTKERNEL
|
||||||
|
|
||||||
// 模拟 mpv 的内置缩放(CatmullRom)
|
// 模拟 mpv 的内置缩放(CatmullRom)
|
||||||
|
// Samples a texture with Catmull-Rom filtering, using 9 texture fetches instead of 16.
|
||||||
float4 weight4(float x) {
|
// See http://vec3.ca/bicubic-filtering-in-fewer-taps/ for more details
|
||||||
// Sharper version. May look better in some cases. B=0, C=0.75
|
|
||||||
return float4(
|
|
||||||
((-0.75 * x + 1.5) * x - 0.75) * x,
|
|
||||||
(1.25 * x - 2.25) * x * x + 1.0,
|
|
||||||
((-1.25 * x + 1.5) * x + 0.75) * x,
|
|
||||||
(0.75 * x - 0.75) * x * x
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
float4 Pass1(float2 pos) {
|
float4 Pass1(float2 pos) {
|
||||||
const float2 inputPt = GetInputPt();
|
float2 inputSize = GetInputSize();
|
||||||
const float2 inputSize = GetInputSize();
|
float2 inputPt = GetInputPt();
|
||||||
|
|
||||||
pos *= inputSize;
|
// We're going to sample a a 4x4 grid of texels surrounding the target UV coordinate. We'll do this by rounding
|
||||||
float2 pos1 = floor(pos - 0.5) + 0.5;
|
// down the sample location to get the exact center of our "starting" texel. The starting texel will be at
|
||||||
float2 f = pos - pos1;
|
// location [1, 1] in the grid, where [0, 0] is the top left corner.
|
||||||
|
float2 samplePos = pos * inputSize;
|
||||||
|
float2 texPos1 = floor(samplePos - 0.5f) + 0.5f;
|
||||||
|
|
||||||
float4 rowtaps = weight4(f.x);
|
// Compute the fractional offset from our starting texel to our original sample location, which we'll
|
||||||
float4 coltaps = weight4(f.y);
|
// feed into the Catmull-Rom spline function to get our filter weights.
|
||||||
|
float2 f = samplePos - texPos1;
|
||||||
|
|
||||||
float2 uv1 = pos1 * inputPt;
|
// Compute the Catmull-Rom weights using the fractional offset that we calculated earlier.
|
||||||
float2 uv0 = uv1 - inputPt;
|
// These equations are pre-expanded based on our knowledge of where the texels will be located,
|
||||||
float2 uv2 = uv1 + inputPt;
|
// which lets us avoid having to evaluate a piece-wise function.
|
||||||
float2 uv3 = uv2 + inputPt;
|
float2 w0 = f * (-0.5f + f * (1.0f - 0.5f * f));
|
||||||
|
float2 w1 = 1.0f + f * f * (-2.5f + 1.5f * f);
|
||||||
|
float2 w2 = f * (0.5f + f * (2.0f - 1.5f * f));
|
||||||
|
float2 w3 = f * f * (-0.5f + 0.5f * f);
|
||||||
|
|
||||||
float u_weight_sum = rowtaps.y + rowtaps.z;
|
// Work out weighting factors and sampling offsets that will let us use bilinear filtering to
|
||||||
float u_middle_offset = rowtaps.z * inputPt.x / u_weight_sum;
|
// simultaneously evaluate the middle 2 samples from the 4x4 grid.
|
||||||
float u_middle = uv1.x + u_middle_offset;
|
float2 w12 = w1 + w2;
|
||||||
|
float2 offset12 = w2 / (w1 + w2);
|
||||||
|
|
||||||
float v_weight_sum = coltaps.y + coltaps.z;
|
// Compute the final UV coordinates we'll use for sampling the texture
|
||||||
float v_middle_offset = coltaps.z * inputPt.y / v_weight_sum;
|
float2 texPos0 = texPos1 - 1;
|
||||||
float v_middle = uv1.y + v_middle_offset;
|
float2 texPos3 = texPos1 + 2;
|
||||||
|
float2 texPos12 = texPos1 + offset12;
|
||||||
|
|
||||||
int2 coord_top_left = int2(max(uv0 * inputSize, 0.5));
|
texPos0 *= inputPt;
|
||||||
int2 coord_bottom_right = int2(min(uv3 * inputSize, inputSize - 0.5));
|
texPos3 *= inputPt;
|
||||||
|
texPos12 *= inputPt;
|
||||||
|
|
||||||
float3 top = INPUT.Load(int3(coord_top_left, 0)).rgb * rowtaps.x;
|
float4 result = 0.0f;
|
||||||
top += INPUT.SampleLevel(sam1, float2(u_middle, uv0.y), 0).rgb * u_weight_sum;
|
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos0.y), 0) * w0.x * w0.y;
|
||||||
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)).rgb * rowtaps.w;
|
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos0.y), 0) * w12.x * w0.y;
|
||||||
float3 total = top * coltaps.x;
|
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos0.y), 0) * w3.x * w0.y;
|
||||||
|
|
||||||
float3 middle = INPUT.SampleLevel(sam1, float2(uv0.x, v_middle), 0).rgb * rowtaps.x;
|
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos12.y), 0) * w0.x * w12.y;
|
||||||
middle += INPUT.SampleLevel(sam1, float2(u_middle, v_middle), 0).rgb * u_weight_sum;
|
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos12.y), 0) * w12.x * w12.y;
|
||||||
middle += INPUT.SampleLevel(sam1, float2(uv3.x, v_middle), 0).rgb * rowtaps.w;
|
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos12.y), 0) * w3.x * w12.y;
|
||||||
total += middle * v_weight_sum;
|
|
||||||
|
|
||||||
float3 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)).rgb * rowtaps.x;
|
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos3.y), 0) * w0.x * w3.y;
|
||||||
bottom += INPUT.SampleLevel(sam1, float2(u_middle, uv3.y), 0).rgb * u_weight_sum;
|
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos3.y), 0) * w12.x * w3.y;
|
||||||
bottom += INPUT.Load(int3(coord_bottom_right, 0)).rgb * rowtaps.w;
|
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos3.y), 0) * w3.x * w3.y;
|
||||||
total += bottom * coltaps.w;
|
|
||||||
|
|
||||||
return float4(total, 1);
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//!PASS 2
|
//!PASS 2
|
||||||
|
|
@ -124,6 +124,7 @@ float4 Pass1(float2 pos) {
|
||||||
#define Kernel(x) MN(0.0f, 0.5f, abs(x))
|
#define Kernel(x) MN(0.0f, 0.5f, abs(x))
|
||||||
#define taps 2.0f
|
#define taps 2.0f
|
||||||
|
|
||||||
|
|
||||||
float4 Pass2(float2 pos) {
|
float4 Pass2(float2 pos) {
|
||||||
const float inputPtY = GetInputPt().y;
|
const float inputPtY = GetInputPt().y;
|
||||||
const uint inputHeight = GetInputSize().y;
|
const uint inputHeight = GetInputSize().y;
|
||||||
|
|
@ -151,6 +152,7 @@ float4 Pass2(float2 pos) {
|
||||||
return float4(avg, 1);
|
return float4(avg, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 3
|
//!PASS 3
|
||||||
//!DESC L2 pass 2
|
//!DESC L2 pass 2
|
||||||
//!STYLE PS
|
//!STYLE PS
|
||||||
|
|
@ -161,6 +163,7 @@ float4 Pass2(float2 pos) {
|
||||||
#define Kernel(x) MN(0.0, 0.5, abs(x))
|
#define Kernel(x) MN(0.0, 0.5, abs(x))
|
||||||
#define taps 2.0
|
#define taps 2.0
|
||||||
|
|
||||||
|
|
||||||
float4 Pass3(float2 pos) {
|
float4 Pass3(float2 pos) {
|
||||||
const float inputPtX = GetInputPt().x;
|
const float inputPtX = GetInputPt().x;
|
||||||
const uint inputWidth = GetInputSize().x;
|
const uint inputWidth = GetInputSize().x;
|
||||||
|
|
@ -187,6 +190,7 @@ float4 Pass3(float2 pos) {
|
||||||
return float4(avg, 1);
|
return float4(avg, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 4
|
//!PASS 4
|
||||||
//!DESC mean & R
|
//!DESC mean & R
|
||||||
//!IN L2_2, POSTKERNEL
|
//!IN L2_2, POSTKERNEL
|
||||||
|
|
@ -203,6 +207,7 @@ float4 Pass3(float2 pos) {
|
||||||
|
|
||||||
#define Luma(rgb) ( dot(rgb, float3(0.2126, 0.7152, 0.0722)) )
|
#define Luma(rgb) ( dot(rgb, float3(0.2126, 0.7152, 0.0722)) )
|
||||||
|
|
||||||
|
|
||||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||||
uint2 outputSize = GetOutputSize();
|
uint2 outputSize = GetOutputSize();
|
||||||
|
|
@ -219,7 +224,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
for (i = 0; i < taps; i += 2) {
|
for (i = 0; i < taps; i += 2) {
|
||||||
[unroll]
|
[unroll]
|
||||||
for (j = 0; j < taps; j += 2) {
|
for (j = 0; j < taps; j += 2) {
|
||||||
float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
|
const float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
|
||||||
float4 sr = POSTKERNEL.GatherRed(sam, tpos);
|
float4 sr = POSTKERNEL.GatherRed(sam, tpos);
|
||||||
float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
|
float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
|
||||||
float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
|
float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
|
||||||
|
|
@ -253,7 +258,13 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
[unroll]
|
[unroll]
|
||||||
for (j = 0; j <= 1; ++j) {
|
for (j = 0; j <= 1; ++j) {
|
||||||
uint2 destPos = gxy + uint2(i, j);
|
uint2 destPos = gxy + uint2(i, j);
|
||||||
|
|
||||||
|
if (i != 0 || j != 0) {
|
||||||
|
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
float W = 0.0;
|
float W = 0.0;
|
||||||
float3x3 avg = 0;
|
float3x3 avg = 0;
|
||||||
|
|
||||||
|
|
@ -282,6 +293,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//!PASS 5
|
//!PASS 5
|
||||||
//!DESC final pass
|
//!DESC final pass
|
||||||
//!IN MR, POSTKERNEL
|
//!IN MR, POSTKERNEL
|
||||||
|
|
@ -295,6 +307,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
||||||
// taps 需为奇数
|
// taps 需为奇数
|
||||||
#define taps 3
|
#define taps 3
|
||||||
|
|
||||||
|
|
||||||
void Pass5(uint2 blockStart, uint3 threadId) {
|
void Pass5(uint2 blockStart, uint3 threadId) {
|
||||||
const uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
const uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||||
|
|
||||||
|
|
@ -311,11 +324,11 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
||||||
for (i = 0; i < taps; i += 2) {
|
for (i = 0; i < taps; i += 2) {
|
||||||
[unroll]
|
[unroll]
|
||||||
for (j = 0; j < taps; j += 2) {
|
for (j = 0; j < taps; j += 2) {
|
||||||
float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
|
const float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
|
||||||
float4 sr = MR.GatherRed(sam, tpos);
|
const float4 sr = MR.GatherRed(sam, tpos);
|
||||||
float4 sg = MR.GatherGreen(sam, tpos);
|
const float4 sg = MR.GatherGreen(sam, tpos);
|
||||||
float4 sb = MR.GatherBlue(sam, tpos);
|
const float4 sb = MR.GatherBlue(sam, tpos);
|
||||||
float4 sa = MR.GatherAlpha(sam, tpos);
|
const float4 sa = MR.GatherAlpha(sam, tpos);
|
||||||
|
|
||||||
// w z
|
// w z
|
||||||
// x y
|
// x y
|
||||||
|
|
@ -327,10 +340,10 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
||||||
}
|
}
|
||||||
|
|
||||||
float3 src2[2][2];
|
float3 src2[2][2];
|
||||||
float2 tpos = (gxy + 1) * outputPt;
|
const float2 tpos = (gxy + 1) * outputPt;
|
||||||
float4 sr = POSTKERNEL.GatherRed(sam, tpos);
|
const float4 sr = POSTKERNEL.GatherRed(sam, tpos);
|
||||||
float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
|
const float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
|
||||||
float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
|
const float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
|
||||||
|
|
||||||
// w z
|
// w z
|
||||||
// x y
|
// x y
|
||||||
|
|
|
||||||
|
|
@ -4,9 +4,10 @@
|
||||||
// Adaptive sharpen - version 2015-05-15 - (requires ps >= 3.0)
|
// Adaptive sharpen - version 2015-05-15 - (requires ps >= 3.0)
|
||||||
// Tuned for use post resize, EXPECTS FULL RANGE GAMMA LIGHT
|
// Tuned for use post resize, EXPECTS FULL RANGE GAMMA LIGHT
|
||||||
|
|
||||||
|
|
||||||
//!MAGPIE EFFECT
|
//!MAGPIE EFFECT
|
||||||
//!VERSION 5
|
//!VERSION 4
|
||||||
//!SCALE_FACTOR 1
|
|
||||||
|
|
||||||
//!PARAMETER
|
//!PARAMETER
|
||||||
//!LABEL Sharpness
|
//!LABEL Sharpness
|
||||||
|
|
@ -23,25 +24,21 @@ float curveHeight;
|
||||||
Texture2D INPUT;
|
Texture2D INPUT;
|
||||||
|
|
||||||
//!TEXTURE
|
//!TEXTURE
|
||||||
|
//!WIDTH INPUT_WIDTH
|
||||||
|
//!HEIGHT INPUT_HEIGHT
|
||||||
Texture2D OUTPUT;
|
Texture2D OUTPUT;
|
||||||
|
|
||||||
//!SAMPLER
|
//!SAMPLER
|
||||||
//!FILTER POINT
|
//!FILTER POINT
|
||||||
SamplerState sam;
|
SamplerState sam;
|
||||||
|
|
||||||
|
|
||||||
//!PASS 1
|
//!PASS 1
|
||||||
//!IN INPUT
|
//!IN INPUT
|
||||||
//!OUT OUTPUT
|
//!OUT OUTPUT
|
||||||
//!BLOCK_SIZE 16
|
//!BLOCK_SIZE 16
|
||||||
//!NUM_THREADS 64
|
//!NUM_THREADS 64
|
||||||
|
|
||||||
// DXC 编译时展开某些循环会大幅降低性能
|
|
||||||
#ifdef MP_SM_6_0
|
|
||||||
#define CONDITIONAL_UNROLL
|
|
||||||
#else
|
|
||||||
#define CONDITIONAL_UNROLL [unroll]
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Defined values under this row are "optimal" DO NOT CHANGE IF YOU DO NOT KNOW WHAT YOU ARE DOING!
|
// Defined values under this row are "optimal" DO NOT CHANGE IF YOU DO NOT KNOW WHAT YOU ARE DOING!
|
||||||
|
|
||||||
#define curveslope (curveHeight*1.5f) // Sharpening curve slope, edge region
|
#define curveslope (curveHeight*1.5f) // Sharpening curve slope, edge region
|
||||||
|
|
@ -51,9 +48,9 @@ SamplerState sam;
|
||||||
#define L_comp_ratio 0.167f // Max compression ratio, light overshoot (1/0.167=6x)
|
#define L_comp_ratio 0.167f // Max compression ratio, light overshoot (1/0.167=6x)
|
||||||
#define max_scale_lim 10.0f // Abs change before max compression (1/10=±10%)
|
#define max_scale_lim 10.0f // Abs change before max compression (1/10=±10%)
|
||||||
|
|
||||||
// 效果工作在线性 RGB 空间,应使用 GetLuminance 计算亮度
|
|
||||||
// Colour to greyscale, fast approx gamma
|
// Colour to greyscale, fast approx gamma
|
||||||
// float CtG(float3 RGB) { return sqrt((1.0f / 3.0f) * ((RGB * RGB).r + (RGB * RGB).g + (RGB * RGB).b)); }
|
float CtG(float3 RGB) { return sqrt((1.0f / 3.0f) * ((RGB * RGB).r + (RGB * RGB).g + (RGB * RGB).b)); }
|
||||||
|
|
||||||
|
|
||||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||||
|
|
@ -71,6 +68,11 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
for (i = 0; i <= 6; i += 2) {
|
for (i = 0; i <= 6; i += 2) {
|
||||||
[unroll]
|
[unroll]
|
||||||
for (j = 0; j <= 6; j += 2) {
|
for (j = 0; j <= 6; j += 2) {
|
||||||
|
// 四角共 16 个纹素无需采样
|
||||||
|
if ((i == 0 && j == 0) || (i == 6 && j == 0) || (i == 0 && j == 6) || (i == 6 && j == 6)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
float2 tpos = ((int2)gxy + int2(i, j) - 2) * inputPt;
|
float2 tpos = ((int2)gxy + int2(i, j) - 2) * inputPt;
|
||||||
const float4 sr = INPUT.GatherRed(sam, tpos);
|
const float4 sr = INPUT.GatherRed(sam, tpos);
|
||||||
const float4 sg = INPUT.GatherGreen(sam, tpos);
|
const float4 sg = INPUT.GatherGreen(sam, tpos);
|
||||||
|
|
@ -79,19 +81,19 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
||||||
// w z
|
// w z
|
||||||
// x y
|
// x y
|
||||||
src[i][j].rgb = float3(sr.w, sg.w, sb.w);
|
src[i][j].rgb = float3(sr.w, sg.w, sb.w);
|
||||||
src[i][j].w = GetLuminance(src[i][j].rgb);
|
src[i][j].w = CtG(src[i][j].rgb);
|
||||||
src[i][j + 1].rgb = float3(sr.x, sg.x, sb.x);
|
src[i][j + 1].rgb = float3(sr.x, sg.x, sb.x);
|
||||||
src[i][j + 1].w = GetLuminance(src[i][j + 1].rgb);
|
src[i][j + 1].w = CtG(src[i][j + 1].rgb);
|
||||||
src[i + 1][j].rgb = float3(sr.z, sg.z, sb.z);
|
src[i + 1][j].rgb = float3(sr.z, sg.z, sb.z);
|
||||||
src[i + 1][j].w = GetLuminance(src[i + 1][j].rgb);
|
src[i + 1][j].w = CtG(src[i + 1][j].rgb);
|
||||||
src[i + 1][j + 1].rgb = float3(sr.y, sg.y, sb.y);
|
src[i + 1][j + 1].rgb = float3(sr.y, sg.y, sb.y);
|
||||||
src[i + 1][j + 1].w = GetLuminance(src[i + 1][j + 1].rgb);
|
src[i + 1][j + 1].w = CtG(src[i + 1][j + 1].rgb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CONDITIONAL_UNROLL
|
[unroll]
|
||||||
for (i = 0; i <= 1; ++i) {
|
for (i = 0; i <= 1; ++i) {
|
||||||
CONDITIONAL_UNROLL
|
[unroll]
|
||||||
for (j = 0; j <= 1; ++j) {
|
for (j = 0; j <= 1; ++j) {
|
||||||
const uint2 destPos = gxy + uint2(i, j);
|
const uint2 destPos = gxy + uint2(i, j);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,21 +26,19 @@
|
||||||
#define MF4x3 float4x3
|
#define MF4x3 float4x3
|
||||||
#define MF4x4 float4x4
|
#define MF4x4 float4x4
|
||||||
|
|
||||||
uint2 Rmp8x8(uint a) { return uint2(0); }
|
uint2 Rmp8x8(uint a) { return uint2(a / 8, a % 8); }
|
||||||
uint2 GetInputSize() { return uint2(0); }
|
uint2 GetInputSize() { return uint2(0, 0); }
|
||||||
float2 GetInputPt() { return float2(0); }
|
float2 GetInputPt() { return float2(0, 0); }
|
||||||
uint2 GetOutputSize() { return float2(0); }
|
uint2 GetOutputSize() { return float2(0, 0); }
|
||||||
float2 GetOutputPt() { return float2(0); }
|
float2 GetOutputPt() { return float2(0, 0); }
|
||||||
float2 GetScale() { return float2(0); }
|
float2 GetScale() { return float2(0, 0); }
|
||||||
MF3 EncodeSrgb(MF3 c) { return MF3(0); }
|
MF2 MulAdd(MF2 x, MF2x2 y, MF2 a) { return mul(x, y) + a; }
|
||||||
MF3 DecodeSrgb(MF3 c) { return MF3(0); }
|
MF3 MulAdd(MF2 x, MF2x3 y, MF3 a) { return mul(x, y) + a; }
|
||||||
MF GetLuminance(MF3 c) { return 0; }
|
MF4 MulAdd(MF2 x, MF2x4 y, MF4 a) { return mul(x, y) + a; }
|
||||||
MF2 MulAdd(MF2 x, MF2x2 y, MF2 a) { return MF2(0); }
|
MF2 MulAdd(MF3 x, MF3x2 y, MF2 a) { return mul(x, y) + a; }
|
||||||
MF3 MulAdd(MF2 x, MF2x3 y, MF3 a) { return MF3(0); }
|
MF3 MulAdd(MF3 x, MF3x3 y, MF3 a) { return mul(x, y) + a; }
|
||||||
MF4 MulAdd(MF2 x, MF2x4 y, MF4 a) { return MF4(0); }
|
MF4 MulAdd(MF3 x, MF3x4 y, MF4 a) { return mul(x, y) + a; }
|
||||||
MF2 MulAdd(MF3 x, MF3x2 y, MF2 a) { return MF2(0); }
|
MF2 MulAdd(MF4 x, MF4x2 y, MF2 a) { return mul(x, y) + a; }
|
||||||
MF3 MulAdd(MF3 x, MF3x3 y, MF3 a) { return MF3(0); }
|
MF3 MulAdd(MF4 x, MF4x3 y, MF3 a) { return mul(x, y) + a; }
|
||||||
MF4 MulAdd(MF3 x, MF3x4 y, MF4 a) { return MF4(0); }
|
MF4 MulAdd(MF4 x, MF4x4 y, MF4 a) { return mul(x, y) + a; }
|
||||||
MF2 MulAdd(MF4 x, MF4x2 y, MF2 a) { return MF2(0); }
|
uint GetFrameCount() { return 0; }
|
||||||
MF3 MulAdd(MF4 x, MF4x3 y, MF3 a) { return MF3(0); }
|
|
||||||
MF4 MulAdd(MF4 x, MF4x4 y, MF4 a) { return MF4(0); }
|
|
||||||
|
|
|
||||||
338
src/Magpie.Core/AdaptivePresenter.cpp
Normal file
338
src/Magpie.Core/AdaptivePresenter.cpp
Normal file
|
|
@ -0,0 +1,338 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "AdaptivePresenter.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Sets up presentation for the window `hwndAttach`.
//
// When DirectFlip is disabled in the scaling options only the DirectComposition
// path is prepared and no swap chain is created. Otherwise a flip-model swap chain
// is created for the window together with its frame-latency waitable object,
// back buffer and render target view.
//
// Returns false if a required resource could not be created. A failing
// MakeWindowAssociation is only logged.
bool AdaptivePresenter::_Initialize(HWND hwndAttach) noexcept {
    if (ScalingWindow::Get().Options().IsDirectFlipDisabled()) {
        // With DirectFlip disabled we always present through DirectComposition
        if (!_ResizeDCompVisual(hwndAttach)) {
            Logger::Get().Error("_ResizeDCompVisual 失败");
            return false;
        }

        _isDCompPresenting = true;
        return true;
    }

    const uint32_t bufferCount = _CalcBufferCount();

    const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());
    DXGI_SWAP_CHAIN_DESC1 sd{
        .Width = (UINT)rendererSize.cx,
        .Height = (UINT)rendererSize.cy,
        .Format = DXGI_FORMAT_R8G8B8A8_UNORM,
        .SampleDesc = {
            .Count = 1
        },
        .BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT,
        .BufferCount = bufferCount,
#ifdef _DEBUG
        // The two presentation paths must switch seamlessly; DXGI_SCALING_NONE
        // makes mistakes easier to spot
        .Scaling = DXGI_SCALING_NONE,
#else
        // If the two paths cannot switch seamlessly, DXGI_SCALING_STRETCH keeps
        // the visual change as small as possible
        .Scaling = DXGI_SCALING_STRETCH,
#endif
        // The back buffer is cleared before every frame is rendered, so
        // DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL is unnecessary
        .SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD,
        .AlphaMode = DXGI_ALPHA_MODE_IGNORE,
        // Always enable DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING when the GPU supports
        // it, to support variable refresh rate
        .Flags = UINT((_deviceResources->IsTearingSupported() ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0)
        | DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT)
    };

    ID3D11Device5* d3dDevice = _deviceResources->GetD3DDevice();
    winrt::com_ptr<IDXGISwapChain1> dxgiSwapChain;
    HRESULT hr = _deviceResources->GetDXGIFactory()->CreateSwapChainForHwnd(
        d3dDevice,
        hwndAttach,
        &sd,
        nullptr,
        nullptr,
        dxgiSwapChain.put()
    );
    if (FAILED(hr)) {
        Logger::Get().ComError("创建交换链失败", hr);
        return false;
    }

    _dxgiSwapChain = dxgiSwapChain.try_as<IDXGISwapChain4>();
    if (!_dxgiSwapChain) {
        // The interface queried above is IDXGISwapChain4, so the log names that one
        Logger::Get().Error("获取 IDXGISwapChain4 失败");
        return false;
    }

    // To reduce latency, allow bufferCount - 1 frames to be rendered between two vsyncs
    _dxgiSwapChain->SetMaximumFrameLatency(bufferCount - 1);

    _frameLatencyWaitableObject.reset(_dxgiSwapChain->GetFrameLatencyWaitableObject());
    if (!_frameLatencyWaitableObject) {
        Logger::Get().Error("GetFrameLatencyWaitableObject 失败");
        return false;
    }

    // Failure here is logged but does not abort initialization
    hr = _deviceResources->GetDXGIFactory()->MakeWindowAssociation(
        hwndAttach, DXGI_MWA_NO_ALT_ENTER);
    if (FAILED(hr)) {
        Logger::Get().ComError("MakeWindowAssociation 失败", hr);
    }

    hr = _dxgiSwapChain->GetBuffer(0, IID_PPV_ARGS(_backBuffer.put()));
    if (FAILED(hr)) {
        Logger::Get().ComError("获取后缓冲区失败", hr);
        return false;
    }

    hr = d3dDevice->CreateRenderTargetView(_backBuffer.get(), nullptr, _backBufferRtv.put());
    if (FAILED(hr)) {
        Logger::Get().ComError("CreateRenderTargetView 失败", hr);
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
// Hands out the texture and render target view the next frame is drawn into.
//
// Swap chain path: waits on the frame-latency object (at most once per frame,
// up to 1000 ms), returns the cached back buffer / RTV and a zero draw offset.
// DirectComposition path: begins drawing on the virtual surface, which supplies
// the texture and the draw offset, then creates a render target view for it.
//
// Returns false if BeginDraw or CreateRenderTargetView fails.
bool AdaptivePresenter::BeginFrame(
    winrt::com_ptr<ID3D11Texture2D>& frameTex,
    winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
    POINT& drawOffset
) noexcept {
    if (!_isDCompPresenting) {
        drawOffset = {};

        if (!_isframeLatencyWaited) {
            _frameLatencyWaitableObject.wait(1000);
            _isframeLatencyWaited = true;
        }

        frameTex = _backBuffer;
        frameRtv = _backBufferRtv;
        return true;
    }

    const HRESULT beginDrawResult =
        _dcompSurface->BeginDraw(nullptr, IID_PPV_ARGS(&frameTex), &drawOffset);
    if (FAILED(beginDrawResult)) {
        Logger::Get().ComError("BeginDraw 失败", beginDrawResult);
        return false;
    }

    const HRESULT createRtvResult = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
        frameTex.get(), nullptr, frameRtv.put());
    if (FAILED(createRtvResult)) {
        Logger::Get().ComError("CreateRenderTargetView 失败", createRtvResult);
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
void AdaptivePresenter::EndFrame(bool waitForGpu) noexcept {
|
||||||
|
if (_isDCompPresenting) {
|
||||||
|
_dcompSurface->EndDraw();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (waitForGpu || _isResized) {
|
||||||
|
_isResized = false;
|
||||||
|
|
||||||
|
// 下面两个调用用于减少调整窗口尺寸时的边缘闪烁。
|
||||||
|
//
|
||||||
|
// 我们希望 DWM 绘制新的窗口框架时刚好合成新帧,但这不是我们能控制的,尤其是混合架构
|
||||||
|
// 下需要在显卡间传输帧数据,无法预测 Present/Commit 后多久 DWM 能收到。我们只能尽
|
||||||
|
// 可能为 DWM 合成新帧预留时间,这包括两个步骤:
|
||||||
|
//
|
||||||
|
// 1. 首先等待渲染完成,确保新帧对 DWM 随时可用。
|
||||||
|
// 2. 然后在新一轮合成开始时提交,这让 DWM 有更多时间合成新帧。
|
||||||
|
//
|
||||||
|
// 目前看来除非像 UWP 一般有 DWM 协助,否则彻底摆脱闪烁是不可能的。
|
||||||
|
//
|
||||||
|
// https://github.com/Blinue/Magpie/pull/1071#issuecomment-2718314731 讨论了 UWP
|
||||||
|
// 调整尺寸的方法,测试表明可以彻底解决闪烁问题。不过它使用了很不稳定的私有接口,没有
|
||||||
|
// 实用价值。
|
||||||
|
|
||||||
|
// 等待渲染完成
|
||||||
|
_WaitForGpu();
|
||||||
|
|
||||||
|
// 等待 DWM 开始合成新一帧
|
||||||
|
Win32Helper::WaitForDwmComposition();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_isDCompPresenting) {
|
||||||
|
_dcompDevice->Commit();
|
||||||
|
} else {
|
||||||
|
// 两个垂直同步之间允许渲染数帧,SyncInterval = 0 只呈现最新的一帧,旧帧被丢弃
|
||||||
|
_dxgiSwapChain->Present(0, 0);
|
||||||
|
_isframeLatencyWaited = false;
|
||||||
|
|
||||||
|
// 丢弃渲染目标的内容
|
||||||
|
_deviceResources->GetD3DDC()->DiscardView(_backBufferRtv.get());
|
||||||
|
|
||||||
|
if (_isSwitchingToSwapChain) {
|
||||||
|
_isSwitchingToSwapChain = false;
|
||||||
|
|
||||||
|
// 等待交换链呈现新帧
|
||||||
|
_WaitForGpu();
|
||||||
|
Win32Helper::WaitForDwmComposition();
|
||||||
|
|
||||||
|
// 清除 DirectCompostion 内容
|
||||||
|
_dcompVisual->SetContent(nullptr);
|
||||||
|
_dcompDevice->Commit();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AdaptivePresenter::OnResize() noexcept {
|
||||||
|
_isResized = true;
|
||||||
|
|
||||||
|
if (ScalingWindow::Get().IsResizingOrMoving() || !_dxgiSwapChain) {
|
||||||
|
// 切换到 DirectComposition 呈现,失败则回落到交换链
|
||||||
|
_isDCompPresenting = _ResizeDCompVisual();
|
||||||
|
if (_isDCompPresenting) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Logger::Get().Error("_ResizeDCompVisual 失败");
|
||||||
|
|
||||||
|
// 禁用 DirectFlip 时不存在交换链
|
||||||
|
if (!_dxgiSwapChain) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!_ResizeSwapChain()) {
|
||||||
|
Logger::Get().Error("_ResizeSwapChain 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AdaptivePresenter::OnEndResize(bool& shouldRedraw) noexcept {
|
||||||
|
if (!_isDCompPresenting || !_dxgiSwapChain) {
|
||||||
|
shouldRedraw = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
shouldRedraw = true;
|
||||||
|
|
||||||
|
_ResizeSwapChain();
|
||||||
|
_isDCompPresenting = false;
|
||||||
|
// 交换链呈现新帧后再清除 DirectCompostion 内容,确保无缝切换
|
||||||
|
_isSwitchingToSwapChain = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AdaptivePresenter::_ResizeSwapChain() noexcept {
|
||||||
|
assert(_dxgiSwapChain);
|
||||||
|
|
||||||
|
if (!_isframeLatencyWaited) {
|
||||||
|
_frameLatencyWaitableObject.wait(1000);
|
||||||
|
_isframeLatencyWaited = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
_backBuffer = nullptr;
|
||||||
|
_backBufferRtv = nullptr;
|
||||||
|
|
||||||
|
const RECT& swapChainRect = ScalingWindow::Get().RendererRect();
|
||||||
|
const SIZE swapChainSize = Win32Helper::GetSizeOfRect(swapChainRect);
|
||||||
|
HRESULT hr = _dxgiSwapChain->ResizeBuffers(
|
||||||
|
0,
|
||||||
|
(UINT)swapChainSize.cx,
|
||||||
|
(UINT)swapChainSize.cy,
|
||||||
|
DXGI_FORMAT_UNKNOWN,
|
||||||
|
UINT((_deviceResources->IsTearingSupported() ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0)
|
||||||
|
| DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT)
|
||||||
|
);
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("ResizeBuffers 失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
hr = _dxgiSwapChain->GetBuffer(0, IID_PPV_ARGS(_backBuffer.put()));
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("获取后缓冲区失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
hr = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
|
||||||
|
_backBuffer.get(), nullptr, _backBufferRtv.put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("CreateRenderTargetView 失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Makes the DirectComposition surface match the current renderer rectangle and
// sets it as the content of the visual.
//
// On first use this creates the DirectComposition device, the target for the
// window, the root visual and the virtual surface. `hwndAttach` may be null, in
// which case the window is obtained from the swap chain. On later calls only the
// existing virtual surface is resized.
//
// Returns false if any DirectComposition call fails.
bool AdaptivePresenter::_ResizeDCompVisual(HWND hwndAttach) noexcept {
    const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());

    if (_dcompSurface) {
        // IDCompositionVirtualSurface is used instead of IDCompositionSurface because
        // IDCompositionDevice2::CreateSurface is sometimes quite slow, taking tens of
        // milliseconds in the worst case.
        HRESULT hr = _dcompSurface->Resize((UINT)rendererSize.cx, (UINT)rendererSize.cy);
        if (FAILED(hr)) {
            Logger::Get().ComError("Resize 失败", hr);
            return false;
        }
    } else {
        // The surface is created last, so a null surface means either "never
        // initialized" or "a previous attempt failed part-way". Drop anything such
        // an attempt left behind: the put()/IID_PPV_ARGS calls below require empty
        // pointers, and overwriting live ones would leak them.
        _dcompTarget = nullptr;
        _dcompVisual = nullptr;
        _dcompDevice = nullptr;

        // Initialize DirectComposition
        HRESULT hr = DCompositionCreateDevice3(
            _deviceResources->GetD3DDevice(), IID_PPV_ARGS(&_dcompDevice));
        if (FAILED(hr)) {
            Logger::Get().ComError("DCompositionCreateDevice3 失败", hr);
            return false;
        }

        if (!hwndAttach) {
            // Initialization during a resize only happens when DirectFlip is not
            // disabled, so the swap chain is guaranteed to exist
            hr = _dxgiSwapChain->GetHwnd(&hwndAttach);
            if (FAILED(hr)) {
                Logger::Get().ComError("GetHwnd 失败", hr);
                return false;
            }
        }

        hr = _dcompDevice->CreateTargetForHwnd(hwndAttach, TRUE, _dcompTarget.put());
        if (FAILED(hr)) {
            Logger::Get().ComError("CreateTargetForHwnd 失败", hr);
            return false;
        }

        hr = _dcompDevice->CreateVisual(_dcompVisual.put());
        if (FAILED(hr)) {
            Logger::Get().ComError("CreateVisual 失败", hr);
            return false;
        }

        hr = _dcompTarget->SetRoot(_dcompVisual.get());
        if (FAILED(hr)) {
            Logger::Get().ComError("SetRoot 失败", hr);
            return false;
        }

        hr = _dcompDevice->CreateVirtualSurface(
            (UINT)rendererSize.cx,
            (UINT)rendererSize.cy,
            DXGI_FORMAT_R8G8B8A8_UNORM,
            DXGI_ALPHA_MODE_IGNORE,
            _dcompSurface.put()
        );
        if (FAILED(hr)) {
            Logger::Get().ComError("CreateVirtualSurface 失败", hr);
            return false;
        }
    }

    // (Re)attach the surface; EndFrame clears the visual's content when switching
    // back to the swap chain
    HRESULT hr = _dcompVisual->SetContent(_dcompSurface.get());
    if (FAILED(hr)) {
        Logger::Get().ComError("SetContent 失败", hr);
        return false;
    }

    return true;
}
|
||||||
|
|
||||||
|
}
|
||||||
49
src/Magpie.Core/AdaptivePresenter.h
Normal file
49
src/Magpie.Core/AdaptivePresenter.h
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
#pragma once
|
||||||
|
#include "PresenterBase.h"
|
||||||
|
#include <dcomp.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// 根据需要在交换链和 DirectComposition 两种呈现方式间切换。交换链可以触发
|
||||||
|
// DirectFlip/IndependentFlip 以最小化延迟,DirectComposition 在调整尺寸
|
||||||
|
// 时闪烁更少,这个呈现器旨在结合两者的优势。
|
||||||
|
class AdaptivePresenter final : public PresenterBase {
|
||||||
|
protected:
|
||||||
|
bool _Initialize(HWND hwndAttach) noexcept override;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool BeginFrame(
|
||||||
|
winrt::com_ptr<ID3D11Texture2D>& frameTex,
|
||||||
|
winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
|
||||||
|
POINT& drawOffset
|
||||||
|
) noexcept override;
|
||||||
|
|
||||||
|
void EndFrame(bool waitForGpu = false) noexcept override;
|
||||||
|
|
||||||
|
bool OnResize() noexcept override;
|
||||||
|
|
||||||
|
void OnEndResize(bool& shouldRedraw) noexcept override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _ResizeSwapChain() noexcept;
|
||||||
|
|
||||||
|
bool _ResizeDCompVisual(HWND hwndAttach = NULL) noexcept;
|
||||||
|
|
||||||
|
winrt::com_ptr<IDXGISwapChain4> _dxgiSwapChain;
|
||||||
|
wil::unique_event_nothrow _frameLatencyWaitableObject;
|
||||||
|
winrt::com_ptr<ID3D11Texture2D> _backBuffer;
|
||||||
|
winrt::com_ptr<ID3D11RenderTargetView> _backBufferRtv;
|
||||||
|
|
||||||
|
// 调整大小或禁用 DirectFlip 时使用
|
||||||
|
winrt::com_ptr<IDCompositionDesktopDevice> _dcompDevice;
|
||||||
|
winrt::com_ptr<IDCompositionTarget> _dcompTarget;
|
||||||
|
winrt::com_ptr<IDCompositionVisual2> _dcompVisual;
|
||||||
|
winrt::com_ptr<IDCompositionVirtualSurface> _dcompSurface;
|
||||||
|
|
||||||
|
bool _isDCompPresenting = false;
|
||||||
|
bool _isResized = false;
|
||||||
|
bool _isframeLatencyWaited = false;
|
||||||
|
bool _isSwitchingToSwapChain = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,90 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "AppFolderManager.h"
|
|
||||||
#include "CommonSharedConstants.h"
|
|
||||||
#include "Win32Helper.h"
|
|
||||||
#include <ShlObj.h>
|
|
||||||
|
|
||||||
#define APP_DIR L"app"
|
|
||||||
#define DATA_DIR L"data"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
bool AppFolderManager::Initialize() noexcept {
|
|
||||||
_exeDir = Win32Helper::GetExePath().parent_path();
|
|
||||||
if (_exeDir.empty()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// dll 搜索路径中添加 app 文件夹以及排除当前目录
|
|
||||||
if (!SetDefaultDllDirectories(LOAD_LIBRARY_SEARCH_DEFAULT_DIRS)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!AddDllDirectory((_exeDir / APP_DIR).c_str())) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 若程序所在目录存在配置文件则为便携模式
|
|
||||||
_isPortableMode = Win32Helper::FileExists(StrHelper::Concat(
|
|
||||||
_exeDir.native(), L"\\" DATA_DIR L"\\config\\", CommonSharedConstants::CONFIG_FILENAME).c_str());
|
|
||||||
|
|
||||||
// 旧版本便携模式配置文件位置
|
|
||||||
_isPortableMode = _isPortableMode || Win32Helper::FileExists(StrHelper::Concat(
|
|
||||||
_exeDir.native(), L"\\config\\", CommonSharedConstants::CONFIG_FILENAME).c_str());
|
|
||||||
|
|
||||||
if (_isPortableMode) {
|
|
||||||
_workingDir = _exeDir / DATA_DIR;
|
|
||||||
} else {
|
|
||||||
wil::unique_cotaskmem_string localAppDataDir;
|
|
||||||
HRESULT hr = SHGetKnownFolderPath(
|
|
||||||
FOLDERID_LocalAppData, KF_FLAG_DEFAULT, NULL, localAppDataDir.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
_workingDir = StrHelper::Concat(localAppDataDir.get(), L"\\Magpie\\" DATA_DIR);
|
|
||||||
}
|
|
||||||
|
|
||||||
Win32Helper::CreateDir(_workingDir.c_str());
|
|
||||||
|
|
||||||
if (!SetCurrentDirectory(_workingDir.c_str())) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the APP_DIR folder under the executable's directory.
std::filesystem::path AppFolderManager::GetAppDir() const noexcept {
    std::filesystem::path appDir = _exeDir;
    appDir /= APP_DIR;
    return appDir;
}
|
|
||||||
|
|
||||||
// Returns the name of the logs folder as a relative path.
const wchar_t* AppFolderManager::GetLogsDir() const noexcept {
    return L"logs";
}
|
|
||||||
|
|
||||||
// Returns the name of the sources folder as a relative path.
const wchar_t* AppFolderManager::GetSourcesDir() const noexcept {
    return L"sources";
}
|
|
||||||
|
|
||||||
// Returns the name of the cache folder as a relative path.
const wchar_t* AppFolderManager::GetCacheDir() const noexcept {
    return L"cache";
}
|
|
||||||
|
|
||||||
// Returns the name of the config folder as a relative path.
const wchar_t* AppFolderManager::GetConfigDir() const noexcept {
    return L"config";
}
|
|
||||||
|
|
||||||
// Returns the built-in shader effects folder, located inside the APP_DIR folder
// under the executable's directory.
std::filesystem::path AppFolderManager::GetBuiltInShaderEffectsDir() const noexcept {
    std::filesystem::path shadersDir = _exeDir;
    shadersDir /= APP_DIR L"\\effects\\shaders";
    return shadersDir;
}
|
|
||||||
|
|
||||||
// Returns the D3D12 folder, located inside the APP_DIR folder under the
// executable's directory.
std::filesystem::path AppFolderManager::GetD3D12Dir() const noexcept {
    std::filesystem::path d3d12Dir = _exeDir;
    d3d12Dir /= APP_DIR L"\\D3D12";
    return d3d12Dir;
}
|
|
||||||
|
|
||||||
// Returns the update folder. It sits directly in the executable's directory and
// is only used when updating the non-packaged app.
std::filesystem::path AppFolderManager::GetUpdateDir() const noexcept {
    std::filesystem::path updateDir = _exeDir;
    updateDir /= CommonSharedConstants::UPDATE_DIR;
    return updateDir;
}
|
|
||||||
|
|
||||||
}
|
|
||||||
75
src/Magpie.Core/BackendDescriptorStore.cpp
Normal file
75
src/Magpie.Core/BackendDescriptorStore.cpp
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "BackendDescriptorStore.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Returns the cached shader resource view for `texture`, creating and caching
// one (with a null view description) on first request.
// Returns nullptr if CreateShaderResourceView fails; nothing is cached then.
ID3D11ShaderResourceView* BackendDescriptorStore::GetShaderResourceView(ID3D11Texture2D* texture) noexcept {
	const auto cached = _srvMap.find(texture);
	if (cached != _srvMap.end()) {
		return cached->second.get();
	}

	winrt::com_ptr<ID3D11ShaderResourceView> newSrv;
	const HRESULT hr = _d3dDevice->CreateShaderResourceView(texture, nullptr, newSrv.put());
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateShaderResourceView 失败", hr);
		return nullptr;
	}

	// Grab the raw pointer before ownership moves into the map.
	ID3D11ShaderResourceView* const rawSrv = newSrv.get();
	_srvMap.emplace(texture, std::move(newSrv));
	return rawSrv;
}
|
||||||
|
|
||||||
|
// Returns the cached unordered access view for `texture`, creating and caching
// a TEXTURE2D view on first request. All other description fields are zero.
// Returns nullptr if CreateUnorderedAccessView fails; nothing is cached then.
ID3D11UnorderedAccessView* BackendDescriptorStore::GetUnorderedAccessView(ID3D11Texture2D* texture) noexcept {
	const auto cached = _uavMap.find(texture);
	if (cached != _uavMap.end()) {
		return cached->second.get();
	}

	D3D11_UNORDERED_ACCESS_VIEW_DESC viewDesc{};
	viewDesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;

	winrt::com_ptr<ID3D11UnorderedAccessView> newUav;
	const HRESULT hr = _d3dDevice->CreateUnorderedAccessView(texture, &viewDesc, newUav.put());
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateUnorderedAccessView 失败", hr);
		return nullptr;
	}

	// Grab the raw pointer before ownership moves into the map.
	ID3D11UnorderedAccessView* const rawUav = newUav.get();
	_uavMap.emplace(texture, std::move(newUav));
	return rawUav;
}
|
||||||
|
|
||||||
|
// Returns the cached unordered access view for `buffer`, creating and caching
// a BUFFER view with the given format and element count on first request.
// Returns nullptr if CreateUnorderedAccessView fails; nothing is cached then.
//
// NOTE(review): the cache is keyed by the buffer pointer only, so on a cache
// hit `numElements` and `format` are ignored and the first-created view is
// returned — confirm callers never request different views of one buffer.
ID3D11UnorderedAccessView* BackendDescriptorStore::GetUnorderedAccessView(
	ID3D11Buffer* buffer,
	uint32_t numElements,
	DXGI_FORMAT format
) noexcept {
	const auto cached = _uavMap.find(buffer);
	if (cached != _uavMap.end()) {
		return cached->second.get();
	}

	D3D11_UNORDERED_ACCESS_VIEW_DESC viewDesc{};
	viewDesc.Format = format;
	viewDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
	viewDesc.Buffer.NumElements = numElements;

	winrt::com_ptr<ID3D11UnorderedAccessView> newUav;
	const HRESULT hr = _d3dDevice->CreateUnorderedAccessView(buffer, &viewDesc, newUav.put());
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateUnorderedAccessView 失败", hr);
		return nullptr;
	}

	// Grab the raw pointer before ownership moves into the map.
	ID3D11UnorderedAccessView* const rawUav = newUav.get();
	_uavMap.emplace(buffer, std::move(newUav));
	return rawUav;
}
|
||||||
|
|
||||||
|
// Drops any cached SRV and UAV that were created for `texture`.
// Erasing a key that is not present is a no-op.
void BackendDescriptorStore::RemoveCache(ID3D11Texture2D* texture) noexcept {
	_uavMap.erase(texture);
	_srvMap.erase(texture);
}
|
||||||
|
|
||||||
|
}
|
||||||
35
src/Magpie.Core/BackendDescriptorStore.h
Normal file
35
src/Magpie.Core/BackendDescriptorStore.h
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
#pragma once
|
||||||
|
#include <parallel_hashmap/phmap.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class BackendDescriptorStore {
|
||||||
|
public:
|
||||||
|
BackendDescriptorStore() = default;
|
||||||
|
BackendDescriptorStore(const BackendDescriptorStore&) = delete;
|
||||||
|
BackendDescriptorStore(BackendDescriptorStore&&) = default;
|
||||||
|
|
||||||
|
void Initialize(ID3D11Device5* d3dDevice) noexcept {
|
||||||
|
_d3dDevice = d3dDevice;
|
||||||
|
}
|
||||||
|
|
||||||
|
ID3D11ShaderResourceView* GetShaderResourceView(ID3D11Texture2D* texture) noexcept;
|
||||||
|
|
||||||
|
ID3D11UnorderedAccessView* GetUnorderedAccessView(ID3D11Texture2D* texture) noexcept;
|
||||||
|
|
||||||
|
ID3D11UnorderedAccessView* GetUnorderedAccessView(
|
||||||
|
ID3D11Buffer* buffer,
|
||||||
|
uint32_t numElements,
|
||||||
|
DXGI_FORMAT format = DXGI_FORMAT_UNKNOWN
|
||||||
|
) noexcept;
|
||||||
|
|
||||||
|
void RemoveCache(ID3D11Texture2D* texture) noexcept;
|
||||||
|
|
||||||
|
private:
|
||||||
|
ID3D11Device5* _d3dDevice = nullptr;
|
||||||
|
|
||||||
|
phmap::flat_hash_map<ID3D11Texture2D*, winrt::com_ptr<ID3D11ShaderResourceView>> _srvMap;
|
||||||
|
phmap::flat_hash_map<void*, winrt::com_ptr<ID3D11UnorderedAccessView>> _uavMap;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,277 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "CatmullRomDrawer.h"
|
|
||||||
#include "CommandContext.h"
|
|
||||||
#include "D3D12Context.h"
|
|
||||||
#include "DirectXHelper.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
#include "shaders/CatmullRomCS.h"
|
|
||||||
#include "shaders/CatmullRomCS_SM5.h"
|
|
||||||
#include "shaders/CatmullRomCS_sRGB.h"
|
|
||||||
#include "shaders/CatmullRomCS_sRGB_SM5.h"
|
|
||||||
#include "shaders/CopyCS.h"
|
|
||||||
#include "shaders/CopyCS_SM5.h"
|
|
||||||
#include "shaders/CopyCS_sRGB.h"
|
|
||||||
#include "shaders/CopyCS_sRGB_SM5.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// Stores a pointer to the D3D12 context. Root signatures and pipeline states
// are not created here; Draw() creates them on demand.
void CatmullRomDrawer::Initialize(D3D12Context& d3d12Context) noexcept {
	D3D12Context* const context = &d3d12Context;
	_d3d12Context = context;
}
|
|
||||||
|
|
||||||
// Records a compute dispatch that writes the input texture into the output.
//
// When the input and output sizes are equal the copy shader is used (a
// performance optimization); otherwise the Catmull-Rom shader is used.
// Root signatures and pipeline states are created lazily on first use.
//
// inputSrvOffset / outputUavOffset are passed to SetRootDescriptorTable.
// outputSrgb selects the sRGB variant of the shader.
// Returns S_OK, or the failing HRESULT from resource creation.
HRESULT CatmullRomDrawer::Draw(
	ComputeContext& computeContext,
	SizeU inputSize,
	SizeU outputSize,
	uint32_t inputSrvOffset,
	uint32_t outputUavOffset,
	bool outputSrgb
) noexcept {
	// Creates `pso` if it does not exist yet. The shader is chosen by shader
	// model: `sm6Shader` for SM 6.0 and above, `sm5Shader` otherwise.
	const auto ensurePSO = [this](
		winrt::com_ptr<ID3D12PipelineState>& pso,
		ID3D12RootSignature* rootSignature,
		const auto& sm6Shader,
		const auto& sm5Shader
	) noexcept -> HRESULT {
		if (pso) {
			return S_OK;
		}

		D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {
			.pRootSignature = rootSignature,
			.CS = DirectXHelper::SelectShader(
				_d3d12Context->GetShaderModel() >= D3D_SHADER_MODEL_6_0,
				sm6Shader,
				sm5Shader
			)
		};
		HRESULT hr = _d3d12Context->GetDevice()->CreateComputePipelineState(
			&psoDesc, IID_PPV_ARGS(&pso));
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateComputePipelineState 失败", hr);
		}
		return hr;
	};

	if (inputSize == outputSize) {
		// Same size: plain copy.
		if (!_copyRootSignature) {
			HRESULT hr = _InitializeCopyRootSignature();
			if (FAILED(hr)) {
				Logger::Get().ComError("_InitializeCopyRootSignature 失败", hr);
				return hr;
			}
		}
		computeContext.SetRootSignature(_copyRootSignature.get());

		winrt::com_ptr<ID3D12PipelineState>& pso = outputSrgb ? _copySrgbPSO : _copyPSO;
		HRESULT hr = outputSrgb
			? ensurePSO(pso, _copyRootSignature.get(), CopyCS_sRGB, CopyCS_sRGB_SM5)
			: ensurePSO(pso, _copyRootSignature.get(), CopyCS, CopyCS_SM5);
		if (FAILED(hr)) {
			return hr;
		}
		computeContext.SetPipelineState(pso.get());

		computeContext.SetRootDescriptorTable(0, inputSrvOffset);
		computeContext.SetRootDescriptorTable(1, outputUavOffset);
	} else {
		// Different size: Catmull-Rom scaling.
		if (!_catmullRomRootSignature) {
			HRESULT hr = _InitializeCatmullRomRootSignature();
			if (FAILED(hr)) {
				Logger::Get().ComError("_InitializeCatmullRomRootSignature 失败", hr);
				return hr;
			}
		}
		computeContext.SetRootSignature(_catmullRomRootSignature.get());

		winrt::com_ptr<ID3D12PipelineState>& pso = outputSrgb ? _catmullRomSrgbPSO : _catmullRomPSO;
		HRESULT hr = outputSrgb
			? ensurePSO(pso, _catmullRomRootSignature.get(), CatmullRomCS_sRGB, CatmullRomCS_sRGB_SM5)
			: ensurePSO(pso, _catmullRomRootSignature.get(), CatmullRomCS, CatmullRomCS_SM5);
		if (FAILED(hr)) {
			return hr;
		}
		computeContext.SetPipelineState(pso.get());

		// Six root constants: input size, then reciprocals of input and output size.
		DirectXHelper::Constant32 constants[] = {
			{.uintVal = inputSize.width},
			{.uintVal = inputSize.height},
			{.floatVal = 1.0f / inputSize.width},
			{.floatVal = 1.0f / inputSize.height},
			{.floatVal = 1.0f / outputSize.width},
			{.floatVal = 1.0f / outputSize.height}
		};
		computeContext.SetRoot32BitConstants(0, (UINT)std::size(constants), constants);

		computeContext.SetRootDescriptorTable(1, inputSrvOffset);
		computeContext.SetRootDescriptorTable(2, outputUavOffset);
	}

	// One thread group per BLOCK_SIZE x BLOCK_SIZE output tile, rounded up.
	constexpr uint32_t BLOCK_SIZE = 16;
	computeContext.Dispatch(
		(outputSize.width + BLOCK_SIZE - 1) / BLOCK_SIZE,
		(outputSize.height + BLOCK_SIZE - 1) / BLOCK_SIZE
	);

	return S_OK;
}
|
|
||||||
|
|
||||||
// Builds the root signature used by the Catmull-Rom shader:
//   parameter 0: six 32-bit root constants
//   parameter 1: descriptor table with one SRV (the input)
//   parameter 2: descriptor table with one UAV (the output)
// plus one static linear-clamp sampler at shader register 0.
// Returns S_OK, or the failing HRESULT from serialization or creation.
HRESULT CatmullRomDrawer::_InitializeCatmullRomRootSignature() noexcept {
	const CD3DX12_DESCRIPTOR_RANGE1 inputRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_STATIC_WHILE_SET_AT_EXECUTE);
	const CD3DX12_DESCRIPTOR_RANGE1 outputRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_VOLATILE);

	// Zero-initialized, so every field not assigned below stays 0.
	D3D12_ROOT_PARAMETER1 params[3]{};

	params[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
	params[0].Constants.Num32BitValues = 6;

	params[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
	params[1].DescriptorTable.NumDescriptorRanges = 1;
	params[1].DescriptorTable.pDescriptorRanges = &inputRange;

	params[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
	params[2].DescriptorTable.NumDescriptorRanges = 1;
	params[2].DescriptorTable.pDescriptorRanges = &outputRange;

	D3D12_STATIC_SAMPLER_DESC linearClampSampler{};
	linearClampSampler.Filter = D3D12_FILTER_MIN_MAG_MIP_LINEAR;
	linearClampSampler.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
	linearClampSampler.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
	linearClampSampler.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
	linearClampSampler.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER;
	linearClampSampler.ShaderRegister = 0;

	const CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC desc(
		static_cast<UINT>(std::size(params)), params, 1, &linearClampSampler);

	winrt::com_ptr<ID3DBlob> serialized;
	if (const HRESULT hr = D3DX12SerializeVersionedRootSignature(
		&desc,
		_d3d12Context->GetRootSignatureVersion(),
		serialized.put(),
		nullptr
	); FAILED(hr)) {
		Logger::Get().ComError("D3DX12SerializeVersionedRootSignature 失败", hr);
		return hr;
	}

	if (const HRESULT hr = _d3d12Context->GetDevice()->CreateRootSignature(
		0,
		serialized->GetBufferPointer(),
		serialized->GetBufferSize(),
		IID_PPV_ARGS(&_catmullRomRootSignature)
	); FAILED(hr)) {
		Logger::Get().ComError("CreateRootSignature 失败", hr);
		return hr;
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
// Builds the root signature used by the copy shader:
//   parameter 0: descriptor table with one SRV (the input)
//   parameter 1: descriptor table with one UAV (the output)
// No static samplers are declared.
// Returns S_OK, or the failing HRESULT from serialization or creation.
HRESULT CatmullRomDrawer::_InitializeCopyRootSignature() noexcept {
	const CD3DX12_DESCRIPTOR_RANGE1 inputRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_STATIC_WHILE_SET_AT_EXECUTE);
	const CD3DX12_DESCRIPTOR_RANGE1 outputRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_VOLATILE);

	// Zero-initialized, so every field not assigned below stays 0.
	D3D12_ROOT_PARAMETER1 params[2]{};

	params[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
	params[0].DescriptorTable.NumDescriptorRanges = 1;
	params[0].DescriptorTable.pDescriptorRanges = &inputRange;

	params[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
	params[1].DescriptorTable.NumDescriptorRanges = 1;
	params[1].DescriptorTable.pDescriptorRanges = &outputRange;

	const CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC desc(
		static_cast<UINT>(std::size(params)), params, 0, nullptr);

	winrt::com_ptr<ID3DBlob> serialized;
	if (const HRESULT hr = D3DX12SerializeVersionedRootSignature(
		&desc,
		_d3d12Context->GetRootSignatureVersion(),
		serialized.put(),
		nullptr
	); FAILED(hr)) {
		Logger::Get().ComError("D3DX12SerializeVersionedRootSignature 失败", hr);
		return hr;
	}

	if (const HRESULT hr = _d3d12Context->GetDevice()->CreateRootSignature(
		0,
		serialized->GetBufferPointer(),
		serialized->GetBufferSize(),
		IID_PPV_ARGS(&_copyRootSignature)
	); FAILED(hr)) {
		Logger::Get().ComError("CreateRootSignature 失败", hr);
		return hr;
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class D3D12Context;
|
|
||||||
class ComputeContext;
|
|
||||||
|
|
||||||
class CatmullRomDrawer {
|
|
||||||
public:
|
|
||||||
void Initialize(D3D12Context& d3d12Context) noexcept;
|
|
||||||
|
|
||||||
HRESULT Draw(
|
|
||||||
ComputeContext& computeContext,
|
|
||||||
SizeU inputSize,
|
|
||||||
SizeU outputSize,
|
|
||||||
uint32_t inputSrvOffset,
|
|
||||||
uint32_t outputUavOffset,
|
|
||||||
bool outputSrgb
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
private:
|
|
||||||
D3D12Context* _d3d12Context = nullptr;
|
|
||||||
|
|
||||||
HRESULT _InitializeCatmullRomRootSignature() noexcept;
|
|
||||||
HRESULT _InitializeCopyRootSignature() noexcept;
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12RootSignature> _catmullRomRootSignature;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _catmullRomPSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _catmullRomSrgbPSO;
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12RootSignature> _copyRootSignature;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _copyPSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _copySrgbPSO;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// Color space conversion helpers.
struct ColorHelper {
	// Converts an 8-bit sRGB-encoded channel value to its linear equivalent
	// using the piecewise sRGB transfer function.
	//
	// The result stays on the 0-255 scale; it is NOT normalized to [0, 1]
	// (input 0 maps to 0, input 255 maps to ~255).
	static float SrgbToLinear(uint8_t c) noexcept {
		// Lookup table built once on first call. It is const so it cannot be
		// modified after initialization.
		static const std::array<float, 256> lut = [] {
			std::array<float, 256> table{};
			for (uint32_t i = 0; i < 256; ++i) {
				// Normalized encoded value in [0, 1].
				const float encoded = i / 255.0f;
				if (encoded <= 0.04045f) {
					// Linear segment near black.
					table[i] = encoded / 12.92f * 255.0f;
				} else {
					// Power-curve segment.
					table[i] = std::pow((encoded + 0.055f) / 1.055f, 2.4f) * 255.0f;
				}
			}
			return table;
		}();

		return lut[c];
	}
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,129 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "CommandContext.h"
|
|
||||||
#include "DescriptorHeap.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// Sets the compute root signature on the command list.
void ComputeContext::SetRootSignature(ID3D12RootSignature* rootSignature) noexcept {
	ID3D12GraphicsCommandList* const commandList = _commandList;
	commandList->SetComputeRootSignature(rootSignature);
}
|
|
||||||
|
|
||||||
void ComputeContext::SetRoot32BitConstants(
|
|
||||||
uint32_t rootParameterIndex,
|
|
||||||
uint32_t constantCount,
|
|
||||||
const void* pData
|
|
||||||
) noexcept {
|
|
||||||
_commandList->SetComputeRoot32BitConstants(rootParameterIndex, constantCount, pData, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ComputeContext::SetComputeRootConstantBufferView(
|
|
||||||
uint32_t rootParameterIndex,
|
|
||||||
D3D12_GPU_VIRTUAL_ADDRESS bufferLocation
|
|
||||||
) noexcept {
|
|
||||||
// 存在 DATA_STATIC 标志时 SetComputeRootConstantBufferView 会检查资源状态
|
|
||||||
if (_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1) {
|
|
||||||
_FlushBarriers();
|
|
||||||
}
|
|
||||||
|
|
||||||
_commandList->SetComputeRootConstantBufferView(rootParameterIndex, bufferLocation);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ComputeContext::SetRootDescriptorTable(
|
|
||||||
uint32_t rootParameterIndex,
|
|
||||||
uint32_t baseDescriptorOffset
|
|
||||||
) noexcept {
|
|
||||||
assert(baseDescriptorOffset != std::numeric_limits<uint32_t>::max());
|
|
||||||
|
|
||||||
// 存在 DATA_STATIC 标志时 SetComputeRootDescriptorTable 会检查资源状态
|
|
||||||
if (_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1) {
|
|
||||||
_FlushBarriers();
|
|
||||||
}
|
|
||||||
|
|
||||||
_commandList->SetComputeRootDescriptorTable(
|
|
||||||
rootParameterIndex,
|
|
||||||
_d3d12Context->GetDescriptorHeap().GetGpuHandle(baseDescriptorOffset)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ComputeContext::Dispatch(
|
|
||||||
uint32_t threadGroupCountX,
|
|
||||||
uint32_t threadGroupCountY,
|
|
||||||
uint32_t threadGroupCountZ
|
|
||||||
) noexcept {
|
|
||||||
_FlushBarriers();
|
|
||||||
_commandList->Dispatch(threadGroupCountX, threadGroupCountY, threadGroupCountZ);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ComputeContext::ClearStateCache() noexcept {
|
|
||||||
_ClearStateCache();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sets the graphics root signature on the command list.
void GraphicsContext::SetRootSignature(ID3D12RootSignature* rootSignature) noexcept {
	ID3D12GraphicsCommandList* const commandList = _commandList;
	commandList->SetGraphicsRootSignature(rootSignature);
}
|
|
||||||
|
|
||||||
void GraphicsContext::SetRoot32BitConstants(
|
|
||||||
uint32_t rootParameterIndex,
|
|
||||||
uint32_t constantCount,
|
|
||||||
const void* pData
|
|
||||||
) noexcept {
|
|
||||||
_commandList->SetGraphicsRoot32BitConstants(rootParameterIndex, constantCount, pData, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void GraphicsContext::SetRootDescriptorTable(
|
|
||||||
uint32_t rootParameterIndex,
|
|
||||||
uint32_t baseDescriptorOffset
|
|
||||||
) noexcept {
|
|
||||||
assert(baseDescriptorOffset != std::numeric_limits<uint32_t>::max());
|
|
||||||
|
|
||||||
// 存在 DATA_STATIC 标志时 SetGraphicsRootDescriptorTable 会检查资源状态
|
|
||||||
if (_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1) {
|
|
||||||
_FlushBarriers();
|
|
||||||
}
|
|
||||||
|
|
||||||
_commandList->SetGraphicsRootDescriptorTable(
|
|
||||||
rootParameterIndex,
|
|
||||||
_d3d12Context->GetDescriptorHeap().GetGpuHandle(baseDescriptorOffset)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sets the primitive topology, skipping the command-list call when the
// requested topology equals the one cached from the previous call.
void GraphicsContext::IASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitiveTopology) noexcept {
	if (primitiveTopology == _curTrimitiveTopology) {
		return;
	}

	_curTrimitiveTopology = primitiveTopology;
	_commandList->IASetPrimitiveTopology(primitiveTopology);
}
|
|
||||||
|
|
||||||
void GraphicsContext::RSSetViewportAndScissorRect(const D3D12_RECT& rect) noexcept {
|
|
||||||
CD3DX12_VIEWPORT viewport((float)rect.left, (float)rect.top,
|
|
||||||
float(rect.right - rect.left), float(rect.bottom - rect.top));
|
|
||||||
_commandList->RSSetViewports(1, &viewport);
|
|
||||||
|
|
||||||
_commandList->RSSetScissorRects(1, &rect);
|
|
||||||
}
|
|
||||||
|
|
||||||
void GraphicsContext::OMSetRenderTarget(uint32_t rtvDescriptorOffset) noexcept {
|
|
||||||
assert(rtvDescriptorOffset != std::numeric_limits<uint32_t>::max());
|
|
||||||
|
|
||||||
if (rtvDescriptorOffset != _curRtvDescriptorOffset) {
|
|
||||||
_curRtvDescriptorOffset = rtvDescriptorOffset;
|
|
||||||
|
|
||||||
D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle =
|
|
||||||
_d3d12Context->GetDescriptorHeap(true).GetCpuHandle(rtvDescriptorOffset);
|
|
||||||
_commandList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void GraphicsContext::Draw(uint32_t vertexCount) noexcept {
|
|
||||||
_FlushBarriers();
|
|
||||||
_commandList->DrawInstanced(vertexCount, 1, 0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Resets cached state: the base-class cache, the cached render target offset
// and the cached primitive topology.
void GraphicsContext::ClearStateCache() noexcept {
	_ClearStateCache();

	_curRtvDescriptorOffset = std::numeric_limits<uint32_t>::max();
	_curTrimitiveTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,212 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include "D3D12Context.h"
|
|
||||||
#include "SmallVector.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class DescriptorHeap;
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
class CommandContext {
|
|
||||||
public:
|
|
||||||
CommandContext() noexcept = default;
|
|
||||||
CommandContext(const CommandContext&) = delete;
|
|
||||||
CommandContext(CommandContext&&) = delete;
|
|
||||||
|
|
||||||
void Initialize(D3D12Context& d3d12Context) noexcept {
|
|
||||||
_d3d12Context = &d3d12Context;
|
|
||||||
_commandList = d3d12Context.GetCommandList();
|
|
||||||
}
|
|
||||||
|
|
||||||
ID3D12GraphicsCommandList* GetCommandList() const noexcept {
|
|
||||||
return _commandList;
|
|
||||||
}
|
|
||||||
|
|
||||||
HRESULT Execute(ID3D12CommandQueue* commandQueue) noexcept {
|
|
||||||
_FlushBarriers();
|
|
||||||
|
|
||||||
HRESULT hr = _commandList->Close();
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("ID3D12GraphicsCommandList::Close 失败", hr);
|
|
||||||
return hr;
|
|
||||||
}
|
|
||||||
|
|
||||||
commandQueue->ExecuteCommandLists(1, CommandListCast(&_commandList));
|
|
||||||
|
|
||||||
((T*)this)->ClearStateCache();
|
|
||||||
|
|
||||||
return S_OK;
|
|
||||||
}
|
|
||||||
|
|
||||||
void SetPipelineState(ID3D12PipelineState* pipelineState) noexcept {
|
|
||||||
_commandList->SetPipelineState(pipelineState);
|
|
||||||
}
|
|
||||||
|
|
||||||
void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap) noexcept {
|
|
||||||
if (descriptorHeap != _curDescriptorHeap) {
|
|
||||||
_curDescriptorHeap = descriptorHeap;
|
|
||||||
_commandList->SetDescriptorHeaps(1, &descriptorHeap);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ID3D12DescriptorHeap* GetCurDescriptorHeap() const noexcept {
|
|
||||||
return _curDescriptorHeap;
|
|
||||||
}
|
|
||||||
|
|
||||||
void InsertTransitionBarrier(
|
|
||||||
ID3D12Resource* resource,
|
|
||||||
D3D12_RESOURCE_STATES stateBefore,
|
|
||||||
D3D12_RESOURCE_STATES stateAfter
|
|
||||||
) noexcept {
|
|
||||||
#ifdef _DEBUG
|
|
||||||
// 检查是否存在冗余的状态转换
|
|
||||||
auto it = std::find_if(
|
|
||||||
_pendingBarriers.begin(),
|
|
||||||
_pendingBarriers.end(),
|
|
||||||
[&](const D3D12_RESOURCE_BARRIER& barrier) {
|
|
||||||
return barrier.Transition.pResource == resource;
|
|
||||||
}
|
|
||||||
);
|
|
||||||
assert(it == _pendingBarriers.end());
|
|
||||||
#endif
|
|
||||||
_pendingBarriers.push_back(
|
|
||||||
CD3DX12_RESOURCE_BARRIER::Transition(resource, stateBefore, stateAfter, 0));
|
|
||||||
}
|
|
||||||
|
|
||||||
void CopyBufferRegion(
|
|
||||||
ID3D12Resource* destBuffer,
|
|
||||||
uint32_t destOffset,
|
|
||||||
ID3D12Resource* srcBuffer,
|
|
||||||
uint32_t srcOffset,
|
|
||||||
uint32_t numBytes,
|
|
||||||
bool shouldFlushBarriers
|
|
||||||
) noexcept {
|
|
||||||
if (shouldFlushBarriers) {
|
|
||||||
_FlushBarriers();
|
|
||||||
}
|
|
||||||
|
|
||||||
_commandList->CopyBufferRegion(destBuffer, destOffset, srcBuffer, srcOffset, numBytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CopyTextureRegion(
|
|
||||||
ID3D12Resource* destResource,
|
|
||||||
uint32_t dstX,
|
|
||||||
uint32_t dstY,
|
|
||||||
ID3D12Resource* srcResource,
|
|
||||||
const D3D12_BOX* pSrcBox = nullptr
|
|
||||||
) noexcept {
|
|
||||||
CopyTextureRegion(
|
|
||||||
CD3DX12_TEXTURE_COPY_LOCATION(destResource),
|
|
||||||
dstX,
|
|
||||||
dstY,
|
|
||||||
CD3DX12_TEXTURE_COPY_LOCATION(srcResource),
|
|
||||||
pSrcBox
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CopyTextureRegion(
|
|
||||||
const CD3DX12_TEXTURE_COPY_LOCATION& dest,
|
|
||||||
uint32_t dstX,
|
|
||||||
uint32_t dstY,
|
|
||||||
const CD3DX12_TEXTURE_COPY_LOCATION& src,
|
|
||||||
const D3D12_BOX* pSrcBox = nullptr
|
|
||||||
) noexcept {
|
|
||||||
_FlushBarriers();
|
|
||||||
_commandList->CopyTextureRegion(&dest, dstX, dstY, 0, &src, pSrcBox);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DiscardResource(ID3D12Resource* pResource) noexcept {
|
|
||||||
_commandList->DiscardResource(pResource, nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void _ClearStateCache() noexcept {
|
|
||||||
_FlushBarriers();
|
|
||||||
|
|
||||||
_curDescriptorHeap = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void _FlushBarriers() noexcept {
|
|
||||||
if (!_pendingBarriers.empty()) {
|
|
||||||
_commandList->ResourceBarrier(
|
|
||||||
(UINT)_pendingBarriers.size(), _pendingBarriers.data());
|
|
||||||
_pendingBarriers.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
D3D12Context* _d3d12Context = nullptr;
|
|
||||||
ID3D12GraphicsCommandList* _commandList = nullptr;
|
|
||||||
|
|
||||||
ID3D12DescriptorHeap* _curDescriptorHeap = nullptr;
|
|
||||||
|
|
||||||
SmallVector<D3D12_RESOURCE_BARRIER, 0> _pendingBarriers;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Command context exposing the compute-side root binding and dispatch calls.
class ComputeContext : public CommandContext<ComputeContext> {
public:
	// Sets the compute root signature.
	void SetRootSignature(ID3D12RootSignature* rootSignature) noexcept;

	// Sets `constantCount` 32-bit root constants from `pData`.
	void SetRoot32BitConstants(
		uint32_t rootParameterIndex,
		uint32_t constantCount,
		const void* pData
	) noexcept;

	// Binds a constant buffer view to a root parameter.
	void SetComputeRootConstantBufferView(
		uint32_t rootParameterIndex,
		D3D12_GPU_VIRTUAL_ADDRESS bufferLocation
	) noexcept;

	// Binds the descriptor table starting at `baseDescriptorOffset`.
	void SetRootDescriptorTable(
		uint32_t rootParameterIndex,
		uint32_t baseDescriptorOffset
	) noexcept;

	// Records a dispatch; Y and Z group counts default to 1.
	void Dispatch(
		uint32_t threadGroupCountX,
		uint32_t threadGroupCountY = 1,
		uint32_t threadGroupCountZ = 1
	) noexcept;

	// Called by CommandContext::Execute after the command list is submitted.
	void ClearStateCache() noexcept;
};
|
|
||||||
|
|
||||||
// Command context exposing the graphics-side root binding and draw calls.
// Caches the current primitive topology and render target offset.
class GraphicsContext : public CommandContext<GraphicsContext> {
public:
	// Sets the graphics root signature.
	void SetRootSignature(ID3D12RootSignature* rootSignature) noexcept;

	// Sets `constantCount` 32-bit root constants from `pData`.
	void SetRoot32BitConstants(
		uint32_t rootParameterIndex,
		uint32_t constantCount,
		const void* pData
	) noexcept;

	// Binds the descriptor table starting at `baseDescriptorOffset`.
	void SetRootDescriptorTable(
		uint32_t rootParameterIndex,
		uint32_t baseDescriptorOffset
	) noexcept;

	void IASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitiveTopology) noexcept;

	void RSSetViewportAndScissorRect(const D3D12_RECT& rect) noexcept;

	void OMSetRenderTarget(uint32_t rtvDescriptorOffset) noexcept;

	// Returns the cached render target descriptor offset, or
	// std::numeric_limits<uint32_t>::max() when none is cached.
	uint32_t OMGetRenderTarget() const noexcept {
		return _curRtvDescriptorOffset;
	}

	void Draw(uint32_t vertexCount) noexcept;

	// Called by CommandContext::Execute after the command list is submitted.
	void ClearStateCache() noexcept;

private:
	D3D12_PRIMITIVE_TOPOLOGY _curTrimitiveTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
	uint32_t _curRtvDescriptorOffset = std::numeric_limits<uint32_t>::max();
};
|
|
||||||
|
|
||||||
}
|
|
||||||
263
src/Magpie.Core/CompSwapchainPresenter.cpp
Normal file
263
src/Magpie.Core/CompSwapchainPresenter.cpp
Normal file
|
|
@ -0,0 +1,263 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "CompSwapchainPresenter.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Creates an IPresentationFactory for `d3dDevice` by calling the
// CreatePresentationFactory export of dcomp.dll, which is resolved at run time
// once and then reused.
// Returns an empty com_ptr when the export cannot be loaded; a failing call is
// logged.
static winrt::com_ptr<IPresentationFactory> CreatePresentationFactory(ID3D11Device* d3dDevice) noexcept {
	winrt::com_ptr<IPresentationFactory> factory;

	static const auto pfnCreatePresentationFactory =
		Win32Helper::LoadSystemFunction<decltype(::CreatePresentationFactory)>(
			L"dcomp.dll", "CreatePresentationFactory");

	if (pfnCreatePresentationFactory) {
		const HRESULT hr = pfnCreatePresentationFactory(d3dDevice, IID_PPV_ARGS(&factory));
		if (FAILED(hr)) {
			Logger::Get().ComError("CreatePresentationFactory 失败", hr);
		}
	}

	return factory;
}
|
||||||
|
|
||||||
|
// Sets up presentation through a composition swapchain attached to
// `hwndAttach`:
//   1. creates the DirectComposition device, target and root visual;
//   2. creates the presentation factory and manager;
//   3. creates a composition surface handle, wraps it in a presentation
//      surface and sets it as the visual's content;
//   4. commits, fetches the present-retiring fence and sizes the per-buffer
//      containers.
// Returns false (after logging) as soon as any step fails. Fails immediately
// on Windows 10.
bool CompSwapchainPresenter::_Initialize(HWND hwndAttach) noexcept {
	if (Win32Helper::GetOSVersion().IsWin10()) {
		Logger::Get().Error("OS 不支持 composition swapchain");
		return false;
	}

	if (const HRESULT hr = DCompositionCreateDevice3(nullptr, IID_PPV_ARGS(&_dcompDevice)); FAILED(hr)) {
		Logger::Get().ComError("DCompositionCreateDevice3 失败", hr);
		return false;
	}

	if (const HRESULT hr = _dcompDevice->CreateTargetForHwnd(hwndAttach, TRUE, _dcompTarget.put()); FAILED(hr)) {
		Logger::Get().ComError("CreateTargetForHwnd 失败", hr);
		return false;
	}

	if (const HRESULT hr = _dcompDevice->CreateVisual(_dcompVisual.put()); FAILED(hr)) {
		Logger::Get().ComError("CreateVisual 失败", hr);
		return false;
	}

	if (const HRESULT hr = _dcompTarget->SetRoot(_dcompVisual.get()); FAILED(hr)) {
		Logger::Get().ComError("SetRoot 失败", hr);
		return false;
	}

	const winrt::com_ptr<IPresentationFactory> factory =
		CreatePresentationFactory(_deviceResources->GetD3DDevice());
	if (!factory) {
		Logger::Get().Error("CreatePresentationFactory 失败");
		return false;
	}

	if (!factory->IsPresentationSupported()) {
		Logger::Get().Error("此 D3D 设备不支持 composition swapchain");
		return false;
	}

	// Missing independent flip support is only reported, not treated as fatal.
	if (!factory->IsPresentationSupportedWithIndependentFlip()) {
		Logger::Get().Info("此 D3D 设备不支持 independent flip");
	}

	if (const HRESULT hr = factory->CreatePresentationManager(_presentationManager.put()); FAILED(hr)) {
		Logger::Get().ComError("CreatePresentationManager 失败", hr);
		return false;
	}

	wil::unique_handle surfaceHandle;
	if (const HRESULT hr = DCompositionCreateSurfaceHandle(
		COMPOSITIONOBJECT_ALL_ACCESS,
		nullptr,
		surfaceHandle.put()
	); FAILED(hr)) {
		Logger::Get().ComError("DCompositionCreateSurfaceHandle 失败", hr);
		return false;
	}

	if (const HRESULT hr = _presentationManager->CreatePresentationSurface(
		surfaceHandle.get(), _presentationSurface.put()); FAILED(hr)) {
		Logger::Get().ComError("CreatePresentationSurface 失败", hr);
		return false;
	}

	winrt::com_ptr<IUnknown> dcompSurface;
	if (const HRESULT hr = _dcompDevice->CreateSurfaceFromHandle(
		surfaceHandle.get(), dcompSurface.put()); FAILED(hr)) {
		Logger::Get().ComError("CreateSurfaceFromHandle 失败", hr);
		return false;
	}

	if (const HRESULT hr = _dcompVisual->SetContent(dcompSurface.get()); FAILED(hr)) {
		Logger::Get().ComError("SetContent 失败", hr);
		return false;
	}

	if (const HRESULT hr = _dcompDevice->Commit(); FAILED(hr)) {
		Logger::Get().ComError("Commit 失败", hr);
		return false;
	}

	if (const HRESULT hr = _presentationManager->GetPresentRetiringFence(
		IID_PPV_ARGS(&_presentationFence)); FAILED(hr)) {
		Logger::Get().ComError("GetPresentRetiringFence 失败", hr);
		return false;
	}

	// Only the slots are allocated here; their contents are left empty.
	const uint32_t slotCount = _CalcBufferCount();
	_presentationBuffers.resize(slotCount);
	_presentationBufferAvailableEvents.resize(slotCount);
	_bufferTextures.resize(slotCount);
	_bufferRtvs.resize(slotCount);

	return true;
}
|
||||||
|
|
||||||
|
// Selects the back buffer that the upcoming frame will be rendered into.
//
// Buffers are created lazily: the first slot that has no presentation buffer
// yet is populated (texture, presentation buffer, "available" event) and used.
// If every slot is already populated, the call blocks until one of the
// buffers' available events is signaled and reuses that buffer.
//
// frameTex   - receives the texture of the chosen buffer.
// frameRtv   - receives a render target view for that texture (created on
//              first use of the slot and cached afterwards).
// drawOffset - always set to {0, 0}.
//
// Returns false (after logging) if any step fails. A slot is only committed
// once all of its resources were created successfully, so a failed call never
// leaves a half-initialized slot behind.
bool CompSwapchainPresenter::BeginFrame(
	winrt::com_ptr<ID3D11Texture2D>& frameTex,
	winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
	POINT& drawOffset
) noexcept {
	constexpr uint32_t INVALID_IDX = std::numeric_limits<uint32_t>::max();

	// Index of the buffer to render into
	uint32_t curIdx = INVALID_IDX;

	// Prefer a slot that has not been initialized yet
	const uint32_t bufferCount = (uint32_t)_presentationBuffers.size();
	for (uint32_t i = 0; i < bufferCount; ++i) {
		if (_presentationBuffers[i]) {
			continue;
		}

		const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());

		D3D11_TEXTURE2D_DESC desc{};
		desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
		desc.SampleDesc.Count = 1;
		desc.MipLevels = 1;
		desc.ArraySize = 1;
		desc.Width = (UINT)rendererSize.cx;
		desc.Height = (UINT)rendererSize.cy;
		desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET;
		desc.MiscFlags =
			D3D11_RESOURCE_MISC_SHARED |
			D3D11_RESOURCE_MISC_SHARED_NTHANDLE |
			D3D11_RESOURCE_MISC_SHARED_DISPLAYABLE;

		// Build everything in locals first. Writing straight into the member
		// slots would leave e.g. a texture without a presentation buffer when a
		// later step fails, and the retry would then call put() on a non-empty
		// com_ptr.
		winrt::com_ptr<ID3D11Texture2D> texture;
		HRESULT hr = _deviceResources->GetD3DDevice()->CreateTexture2D(
			&desc, nullptr, texture.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateTexture2D 失败", hr);
			return false;
		}

		winrt::com_ptr<IPresentationBuffer> buffer;
		hr = _presentationManager->AddBufferFromResource(texture.get(), buffer.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("AddBufferFromResource 失败", hr);
			return false;
		}

		wil::unique_event_nothrow availableEvent;
		hr = buffer->GetAvailableEvent(availableEvent.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("GetAvailableEvent 失败", hr);
			return false;
		}

		RECT srcRect{ 0,0,rendererSize.cx,rendererSize.cy };
		hr = _presentationSurface->SetSourceRect(&srcRect);
		if (FAILED(hr)) {
			Logger::Get().ComError("SetSourceRect 失败", hr);
			return false;
		}

		// All steps succeeded: commit the slot
		_bufferTextures[i] = texture;
		_presentationBuffers[i] = buffer;
		_presentationBufferAvailableEvents[i].reset(availableEvent.release());

		curIdx = i;
		break;
	}

	if (curIdx == INVALID_IDX) {
		// Every slot is populated: wait until some buffer becomes available
		const DWORD waitResult = WaitForMultipleObjects(
			bufferCount, (HANDLE*)_presentationBufferAvailableEvents.data(), FALSE, INFINITE);
		if (waitResult == WAIT_FAILED) {
			Logger::Get().Win32Error("WaitForMultipleObjects 失败");
			return false;
		}

		// Written without "bufferCount - 1" so that bufferCount == 0 cannot
		// wrap around and let an invalid result through as an index.
		if (waitResult >= WAIT_OBJECT_0 + bufferCount) {
			Logger::Get().Error("WaitForMultipleObjects 失败");
			return false;
		}

		curIdx = waitResult - WAIT_OBJECT_0;
	}

	HRESULT hr = _presentationSurface->SetBuffer(_presentationBuffers[curIdx].get());
	if (FAILED(hr)) {
		Logger::Get().ComError("SetBuffer 失败", hr);
		return false;
	}

	// The render target view is created on first use of the slot
	winrt::com_ptr<ID3D11RenderTargetView>& curRtv = _bufferRtvs[curIdx];
	if (!curRtv) {
		hr = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
			_bufferTextures[curIdx].get(), nullptr, curRtv.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateRenderTargetView 失败", hr);
			return false;
		}
	}

	drawOffset = {};
	frameTex = _bufferTextures[curIdx];
	frameRtv = curRtv;

	return true;
}
|
||||||
|
|
||||||
|
void CompSwapchainPresenter::EndFrame(bool waitForGpu) noexcept {
|
||||||
|
if (waitForGpu || _isResized) {
|
||||||
|
// 下面两个调用用于减少调整窗口尺寸时的边缘闪烁,参见 AdaptivePresenter::EndFrame
|
||||||
|
|
||||||
|
// 等待渲染完成
|
||||||
|
_WaitForGpu();
|
||||||
|
|
||||||
|
// 等待 DWM 开始合成新一帧
|
||||||
|
Win32Helper::WaitForDwmComposition();
|
||||||
|
}
|
||||||
|
|
||||||
|
_presentationManager->Present();
|
||||||
|
|
||||||
|
if (_isResized) {
|
||||||
|
_isResized = false;
|
||||||
|
} else {
|
||||||
|
// 确保前一帧渲染完成再渲染下一帧,既降低了 GPU 负载,也能降低延迟
|
||||||
|
_WaitForGpu();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drops all per-buffer resources after a resize and flags the next EndFrame
// as the first frame after a resize. Always returns true.
bool CompSwapchainPresenter::OnResize() noexcept {
	_isResized = true;

	// Buffers are recreated on demand in BeginFrame
	for (auto& buffer : _presentationBuffers) {
		buffer = nullptr;
	}

	for (auto& availableEvent : _presentationBufferAvailableEvents) {
		availableEvent = nullptr;
	}

	for (auto& texture : _bufferTextures) {
		texture = nullptr;
	}

	for (auto& rtv : _bufferRtvs) {
		rtv = nullptr;
	}

	return true;
}
|
||||||
|
|
||||||
|
}
|
||||||
41
src/Magpie.Core/CompSwapchainPresenter.h
Normal file
41
src/Magpie.Core/CompSwapchainPresenter.h
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
#pragma once
|
||||||
|
#include "PresenterBase.h"
|
||||||
|
#include <dcomp.h>
|
||||||
|
#include <Presentation.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class CompSwapchainPresenter final : public PresenterBase {
|
||||||
|
protected:
|
||||||
|
bool _Initialize(HWND hwndAttach) noexcept override;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool BeginFrame(
|
||||||
|
winrt::com_ptr<ID3D11Texture2D>& frameTex,
|
||||||
|
winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
|
||||||
|
POINT& drawOffset
|
||||||
|
) noexcept override;
|
||||||
|
|
||||||
|
void EndFrame(bool waitForGpu = false) noexcept override;
|
||||||
|
|
||||||
|
bool OnResize() noexcept override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
winrt::com_ptr<IDCompositionDesktopDevice> _dcompDevice;
|
||||||
|
winrt::com_ptr<IDCompositionTarget> _dcompTarget;
|
||||||
|
winrt::com_ptr<IDCompositionVisual2> _dcompVisual;
|
||||||
|
winrt::com_ptr<IDCompositionSurface> _dcompSurface;
|
||||||
|
|
||||||
|
winrt::com_ptr<IPresentationManager> _presentationManager;
|
||||||
|
winrt::com_ptr<IPresentationSurface> _presentationSurface;
|
||||||
|
winrt::com_ptr<ID3D11Fence> _presentationFence;
|
||||||
|
|
||||||
|
std::vector<winrt::com_ptr<IPresentationBuffer>> _presentationBuffers;
|
||||||
|
std::vector<wil::unique_event_nothrow> _presentationBufferAvailableEvents;
|
||||||
|
std::vector<winrt::com_ptr<ID3D11Texture2D>> _bufferTextures;
|
||||||
|
std::vector<winrt::com_ptr<ID3D11RenderTargetView>> _bufferRtvs;
|
||||||
|
|
||||||
|
bool _isResized = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,14 +1,9 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "ByteBuffer.h"
|
|
||||||
#include "SmallVector.h"
|
|
||||||
#include <parallel_hashmap/phmap.h>
|
#include <parallel_hashmap/phmap.h>
|
||||||
#include <wil/registry.h>
|
|
||||||
|
|
||||||
namespace Magpie {
|
namespace Magpie {
|
||||||
|
|
||||||
class D3D12Context;
|
class DeviceResources;
|
||||||
class GraphicsContext;
|
|
||||||
class DescriptorHeap;
|
|
||||||
|
|
||||||
class CursorDrawer {
|
class CursorDrawer {
|
||||||
public:
|
public:
|
||||||
|
|
@ -16,225 +11,75 @@ public:
|
||||||
CursorDrawer(const CursorDrawer&) = delete;
|
CursorDrawer(const CursorDrawer&) = delete;
|
||||||
CursorDrawer(CursorDrawer&&) = delete;
|
CursorDrawer(CursorDrawer&&) = delete;
|
||||||
|
|
||||||
~CursorDrawer() noexcept;
|
bool Initialize(DeviceResources& deviceResources) noexcept;
|
||||||
|
|
||||||
bool Initialize(
|
void Draw(ID3D11Texture2D* backBuffer, POINT drawOffset) noexcept;
|
||||||
D3D12Context& d3d12Context,
|
|
||||||
const RECT& srcRect,
|
|
||||||
const RECT& rendererRect,
|
|
||||||
const RECT& destRect,
|
|
||||||
const ColorInfo& colorInfo
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
void PrepareForDraw(HCURSOR hCursor, POINT cursorPos, bool& needRedraw) noexcept;
|
void IsCursorVisible(bool value) noexcept {
|
||||||
|
_isCursorVisible = value;
|
||||||
// backBuffer 不为空表示掩码光标在叠加层上
|
|
||||||
HRESULT Draw(
|
|
||||||
GraphicsContext& graphicsContext,
|
|
||||||
uint64_t frameFenceValue,
|
|
||||||
uint64_t completedFenceValue,
|
|
||||||
uint32_t curFrameSrvOffset,
|
|
||||||
ID3D12Resource* backBuffer = nullptr
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
void OnCursorVirtualizationChanged(bool value) noexcept {
|
|
||||||
_isCursorVirtualized = value;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void OnMovingChanged(bool value) noexcept {
|
bool IsCursorVisible() const noexcept {
|
||||||
_isMoving = value;
|
return _isCursorVisible;
|
||||||
}
|
}
|
||||||
|
|
||||||
void OnMoved(const RECT& rendererRect, const RECT& destRect) noexcept;
|
bool NeedRedraw() const noexcept;
|
||||||
|
|
||||||
void OnResized(const RECT& rendererRect, const RECT& destRect) noexcept;
|
|
||||||
|
|
||||||
void OnSrcMovingChanged(bool value) noexcept {
|
|
||||||
_isSrcMoving = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
void OnColorInfoChanged(const ColorInfo& colorInfo) noexcept;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// SDR 色域下使用 sRGB 空间,否则使用线性 RGB 空间。截至 Win11 25H2,Windows 在 WCG
|
std::pair<HCURSOR, POINT> _GetCursorState(bool& isActive) const noexcept;
|
||||||
// 和 HDR 下光标的色域和透明度经常变化,没有统一标准。
|
|
||||||
enum class _CursorType {
|
enum class _CursorType {
|
||||||
// 彩色光标
|
// 彩色光标,此时纹理中 RGB 通道已预乘 A 通道(premultiplied alpha),A 通道已预先取反
|
||||||
// 纹理格式: DXGI_FORMAT_R16G16B16A16_FLOAT
|
// 这是为了减少着色器的计算量以及确保(可能进行的)双线性差值的准确性
|
||||||
// 计算公式: FinalColor = CursorColor.rgb + ScreenColor * CursorColor.a
|
// 计算公式: FinalColor = ScreenColor * CursorColor.a + CursorColor
|
||||||
// 纹理中 RGB 通道已预乘 A 通道 (premultiplied alpha),A 通道已预先取反,这是为了
|
|
||||||
// 减少着色器的计算量以及确保 (可能进行的) 双线性插值的准确性。
|
|
||||||
Color = 0,
|
|
||||||
// 单色光标
|
|
||||||
// 纹理格式: DXGI_FORMAT_R8_UINT
|
|
||||||
// 高四位为 AND 掩码,低四位为 XOR 掩码,值只能是 0 或 0xf。
|
|
||||||
Monochrome,
|
|
||||||
// 彩色掩码光标
|
|
||||||
// 纹理格式: DXGI_FORMAT_R8G8B8A8_UNORM
|
// 纹理格式: DXGI_FORMAT_R8G8B8A8_UNORM
|
||||||
// A 通道只能是 0 或 255。为 0 时用 RGB 通道取代屏幕颜色,为 255 时将 RGB 通道和
|
Color = 0,
|
||||||
// 屏幕颜色进行异或操作。
|
// 彩色掩码光标,此时 A 通道可能为 0 或 255
|
||||||
MaskedColor
|
// 为 0 时表示 RGB 通道取代屏幕颜色,为 255 时表示 RGB 通道和屏幕颜色进行异或操作
|
||||||
};
|
// 纹理格式: DXGI_FORMAT_R8G8B8A8_UNORM
|
||||||
|
MaskedColor,
|
||||||
struct _CursorInfoKey {
|
// 单色光标,此时 R 通道为 AND 掩码,G 通道为 XOR 掩码,其他通道不使用
|
||||||
HCURSOR hCursor;
|
// RG 通道的值只能是 0 或 255
|
||||||
// DPI 为 0 表示此光标不随 DPI 缩放
|
// 纹理格式: DXGI_FORMAT_R8G8_UNORM
|
||||||
uint32_t dpi;
|
Monochrome
|
||||||
|
|
||||||
bool operator==(const _CursorInfoKey&) const = default;
|
|
||||||
|
|
||||||
// 供 phmap 使用
|
|
||||||
friend size_t hash_value(const _CursorInfoKey& key) noexcept {
|
|
||||||
return phmap::HashState().combine(phmap::Hash<HCURSOR>()(key.hCursor), key.dpi);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct _CursorFrame {
|
|
||||||
_CursorType type;
|
|
||||||
PointU hotspot;
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12Resource> texture;
|
|
||||||
SizeU resSize;
|
|
||||||
ByteBuffer resTextureData;
|
|
||||||
// 这两个资源使用完毕后在 _ClearRetiredResources 中释放
|
|
||||||
winrt::com_ptr<ID3D12Resource> uploadBuffer;
|
|
||||||
winrt::com_ptr<ID3D12Resource> resTexture;
|
|
||||||
uint64_t tempResourcesFenceValue = 0;
|
|
||||||
|
|
||||||
uint32_t textureSrvOffset = std::numeric_limits<uint32_t>::max();
|
|
||||||
uint32_t textureRtvOffset = std::numeric_limits<uint32_t>::max();
|
|
||||||
uint32_t resTextureSrvOffset = std::numeric_limits<uint32_t>::max();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct _CursorInfo {
|
struct _CursorInfo {
|
||||||
SizeU size;
|
POINT hotSpot{};
|
||||||
SmallVector<_CursorFrame, 1> frames;
|
SIZE size{};
|
||||||
// 序列表 (帧索引值数组),使多帧可以复用同一个 _CursorFrame。为空表示顺序播放
|
winrt::com_ptr<ID3D11ShaderResourceView> textureSrv = nullptr;
|
||||||
SmallVector<std::pair<uint32_t, std::chrono::nanoseconds>, 0> frameSequence;
|
_CursorType type = _CursorType::Color;
|
||||||
uint64_t lastUseFenceValue = 0;
|
|
||||||
|
|
||||||
bool IsAnimated() const noexcept {
|
|
||||||
return !frameSequence.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t GetFrameIdx(uint32_t seqIdx) const noexcept {
|
|
||||||
return IsAnimated() ? frameSequence[seqIdx].first : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void FreeDescriptors(
|
|
||||||
DescriptorHeap& csuDescriptorHeap,
|
|
||||||
DescriptorHeap& rtvDescriptorHeap
|
|
||||||
) const noexcept;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::pair<const _CursorInfoKey, _CursorInfo>* _ResolveCursor(
|
const _CursorInfo* _ResolveCursor(HCURSOR hCursor) noexcept;
|
||||||
HCURSOR hCursor,
|
|
||||||
POINT cursorPos
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
SizeU _CalcCursorSize(
|
bool _SetPremultipliedAlphaBlend() noexcept;
|
||||||
SizeU cursorBmpSize,
|
|
||||||
uint32_t cursorDpi,
|
|
||||||
uint32_t monitorDpi,
|
|
||||||
bool isCursorDpiAware
|
|
||||||
) const noexcept;
|
|
||||||
|
|
||||||
void _TryResolveCursorFramesFromSource(
|
DeviceResources* _deviceResources = nullptr;
|
||||||
HCURSOR hCursor,
|
|
||||||
const ICONINFOEX& iconInfoEx,
|
|
||||||
uint32_t preferedWidth,
|
|
||||||
SmallVectorImpl<wil::unique_hcursor>& frames,
|
|
||||||
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
|
|
||||||
) const noexcept;
|
|
||||||
|
|
||||||
bool _ResolveCursorFramePixels(
|
phmap::flat_hash_map<HCURSOR, _CursorInfo> _cursorInfos;
|
||||||
_CursorFrame& cursorFrame,
|
|
||||||
HBITMAP hColorBmp,
|
|
||||||
HBITMAP hMaskBmp
|
|
||||||
) const noexcept;
|
|
||||||
|
|
||||||
HRESULT _InitializeCursorTexture(
|
winrt::com_ptr<ID3D11VertexShader> _simpleVS;
|
||||||
GraphicsContext& graphicsContext,
|
winrt::com_ptr<ID3D11InputLayout> _simpleIL;
|
||||||
_CursorInfo& cursorInfo,
|
winrt::com_ptr<ID3D11Buffer> _vtxBuffer;
|
||||||
uint32_t cursorFrameIdx,
|
winrt::com_ptr<ID3D11PixelShader> _simplePS;
|
||||||
uint64_t completedFenceValue
|
winrt::com_ptr<ID3D11BlendState> premultipliedAlphaBlendBlendState;
|
||||||
) noexcept;
|
winrt::com_ptr<ID3D11PixelShader> _maskedCursorPS;
|
||||||
|
winrt::com_ptr<ID3D11PixelShader> _monochromeCursorPS;
|
||||||
|
|
||||||
// 只能在同步 GPU 后调用
|
// 用于渲染彩色掩码光标和单色光标的临时纹理
|
||||||
void _ClearCursorInfos() noexcept;
|
winrt::com_ptr<ID3D11Texture2D> _tempCursorTexture;
|
||||||
|
winrt::com_ptr<ID3D11ShaderResourceView> _tempCursorTextureRtv;
|
||||||
HRESULT _CreateColorPSO(bool isSrgb, winrt::com_ptr<ID3D12PipelineState>& result) noexcept;
|
SIZE _tempCursorTextureSize{};
|
||||||
|
|
||||||
HRESULT _CreateMaskPSO(
|
|
||||||
bool isMonochrome,
|
|
||||||
bool isSrgb,
|
|
||||||
winrt::com_ptr<ID3D12PipelineState>& result
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
HRESULT _CreateCursorResizerPSO() noexcept;
|
|
||||||
|
|
||||||
void _ClearRetiredResources(uint64_t completedFenceValue) noexcept;
|
|
||||||
|
|
||||||
void _OnCursorsRegChanged(wil::RegistryChangeKind) noexcept;
|
|
||||||
|
|
||||||
D3D12Context* _d3d12Context = nullptr;
|
|
||||||
SizeU _srcSize{};
|
|
||||||
RECT _rendererRect{};
|
|
||||||
RECT _destRect{};
|
|
||||||
ColorInfo _colorInfo;
|
|
||||||
|
|
||||||
// 监控“指针大小”选项变化
|
|
||||||
wil::unique_registry_watcher_nothrow _regWatcher;
|
|
||||||
DWORD _cursorBaseSize = 32;
|
|
||||||
|
|
||||||
phmap::flat_hash_map<_CursorInfoKey, _CursorInfo> _cursorInfos;
|
|
||||||
|
|
||||||
// 保存临时资源未被释放的 _CursorInfo。保存键而不是指针,以防 _cursorInfos 扩容后失效
|
|
||||||
SmallVector<_CursorInfoKey, 1> _cursorInfosWithTempResources;
|
|
||||||
|
|
||||||
// 保存 _cursorBaseSize 改变后失效的 _CursorInfo
|
|
||||||
SmallVector<_CursorInfo, 0> _retiredCursorInfos;
|
|
||||||
|
|
||||||
// 保存解析失败的光标以避免重复尝试
|
|
||||||
phmap::flat_hash_set<HCURSOR> _unresolvableCursors;
|
|
||||||
|
|
||||||
// 这两个成员用于检查自动隐藏光标
|
// 这两个成员用于检查自动隐藏光标
|
||||||
HCURSOR _lastRawCursorHandle = NULL;
|
HCURSOR _lastRawCursorHandle = NULL;
|
||||||
std::chrono::steady_clock::time_point _lastCursorActiveTime;
|
std::chrono::steady_clock::time_point _lastCursorActiveTime;
|
||||||
// 上次绘制的光标形状和位置
|
// 上次绘制的光标形状和位置
|
||||||
std::pair<const _CursorInfoKey, _CursorInfo>* _curCursorInfoKeyValue = nullptr;
|
HCURSOR _lastCursorHandle = NULL;
|
||||||
POINT _curCursorPos{ std::numeric_limits<LONG>::max(), std::numeric_limits<LONG>::max() };
|
POINT _lastCursorPos{ std::numeric_limits<LONG>::max(), std::numeric_limits<LONG>::max() };
|
||||||
// 这两个成员用于保存动态光标状态
|
|
||||||
uint32_t _curFrameSeqIdx = 0;
|
|
||||||
std::chrono::steady_clock::time_point _curFrameSeqEndTime;
|
|
||||||
|
|
||||||
// 用于从渲染目标复制光标下区域
|
|
||||||
winrt::com_ptr<ID3D12Resource> _tempOriginTexture;
|
|
||||||
SizeU _tempOriginTextureSize{};
|
|
||||||
uint32_t _tempOriginTextureSrvOffset = std::numeric_limits<uint32_t>::max();
|
|
||||||
|
|
||||||
struct _RetiredTempOriginTexture {
|
|
||||||
winrt::com_ptr<ID3D12Resource> texture;
|
|
||||||
uint64_t fenceValue;
|
|
||||||
uint32_t srvOffset;
|
|
||||||
};
|
|
||||||
SmallVector<_RetiredTempOriginTexture, 1> _retiredTempOriginTextures;
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12RootSignature> _colorRootSignature;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _colorPSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _colorSrgbPSO;
|
|
||||||
winrt::com_ptr<ID3D12RootSignature> _maskRootSignature;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _monochromePSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _monochromeSrgbPSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _maskedColorPSO;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _maskedColorSrgbPSO;
|
|
||||||
winrt::com_ptr<ID3D12RootSignature> _cursorResizerRootSignature;
|
|
||||||
winrt::com_ptr<ID3D12PipelineState> _cursorResizerPSO;
|
|
||||||
|
|
||||||
bool _isCursorVisible = true;
|
bool _isCursorVisible = true;
|
||||||
bool _isMoving = false;
|
|
||||||
bool _isCursorVirtualized = false;
|
|
||||||
bool _isSrcMoving = false;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,666 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "CursorHelper.h"
|
|
||||||
#include "ByteBuffer.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
#include "SmallVector.h"
|
|
||||||
#include "Win32Helper.h"
|
|
||||||
#include <mmsystem.h> // FOURCC
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
struct RTAG {
|
|
||||||
DWORD ckID;
|
|
||||||
DWORD ckSize;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Maps a stored icon/cursor dimension to its real value: a stored 0 stands
// for 256, every other value is returned unchanged.
static WORD GetRealIconSize(WORD size) noexcept {
	if (size != 0) {
		return size;
	}

	return (WORD)256;
}
|
|
||||||
|
|
||||||
// Loads a cursor from a RT_GROUP_CURSOR resource of a module.
//
// hModule        - module that contains the resource.
// resName        - name of the cursor group resource.
// preferredWidth - desired width; the smallest image that is at least this
//                  wide is chosen (the one with the highest bit count if
//                  several share that width). If every image is narrower, the
//                  widest one is used.
//
// Returns an empty handle (after logging) on failure.
wil::unique_hcursor CursorHelper::ExtractCursorFromModule(
	HMODULE hModule,
	LPCWSTR resName,
	uint32_t preferredWidth
) noexcept {
	HRSRC hRes = FindResource(hModule, resName, RT_GROUP_CURSOR);
	if (!hRes) {
		Logger::Get().Win32Error("FindResource 失败");
		return nullptr;
	}

	HGLOBAL hResLoad = LoadResource(hModule, hRes);
	if (!hResLoad) {
		Logger::Get().Win32Error("LoadResource 失败");
		return nullptr;
	}

	// Layout of the cursor group resource
#pragma pack(push, 2)
	// From https://learn.microsoft.com/en-us/windows/win32/menurc/resdir
	struct RESDIR {
		WORD Width;
		WORD Height;
		WORD Planes;
		WORD BitCount;
		DWORD BytesInRes;
		WORD IconCursorId;
	};

	// From https://learn.microsoft.com/en-us/windows/win32/menurc/newheader
	struct NEWHEADER {
		WORD Reserved;
		WORD ResType;
		WORD ResCount;
		RESDIR entries[1];
	};
#pragma pack(pop)

	// Size of NEWHEADER without the trailing entries array
	constexpr size_t HEADER_SIZE = sizeof(NEWHEADER) - sizeof(RESDIR);

	const NEWHEADER* header = (const NEWHEADER*)LockResource(hResLoad);
	const size_t groupSize = SizeofResource(hModule, hRes);
	// Never read the header of a missing or truncated resource
	if (!header || groupSize < HEADER_SIZE) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	if (header->Reserved != 0 || header->ResType != 2) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	const uint32_t resCount = header->ResCount;
	if (resCount == 0 || resCount > 256) {
		Logger::Get().Error("无可用光标资源");
		return nullptr;
	}

	// The declared entry count must fit into the resource, otherwise the loop
	// below would read past its end.
	if (groupSize < HEADER_SIZE + resCount * sizeof(RESDIR)) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	struct IconInfo {
		WORD width;
		WORD bitCount;
		WORD id;
	};
	SmallVector<IconInfo, 0> iconInfos(resCount);

	for (uint32_t i = 0; i < resCount; ++i) {
		const RESDIR& entry = header->entries[i];
		// A width of 0 stands for 256
		iconInfos[i] = IconInfo{
			GetRealIconSize(entry.Width),
			entry.BitCount,
			entry.IconCursorId
		};
	}

	// Ascending by width; for equal widths descending by bit count, so that
	// the first entry of a given width is the one with the highest bit count.
	std::sort(iconInfos.begin(), iconInfos.end(), [](const IconInfo& l, const IconInfo& r) {
		return l.width < r.width || (l.width == r.width && l.bitCount > r.bitCount);
	});

	// Exact match or the next larger image; fall back to the largest one
	WORD targetResId;
	{
		auto it = std::lower_bound(
			iconInfos.begin(),
			iconInfos.end(),
			preferredWidth,
			[](const IconInfo& iconInfo, uint32_t target) {
				return iconInfo.width < target;
			}
		);
		targetResId = it == iconInfos.end() ? iconInfos.back().id : it->id;
	}

	hRes = FindResource(hModule, MAKEINTRESOURCE(targetResId), RT_CURSOR);
	if (!hRes) {
		Logger::Get().Win32Error("FindResource 失败");
		return nullptr;
	}

	hResLoad = LoadResource(hModule, hRes);
	if (!hResLoad) {
		Logger::Get().Win32Error("LoadResource 失败");
		return nullptr;
	}

	PBYTE cursorBits = (PBYTE)LockResource(hResLoad);
	if (!cursorBits) {
		Logger::Get().Error("LockResource 失败");
		return nullptr;
	}

	HICON hIcon = CreateIconFromResourceEx(cursorBits,
		SizeofResource(hModule, hRes), FALSE, 0x30000, 0, 0, LR_DEFAULTCOLOR);
	if (!hIcon) {
		Logger::Get().Win32Error("CreateIconFromResourceEx 失败");
		return nullptr;
	}

	return wil::unique_hcursor(hIcon);
}
|
|
||||||
|
|
||||||
// CUR 文件结构如下,参考自 https://en.wikipedia.org/wiki/ICO_(file_format)#File_structure
|
|
||||||
// [ICONDIR]
|
|
||||||
// [ICONDIRENTRY 1]
|
|
||||||
// [ICONDIRENTRY 2]
|
|
||||||
// ...
|
|
||||||
// [位图 1]
|
|
||||||
// [位图 2]
|
|
||||||
// ...
|
|
||||||
// Creates a cursor from CUR data held in memory.
//
// fileData       - first byte of the CUR data (an ICONDIR followed by its
//                  ICONDIRENTRY array and the bitmaps they refer to).
// fileEnd        - one past the last valid byte.
// preferredWidth - desired width; the narrowest image that is at least this
//                  wide is chosen, or the widest image if all are narrower.
//
// All offsets and sizes read from the data are validated against the buffer
// size before they are used. Returns an empty handle (after logging) on
// failure.
static wil::unique_hcursor LoadIcoFromFileMap(
	const uint8_t* fileData,
	const uint8_t* fileEnd,
	uint32_t preferredWidth
) noexcept {
#pragma pack(push, 2)
	struct ICONDIR {
		WORD idReserved;
		WORD idType;
		WORD idCount;
	};

	struct ICONDIRENTRY {
		BYTE bWidth;
		BYTE bHeight;
		BYTE bColorCount;
		BYTE bReserved;
		WORD xHotSpot;
		WORD yHotSpot;
		DWORD dwBytesInRes;
		DWORD dwImageOffset;
	};

	struct LOCALHEADER {
		WORD xHotSpot;
		WORD yHotSpot;
	};
#pragma pack(pop)

	// Bounds are checked on sizes rather than on pointers: adding an untrusted
	// offset to a pointer can produce an out-of-range (or wrapped) pointer
	// before the comparison even happens.
	if (fileEnd < fileData || size_t(fileEnd - fileData) < sizeof(ICONDIR)) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}
	const size_t fileSize = size_t(fileEnd - fileData);

	uint32_t entryCount;
	{
		const ICONDIR& header = *(const ICONDIR*)fileData;

		if (header.idReserved != 0 || header.idType != 2) {
			Logger::Get().Error("不是光标资源");
			return nullptr;
		}

		if (header.idCount == 0 || header.idCount > 256) {
			Logger::Get().Error("无可用光标资源");
			return nullptr;
		}

		entryCount = header.idCount;
	}

	if (fileSize - sizeof(ICONDIR) < sizeof(ICONDIRENTRY) * entryCount) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}

	// Pick the narrowest image that is wide enough, otherwise the widest one.
	// Unlike module resources, CUR entries are not distinguished by bit count.
	const ICONDIRENTRY* targetEntry;
	{
		const ICONDIRENTRY* pEntries = (const ICONDIRENTRY*)(fileData + sizeof(ICONDIR));

		const ICONDIRENTRY* bestFit = nullptr;
		const ICONDIRENTRY* widest = &pEntries[0];
		for (uint32_t i = 0; i < entryCount; ++i) {
			const ICONDIRENTRY& entry = pEntries[i];
			const uint32_t width = GetRealIconSize(entry.bWidth);

			if (width > GetRealIconSize(widest->bWidth)) {
				widest = &entry;
			}

			if (width >= preferredWidth &&
				(!bestFit || width < GetRealIconSize(bestFit->bWidth)))
			{
				bestFit = &entry;
			}
		}

		targetEntry = bestFit ? bestFit : widest;
	}

	const size_t imageOffset = targetEntry->dwImageOffset;
	const size_t imageSize = targetEntry->dwBytesInRes;
	if (imageOffset > fileSize || imageSize > fileSize - imageOffset ||
		// The total size is handed to a DWORD parameter below
		imageSize > std::numeric_limits<DWORD>::max() - sizeof(LOCALHEADER))
	{
		Logger::Get().Error("文件无效");
		return nullptr;
	}

	const uint8_t* pCursorData = fileData + imageOffset;
	const DWORD cursorResSize = DWORD(sizeof(LOCALHEADER) + imageSize);

	// A RT_CURSOR resource is a LOCALHEADER followed by the bitmap data
	// https://learn.microsoft.com/en-us/windows/win32/menurc/resource-file-formats#cursor-and-icon-resources
	ByteBuffer cursorData(cursorResSize);
	// Hotspot
	*(LOCALHEADER*)cursorData.Data() = { targetEntry->xHotSpot, targetEntry->yHotSpot };
	// Bitmap data
	std::memcpy(cursorData.Data() + sizeof(LOCALHEADER), pCursorData, imageSize);

	wil::unique_hcursor hCursor(CreateIconFromResourceEx(cursorData.Data(),
		cursorResSize, FALSE, 0x30000, 0, 0, LR_DEFAULTCOLOR));
	if (!hCursor) {
		Logger::Get().Win32Error("CreateIconFromResourceEx 失败");
		return nullptr;
	}

	return hCursor;
}
|
|
||||||
|
|
||||||
// Converts a display rate given in jiffies (1/60 s) to a duration in
// nanoseconds, truncating any fractional nanosecond.
static std::chrono::nanoseconds JifRateToDuration(uint32_t jifRate) noexcept {
	constexpr long long NANOS_PER_SECOND = 1'000'000'000;
	// uint32_t max * 1e9 still fits into a signed 64-bit integer
	return std::chrono::nanoseconds(jifRate * NANOS_PER_SECOND / 60);
}
|
|
||||||
|
|
||||||
// RIFF 格式参见 https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
|
||||||
// ANI 文件结构如下,来自 https://en.wikipedia.org/wiki/ANI_(file_format)
|
|
||||||
// RIFF('ACON'
|
|
||||||
// [LIST('INFO'
|
|
||||||
// [INAM(<ZSTR>)] // 标题 (可选)
|
|
||||||
// [IART(<ZSTR>)] // 作者 (可选)
|
|
||||||
// )]
|
|
||||||
// 'anih'(<ANIHEADER>) // ANI 文件头
|
|
||||||
// ['rate'(<DWORD...>)] // 速率表 (jiffies 数组)。如果设置了 AF_SEQUENCE 标志,则数
|
|
||||||
// // 量为 ANIHEADER.cSteps,否则为 ANIHEADER.cFrames。
|
|
||||||
// ['seq '(<DWORD...>)] // 序列表 (帧索引值数组)。当设置 AF_SEQUENCE 标志时应存在,
|
|
||||||
// // 数量为 ANIHEADER.cSteps。
|
|
||||||
// LIST('fram' // 帧数据列表,数量为 ANIHEADER.cFrames
|
|
||||||
// 'icon'(<icon_data_1>) // 第 1 帧
|
|
||||||
// 'icon'(<icon_data_2>) // 第 2 帧
|
|
||||||
// ...
|
|
||||||
// )
|
|
||||||
// )
|
|
||||||
static bool LoadAniFromFileMap(
|
|
||||||
const uint8_t* fileData,
|
|
||||||
const uint8_t* fileEnd,
|
|
||||||
uint32_t preferredWidth,
|
|
||||||
SmallVectorImpl<wil::unique_hcursor>& frames,
|
|
||||||
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
|
|
||||||
) {
|
|
||||||
#pragma pack(push, 2)
|
|
||||||
struct ANIHEADER {
|
|
||||||
DWORD cbSizeof;
|
|
||||||
DWORD cFrames; // 帧数据列表元素数量
|
|
||||||
DWORD cSteps; // 序列表元素数量
|
|
||||||
DWORD cx, cy; // 不使用
|
|
||||||
DWORD cBitCount, cPlanes; // 不使用
|
|
||||||
DWORD jifRate; // 默认显示速率, 单位为 jiffy (1/60s)
|
|
||||||
DWORD fl; // 必须设置 AF_ICON,可选 AF_SEQUENCE
|
|
||||||
};
|
|
||||||
#pragma pack(pop)
|
|
||||||
|
|
||||||
constexpr DWORD FOURCC_ACON = mmioFOURCC('A', 'C', 'O', 'N');
|
|
||||||
constexpr DWORD FOURCC_anih = mmioFOURCC('a', 'n', 'i', 'h');
|
|
||||||
constexpr DWORD FOURCC_rate = mmioFOURCC('r', 'a', 't', 'e');
|
|
||||||
constexpr DWORD FOURCC_seq = mmioFOURCC('s', 'e', 'q', ' ');
|
|
||||||
constexpr DWORD FOURCC_fram = mmioFOURCC('f', 'r', 'a', 'm');
|
|
||||||
constexpr DWORD FOURCC_icon = mmioFOURCC('i', 'c', 'o', 'n');
|
|
||||||
|
|
||||||
constexpr DWORD AF_ICON = 0x1;
|
|
||||||
constexpr DWORD AF_SEQUENCE = 0x2;
|
|
||||||
|
|
||||||
// 已经检查 RIFF 头
|
|
||||||
fileData += sizeof(RTAG);
|
|
||||||
|
|
||||||
if (fileData + sizeof(uint32_t) > fileEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (*(uint32_t*)fileData != FOURCC_ACON) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
fileData += sizeof(uint32_t);
|
|
||||||
|
|
||||||
ANIHEADER aniHeader{};
|
|
||||||
uint32_t curFrameIdx = 0;
|
|
||||||
|
|
||||||
while (fileData + sizeof(RTAG) < fileEnd) {
|
|
||||||
RTAG tag = *(RTAG*)fileData;
|
|
||||||
fileData += sizeof(RTAG);
|
|
||||||
|
|
||||||
const uint8_t* chunkEnd = fileData + ((tag.ckSize + 1) & ~1);
|
|
||||||
if (chunkEnd > fileEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 不确定是不是强制的,在 Windows 的实现中,anih 块必须比 fram、rate 和 seq 块先
|
|
||||||
// 出现。我们和系统保持一致,这可以简化代码。
|
|
||||||
switch (tag.ckID) {
|
|
||||||
case FOURCC_anih:
|
|
||||||
{
|
|
||||||
if (fileData + sizeof(ANIHEADER) > chunkEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
aniHeader = *(ANIHEADER*)fileData;
|
|
||||||
|
|
||||||
if (aniHeader.cbSizeof != sizeof(ANIHEADER) ||
|
|
||||||
aniHeader.cFrames == 0 ||
|
|
||||||
((aniHeader.fl & AF_SEQUENCE) && aniHeader.cSteps == 0) ||
|
|
||||||
!(aniHeader.fl & AF_ICON))
|
|
||||||
{
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
frames.resize(aniHeader.cFrames);
|
|
||||||
|
|
||||||
// 如果只有一帧则不是动态光标
|
|
||||||
if (aniHeader.cFrames > 1) {
|
|
||||||
if (aniHeader.fl & AF_SEQUENCE) {
|
|
||||||
frameSequence.resize(aniHeader.cSteps);
|
|
||||||
// 用于检查 seq 块是否存在
|
|
||||||
frameSequence[0].first = std::numeric_limits<uint32_t>::max();
|
|
||||||
} else {
|
|
||||||
frameSequence.resize(aniHeader.cFrames);
|
|
||||||
// 逐帧播放
|
|
||||||
for (uint32_t i = 0; i < frameSequence.size(); ++i) {
|
|
||||||
frameSequence[i].first = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& pair : frameSequence) {
|
|
||||||
pair.second = JifRateToDuration(aniHeader.jifRate);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case FOURCC_LIST:
|
|
||||||
{
|
|
||||||
if (fileData + sizeof(uint32_t) > chunkEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果不是 fram 块则跳过此 LIST 块
|
|
||||||
if (*(uint32_t*)fileData != FOURCC_fram) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
fileData += sizeof(uint32_t);
|
|
||||||
|
|
||||||
// 确保已解析 anih 块
|
|
||||||
if (aniHeader.cbSizeof == 0) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (curFrameIdx == aniHeader.cFrames) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (fileData + sizeof(RTAG) < chunkEnd) {
|
|
||||||
tag = *(RTAG*)fileData;
|
|
||||||
fileData += sizeof(RTAG);
|
|
||||||
|
|
||||||
const uint8_t* subChunkEnd = fileData + ((tag.ckSize + 1) & ~1);
|
|
||||||
if (subChunkEnd > chunkEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tag.ckID == FOURCC_icon) {
|
|
||||||
wil::unique_hcursor hCursor = LoadIcoFromFileMap(fileData, subChunkEnd, preferredWidth);
|
|
||||||
if (hCursor) {
|
|
||||||
frames[curFrameIdx++] = std::move(hCursor);
|
|
||||||
} else {
|
|
||||||
Logger::Get().Error("LoadIcoFromFileMap 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (curFrameIdx == aniHeader.cFrames) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileData = subChunkEnd;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case FOURCC_rate:
|
|
||||||
{
|
|
||||||
// 确保已解析 anih 块
|
|
||||||
if (aniHeader.cbSizeof == 0) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 只有一帧则忽略 rate 块
|
|
||||||
if (frameSequence.empty()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fileData + sizeof(uint32_t) * frameSequence.size() > chunkEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& pair : frameSequence) {
|
|
||||||
pair.second = JifRateToDuration(*(uint32_t*)fileData);
|
|
||||||
fileData += sizeof(uint32_t);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case FOURCC_seq:
|
|
||||||
{
|
|
||||||
// 确保已解析 anih 块
|
|
||||||
if (aniHeader.cbSizeof == 0) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 无 AF_SEQUENCE 标志或只有一帧时忽略 seq 块
|
|
||||||
if (!(aniHeader.fl & AF_SEQUENCE) || frameSequence.empty()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fileData + sizeof(uint32_t) * aniHeader.cSteps > chunkEnd) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& pair : frameSequence) {
|
|
||||||
pair.first = *(uint32_t*)fileData;
|
|
||||||
fileData += sizeof(uint32_t);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fileData = chunkEnd;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 确保所有帧都已提取
|
|
||||||
if (frames.empty() || curFrameIdx != aniHeader.cFrames) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 只有一帧时 frameSequence 为空
|
|
||||||
if (frameSequence.empty()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto& pair : frameSequence) {
|
|
||||||
// 确保持续时间不为 0
|
|
||||||
if (pair.second.count() == 0) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (aniHeader.fl & AF_SEQUENCE) {
|
|
||||||
std::vector<bool> frameInUse(aniHeader.cFrames);
|
|
||||||
|
|
||||||
for (const auto& pair : frameSequence) {
|
|
||||||
// 检查序列是否合法
|
|
||||||
if (pair.first >= aniHeader.cFrames) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
frameInUse[pair.first] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 删除未被使用的帧
|
|
||||||
for (int i = aniHeader.cFrames - 1; i >= 0; --i) {
|
|
||||||
if (frameInUse[i]) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
frames.erase(frames.begin() + i);
|
|
||||||
|
|
||||||
// 删除一帧后调整索引
|
|
||||||
for (auto& pair : frameSequence) {
|
|
||||||
if (pair.first > (uint32_t)i) {
|
|
||||||
--pair.first;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 只剩一帧则不是动态光标
|
|
||||||
if (frames.size() == 1) {
|
|
||||||
frameSequence.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CursorHelper::ExtractCursorFramesFromFile(
|
|
||||||
const wchar_t* fileName,
|
|
||||||
uint32_t preferredWidth,
|
|
||||||
SmallVectorImpl<wil::unique_hcursor>& frames,
|
|
||||||
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
|
|
||||||
) noexcept {
|
|
||||||
assert(frames.empty() && frameSequence.empty());
|
|
||||||
|
|
||||||
CREATEFILE2_EXTENDED_PARAMETERS extendedParams{
|
|
||||||
.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS),
|
|
||||||
.dwFileAttributes = FILE_ATTRIBUTE_NORMAL,
|
|
||||||
.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN
|
|
||||||
};
|
|
||||||
wil::unique_hfile hFile(CreateFile2(
|
|
||||||
fileName, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams));
|
|
||||||
if (!hFile) {
|
|
||||||
Logger::Get().Win32Error("CreateFile2 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const DWORD fileSize = GetFileSize(hFile.get(), nullptr);
|
|
||||||
// 这个检查确保可以访问 RIFF 头,也确保不会把空文件传给 CreateFileMapping
|
|
||||||
if (fileSize < sizeof(RTAG)) {
|
|
||||||
Logger::Get().Error("文件无效");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
wil::unique_handle hFileMap(CreateFileMapping(
|
|
||||||
hFile.get(), nullptr, PAGE_READONLY, 0, 0, nullptr));
|
|
||||||
if (!hFileMap) {
|
|
||||||
Logger::Get().Win32Error("CreateFileMapping 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
wil::unique_mapview_ptr<const uint8_t> fileData((const uint8_t*)MapViewOfFile(
|
|
||||||
hFileMap.get(), FILE_MAP_READ, 0, 0, 0));
|
|
||||||
if (!fileData) {
|
|
||||||
Logger::Get().Win32Error("MapViewOfFile 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint8_t* fileEnd = fileData.get() + fileSize;
|
|
||||||
|
|
||||||
// 存在 RIFF 头则 ani,否则为 ico
|
|
||||||
if (((RTAG*)fileData.get())->ckID == FOURCC_RIFF) {
|
|
||||||
if (!LoadAniFromFileMap(fileData.get(), fileEnd, preferredWidth, frames, frameSequence)) {
|
|
||||||
Logger::Get().Error("LoadAniFromFileMap 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
wil::unique_hcursor hCursor = LoadIcoFromFileMap(fileData.get(), fileEnd, preferredWidth);
|
|
||||||
if (hCursor) {
|
|
||||||
frames.push_back(std::move(hCursor));
|
|
||||||
} else {
|
|
||||||
Logger::Get().Error("LoadIcoFromFileMap 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tries to split an animated cursor into its frames using the user32 export
// GetCursorFrameInfo, which is looked up by name at runtime.
//
// hCursor        Cursor to inspect; must not be null.
// frames         Must be empty on entry; receives each distinct frame handle.
//                The handles are returned by GetCursorFrameInfo and must not
//                be destroyed by the caller.
// frameSequence  Must be empty on entry; receives one (index into frames,
//                duration) pair per animation step.
//
// Both outputs are empty on return when the export is unavailable, a query
// fails, or the cursor has at most one step.
void CursorHelper::TryResolveAnimatedCursor(
	HCURSOR hCursor,
	SmallVectorImpl<HCURSOR>& frames,
	SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) noexcept {
	assert(hCursor && frames.empty() && frameSequence.empty());

	using FnGetCursorFrameInfo = HCURSOR WINAPI(
		HCURSOR hcur,
		LPWSTR lpName,
		int iFrame,
		LPDWORD pjifRate,
		LPINT pccur
	);

	static FnGetCursorFrameInfo* getCursorFrameInfo = [] {
		return Win32Helper::LoadFunction<FnGetCursorFrameInfo>(L"user32.dll", "GetCursorFrameInfo");
	}();

	if (!getCursorFrameInfo) {
		return;
	}

	// GetCursorFrameInfo returns internal handles directly; nothing to destroy
	DWORD jifRate = 0;
	int stepCount = 0;
	HCURSOR hCursorFrame = getCursorFrameInfo(hCursor, nullptr, 0, &jifRate, &stepCount);
	if (!hCursorFrame || stepCount <= 1) {
		// Failed, or not an animated cursor
		return;
	}

	frames.reserve(stepCount);
	frameSequence.resize(stepCount);

	frames.push_back(hCursorFrame);
	frameSequence[0] = { 0, JifRateToDuration(jifRate) };

	for (int i = 1; i < stepCount; ++i) {
		// stepCount is the loop bound and the size of frameSequence, so the
		// per-step query must not overwrite it; use a scratch variable.
		int ignoredStepCount = 0;
		hCursorFrame = getCursorFrameInfo(hCursor, nullptr, i, &jifRate, &ignoredStepCount);
		if (!hCursorFrame) {
			// Make sure the result is empty on failure
			frames.clear();
			frameSequence.clear();
			return;
		}

		// Store each distinct frame once; repeats are expressed through the
		// sequence table.
		const uint32_t frameCount = (uint32_t)frames.size();
		uint32_t j = 0;
		for (; j < frameCount; ++j) {
			if (frames[j] == hCursorFrame) {
				break;
			}
		}

		if (j == frameCount) {
			frames.push_back(hCursorFrame);
		}

		frameSequence[i] = { j, JifRateToDuration(jifRate) };
	}
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include "SmallVector.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// Groups the cursor-loading helpers; all members are static functions.
struct CursorHelper {
|
|
||||||
// If there is no perfect match, prefers extracting a larger resource.
// NOTE(review): the match is presumably against preferredWidth - confirm in the implementation.
|
|
||||||
static wil::unique_hcursor ExtractCursorFromModule(
|
|
||||||
HMODULE hModule,
|
|
||||||
LPCWSTR resName,
|
|
||||||
uint32_t preferredWidth
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
// Supports .ico and .ani files.
|
|
||||||
static bool ExtractCursorFramesFromFile(
|
|
||||||
const wchar_t* fileName,
|
|
||||||
uint32_t preferredWidth,
|
|
||||||
SmallVectorImpl<wil::unique_hcursor>& frames,
|
|
||||||
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
// The handles stored in frames do not need to be destroyed.
|
|
||||||
static void TryResolveAnimatedCursor(
|
|
||||||
HCURSOR hCursor,
|
|
||||||
SmallVectorImpl<HCURSOR>& frames,
|
|
||||||
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
|
|
||||||
) noexcept;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -8,54 +8,55 @@ public:
|
||||||
CursorManager(const CursorManager&) = delete;
|
CursorManager(const CursorManager&) = delete;
|
||||||
CursorManager(CursorManager&&) = delete;
|
CursorManager(CursorManager&&) = delete;
|
||||||
|
|
||||||
void Initialize(
|
|
||||||
const RECT& srcRect,
|
|
||||||
const RECT& rendererRect,
|
|
||||||
const RECT& destRect,
|
|
||||||
bool isSrcMoving,
|
|
||||||
bool isSrcFocused
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
~CursorManager() noexcept;
|
~CursorManager() noexcept;
|
||||||
|
|
||||||
std::pair<HCURSOR, POINT> Update() noexcept;
|
void Update() noexcept;
|
||||||
|
|
||||||
void OnResizingChanged(bool value) noexcept;
|
void OnScalingPosChanged() noexcept;
|
||||||
|
|
||||||
void OnResized(const RECT& rendererRect, const RECT& destRect) noexcept;
|
void OnSrcStartMove() noexcept;
|
||||||
|
|
||||||
void OnMovingChanged(bool value) noexcept;
|
void OnSrcEndMove() noexcept;
|
||||||
|
|
||||||
void OnMoved(const RECT& rendererRect, const RECT& destRect) noexcept;
|
void OnStartMove() noexcept;
|
||||||
|
|
||||||
void OnSrcMovingChanged(bool value) noexcept;
|
void OnEndResizeMove() noexcept;
|
||||||
|
|
||||||
void OnSrcMoved(const RECT& srcRect) noexcept;
|
void OnSrcRectChanged() noexcept;
|
||||||
|
|
||||||
void OnSrcFocusChanged(bool focused) noexcept;
|
// 光标不在缩放窗口上或隐藏时为 NULL
|
||||||
|
HCURSOR CursorHandle() const noexcept {
|
||||||
|
return _hCursor;
|
||||||
|
}
|
||||||
|
|
||||||
void OnCursorOnOverlayChanged(bool value) noexcept;
|
// 屏幕坐标
|
||||||
|
POINT CursorPos() const noexcept {
|
||||||
|
return _cursorPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsCursorCaptured() const noexcept {
|
||||||
|
return _isUnderCapture;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsCursorCapturedOnForeground() const noexcept {
|
||||||
|
return _isCapturedOnForeground;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsCursorOnOverlay() const noexcept {
|
||||||
|
return _isOnOverlay;
|
||||||
|
}
|
||||||
|
void IsCursorOnOverlay(bool value) noexcept;
|
||||||
|
|
||||||
bool IsCursorCapturedOnOverlay() const noexcept {
|
bool IsCursorCapturedOnOverlay() const noexcept {
|
||||||
return _isCapturedOnOverlay;
|
return _isCapturedOnOverlay;
|
||||||
}
|
}
|
||||||
void IsCursorCapturedOnOverlay(bool value) noexcept;
|
void IsCursorCapturedOnOverlay(bool value) noexcept;
|
||||||
|
|
||||||
int16_t GetSrcHitTest() const noexcept {
|
int16_t SrcHitTest() const noexcept {
|
||||||
return _lastCompletedHitTestResult;
|
return _lastCompletedHitTestResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
POINT _SrcToScaling(POINT pt, bool skipBorder) const noexcept;
|
|
||||||
|
|
||||||
enum class _RoundMethod {
|
|
||||||
Round,
|
|
||||||
Floor,
|
|
||||||
Ceil
|
|
||||||
};
|
|
||||||
|
|
||||||
POINT _ScalingToSrc(POINT pt, _RoundMethod roundType = _RoundMethod::Round) const noexcept;
|
|
||||||
|
|
||||||
void _ShowSystemCursor(bool show, bool onDestory = false);
|
void _ShowSystemCursor(bool show, bool onDestory = false);
|
||||||
|
|
||||||
void _AdjustCursorSpeed() noexcept;
|
void _AdjustCursorSpeed() noexcept;
|
||||||
|
|
@ -76,18 +77,14 @@ private:
|
||||||
|
|
||||||
void _UpdateCursorPos() noexcept;
|
void _UpdateCursorPos() noexcept;
|
||||||
|
|
||||||
void _StartVirtualization(POINT& cursorPos) noexcept;
|
void _StartCapture(POINT& cursorPos) noexcept;
|
||||||
|
|
||||||
bool _StopVirtualization(POINT& cursorPos, bool onDestroy = false) noexcept;
|
bool _StopCapture(POINT& cursorPos, bool onDestroy = false) noexcept;
|
||||||
|
|
||||||
void _SetClipCursor(const RECT& clipRect, bool is3DGameMode = false) noexcept;
|
void _SetClipCursor(const RECT& clipRect, bool is3DGameMode = false) noexcept;
|
||||||
|
|
||||||
void _RestoreClipCursor() noexcept;
|
void _RestoreClipCursor() noexcept;
|
||||||
|
|
||||||
RECT _srcRect{};
|
|
||||||
RECT _rendererRect{};
|
|
||||||
RECT _destRect{};
|
|
||||||
|
|
||||||
HCURSOR _hCursor = NULL;
|
HCURSOR _hCursor = NULL;
|
||||||
POINT _cursorPos{ std::numeric_limits<LONG>::max() };
|
POINT _cursorPos{ std::numeric_limits<LONG>::max() };
|
||||||
|
|
||||||
|
|
@ -107,13 +104,8 @@ private:
|
||||||
POINT _lastCompletedHitTestPos{ std::numeric_limits<LONG>::max() };
|
POINT _lastCompletedHitTestPos{ std::numeric_limits<LONG>::max() };
|
||||||
int16_t _lastCompletedHitTestResult = HTNOWHERE;
|
int16_t _lastCompletedHitTestResult = HTNOWHERE;
|
||||||
|
|
||||||
bool _isMoving = false;
|
bool _isUnderCapture = false;
|
||||||
bool _isResizing = false;
|
// 当缩放后的光标位置在交换链窗口上且没有被其他窗口挡住时应绘制光标
|
||||||
bool _isSrcMoving = false;
|
|
||||||
bool _isSrcFocused = false;
|
|
||||||
|
|
||||||
bool _isVirtualized = false;
|
|
||||||
// 当缩放后的光标位置在渲染矩形内且没有被其他窗口挡住时应绘制光标
|
|
||||||
bool _shouldDrawCursor = false;
|
bool _shouldDrawCursor = false;
|
||||||
|
|
||||||
bool _isCapturedOnForeground = false;
|
bool _isCapturedOnForeground = false;
|
||||||
|
|
|
||||||
|
|
@ -1,634 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "D3D12Context.h"
|
|
||||||
#include "ScalingWindow.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
#include "AppFolderManager.h"
|
|
||||||
#include "DirectXHelper.h"
|
|
||||||
#include "StrHelper.h"
|
|
||||||
#include "DescriptorHeap.h"
|
|
||||||
#include "Win32Helper.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
bool D3D12Context::Initialize(
|
|
||||||
const GraphicsCardId& graphicsCardId,
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
DescriptorHeap& csuDescriptorHeap,
|
|
||||||
DescriptorHeap& rtvDescriptorHeap,
|
|
||||||
bool disableFrameFenceTracking
|
|
||||||
) noexcept {
|
|
||||||
_csuDescriptorHeap = &csuDescriptorHeap;
|
|
||||||
_rtvDescriptorHeap = &rtvDescriptorHeap;
|
|
||||||
|
|
||||||
HRESULT hr = _CreateDXGIFactory();
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!_CreateAdapterAndDevice(graphicsCardId)) {
|
|
||||||
Logger::Get().Error("_CreateAdapterAndDevice 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _DEBUG
|
|
||||||
// 调试层汇报错误或警告时中断
|
|
||||||
if (winrt::com_ptr<ID3D12InfoQueue> infoQueue = _device.try_as<ID3D12InfoQueue>()) {
|
|
||||||
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, TRUE);
|
|
||||||
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, TRUE);
|
|
||||||
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_WARNING, TRUE);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
_QueryHighestShaderModel();
|
|
||||||
|
|
||||||
// 检查根签名版本
|
|
||||||
{
|
|
||||||
D3D12_FEATURE_DATA_ROOT_SIGNATURE featureData = { .HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_1 };
|
|
||||||
hr = _device->CheckFeatureSupport(D3D12_FEATURE_ROOT_SIGNATURE, &featureData, sizeof(featureData));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_rootSignatureVersion = featureData.HighestVersion;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 检查是否是集成显卡
|
|
||||||
{
|
|
||||||
D3D12_FEATURE_DATA_ARCHITECTURE1 data{};
|
|
||||||
hr = _device->CheckFeatureSupport(D3D12_FEATURE_ARCHITECTURE1, &data, sizeof(data));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_isUMA = data.UMA;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 检查 D3D12_HEAP_FLAG_CREATE_NOT_ZEROED 支持。是否支持这个功能只和 D3D12 版本有关,
|
|
||||||
// 虽然我们随程序部署了 Agility SDK,但旧版 Win10 不支持加载。
|
|
||||||
// https://devblogs.microsoft.com/directx/coming-to-directx-12-more-control-over-memory-allocation/
|
|
||||||
_isHeapFlagCreateNotZeroedSupported = (bool)_device.try_as<ID3D12Device8>();
|
|
||||||
|
|
||||||
// 检查 Resizable BAR 支持
|
|
||||||
{
|
|
||||||
D3D12_FEATURE_DATA_D3D12_OPTIONS16 data{};
|
|
||||||
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS16, &data, sizeof(data));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_isGPUUploadHeapSupported = data.GPUUploadHeapSupported;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 检查 FP16 支持
|
|
||||||
if (!ScalingWindow::Get().Options().IsFP16Disabled()) {
|
|
||||||
{
|
|
||||||
D3D12_FEATURE_DATA_D3D12_OPTIONS data{};
|
|
||||||
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &data, sizeof(data));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_isMinFloat16Supported = data.MinPrecisionSupport & D3D12_SHADER_MIN_PRECISION_SUPPORT_16_BIT;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// SM 6.2 开始支持原生 16 位标量
|
|
||||||
if (_shaderModel >= D3D_SHADER_MODEL_6_2) {
|
|
||||||
D3D12_FEATURE_DATA_D3D12_OPTIONS4 data{};
|
|
||||||
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS4, &data, sizeof(data));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_isNative16BitSupported = data.Native16BitShaderOpsSupported;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_LogDeviceInfo();
|
|
||||||
|
|
||||||
if (!_InitializeDeviceResources(
|
|
||||||
maxInFlightFrameCount, priority, commandListType, disableFrameFenceTracking)) {
|
|
||||||
Logger::Get().Error("_InitializeDeviceResources 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void D3D12Context::CopyDevice(const D3D12Context& other) {
|
|
||||||
_csuDescriptorHeap = other._csuDescriptorHeap;
|
|
||||||
_rtvDescriptorHeap = other._rtvDescriptorHeap;
|
|
||||||
_device = other._device;
|
|
||||||
_shaderModel = other._shaderModel;
|
|
||||||
_rootSignatureVersion = other._rootSignatureVersion;
|
|
||||||
_isUMA = other._isUMA;
|
|
||||||
_isHeapFlagCreateNotZeroedSupported = other._isHeapFlagCreateNotZeroedSupported;
|
|
||||||
_isGPUUploadHeapSupported = other._isGPUUploadHeapSupported;
|
|
||||||
_isMinFloat16Supported = other._isMinFloat16Supported;
|
|
||||||
_isNative16BitSupported = other._isNative16BitSupported;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool D3D12Context::InitializeAfterCopyDevice(
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
bool disableFrameFenceTracking
|
|
||||||
) noexcept {
|
|
||||||
HRESULT hr = _CreateDXGIFactory();
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!_CreateAdapterFromDevice()) {
|
|
||||||
Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!_InitializeDeviceResources(maxInFlightFrameCount, priority, commandListType, disableFrameFenceTracking)) {
|
|
||||||
Logger::Get().Error("_InitializeDeviceResources 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the DXGI factory to use for adapter enumeration. The factory is
// recreated first when IsCurrent() reports that it is stale; nullptr is
// returned (after logging) if that recreation fails.
IDXGIFactory7* D3D12Context::GetDXGIFactoryForEnumingAdapters() noexcept {
	if (_dxgiFactory->IsCurrent()) {
		return _dxgiFactory.get();
	}

	const HRESULT recreateResult = _CreateDXGIFactory();
	if (SUCCEEDED(recreateResult)) {
		return _dxgiFactory.get();
	}

	Logger::Get().ComError("_CreateDXGIFactory 失败", recreateResult);
	return nullptr;
}
|
|
||||||
|
|
||||||
// Increments the fence counter, reports the new value through fenceValue and
// asks the command queue to signal the fence with it. The counter is
// incremented even if the Signal call fails.
HRESULT D3D12Context::Signal(uint64_t& fenceValue) noexcept {
	++_curFenceValue;
	fenceValue = _curFenceValue;
	return _commandQueue->Signal(_fence.get(), _curFenceValue);
}
|
|
||||||
|
|
||||||
// Returns S_OK immediately when the fence has already reached fenceValue.
// Otherwise forwards to SetEventOnCompletion with a null event handle and
// returns its result.
HRESULT D3D12Context::WaitForFenceValue(uint64_t fenceValue) noexcept {
	const bool alreadyReached = _fence->GetCompletedValue() >= fenceValue;
	return alreadyReached ? S_OK : _fence->SetEventOnCompletion(fenceValue, nullptr);
}
|
|
||||||
|
|
||||||
// Signals the fence with a fresh value on this context's command queue and
// then waits for that value via WaitForFenceValue.
// Returns the failing HRESULT if the signal cannot be enqueued.
HRESULT D3D12Context::WaitForGpu() noexcept {
	const HRESULT signalResult = _commandQueue->Signal(_fence.get(), ++_curFenceValue);
	if (SUCCEEDED(signalResult)) {
		return WaitForFenceValue(_curFenceValue);
	}

	Logger::Get().ComError("ID3D12CommandQueue::Signal 失败", signalResult);
	return signalResult;
}
|
|
||||||
|
|
||||||
// Signals this context's fence with a fresh value on the given commandQueue,
// then enqueues a Wait for that value on this context's own command queue.
// Returns the first failing HRESULT, or S_OK.
HRESULT D3D12Context::WaitForCommandQueue(ID3D12CommandQueue* commandQueue) noexcept {
	if (const HRESULT signalResult = commandQueue->Signal(_fence.get(), ++_curFenceValue);
		FAILED(signalResult))
	{
		Logger::Get().ComError("ID3D12CommandQueue::Signal 失败", signalResult);
		return signalResult;
	}

	if (const HRESULT waitResult = _commandQueue->Wait(_fence.get(), _curFenceValue);
		FAILED(waitResult))
	{
		Logger::Get().ComError("ID3D12CommandQueue::Wait 失败", waitResult);
		return waitResult;
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
// Prepares the command allocator and command list of the current frame slot
// for recording.
//
// curFrameIndex  Receives the index of the current frame slot on success.
// initialState   Passed to ID3D12GraphicsCommandList::Reset.
//
// When per-frame fence values are tracked, first waits for the value stored
// for this slot. Returns the first failing HRESULT, or S_OK.
HRESULT D3D12Context::BeginFrame(uint32_t& curFrameIndex, ID3D12PipelineState* initialState) noexcept {
	// An empty _frameFenceValues means fence tracking is disabled
	if (!_frameFenceValues.empty()) {
		const HRESULT waitResult = WaitForFenceValue(_frameFenceValues[_curFrameIndex]);
		if (FAILED(waitResult)) {
			Logger::Get().ComError("WaitForFenceValue 失败", waitResult);
			return waitResult;
		}
	}

	ID3D12CommandAllocator* const frameAllocator = _commandAllocators[_curFrameIndex].get();

	const HRESULT allocatorResult = frameAllocator->Reset();
	if (FAILED(allocatorResult)) {
		Logger::Get().ComError("ID3D12CommandAllocator::Reset 失败", allocatorResult);
		return allocatorResult;
	}

	const HRESULT listResult = _commandList->Reset(frameAllocator, initialState);
	if (FAILED(listResult)) {
		Logger::Get().ComError("ID3D12GraphicsCommandList::Reset 失败", listResult);
		return listResult;
	}

	curFrameIndex = _curFrameIndex;
	return S_OK;
}
|
|
||||||
|
|
||||||
// Ends the current frame: when per-frame fence values are tracked, signals
// the fence and stores the new value for the current slot; then advances to
// the next slot, wrapping around at the number of command allocators.
// If the signal fails, its HRESULT is returned and the slot is not advanced.
HRESULT D3D12Context::EndFrame() noexcept {
	const bool tracksFrameFences = !_frameFenceValues.empty();
	if (tracksFrameFences) {
		const HRESULT signalResult = Signal(_frameFenceValues[_curFrameIndex]);
		if (FAILED(signalResult)) {
			Logger::Get().ComError("Signal 失败", signalResult);
			return signalResult;
		}
	}

	const uint32_t slotCount = (uint32_t)_commandAllocators.size();
	_curFrameIndex = (_curFrameIndex + 1) % slotCount;
	return S_OK;
}
|
|
||||||
|
|
||||||
// Creates the DXGI factory into _dxgiFactory, with the debug flag in debug
// builds. Logs on failure and returns the HRESULT of CreateDXGIFactory2.
HRESULT D3D12Context::_CreateDXGIFactory() noexcept {
#ifdef _DEBUG
	const UINT factoryFlags = DXGI_CREATE_FACTORY_DEBUG;
#else
	const UINT factoryFlags = 0;
#endif

	const HRESULT result = CreateDXGIFactory2(factoryFlags, IID_PPV_ARGS(&_dxgiFactory));
	if (FAILED(result)) {
		Logger::Get().ComError("CreateDXGIFactory2 失败", result);
	}
	return result;
}
|
|
||||||
|
|
||||||
bool D3D12Context::_InitializeDeviceResources(
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
bool disableFrameFenceTracking
|
|
||||||
) noexcept {
|
|
||||||
{
|
|
||||||
D3D12_COMMAND_QUEUE_DESC queueDesc = {
|
|
||||||
.Type = commandListType,
|
|
||||||
.Priority = priority,
|
|
||||||
.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE
|
|
||||||
};
|
|
||||||
HRESULT hr = _device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&_commandQueue));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateCommandQueue 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
HRESULT hr = _device->CreateCommandList1(0, commandListType,
|
|
||||||
D3D12_COMMAND_LIST_FLAG_NONE, IID_PPV_ARGS(&_commandList));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateCommandList1 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
_commandAllocators.resize(maxInFlightFrameCount);
|
|
||||||
for (winrt::com_ptr<ID3D12CommandAllocator>& commandAllocator : _commandAllocators) {
|
|
||||||
hr = _device->CreateCommandAllocator(commandListType, IID_PPV_ARGS(&commandAllocator));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateCommandAllocator 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
hr = _device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&_fence));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateFence 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果已在外部同步则无需追踪每帧的栅栏值
|
|
||||||
if (!disableFrameFenceTracking) {
|
|
||||||
_frameFenceValues.resize(maxInFlightFrameCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 和 D3D12SDKLayers.dll 不同,OS 加载 d3d10warp.dll 时不遵循 D3D12SDKPath。
|
|
||||||
// 这个函数确保加载匹配的 d3d10warp.dll。
|
|
||||||
static void FixD3D10WarpDll(IDXGIAdapter1* warpAdapter) noexcept {
|
|
||||||
assert(!GetModuleHandle(L"d3d10warp.dll"));
|
|
||||||
|
|
||||||
HMODULE hD3D12Core = GetModuleHandle(L"D3D12Core.dll");
|
|
||||||
if (!hD3D12Core) {
|
|
||||||
// 如果 D3D12Core.dll 尚未加载则加载它
|
|
||||||
D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0, winrt::guid_of<ID3D12Device>(), nullptr);
|
|
||||||
|
|
||||||
hD3D12Core = GetModuleHandle(L"D3D12Core.dll");
|
|
||||||
if (!hD3D12Core) {
|
|
||||||
// 可能 OS 不支持 Agility SDK
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 检查是否加载了随程序部署的 D3D12Core.dll
|
|
||||||
std::wstring d3d12CorePath;
|
|
||||||
wil::GetModuleFileNameW(hD3D12Core, d3d12CorePath);
|
|
||||||
if (d3d12CorePath.starts_with(AppFolderManager::Get().GetExeDir().native())) {
|
|
||||||
// 加载随程序部署的 d3d10warp.dll
|
|
||||||
std::filesystem::path warpDllPath =
|
|
||||||
AppFolderManager::Get().GetD3D12Dir() / L"d3d10warp.dll";
|
|
||||||
LoadLibrary(warpDllPath.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool D3D12Context::_CreateAdapterAndDevice(const GraphicsCardId& graphicsCardId) noexcept {
|
|
||||||
winrt::com_ptr<IDXGIAdapter1> adapter;
|
|
||||||
|
|
||||||
if (!ScalingWindow::Get().Options().UseWarp()) {
|
|
||||||
// 记录不支持 D3D12 的显卡索引,防止重复尝试
|
|
||||||
int failedIdx = -1;
|
|
||||||
|
|
||||||
if (graphicsCardId.idx >= 0) {
|
|
||||||
assert(graphicsCardId.vendorId != 0 && graphicsCardId.deviceId != 0);
|
|
||||||
|
|
||||||
// 先使用索引
|
|
||||||
HRESULT hr = _dxgiFactory->EnumAdapters1(graphicsCardId.idx, adapter.put());
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
DXGI_ADAPTER_DESC1 desc;
|
|
||||||
hr = adapter->GetDesc1(&desc);
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
|
|
||||||
if (_TryCreateD3DDevice(adapter, desc)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
failedIdx = graphicsCardId.idx;
|
|
||||||
Logger::Get().Warn("用户指定的显示卡不支持 D3D12");
|
|
||||||
} else {
|
|
||||||
Logger::Get().Warn("显卡配置已变化");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果已确认该显卡不支持 D3D12,不再重复尝试
|
|
||||||
if (failedIdx == -1) {
|
|
||||||
// 枚举查找 vendorId 和 deviceId 匹配的显卡
|
|
||||||
for (UINT adapterIdx = 0;
|
|
||||||
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
|
|
||||||
++adapterIdx
|
|
||||||
) {
|
|
||||||
if ((int)adapterIdx == graphicsCardId.idx) {
|
|
||||||
// 已经检查了 graphicsCardId.idx
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
DXGI_ADAPTER_DESC1 desc;
|
|
||||||
hr = adapter->GetDesc1(&desc);
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
|
|
||||||
if (_TryCreateD3DDevice(adapter, desc)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
failedIdx = (int)adapterIdx;
|
|
||||||
Logger::Get().Warn("用户指定的显示卡不支持 D3D12");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 枚举查找第一个支持 D3D12 的显卡
|
|
||||||
for (UINT adapterIdx = 0;
|
|
||||||
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
|
|
||||||
++adapterIdx
|
|
||||||
) {
|
|
||||||
if ((int)adapterIdx == failedIdx) {
|
|
||||||
// 无需再次尝试
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
DXGI_ADAPTER_DESC1 desc;
|
|
||||||
HRESULT hr = adapter->GetDesc1(&desc);
|
|
||||||
if (FAILED(hr) || DirectXHelper::IsWARP(desc)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (_TryCreateD3DDevice(adapter, desc)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 作为最后手段,回落到 CPU 渲染 (WARP)
|
|
||||||
// https://docs.microsoft.com/en-us/windows/win32/direct3darticles/directx-warp
|
|
||||||
HRESULT hr = _dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("EnumWarpAdapter 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
[[maybe_unused]] static Ignore _ = [](IDXGIAdapter1* warpAdapter) {
|
|
||||||
FixD3D10WarpDll(warpAdapter);
|
|
||||||
return Ignore();
|
|
||||||
}(adapter.get());
|
|
||||||
|
|
||||||
DXGI_ADAPTER_DESC1 desc;
|
|
||||||
hr = adapter->GetDesc1(&desc);
|
|
||||||
if (FAILED(hr) || !_TryCreateD3DDevice(adapter, desc)) {
|
|
||||||
Logger::Get().Error("创建 WARP 设备失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool D3D12Context::_TryCreateD3DDevice(
|
|
||||||
const winrt::com_ptr<IDXGIAdapter1>& adapter,
|
|
||||||
const DXGI_ADAPTER_DESC1& adapterDesc
|
|
||||||
) noexcept {
|
|
||||||
HRESULT hr = D3D12CreateDevice(adapter.get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&_device));
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("D3D12CreateDevice 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
_dxgiAdapter = adapter.try_as<IDXGIAdapter4>();
|
|
||||||
if (!_dxgiAdapter) {
|
|
||||||
Logger::Get().Error("获取 IDXGIAdapter4 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
Logger::Get().Info(fmt::format("图形适配器\n\tVendorId: {:#x}\n\tDeviceId: {:#x}\n\tDescription: {}",
|
|
||||||
adapterDesc.VendorId, adapterDesc.DeviceId, StrHelper::UTF16ToUTF8(adapterDesc.Description)));
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool D3D12Context::_CreateAdapterFromDevice() noexcept {
|
|
||||||
const LUID adapterLuid = _device->GetAdapterLuid();
|
|
||||||
|
|
||||||
winrt::com_ptr<IDXGIAdapter1> adapter;
|
|
||||||
for (UINT adapterIdx = 0;
|
|
||||||
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
|
|
||||||
++adapterIdx
|
|
||||||
) {
|
|
||||||
DXGI_ADAPTER_DESC1 desc;
|
|
||||||
HRESULT hr = adapter->GetDesc1(&desc);
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc.AdapterLuid != adapterLuid) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
_dxgiAdapter = adapter.try_as<IDXGIAdapter4>();
|
|
||||||
if (_dxgiAdapter) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
Logger::Get().Error("获取 IDXGIAdapter4 失败");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void D3D12Context::_QueryHighestShaderModel() noexcept {
|
|
||||||
// 如果运行时不知道 HighestShaderModel,CheckFeatureSupport 将返回 E_INVALIDARG
|
|
||||||
// (这只会发生在不支持 Agility SDK 的旧版本 Win10 上)。官方推荐从新到旧依次检查每
|
|
||||||
// 个版本。
|
|
||||||
constexpr std::array allModelVersions = {
|
|
||||||
D3D_SHADER_MODEL_6_9,
|
|
||||||
D3D_SHADER_MODEL_6_8,
|
|
||||||
D3D_SHADER_MODEL_6_7,
|
|
||||||
D3D_SHADER_MODEL_6_6,
|
|
||||||
D3D_SHADER_MODEL_6_5,
|
|
||||||
D3D_SHADER_MODEL_6_4,
|
|
||||||
D3D_SHADER_MODEL_6_3,
|
|
||||||
D3D_SHADER_MODEL_6_2,
|
|
||||||
D3D_SHADER_MODEL_6_1,
|
|
||||||
D3D_SHADER_MODEL_6_0,
|
|
||||||
D3D_SHADER_MODEL_5_1
|
|
||||||
};
|
|
||||||
constexpr uint32_t versionCount = (uint32_t)std::size(allModelVersions);
|
|
||||||
|
|
||||||
HighestShaderModel versionLimit = ScalingWindow::Get().Options().highestShaderModel;
|
|
||||||
uint32_t startIdx = versionLimit == HighestShaderModel::NotLimited ? 0 : (uint32_t)versionLimit - 1;
|
|
||||||
|
|
||||||
for (uint32_t i = startIdx; i < versionCount; ++i) {
|
|
||||||
D3D12_FEATURE_DATA_SHADER_MODEL data = { .HighestShaderModel = allModelVersions[i]};
|
|
||||||
HRESULT hr = _device->CheckFeatureSupport(D3D12_FEATURE_SHADER_MODEL, &data, sizeof(data));
|
|
||||||
if (hr == E_INVALIDARG) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
_shaderModel = data.HighestShaderModel;
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void D3D12Context::_LogDeviceInfo() noexcept {
|
|
||||||
std::string_view featureLevel;
|
|
||||||
{
|
|
||||||
D3D_FEATURE_LEVEL featureLevels[] = {
|
|
||||||
D3D_FEATURE_LEVEL_12_2,
|
|
||||||
D3D_FEATURE_LEVEL_12_1,
|
|
||||||
D3D_FEATURE_LEVEL_12_0,
|
|
||||||
D3D_FEATURE_LEVEL_11_1,
|
|
||||||
D3D_FEATURE_LEVEL_11_0
|
|
||||||
};
|
|
||||||
D3D12_FEATURE_DATA_FEATURE_LEVELS featureData = {
|
|
||||||
.NumFeatureLevels = (UINT)std::size(featureLevels),
|
|
||||||
.pFeatureLevelsRequested = featureLevels
|
|
||||||
};
|
|
||||||
HRESULT hr = _device->CheckFeatureSupport(
|
|
||||||
D3D12_FEATURE_FEATURE_LEVELS, &featureData, sizeof(featureData));
|
|
||||||
if (SUCCEEDED(hr)) {
|
|
||||||
switch (featureData.MaxSupportedFeatureLevel) {
|
|
||||||
case D3D_FEATURE_LEVEL_12_2:
|
|
||||||
featureLevel = "12.2";
|
|
||||||
break;
|
|
||||||
case D3D_FEATURE_LEVEL_12_1:
|
|
||||||
featureLevel = "12.1";
|
|
||||||
break;
|
|
||||||
case D3D_FEATURE_LEVEL_12_0:
|
|
||||||
featureLevel = "12.0";
|
|
||||||
break;
|
|
||||||
case D3D_FEATURE_LEVEL_11_1:
|
|
||||||
featureLevel = "11.1";
|
|
||||||
break;
|
|
||||||
case D3D_FEATURE_LEVEL_11_0:
|
|
||||||
featureLevel = "11.0";
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
featureLevel = "未知";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
|
||||||
featureLevel = "未知";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string_view shaderModel;
|
|
||||||
switch (_shaderModel) {
|
|
||||||
case D3D_SHADER_MODEL_6_9:
|
|
||||||
shaderModel = "6.9";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_8:
|
|
||||||
shaderModel = "6.8";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_7:
|
|
||||||
shaderModel = "6.7";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_6:
|
|
||||||
shaderModel = "6.6";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_5:
|
|
||||||
shaderModel = "6.5";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_4:
|
|
||||||
shaderModel = "6.4";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_3:
|
|
||||||
shaderModel = "6.3";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_2:
|
|
||||||
shaderModel = "6.2";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_1:
|
|
||||||
shaderModel = "6.1";
|
|
||||||
break;
|
|
||||||
case D3D_SHADER_MODEL_6_0:
|
|
||||||
shaderModel = "6.0";
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
shaderModel = "5.1";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
constexpr const char* boolStrs[] = { "否","是" };
|
|
||||||
|
|
||||||
Logger::Get().Info(fmt::format(R"(已创建 D3D12 设备
|
|
||||||
功能级别: {}
|
|
||||||
shader model 版本: {}
|
|
||||||
根签名版本: {}
|
|
||||||
集成显卡: {}
|
|
||||||
D3D12_HEAP_FLAG_CREATE_NOT_ZEROED 支持: {}
|
|
||||||
Resizable BAR 支持: {}
|
|
||||||
min16float 支持: {}
|
|
||||||
原生 16 位标量支持: {})",
|
|
||||||
featureLevel,
|
|
||||||
shaderModel,
|
|
||||||
_rootSignatureVersion == D3D_ROOT_SIGNATURE_VERSION_1_1 ? "1.1" : "1.0",
|
|
||||||
boolStrs[_isUMA],
|
|
||||||
boolStrs[_isHeapFlagCreateNotZeroedSupported],
|
|
||||||
boolStrs[_isGPUUploadHeapSupported],
|
|
||||||
boolStrs[_isMinFloat16Supported],
|
|
||||||
boolStrs[_isNative16BitSupported]
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,156 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include "ScalingOptions.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class DescriptorHeap;
|
|
||||||
|
|
||||||
class D3D12Context {
|
|
||||||
public:
|
|
||||||
D3D12Context() = default;
|
|
||||||
D3D12Context(const D3D12Context&) = delete;
|
|
||||||
D3D12Context(D3D12Context&&) = delete;
|
|
||||||
|
|
||||||
bool Initialize(
|
|
||||||
const GraphicsCardId& graphicsCardId,
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
DescriptorHeap& csuDescriptorHeap,
|
|
||||||
DescriptorHeap& rtvDescriptorHeap,
|
|
||||||
bool disableFrameFenceTracking = false
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
void CopyDevice(const D3D12Context& other);
|
|
||||||
|
|
||||||
bool InitializeAfterCopyDevice(
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
bool disableFrameTracking = false
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
DescriptorHeap& GetDescriptorHeap(bool rtv = false) const noexcept {
|
|
||||||
return rtv ? *_rtvDescriptorHeap : *_csuDescriptorHeap;
|
|
||||||
}
|
|
||||||
|
|
||||||
IDXGIFactory7* GetDXGIFactory() const noexcept {
|
|
||||||
return _dxgiFactory.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
IDXGIFactory7* GetDXGIFactoryForEnumingAdapters() noexcept;
|
|
||||||
|
|
||||||
IDXGIAdapter4* GetDXGIAdapter() const noexcept {
|
|
||||||
return _dxgiAdapter.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
ID3D12Device5* GetDevice() const noexcept {
|
|
||||||
return _device.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
ID3D12CommandQueue* GetCommandQueue() const noexcept {
|
|
||||||
return _commandQueue.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
ID3D12GraphicsCommandList* GetCommandList() const noexcept {
|
|
||||||
return _commandList.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
D3D_SHADER_MODEL GetShaderModel() const noexcept {
|
|
||||||
return _shaderModel;
|
|
||||||
}
|
|
||||||
|
|
||||||
D3D_ROOT_SIGNATURE_VERSION GetRootSignatureVersion() const noexcept {
|
|
||||||
return _rootSignatureVersion;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsUMA() const noexcept {
|
|
||||||
return _isUMA;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsHeapFlagCreateNotZeroedSupported() const noexcept {
|
|
||||||
return _isHeapFlagCreateNotZeroedSupported;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsGPUUploadHeapSupported() const noexcept {
|
|
||||||
return _isGPUUploadHeapSupported;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsMinFloat16Supported() const noexcept {
|
|
||||||
return _isMinFloat16Supported;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsNative16BitSupported() const noexcept {
|
|
||||||
return _isNative16BitSupported;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t GetMaxInFlightFrameCount() const noexcept {
|
|
||||||
return (uint32_t)_commandAllocators.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
HRESULT Signal(uint64_t& fenceValue) noexcept;
|
|
||||||
|
|
||||||
HRESULT WaitForFenceValue(uint64_t fenceValue) noexcept;
|
|
||||||
|
|
||||||
HRESULT WaitForGpu() noexcept;
|
|
||||||
|
|
||||||
HRESULT WaitForCommandQueue(ID3D12CommandQueue* commandQueue) noexcept;
|
|
||||||
|
|
||||||
HRESULT BeginFrame(
|
|
||||||
uint32_t& curFrameIndex,
|
|
||||||
ID3D12PipelineState* initialState = nullptr
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
HRESULT EndFrame() noexcept;
|
|
||||||
|
|
||||||
private:
|
|
||||||
HRESULT _CreateDXGIFactory() noexcept;
|
|
||||||
|
|
||||||
bool _InitializeDeviceResources(
|
|
||||||
uint32_t maxInFlightFrameCount,
|
|
||||||
D3D12_COMMAND_QUEUE_PRIORITY priority,
|
|
||||||
D3D12_COMMAND_LIST_TYPE commandListType,
|
|
||||||
bool disableFrameFenceTracking
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
bool _CreateAdapterAndDevice(const GraphicsCardId& graphicsCardId) noexcept;
|
|
||||||
|
|
||||||
bool _TryCreateD3DDevice(
|
|
||||||
const winrt::com_ptr<IDXGIAdapter1>& adapter,
|
|
||||||
const DXGI_ADAPTER_DESC1& adapterDesc
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
bool _CreateAdapterFromDevice() noexcept;
|
|
||||||
|
|
||||||
void _QueryHighestShaderModel() noexcept;
|
|
||||||
|
|
||||||
void _LogDeviceInfo() noexcept;
|
|
||||||
|
|
||||||
DescriptorHeap* _csuDescriptorHeap = nullptr;
|
|
||||||
DescriptorHeap* _rtvDescriptorHeap = nullptr;
|
|
||||||
|
|
||||||
winrt::com_ptr<IDXGIFactory7> _dxgiFactory;
|
|
||||||
winrt::com_ptr<IDXGIAdapter4> _dxgiAdapter;
|
|
||||||
winrt::com_ptr<ID3D12Device5> _device;
|
|
||||||
winrt::com_ptr<ID3D12CommandQueue> _commandQueue;
|
|
||||||
|
|
||||||
std::vector<winrt::com_ptr<ID3D12CommandAllocator>> _commandAllocators;
|
|
||||||
winrt::com_ptr<ID3D12GraphicsCommandList> _commandList;
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12Fence1> _fence;
|
|
||||||
uint64_t _curFenceValue = 0;
|
|
||||||
|
|
||||||
std::vector<uint64_t> _frameFenceValues;
|
|
||||||
uint32_t _curFrameIndex = 0;
|
|
||||||
|
|
||||||
D3D_SHADER_MODEL _shaderModel = D3D_SHADER_MODEL_5_1;
|
|
||||||
D3D_ROOT_SIGNATURE_VERSION _rootSignatureVersion = D3D_ROOT_SIGNATURE_VERSION_1_0;
|
|
||||||
|
|
||||||
bool _isUMA = false;
|
|
||||||
bool _isHeapFlagCreateNotZeroedSupported = false;
|
|
||||||
bool _isGPUUploadHeapSupported = false;
|
|
||||||
bool _isMinFloat16Supported = false;
|
|
||||||
bool _isNative16BitSupported = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
// 复制自 https://github.com/microsoft/DirectXTex/blob/55b96d1d0ab5d9efe2112cd0318470976a2380b5/DirectXTex/DDS.h
|
// 复制自 https://github.com/microsoft/DirectXTex/blob/652cc82b35ff9e14097d12eff73f53348361ff15/DirectXTex/DDS.h
|
||||||
|
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
// DDS.h
|
// DDS.h
|
||||||
|
|
@ -13,13 +13,12 @@
|
||||||
// Copyright (c) Microsoft Corporation.
|
// Copyright (c) Microsoft Corporation.
|
||||||
// Licensed under the MIT License.
|
// Licensed under the MIT License.
|
||||||
//
|
//
|
||||||
// https://go.microsoft.com/fwlink/?LinkId=248926
|
// http://go.microsoft.com/fwlink/?LinkId=248926
|
||||||
// http://go.microsoft.com/fwlink/?LinkId=248929
|
// http://go.microsoft.com/fwlink/?LinkId=248929
|
||||||
// http://go.microsoft.com/fwlink/?LinkID=615561
|
// http://go.microsoft.com/fwlink/?LinkID=615561
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
namespace Magpie {
|
namespace Magpie {
|
||||||
|
|
@ -54,10 +53,10 @@ struct DDS_PIXELFORMAT {
|
||||||
|
|
||||||
#ifndef MAKEFOURCC
|
#ifndef MAKEFOURCC
|
||||||
#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
|
#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
|
||||||
(static_cast<uint32_t>(static_cast<uint8_t>(ch0)) \
|
(static_cast<uint32_t>(static_cast<uint8_t>(ch0)) \
|
||||||
| (static_cast<uint32_t>(static_cast<uint8_t>(ch1)) << 8) \
|
| (static_cast<uint32_t>(static_cast<uint8_t>(ch1)) << 8) \
|
||||||
| (static_cast<uint32_t>(static_cast<uint8_t>(ch2)) << 16) \
|
| (static_cast<uint32_t>(static_cast<uint8_t>(ch2)) << 16) \
|
||||||
| (static_cast<uint32_t>(static_cast<uint8_t>(ch3)) << 24))
|
| (static_cast<uint32_t>(static_cast<uint8_t>(ch3)) << 24))
|
||||||
#endif /* MAKEFOURCC */
|
#endif /* MAKEFOURCC */
|
||||||
|
|
||||||
#ifndef DDSGLOBALCONST
|
#ifndef DDSGLOBALCONST
|
||||||
|
|
@ -221,8 +220,8 @@ DDSGLOBALCONST DDS_PIXELFORMAT DDSPF_DX10 =
|
||||||
#define DDS_CUBEMAP_NEGATIVEZ 0x00008200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEZ
|
#define DDS_CUBEMAP_NEGATIVEZ 0x00008200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEZ
|
||||||
|
|
||||||
#define DDS_CUBEMAP_ALLFACES ( DDS_CUBEMAP_POSITIVEX | DDS_CUBEMAP_NEGATIVEX |\
|
#define DDS_CUBEMAP_ALLFACES ( DDS_CUBEMAP_POSITIVEX | DDS_CUBEMAP_NEGATIVEX |\
|
||||||
DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\
|
DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\
|
||||||
DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ )
|
DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ )
|
||||||
|
|
||||||
#define DDS_CUBEMAP 0x00000200 // DDSCAPS2_CUBEMAP
|
#define DDS_CUBEMAP 0x00000200 // DDSCAPS2_CUBEMAP
|
||||||
|
|
||||||
|
|
@ -290,4 +289,4 @@ constexpr size_t DDS_MIN_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER);
|
||||||
constexpr size_t DDS_DX10_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
|
constexpr size_t DDS_DX10_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
|
||||||
static_assert(DDS_DX10_HEADER_SIZE > DDS_MIN_HEADER_SIZE, "DDS DX10 Header should be larger than standard header");
|
static_assert(DDS_DX10_HEADER_SIZE > DDS_MIN_HEADER_SIZE, "DDS DX10 Header should be larger than standard header");
|
||||||
|
|
||||||
} // namespace
|
}
|
||||||
|
|
|
||||||
1049
src/Magpie.Core/DDSHelper.cpp
Normal file
1049
src/Magpie.Core/DDSHelper.cpp
Normal file
File diff suppressed because it is too large
Load diff
19
src/Magpie.Core/DDSHelper.h
Normal file
19
src/Magpie.Core/DDSHelper.h
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
struct DDSHelper {
|
||||||
|
static winrt::com_ptr<ID3D11Texture2D> Load(
|
||||||
|
const wchar_t* fileName, ID3D11Device* d3dDevice) noexcept;
|
||||||
|
|
||||||
|
static bool Save(
|
||||||
|
const wchar_t* fileName,
|
||||||
|
uint32_t width,
|
||||||
|
uint32_t height,
|
||||||
|
DXGI_FORMAT format,
|
||||||
|
std::span<uint8_t> pixelData,
|
||||||
|
uint32_t rowPitch
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,121 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "DescriptorHeap.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
DescriptorHeap::~DescriptorHeap() noexcept {
	// In DEBUG builds, verify that every slot was freed before destruction:
	// either the heap was never given a capacity, or the free list has
	// collapsed back into a single block covering the whole heap.
	assert(_capacity == 0 || (_freeBlocks.size() == 1 &&
		_freeBlocks.begin()->first == _capacity &&
		_freeBlocks.begin()->second == _capacity));
}
|
|
||||||
|
|
||||||
// Creates a descriptor heap with `capacity` descriptors of the given type and
// marks the whole range as free.
//
// device:   device used to create the heap.
// type:     descriptor heap type. CBV/SRV/UAV heaps are created shader
//           visible; all other types are not.
// capacity: number of descriptors in the heap.
//
// Returns false when CreateDescriptorHeap fails. In that case no free block is
// registered, so Alloc() keeps failing instead of handing out offsets into a
// heap that was never created.
bool DescriptorHeap::Initialize(
	ID3D12Device5* device,
	D3D12_DESCRIPTOR_HEAP_TYPE type,
	uint32_t capacity
) noexcept {
	_descriptorSize = device->GetDescriptorHandleIncrementSize(type);

	const bool isShaderVisible = type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;

	D3D12_DESCRIPTOR_HEAP_DESC desc = {
		.Type = type,
		.NumDescriptors = capacity,
		.Flags = isShaderVisible ?
			D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE : D3D12_DESCRIPTOR_HEAP_FLAG_NONE
	};

	HRESULT hr = device->CreateDescriptorHeap(&desc, IID_PPV_ARGS(&_heap));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateDescriptorHeap 失败", hr);
		return false;
	}

	// Only publish the capacity once the heap really exists. Doing it earlier
	// would leave a bogus free block behind after a failed creation.
#ifdef _DEBUG
	_capacity = capacity;
#endif

	// Blocks are keyed by their end offset; initially one block spans the heap.
	_freeBlocks.emplace(capacity, capacity);

	_cpuHandle = _heap->GetCPUDescriptorHandleForHeapStart();

	// The GPU handle is only fetched for the shader visible heap type.
	if (isShaderVisible) {
		_gpuHandle = _heap->GetGPUDescriptorHandleForHeapStart();
	}

	return true;
}
|
|
||||||
|
|
||||||
// Reserves `count` consecutive descriptors using a first-fit search.
//
// count:  number of descriptors to reserve; must not be 0.
// offset: receives the index of the first reserved descriptor on success.
//
// Returns S_OK on success, or E_OUTOFMEMORY when no free block is large enough.
HRESULT DescriptorHeap::Alloc(uint32_t count, uint32_t& offset) noexcept {
	assert(count != 0);
	auto lk = _freeBlocksLock.lock_exclusive();

	// Find the first free block that is large enough.
	auto blockIt = _freeBlocks.begin();
	while (blockIt != _freeBlocks.end() && blockIt->second < count) {
		++blockIt;
	}

	if (blockIt == _freeBlocks.end()) {
		Logger::Get().Error("描述符用尽");
		return E_OUTOFMEMORY;
	}

	// Blocks are keyed by their end, so the allocation is carved off the
	// front and the key stays valid while the block merely shrinks.
	offset = blockIt->first - blockIt->second;

	if (blockIt->second == count) {
		_freeBlocks.erase(blockIt);
	} else {
		blockIt->second -= count;
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
// Returns the start offset of a free block stored as (end, size).
static uint32_t GetBlockOffset(const std::pair<const uint32_t, uint32_t>& freeBlock) noexcept {
	const auto& [blockEnd, blockSize] = freeBlock;
	return blockEnd - blockSize;
}
|
|
||||||
|
|
||||||
void DescriptorHeap::Free(uint32_t offset, uint32_t count) noexcept {
|
|
||||||
assert(count != 0 && offset != std::numeric_limits<uint32_t>::max() && offset + count <= _capacity);
|
|
||||||
|
|
||||||
auto lk = _freeBlocksLock.lock_exclusive();
|
|
||||||
|
|
||||||
const auto freeBlocksEnd = _freeBlocks.end();
|
|
||||||
|
|
||||||
// 寻找 offset 之后的第一个空闲块
|
|
||||||
auto upperBoundIt = _freeBlocks.upper_bound(offset);
|
|
||||||
auto prevIt = upperBoundIt == _freeBlocks.begin() ? freeBlocksEnd : std::prev(upperBoundIt);
|
|
||||||
|
|
||||||
assert(upperBoundIt == freeBlocksEnd || offset + count <= GetBlockOffset(*upperBoundIt));
|
|
||||||
assert(prevIt == freeBlocksEnd || offset >= prevIt->first);
|
|
||||||
|
|
||||||
const bool canMergePrev = prevIt != freeBlocksEnd && offset == prevIt->first;
|
|
||||||
const bool canMergeNext = upperBoundIt != freeBlocksEnd &&
|
|
||||||
offset + count == GetBlockOffset(*upperBoundIt);
|
|
||||||
|
|
||||||
if (canMergeNext) {
|
|
||||||
upperBoundIt->second += count;
|
|
||||||
|
|
||||||
if (canMergePrev) {
|
|
||||||
upperBoundIt->second += prevIt->second;
|
|
||||||
_freeBlocks.erase(prevIt);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
uint32_t newBlockSize = count;
|
|
||||||
if (canMergePrev) {
|
|
||||||
newBlockSize += prevIt->second;
|
|
||||||
_freeBlocks.erase(prevIt);
|
|
||||||
}
|
|
||||||
_freeBlocks.emplace(offset + count, newBlockSize);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the CPU handle of the descriptor `offset` slots after the heap start.
D3D12_CPU_DESCRIPTOR_HANDLE DescriptorHeap::GetCpuHandle(uint32_t offset) const noexcept {
	const CD3DX12_CPU_DESCRIPTOR_HANDLE handle(_cpuHandle, offset, _descriptorSize);
	return handle;
}
|
|
||||||
|
|
||||||
// Returns the GPU handle of the descriptor `offset` slots after the heap start.
// Asserts that a GPU start handle has been stored.
D3D12_GPU_DESCRIPTOR_HANDLE DescriptorHeap::GetGpuHandle(uint32_t offset) const noexcept {
	assert(_gpuHandle.ptr);
	const CD3DX12_GPU_DESCRIPTOR_HANDLE handle(_gpuHandle, offset, _descriptorSize);
	return handle;
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#ifndef _DEBUG
|
|
||||||
#include <parallel_hashmap/btree.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class DescriptorHeap {
|
|
||||||
public:
|
|
||||||
DescriptorHeap() = default;
|
|
||||||
DescriptorHeap(const DescriptorHeap&) = delete;
|
|
||||||
DescriptorHeap(DescriptorHeap&&) = delete;
|
|
||||||
|
|
||||||
~DescriptorHeap() noexcept;
|
|
||||||
|
|
||||||
bool Initialize(
|
|
||||||
ID3D12Device5* device,
|
|
||||||
D3D12_DESCRIPTOR_HEAP_TYPE type,
|
|
||||||
uint32_t capacity
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
HRESULT Alloc(uint32_t count, uint32_t& offset) noexcept;
|
|
||||||
|
|
||||||
void Free(uint32_t offset, uint32_t count) noexcept;
|
|
||||||
|
|
||||||
ID3D12DescriptorHeap* GetHeap() const noexcept {
|
|
||||||
return _heap.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t GetDescriptorSize() const noexcept {
|
|
||||||
return _descriptorSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
D3D12_CPU_DESCRIPTOR_HANDLE GetCpuHandle(uint32_t offset) const noexcept;
|
|
||||||
|
|
||||||
D3D12_GPU_DESCRIPTOR_HANDLE GetGpuHandle(uint32_t offset) const noexcept;
|
|
||||||
|
|
||||||
private:
|
|
||||||
winrt::com_ptr<ID3D12DescriptorHeap> _heap;
|
|
||||||
D3D12_CPU_DESCRIPTOR_HANDLE _cpuHandle{};
|
|
||||||
D3D12_GPU_DESCRIPTOR_HANDLE _gpuHandle{};
|
|
||||||
uint32_t _descriptorSize = 0;
|
|
||||||
|
|
||||||
wil::srwlock _freeBlocksLock;
|
|
||||||
|
|
||||||
// end(offset+size) -> size
|
|
||||||
// 以 offset+size 作为键可以大大降低删除和插入键的频率
|
|
||||||
#ifdef _DEBUG
|
|
||||||
// phmap::btree_map 没有 natvis,调试不方便
|
|
||||||
std::map<uint32_t, uint32_t> _freeBlocks;
|
|
||||||
// 用于断言
|
|
||||||
uint32_t _capacity = 0;
|
|
||||||
#else
|
|
||||||
phmap::btree_map<uint32_t, uint32_t> _freeBlocks;
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
214
src/Magpie.Core/DesktopDuplicationFrameSource.cpp
Normal file
214
src/Magpie.Core/DesktopDuplicationFrameSource.cpp
Normal file
|
|
@ -0,0 +1,214 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "DesktopDuplicationFrameSource.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "DirectXHelper.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "SmallVector.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Searches the outputs of `adapter` for the one attached to `hMonitor`.
// Returns an empty pointer when no output matches or when the matching output
// does not expose IDXGIOutput1.
static winrt::com_ptr<IDXGIOutput1> FindMonitor(IDXGIAdapter1* adapter, HMONITOR hMonitor) noexcept {
	winrt::com_ptr<IDXGIOutput> candidate;

	UINT outputIdx = 0;
	while (SUCCEEDED(adapter->EnumOutputs(outputIdx++, candidate.put()))) {
		DXGI_OUTPUT_DESC outputDesc;
		const HRESULT hr = candidate->GetDesc(&outputDesc);
		if (FAILED(hr)) {
			Logger::Get().ComError("GetDesc 失败", hr);
			continue;
		}

		if (outputDesc.Monitor != hMonitor) {
			continue;
		}

		// The first output on the requested monitor decides the result.
		winrt::com_ptr<IDXGIOutput1> matched = candidate.try_as<IDXGIOutput1>();
		if (!matched) {
			Logger::Get().Error("从 IDXGIOutput 获取 IDXGIOutput1 失败");
		}
		return matched;
	}

	return nullptr;
}
|
||||||
|
|
||||||
|
bool DesktopDuplicationFrameSource::_Initialize() noexcept {
|
||||||
|
// WDA_EXCLUDEFROMCAPTURE 只在 Win10 20H1 及更新版本中可用
|
||||||
|
if (!Win32Helper::GetOSVersion().Is20H1OrNewer()) {
|
||||||
|
Logger::Get().Error("当前操作系统无法使用 Desktop Duplication");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const HWND hwndSrc = ScalingWindow::Get().SrcTracker().Handle();
|
||||||
|
const RECT& srcRect = ScalingWindow::Get().SrcTracker().SrcRect();
|
||||||
|
|
||||||
|
HMONITOR hMonitor = MonitorFromWindow(hwndSrc, MONITOR_DEFAULTTONULL);
|
||||||
|
assert(hMonitor);
|
||||||
|
|
||||||
|
{
|
||||||
|
MONITORINFO mi{ .cbSize = sizeof(mi) };
|
||||||
|
if (!GetMonitorInfo(hMonitor, &mi)) {
|
||||||
|
Logger::Get().Win32Error("GetMonitorInfo 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ScalingWindow::_InitialMoveSrcWindowInFullscreen 已调整窗口位置
|
||||||
|
assert(srcRect.left >= mi.rcMonitor.left && srcRect.top >= mi.rcMonitor.top &&
|
||||||
|
srcRect.right <= mi.rcMonitor.right && srcRect.bottom <= mi.rcMonitor.bottom);
|
||||||
|
|
||||||
|
// 计算源窗口客户区在该屏幕上的位置,用于计算新帧是否有更新
|
||||||
|
_srcClientInMonitor = {
|
||||||
|
srcRect.left - mi.rcMonitor.left,
|
||||||
|
srcRect.top - mi.rcMonitor.top,
|
||||||
|
srcRect.right - mi.rcMonitor.left,
|
||||||
|
srcRect.bottom - mi.rcMonitor.top
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
_frameInMonitor = {
|
||||||
|
(UINT)_srcClientInMonitor.left,
|
||||||
|
(UINT)_srcClientInMonitor.top,
|
||||||
|
0,
|
||||||
|
(UINT)_srcClientInMonitor.right,
|
||||||
|
(UINT)_srcClientInMonitor.bottom,
|
||||||
|
1
|
||||||
|
};
|
||||||
|
|
||||||
|
_output = DirectXHelper::CreateTexture2D(
|
||||||
|
_deviceResources->GetD3DDevice(),
|
||||||
|
DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||||
|
srcRect.right - srcRect.left,
|
||||||
|
srcRect.bottom - srcRect.top,
|
||||||
|
D3D11_BIND_SHADER_RESOURCE
|
||||||
|
);
|
||||||
|
if (!_output) {
|
||||||
|
Logger::Get().Error("CreateTexture2D 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_dxgiOutput = FindMonitor(
|
||||||
|
_deviceResources->GetGraphicsAdapter(), hMonitor);
|
||||||
|
if (!_dxgiOutput) {
|
||||||
|
Logger::Get().Error("无法找到 IDXGIOutput");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 使全屏窗口无法被捕获到
|
||||||
|
if (!SetWindowDisplayAffinity(ScalingWindow::Get().Handle(), WDA_EXCLUDEFROMCAPTURE)) {
|
||||||
|
Logger::Get().Win32Error("SetWindowDisplayAffinity 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Logger::Get().Info("DesktopDuplicationFrameSource 初始化完成");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool DesktopDuplicationFrameSource::Start() noexcept {
|
||||||
|
_DisableRoundCornerInWin11();
|
||||||
|
|
||||||
|
HRESULT hr = _dxgiOutput->DuplicateOutput(_deviceResources->GetD3DDevice(), _outputDup.put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("DuplicateOutput 失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acquires the next duplicated frame and copies the source window region into
// `_output` when that region changed.
//
// Returns NewFrame when `_output` was updated, Waiting when no frame arrived
// within 1 ms or the frame did not touch the source region, and Error when a
// Desktop Duplication call fails.
FrameSourceState DesktopDuplicationFrameSource::_Update() noexcept {
	ID3D11DeviceContext4* d3dDC = _deviceResources->GetD3DDC();

	// According to the documentation, releasing the previous frame right
	// before acquiring the next one improves performance.
	if (_isFrameAcquired) {
		_outputDup->ReleaseFrame();
		_isFrameAcquired = false;
	}

	DXGI_OUTDUPL_FRAME_INFO frameInfo;
	winrt::com_ptr<IDXGIResource> frameResource;
	// Wait for at most 1 ms.
	HRESULT hr = _outputDup->AcquireNextFrame(1, &frameInfo, frameResource.put());
	if (hr == DXGI_ERROR_WAIT_TIMEOUT) {
		return FrameSourceState::Waiting;
	}
	if (FAILED(hr)) {
		Logger::Get().ComError("AcquireNextFrame 失败", hr);
		return FrameSourceState::Error;
	}

	_isFrameAcquired = true;

	// Retrieve the move rects and dirty rects. If any of them overlaps the
	// window's client area, the picture has changed.
	bool hasUpdate = false;

	if (frameInfo.TotalMetadataBufferSize != 0) {
		// The metadata buffer only ever grows.
		if (_dupMetaData.size() < frameInfo.TotalMetadataBufferSize) {
			_dupMetaData.resize(frameInfo.TotalMetadataBufferSize);
		}

		uint32_t byteCount = frameInfo.TotalMetadataBufferSize;

		// Move rects
		hr = _outputDup->GetFrameMoveRects(
			byteCount, (DXGI_OUTDUPL_MOVE_RECT*)_dupMetaData.data(), &byteCount);
		if (FAILED(hr)) {
			Logger::Get().ComError("GetFrameMoveRects 失败", hr);
			return FrameSourceState::Error;
		}

		uint32_t rectCount = byteCount / sizeof(DXGI_OUTDUPL_MOVE_RECT);
		const DXGI_OUTDUPL_MOVE_RECT* moveRects = (DXGI_OUTDUPL_MOVE_RECT*)_dupMetaData.data();
		for (uint32_t i = 0; i < rectCount && !hasUpdate; ++i) {
			hasUpdate = Win32Helper::IsRectOverlap(_srcClientInMonitor, moveRects[i].DestinationRect);
		}

		// Dirty rects are only inspected when no move rect hit the region;
		// they reuse the same buffer.
		if (!hasUpdate) {
			byteCount = frameInfo.TotalMetadataBufferSize;

			hr = _outputDup->GetFrameDirtyRects(
				byteCount, (RECT*)_dupMetaData.data(), &byteCount);
			if (FAILED(hr)) {
				Logger::Get().ComError("GetFrameDirtyRects 失败", hr);
				return FrameSourceState::Error;
			}

			rectCount = byteCount / sizeof(RECT);
			const RECT* dirtyRects = (RECT*)_dupMetaData.data();
			for (uint32_t i = 0; i < rectCount && !hasUpdate; ++i) {
				hasUpdate = Win32Helper::IsRectOverlap(_srcClientInMonitor, dirtyRects[i]);
			}
		}
	}

	if (!hasUpdate) {
		return FrameSourceState::Waiting;
	}

	winrt::com_ptr<ID3D11Texture2D> frameTexture = frameResource.try_as<ID3D11Texture2D>();
	if (!frameTexture) {
		Logger::Get().Error("从 IDXGIResource 检索 ID3D11Resource 失败");
		return FrameSourceState::Error;
	}

	// Copy only the source window region of the monitor image.
	d3dDC->CopySubresourceRegion(
		_output.get(), 0, 0, 0, 0, frameTexture.get(), 0, &_frameInMonitor);

	return FrameSourceState::NewFrame;
}
|
||||||
|
|
||||||
|
}
|
||||||
36
src/Magpie.Core/DesktopDuplicationFrameSource.h
Normal file
36
src/Magpie.Core/DesktopDuplicationFrameSource.h
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
#pragma once
|
||||||
|
#include "FrameSourceBase.h"
|
||||||
|
#include "SmallVector.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class DesktopDuplicationFrameSource final : public FrameSourceBase {
|
||||||
|
public:
|
||||||
|
bool Start() noexcept override;
|
||||||
|
|
||||||
|
FrameSourceWaitType WaitType() const noexcept override {
|
||||||
|
return FrameSourceWaitType::WaitForFrame;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* Name() const noexcept override {
|
||||||
|
return "Desktop Duplication";
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool _Initialize() noexcept override;
|
||||||
|
|
||||||
|
FrameSourceState _Update() noexcept override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
winrt::com_ptr<IDXGIOutput1> _dxgiOutput;
|
||||||
|
winrt::com_ptr<IDXGIOutputDuplication> _outputDup;
|
||||||
|
|
||||||
|
SmallVector<uint8_t, 0> _dupMetaData;
|
||||||
|
|
||||||
|
RECT _srcClientInMonitor{};
|
||||||
|
D3D11_BOX _frameInMonitor{};
|
||||||
|
|
||||||
|
bool _isFrameAcquired = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
253
src/Magpie.Core/DeviceResources.cpp
Normal file
253
src/Magpie.Core/DeviceResources.cpp
Normal file
|
|
@ -0,0 +1,253 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "DirectXHelper.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingOptions.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "StrHelper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
bool DeviceResources::Initialize(bool isForeground) noexcept {
|
||||||
|
#ifdef _DEBUG
|
||||||
|
UINT flag = DXGI_CREATE_FACTORY_DEBUG;
|
||||||
|
#else
|
||||||
|
UINT flag = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
HRESULT hr = CreateDXGIFactory2(flag, IID_PPV_ARGS(_dxgiFactory.put()));
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("CreateDXGIFactory2 失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 检查可变帧率支持
|
||||||
|
BOOL supportTearing = FALSE;
|
||||||
|
hr = _dxgiFactory->CheckFeatureSupport(DXGI_FEATURE_PRESENT_ALLOW_TEARING, &supportTearing, sizeof(supportTearing));
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
|
||||||
|
}
|
||||||
|
|
||||||
|
_isTearingSupported = supportTearing;
|
||||||
|
Logger::Get().Info(fmt::format("可变刷新率支持: {}", supportTearing ? "是" : "否"));
|
||||||
|
|
||||||
|
if (!_ObtainAdapterAndDevice(ScalingWindow::Get().Options().graphicsCardId, isForeground)) {
|
||||||
|
Logger::Get().Error("找不到可用的图形适配器");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a sampler state for the given filter/address-mode pair, creating and
// caching it in _samMap on first use. The same address mode is applied to U, V and W.
// Returns nullptr if the sampler cannot be created; the returned pointer is
// non-owning (the cache holds the reference).
ID3D11SamplerState* DeviceResources::GetSampler(D3D11_FILTER filterMode, D3D11_TEXTURE_ADDRESS_MODE addressMode) noexcept {
	const auto cacheKey = std::make_pair(filterMode, addressMode);

	// Fast path: already created
	if (const auto cached = _samMap.find(cacheKey); cached != _samMap.end()) {
		return cached->second.get();
	}

	D3D11_SAMPLER_DESC samplerDesc{};
	samplerDesc.Filter = filterMode;
	samplerDesc.AddressU = addressMode;
	samplerDesc.AddressV = addressMode;
	samplerDesc.AddressW = addressMode;
	samplerDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;

	winrt::com_ptr<ID3D11SamplerState> sampler;
	const HRESULT hr = _d3dDevice->CreateSamplerState(&samplerDesc, sampler.put());
	if (FAILED(hr)) {
		Logger::Get().ComError("创建 ID3D11SamplerState 出错", hr);
		return nullptr;
	}

	const auto insertion = _samMap.emplace(cacheKey, std::move(sampler));
	return insertion.first->second.get();
}
|
||||||
|
|
||||||
|
// Selects a graphics adapter and creates the D3D11 device on it.
//
// Order of attempts:
//   1. The adapter the user configured (graphicsCardId.idx >= 0): first at the saved
//      index; if that slot does not hold a card with the saved vendor/device ids,
//      every other adapter is checked for matching ids.
//   2. The first non-WARP adapter on which _TryCreateD3DDevice succeeds.
//   3. The WARP (software) adapter as a last resort.
//
// Returns true as soon as device creation succeeds on some adapter, false when
// every attempt fails.
bool DeviceResources::_ObtainAdapterAndDevice(GraphicsCardId graphicsCardId, bool isForeground) noexcept {
	winrt::com_ptr<IDXGIAdapter1> adapter;
	// Index of the configured adapter once device creation has failed on it, so the
	// generic search below does not try it a second time. -1 means "none".
	int failedIdx = -1;

	if (graphicsCardId.idx >= 0) {
		assert(graphicsCardId.vendorId != 0 && graphicsCardId.deviceId != 0);

		const auto isConfiguredCard = [&graphicsCardId](const DXGI_ADAPTER_DESC1& desc) noexcept {
			return desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId;
		};

		// Try the saved index first
		HRESULT hr = _dxgiFactory->EnumAdapters1(graphicsCardId.idx, adapter.put());
		if (SUCCEEDED(hr)) {
			DXGI_ADAPTER_DESC1 desc;
			hr = adapter->GetDesc1(&desc);
			if (SUCCEEDED(hr)) {
				if (isConfiguredCard(desc)) {
					if (_TryCreateD3DDevice(adapter, isForeground)) {
						return true;
					}

					failedIdx = graphicsCardId.idx;
					Logger::Get().Warn("用户指定的显示卡不支持 FL 11");
				} else {
					Logger::Get().Warn("显卡配置已变化");
				}
			}
		}

		// Skip the search when the configured card was found but is unusable
		if (failedIdx == -1) {
			// Look for an adapter whose vendor and device ids match the configured card
			for (UINT adapterIdx = 0;
				SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
				++adapterIdx
			) {
				if (static_cast<int>(adapterIdx) == graphicsCardId.idx) {
					// The saved index was already examined above
					continue;
				}

				DXGI_ADAPTER_DESC1 desc;
				hr = adapter->GetDesc1(&desc);
				if (FAILED(hr)) {
					continue;
				}

				if (isConfiguredCard(desc)) {
					if (_TryCreateD3DDevice(adapter, isForeground)) {
						return true;
					}

					failedIdx = static_cast<int>(adapterIdx);
					Logger::Get().Warn("用户指定的显示卡不支持 FL11");
					break;
				}
			}
		}
	}

	// Fall back to the first non-WARP adapter that can create a device
	for (UINT adapterIdx = 0;
		SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
		++adapterIdx
	) {
		if (static_cast<int>(adapterIdx) == failedIdx) {
			// Already known to fail
			continue;
		}

		DXGI_ADAPTER_DESC1 desc;
		const HRESULT hr = adapter->GetDesc1(&desc);
		if (FAILED(hr) || DirectXHelper::IsWARP(desc)) {
			continue;
		}

		if (_TryCreateD3DDevice(adapter, isForeground)) {
			return true;
		}
	}

	// Last resort: CPU rendering through WARP
	// https://docs.microsoft.com/en-us/windows/win32/direct3darticles/directx-warp
	const HRESULT hr = _dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
	if (FAILED(hr)) {
		Logger::Get().ComError("EnumWarpAdapter 失败", hr);
		return false;
	}

	if (!_TryCreateD3DDevice(adapter, isForeground)) {
		// hr still holds the successful EnumWarpAdapter result here, so it must not
		// be reported as the failure code; _TryCreateD3DDevice logs the real cause.
		Logger::Get().Error("创建 WARP 设备失败");
		return false;
	}

	return true;
}
|
||||||
|
|
||||||
|
// Tries to create a D3D11 device (feature level 11.1 or 11.0) on `adapter` and, on
// success, stores the device, its immediate context and the adapter in this object.
//
// The members are only assigned once every required interface (ID3D11Device5,
// ID3D11DeviceContext4, IDXGIAdapter4) has been obtained, so a failed attempt never
// leaves this object pointing at a rejected adapter.
//
// Also records FP16 (minimum precision) shader support in _isFP16Supported.
// Returns false if device creation or any interface query fails.
bool DeviceResources::_TryCreateD3DDevice(const winrt::com_ptr<IDXGIAdapter1>& adapter, bool isForeground) noexcept {
	D3D_FEATURE_LEVEL featureLevels[] = {
		D3D_FEATURE_LEVEL_11_1,
		D3D_FEATURE_LEVEL_11_0
	};
	const UINT nFeatureLevels = ARRAYSIZE(featureLevels);

	UINT createDeviceFlags = D3D11_CREATE_DEVICE_BGRA_SUPPORT;
	// Enable the debug layer when it is available (debug builds only)
	if (DirectXHelper::IsDebugLayersAvailable()) {
		createDeviceFlags |= D3D11_CREATE_DEVICE_DEBUG;
	}
	// Windows Graphics Capture is incompatible with D3D11_CREATE_DEVICE_SINGLETHREADED
	if (isForeground || ScalingWindow::Get().Options().captureMethod != CaptureMethod::GraphicsCapture) {
		createDeviceFlags |= D3D11_CREATE_DEVICE_SINGLETHREADED;
	}
#ifdef MP_USE_COMPSWAPCHAIN
	if (isForeground) {
		// The documentation says composition swapchains are incompatible with driver
		// internal threads and that creating IPresentationFactory fails without this
		// flag. Testing on Win11 24H2 showed it working without the flag, so the
		// documentation may be outdated; the flag is set anyway to be safe.
		createDeviceFlags |= D3D11_CREATE_DEVICE_PREVENT_INTERNAL_THREADING_OPTIMIZATIONS;
	}
#endif

	winrt::com_ptr<ID3D11Device> d3dDevice;
	winrt::com_ptr<ID3D11DeviceContext> d3dDC;
	D3D_FEATURE_LEVEL featureLevel;
	HRESULT hr = D3D11CreateDevice(
		adapter.get(),
		D3D_DRIVER_TYPE_UNKNOWN,
		nullptr,
		createDeviceFlags,
		featureLevels,
		nFeatureLevels,
		D3D11_SDK_VERSION,
		d3dDevice.put(),
		&featureLevel,
		d3dDC.put()
	);

	if (FAILED(hr)) {
		Logger::Get().ComError("D3D11CreateDevice 失败", hr);
		return false;
	}

	std::string_view fl;
	switch (featureLevel) {
	case D3D_FEATURE_LEVEL_11_1:
		fl = "11.1";
		break;
	case D3D_FEATURE_LEVEL_11_0:
		fl = "11.0";
		break;
	default:
		fl = "未知";
		break;
	}
	Logger::Get().Info(fmt::format("已创建 D3D 设备\n\t功能级别: {}", fl));

	// Query every required interface into locals first; commit only if all succeed.
	winrt::com_ptr<ID3D11Device5> device5 = d3dDevice.try_as<ID3D11Device5>();
	if (!device5) {
		Logger::Get().Error("获取 ID3D11Device5 失败");
		return false;
	}

	winrt::com_ptr<ID3D11DeviceContext4> context4 = d3dDC.try_as<ID3D11DeviceContext4>();
	if (!context4) {
		Logger::Get().Error("获取 ID3D11DeviceContext4 失败");
		return false;
	}

	winrt::com_ptr<IDXGIAdapter4> adapter4 = adapter.try_as<IDXGIAdapter4>();
	if (!adapter4) {
		Logger::Get().Error("获取 IDXGIAdapter4 失败");
		return false;
	}

	_d3dDevice = std::move(device5);
	_d3dDC = std::move(context4);
	_graphicsAdapter = std::move(adapter4);

	// Check half-precision (16-bit minimum precision) shader support.
	// A failed query is logged but does not fail device creation.
	D3D11_FEATURE_DATA_SHADER_MIN_PRECISION_SUPPORT value;
	hr = d3dDevice->CheckFeatureSupport(D3D11_FEATURE_SHADER_MIN_PRECISION_SUPPORT, &value, sizeof(value));
	if (SUCCEEDED(hr)) {
		_isFP16Supported = (value.AllOtherShaderStagesMinPrecision & D3D11_SHADER_MIN_PRECISION_16_BIT) != 0;
		Logger::Get().Info(StrHelper::Concat("FP16 支持: ", _isFP16Supported ? "是" : "否"));
	} else {
		Logger::Get().ComError("CheckFeatureSupport 失败", hr);
	}

	return true;
}
|
||||||
|
|
||||||
|
}
|
||||||
43
src/Magpie.Core/DeviceResources.h
Normal file
43
src/Magpie.Core/DeviceResources.h
Normal file
|
|
@ -0,0 +1,43 @@
|
||||||
|
#pragma once
|
||||||
|
#include "ScalingOptions.h"
|
||||||
|
#include <parallel_hashmap/phmap.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class DeviceResources {
|
||||||
|
public:
|
||||||
|
DeviceResources() = default;
|
||||||
|
DeviceResources(const DeviceResources&) = delete;
|
||||||
|
DeviceResources(DeviceResources&&) = default;
|
||||||
|
|
||||||
|
bool Initialize(bool isForeground) noexcept;
|
||||||
|
|
||||||
|
IDXGIFactory7* GetDXGIFactory() const noexcept { return _dxgiFactory.get(); }
|
||||||
|
ID3D11Device5* GetD3DDevice() const noexcept { return _d3dDevice.get(); }
|
||||||
|
ID3D11DeviceContext4* GetD3DDC() const noexcept { return _d3dDC.get(); }
|
||||||
|
IDXGIAdapter4* GetGraphicsAdapter() const noexcept { return _graphicsAdapter.get(); }
|
||||||
|
|
||||||
|
bool IsTearingSupported() const noexcept { return _isTearingSupported; }
|
||||||
|
bool IsFP16Supported() const noexcept { return _isFP16Supported; }
|
||||||
|
|
||||||
|
ID3D11SamplerState* GetSampler(D3D11_FILTER filterMode, D3D11_TEXTURE_ADDRESS_MODE addressMode) noexcept;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _ObtainAdapterAndDevice(GraphicsCardId graphicsCardId, bool isForeground) noexcept;
|
||||||
|
bool _TryCreateD3DDevice(const winrt::com_ptr<IDXGIAdapter1>& adapter, bool isForeground) noexcept;
|
||||||
|
|
||||||
|
winrt::com_ptr<IDXGIFactory7> _dxgiFactory;
|
||||||
|
winrt::com_ptr<IDXGIAdapter4> _graphicsAdapter;
|
||||||
|
winrt::com_ptr<ID3D11Device5> _d3dDevice;
|
||||||
|
winrt::com_ptr<ID3D11DeviceContext4> _d3dDC;
|
||||||
|
|
||||||
|
phmap::flat_hash_map<
|
||||||
|
std::pair<D3D11_FILTER, D3D11_TEXTURE_ADDRESS_MODE>,
|
||||||
|
winrt::com_ptr<ID3D11SamplerState>
|
||||||
|
> _samMap;
|
||||||
|
|
||||||
|
bool _isTearingSupported = false;
|
||||||
|
bool _isFP16Supported = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
110
src/Magpie.Core/DirectXHelper.cpp
Normal file
110
src/Magpie.Core/DirectXHelper.cpp
Normal file
|
|
@ -0,0 +1,110 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "DirectXHelper.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "StrHelper.h"
|
||||||
|
#include <d3dcompiler.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
bool DirectXHelper::CompileComputeShader(
|
||||||
|
std::string_view hlsl,
|
||||||
|
const char* entryPoint,
|
||||||
|
ID3DBlob** blob,
|
||||||
|
const char* sourceName,
|
||||||
|
ID3DInclude* include,
|
||||||
|
const std::vector<std::pair<std::string, std::string>>& macros,
|
||||||
|
bool warningsAreErrors
|
||||||
|
) {
|
||||||
|
winrt::com_ptr<ID3DBlob> errorMsgs = nullptr;
|
||||||
|
|
||||||
|
UINT flags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_ALL_RESOURCES_BOUND;
|
||||||
|
if (warningsAreErrors) {
|
||||||
|
flags |= D3DCOMPILE_WARNINGS_ARE_ERRORS;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _DEBUG
|
||||||
|
flags |= D3DCOMPILE_SKIP_OPTIMIZATION | D3DCOMPILE_DEBUG;
|
||||||
|
#else
|
||||||
|
flags |= D3DCOMPILE_OPTIMIZATION_LEVEL3;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::unique_ptr<D3D_SHADER_MACRO[]> mc(new D3D_SHADER_MACRO[macros.size() + 1]);
|
||||||
|
for (UINT i = 0; i < macros.size(); ++i) {
|
||||||
|
mc[i] = { macros[i].first.c_str(), macros[i].second.c_str() };
|
||||||
|
}
|
||||||
|
mc[macros.size()] = { nullptr,nullptr };
|
||||||
|
|
||||||
|
HRESULT hr = D3DCompile(hlsl.data(), hlsl.size(), sourceName, mc.get(), include,
|
||||||
|
entryPoint, "cs_5_0", flags, 0, blob, errorMsgs.put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
if (errorMsgs) {
|
||||||
|
Logger::Get().ComError(StrHelper::Concat("编译计算着色器失败: ", (const char*)errorMsgs->GetBufferPointer()), hr);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 警告消息
|
||||||
|
if (errorMsgs) {
|
||||||
|
Logger::Get().Warn(StrHelper::Concat("编译计算着色器时产生警告: ", (const char*)errorMsgs->GetBufferPointer()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reports whether a D3D11 device can be created with D3D11_CREATE_DEVICE_DEBUG.
// In debug builds the probe runs once and its result is reused on later calls;
// in all other builds this always returns false.
bool DirectXHelper::IsDebugLayersAvailable() noexcept {
#ifdef _DEBUG
	static bool available = [] {
		const HRESULT probe = D3D11CreateDevice(
			nullptr,
			D3D_DRIVER_TYPE_NULL,       // There is no need to create a real hardware device.
			nullptr,
			D3D11_CREATE_DEVICE_DEBUG,  // Check for the SDK layers.
			nullptr,                    // Any feature level will do.
			0,
			D3D11_SDK_VERSION,
			nullptr,                    // No need to keep the D3D device reference.
			nullptr,                    // No need to know the feature level.
			nullptr                     // No need to keep the D3D device context reference.
		);
		return SUCCEEDED(probe);
	}();
	return available;
#else
	// Release configurations never use the debug layer
	return false;
#endif
}
|
||||||
|
|
||||||
|
// Creates a 2D texture with a single mip level, a single array slice and no
// multisampling. CPU access flags are left at zero.
// Returns a null com_ptr (after logging) when creation fails.
winrt::com_ptr<ID3D11Texture2D> DirectXHelper::CreateTexture2D(
	ID3D11Device* d3dDevice,
	DXGI_FORMAT format,
	UINT width,
	UINT height,
	UINT bindFlags,
	D3D11_USAGE usage,
	UINT miscFlags,
	const D3D11_SUBRESOURCE_DATA* pInitialData
) noexcept {
	// Zero-initialize, then fill in only the fields that matter
	D3D11_TEXTURE2D_DESC textureDesc{};
	textureDesc.Width = width;
	textureDesc.Height = height;
	textureDesc.MipLevels = 1;
	textureDesc.ArraySize = 1;
	textureDesc.Format = format;
	textureDesc.SampleDesc.Count = 1;
	textureDesc.SampleDesc.Quality = 0;
	textureDesc.Usage = usage;
	textureDesc.BindFlags = bindFlags;
	textureDesc.MiscFlags = miscFlags;

	winrt::com_ptr<ID3D11Texture2D> texture;
	const HRESULT hr = d3dDevice->CreateTexture2D(&textureDesc, pInitialData, texture.put());
	if (SUCCEEDED(hr)) {
		return texture;
	}

	Logger::Get().ComError("CreateTexture2D 失败", hr);
	return nullptr;
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,362 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "DirtyRectsOptimizer.h"
|
|
||||||
#include "DebugInfo.h"
|
|
||||||
#include "RectHelper.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// True when point p lies inside r or on any of its four edges (all bounds inclusive).
static bool IsCornerInRect(PointU p, const RectU& r) noexcept {
	const bool withinColumns = r.left <= p.x && p.x <= r.right;
	const bool withinRows = r.top <= p.y && p.y <= r.bottom;
	return withinColumns && withinRows;
}
|
|
||||||
|
|
||||||
// Tries to simplify a pair of dirty rects in place. Depending on how many corners of
// rect2 lie inside rect1 (edges inclusive):
//   4 corners - rect2 is covered by rect1 and is discarded;
//   2 corners - one edge of rect2 is inside rect1: rect2 is merged into rect1 when
//               the two spans on the other axis are identical, otherwise rect2 is
//               clipped to the part outside rect1;
//   0 or 1    - the roles are swapped once (reversed == true marks the retry); if that
//               also finds nothing and no corner is inside, rects that are aligned on
//               one axis and separated by a small gap are merged.
// A discarded rect is marked by collapsing its width to zero (right = left).
// Returns true when either rect was modified.
static bool OptimizeDirtyRectPair(RectU& rect1, RectU& rect2, bool reversed = false) noexcept {
	if (RectHelper::IsEmpty(rect1) || RectHelper::IsEmpty(rect2)) {
		return false;
	}

	// Count how many corners of rect2 are inside rect1
	const bool topLeftIn = IsCornerInRect(PointU{ rect2.left, rect2.top }, rect1);
	const bool topRightIn = IsCornerInRect(PointU{ rect2.right, rect2.top }, rect1);
	const bool bottomRightIn = IsCornerInRect(PointU{ rect2.right, rect2.bottom }, rect1);
	const bool bottomLeftIn = IsCornerInRect(PointU{ rect2.left, rect2.bottom }, rect1);
	const uint32_t cornersInside =
		(uint32_t)topLeftIn + (uint32_t)topRightIn + (uint32_t)bottomRightIn + (uint32_t)bottomLeftIn;

	// Evaluated before any modification below
	const bool sameRows = rect1.top == rect2.top && rect1.bottom == rect2.bottom;
	const bool sameColumns = rect1.left == rect2.left && rect1.right == rect2.right;

	const auto discardRect2 = [&rect2]() noexcept {
		rect2.right = rect2.left;
	};

	if (cornersInside == 4) {
		// rect2 lies inside rect1
		discardRect2();
		return true;
	}

	if (cornersInside == 2) {
		// With two corners inside, rect2 can be merged or clipped
		if (topLeftIn && topRightIn) {
			// Top edge of rect2 is inside rect1
			if (sameColumns) {
				// Merge rect2 into rect1
				rect1.bottom = rect2.bottom;
				discardRect2();
				return true;
			}
			if (rect2.top != rect1.bottom) {
				// Clip rect2
				rect2.top = rect1.bottom;
				assert(rect2.bottom >= rect2.top);
				return true;
			}
			return false;
		}

		if (topLeftIn) {
			// Left edge of rect2 is inside rect1
			assert(bottomLeftIn);
			if (sameRows) {
				rect1.right = rect2.right;
				discardRect2();
				return true;
			}
			if (rect2.left != rect1.right) {
				rect2.left = rect1.right;
				assert(rect2.right >= rect2.left);
				return true;
			}
			return false;
		}

		assert(bottomRightIn);
		if (topRightIn) {
			// Right edge of rect2 is inside rect1
			if (sameRows) {
				rect1.left = rect2.left;
				discardRect2();
				return true;
			}
			if (rect2.right != rect1.left) {
				rect2.right = rect1.left;
				assert(rect2.right >= rect2.left);
				return true;
			}
			return false;
		}

		// Bottom edge of rect2 is inside rect1
		if (sameColumns) {
			rect1.top = rect2.top;
			discardRect2();
			return true;
		}
		if (rect2.bottom != rect1.top) {
			rect2.bottom = rect1.top;
			assert(rect2.bottom >= rect2.top);
			return true;
		}
		return false;
	}

	if (cornersInside > 1) {
		// Any remaining corner count is left untouched
		return false;
	}

	if (!reversed) {
		// Try again with the roles swapped
		return OptimizeDirtyRectPair(rect2, rect1, true);
	}

	if (cornersInside != 0) {
		return false;
	}

	// Merge across small gaps too: duplicate-frame detection works on 16x16 blocks,
	// and copying the few extra pixels is cheap thanks to the texture cache.
	constexpr uint32_t MERGE_THRESHOLD = DUP_FRAME_DISPATCH_BLOCK_SIZE / 2;

	if (sameRows) {
		if (rect1.right < rect2.left) {
			if (rect1.right + MERGE_THRESHOLD >= rect2.left) {
				// Merge rect2 into rect1
				rect1.right = rect2.right;
				discardRect2();
				return true;
			}
		} else {
			assert(rect1.left > rect2.right);
			if (rect2.right + MERGE_THRESHOLD >= rect1.left) {
				rect1.left = rect2.left;
				discardRect2();
				return true;
			}
		}
	} else if (sameColumns) {
		if (rect1.bottom < rect2.top) {
			if (rect1.bottom + MERGE_THRESHOLD >= rect2.top) {
				rect1.bottom = rect2.bottom;
				discardRect2();
				return true;
			}
		} else {
			assert(rect1.top > rect2.bottom);
			if (rect2.bottom + MERGE_THRESHOLD >= rect1.top) {
				rect1.top = rect2.top;
				discardRect2();
				return true;
			}
		}
	}

	return false;
}
|
|
||||||
|
|
||||||
// Repeatedly applies OptimizeDirtyRectPair to every pair of rects and removes the
// rects that became empty, until a full pass changes nothing.
// Expects a non-empty list (asserted).
static void BasicOptimize(SmallVectorImpl<RectU>& dirtyRects) noexcept {
	for (;;) {
		const uint32_t rectTotal = (uint32_t)dirtyRects.size();
		assert(rectTotal > 0);

		bool anyChanged = false;
		for (uint32_t first = 0; first < rectTotal; ++first) {
			for (uint32_t second = first + 1; second < rectTotal; ++second) {
				if (OptimizeDirtyRectPair(dirtyRects[first], dirtyRects[second])) {
					anyChanged = true;
				}
			}
		}

		if (!anyChanged) {
			return;
		}

		// Remove empty rects, walking from the back towards the front
		for (uint32_t idx = rectTotal; idx-- > 0;) {
			if (RectHelper::IsEmpty(dirtyRects[idx])) {
				dirtyRects.erase(dirtyRects.begin() + idx);
			}
		}
	}
}
|
|
||||||
|
|
||||||
static uint32_t CalcTotalPixels(const SmallVectorImpl<RectU>& rects) noexcept {
|
|
||||||
uint32_t result = 0;
|
|
||||||
for (const RectU& rect : rects) {
|
|
||||||
result += RectHelper::CalcArea(rect);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef MP_DEBUG_INFO
|
|
||||||
// 验证优化算法的正确性
|
|
||||||
static void ValidateOptimize(
|
|
||||||
const SmallVectorImpl<RectU>& originRects,
|
|
||||||
const SmallVectorImpl<RectU>& newRects
|
|
||||||
) noexcept {
|
|
||||||
if (originRects.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<bool> pixels;
|
|
||||||
for (const RectU& originRect : originRects) {
|
|
||||||
// 作为优化先检查有没有被优化后的某个矩形包含
|
|
||||||
bool contained = false;
|
|
||||||
for (const RectU& newRect : newRects) {
|
|
||||||
if (RectHelper::Contains(newRect, originRect)) {
|
|
||||||
contained = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (contained) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 可能被多个矩形共同包含,需要逐像素检查
|
|
||||||
const uint32_t originWidth = originRect.right - originRect.left;
|
|
||||||
pixels.assign(size_t((originRect.bottom - originRect.top) * originWidth), false);
|
|
||||||
|
|
||||||
for (RectU newRect : newRects) {
|
|
||||||
if (!RectHelper::Intersect(newRect, newRect, originRect)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (uint32_t i = newRect.top; i < newRect.bottom; ++i) {
|
|
||||||
uint32_t start = (i - originRect.top) * originWidth;
|
|
||||||
std::fill(pixels.begin() + size_t(start + newRect.left - originRect.left),
|
|
||||||
pixels.begin() + size_t(start + newRect.right - originRect.left), true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (std::find(pixels.begin(), pixels.end(), false) != pixels.end()) {
|
|
||||||
OutputDebugString(L"优化脏矩形算法错误!\n");
|
|
||||||
// 打印脏矩形供调试
|
|
||||||
for (const RectU& rect : originRects) {
|
|
||||||
OutputDebugString(fmt::format(L"{},{},{},{}\n",
|
|
||||||
rect.left, rect.top, rect.right, rect.bottom).c_str());
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Reduces the number of dirty rects and the total number of pixels they cover.
//
// Steps:
//   1. Cheap pairwise optimization (BasicOptimize) when the input is not too large.
//   2. If more than DEEP_OPTIMIZE_LIMIT rects remain, the surplus is unioned into the
//      last kept rect.
//   3. Greedy loop: evaluate merging every eligible pair, apply the merge that gives
//      the fewest total pixels (ties: fewer rects), and repeat. The loop stops when a
//      single rect remains, or when no merge avoids increasing the pixel count and
//      the rect count is already within MAX_CAPTURE_DIRTY_RECT_COUNT.
void DirtyRectsOptimizer::Execute(SmallVectorImpl<RectU>& dirtyRects) noexcept {
	uint32_t rectCount = (uint32_t)dirtyRects.size();
	if (rectCount <= 1) {
		return;
	}

#ifdef MP_DEBUG_INFO
	// On exit, validate the result against a copy of the input (copied only when the
	// validation option is enabled; otherwise an empty list is passed).
	auto se = wil::scope_exit(std::bind_front(ValidateOptimize, DEBUG_INFO.validateDirtyRectsOptimizer ?
		SmallVector<RectU>(dirtyRects.begin(), dirtyRects.end()) : SmallVector<RectU>(), std::ref(dirtyRects)));
#endif

	if (rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT * 4) {
		BasicOptimize(dirtyRects);
		rectCount = (uint32_t)dirtyRects.size();
	}

	// The deep optimization below is O(n^4), so the input must be trimmed when there
	// are too many rects. Spending a lot of time on dirty rects is not worth it.
	constexpr uint32_t DEEP_OPTIMIZE_LIMIT = MAX_CAPTURE_DIRTY_RECT_COUNT * 2;
	if (rectCount > DEEP_OPTIMIZE_LIMIT) {
		RectU& lastKept = dirtyRects[DEEP_OPTIMIZE_LIMIT - 1];
		for (uint32_t idx = DEEP_OPTIMIZE_LIMIT; idx < rectCount; ++idx) {
			lastKept = RectHelper::Union(lastKept, dirtyRects[idx]);
		}
		dirtyRects.erase(dirtyRects.begin() + DEEP_OPTIMIZE_LIMIT, dirtyRects.end());

		BasicOptimize(dirtyRects);
		rectCount = (uint32_t)dirtyRects.size();
	}

	if (rectCount == 1) {
		return;
	}

	uint32_t totalPixels = CalcTotalPixels(dirtyRects);

	for (;;) {
		// Best merge found in this round
		uint32_t bestTotalPixels = std::numeric_limits<uint32_t>::max();
		uint32_t bestRectCount = 0;
		bool bestCanOptimize = false;
		uint32_t bestIdx1 = 0;
		uint32_t bestIdx2 = 0;

		// Evaluate every pairwise merge and keep the one with the fewest total pixels
		for (uint32_t i = 0; i < rectCount; ++i) {
			for (uint32_t j = i + 1; j < rectCount; ++j) {
				const RectU& rect1 = dirtyRects[i];
				const RectU& rect2 = dirtyRects[j];

				// Only overlapping rects can improve the result, but merging is
				// forced while there are too many dirty rects
				if (!RectHelper::IsOverlap(rect1, rect2) && rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT) {
					continue;
				}

				RectU merged = RectHelper::Union(rect1, rect2);
				uint32_t pixelsAfter = 0;
				uint32_t countAfter = 0;
				bool furtherOptimizable = false;

				// Only a single optimization round is simulated here instead of a full
				// optimization pass; this lowers the complexity and avoids heap allocation
				for (uint32_t k = 0; k < rectCount; ++k) {
					if (k == i || k == j) {
						continue;
					}

					RectU other = dirtyRects[k];
					if (OptimizeDirtyRectPair(other, merged)) {
						furtherOptimizable = true;
					}

					if (!RectHelper::IsEmpty(other)) {
						pixelsAfter += RectHelper::CalcArea(other);
						++countAfter;
					}
				}

				if (!RectHelper::IsEmpty(merged)) {
					pixelsAfter += RectHelper::CalcArea(merged);
					++countAfter;
				}

				const bool fewerPixels = pixelsAfter < bestTotalPixels;
				const bool samePixelsFewerRects =
					pixelsAfter == bestTotalPixels && countAfter < bestRectCount;
				if (fewerPixels || samePixelsFewerRects) {
					bestTotalPixels = pixelsAfter;
					bestRectCount = countAfter;
					bestCanOptimize = furtherOptimizable;
					bestIdx1 = i;
					bestIdx2 = j;
				}
			}
		}

		// An equal pixel count is still accepted because the rect count goes down
		if (bestTotalPixels > totalPixels && rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT) {
			return;
		}

		assert(bestIdx1 < bestIdx2);
		dirtyRects[bestIdx1] = RectHelper::Union(dirtyRects[bestIdx1], dirtyRects[bestIdx2]);
		dirtyRects.erase(dirtyRects.begin() + bestIdx2);

		if (bestCanOptimize) {
			BasicOptimize(dirtyRects);
			totalPixels = CalcTotalPixels(dirtyRects);
		} else {
			totalPixels = bestTotalPixels;
		}

		rectCount = (uint32_t)dirtyRects.size();
		if (rectCount == 1) {
			return;
		}
	}
}
|
|
||||||
|
|
||||||
#ifdef _DEBUG
|
|
||||||
// Debug-build self-test for BasicOptimize, executed during static initialization.
static Ignore _ = [] {
	// Lexicographic order on (left, top, right, bottom) so results compare deterministically
	const auto byCoordinates = [](const RectU& lhs, const RectU& rhs) {
		const auto lhsKey = std::tuple(lhs.left, lhs.top, lhs.right, lhs.bottom);
		const auto rhsKey = std::tuple(rhs.left, rhs.top, rhs.right, rhs.bottom);
		return lhsKey < rhsKey;
	};

	SmallVector<RectU, 0> dirtyRects;
	dirtyRects.reserve(16);

	// Case 1: five overlapping rects collapse into two
	dirtyRects.emplace_back(0, 0, 2, 2);
	dirtyRects.emplace_back(1, 1, 3, 4);
	dirtyRects.emplace_back(2, 1, 4, 3);
	dirtyRects.emplace_back(0, 1, 3, 2);
	dirtyRects.emplace_back(3, 3, 4, 4);
	BasicOptimize(dirtyRects);
	std::sort(dirtyRects.begin(), dirtyRects.end(), byCoordinates);
	assert(dirtyRects.size() == 2);
	assert((dirtyRects[0] == RectU{ 0, 0, 2, 2 }));
	assert((dirtyRects[1] == RectU{ 1, 1, 4, 4 }));

	// Case 2: a rect contained in another one is removed
	dirtyRects.clear();
	dirtyRects.emplace_back(0, 0, 1, 1);
	dirtyRects.emplace_back(0, 0, 2, 2);
	BasicOptimize(dirtyRects);
	assert(dirtyRects.size() == 1);
	assert((dirtyRects[0] == RectU{ 0, 0, 2, 2 }));

	return Ignore();
}();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include <SmallVector.h>
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
struct DirtyRectsOptimizer {
|
|
||||||
// 尝试减少脏矩形数量和总像素数
|
|
||||||
static void Execute(SmallVectorImpl<RectU>& dirtyRects) noexcept;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,339 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "DuplicateFrameChecker.h"
|
|
||||||
#include "DebugInfo.h"
|
|
||||||
#include "DirectXHelper.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
#include "ScalingWindow.h"
|
|
||||||
#include "shaders/DuplicateFrameCS.h"
|
|
||||||
#include "shaders/DuplicateFrameCS_NoBoundsChecking.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// Initial value of the remaining-frames counter while checking is active
static constexpr uint16_t INITIAL_CHECK_COUNT{ 16 };
// Initial value of the skip count
static constexpr uint16_t INITIAL_SKIP_COUNT{ 1 };
// Upper bound for the skip count
static constexpr uint16_t MAX_SKIP_COUNT{ 16 };
|
|
||||||
|
|
||||||
// Starts with the initial skip count and the initial number of frames to check.
DuplicateFrameChecker::DuplicateFrameChecker() noexcept
	: _nextSkipCount(INITIAL_SKIP_COUNT)
	, _framesLeft(INITIAL_CHECK_COUNT)
{
}
|
|
||||||
|
|
||||||
// 使用 D3D11 而不是 D3D12 检查重复帧。有两个原因:
|
|
||||||
// 1. D3D11 支持 IDXGIDevice::SetGPUThreadPriority,可以提高 GPU 优先级,
|
|
||||||
// 而 D3D12 没有等价接口。
|
|
||||||
// 2. 对于小任务 D3D11 启动渲染的耗时比 D3D12 短,差距可以达到 50us 以上。
|
|
||||||
//
|
|
||||||
// 对于不支持脏矩形且捕获帧右下两边没有多余像素的捕获方式,可以禁用边界检查获得
|
|
||||||
// 性能提升。
|
|
||||||
bool DuplicateFrameChecker::Initialize(
|
|
||||||
ID3D11Device5* d3d11Device,
|
|
||||||
ID3D11DeviceContext4* d3d11DC,
|
|
||||||
const ColorInfo& colorInfo,
|
|
||||||
SizeU frameSize,
|
|
||||||
uint32_t captureFrameCount,
|
|
||||||
bool disableBoundsChecking
|
|
||||||
) noexcept {
|
|
||||||
assert(ScalingWindow::Get().Options().duplicateFrameDetectionMode !=
|
|
||||||
DuplicateFrameDetectionMode::Never);
|
|
||||||
|
|
||||||
_device = d3d11Device;
|
|
||||||
_deviceContext = d3d11DC;
|
|
||||||
_isScRGB = colorInfo.kind != winrt::AdvancedColorKind::StandardDynamicRange;
|
|
||||||
_frameSize = frameSize;
|
|
||||||
#ifdef _DEBUG
|
|
||||||
_isBoundsCheckingDisabled = disableBoundsChecking;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
_frameSrvs.resize(captureFrameCount);
|
|
||||||
|
|
||||||
HRESULT hr = d3d11Device->CreateComputeShader(
|
|
||||||
disableBoundsChecking ? DuplicateFrameCS_NoBoundsChecking : DuplicateFrameCS,
|
|
||||||
disableBoundsChecking ? sizeof(DuplicateFrameCS_NoBoundsChecking) : sizeof(DuplicateFrameCS),
|
|
||||||
nullptr,
|
|
||||||
_dupFrameCS.put()
|
|
||||||
);
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateComputeShader 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
D3D11_BUFFER_DESC desc = {
|
|
||||||
// CSSetConstantBuffers1 要求偏移量以 256 字节对齐
|
|
||||||
.ByteWidth = (MAX_CAPTURE_DIRTY_RECT_COUNT - 1) * 256 + 8 * sizeof(uint32_t),
|
|
||||||
.Usage = D3D11_USAGE_DYNAMIC,
|
|
||||||
.BindFlags = D3D11_BIND_CONSTANT_BUFFER,
|
|
||||||
.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
|
|
||||||
.StructureByteStride = desc.ByteWidth
|
|
||||||
};
|
|
||||||
hr = d3d11Device->CreateBuffer(&desc, nullptr, _constantBuffer.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateBuffer 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
desc.ByteWidth = MAX_CAPTURE_DIRTY_RECT_COUNT * sizeof(uint32_t);
|
|
||||||
desc.Usage = D3D11_USAGE_DEFAULT;
|
|
||||||
desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
|
|
||||||
desc.StructureByteStride = desc.ByteWidth;
|
|
||||||
hr = d3d11Device->CreateBuffer(&desc, nullptr, _resultBuffer.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateBuffer 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
desc.Usage = D3D11_USAGE_STAGING;
|
|
||||||
desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
|
|
||||||
desc.BindFlags = 0;
|
|
||||||
hr = d3d11Device->CreateBuffer(&desc, nullptr, _readBackBuffer.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateBuffer 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
D3D11_UNORDERED_ACCESS_VIEW_DESC desc = {
|
|
||||||
.Format = DXGI_FORMAT_R32_UINT,
|
|
||||||
.ViewDimension = D3D11_UAV_DIMENSION_BUFFER,
|
|
||||||
.Buffer = {
|
|
||||||
.NumElements = MAX_CAPTURE_DIRTY_RECT_COUNT
|
|
||||||
}
|
|
||||||
};
|
|
||||||
hr = d3d11Device->CreateUnorderedAccessView(_resultBuffer.get(), &desc, _resultBufferUav.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateUnorderedAccessView 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
D3D11_SAMPLER_DESC desc{
|
|
||||||
.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT,
|
|
||||||
.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP,
|
|
||||||
.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP,
|
|
||||||
.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP,
|
|
||||||
.ComparisonFunc = D3D11_COMPARISON_NEVER
|
|
||||||
};
|
|
||||||
hr = d3d11Device->CreateSamplerState(&desc, _sampler.put());
|
|
||||||
if (FAILED(hr)) {
|
|
||||||
Logger::Get().ComError("CreateSamplerState 失败", hr);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_deviceContext->CSSetShader(_dupFrameCS.get(), nullptr, 0);
|
|
||||||
|
|
||||||
{
|
|
||||||
ID3D11UnorderedAccessView* uav = _resultBufferUav.get();
|
|
||||||
_deviceContext->CSSetUnorderedAccessViews(0, 1, &uav, nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
ID3D11SamplerState* t = _sampler.get();
|
|
||||||
_deviceContext->CSSetSamplers(0, 1, &t);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Decides whether the dirty rectangles of a newly captured frame need to be compared
// against the previously adopted frame and, when they do, removes the rectangles whose
// content did not change.
//
// frameResource: texture of the captured frame; an SRV for it is created on first use and
//                cached in _frameSrvs[frameIdx].
// frameIdx:      index of the frame inside _frameSrvs.
// dirtyRects:    in/out list of dirty rectangles; unchanged rectangles are erased from it
//                whenever a comparison is actually performed.
//
// Returns S_OK, or the failing HRESULT of CreateShaderResourceView / _CheckDirtyRects.
HRESULT DuplicateFrameChecker::CheckFrame(
	ID3D11Texture2D* frameResource,
	uint32_t frameIdx,
	SmallVectorImpl<RectU>& dirtyRects
) noexcept {
	assert(!dirtyRects.empty());
	assert(dirtyRects.size() <= MAX_CAPTURE_DIRTY_RECT_COUNT);

#ifdef _DEBUG
	{
		D3D11_TEXTURE2D_DESC frameDesc;
		frameResource->GetDesc(&frameDesc);
		assert(frameDesc.Width == _frameSize.width);
		assert(frameDesc.Height == _frameSize.height);

		if (_isBoundsCheckingDisabled) {
			// Make sure the captured frame has no surplus pixels on its right and bottom edges.
			// NOTE(review): the comparison is an exact equality for every rectangle — confirm
			// this is intended rather than "<=".
			for (const RectU& rect : dirtyRects) {
				assert(rect.right == frameDesc.Width && rect.bottom == frameDesc.Height);
			}
		}
	}
#endif

	// Lazily create the shader resource view for this frame slot.
	winrt::com_ptr<ID3D11ShaderResourceView>& frameSrv = _frameSrvs[frameIdx];
	if (!frameSrv) {
		const HRESULT hr = _device->CreateShaderResourceView(frameResource, nullptr, frameSrv.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateShaderResourceView 失败", hr);
			return hr;
		}
	}

	// There is nothing to compare the very first frame against.
	if (_oldFrameIdx == std::numeric_limits<uint32_t>::max()) {
		return S_OK;
	}

	// Runs the GPU comparison on the given rectangles and logs a failure.
	const auto checkAndLog = [&](SmallVectorImpl<RectU>& rects) noexcept -> HRESULT {
		const HRESULT hr = _CheckDirtyRects(frameIdx, rects);
		if (FAILED(hr)) {
			Logger::Get().ComError("_CheckDirtyRects 失败", hr);
		}
		return hr;
	};

	if (ScalingWindow::Get().Options().duplicateFrameDetectionMode == DuplicateFrameDetectionMode::Always) {
		const HRESULT hr = checkAndLog(dirtyRects);
		if (FAILED(hr)) {
			return hr;
		}
		return S_OK;
	}

	// Dynamic duplicate frame detection, see #787.
	if (!_isCheckingForDuplicateFrame) {
		// Currently in a "skip" streak: the frame is assumed to be new.
		if (--_framesLeft == 0) {
			_isCheckingForDuplicateFrame = true;
			// The 2nd checking streak covers 10 frames, later streaks get shorter,
			// and from the 16th streak on only 2 consecutive frames are checked.
			const int checkCount = (-4 * (int)_nextSkipCount + 78) / 7;
			_framesLeft = uint32_t(checkCount);
		}

#ifdef MP_DEBUG_INFO
		if (DEBUG_INFO.enableStatisticsForDynamicDuplicateFrameDetection) {
			// This frame was predicted not to be a duplicate; verify the prediction on a copy
			// of the rectangles so the caller's list is left untouched.
			SmallVector<RectU> tempRects(dirtyRects.begin(), dirtyRects.end());
			const HRESULT hr = checkAndLog(tempRects);
			if (FAILED(hr)) {
				return hr;
			}

			auto lk = DEBUG_INFO.lock.lock_exclusive();
			++DEBUG_INFO.ddfdSkippedFrameCount;
			if (tempRects.empty()) {
				++DEBUG_INFO.ddfdWrongPredictionCount;
			}
		}
#endif

		return S_OK;
	}

	// Currently in a "check" streak.
	if (--_framesLeft == 0) {
		_isCheckingForDuplicateFrame = false;
		_framesLeft = _nextSkipCount;
		if (_nextSkipCount < MAX_SKIP_COUNT) {
			// Lengthen the next streak of skipped checks.
			++_nextSkipCount;
		}
	}

	const HRESULT hr = checkAndLog(dirtyRects);
	if (FAILED(hr)) {
		return hr;
	}

	if (dirtyRects.empty()) {
		// A duplicate frame was found: restart with the initial checking schedule.
		_isCheckingForDuplicateFrame = true;
		_framesLeft = INITIAL_CHECK_COUNT;
		_nextSkipCount = INITIAL_SKIP_COUNT;
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
void DuplicateFrameChecker::OnFrameAdopted(uint32_t frameIdx) noexcept {
|
|
||||||
_oldFrameIdx = frameIdx;
|
|
||||||
}
|
|
||||||
|
|
||||||
void DuplicateFrameChecker::OnCaptureStopped() noexcept {
|
|
||||||
_oldFrameIdx = std::numeric_limits<uint32_t>::max();
|
|
||||||
std::fill(_frameSrvs.begin(), _frameSrvs.end(), nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compares the reference frame (_oldFrameIdx) with newFrameIdx inside every dirty
// rectangle using the compute shader and erases the rectangles whose result slot was not
// stamped with the current target value, i.e. whose content did not change.
//
// Returns S_OK, or the HRESULT of a failed ID3D11DeviceContext::Map.
HRESULT DuplicateFrameChecker::_CheckDirtyRects(
	uint32_t newFrameIdx,
	SmallVectorImpl<RectU>& dirtyRects
) noexcept {
	// Each rectangle gets its own constant-buffer slot. CSSetConstantBuffers1 requires the
	// offset to be aligned to 256 bytes, which equals 16 shader constants of 16 bytes.
	constexpr uint32_t SLOT_SIZE_IN_BYTES = 256;
	constexpr UINT SLOT_SIZE_IN_CONSTANTS = 16;

	assert(dirtyRects.size() <= MAX_CAPTURE_DIRTY_RECT_COUNT);
	assert(_frameSrvs[_oldFrameIdx] && _frameSrvs[newFrameIdx]);

	ID3D11ShaderResourceView* frameSrvs[]{
		_frameSrvs[_oldFrameIdx].get(),
		_frameSrvs[newFrameIdx].get()
	};
	_deviceContext->CSSetShaderResources(0, 2, frameSrvs);

	const uint32_t rectCount = (uint32_t)dirtyRects.size();

	D3D11_MAPPED_SUBRESOURCE mapped;
	HRESULT hr = _deviceContext->Map(_constantBuffer.get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped);
	if (FAILED(hr)) {
		Logger::Get().ComError("ID3D11DeviceContext::Map 失败", hr);
		return hr;
	}

	// A fresh target value per call; result slots are compared against it after readback.
	++_curTargetValue;

	uint8_t* const slotBase = (uint8_t*)mapped.pData;
	for (uint32_t rectIdx = 0; rectIdx < rectCount; ++rectIdx) {
		const RectU& rect = dirtyRects[rectIdx];

		alignas(32) DirectXHelper::Constant32 constants[] = {
			{.uintVal = rect.left},
			{.uintVal = rect.top},
			{.uintVal = rect.right},
			{.uintVal = rect.bottom},
			{.floatVal = 1.0f / _frameSize.width},
			{.floatVal = 1.0f / _frameSize.height},
			{.uintVal = _curTargetValue},
			{.uintVal = rectIdx}
		};
		std::memcpy(slotBase + rectIdx * SLOT_SIZE_IN_BYTES, constants, sizeof(constants));
	}

	_deviceContext->Unmap(_constantBuffer.get(), 0);

	// One dispatch per rectangle, each bound to its own constant-buffer slot.
	for (uint32_t rectIdx = 0; rectIdx < rectCount; ++rectIdx) {
		ID3D11Buffer* constantBuffer = _constantBuffer.get();
		UINT firstConstant = rectIdx * SLOT_SIZE_IN_CONSTANTS;
		UINT numConstants = SLOT_SIZE_IN_CONSTANTS;
		_deviceContext->CSSetConstantBuffers1(0, 1, &constantBuffer, &firstConstant, &numConstants);

		const RectU& rect = dirtyRects[rectIdx];
		const auto rectWidth = rect.right - rect.left;
		const auto rectHeight = rect.bottom - rect.top;
		_deviceContext->Dispatch(
			(rectWidth + DUP_FRAME_DISPATCH_BLOCK_SIZE - 1) / DUP_FRAME_DISPATCH_BLOCK_SIZE,
			(rectHeight + DUP_FRAME_DISPATCH_BLOCK_SIZE - 1) / DUP_FRAME_DISPATCH_BLOCK_SIZE,
			1
		);
	}

	// Copy one 4-byte result per rectangle into the read-back buffer.
	const D3D11_BOX resultRegion = {
		.right = rectCount * 4,
		.bottom = 1,
		.back = 1
	};
	_deviceContext->CopySubresourceRegion(
		_readBackBuffer.get(), 0, 0, 0, 0, _resultBuffer.get(), 0, &resultRegion);

	// Read the results back.
	hr = _deviceContext->Map(_readBackBuffer.get(), 0, D3D11_MAP_READ, 0, &mapped);
	if (FAILED(hr)) {
		Logger::Get().ComError("ID3D11DeviceContext::Map 失败", hr);
		return hr;
	}

	// Indices of unchanged rectangles, collected in ascending order.
	SmallVector<uint32_t, 4> unchangedRects;
	const uint32_t* const results = (const uint32_t*)mapped.pData;
	for (uint32_t rectIdx = 0; rectIdx < rectCount; ++rectIdx) {
		if (results[rectIdx] != _curTargetValue) {
			// Nothing changed inside this rectangle.
			unchangedRects.push_back(rectIdx);
		}
	}

	_deviceContext->Unmap(_readBackBuffer.get(), 0);

	// Erase from back to front so the remaining indices stay valid.
	for (size_t k = unchangedRects.size(); k-- > 0;) {
		dirtyRects.erase(dirtyRects.begin() + unchangedRects[k]);
	}

	return S_OK;
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,64 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include "SmallVector.h"
|
|
||||||
#include <d3d11_4.h>
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class DuplicateFrameChecker {
|
|
||||||
public:
|
|
||||||
DuplicateFrameChecker() noexcept;
|
|
||||||
DuplicateFrameChecker(const DuplicateFrameChecker&) = delete;
|
|
||||||
DuplicateFrameChecker(DuplicateFrameChecker&&) = delete;
|
|
||||||
|
|
||||||
~DuplicateFrameChecker() = default;
|
|
||||||
|
|
||||||
bool Initialize(
|
|
||||||
ID3D11Device5* d3d11Device,
|
|
||||||
ID3D11DeviceContext4* d3d11DC,
|
|
||||||
const ColorInfo& colorInfo,
|
|
||||||
SizeU frameSize,
|
|
||||||
uint32_t captureFrameCount,
|
|
||||||
bool disableBoundsChecking
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
HRESULT CheckFrame(
|
|
||||||
ID3D11Texture2D* frameResource,
|
|
||||||
uint32_t frameIdx,
|
|
||||||
SmallVectorImpl<RectU>& dirtyRects
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
void OnFrameAdopted(uint32_t frameIdx) noexcept;
|
|
||||||
|
|
||||||
void OnCaptureStopped() noexcept;
|
|
||||||
|
|
||||||
private:
|
|
||||||
HRESULT _CheckDirtyRects(uint32_t newFrameIdx, SmallVectorImpl<RectU>& dirtyRects) noexcept;
|
|
||||||
|
|
||||||
ID3D11Device5* _device = nullptr;
|
|
||||||
ID3D11DeviceContext4* _deviceContext = nullptr;
|
|
||||||
|
|
||||||
SizeU _frameSize{};
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D11ComputeShader> _dupFrameCS;
|
|
||||||
winrt::com_ptr<ID3D11Buffer> _constantBuffer;
|
|
||||||
winrt::com_ptr<ID3D11Buffer> _resultBuffer;
|
|
||||||
winrt::com_ptr<ID3D11UnorderedAccessView> _resultBufferUav;
|
|
||||||
winrt::com_ptr<ID3D11Buffer> _readBackBuffer;
|
|
||||||
winrt::com_ptr<ID3D11SamplerState> _sampler;
|
|
||||||
std::vector<winrt::com_ptr<ID3D11ShaderResourceView>> _frameSrvs;
|
|
||||||
|
|
||||||
uint32_t _oldFrameIdx = std::numeric_limits<uint32_t>::max();
|
|
||||||
uint32_t _curTargetValue = 0;
|
|
||||||
|
|
||||||
// 用于检查重复帧
|
|
||||||
uint16_t _nextSkipCount;
|
|
||||||
uint16_t _framesLeft;
|
|
||||||
|
|
||||||
bool _isScRGB = false;
|
|
||||||
#ifdef _DEBUG
|
|
||||||
bool _isBoundsCheckingDisabled = false;
|
|
||||||
#endif
|
|
||||||
bool _isCheckingForDuplicateFrame = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
111
src/Magpie.Core/DwmSharedSurfaceFrameSource.cpp
Normal file
111
src/Magpie.Core/DwmSharedSurfaceFrameSource.cpp
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "DwmSharedSurfaceFrameSource.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "DirectXHelper.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
using DwmGetDxSharedSurfaceFunc = BOOL(
|
||||||
|
HWND hWnd,
|
||||||
|
HANDLE* phSurface,
|
||||||
|
LUID* pAdapterLuid,
|
||||||
|
ULONG* pFmtWindow,
|
||||||
|
ULONG* pPresentFlags,
|
||||||
|
ULONGLONG* pWin32KUpdateId
|
||||||
|
);
|
||||||
|
|
||||||
|
static DwmGetDxSharedSurfaceFunc* DwmGetDxSharedSurface = nullptr;
|
||||||
|
|
||||||
|
bool DwmSharedSurfaceFrameSource::_Initialize() noexcept {
|
||||||
|
[[maybe_unused]] static Ignore _ = [] {
|
||||||
|
DwmGetDxSharedSurface = Win32Helper::LoadSystemFunction<DwmGetDxSharedSurfaceFunc>(
|
||||||
|
L"user32.dll", "DwmGetDxSharedSurface");
|
||||||
|
return Ignore();
|
||||||
|
}();
|
||||||
|
|
||||||
|
if (!DwmGetDxSharedSurface) {
|
||||||
|
Logger::Get().Win32Error("获取函数 DwmGetDxSharedSurface 地址失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const SrcTracker& srcTracker = ScalingWindow::Get().SrcTracker();
|
||||||
|
|
||||||
|
RECT frameRect;
|
||||||
|
double a, bx, by;
|
||||||
|
if (!_GetMapToOriginDPI(srcTracker.Handle(), a, bx, by)) {
|
||||||
|
// 很可能是因为窗口没有重定向表面,这种情况下 DwmSharedSurface 捕获肯定失败
|
||||||
|
Logger::Get().Error("_GetMapToOriginDPI 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Logger::Get().Info(fmt::format("源窗口 DPI 缩放为 {}", 1 / a));
|
||||||
|
|
||||||
|
const RECT& srcRect = srcTracker.SrcRect();
|
||||||
|
frameRect = RECT{
|
||||||
|
std::lround(srcRect.left * a + bx),
|
||||||
|
std::lround(srcRect.top * a + by),
|
||||||
|
std::lround(srcRect.right * a + bx),
|
||||||
|
std::lround(srcRect.bottom * a + by)
|
||||||
|
};
|
||||||
|
|
||||||
|
if (frameRect.left < 0 || frameRect.top < 0 || frameRect.right < 0
|
||||||
|
|| frameRect.bottom < 0 || frameRect.right - frameRect.left <= 0
|
||||||
|
|| frameRect.bottom - frameRect.top <= 0
|
||||||
|
) {
|
||||||
|
Logger::Get().Error("裁剪失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_frameInWnd = {
|
||||||
|
(UINT)frameRect.left,
|
||||||
|
(UINT)frameRect.top,
|
||||||
|
0,
|
||||||
|
(UINT)frameRect.right,
|
||||||
|
(UINT)frameRect.bottom,
|
||||||
|
1
|
||||||
|
};
|
||||||
|
|
||||||
|
_output = DirectXHelper::CreateTexture2D(
|
||||||
|
_deviceResources->GetD3DDevice(),
|
||||||
|
DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||||
|
frameRect.right - frameRect.left,
|
||||||
|
frameRect.bottom - frameRect.top,
|
||||||
|
D3D11_BIND_SHADER_RESOURCE
|
||||||
|
);
|
||||||
|
if (!_output) {
|
||||||
|
Logger::Get().Error("CreateTexture2D 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Logger::Get().Info("DwmSharedSurfaceFrameSource 初始化完成");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetches the shared surface handle for the source window, opens it on our device and
// copies the _frameInWnd region into _output.
// Returns FrameSourceState::NewFrame on success and FrameSourceState::Error otherwise.
FrameSourceState DwmSharedSurfaceFrameSource::_Update() noexcept {
	HANDLE sharedTextureHandle = NULL;
	const BOOL isSurfaceQueried = DwmGetDxSharedSurface(
		ScalingWindow::Get().SrcTracker().Handle(),
		&sharedTextureHandle,
		nullptr,
		nullptr,
		nullptr,
		nullptr
	);
	if (!isSurfaceQueried || !sharedTextureHandle) {
		Logger::Get().Win32Error("DwmGetDxSharedSurface 失败");
		return FrameSourceState::Error;
	}

	winrt::com_ptr<ID3D11Texture2D> sharedTexture;
	const HRESULT hr = _deviceResources->GetD3DDevice()
		->OpenSharedResource(sharedTextureHandle, IID_PPV_ARGS(&sharedTexture));
	if (FAILED(hr)) {
		Logger::Get().ComError("OpenSharedResource 失败", hr);
		return FrameSourceState::Error;
	}

	// Crop while copying: only the source region of the window ends up in _output.
	_deviceResources->GetD3DDC()->CopySubresourceRegion(
		_output.get(), 0, 0, 0, 0, sharedTexture.get(), 0, &_frameInWnd);

	return FrameSourceState::NewFrame;
}
|
||||||
|
|
||||||
|
}
|
||||||
27
src/Magpie.Core/DwmSharedSurfaceFrameSource.h
Normal file
27
src/Magpie.Core/DwmSharedSurfaceFrameSource.h
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
#pragma once
|
||||||
|
#include "FrameSourceBase.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class DwmSharedSurfaceFrameSource final : public FrameSourceBase {
|
||||||
|
public:
|
||||||
|
virtual ~DwmSharedSurfaceFrameSource() {}
|
||||||
|
|
||||||
|
FrameSourceWaitType WaitType() const noexcept override {
|
||||||
|
return FrameSourceWaitType::NoWait;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* Name() const noexcept override {
|
||||||
|
return "DwmSharedSurface";
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool _Initialize() noexcept override;
|
||||||
|
|
||||||
|
FrameSourceState _Update() noexcept override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
D3D11_BOX _frameInWnd{};
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
312
src/Magpie.Core/EffectCacheManager.cpp
Normal file
312
src/Magpie.Core/EffectCacheManager.cpp
Normal file
|
|
@ -0,0 +1,312 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "EffectCacheManager.h"
|
||||||
|
#include "CommonSharedConstants.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "StrHelper.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
#include "YasHelper.h"
|
||||||
|
#include <d3dcompiler.h>
|
||||||
|
#include <rapidhash.h>
|
||||||
|
|
||||||
|
namespace yas::detail {
|
||||||
|
|
||||||
|
// winrt::com_ptr<ID3DBlob>
|
||||||
|
template <std::size_t F>
|
||||||
|
struct serializer<
|
||||||
|
type_prop::not_a_fundamental,
|
||||||
|
ser_case::use_internal_serializer,
|
||||||
|
F,
|
||||||
|
winrt::com_ptr<ID3DBlob>
|
||||||
|
> {
|
||||||
|
template <typename Archive>
|
||||||
|
static Archive& save(Archive& ar, const winrt::com_ptr<ID3DBlob>& blob) {
|
||||||
|
uint32_t size = (uint32_t)blob->GetBufferSize();
|
||||||
|
ar& size;
|
||||||
|
|
||||||
|
ar.write(blob->GetBufferPointer(), size);
|
||||||
|
|
||||||
|
return ar;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
static Archive& load(Archive& ar, winrt::com_ptr<ID3DBlob>& blob) {
|
||||||
|
uint32_t size = 0;
|
||||||
|
ar& size;
|
||||||
|
HRESULT hr = D3DCreateBlob(size, blob.put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("D3DCreateBlob 失败", hr);
|
||||||
|
throw new std::exception();
|
||||||
|
}
|
||||||
|
|
||||||
|
ar.read(blob->GetBufferPointer(), size);
|
||||||
|
|
||||||
|
return ar;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
void serialize(Archive& ar, EffectParameterDesc& o) {
|
||||||
|
ar& o.name& o.label& o.constant;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
void serialize(Archive& ar, EffectIntermediateTextureDesc& o) {
|
||||||
|
ar& o.format& o.name& o.source& o.sizeExpr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
void serialize(Archive& ar, EffectSamplerDesc& o) {
|
||||||
|
ar& o.filterType& o.addressType& o.name;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
void serialize(Archive& ar, EffectPassDesc& o) {
|
||||||
|
ar& o.cso& o.inputs& o.outputs& o.numThreads[0] & o.numThreads[1] & o.numThreads[2] & o.blockSize& o.desc& o.flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Archive>
|
||||||
|
void serialize(Archive& ar, EffectDesc& o) {
|
||||||
|
ar& o.name& o.params& o.textures& o.samplers& o.passes& o.flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upper bound for the number of entries kept in the in-memory cache.
static constexpr uint32_t MAX_CACHE_COUNT = 127;

// Cache format version.
// Bump it whenever the layout of the cache file changes so that old caches are rejected.
static constexpr uint32_t EFFECT_CACHE_VERSION = 15;
|
||||||
|
|
||||||
|
|
||||||
|
// Flattens an effect name into a single file-name component by replacing every
// backslash with '#'.
static std::wstring GetLinearEffectName(std::wstring_view effectName) {
	std::wstring linearName(effectName);
	std::replace(linearName.begin(), linearName.end(), L'\\', L'#');
	return linearName;
}
|
||||||
|
|
||||||
|
static std::wstring GetCacheFileName(std::wstring_view linearEffectName, uint32_t flags, uint64_t hash) {
|
||||||
|
assert(flags <= 0xFFFF);
|
||||||
|
// 缓存文件的命名: {效果名}_{标志位(4)}_{哈希(16))}
|
||||||
|
return fmt::format(L"{}\\{}_{:04x}_{:016x}", CommonSharedConstants::CACHE_DIR, linearEffectName, flags, hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
void EffectCacheManager::_AddToMemCache(const std::wstring& cacheFileName, std::string& key, const EffectDesc& desc) {
|
||||||
|
auto lock = _lock.lock_exclusive();
|
||||||
|
|
||||||
|
_memCache[cacheFileName] = _MemCacheItem{
|
||||||
|
.key = std::move(key),
|
||||||
|
.effectDesc = desc,
|
||||||
|
.lastAccess = ++_lastAccess
|
||||||
|
};
|
||||||
|
|
||||||
|
if (_memCache.size() > MAX_CACHE_COUNT) {
|
||||||
|
assert(_memCache.size() == MAX_CACHE_COUNT + 1);
|
||||||
|
|
||||||
|
// 清理一半较旧的内存缓存
|
||||||
|
std::array<uint32_t, MAX_CACHE_COUNT + 1> access{};
|
||||||
|
std::transform(_memCache.begin(), _memCache.end(), access.begin(),
|
||||||
|
[](const auto& pair) {return pair.second.lastAccess; });
|
||||||
|
|
||||||
|
auto midIt = access.begin() + access.size() / 2;
|
||||||
|
std::nth_element(access.begin(), midIt, access.end());
|
||||||
|
const uint32_t mid = *midIt;
|
||||||
|
|
||||||
|
for (auto it = _memCache.begin(); it != _memCache.end();) {
|
||||||
|
if (it->second.lastAccess < mid) {
|
||||||
|
it = _memCache.erase(it);
|
||||||
|
} else {
|
||||||
|
++it;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Logger::Get().Info("已清理内存缓存");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EffectCacheManager::_LoadFromMemCache(const std::wstring& cacheFileName, std::string_view key, EffectDesc& desc) {
|
||||||
|
auto lock = _lock.lock_exclusive();
|
||||||
|
|
||||||
|
auto it = _memCache.find(cacheFileName);
|
||||||
|
if (it != _memCache.end()) {
|
||||||
|
_MemCacheItem& cacheItem = it->second;
|
||||||
|
|
||||||
|
// 防止哈希碰撞
|
||||||
|
if (cacheItem.key != key) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
desc = cacheItem.effectDesc;
|
||||||
|
cacheItem.lastAccess = ++_lastAccess;
|
||||||
|
Logger::Get().Info(StrHelper::Concat("已读取缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EffectCacheManager::Load(
|
||||||
|
std::wstring_view effectName,
|
||||||
|
uint32_t flags,
|
||||||
|
uint64_t hash,
|
||||||
|
std::string_view key,
|
||||||
|
EffectDesc& desc
|
||||||
|
) {
|
||||||
|
assert(!effectName.empty() && !key.empty());
|
||||||
|
|
||||||
|
std::wstring cacheFileName = GetCacheFileName(GetLinearEffectName(effectName), flags, hash);
|
||||||
|
|
||||||
|
if (_LoadFromMemCache(cacheFileName, key, desc)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!Win32Helper::FileExists(cacheFileName.c_str())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<BYTE> buf;
|
||||||
|
if (!Win32Helper::ReadFile(cacheFileName.c_str(), buf) || buf.empty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string cachedKey;
|
||||||
|
try {
|
||||||
|
yas::mem_istream mi(buf.data(), buf.size());
|
||||||
|
yas::binary_iarchive<yas::mem_istream, yas::binary> ia(mi);
|
||||||
|
|
||||||
|
uint32_t cacheVersion;
|
||||||
|
ia.read(cacheVersion);
|
||||||
|
if (cacheVersion != EFFECT_CACHE_VERSION) {
|
||||||
|
Logger::Get().Info("缓存版本不匹配");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ia& cachedKey;
|
||||||
|
if (cachedKey != key) {
|
||||||
|
Logger::Get().Info("缓存键不匹配");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ia& desc;
|
||||||
|
} catch (...) {
|
||||||
|
Logger::Get().Error("反序列化失败");
|
||||||
|
desc = {};
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_AddToMemCache(cacheFileName, cachedKey, desc);
|
||||||
|
|
||||||
|
Logger::Get().Info(StrHelper::Concat("已读取缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void EffectCacheManager::Save(
|
||||||
|
std::wstring_view effectName,
|
||||||
|
uint32_t flags,
|
||||||
|
uint64_t hash,
|
||||||
|
std::string key,
|
||||||
|
const EffectDesc& desc
|
||||||
|
) {
|
||||||
|
const std::wstring linearEffectName = GetLinearEffectName(effectName);
|
||||||
|
|
||||||
|
std::vector<BYTE> buffer;
|
||||||
|
buffer.reserve(4096);
|
||||||
|
|
||||||
|
try {
|
||||||
|
yas::vector_ostream os(buffer);
|
||||||
|
yas::binary_oarchive<yas::vector_ostream<BYTE>, yas::binary> oa(os);
|
||||||
|
|
||||||
|
oa.write(EFFECT_CACHE_VERSION);
|
||||||
|
oa& key& desc;
|
||||||
|
} catch (...) {
|
||||||
|
Logger::Get().Error("序列化 EffectDesc 失败");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!CreateDirectory(CommonSharedConstants::CACHE_DIR, nullptr)) {
|
||||||
|
if (GetLastError() != ERROR_ALREADY_EXISTS) {
|
||||||
|
Logger::Get().Win32Error("创建 cache 文件夹失败");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 清理缓存
|
||||||
|
WIN32_FIND_DATA findData{};
|
||||||
|
wil::unique_hfind hFind(FindFirstFileEx(
|
||||||
|
StrHelper::Concat(CommonSharedConstants::CACHE_DIR, L"\\*").c_str(),
|
||||||
|
FindExInfoBasic, &findData, FindExSearchNameMatch, nullptr, FIND_FIRST_EX_LARGE_FETCH));
|
||||||
|
if (hFind) {
|
||||||
|
do {
|
||||||
|
std::wstring_view fileName(findData.cFileName);
|
||||||
|
|
||||||
|
if (!fileName.starts_with(linearEffectName)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t effectNameLen = linearEffectName.size();
|
||||||
|
if (fileName.size() == effectNameLen + 22) {
|
||||||
|
// 保留标志不同的缓存
|
||||||
|
if (!fileName.substr(effectNameLen).starts_with(fmt::format(L"_{:04x}_", flags))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 6;
|
||||||
|
for (; i < 22; ++i) {
|
||||||
|
const wchar_t c = fileName[effectNameLen + i];
|
||||||
|
if (!((c >= L'0' && c <= L'9') || (c >= L'a' && c <= L'f'))) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i != 22) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if (fileName.size() == effectNameLen + 18) {
|
||||||
|
// 删除旧版缓存
|
||||||
|
if (fileName[effectNameLen] != L'_') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 1;
|
||||||
|
for (; i < 18; ++i) {
|
||||||
|
const wchar_t c = fileName[effectNameLen + i];
|
||||||
|
if (!((c >= L'0' && c <= L'9') || (c >= L'a' && c <= L'f'))) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i != 18) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!DeleteFile(StrHelper::Concat(
|
||||||
|
CommonSharedConstants::CACHE_DIR, L"\\", findData.cFileName).c_str()))
|
||||||
|
{
|
||||||
|
Logger::Get().Win32Error(StrHelper::Concat("删除缓存文件 ",
|
||||||
|
StrHelper::UTF16ToUTF8(findData.cFileName), " 失败"));
|
||||||
|
}
|
||||||
|
} while (FindNextFile(hFind.get(), &findData));
|
||||||
|
} else {
|
||||||
|
Logger::Get().Win32Error("查找缓存文件失败");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::wstring cacheFileName = GetCacheFileName(linearEffectName, flags, hash);
|
||||||
|
if (!Win32Helper::WriteFile(cacheFileName.c_str(), buffer)) {
|
||||||
|
Logger::Get().Error("保存缓存失败");
|
||||||
|
}
|
||||||
|
|
||||||
|
_AddToMemCache(cacheFileName, key, desc);
|
||||||
|
|
||||||
|
Logger::Get().Info(StrHelper::Concat("已保存缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t EffectCacheManager::GetHash(std::string_view key) {
|
||||||
|
return rapidhash(key.data(), key.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
42
src/Magpie.Core/EffectCacheManager.h
Normal file
42
src/Magpie.Core/EffectCacheManager.h
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
#pragma once
|
||||||
|
#include "EffectDesc.h"
|
||||||
|
#include <parallel_hashmap/phmap.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class EffectCacheManager {
|
||||||
|
public:
|
||||||
|
static EffectCacheManager& Get() noexcept {
|
||||||
|
static EffectCacheManager instance;
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
EffectCacheManager(const EffectCacheManager&) = delete;
|
||||||
|
EffectCacheManager(EffectCacheManager&&) = delete;
|
||||||
|
|
||||||
|
bool Load(std::wstring_view effectName, uint32_t flags, uint64_t hash, std::string_view key, EffectDesc& desc);
|
||||||
|
|
||||||
|
void Save(std::wstring_view effectName, uint32_t flags, uint64_t hash, std::string key, const EffectDesc& desc);
|
||||||
|
|
||||||
|
static uint64_t GetHash(std::string_view key);
|
||||||
|
|
||||||
|
private:
|
||||||
|
EffectCacheManager() = default;
|
||||||
|
|
||||||
|
void _AddToMemCache(const std::wstring& cacheFileName, std::string& key, const EffectDesc& desc);
|
||||||
|
bool _LoadFromMemCache(const std::wstring& cacheFileName, std::string_view key, EffectDesc& desc);
|
||||||
|
|
||||||
|
// 用于同步对 _memCache 的访问
|
||||||
|
wil::srwlock _lock;
|
||||||
|
|
||||||
|
struct _MemCacheItem {
|
||||||
|
std::string key;
|
||||||
|
EffectDesc effectDesc;
|
||||||
|
uint32_t lastAccess = 0;
|
||||||
|
};
|
||||||
|
phmap::flat_hash_map<std::wstring, _MemCacheItem> _memCache;
|
||||||
|
|
||||||
|
UINT _lastAccess = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
1853
src/Magpie.Core/EffectCompiler.cpp
Normal file
1853
src/Magpie.Core/EffectCompiler.cpp
Normal file
File diff suppressed because it is too large
Load diff
632
src/Magpie.Core/EffectDrawer.cpp
Normal file
632
src/Magpie.Core/EffectDrawer.cpp
Normal file
|
|
@ -0,0 +1,632 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "EffectDrawer.h"
|
||||||
|
#include "BackendDescriptorStore.h"
|
||||||
|
#include "DeviceResources.h"
|
||||||
|
#include "DirectXHelper.h"
|
||||||
|
#include "EffectHelper.h"
|
||||||
|
#include "EffectsProfiler.h"
|
||||||
|
#include "Logger.h"
|
||||||
|
#include "ScalingOptions.h"
|
||||||
|
#include "ScalingWindow.h"
|
||||||
|
#include "StrHelper.h"
|
||||||
|
#include "TextureHelper.h"
|
||||||
|
#include "Win32Helper.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Removes the descriptor-store cache entries of every texture except _textures[0], which
// is the input and is managed by the preceding EffectDrawer.
EffectDrawer::~EffectDrawer() {
	for (size_t texIdx = 1; texIdx < _textures.size(); ++texIdx) {
		_descriptorStore->RemoveCache(_textures[texIdx].get());
	}
}
|
||||||
|
|
||||||
|
bool EffectDrawer::Initialize(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
BackendDescriptorStore& descriptorStore,
|
||||||
|
ID3D11Texture2D** inOutTexture
|
||||||
|
) noexcept {
|
||||||
|
_d3dDC = deviceResources.GetD3DDC();
|
||||||
|
_descriptorStore = &descriptorStore;
|
||||||
|
|
||||||
|
SIZE inputSize{};
|
||||||
|
{
|
||||||
|
D3D11_TEXTURE2D_DESC inputDesc;
|
||||||
|
(*inOutTexture)->GetDesc(&inputDesc);
|
||||||
|
inputSize = { (LONG)inputDesc.Width, (LONG)inputDesc.Height };
|
||||||
|
}
|
||||||
|
|
||||||
|
const SIZE outputSize = _CalcOutputSize(desc, option, inputSize);
|
||||||
|
if (outputSize.cx <= 0 || outputSize.cy <= 0) {
|
||||||
|
Logger::Get().Error("非法的输出尺寸");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_samplers.resize(desc.samplers.size());
|
||||||
|
for (UINT i = 0; i < _samplers.size(); ++i) {
|
||||||
|
const EffectSamplerDesc& samDesc = desc.samplers[i];
|
||||||
|
_samplers[i] = deviceResources.GetSampler(
|
||||||
|
samDesc.filterType == EffectSamplerFilterType::Linear ? D3D11_FILTER_MIN_MAG_MIP_LINEAR : D3D11_FILTER_MIN_MAG_MIP_POINT,
|
||||||
|
samDesc.addressType == EffectSamplerAddressType::Clamp ? D3D11_TEXTURE_ADDRESS_CLAMP : D3D11_TEXTURE_ADDRESS_WRAP
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!_samplers[i]) {
|
||||||
|
Logger::Get().Error(fmt::format("创建采样器 {} 失败", samDesc.name));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 创建中间纹理
|
||||||
|
// 第一个为 INPUT,第二个为 OUTPUT
|
||||||
|
_textures.resize(desc.textures.size());
|
||||||
|
_textures[0].copy_from(*inOutTexture);
|
||||||
|
|
||||||
|
// 创建输出纹理,格式始终是 DXGI_FORMAT_R8G8B8A8_UNORM
|
||||||
|
_textures[1] = DirectXHelper::CreateTexture2D(
|
||||||
|
deviceResources.GetD3DDevice(),
|
||||||
|
EffectHelper::FORMAT_DESCS[(uint32_t)desc.textures[1].format].dxgiFormat,
|
||||||
|
outputSize.cx,
|
||||||
|
outputSize.cy,
|
||||||
|
D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS
|
||||||
|
);
|
||||||
|
|
||||||
|
*inOutTexture = _textures[1].get();
|
||||||
|
if (!*inOutTexture) {
|
||||||
|
Logger::Get().Error("创建输出纹理失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 2; i < desc.textures.size(); ++i) {
|
||||||
|
const EffectIntermediateTextureDesc& texDesc = desc.textures[i];
|
||||||
|
|
||||||
|
if (!texDesc.source.empty()) {
|
||||||
|
// 从文件加载纹理
|
||||||
|
size_t delimPos = desc.name.find_last_of('\\');
|
||||||
|
std::string texPath = delimPos == std::string::npos
|
||||||
|
? StrHelper::Concat("effects\\", texDesc.source)
|
||||||
|
: StrHelper::Concat("effects\\", std::string_view(desc.name.c_str(), delimPos + 1), texDesc.source);
|
||||||
|
_textures[i] = TextureHelper::LoadTexture(
|
||||||
|
StrHelper::UTF8ToUTF16(texPath).c_str(), deviceResources.GetD3DDevice());
|
||||||
|
if (!_textures[i]) {
|
||||||
|
Logger::Get().Error(fmt::format("加载纹理 {} 失败", texDesc.source));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (texDesc.format != EffectIntermediateTextureFormat::UNKNOWN) {
|
||||||
|
// 检查纹理格式是否匹配
|
||||||
|
D3D11_TEXTURE2D_DESC srcDesc{};
|
||||||
|
_textures[i]->GetDesc(&srcDesc);
|
||||||
|
if (srcDesc.Format != EffectHelper::FORMAT_DESCS[(uint32_t)texDesc.format].dxgiFormat) {
|
||||||
|
Logger::Get().Error("SOURCE 纹理格式不匹配");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
SIZE texSize{};
|
||||||
|
try {
|
||||||
|
_exprParser.SetExpr(texDesc.sizeExpr.first);
|
||||||
|
texSize.cx = std::lround(_exprParser.Eval());
|
||||||
|
_exprParser.SetExpr(texDesc.sizeExpr.second);
|
||||||
|
texSize.cy = std::lround(_exprParser.Eval());
|
||||||
|
} catch (const mu::ParserError& e) {
|
||||||
|
Logger::Get().Error(fmt::format("计算中间纹理尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (texSize.cx <= 0 || texSize.cy <= 0) {
|
||||||
|
Logger::Get().Error("非法的中间纹理尺寸");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_textures[i] = DirectXHelper::CreateTexture2D(
|
||||||
|
deviceResources.GetD3DDevice(),
|
||||||
|
EffectHelper::FORMAT_DESCS[(UINT)texDesc.format].dxgiFormat,
|
||||||
|
texSize.cx,
|
||||||
|
texSize.cy,
|
||||||
|
D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS
|
||||||
|
);
|
||||||
|
if (!_textures[i]) {
|
||||||
|
Logger::Get().Error("创建纹理失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t passCount = (uint32_t)desc.passes.size();
|
||||||
|
_shaders.resize(passCount);
|
||||||
|
_srvs.resize(passCount);
|
||||||
|
_uavs.resize(passCount);
|
||||||
|
_dispatches.resize(passCount);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < passCount; ++i) {
|
||||||
|
const EffectPassDesc& passDesc = desc.passes[i];
|
||||||
|
|
||||||
|
HRESULT hr = deviceResources.GetD3DDevice()->CreateComputeShader(
|
||||||
|
passDesc.cso->GetBufferPointer(), passDesc.cso->GetBufferSize(), nullptr, _shaders[i].put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("创建计算着色器失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_srvs[i].resize(passDesc.inputs.size());
|
||||||
|
_uavs[i].resize(passDesc.outputs.size() * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!_UpdatePassResources(desc)) {
|
||||||
|
Logger::Get().Error("_UpdatePassResources 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!_UpdateConstants(desc, option, deviceResources, inputSize, outputSize)) {
|
||||||
|
Logger::Get().Error("_UpdateConstants 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Executes every pass of this effect in order and notifies the profiler
// after each pass has been recorded.
void EffectDrawer::Draw(EffectsProfiler& profiler) const noexcept {
    // Bind the state that is shared by all passes.
    _PrepareForDraw();

    const uint32_t passCount = (uint32_t)_dispatches.size();
    for (uint32_t passIdx = 0; passIdx != passCount; ++passIdx) {
        _DrawPass(passIdx);
        profiler.OnEndPass(_d3dDC);
    }
}
|
||||||
|
|
||||||
|
void EffectDrawer::DrawForExport(const EffectDesc& desc, uint32_t passIdx) const noexcept {
|
||||||
|
_PrepareForDraw();
|
||||||
|
|
||||||
|
for (uint32_t i : _CalcPassesToDrawForExport(desc, passIdx)) {
|
||||||
|
_DrawPass(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EffectDrawer::ResizeTextures(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
ID3D11Texture2D** inOutTexture
|
||||||
|
) noexcept {
|
||||||
|
bool anyChange = false;
|
||||||
|
|
||||||
|
if (*inOutTexture != _textures[0].get()) {
|
||||||
|
_textures[0].copy_from(*inOutTexture);
|
||||||
|
anyChange = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
SIZE inputSize{};
|
||||||
|
{
|
||||||
|
D3D11_TEXTURE2D_DESC inputDesc;
|
||||||
|
_textures[0]->GetDesc(&inputDesc);
|
||||||
|
inputSize = { (LONG)inputDesc.Width, (LONG)inputDesc.Height };
|
||||||
|
}
|
||||||
|
|
||||||
|
const SIZE outputSize = _CalcOutputSize(desc, option, inputSize);
|
||||||
|
if (outputSize.cx <= 0 || outputSize.cy <= 0) {
|
||||||
|
Logger::Get().Error("非法的输出尺寸");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
D3D11_TEXTURE2D_DESC texDesc;
|
||||||
|
_textures[1]->GetDesc(&texDesc);
|
||||||
|
|
||||||
|
if ((LONG)texDesc.Width != outputSize.cx || (LONG)texDesc.Height != outputSize.cy) {
|
||||||
|
_descriptorStore->RemoveCache(_textures[1].get());
|
||||||
|
|
||||||
|
_textures[1] = DirectXHelper::CreateTexture2D(
|
||||||
|
deviceResources.GetD3DDevice(),
|
||||||
|
texDesc.Format,
|
||||||
|
outputSize.cx,
|
||||||
|
outputSize.cy,
|
||||||
|
texDesc.BindFlags
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!_textures[1]) {
|
||||||
|
Logger::Get().Error("创建输出纹理失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
anyChange = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
*inOutTexture = _textures[1].get();
|
||||||
|
|
||||||
|
for (size_t i = 2; i < _textures.size(); ++i) {
|
||||||
|
const std::pair<std::string, std::string>& sizeExpr = desc.textures[i].sizeExpr;
|
||||||
|
if (sizeExpr.first.empty()) {
|
||||||
|
// 从文件加载的纹理无需调整尺寸
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
SIZE texSize{};
|
||||||
|
try {
|
||||||
|
_exprParser.SetExpr(sizeExpr.first);
|
||||||
|
texSize.cx = std::lround(_exprParser.Eval());
|
||||||
|
_exprParser.SetExpr(sizeExpr.second);
|
||||||
|
texSize.cy = std::lround(_exprParser.Eval());
|
||||||
|
} catch (const mu::ParserError& e) {
|
||||||
|
Logger::Get().Error(fmt::format("计算中间纹理尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (texSize.cx <= 0 || texSize.cy <= 0) {
|
||||||
|
Logger::Get().Error("非法的中间纹理尺寸");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
_textures[i]->GetDesc(&texDesc);
|
||||||
|
|
||||||
|
if ((LONG)texDesc.Width != texSize.cx || (LONG)texDesc.Height != texSize.cy) {
|
||||||
|
_descriptorStore->RemoveCache(_textures[i].get());
|
||||||
|
|
||||||
|
_textures[i] = DirectXHelper::CreateTexture2D(
|
||||||
|
deviceResources.GetD3DDevice(),
|
||||||
|
texDesc.Format,
|
||||||
|
texSize.cx,
|
||||||
|
texSize.cy,
|
||||||
|
texDesc.BindFlags
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!_textures[i]) {
|
||||||
|
Logger::Get().Error("创建纹理失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
anyChange = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!anyChange) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!_UpdatePassResources(desc)) {
|
||||||
|
Logger::Get().Error("_UpdatePassResources 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!_UpdateConstants(desc, option, deviceResources, inputSize, outputSize)) {
|
||||||
|
Logger::Get().Error("_UpdateConstants 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Computes the output size of this effect for the given input size.
//
// If the effect declares an output size expression it is evaluated with the
// shared expression parser; otherwise the size is derived from the scaling
// type in `option`. As a side effect INPUT_WIDTH/INPUT_HEIGHT and, on
// success, OUTPUT_WIDTH/OUTPUT_HEIGHT are defined as parser constants.
// Returns an empty SIZE if the expression cannot be evaluated or the scaling
// type is unknown.
SIZE EffectDrawer::_CalcOutputSize(
    const EffectDesc& desc,
    const EffectOption& option,
    SIZE inputSize
) const noexcept {
    _exprParser.DefineConst("INPUT_WIDTH", inputSize.cx);
    _exprParser.DefineConst("INPUT_HEIGHT", inputSize.cy);

    SIZE result{};
    const std::pair<std::string, std::string>& sizeExpr = desc.GetOutputSizeExpr();

    if (!sizeExpr.first.empty()) {
        // The effect dictates its own output size.
        assert(!sizeExpr.second.empty());

        try {
            _exprParser.SetExpr(sizeExpr.first);
            result.cx = std::lround(_exprParser.Eval());

            _exprParser.SetExpr(sizeExpr.second);
            result.cy = std::lround(_exprParser.Eval());
        } catch (const mu::ParserError& e) {
            Logger::Get().Error(fmt::format("计算输出尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
            return {};
        }
    } else {
        // The effect supports free scaling; the user's option decides.
        const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());

        switch (option.scalingType) {
        case ScalingType::Normal:
            // Scale factors relative to the input size.
            result.cx = std::lroundf(inputSize.cx * option.scale.first);
            result.cy = std::lroundf(inputSize.cy * option.scale.second);
            break;
        case ScalingType::Absolute:
            // The "scale" values are the output size in pixels.
            result.cx = std::lroundf(option.scale.first);
            result.cy = std::lroundf(option.scale.second);
            break;
        case ScalingType::Fit:
        {
            // In windowed-mode scaling, Fit with a factor of 1 is treated as
            // Fill. The scaling is known to be proportional in that case, but
            // rounding may introduce a one-pixel error. Consider a 100x50
            // window: resizing the width to 101 rounds the height to 51, and
            // resizing the width to 102 still gives a height of 51. Fit's
            // formula would leave a black border in one of those two cases,
            // and would also influence the later decision on whether to
            // append Bicubic.
            const bool treatFitAsFill = ScalingWindow::Get().Options().IsWindowedMode() &&
                IsApprox(option.scale.first, 1.0f) && IsApprox(option.scale.second, 1.0f);

            if (treatFitAsFill) {
                result = rendererSize;
                break;
            }

            // Largest uniform scale at which the input still fits the renderer.
            const float fillScale = std::min(
                float(rendererSize.cx) / inputSize.cx,
                float(rendererSize.cy) / inputSize.cy
            );
            result.cx = std::lroundf(inputSize.cx * fillScale * option.scale.first);
            result.cy = std::lroundf(inputSize.cy * fillScale * option.scale.second);
            break;
        }
        case ScalingType::Fill:
            result = rendererSize;
            break;
        default:
            assert(false);
            return {};
        }
    }

    _exprParser.DefineConst("OUTPUT_WIDTH", result.cx);
    _exprParser.DefineConst("OUTPUT_HEIGHT", result.cy);

    return result;
}
|
||||||
|
|
||||||
|
bool EffectDrawer::_UpdatePassResources(const EffectDesc& desc) noexcept {
|
||||||
|
const uint32_t passCount = (uint32_t)desc.passes.size();
|
||||||
|
for (uint32_t i = 0; i < passCount; ++i) {
|
||||||
|
const SmallVector<uint32_t>& inputs = desc.passes[i].inputs;
|
||||||
|
const SmallVector<uint32_t>& outputs = desc.passes[i].outputs;
|
||||||
|
const std::pair<uint32_t, uint32_t>& blockSize = desc.passes[i].blockSize;
|
||||||
|
|
||||||
|
for (uint32_t j = 0; j < inputs.size(); ++j) {
|
||||||
|
auto srv = _srvs[i][j] = _descriptorStore->GetShaderResourceView(_textures[inputs[j]].get());
|
||||||
|
if (!srv) {
|
||||||
|
Logger::Get().Error("GetShaderResourceView 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint32_t j = 0; j < outputs.size(); ++j) {
|
||||||
|
auto uav = _uavs[i][j] = _descriptorStore->GetUnorderedAccessView(_textures[outputs[j]].get());
|
||||||
|
if (!uav) {
|
||||||
|
Logger::Get().Error("GetUnorderedAccessView 失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
D3D11_TEXTURE2D_DESC outputDesc;
|
||||||
|
_textures[outputs[0]]->GetDesc(&outputDesc);
|
||||||
|
_dispatches[i] = {
|
||||||
|
(outputDesc.Width + blockSize.first - 1) / blockSize.first,
|
||||||
|
(outputDesc.Height + blockSize.second - 1) / blockSize.second
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EffectDrawer::_UpdateConstants(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
SIZE inputSize,
|
||||||
|
SIZE outputSize
|
||||||
|
) noexcept {
|
||||||
|
const bool isInlineParams = desc.flags & EffectFlags::InlineParams;
|
||||||
|
|
||||||
|
SmallVector<EffectHelper::Constant32, 32> constants;
|
||||||
|
|
||||||
|
// 大小必须为 4 的倍数
|
||||||
|
const size_t builtinConstantCount = 10;
|
||||||
|
size_t psStylePassParams = 0;
|
||||||
|
for (UINT i = 0, end = (UINT)desc.passes.size() - 1; i < end; ++i) {
|
||||||
|
if (desc.passes[i].flags & EffectPassFlags::PSStyle) {
|
||||||
|
psStylePassParams += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
constants.resize((builtinConstantCount + psStylePassParams + (isInlineParams ? 0 : desc.params.size()) + 3) / 4 * 4);
|
||||||
|
// cbuffer __CB1 : register(b0) {
|
||||||
|
// uint2 __inputSize;
|
||||||
|
// uint2 __outputSize;
|
||||||
|
// float2 __inputPt;
|
||||||
|
// float2 __outputPt;
|
||||||
|
// float2 __scale;
|
||||||
|
// [PARAMETERS...]
|
||||||
|
// );
|
||||||
|
constants[0].uintVal = inputSize.cx;
|
||||||
|
constants[1].uintVal = inputSize.cy;
|
||||||
|
constants[2].uintVal = outputSize.cx;
|
||||||
|
constants[3].uintVal = outputSize.cy;
|
||||||
|
constants[4].floatVal = 1.0f / inputSize.cx;
|
||||||
|
constants[5].floatVal = 1.0f / inputSize.cy;
|
||||||
|
constants[6].floatVal = 1.0f / outputSize.cx;
|
||||||
|
constants[7].floatVal = 1.0f / outputSize.cy;
|
||||||
|
constants[8].floatVal = outputSize.cx / (FLOAT)inputSize.cx;
|
||||||
|
constants[9].floatVal = outputSize.cy / (FLOAT)inputSize.cy;
|
||||||
|
|
||||||
|
// PS 样式的通道需要的参数
|
||||||
|
EffectHelper::Constant32* pCurParam = constants.data() + builtinConstantCount;
|
||||||
|
if (psStylePassParams > 0) {
|
||||||
|
for (UINT i = 0, end = (UINT)desc.passes.size() - 1; i < end; ++i) {
|
||||||
|
if (desc.passes[i].flags & EffectPassFlags::PSStyle) {
|
||||||
|
D3D11_TEXTURE2D_DESC outputDesc;
|
||||||
|
_textures[desc.passes[i].outputs[0]]->GetDesc(&outputDesc);
|
||||||
|
pCurParam->uintVal = outputDesc.Width;
|
||||||
|
++pCurParam;
|
||||||
|
pCurParam->uintVal = outputDesc.Height;
|
||||||
|
++pCurParam;
|
||||||
|
pCurParam->floatVal = 1.0f / outputDesc.Width;
|
||||||
|
++pCurParam;
|
||||||
|
pCurParam->floatVal = 1.0f / outputDesc.Height;
|
||||||
|
++pCurParam;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isInlineParams) {
|
||||||
|
for (UINT i = 0; i < desc.params.size(); ++i) {
|
||||||
|
const auto& paramDesc = desc.params[i];
|
||||||
|
auto it = option.parameters.find(paramDesc.name);
|
||||||
|
|
||||||
|
if (paramDesc.constant.index() == 0) {
|
||||||
|
const EffectConstant<float>& constant = std::get<0>(paramDesc.constant);
|
||||||
|
float value = constant.defaultValue;
|
||||||
|
|
||||||
|
if (it != option.parameters.end()) {
|
||||||
|
value = it->second;
|
||||||
|
|
||||||
|
if (value < constant.minValue || value > constant.maxValue) {
|
||||||
|
Logger::Get().Error(fmt::format("参数 {} 的值非法", paramDesc.name));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pCurParam->floatVal = value;
|
||||||
|
} else {
|
||||||
|
const EffectConstant<int>& constant = std::get<1>(paramDesc.constant);
|
||||||
|
int value = constant.defaultValue;
|
||||||
|
|
||||||
|
if (it != option.parameters.end()) {
|
||||||
|
value = (int)std::lroundf(it->second);
|
||||||
|
|
||||||
|
if ((value < constant.minValue) || (value > constant.maxValue)) {
|
||||||
|
Logger::Get().Error(StrHelper::Concat("参数 ", paramDesc.name, " 的值非法"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pCurParam->intVal = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
++pCurParam;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_constantBuffer) {
|
||||||
|
// 更新缓冲区
|
||||||
|
deviceResources.GetD3DDC()->UpdateSubresource1(
|
||||||
|
_constantBuffer.get(), 0, nullptr, constants.data(), 0, 0, D3D11_COPY_DISCARD);
|
||||||
|
} else {
|
||||||
|
// 创建缓冲区
|
||||||
|
D3D11_BUFFER_DESC bd{
|
||||||
|
.ByteWidth = 4 * (UINT)constants.size(),
|
||||||
|
.Usage = D3D11_USAGE_DEFAULT,
|
||||||
|
.BindFlags = D3D11_BIND_CONSTANT_BUFFER
|
||||||
|
};
|
||||||
|
|
||||||
|
D3D11_SUBRESOURCE_DATA initData{ .pSysMem = constants.data() };
|
||||||
|
|
||||||
|
HRESULT hr = deviceResources.GetD3DDevice()->CreateBuffer(&bd, &initData, _constantBuffer.put());
|
||||||
|
if (FAILED(hr)) {
|
||||||
|
Logger::Get().ComError("CreateBuffer 失败", hr);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void EffectDrawer::_DrawPass(uint32_t i) const noexcept {
|
||||||
|
_d3dDC->CSSetShader(_shaders[i].get(), nullptr, 0);
|
||||||
|
|
||||||
|
_d3dDC->CSSetShaderResources(0, (UINT)_srvs[i].size(), _srvs[i].data());
|
||||||
|
UINT uavCount = (UINT)_uavs[i].size() / 2;
|
||||||
|
_d3dDC->CSSetUnorderedAccessViews(0, uavCount, _uavs[i].data(), nullptr);
|
||||||
|
|
||||||
|
_d3dDC->Dispatch(_dispatches[i].first, _dispatches[i].second, 1);
|
||||||
|
|
||||||
|
_d3dDC->CSSetUnorderedAccessViews(0, uavCount, _uavs[i].data() + uavCount, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool IsReadonlyTexture(const EffectDesc& desc, uint32_t texture) noexcept {
|
||||||
|
return texture == 0 || !desc.textures[texture].source.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算导出某个通道的输出时需要重新渲染的通道
|
||||||
|
SmallVector<uint32_t> EffectDrawer::_CalcPassesToDrawForExport(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
uint32_t passIdx
|
||||||
|
) const noexcept {
|
||||||
|
SmallVector<uint32_t> passesToDraw;
|
||||||
|
passesToDraw.push_back(passIdx);
|
||||||
|
|
||||||
|
if (passIdx == 0) {
|
||||||
|
return passesToDraw;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<EffectPassDesc>& passes = desc.passes;
|
||||||
|
const uint32_t end = (uint32_t)passes.size() - 1;
|
||||||
|
|
||||||
|
// 用于记录该通道依赖的输入纹理,格式为 (passIdx, texture)
|
||||||
|
SmallVector<std::pair<uint32_t, uint32_t>, 0> depTextures;
|
||||||
|
|
||||||
|
for (uint32_t input : passes[passIdx].inputs) {
|
||||||
|
if (!IsReadonlyTexture(desc, input)) {
|
||||||
|
depTextures.emplace_back(passIdx, input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!depTextures.empty()) {
|
||||||
|
const auto [curPass, curTexture] = depTextures.pop_back_val();
|
||||||
|
|
||||||
|
// 检查 curTexture 是否会被后面的通道修改
|
||||||
|
{
|
||||||
|
bool isOverwritten = false;
|
||||||
|
for (uint32_t i = curPass + 1; i < end; ++i) {
|
||||||
|
const SmallVector<uint32_t>& curOutputs = passes[i].outputs;
|
||||||
|
if (std::find(curOutputs.begin(), curOutputs.end(), curTexture) != curOutputs.end()) {
|
||||||
|
isOverwritten = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!isOverwritten) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 需要重新渲染前一个输出 curTexture 的通道,并带来新的依赖
|
||||||
|
for (int i = (int)curPass - 1; i >= 0; --i) {
|
||||||
|
const SmallVector<uint32_t>& curOutputs = passes[i].outputs;
|
||||||
|
if (std::find(curOutputs.begin(), curOutputs.end(), curTexture) != curOutputs.end()) {
|
||||||
|
const uint32_t ui = (uint32_t)i;
|
||||||
|
|
||||||
|
if (std::find(passesToDraw.begin(), passesToDraw.end(), ui) == passesToDraw.end()) {
|
||||||
|
passesToDraw.push_back(ui);
|
||||||
|
|
||||||
|
// 作为优化,如果之前的所有通道都需要重新渲染则提前返回
|
||||||
|
if ((uint32_t)passesToDraw.size() == passIdx + 1) {
|
||||||
|
for (uint32_t j = 0; j <= passIdx; ++j) {
|
||||||
|
passesToDraw[j] = j;
|
||||||
|
}
|
||||||
|
return passesToDraw;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint32_t input : passes[ui].inputs) {
|
||||||
|
if (!IsReadonlyTexture(desc, input)) {
|
||||||
|
depTextures.emplace_back(ui, input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::sort(passesToDraw.begin(), passesToDraw.end());
|
||||||
|
return passesToDraw;
|
||||||
|
}
|
||||||
|
|
||||||
|
void EffectDrawer::_PrepareForDraw() const noexcept {
|
||||||
|
{
|
||||||
|
ID3D11Buffer* t = _constantBuffer.get();
|
||||||
|
_d3dDC->CSSetConstantBuffers(0, 1, &t);
|
||||||
|
}
|
||||||
|
|
||||||
|
_d3dDC->CSSetSamplers(0, (UINT)_samplers.size(), _samplers.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
99
src/Magpie.Core/EffectDrawer.h
Normal file
99
src/Magpie.Core/EffectDrawer.h
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
#pragma once
|
||||||
|
#include "EffectDesc.h"
|
||||||
|
#include "SmallVector.h"
|
||||||
|
// The muparser package from Conan does not include UNICODE support, so
// _UNICODE is hidden while its header is included and restored afterwards.
#pragma push_macro("_UNICODE")
#undef _UNICODE
// Warning suppression is scoped to the muParser header only: the state saved
// by push must be restored with pop, otherwise the C4310 suppression leaks
// into every file that includes this header.
#pragma warning(push)
#pragma warning(disable: 4310) // cast truncates constant value
#include <muParser.h>
#pragma warning(pop)
#pragma pop_macro("_UNICODE")
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
struct EffectOption;
|
||||||
|
class DeviceResources;
|
||||||
|
class BackendDescriptorStore;
|
||||||
|
class EffectsProfiler;
|
||||||
|
|
||||||
|
class EffectDrawer {
|
||||||
|
public:
|
||||||
|
EffectDrawer() = default;
|
||||||
|
EffectDrawer(const EffectDrawer&) = delete;
|
||||||
|
EffectDrawer(EffectDrawer&&) = default;
|
||||||
|
|
||||||
|
~EffectDrawer();
|
||||||
|
|
||||||
|
bool Initialize(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
BackendDescriptorStore& descriptorStore,
|
||||||
|
ID3D11Texture2D** inOutTexture
|
||||||
|
) noexcept;
|
||||||
|
|
||||||
|
void Draw(EffectsProfiler& profiler) const noexcept;
|
||||||
|
|
||||||
|
void DrawForExport(const EffectDesc& desc, uint32_t passIdx) const noexcept;
|
||||||
|
|
||||||
|
bool ResizeTextures(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
ID3D11Texture2D** inOutTexture
|
||||||
|
) noexcept;
|
||||||
|
|
||||||
|
ID3D11Texture2D* GetOutputTexture() const noexcept {
|
||||||
|
return _textures[1].get();
|
||||||
|
}
|
||||||
|
|
||||||
|
ID3D11Texture2D* GetTexture(uint32_t idx) const noexcept {
|
||||||
|
return _textures[idx].get();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SIZE _CalcOutputSize(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
SIZE inputSize
|
||||||
|
) const noexcept;
|
||||||
|
|
||||||
|
bool _UpdatePassResources(const EffectDesc& desc) noexcept;
|
||||||
|
|
||||||
|
bool _UpdateConstants(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
const EffectOption& option,
|
||||||
|
DeviceResources& deviceResources,
|
||||||
|
SIZE inputSize,
|
||||||
|
SIZE outputSize
|
||||||
|
) noexcept;
|
||||||
|
|
||||||
|
void _PrepareForDraw() const noexcept;
|
||||||
|
|
||||||
|
void _DrawPass(uint32_t i) const noexcept;
|
||||||
|
|
||||||
|
SmallVector<uint32_t> _CalcPassesToDrawForExport(
|
||||||
|
const EffectDesc& desc,
|
||||||
|
uint32_t passIdx
|
||||||
|
) const noexcept;
|
||||||
|
|
||||||
|
ID3D11DeviceContext* _d3dDC = nullptr;
|
||||||
|
BackendDescriptorStore* _descriptorStore = nullptr;
|
||||||
|
|
||||||
|
SmallVector<ID3D11SamplerState*> _samplers;
|
||||||
|
SmallVector<winrt::com_ptr<ID3D11Texture2D>> _textures;
|
||||||
|
std::vector<SmallVector<ID3D11ShaderResourceView*>> _srvs;
|
||||||
|
// 后半部分为空,用于解绑
|
||||||
|
std::vector<SmallVector<ID3D11UnorderedAccessView*>> _uavs;
|
||||||
|
|
||||||
|
winrt::com_ptr<ID3D11Buffer> _constantBuffer;
|
||||||
|
|
||||||
|
SmallVector<winrt::com_ptr<ID3D11ComputeShader>> _shaders;
|
||||||
|
|
||||||
|
SmallVector<std::pair<uint32_t, uint32_t>> _dispatches;
|
||||||
|
|
||||||
|
static inline mu::Parser _exprParser;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,40 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class D3D12Context;
|
|
||||||
struct EffectOption;
|
|
||||||
struct EffectInfo;
|
|
||||||
class ComputeContext;
|
|
||||||
|
|
||||||
// State of an effect drawer.
// NOTE(review): presumably reported through the `state` parameter of
// EffectDrawerBase::Update - confirm against the implementations.
enum class EffectDrawerState {
    NotReady = 0,
    Ready = 1,
    Error = 2
};
|
|
||||||
|
|
||||||
class EffectDrawerBase {
|
|
||||||
public:
|
|
||||||
EffectDrawerBase() = default;
|
|
||||||
EffectDrawerBase(const EffectDrawerBase&) = delete;
|
|
||||||
EffectDrawerBase(EffectDrawerBase&&) = delete;
|
|
||||||
|
|
||||||
virtual ~EffectDrawerBase() noexcept = default;
|
|
||||||
|
|
||||||
virtual const EffectInfo* Initialize(
|
|
||||||
D3D12Context& d3d12Context,
|
|
||||||
const EffectOption& effectOption
|
|
||||||
) noexcept = 0;
|
|
||||||
|
|
||||||
virtual void Bind(SizeU inputSize, SizeU outputSize, const ColorInfo& colorInfo) noexcept = 0;
|
|
||||||
|
|
||||||
virtual HRESULT Update(EffectDrawerState& state, std::string& message) noexcept = 0;
|
|
||||||
|
|
||||||
virtual HRESULT Draw(
|
|
||||||
ComputeContext& computeContext,
|
|
||||||
uint32_t inputSrvOffset,
|
|
||||||
uint32_t outputUavOffset
|
|
||||||
) noexcept = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
47
src/Magpie.Core/EffectHelper.h
Normal file
47
src/Magpie.Core/EffectHelper.h
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
#pragma once
|
||||||
|
#include <cstdint>
|
||||||
|
#include <dxgi.h>
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
struct EffectHelper {
|
||||||
|
struct EffectIntermediateTextureFormatDesc {
|
||||||
|
const char* name;
|
||||||
|
DXGI_FORMAT dxgiFormat;
|
||||||
|
uint32_t nChannel;
|
||||||
|
const char* srvTexelType;
|
||||||
|
const char* uavTexelType;
|
||||||
|
};
|
||||||
|
|
||||||
|
static constexpr EffectIntermediateTextureFormatDesc FORMAT_DESCS[] = {
|
||||||
|
{"R32G32B32A32_FLOAT", DXGI_FORMAT_R32G32B32A32_FLOAT, 4, "float4", "float4"},
|
||||||
|
{"R16G16B16A16_FLOAT", DXGI_FORMAT_R16G16B16A16_FLOAT, 4, "MF4", "MF4"},
|
||||||
|
{"R16G16B16A16_UNORM", DXGI_FORMAT_R16G16B16A16_UNORM, 4, "MF4", "unorm MF4"},
|
||||||
|
{"R16G16B16A16_SNORM", DXGI_FORMAT_R16G16B16A16_SNORM, 4, "MF4", "snorm MF4"},
|
||||||
|
{"R32G32_FLOAT", DXGI_FORMAT_R32G32_FLOAT, 2, "float2", "float2"},
|
||||||
|
{"R10G10B10A2_UNORM", DXGI_FORMAT_R10G10B10A2_UNORM, 4, "MF4", "unorm MF4"},
|
||||||
|
{"R11G11B10_FLOAT", DXGI_FORMAT_R11G11B10_FLOAT, 3, "MF3", "MF3"},
|
||||||
|
{"R8G8B8A8_UNORM", DXGI_FORMAT_R8G8B8A8_UNORM, 4, "MF4", "unorm MF4"},
|
||||||
|
{"R8G8B8A8_SNORM", DXGI_FORMAT_R8G8B8A8_SNORM, 4, "MF4", "snorm MF4"},
|
||||||
|
{"R16G16_FLOAT", DXGI_FORMAT_R16G16_FLOAT, 2, "MF2", "MF2"},
|
||||||
|
{"R16G16_UNORM", DXGI_FORMAT_R16G16_UNORM, 2, "MF2", "unorm MF2"},
|
||||||
|
{"R16G16_SNORM", DXGI_FORMAT_R16G16_SNORM, 2, "MF2", "snorm MF2"},
|
||||||
|
{"R32_FLOAT" ,DXGI_FORMAT_R32_FLOAT, 1, "float", "float"},
|
||||||
|
{"R8G8_UNORM", DXGI_FORMAT_R8G8_UNORM, 2, "MF2", "unorm MF2"},
|
||||||
|
{"R8G8_SNORM", DXGI_FORMAT_R8G8_SNORM, 2, "MF2", "snorm MF2"},
|
||||||
|
{"R16_FLOAT", DXGI_FORMAT_R16_FLOAT, 1, "MF", "MF"},
|
||||||
|
{"R16_UNORM", DXGI_FORMAT_R16_UNORM, 1, "MF", "unorm MF"},
|
||||||
|
{"R16_SNORM", DXGI_FORMAT_R16_SNORM,1, "MF", "snorm MF"},
|
||||||
|
{"R8_UNORM", DXGI_FORMAT_R8_UNORM, 1, "MF", "unorm MF"},
|
||||||
|
{"R8_SNORM", DXGI_FORMAT_R8_SNORM, 1, "MF", "snorm MF"},
|
||||||
|
{"UNKNOWN", DXGI_FORMAT_UNKNOWN, 4, "float4", "float4"}
|
||||||
|
};
|
||||||
|
|
||||||
|
union Constant32 {
|
||||||
|
float floatVal;
|
||||||
|
uint32_t uintVal;
|
||||||
|
int intVal;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,437 +0,0 @@
|
||||||
#include "pch.h"
|
|
||||||
#include "EffectsDrawer.h"
|
|
||||||
#include "CatmullRomDrawer.h"
|
|
||||||
#include "CommandContext.h"
|
|
||||||
#include "D3D12Context.h"
|
|
||||||
#include "Logger.h"
|
|
||||||
#include "ScalingWindow.h"
|
|
||||||
#include "ShaderEffectDrawer.h"
|
|
||||||
#include "EffectInfo.h"
|
|
||||||
#include "DescriptorHeap.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
// In debug builds, returns this object's descriptors to the descriptor heap.
// Release builds do nothing here.
EffectsDrawer::~EffectsDrawer() noexcept {
#ifdef _DEBUG
    // NOTE(review): uint32_t max appears to be the "nothing allocated"
    // sentinel for _descriptorBaseOffset - confirm against the header.
    const bool hasDescriptors =
        _descriptorBaseOffset != std::numeric_limits<uint32_t>::max();
    if (hasDescriptors) {
        auto& descriptorHeap = _d3d12Context->GetDescriptorHeap();
        descriptorHeap.Free(_descriptorBaseOffset, _CalcDescriptorCount());
    }
#endif
}
|
|
||||||
|
|
||||||
// Computes the output size of an effect.
//
// A non-zero `scaleFactor` means the effect scales by that fixed integer
// factor. Otherwise the effect supports free scaling and the size is derived
// from the scaling type and scale values in `effectOption`.
static SizeU CalcOutputSize(
    uint32_t scaleFactor,
    SizeU inputSize,
    SizeU rendererSize,
    const EffectOption& effectOption
) noexcept {
    if (scaleFactor != 0) {
        return SizeU{ inputSize.width * scaleFactor, inputSize.height * scaleFactor };
    }

    // Rounds a pair of floating-point dimensions to an unsigned size.
    const auto roundToSize = [](auto width, auto height) {
        return SizeU{ (uint32_t)std::lround(width), (uint32_t)std::lround(height) };
    };

    const auto& scale = effectOption.scale;

    // Free scaling is supported.
    switch (effectOption.scalingType) {
    case ScalingType::Normal:
        // Scale factors relative to the input size.
        return roundToSize(inputSize.width * scale.first, inputSize.height * scale.second);
    case ScalingType::Absolute:
        // The "scale" values are the output size in pixels.
        return roundToSize(scale.first, scale.second);
    case ScalingType::Fit:
    {
        // In windowed-mode scaling, Fit with a factor of 1 is treated as
        // Fill. The scaling is known to be proportional in that case, but
        // rounding may introduce a one-pixel error. Consider a 100x50 window:
        // resizing the width to 101 rounds the height to 51, and resizing the
        // width to 102 still gives a height of 51. Fit's formula would leave
        // a black border in one of those two cases.
        bool treatFitAsFill = ScalingWindow::Get().Options().IsWindowedMode() &&
            IsApprox(scale.first, 1.0f) &&
            IsApprox(scale.second, 1.0f);

        if (!treatFitAsFill) {
            // Largest uniform scale at which the input still fits the renderer.
            float fillScale = std::min(
                float(rendererSize.width) / inputSize.width,
                float(rendererSize.height) / inputSize.height
            );
            return roundToSize(
                inputSize.width * fillScale * scale.first,
                inputSize.height * fillScale * scale.second
            );
        }
        [[fallthrough]];
    }
    default:
        assert(effectOption.scalingType == ScalingType::Fit ||
            effectOption.scalingType == ScalingType::Fill);
        return rendererSize;
    }
}
|
|
||||||
|
|
||||||
// Initializes the effect chain.
//
// Creates one ShaderEffectDrawer per configured effect, computes the effect
// bindings, creates the textures passed between effects together with their
// SRV/UAV descriptors, and sets up the timestamp query heap and its readback
// buffer. On success `outputSize` receives the final output size. Returns
// false if any step fails.
bool EffectsDrawer::Initialize(
    D3D12Context& d3d12Context,
    const ColorInfo& colorInfo,
    SizeU inputSize,
    SizeU rendererSize,
    SizeU& outputSize
) noexcept {
    _d3d12Context = &d3d12Context;
    _colorInfo = colorInfo;
    _inputSize = inputSize;
    _rendererSize = rendererSize;

    ID3D12Device5* device = d3d12Context.GetDevice();
    const ScalingOptions& options = ScalingWindow::Get().Options();

    const uint32_t effectCount = (uint32_t)options.effects.size();
    _effectDatas.resize(effectCount);

    // Effect initialization may be asynchronous, so start it as early as possible.
    for (uint32_t effectIdx = 0; effectIdx < effectCount; ++effectIdx) {
        _EffectData& effectData = _effectDatas[effectIdx];
        effectData.drawer = std::make_unique<ShaderEffectDrawer>();
        effectData.effectInfo = effectData.drawer->Initialize(d3d12Context, options.effects[effectIdx]);
        if (!effectData.effectInfo) {
            Logger::Get().Error("ShaderEffectDrawer::Initialize 失败");
            return false;
        }
    }

    _UpdateEffectBindings();
    outputSize = _outputSize;

    // Create the input/output textures of the effects.
    const uint32_t descriptorCount = _CalcDescriptorCount();
    if (descriptorCount != 0) {
        auto& descriptorHeap = _d3d12Context->GetDescriptorHeap();
        HRESULT hr = descriptorHeap.Alloc(descriptorCount, _descriptorBaseOffset);
        if (FAILED(hr)) {
            Logger::Get().ComError("DescriptorHeap::Alloc 失败", hr);
            return false;
        }

        CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(descriptorHeap.GetCpuHandle(_descriptorBaseOffset));
        const uint32_t descriptorSize = descriptorHeap.GetDescriptorSize();

        CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_DEFAULT);

        // Skip zero-initialization of the new resources when the device supports it.
        D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE;
        if (_d3d12Context->IsHeapFlagCreateNotZeroedSupported()) {
            heapFlags = D3D12_HEAP_FLAG_CREATE_NOT_ZEROED;
        }

        // The texture format depends on whether the content is SDR.
        const bool isSrgb = colorInfo.kind == winrt::AdvancedColorKind::StandardDynamicRange;
        const DXGI_FORMAT textureFormat =
            isSrgb ? DXGI_FORMAT_R10G10B10A2_UNORM : DXGI_FORMAT_R16G16B16A16_FLOAT;

        // Width and height are filled in per effect below.
        CD3DX12_RESOURCE_DESC texDesc = CD3DX12_RESOURCE_DESC::Tex2D(
            textureFormat,
            0, 0, 1, 1, 1, 0,
            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
        );

        CD3DX12_SHADER_RESOURCE_VIEW_DESC srvDesc =
            CD3DX12_SHADER_RESOURCE_VIEW_DESC::Tex2D(texDesc.Format, 1);
        CD3DX12_UNORDERED_ACCESS_VIEW_DESC uavDesc =
            CD3DX12_UNORDERED_ACCESS_VIEW_DESC::Tex2D(texDesc.Format);

        for (uint32_t effectIdx = 0; effectIdx < effectCount; ++effectIdx) {
            auto& effectData = _effectDatas[effectIdx];

            // If no downscaling is needed, the last effect writes directly to
            // the ring buffer and does not need an output texture.
            const bool isLastEffect = effectIdx == effectCount - 1;
            if (isLastEffect && effectData.outputSize == _outputSize) {
                break;
            }

            texDesc.Width = effectData.outputSize.width;
            texDesc.Height = effectData.outputSize.height;

            hr = device->CreateCommittedResource(
                &heapProps, heapFlags, &texDesc, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
                nullptr, IID_PPV_ARGS(&effectData.outputTexture));
            if (FAILED(hr)) {
                Logger::Get().ComError("CreateCommittedResource 失败", hr);
                return false;
            }

            // Each output texture takes two consecutive descriptors: SRV, then UAV.
            device->CreateShaderResourceView(effectData.outputTexture.get(), &srvDesc, cpuHandle);
            cpuHandle.Offset(descriptorSize);

            device->CreateUnorderedAccessView(
                effectData.outputTexture.get(), nullptr, &uavDesc, cpuHandle);
            cpuHandle.Offset(descriptorSize);
        }
    }

    // CatmullRomDrawer creates its PSO on demand while rendering, so
    // initializing it here costs nothing.
    _catmullRomDrawer.Initialize(d3d12Context);

    // Two timestamps per frame.
    const uint32_t timestampCount = 2 * ScalingWindow::Get().Options().maxProducerInFlightFrames;

    D3D12_QUERY_HEAP_DESC queryHeapDesc = {
        .Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP,
        .Count = timestampCount
    };
    HRESULT hr = device->CreateQueryHeap(&queryHeapDesc, IID_PPV_ARGS(&_queryHeap));
    if (FAILED(hr)) {
        Logger::Get().ComError("CreateQueryHeap 失败", hr);
        return false;
    }

    // Readback buffer that receives the resolved timestamps.
    CD3DX12_HEAP_PROPERTIES readbackHeapProps(D3D12_HEAP_TYPE_READBACK);
    CD3DX12_RESOURCE_DESC bufferDesc =
        CD3DX12_RESOURCE_DESC::Buffer(timestampCount * sizeof(UINT64));
    hr = device->CreateCommittedResource(
        &readbackHeapProps,
        D3D12_HEAP_FLAG_NONE,
        &bufferDesc,
        D3D12_RESOURCE_STATE_COPY_DEST,
        nullptr,
        IID_PPV_ARGS(&_queryResultBuffer)
    );
    if (FAILED(hr)) {
        Logger::Get().ComError("CreateCommittedResource 失败", hr);
        return false;
    }

    hr = d3d12Context.GetCommandQueue()->GetTimestampFrequency(&_timestampFrequency);
    if (FAILED(hr)) {
        Logger::Get().ComError("ID3D12CommandQueue::GetTimestampFrequency 失败", hr);
        return false;
    }

    return true;
}
|
|
||||||
|
|
||||||
// Records the whole effect chain into computeContext.
//
// Each effect is asked to Update() first. Effects that do not report
// EffectDrawerState::Ready are skipped; a run of consecutive skipped effects is
// replaced by a single CatmullRom pass that goes from the input of the first
// skipped effect to the output of the last skipped one. After the chain, a
// final CatmullRom pass is recorded when the chain's result still has to be
// resized to _outputSize.
//
// inputSrvOffset / outputUavOffset are the descriptor offsets of the source
// SRV and of the final destination UAV. frameIndex, inputResource and
// outputResource are currently unused.
//
// Returns S_OK, or the failing HRESULT from an effect's Update()/Draw().
HRESULT EffectsDrawer::Draw(
	ComputeContext& computeContext,
	uint32_t /*frameIndex*/,
	ID3D12Resource* /*inputResource*/,
	ID3D12Resource* /*outputResource*/,
	uint32_t inputSrvOffset,
	uint32_t outputUavOffset
) noexcept {
	// Render-time measurement (currently disabled, kept for reference):
	// const uint32_t queryHeapIndex = 2 * frameIndex;
	// {
	// 	CD3DX12_RANGE range(queryHeapIndex * sizeof(UINT64), (queryHeapIndex + 2) * sizeof(UINT64));
	//
	// 	void* pData;
	// 	HRESULT hr = _queryResultBuffer->Map(0, nullptr, &pData);
	// 	if (FAILED(hr)) {
	// 		Logger::Get().ComError("ID3D12Resource::Map 失败", hr);
	// 		return hr;
	// 	}
	//
	// 	UINT64* timestampes = (UINT64*)pData + queryHeapIndex;
	//
	// 	range = {};
	// 	_queryResultBuffer->Unmap(0, &range);
	// }
	//
	// commandList->EndQuery(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex);

	// Marker meaning "no run of skipped effects is pending".
	constexpr uint32_t NO_PENDING_RUN = std::numeric_limits<uint32_t>::max();

	// Source of a CatmullRom fallback pass: its size and its SRV descriptor offset.
	struct FallbackInput {
		SizeU size;
		uint32_t srv;
	};

	// UAV -> SRV: the texture is about to be read.
	const auto makeReadable = [&](ID3D12Resource* texture) {
		computeContext.InsertTransitionBarrier(
			texture,
			D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
			D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE
		);
	};

	// SRV -> UAV: the texture is about to be written.
	const auto makeWritable = [&](ID3D12Resource* texture) {
		computeContext.InsertTransitionBarrier(
			texture,
			D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
			D3D12_RESOURCE_STATE_UNORDERED_ACCESS
		);
	};

	// Resolves what a fallback pass starting at effect `runStart` has to read.
	// When the run starts at the first effect this is the chain input; otherwise
	// it is the previous effect's output texture, which is made readable here.
	const auto prepareFallbackInput = [&](uint32_t runStart) -> FallbackInput {
		if (runStart == 0) {
			return { _inputSize, inputSrvOffset };
		}

		const uint32_t prevIdx = runStart - 1;
		auto& prevEffect = _effectDatas[prevIdx];
		makeReadable(prevEffect.outputTexture.get());
		return { prevEffect.outputSize, _descriptorBaseOffset + prevIdx * 2 };
	};

	const uint32_t effectCount = (uint32_t)_effectDatas.size();
	// Index of the first effect of the pending run of skipped effects, if any.
	uint32_t pendingRunStart = NO_PENDING_RUN;

	for (uint32_t effectIdx = 0; effectIdx < effectCount; ++effectIdx) {
		auto& curEffect = _effectDatas[effectIdx];

		EffectDrawerState state;
		std::string msg;
		HRESULT hr = curEffect.drawer->Update(state, msg);
		if (FAILED(hr)) {
			Logger::Get().ComError("ShaderEffectDrawer::Update 失败", hr);
			return hr;
		}

		if (state != EffectDrawerState::Ready) {
			// Extend (or open) the run of effects that cannot be rendered.
			if (pendingRunStart == NO_PENDING_RUN) {
				pendingRunStart = effectIdx;
			}
			continue;
		}

		if (pendingRunStart != NO_PENDING_RUN) {
			// Bridge the skipped run with one CatmullRom pass that writes into the
			// output texture of the effect right before the current one.
			FallbackInput fallbackInput = prepareFallbackInput(pendingRunStart);

			auto& runEnd = _effectDatas[size_t(effectIdx - 1)];
			makeWritable(runEnd.outputTexture.get());

			_catmullRomDrawer.Draw(
				computeContext,
				fallbackInput.size,
				runEnd.outputSize,
				fallbackInput.srv,
				_descriptorBaseOffset + effectIdx * 2 - 1,
				false
			);

			pendingRunStart = NO_PENDING_RUN;
		}

		// The last effect writes straight to the final destination when its
		// output already has the final size.
		const bool writeToRingBuffer =
			effectIdx == effectCount - 1 && curEffect.outputSize == _outputSize;

		if (effectIdx != 0) {
			makeReadable(_effectDatas[size_t(effectIdx - 1)].outputTexture.get());
		}

		if (!writeToRingBuffer) {
			makeWritable(curEffect.outputTexture.get());
		}

		const uint32_t sourceSrv = effectIdx == 0 ?
			inputSrvOffset : _descriptorBaseOffset + (effectIdx - 1) * 2;
		const uint32_t targetUav = writeToRingBuffer ?
			outputUavOffset : _descriptorBaseOffset + effectIdx * 2 + 1;

		hr = curEffect.drawer->Draw(computeContext, sourceSrv, targetUav);
		if (FAILED(hr)) {
			Logger::Get().ComError("EffectDrawerBase::Draw 失败", hr);
			return hr;
		}
	}

	if (pendingRunStart != NO_PENDING_RUN) {
		// The chain ended inside a skipped run: the fallback pass produces the
		// final output directly.
		FallbackInput fallbackInput = prepareFallbackInput(pendingRunStart);

		_catmullRomDrawer.Draw(
			computeContext,
			fallbackInput.size,
			_outputSize,
			fallbackInput.srv,
			outputUavOffset,
			false
		);
	} else if (_effectDatas.back().outputSize != _outputSize) {
		// The last effect rendered into its own texture; resize it to the final size.
		auto& lastEffect = _effectDatas.back();
		makeReadable(lastEffect.outputTexture.get());

		_catmullRomDrawer.Draw(
			computeContext,
			lastEffect.outputSize,
			_outputSize,
			_descriptorBaseOffset + (effectCount - 1) * 2,
			outputUavOffset,
			false
		);
	}

	// commandList->EndQuery(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex + 1);
	// commandList->ResolveQueryData(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex, 2,
	// 	_queryResultBuffer.get(), queryHeapIndex * sizeof(UINT64));

	return S_OK;
}
|
|
||||||
|
|
||||||
// Stores the new renderer size, recomputes every effect's sizes and bindings,
// and reports the resulting final output size through `outputSize`.
void EffectsDrawer::OnResized(SizeU rendererSize, SizeU& outputSize) noexcept {
	_rendererSize = rendererSize;

	// Refreshes _outputSize as a side effect.
	_UpdateEffectBindings();

	outputSize = _outputSize;
}
|
|
||||||
|
|
||||||
// Stores the new color information and rebinds every effect with it.
void EffectsDrawer::OnColorInfoChanged(const ColorInfo& colorInfo) noexcept {
	_colorInfo = colorInfo;

	_UpdateEffectBindings();
}
|
|
||||||
|
|
||||||
uint32_t EffectsDrawer::_CalcDescriptorCount() const noexcept {
|
|
||||||
// 如果最后一个效果的缩放类型是 Fit 或 Fill 且缩放比例不大于 1,那么始终可以直接写入环形缓冲区,
|
|
||||||
// 需要的描述符数量可以减少两个。
|
|
||||||
// 还有更复杂的情况,如倒数第二个效果是 Fit(0.5,0.5),最后一个效果放大一倍,也可以认为输出尺寸
|
|
||||||
// 永远不会大于 rendererSize,不过这较为复杂,还有舍入的问题,安全起见不进行优化。
|
|
||||||
uint32_t count = (uint32_t)_effectDatas.size() * 2;
|
|
||||||
|
|
||||||
if (_effectDatas.back().effectInfo->scaleFactor != 0) {
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
const EffectOption& effectOption = ScalingWindow::Get().Options().effects.back();
|
|
||||||
if ((effectOption.scalingType == ScalingType::Fit || effectOption.scalingType == ScalingType::Fill) &&
|
|
||||||
effectOption.scale.first < 1 + FLOAT_EPSILON<float> &&
|
|
||||||
effectOption.scale.second < 1 + FLOAT_EPSILON<float>)
|
|
||||||
{
|
|
||||||
return count - 2;
|
|
||||||
} else {
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void EffectsDrawer::_UpdateEffectBindings() noexcept {
|
|
||||||
const ScalingOptions& options = ScalingWindow::Get().Options();
|
|
||||||
|
|
||||||
_outputSize = _inputSize;
|
|
||||||
for (uint32_t i = 0; i < _effectDatas.size(); ++i) {
|
|
||||||
_EffectData& effectData = _effectDatas[i];
|
|
||||||
const EffectOption& effectOption = options.effects[i];
|
|
||||||
|
|
||||||
// outputSize 是前一个效果的输出尺寸,即当前效果的输入尺寸
|
|
||||||
effectData.outputSize = CalcOutputSize(
|
|
||||||
effectData.effectInfo->scaleFactor, _outputSize, _rendererSize, effectOption);
|
|
||||||
|
|
||||||
effectData.drawer->Bind(_outputSize, effectData.outputSize, _colorInfo);
|
|
||||||
|
|
||||||
_outputSize = effectData.outputSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果输出尺寸比渲染区域更大则使用 CatmullRom 等比缩小,窗口模式缩放下可能要放大
|
|
||||||
if (_outputSize != _rendererSize) {
|
|
||||||
if (options.IsWindowedMode()) {
|
|
||||||
// 窗口模式缩放已确保等比例,这里直接赋值以避免舍入误差
|
|
||||||
_outputSize = _rendererSize;
|
|
||||||
} else if (_outputSize.width > _rendererSize.width ||
|
|
||||||
_outputSize.height > _rendererSize.height)
|
|
||||||
{
|
|
||||||
float scaleX = float(_rendererSize.width) / _outputSize.width;
|
|
||||||
float scaleY = float(_rendererSize.height) / _outputSize.height;
|
|
||||||
if (scaleX <= scaleY) {
|
|
||||||
_outputSize.width = _rendererSize.width;
|
|
||||||
_outputSize.height = std::lround(_outputSize.height * scaleX);
|
|
||||||
} else {
|
|
||||||
_outputSize.width = std::lround(_outputSize.width * scaleY);
|
|
||||||
_outputSize.height = _rendererSize.height;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,74 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include "CatmullRomDrawer.h"
|
|
||||||
#include "SmallVector.h"
|
|
||||||
|
|
||||||
namespace Magpie {
|
|
||||||
|
|
||||||
class ComputeContext;
|
|
||||||
class EffectDrawerBase;
|
|
||||||
struct EffectInfo;
|
|
||||||
|
|
||||||
class EffectsDrawer {
|
|
||||||
public:
|
|
||||||
EffectsDrawer() noexcept = default;
|
|
||||||
EffectsDrawer(const EffectsDrawer&) = delete;
|
|
||||||
EffectsDrawer(EffectsDrawer&&) = delete;
|
|
||||||
|
|
||||||
~EffectsDrawer() noexcept;
|
|
||||||
|
|
||||||
bool Initialize(
|
|
||||||
D3D12Context& d3d12Context,
|
|
||||||
const ColorInfo& colorInfo,
|
|
||||||
SizeU inputSize,
|
|
||||||
SizeU rendererSize,
|
|
||||||
SizeU& outputSize
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
HRESULT Draw(
|
|
||||||
ComputeContext& computeContext,
|
|
||||||
uint32_t frameIndex,
|
|
||||||
ID3D12Resource* inputResource,
|
|
||||||
ID3D12Resource* outputResource,
|
|
||||||
uint32_t inputSrvOffset,
|
|
||||||
uint32_t outputUavOffset
|
|
||||||
) noexcept;
|
|
||||||
|
|
||||||
SizeU GetOutputSize() const noexcept {
|
|
||||||
return _outputSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
void OnResized(SizeU rendererSize, SizeU& outputSize) noexcept;
|
|
||||||
|
|
||||||
void OnColorInfoChanged(const ColorInfo& colorInfo) noexcept;
|
|
||||||
|
|
||||||
private:
|
|
||||||
uint32_t _CalcDescriptorCount() const noexcept;
|
|
||||||
|
|
||||||
void _UpdateEffectBindings() noexcept;
|
|
||||||
|
|
||||||
D3D12Context* _d3d12Context = nullptr;
|
|
||||||
|
|
||||||
SizeU _inputSize{};
|
|
||||||
SizeU _outputSize{};
|
|
||||||
SizeU _rendererSize{};
|
|
||||||
ColorInfo _colorInfo;
|
|
||||||
|
|
||||||
struct _EffectData {
|
|
||||||
std::unique_ptr<EffectDrawerBase> drawer;
|
|
||||||
const EffectInfo* effectInfo = nullptr;
|
|
||||||
SizeU outputSize{};
|
|
||||||
winrt::com_ptr<ID3D12Resource> outputTexture;
|
|
||||||
};
|
|
||||||
|
|
||||||
SmallVector<_EffectData> _effectDatas;
|
|
||||||
CatmullRomDrawer _catmullRomDrawer;
|
|
||||||
|
|
||||||
// 描述符的布局是 SRV|UAV|SRV|UAV|...
|
|
||||||
uint32_t _descriptorBaseOffset = std::numeric_limits<uint32_t>::max();
|
|
||||||
|
|
||||||
winrt::com_ptr<ID3D12QueryHeap> _queryHeap;
|
|
||||||
winrt::com_ptr<ID3D12Resource> _queryResultBuffer;
|
|
||||||
UINT64 _timestampFrequency = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
124
src/Magpie.Core/EffectsProfiler.cpp
Normal file
124
src/Magpie.Core/EffectsProfiler.cpp
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
#include "pch.h"
|
||||||
|
#include "EffectsProfiler.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
// Starts profiling: creates the disjoint query, the start timestamp query and
// one timestamp query per pass.
//
// Must not be called while already profiling; passCount must be positive.
// If any query cannot be created, everything created so far is released again
// and profiling stays off, so the other members never see a half-initialized
// set of (possibly null) queries.
void EffectsProfiler::Start(ID3D11Device* d3dDevice, uint32_t passCount) noexcept {
	assert(!IsProfiling() && passCount > 0);

	_passQueries.resize(passCount);

	D3D11_QUERY_DESC desc{ .Query = D3D11_QUERY_TIMESTAMP_DISJOINT };
	bool succeeded = SUCCEEDED(d3dDevice->CreateQuery(&desc, _disjointQuery.put()));

	desc.Query = D3D11_QUERY_TIMESTAMP;
	if (succeeded) {
		succeeded = SUCCEEDED(d3dDevice->CreateQuery(&desc, _startQuery.put()));
	}

	for (winrt::com_ptr<ID3D11Query>& query : _passQueries) {
		if (!succeeded) {
			break;
		}
		succeeded = SUCCEEDED(d3dDevice->CreateQuery(&desc, query.put()));
	}

	if (!succeeded) {
		// Roll back so that IsProfiling() reports false.
		Stop();
	}
}
|
||||||
|
|
||||||
|
// Stops profiling by releasing every query object. Releasing the disjoint
// query is what makes IsProfiling() return false.
void EffectsProfiler::Stop() noexcept {
	_disjointQuery = nullptr;
	_startQuery = nullptr;

	_passQueries.clear();
}
|
||||||
|
|
||||||
|
bool EffectsProfiler::IsProfiling() const noexcept {
|
||||||
|
return (bool)_disjointQuery;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adjusts the number of per-pass timestamp queries while profiling.
//
// Does nothing when profiling is off or the count is unchanged. Shrinking just
// drops the surplus queries; growing creates the missing ones. If a query
// cannot be created, profiling is stopped altogether rather than leaving null
// entries in _passQueries that would later be handed to the device context.
void EffectsProfiler::SetPassCount(ID3D11Device* d3dDevice, uint32_t passCount) noexcept {
	if (!IsProfiling()) {
		return;
	}

	assert(passCount > 0);
	const uint32_t oldPassCount = (uint32_t)_passQueries.size();

	if (passCount == oldPassCount) {
		return;
	}

	_passQueries.resize(passCount);

	// Only runs when the list grew; the new tail entries are still empty.
	const D3D11_QUERY_DESC desc{ .Query = D3D11_QUERY_TIMESTAMP };
	for (uint32_t i = oldPassCount; i < passCount; ++i) {
		if (FAILED(d3dDevice->CreateQuery(&desc, _passQueries[i].put()))) {
			Stop();
			return;
		}
	}
}
|
||||||
|
|
||||||
|
// Marks the start of a frame's effects: opens the disjoint query, records the
// start timestamp and rewinds the pass cursor. No-op when not profiling.
void EffectsProfiler::OnBeginEffects(ID3D11DeviceContext* d3dDC) noexcept {
	if (IsProfiling()) {
		d3dDC->Begin(_disjointQuery.get());
		d3dDC->End(_startQuery.get());

		_curPass = 0;
	}
}
|
||||||
|
|
||||||
|
// Records the timestamp that ends the current pass and advances the pass
// cursor. No-op when not profiling.
//
// Calling this more often than the configured pass count is a caller bug: it
// is asserted in debug builds and ignored otherwise, instead of indexing past
// the end of _passQueries.
void EffectsProfiler::OnEndPass(ID3D11DeviceContext* d3dDC) noexcept {
	if (!IsProfiling()) {
		return;
	}

	assert(_curPass < _passQueries.size());
	if (_curPass >= _passQueries.size()) {
		return;
	}

	d3dDC->End(_passQueries[_curPass++].get());
}
|
||||||
|
|
||||||
|
// Marks the end of a frame's effects by closing the disjoint query.
// No-op when not profiling.
void EffectsProfiler::OnEndEffects(ID3D11DeviceContext* d3dDC) noexcept {
	if (IsProfiling()) {
		d3dDC->End(_disjointQuery.get());
	}
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static T GetQueryData(ID3D11DeviceContext* d3dDC, ID3D11Query* query) noexcept {
|
||||||
|
T data{};
|
||||||
|
while (d3dDC->GetData(query, &data, sizeof(data), 0) != S_OK) {
|
||||||
|
Sleep(0);
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
void EffectsProfiler::QueryTimings(ID3D11DeviceContext* d3dDC) noexcept {
|
||||||
|
if (!IsProfiling()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjointData =
|
||||||
|
GetQueryData<D3D11_QUERY_DATA_TIMESTAMP_DISJOINT>(d3dDC, _disjointQuery.get());
|
||||||
|
|
||||||
|
if (disjointData.Disjoint) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const float toMS = 1000.0f / disjointData.Frequency;
|
||||||
|
|
||||||
|
uint64_t prevTimestamp = GetQueryData<uint64_t>(d3dDC, _startQuery.get());
|
||||||
|
|
||||||
|
auto lock = _timingsLock.lock_exclusive();
|
||||||
|
_timings.resize(_passQueries.size());
|
||||||
|
for (size_t i = 0; i < _passQueries.size(); ++i) {
|
||||||
|
uint64_t timestamp = GetQueryData<uint64_t>(d3dDC, _passQueries[i].get());
|
||||||
|
_timings[i] = (timestamp - prevTimestamp) * toMS;
|
||||||
|
|
||||||
|
prevTimestamp = timestamp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hands the most recent per-pass timings to the caller and leaves the internal
// buffer empty. The result is empty when no new frame has been rendered since
// the previous call.
SmallVector<float> EffectsProfiler::GetTimings() noexcept {
	auto guard = _timingsLock.lock_exclusive();

	SmallVector<float> snapshot(std::move(_timings));
	// Put the moved-from buffer back into a well-defined empty state.
	_timings.clear();
	return snapshot;
}
|
||||||
|
|
||||||
|
}
|
||||||
45
src/Magpie.Core/EffectsProfiler.h
Normal file
45
src/Magpie.Core/EffectsProfiler.h
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
#pragma once
|
||||||
|
#include "SmallVector.h"
|
||||||
|
|
||||||
|
namespace Magpie {
|
||||||
|
|
||||||
|
class DeviceResources;
|
||||||
|
|
||||||
|
class EffectsProfiler {
|
||||||
|
public:
|
||||||
|
EffectsProfiler() = default;
|
||||||
|
|
||||||
|
EffectsProfiler(const EffectsProfiler&) = delete;
|
||||||
|
EffectsProfiler(EffectsProfiler&&) = delete;
|
||||||
|
|
||||||
|
void Start(ID3D11Device* d3dDevice, uint32_t passCount) noexcept;
|
||||||
|
|
||||||
|
void Stop() noexcept;
|
||||||
|
|
||||||
|
bool IsProfiling() const noexcept;
|
||||||
|
|
||||||
|
void SetPassCount(ID3D11Device* d3dDevice, uint32_t passCount) noexcept;
|
||||||
|
|
||||||
|
void OnBeginEffects(ID3D11DeviceContext* d3dDC) noexcept;
|
||||||
|
|
||||||
|
void OnEndPass(ID3D11DeviceContext* d3dDC) noexcept;
|
||||||
|
|
||||||
|
void OnEndEffects(ID3D11DeviceContext* d3dDC) noexcept;
|
||||||
|
|
||||||
|
void QueryTimings(ID3D11DeviceContext* d3dDC) noexcept;
|
||||||
|
|
||||||
|
// 从前端线程调用
|
||||||
|
SmallVector<float> GetTimings() noexcept;
|
||||||
|
|
||||||
|
private:
|
||||||
|
SmallVector<float> _timings;
|
||||||
|
wil::srwlock _timingsLock;
|
||||||
|
|
||||||
|
winrt::com_ptr<ID3D11Query> _disjointQuery;
|
||||||
|
winrt::com_ptr<ID3D11Query> _startQuery;
|
||||||
|
std::vector<winrt::com_ptr<ID3D11Query>> _passQueries;
|
||||||
|
|
||||||
|
uint32_t _curPass = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue