Compare commits

...

186 commits

Author SHA1 Message Date
Xu
0c33f77250 feat: 实现叠加层 (p3) 2026-04-29 15:20:43 +08:00
Xu
c148306a0c feat: 实现叠加层 (p2) 2026-04-27 14:16:11 +08:00
Xu
98814ff27b Merge branch 'dev' into d3d12 2026-04-24 15:41:23 +08:00
Xu
724ac95762 feat: 实现叠加层 (p1) 2026-04-24 15:40:46 +08:00
Xu
e25055dc77 Merge branch 'dev' into d3d12 2026-04-16 17:02:01 +08:00
Xu
7bc06cc10d perf: 优化部分效果和修复渲染错误 2026-04-16 16:46:01 +08:00
Xu
379982086c feat: 实现效果缓存 2026-04-16 15:13:26 +08:00
Xu
4be6613482 feat: 修改文件结构 2026-04-14 14:34:22 +08:00
Xu
49d5a7d24d perf: 优化部分效果 2026-04-09 13:03:36 +08:00
Xu
0a8d044f7a fix: 修复 DXC 编译的包含文件处理 2026-04-08 17:12:28 +08:00
Xu
2ebd706052 feat: 从文件加载纹理数据 (完成) 2026-04-08 16:44:00 +08:00
Xu
f3a707e021 feat: 从文件加载纹理数据 (p2) 2026-04-08 14:43:08 +08:00
Xu
255b353452 feat: 从文件加载纹理数据 (p1) 2026-04-07 17:27:53 +08:00
Xu
f1c900d0c3 feat: 着色器效果渲染 (p5) 2026-04-07 13:18:57 +08:00
Xu
ee949c430f feat: 着色器效果渲染 (p4) 2026-04-06 15:25:20 +08:00
Xu
f6463be543 feat: 着色器效果渲染 (p3) 2026-04-05 20:31:04 +08:00
Xu
2b02fdfc0d feat: 着色器效果渲染 (p2) 2026-04-04 21:35:05 +08:00
Xu
355dfbb7e2 feat: 着色器效果渲染 (p1) 2026-04-02 16:05:23 +08:00
Xu
76a3ae52a3 feat: 新版 FX 解析器 (p8) 2026-04-01 15:58:40 +08:00
Xu
ac17e9c2dd feat: 新版 FX 解析器 (p7) 2026-03-31 13:41:50 +08:00
Xu
22a3545b49 feat: 新版 FX 解析器 (p6) 2026-03-30 17:10:41 +08:00
Xu
d0d17311c3 feat: 新版 FX 解析器 (p5) 2026-03-30 13:33:27 +08:00
Xu
0279dca40c feat: 新版 FX 解析器 (p4) 2026-03-26 16:53:39 +08:00
Xu
a7c3f766f7 feat: 添加解析错误消息 2026-03-26 09:33:30 +08:00
Xu
7682132dce refactor: EffectsService 和 LocalizationService 移到 Magpie.Core 2026-03-24 13:36:26 +08:00
Xu
eadbf0e4d0 chore: 删除旧版 EffectCompiler 2026-03-24 11:52:59 +08:00
Xu
c388aa755a chore: 修复编译 2026-03-24 08:47:59 +08:00
Xu
830a7cb078 chore: 删除已废弃的旧版 CuNNy 2026-03-23 13:52:37 +08:00
Xu
f4e29b3b06 feat: 新版 FX 解析器 (p3) 2026-03-23 09:36:49 +08:00
Xu
ffc96f3a6c Merge branch 'dev' into d3d12 2026-03-23 08:59:31 +08:00
Xu
c178e071a7 refactor: 共享代码使用 Magpie 命名空间 2026-03-22 20:24:00 +08:00
Xu
d816d67683 feat: 新版 FX 解析器 (p2) 2026-03-22 19:57:22 +08:00
Xu
9ac3c6d11d feat: 新版 FX 解析器 (p1) 2026-03-21 14:17:22 +08:00
Xu
1cd65609fd chore: 删除未使用的旧代码 2026-03-20 13:48:14 +08:00
Xu
9b64ebc7d6 fix: CheckFeatureSupport 失败不算错误 2026-03-20 10:03:01 +08:00
Xu
2501e4a965 feat: 支持绘制动态光标 (完成) 2026-03-20 09:30:06 +08:00
Xu
1d3372092c feat: 支持绘制动态光标 (p5) 2026-03-19 20:51:22 +08:00
Xu
c6543c8228 feat: 支持绘制动态光标 (p4) 2026-03-19 16:52:01 +08:00
Xu
f68d06d9b7 feat: 支持绘制动态光标 (p3) 2026-03-19 13:52:23 +08:00
Xu
bc5489c2d0 feat: 支持绘制动态光标 (p2) 2026-03-18 20:21:51 +08:00
Xu
9ec7ef6ada feat: 支持绘制动态光标 (p1) 2026-03-18 13:45:41 +08:00
Xu
d052c3bce9 fix: 修复某些鼠标指针设置更改不会触发重新解析 2026-03-16 13:31:51 +08:00
Xu
1fd6a9d5f3 feat: 记录 D3D 设备能力
修复编译错误
光标插值选项默认值改为 Bicubic
2026-03-16 12:47:02 +08:00
Xu
a2dceaf57d perf: _cursorBaseSize 变化后避免等待 GPU 2026-03-16 10:37:05 +08:00
Xu
6fda50aab8 fix: 释放描述符 2026-03-15 22:03:29 +08:00
Xu
8b78a97c68 perf: 成员太多时清理 _cursorInfos 2026-03-15 21:52:59 +08:00
Xu
e66dfb8944 fix: 修复 _cursorInfos 扩容会使 _cursorInfosWithTempResources 中指针失效 2026-03-15 19:24:27 +08:00
Xu
f1739ba4e3 perf: 及时清理用于解析光标的临时资源 2026-03-15 17:44:08 +08:00
Xu
10dcfaa683 feat: 添加“最高 shader model”开发者选项 2026-03-15 15:45:55 +08:00
Xu
b2a9206f9b feat: 添加“使用软件渲染”开发者选项 2026-03-15 13:46:37 +08:00
Xu
b28f02b38d fix: 修复 sRGB 和其他色域互转 2026-03-14 18:50:44 +08:00
Xu
e529c87830 feat: 光标插值添加双三次选项 2026-03-14 15:15:51 +08:00
Xu
c24849a281 feat: 支持 SM6 以及引入 DXC 2026-03-14 14:40:23 +08:00
Xu
5100038255 Merge branch 'dev' into d3d12 2026-03-14 12:08:04 +08:00
Xu
d941b9af87 Merge branch 'dev' into d3d12 2026-03-12 19:00:03 +08:00
Xu
be7f606520 fix: 修复不支持 Agility SDK 的 OS 会加载错误版本的依赖 dll 2026-03-12 17:17:53 +08:00
Xu
5f0e0bbe9b fix: 再次检查 D3D12_HEAP_FLAG_CREATE_NOT_ZEROED 是否可用
旧版本 Win10 不支持 Agility SDK,因此不支持这个功能
2026-03-12 15:30:24 +08:00
Xu
78fb2c9b9a fix: 避免重复尝试解析光标 2026-03-12 14:06:40 +08:00
Xu
de3afc9876 refactor: 封装 command list 接口 2026-03-12 11:10:00 +08:00
Xu
66cc806b7a feat: 初步实现缩放光标纹理 2026-03-11 13:30:20 +08:00
Xu
ae39b21650 refactor: 删除旧版 CursorDrawer 2026-03-10 16:16:05 +08:00
Xu
792aeb1c0f feat: 实现在叠加层上绘制掩码光标 2026-03-10 14:46:35 +08:00
Xu
697854a18e feat: 现在 D3D12_HEAP_FLAG_CREATE_NOT_ZEROED 始终可用
因为部署了 Agility SDK
2026-03-10 10:05:33 +08:00
Xu
f767713385 perf: 稍微优化着色器 2026-03-09 19:15:56 +08:00
Xu
aaeb8d9a6b perf: 单色光标和彩色掩码光标着色器分离 sRGB 和 WCG/HDR 2026-03-09 15:48:40 +08:00
Xu
6f3394404c feat: 实现绘制光标 (p11)
支持彩色掩码光标
2026-03-09 14:29:29 +08:00
Xu
c7bb28d0fb feat: 实现绘制光标 (p10)
初步支持单色光标
2026-03-09 12:43:45 +08:00
Xu
98ec450dbc perf: 使用固定大小的描述符堆
动态描述符堆限制太多,尤其是不支持在渲染过程中分配。现在固定使用 64K 个描述符,肯定够用了
2026-03-08 20:09:03 +08:00
Xu
ba34dd8b3c feat: 实现绘制光标 (p9) 2026-03-08 19:01:02 +08:00
Xu
c74b6a5750 chore: Microsoft.UI.Xaml.dll 移动到 app 文件夹 2026-03-06 21:55:04 +08:00
Xu
8c7c51bb61 feat: 重新组织文件结构 2026-03-06 18:55:31 +08:00
Xu
5a93a499b4 fix: 修复 CopyCS.hlsl 2026-03-05 13:42:08 +08:00
Xu
bd75773437 perf: 优化着色器组织 2026-03-05 11:34:01 +08:00
Xu
70b0215410 perf: 输入和输出尺寸相同时优化 CatmullRomDrawer 的性能 2026-03-04 18:37:26 +08:00
Xu
44768d5922 feat: 实现绘制光标 (p8) 2026-03-04 12:29:43 +08:00
Xu
78724540aa fix: 修复彩色掩码光标的越界访问 2026-03-03 21:05:45 +08:00
Xu
83b3af72bf feat: 实现绘制光标 (p7)
处理光标色域
2026-03-03 20:30:45 +08:00
Xu
34bd6b9162 feat: 实现绘制光标 (p6) 2026-03-03 15:10:37 +08:00
Xu
79cfb58876 chore: 修复编译 2026-03-03 09:00:27 +08:00
Xu
4eccf94575 Merge branch 'dev' into d3d12 2026-03-03 08:49:10 +08:00
Xu
83e3f5ccdf feat: 实现动态描述符堆 (完成) 2026-03-02 16:11:24 +08:00
Xu
51ea21a9f6 feat: 实现动态描述符堆 (p6) 2026-03-02 14:29:43 +08:00
Xu
a15f7afce9 chore: 更新 Agility SDK
GPU-Based Validation 的内存泄露已修复
2026-02-27 15:12:23 +08:00
Xu
7c67b4f6b2 feat: 实现动态描述符堆 (p5) 2026-02-27 14:56:54 +08:00
Xu
6b84d780de feat: 实现动态描述符堆 (p4) 2026-02-26 10:38:11 +08:00
Xu
fd8f956c6e feat: 实现动态描述符堆 (p3) 2026-02-25 12:47:42 +08:00
Xu
cb2ced9b42 feat: 实现动态描述符堆 (p2) 2026-02-25 10:42:36 +08:00
Xu
65502115e6 feat: 实现动态描述符堆 (p1) 2026-02-24 14:46:46 +08:00
Xu
18c1e4e280 Merge branch 'dev' into d3d12 2026-02-19 17:13:49 +08:00
Xu
3ec15f2580 Merge branch 'dev' into d3d12 2026-01-30 17:27:07 +08:00
Xu
493d57233b
Merge branch 'dev' into d3d12 2026-01-29 21:30:44 +08:00
Xu
1fbbeca375 feat: 实现绘制光标 (p5) 2026-01-20 17:01:45 +08:00
Xu
69fbc182da chore: Debug 配置下使用最新 WARP
旧版本 WARP 可能和 Agility SDK 不兼容
2026-01-20 09:28:49 +08:00
Xu
07d426d2d8 feat: 实现绘制光标 (p4) 2026-01-15 17:03:29 +08:00
Xu
bc0c5c9ec8 feat: feat: 实现绘制光标 (p3) 2026-01-13 16:51:23 +08:00
Xu
a0ac7020c3 feat: WindowCase 支持模拟系统光标 2026-01-13 16:35:33 +08:00
Xu
514858993c feat: 实现绘制光标 (p2) 2026-01-12 17:28:16 +08:00
Xu
f88914ab01 feat: 搭建框架 2026-01-11 15:20:12 +08:00
Xu
7dbb7a619d fix: 修复 WCG 和 HDR 间切换可能不会立刻捕获新帧的问题 2026-01-08 22:05:18 +08:00
Xu
ad3892bdf9 feat: 实现绘制光标 (p1) 2026-01-08 20:52:33 +08:00
Xu
4bfd962c4f fix: 多个错误修复 2026-01-08 16:57:14 +08:00
Xu
fb4c3b183d feat: WindowCase 支持模拟不同类型的光标 2026-01-05 22:06:31 +08:00
Xu
17497bbfe9 fix: 修复色域切换 bug 2025-12-31 18:42:50 +08:00
Xu
df05e9e13b perf: 提高改变大小和色域的性能 2025-12-31 16:46:32 +08:00
Xu
a197c474e5 feat: 支持改变色域 2025-12-31 14:10:50 +08:00
Xu
d881b4a845 feat: 添加统计动态检查重复帧的预测正确率的调试选项 2025-12-29 17:19:59 +08:00
Xu
54e3680e3c perf: 特定情况下检查重复帧不执行边界检查 2025-12-29 15:18:59 +08:00
Xu
7e685c0168 fix: 从 Win11 25H2 开始不再特殊处理 Kirikiri 窗口
24H2 的某次更新修复了 WGC 无法捕获 Kirikiri 窗口的问题
2025-12-29 14:47:36 +08:00
Xu
2340169185 fix: 暂时不使用 WGC 的脏区域功能 2025-12-29 11:12:53 +08:00
Xu
ad301e2e2f fix: 检查重复帧添加精确越界检查 2025-12-28 22:36:39 +08:00
Xu
debc99615e perf: 改为使用 D3D11 检查重复帧
有两个原因:
1. D3D11 支持 IDXGIDevice::SetGPUThreadPriority,可以提高 GPU 优先级,而 D3D12 没有等价接口。
2. 对于小任务 D3D11 启动渲染的耗时比 D3D12 短,差距可以达到 50us 以上。
2025-12-28 17:25:51 +08:00
Xu
3c391d231f fix: 检查重复帧添加基本越界检查 2025-12-28 12:38:32 +08:00
Xu
347dbd7590 Merge branch 'dev' into d3d12 2025-12-27 20:17:38 +08:00
Xu
59ae713fc6 perf: 捕获时只复制脏矩形 2025-12-27 15:19:36 +08:00
Xu
426e570943 perf: 脏矩形间有小间隙也合并
添加验证优化算法正确性的调试选项
2025-12-27 11:43:15 +08:00
Xu
0b1cf2afe2 perf: 避免合并脏矩形耗时太长 2025-12-26 19:28:39 +08:00
Xu
906b82e3a1 chore: 删除不再需要的字符串资源 2025-12-25 22:03:35 +08:00
Xu
5740d69982 perf: 检查重复帧时避免重复创建描述符 2025-12-25 21:57:27 +08:00
Xu
1a10749f50 fix: 错误修复 2025-12-25 18:46:54 +08:00
Xu
d4dc153b0c feat: 实现动态检测重复帧 2025-12-25 18:14:15 +08:00
Xu
b5557a1efc fix: 修复鼠标离开缩放窗口时卡死的问题 2025-12-25 10:35:00 +08:00
Xu
b7c33fec68 feat: WGC 支持脏矩形 (p3) 2025-12-24 22:28:57 +08:00
Xu
c2c0e71630 feat: WGC 支持脏矩形 (p2) 2025-12-24 21:34:10 +08:00
Xu
b23964c2a8 feat: WGC 支持脏矩形 (p1) 2025-12-24 20:32:27 +08:00
Xu
459c91f7fc feat: 实现重复帧检测 (p4) 2025-12-23 23:02:35 +08:00
Xu
f70ac6c434 feat: 实现重复帧检测 (p3) 2025-12-23 17:14:09 +08:00
Xu
f631317eb2 feat: 实现重复帧检测 (p2) 2025-12-23 13:50:08 +08:00
Xu
7ab216079d feat: 实现重复帧检测 (p1) 2025-12-22 21:50:17 +08:00
Xu
1b3810b458 Merge branch 'dev' into d3d12 2025-12-22 18:58:10 +08:00
Xu
409b47e814 fix: 修复 WGC 导致光标消失 2025-12-22 15:28:22 +08:00
Xu
2baf3be3d2 feat: 添加调试选项 2025-12-22 12:53:00 +08:00
Xu
3c4600c41b feat: WGC 支持禁用源窗口圆角 2025-12-21 19:57:11 +08:00
Xu
491d1ac8b6 feat: WGC 支持更改源窗口样式 2025-12-18 10:46:12 +08:00
Xu
f787b44ecb perf: CatmullRom 每个线程计算的像素从 4 个改为 2 个
低端显卡上可以提高性能,高端显卡上无变化
2025-12-17 20:15:35 +08:00
Xu
92022a5ec3 feat: 测量渲染用时 2025-12-17 15:05:23 +08:00
Xu
b3c9965577 Merge branch 'dev' into d3d12 2025-12-16 20:52:18 +08:00
Xu
624536cac2 perf: 使用 D3D12_HEAP_FLAG_CREATE_NOT_ZEROED 优化创建堆的性能 2025-12-15 15:58:48 +08:00
Xu
c5878938d0 perf: 提高消费者队列的优先级 2025-12-14 20:31:23 +08:00
Xu
6e736e595c perf: 不使用集成显卡捕获 2025-12-14 17:37:33 +08:00
Xu
a826b36262 feat: 测量从捕获到呈现的耗时 2025-12-13 16:01:30 +08:00
Xu
747c101f26 feat: 测量捕获帧被 DWM 呈现到被 Magpie 呈现的延迟 2025-12-13 14:24:46 +08:00
Xu
872337bb67 feat: 添加调试信息 2025-12-12 17:57:34 +08:00
Xu
601cc539bb feat: CatmullRom 适应色域 2025-12-11 20:13:59 +08:00
Xu
0af6528215 feat: 生产者完成新帧后通知消费者 2025-12-11 19:36:16 +08:00
Xu
7afdee05d7 feat: 实现 CatmullRom (p3) 2025-12-11 16:39:28 +08:00
Xu
cf9241b62d feat: 实现 CatmullRom (p2) 2025-12-11 15:56:43 +08:00
Xu
eb88670b8f feat: 实现 CatmullRom (p1) 2025-12-10 22:09:26 +08:00
Xu
fda054aa36 fix: 跨适配器捕获时遵循“写入者创建”的原则,否则可能无法正确同步,Intel 集显作为捕获设备时存在这个问题 2025-12-10 18:55:20 +08:00
Xu
b2166d4d53 perf: WGC 避免重复复制 2025-12-10 13:47:31 +08:00
Xu
9157a3fb8b fix: 生产者写入共享纹理结束后将其转换到只读状态以确保安全 2025-12-10 10:22:50 +08:00
Xu
f04493d583 fix: 修复调整大小时闪烁的问题 2025-12-09 19:36:08 +08:00
Xu
64dc60de2f chore: 注释 2025-12-09 17:10:27 +08:00
Xu
bb1a02cc03 feat: 实现 WGC 跨适配器捕获
跨适配器时 WGC 用内存作中转,现在我们以 D3D12 跨适配器堆中转,由于不需要 CPU 介入,性能应有提升,待测试
2025-12-09 16:33:27 +08:00
Xu
b40b2875e9 perf: 捕获时异步复制纹理,可以提高并行度 2025-12-09 13:48:33 +08:00
Xu
095d349877 feat: 实现更改输出画面位置 2025-12-09 10:13:00 +08:00
Xu
0bb28015b3 feat: 实现大小调整 2025-12-08 20:57:27 +08:00
Xu
c01860bc71 fix: 修复调整大小时鼠标行为 2025-12-08 16:19:35 +08:00
Xu
309c5a1710 feat: 初步实现鼠标功能 2025-12-08 15:56:48 +08:00
Xu
47dee842dd feat: 第一帧渲染完成后显示缩放窗口 2025-12-08 13:13:23 +08:00
Xu
4326dc627c perf: 避免多余栅栏 2025-12-08 12:51:59 +08:00
Xu
4d99b18cfc 全面重构 (p2) 2025-12-08 11:09:06 +08:00
Xu
c4f352aef6 全面重构 (p1) 2025-12-07 20:50:24 +08:00
Xu
7154814f15 Merge branch 'dev' into d3d12 2025-12-04 17:47:42 +08:00
Xu
4a7a3054b2 Merge branch 'dev' into d3d12 2025-12-02 20:02:07 +08:00
Xu
ed8823f914 chore: 更新依赖 2025-12-01 14:28:05 +08:00
Xu
62a8954e52 Merge branch 'dev' into d3d12 2025-12-01 14:17:45 +08:00
Xu
c66d5206de Merge branch 'dev' into d3d12 2025-12-01 14:06:58 +08:00
Xu
00733a884d fix: 缩放结束时等待 GPU 完成 2025-11-25 22:27:32 +08:00
Xu
efa62ad8b9 chore: Debug 配置下使用 Agility SDK 辅助调试
OS 的调试层会错误汇报资源状态不匹配
2025-11-25 15:26:11 +08:00
Xu
b620797b7e fix: 修复 WGC 偶尔死锁 2025-11-25 11:53:44 +08:00
Xu
74650af46f feat: 实现 WGC (p3) 2025-11-24 21:24:45 +08:00
Xu
aa0fa24377 feat: 实现 WGC (p2) 2025-11-24 17:18:42 +08:00
Xu
44410b667e feat: 实现 WGC (p1) 2025-11-20 15:52:02 +08:00
Xu
51e1cc856f feat: 初步搭建生产者管线 2025-11-19 21:47:33 +08:00
Xu
a8c8fb05cc Merge branch 'dev' into d3d12 2025-11-19 19:23:45 +08:00
Xu
ae7ca5f75b feat: 实现生产者-消费者渲染架构 2025-11-18 17:03:51 +08:00
Xu
339fdd36cc feat: 实现检查色域 2025-11-17 17:22:40 +08:00
Xu
de8a0a902c feat: 实现调整大小 2025-11-16 20:45:16 +08:00
Xu
28bafb7185 feat: 绘制背景色 2025-11-16 19:21:40 +08:00
Xu
177149f9a8 feat: 创建交换链 2025-11-16 18:29:58 +08:00
Xu
f682220d5c Merge branch 'dev' into d3d12 2025-11-16 17:15:42 +08:00
Xu
a7f6167e67 Merge branch 'dev' into d3d12 2025-11-16 16:38:58 +08:00
Xu
0a819ea877 feat: 创建命令队列 2025-11-13 22:07:26 +08:00
Xu
1fc44f6e34 feat: 实现创建 D3D12 设备 2025-11-12 23:31:15 +08:00
Xu
859f4dbcf1 feat: 删除 CatmullRom 2025-11-11 19:13:41 +08:00
Xu
3e676f8527 feat: 添加 Catmull-Rom 2025-11-11 17:28:52 +08:00
274 changed files with 17414 additions and 52801 deletions

View file

@ -1,5 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
<Type Name="SmallVectorImpl&lt;*&gt;">
<Type Name="Magpie::SmallVectorImpl&lt;*&gt;">
<DisplayString IncludeView ="elt0" Condition="Size == 0"></DisplayString>
<DisplayString IncludeView ="elt0">{(($T1*)BeginX)[0]}{*this,view(elt1)}</DisplayString>
<DisplayString IncludeView ="elt1" Condition="Size == 1"></DisplayString>

View file

@ -60,7 +60,7 @@ versionNumProps = f";MajorVersion={args.version_major};MinorVersion={args.versio
versionStrProp = "" if args.version_string == "" else f";VersionString={args.version_string}"
p = subprocess.run(
f'"{msbuildPath}" Magpie.slnx -m -t:Rebuild -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={args.platform};DisablePDB=true;UseClangCL={args.compiler == "ClangCL"};UseNativeMicroArch={args.use_native_march};OutDir={os.getcwd()}\\publish\\{args.platform}\\;CommitId={commitId}{versionNumProps}{versionStrProp}'
f'"{msbuildPath}" Magpie.slnx -m -t:Rebuild -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={args.platform};DisablePDB=true;UseClangCL={args.compiler == "ClangCL"};UseNativeMicroArch={args.use_native_march};OutBaseDir={os.getcwd()}\\publish\\{args.platform}\\;CommitId={commitId}{versionNumProps}{versionStrProp}'
)
if p.returncode != 0:
raise Exception("编译失败")
@ -82,8 +82,9 @@ def remove_file(file):
pass
for file in glob.glob("*.lib"):
remove_file(file)
for pattern in ["*.lib", "*.exp"]:
for file in glob.glob(pattern):
remove_file(file)
print("清理完毕", flush=True)
@ -102,7 +103,7 @@ if args.pfx_path != "":
)
passwordOption = "" if args.pfx_password == "" else f'/p "{args.pfx_password}"'
p = subprocess.run(
f'"{windowsSdkDir}\\x64\\signtool.exe" sign /fd SHA256 /a /f "{pfxPath}" {passwordOption} TouchHelper.exe'
f'"{windowsSdkDir}\\x64\\signtool.exe" sign /fd SHA256 /a /f "{pfxPath}" {passwordOption} app\\TouchHelper.exe'
)
if p.returncode != 0:
raise Exception("签名失败")

View file

@ -8,13 +8,14 @@
<UseNativeMicroArch>false</UseNativeMicroArch>
<!-- 编译为打包应用 (暂不支持) -->
<IsPackaged>false</IsPackaged>
<!-- 窗口模式缩放时把用于调整窗口尺寸的辅助窗口标示出来 -->
<DebugBorder>false</DebugBorder>
<!-- 在性能分析器上显示调试信息 -->
<DebugInfoOnOverlay>false</DebugInfoOnOverlay>
<!-- 启用调试信息 -->
<DebugInfo>false</DebugInfo>
<!-- 使用 composition swapchain 呈现 -->
<UseCompSwapchain>false</UseCompSwapchain>
<!-- 禁止生成 PDB -->
<DisablePDB>false</DisablePDB>
<OutBaseDir></OutBaseDir>
<MajorVersion></MajorVersion>
<MinorVersion></MinorVersion>
<PatchVersion></PatchVersion>

View file

@ -10,8 +10,7 @@
<PreprocessorDefinitions>MP_MAJOR_VERSION=$(MajorVersion);MP_MINOR_VERSION=$(MinorVersion);MP_PATCH_VERSION=$(PatchVersion);%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(VersionString)' != ''">MP_VERSION_STRING=$(VersionString);%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CommitId)' != ''">MP_COMMIT_ID=$(CommitId);%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="$(DebugBorder)">MP_DEBUG_BORDER;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="$(DebugInfoOnOverlay)">MP_DEBUG_INFO_ON_OVERLAY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="$(DebugInfo)">MP_DEBUG_INFO;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="$(UseCompSwapchain)">MP_USE_COMPSWAPCHAIN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<DebugInformationFormat Condition="'$(DisablePDB)' == 'true'">None</DebugInformationFormat>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
@ -68,7 +67,7 @@
<!-- /Zc:checkGwOdr: 防止 /Gw 导致某些 ODR 违规被忽略 -->
<AdditionalOptions>/Gw %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="!$(UseClangCL)">/Zc:checkGwOdr %(AdditionalOptions)</AdditionalOptions>
<!-- clang-cl 不支持 /LTCG应使用 LTO -->
<!-- clang-cl 不支持 LTCG,应使用 LTO -->
<AdditionalOptions Condition="$(UseClangCL)">/clang:-flto %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>

View file

@ -29,6 +29,8 @@
<VCProjectVersion Condition="$(VS17)">17.0</VCProjectVersion>
<VCProjectVersion Condition="!$(VS17)">18.0</VCProjectVersion>
<DefaultLanguage>en-US</DefaultLanguage>
<IntDir>$(SolutionDir)\obj\$(Platform)\$(Configuration)\$(MSBuildProjectName)\</IntDir>
<OutBaseDir Condition="'$(OutBaseDir)' == ''">$(SolutionDir)\bin\$(Platform)\$(Configuration)\</OutBaseDir>
<MajorVersion Condition="'$(MajorVersion)' == ''">0</MajorVersion>
<MinorVersion Condition="'$(MinorVersion)' == ''">0</MinorVersion>
<PatchVersion Condition="'$(PatchVersion)' == ''">0</PatchVersion>

View file

@ -2,19 +2,16 @@
// 移植自 https://github.com/TianZerL/ACNetGLSL/blob/f20a6b6b7327f4caf588b06c6b21f18e40dae1ce/glsl/ACNet.glsl
//!MAGPIE EFFECT
//!VERSION 4
//!USE MulAdd
//!VERSION 5
//!CAPABILITY FP16
//!SCALE_FACTOR 2
#include "StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
@ -49,7 +46,6 @@ SamplerState sam;
//!FILTER LINEAR
SamplerState sam1;
//!COMMON
#ifdef MP_DEBUG
@ -58,7 +54,6 @@ SamplerState sam1;
#define RELU(x) max(x, 0)
//!PASS 1
//!DESC L1
//!IN INPUT
@ -66,8 +61,11 @@ SamplerState sam1;
//!BLOCK_SIZE 16
//!NUM_THREADS 64
MF GetLuma(MF3 color) {
return dot(MF3(0.299, 0.587, 0.114), color);
// ACNet 工作在 YUV 颜色空间,原作者是这么做的,见
// https://github.com/TianZerL/Anime4KCPP/blob/b8b3a09fd50b1bb15751eb9aa90b7e7f55b8e51e/Anime4KCore/src/Anime4KGPUCNN.cpp
// sRGB 和 YUV 的转换见 https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
MF GetY(MF3 color) {
return dot(MF3(0.2126, 0.7152, 0.0722), EncodeSrgb(color));
}
const static MF kernelsL1A[9 * 4] = {
@ -104,7 +102,6 @@ const static MF kernelsL1B[9 * 4] = {
const static MF4 biasL1B = { 0.0223, 0.0340, 0.0150, -0.0044 };
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
uint2 inputSize = GetInputSize();
@ -127,10 +124,10 @@ void Pass1(uint2 blockStart, uint3 threadId) {
// w z
// x y
src[i][j] = GetLuma(MF3(sr.w, sg.w, sb.w));
src[i][j + 1] = GetLuma(MF3(sr.x, sg.x, sb.x));
src[i + 1][j] = GetLuma(MF3(sr.z, sg.z, sb.z));
src[i + 1][j + 1] = GetLuma(MF3(sr.y, sg.y, sb.y));
src[i][j] = GetY(MF3(sr.w, sg.w, sb.w));
src[i][j + 1] = GetY(MF3(sr.x, sg.x, sb.x));
src[i + 1][j] = GetY(MF3(sr.z, sg.z, sb.z));
src[i + 1][j + 1] = GetY(MF3(sr.y, sg.y, sb.y));
}
}
@ -176,7 +173,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
}
}
//!PASS 2
//!DESC L2
//!IN tex1, tex2
@ -465,7 +461,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
tex4[gxy] = target2;
}
//!PASS 3
//!DESC L3
//!IN tex3, tex4
@ -675,7 +670,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { -0.0225, 0.0082, -0.0191, -0.0185 };
void Pass3(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -755,7 +749,6 @@ void Pass3(uint2 blockStart, uint3 threadId) {
tex2[gxy] = target2;
}
//!PASS 4
//!DESC L4
//!IN tex1, tex2
@ -965,7 +958,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { -8.1892e-04, 3.3171e-03, -1.1582e-02, -4.1205e-40 };
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -1045,7 +1037,6 @@ void Pass4(uint2 blockStart, uint3 threadId) {
tex4[gxy] = target2;
}
//!PASS 5
//!DESC L5
//!IN tex3, tex4
@ -1255,7 +1246,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { -0.0039, -0.0426, 0.0053, -0.0017 };
void Pass5(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -1335,7 +1325,6 @@ void Pass5(uint2 blockStart, uint3 threadId) {
tex2[gxy] = target2;
}
//!PASS 6
//!DESC L6
//!IN tex1, tex2
@ -1545,7 +1534,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { 0.1077, 0.0347, -0.0165, 0.7296 };
void Pass6(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -1625,7 +1613,6 @@ void Pass6(uint2 blockStart, uint3 threadId) {
tex4[gxy] = target2;
}
//!PASS 7
//!DESC L7
//!IN tex3, tex4
@ -1835,7 +1822,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { 2.3381e-02, -1.2136e-40, -5.6040e-39, 3.7100e-02 };
void Pass7(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -1915,7 +1901,6 @@ void Pass7(uint2 blockStart, uint3 threadId) {
tex2[gxy] = target2;
}
//!PASS 8
//!DESC L8
//!IN tex1, tex2
@ -2125,7 +2110,6 @@ const static MF kernelsLB[9 * 8 * 4] = {
const static MF4 biasLB = { 7.9956e-02, 3.0679e-04, -1.0257e-02, -1.2037e-02 };
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
@ -2205,7 +2189,6 @@ void Pass8(uint2 blockStart, uint3 threadId) {
tex4[gxy] = target2;
}
//!PASS 9
//!DESC L9, L10
//!IN INPUT, tex3, tex4
@ -2434,15 +2417,16 @@ const static MF kernelsL10[4 * 8] = {
0.0415, -0.1858
};
const static MF2x3 rgb2uv = {
-0.169, -0.331, 0.5,
0.5, -0.419, -0.081
// 来自 https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
const static MF2x3 srgb2uv = {
-0.1146, -0.3854, 0.5,
0.5, -0.4542, -0.0458
};
const static MF3x3 yuv2rgb = {
1, -0.00093, 1.401687,
1, -0.3437, -0.71417,
1, 1.77216, 0.00099
const static MF3x3 yuv2srgb = {
1, 0, 1.5748,
1, -0.1873, -0.4681,
1, 1.8556, 0
};
void Pass9(uint2 blockStart, uint3 threadId) {
@ -2530,7 +2514,7 @@ void Pass9(uint2 blockStart, uint3 threadId) {
uint2 destPos = gxy + uint2(i, j);
uint index = j * 2 + i;
MF luma = saturate(
MF newY = saturate(
target1.x * kernelsL10[0 + index] +
target1.y * kernelsL10[4 + index] +
target1.z * kernelsL10[8 + index] +
@ -2540,8 +2524,10 @@ void Pass9(uint2 blockStart, uint3 threadId) {
target2.z * kernelsL10[24 + index] +
target2.w * kernelsL10[28 + index]);
MF2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
OUTPUT[destPos] = MF4(mul(yuv2rgb, MF3(luma, originUV)), 1);
// ACNet 工作在 YUV 颜色空间
float3 originC = INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb;
MF2 originUV = mul(srgb2uv, EncodeSrgb(originC));
OUTPUT[destPos] = MF4(DecodeSrgb(mul(yuv2srgb, MF3(newY, originUV))), 1);
}
}
}

View file

@ -2,20 +2,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_S.glsl
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME Anime4K_Upscale_0
//!USE MulAdd
//!CAPABILITY FP16
//!SCALE_FACTOR 2
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
@ -38,7 +35,6 @@ SamplerState sam;
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -97,7 +93,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
tex1[gxy] = A4KS1(src, 1, 2);
}
//!PASS 2
//!DESC Conv-4x3x3x8
//!IN tex1
@ -240,7 +235,6 @@ void Pass3(uint2 blockStart, uint3 threadId) {
tex1[gxy] = A4KS3(src, 1, 2);
}
//!PASS 4
//!DESC Conv-4x3x3x8, Depth-to-Space
//!IN INPUT, tex1

View file

@ -2,7 +2,8 @@
// 移植自 https://github.com/ActualMandM/cemu_graphic_packs/blob/468d165cf27dae13a06e8bdc3d588d0af775ad91/Filters/Bicubic/output.glsl
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY AdvancedColor
#include "StubDefs.hlsli"
@ -34,7 +35,6 @@ Texture2D OUTPUT;
//!FILTER LINEAR
SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
@ -48,7 +48,7 @@ float weight(float x) {
if (ax < 1.0) {
return (x * x * ((12.0 - 9.0 * B - 6.0 * C) * ax + (-18.0 + 12.0 * B + 6.0 * C)) + (6.0 - 2.0 * B)) / 6.0;
} else if (ax >= 1.0 && ax < 2.0) {
} else if (ax < 2.0) {
return (x * x * ((-B - 6.0 * C) * ax + (6.0 * B + 30.0 * C)) + (-12.0 * B - 48.0 * C) * ax + (8.0 * B + 24.0 * C)) / 6.0;
} else {
return 0.0;
@ -64,7 +64,6 @@ float4 weight4(float x) {
);
}
float4 Pass1(float2 pos) {
const float2 inputPt = GetInputPt();
const float2 inputSize = GetInputSize();
@ -76,10 +75,6 @@ float4 Pass1(float2 pos) {
float4 rowtaps = weight4(1 - f.x);
float4 coltaps = weight4(1 - f.y);
// make sure all taps added together is exactly 1.0, otherwise some (very small) distortion can occur
rowtaps /= rowtaps.r + rowtaps.g + rowtaps.b + rowtaps.a;
coltaps /= coltaps.r + coltaps.g + coltaps.b + coltaps.a;
float2 uv1 = pos1 * inputPt;
float2 uv0 = uv1 - inputPt;
float2 uv2 = uv1 + inputPt;

View file

@ -1,5 +1,6 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY AdvancedColor
//!TEXTURE
Texture2D INPUT;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,340 +0,0 @@
// CuNNy 2x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N02
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// Shorthand shared by every pass.
// O(t, p): samples texture t with the point sampler SP at mip level 0, at the
// coordinate `pos + p * pt`. The macro captures `pos` and `pt` by name, so every
// function that expands it (directly or through a per-pass l0 macro) must declare
// locals with exactly those names.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// V4 / M4: short aliases for MF4 / MF4x4. The MF* types are declared outside this
// file (presumably the reduced-precision types tied to //!CAPABILITY FP16 — confirm).
#define V4 MF4
#define M4 MF4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): reads INPUT at offset (x, y) through O (so it relies on the caller's
// `pos` and `pt`), reduces the RGB sample to one scalar with a fixed weight
// vector, and adds a constant offset.
#define l0(x, y) (dot(MF3(-3.725e-01, -7.046e-01, -1.734e-01), O(INPUT, float2(x, y)).rgb) + MF(1.169e-01))
// First layer ("in"): turns nine scalar samples into one 4-component vector.
// s0_0 .. s0_8 are the samples in the order Pass1 gathers them: row by row,
// offsets (-1,-1), (0,-1), (1,-1), (-1,0), ..., (1,1).
// r starts as the bias vector; each mad() adds sample * per-sample weight vector.
// The accumulation order is kept as generated; reordering could change rounding.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
V4 r = { 1.492e-02, -1.961e-02, -7.539e-03, -3.574e-03 };
r = mad(s0_0, V4(-2.745e-03, -2.925e-03, 1.135e-01, 3.162e-02), r);
r = mad(s0_1, V4(4.049e-03, -3.428e-01, -7.641e-02, 2.484e-02), r);
r = mad(s0_2, V4(-8.372e-03, 3.398e-01, 1.072e-01, -5.449e-02), r);
r = mad(s0_3, V4(1.592e-02, 1.884e-02, -3.160e-02, -7.727e-02), r);
r = mad(s0_4, V4(4.429e-01, -3.936e-01, -4.134e-01, -4.287e-01), r);
r = mad(s0_5, V4(4.556e-02, 3.754e-01, -2.300e-02, 4.971e-01), r);
r = mad(s0_6, V4(-2.031e-02, -6.662e-03, 8.906e-02, 4.602e-02), r);
r = mad(s0_7, V4(-4.365e-01, 2.183e-03, 8.609e-02, 9.402e-03), r);
r = mad(s0_8, V4(-3.845e-02, 5.695e-03, 9.645e-02, -5.310e-02), r);
return r;
}
// Pass 1 entry point: computes one texel of t0 per invocation.
// blockStart: origin of the block this invocation belongs to.
// tid: thread id; only tid.x is used.
void Pass1(uint2 blockStart, uint3 tid) {
// NOTE: `pt` and `pos` must keep these exact names — the O / l0 macros
// reference them by name.
// Presumably the size of one input texel in UV units — confirm against GetInputPt.
float2 pt = float2(GetInputPt());
// Rmp8x8 is defined elsewhere; presumably it maps the linear thread index to a
// 2D offset inside an 8x8 tile (matches //!BLOCK_SIZE 8, //!NUM_THREADS 64) — confirm.
uint2 gxy = Rmp8x8(tid.x) + blockStart;
uint2 size = GetInputSize();
// Skip invocations that fall outside the input.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// Sampling coordinate of the centre of texel gxy (the +0.5 moves from the
// texel corner to its centre).
float2 pos = (gxy + 0.5) * pt;
// Gather the samples at offsets -1..1 in x and y, row by row.
MF s0_0 = l0(-1.0, -1.0);
MF s0_1 = l0(0.0, -1.0);
MF s0_2 = l0(1.0, -1.0);
MF s0_3 = l0(-1.0, 0.0);
MF s0_4 = l0(0.0, 0.0);
MF s0_5 = l0(1.0, 0.0);
MF s0_6 = l0(-1.0, 1.0);
MF s0_7 = l0(0.0, 1.0);
MF s0_8 = l0(1.0, 1.0);
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): reads t0 at offset (x, y) through O (relies on the caller's `pos`
// and `pt`) and converts the sample to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// "conv1" layer: combines eighteen 4-component inputs into one 4-component output.
// s0_0 .. s0_8 and s1_0 .. s1_8 are the two halves Pass2 builds from the same nine
// samples (s0_k = max(sample, 0), s1_k = -max(-sample, 0)), in row-by-row order.
// r starts as the bias vector; every MulAdd call folds one input and its 4x4
// weight matrix into r. MulAdd is declared outside this file (see //!USE MulAdd);
// presumably it returns mul(x, M) + r — confirm.
// The accumulation order is kept as generated; reordering could change rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { 4.789e-02, 4.713e-03, -2.854e-02, 9.967e-03 };
r = MulAdd(s0_0, M4(1.218e-02, -1.208e-01, -1.955e-01, -1.217e-01, 3.123e-02, -2.317e-02, 1.961e-01, -9.984e-02, 3.038e-03, 2.863e-02, -1.042e-01, -5.529e-02, 1.266e-01, -3.877e-01, 2.315e-01, -1.334e-01), r);
r = MulAdd(s0_1, M4(-1.774e-02, 1.636e-01, 1.379e-01, 7.499e-03, -7.890e-02, -3.970e-02, -6.053e-02, -1.431e-02, 4.167e-02, 9.728e-02, 3.825e-02, -2.704e-02, -2.303e-01, -3.348e-01, 2.940e-01, 4.825e-02), r);
r = MulAdd(s0_2, M4(1.239e-02, 1.613e-02, -2.280e-01, 8.985e-02, 2.106e-03, 3.847e-02, -2.539e-02, -3.326e-02, -6.327e-02, -1.427e-01, 4.218e-02, 8.995e-02, -6.045e-02, -1.073e-01, -1.329e-01, -2.085e-02), r);
r = MulAdd(s0_3, M4(-1.601e-01, -2.448e-01, -3.950e-01, 9.169e-03, -3.694e-02, 2.018e-01, -2.524e-01, 1.719e+00, 3.009e-02, 4.927e-02, 1.564e-01, 3.509e-02, -2.630e-02, -3.986e-01, 1.326e-01, -1.037e-02), r);
r = MulAdd(s0_4, M4(-1.074e+00, -1.654e-01, 4.163e-01, 3.816e-02, 4.580e-01, 4.350e-01, -3.490e-01, -1.257e-02, 1.159e-02, -2.083e-01, -2.744e-01, -2.667e-02, 2.826e-03, 1.986e-01, -2.723e-01, 9.612e-02), r);
r = MulAdd(s0_5, M4(-3.195e-01, -1.450e-01, -1.523e-01, -2.999e-03, 1.166e-01, 1.304e-01, 1.475e-01, 7.286e-02, -4.077e-02, -3.477e-02, 1.496e-01, -1.199e-02, 7.881e-02, 8.911e-02, -1.082e-01, -6.762e-02), r);
r = MulAdd(s0_6, M4(2.020e-02, 1.556e-01, -9.837e-03, 1.537e-02, -1.047e-01, 2.095e-01, 2.025e-01, -3.522e-02, -3.407e-02, -8.949e-02, -7.721e-02, -8.910e-03, 9.305e-02, 2.231e-01, 2.178e-01, 1.502e-02), r);
r = MulAdd(s0_7, M4(-7.936e-02, 3.096e-01, 1.869e-01, -1.950e-03, -2.452e-01, -5.098e-01, 5.304e-01, -4.921e-02, -1.073e-01, 1.062e-01, 2.527e-01, 5.909e-04, 3.797e-02, 3.291e-01, -2.395e-01, 2.768e-02), r);
r = MulAdd(s0_8, M4(-5.559e-02, 1.090e-01, -1.757e-01, 1.261e-02, -1.632e-01, -2.476e-01, -5.674e-02, -4.843e-03, 1.064e-02, 1.023e-01, 2.540e-02, -1.336e-02, 1.362e-01, 1.833e-01, 3.772e-03, 5.118e-04), r);
r = MulAdd(s1_0, M4(1.383e-01, 3.469e-01, 3.568e-02, -1.958e-01, -3.170e-02, -1.076e-02, -2.012e-02, -2.104e-04, 2.046e-02, -1.268e-02, -1.618e-01, -6.370e-02, 2.615e-02, 1.494e-01, -1.523e-01, 3.702e-02), r);
r = MulAdd(s1_1, M4(-1.140e-02, 6.811e-01, 5.722e-02, 1.514e-01, -6.311e-02, -3.541e-02, -1.150e-01, 3.625e-02, 1.146e-01, -1.395e-03, 5.059e-01, -7.835e-02, -3.907e-01, 6.172e-02, -9.656e-02, -2.727e-02), r);
r = MulAdd(s1_2, M4(1.239e-01, 1.206e-01, 7.519e-01, 2.106e-02, 8.647e-03, 1.082e-02, 5.931e-02, -4.215e-02, -2.216e-02, -4.829e-02, -1.927e-01, 1.159e-01, -1.789e-01, -9.596e-02, 1.395e-01, -6.395e-02), r);
r = MulAdd(s1_3, M4(1.194e-01, -5.786e-01, -1.761e-03, -1.126e-02, -5.311e-02, -2.325e-01, 1.733e-01, 2.842e-01, -1.080e-01, -1.012e-01, 1.851e-01, 4.253e-02, 1.212e-01, 2.435e-02, -3.061e-01, -9.579e-02), r);
r = MulAdd(s1_4, M4(-4.651e-02, -1.299e+00, -5.020e-01, 5.830e-02, 5.098e-01, 7.344e-02, -1.358e-01, 1.725e-02, -2.980e-01, -6.077e-01, 6.308e-01, -4.014e-02, 3.497e-01, 3.700e-01, -6.035e-01, 8.026e-02), r);
r = MulAdd(s1_5, M4(-1.851e-02, -2.057e-01, 5.081e-01, -5.262e-02, 1.715e-01, 1.387e-01, -1.123e-01, 9.022e-02, -1.532e-01, -3.749e-02, -1.930e-01, 6.423e-02, 2.763e-02, 5.993e-02, 4.141e-01, -8.825e-02), r);
r = MulAdd(s1_6, M4(-6.324e-03, -9.461e-02, 3.044e-02, -4.139e-03, -2.925e-02, 3.975e-01, 1.161e-01, 9.726e-03, 1.353e-01, 2.762e-01, 3.297e-03, 1.076e-02, -8.503e-02, -7.010e-01, -1.967e-01, -1.360e-03), r);
r = MulAdd(s1_7, M4(1.873e-02, 1.099e-01, 1.229e-01, -1.232e-02, -5.723e-01, -4.599e-02, -1.236e-01, -2.003e-02, -4.268e-01, 5.929e-01, 2.942e-01, 3.485e-02, 4.326e-01, -9.250e-02, 3.736e-01, -2.393e-02), r);
r = MulAdd(s1_8, M4(-5.991e-02, 1.199e-03, -1.349e-02, -1.321e-03, -2.036e-01, -1.937e-01, -7.888e-02, -9.144e-03, 1.557e-01, 7.018e-02, -2.646e-01, -3.360e-06, 1.742e-01, 1.814e-01, 1.385e-01, -1.030e-02), r);
return r;
}
// Pass 2 entry point. One thread handles one texel: it gathers nine l0()
// taps around that texel, splits each tap into a non-negative and a
// non-positive part, and stores f0() of those 18 vectors into t1.
//   blockStart - added to the per-thread offset to form the texel coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the sampling macro behind l0()
	// is expected to reference both of them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Bail out unless the texel is strictly inside the input extent on both axes.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// l0(x, y) (previous line): fetches texture t1 through the O() macro with the
// offset float2(x, y) and converts the result to V4.
//
// f0: weighted sum for pass 3 (DESC "conv2").
//   s0_0..s0_8, s1_0..s1_8 - two sets of nine V4 inputs. The caller, Pass3,
//     passes max(tap_k, 0) as s0_k and -max(-tap_k, 0) as s1_k, where tap_k
//     are the nine l0() taps at offsets (-1,-1) .. (1,1).
//   Returns r: it starts from the constant vector below and is updated by 18
//   MulAdd calls, in the order s0_0..s0_8 then s1_0..s1_8.
// MulAdd is not defined in this file; presumably MulAdd(v, M, r) yields r plus
// the product of v with the 4x4 matrix M - TODO confirm against its definition.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Initial value of the accumulator.
V4 r = { 7.359e-03, -1.132e-02, 1.248e-02, 7.243e-04 };
// Contributions of the non-negative tap parts (s0_*).
r = MulAdd(s0_0, M4(-1.565e-01, 1.307e-02, -5.269e-02, 5.465e-02, 2.936e-01, 1.626e-01, 4.589e-02, 2.478e-02, 3.520e-01, -5.445e-02, -2.480e-01, 2.838e-02, 1.841e-04, 1.264e-02, -1.370e-02, 2.588e-02), r);
r = MulAdd(s0_1, M4(2.350e-01, 2.116e-01, 2.167e-02, -1.559e-01, 2.502e-01, 4.320e-01, -7.152e-01, 2.270e-01, -2.668e-01, -2.117e-01, 5.598e-01, 2.261e-01, 4.101e-02, -4.860e-02, 3.530e-02, 8.932e-02), r);
r = MulAdd(s0_2, M4(-4.398e-02, -4.486e-02, -5.040e-02, 9.803e-02, 7.515e-02, 1.203e-01, -5.357e-02, -2.803e-01, -1.435e-01, 7.150e-03, -3.118e-02, -2.636e-01, -2.969e-02, -2.011e-02, 2.658e-02, -2.572e-02), r);
r = MulAdd(s0_3, M4(9.140e-02, -1.875e-01, 9.757e-02, 2.976e-02, -8.325e-02, 6.109e-02, -4.304e-02, 7.057e-02, 7.324e-01, -1.528e-01, 2.930e-01, 7.503e-02, -3.901e-02, 1.109e-03, -2.693e-02, -3.330e-02), r);
r = MulAdd(s0_4, M4(-9.944e-02, 1.858e-01, -2.436e-01, 3.822e-02, 6.685e-02, -1.758e-01, 1.382e-01, -1.715e-01, 3.252e-01, 5.176e-01, -2.939e-01, 4.311e-01, -6.125e-02, 1.905e-01, 8.140e-02, 2.095e-01), r);
r = MulAdd(s0_5, M4(3.193e-02, 6.029e-02, 1.869e-03, 8.627e-04, -1.402e-02, 4.288e-02, -5.756e-02, 8.813e-02, -2.758e-02, -5.267e-02, 1.702e-03, -6.676e-01, 6.373e-02, 5.766e-02, -6.325e-02, -2.744e-01), r);
r = MulAdd(s0_6, M4(4.918e-02, 5.420e-04, 3.692e-02, 7.796e-03, -1.163e-02, -4.074e-02, 2.057e-02, -2.837e-02, 1.083e-01, 1.958e-01, -5.078e-02, 2.750e-02, 5.323e-02, 5.953e-03, 4.766e-02, -2.265e-03), r);
r = MulAdd(s0_7, M4(-3.968e-02, -1.535e-01, 6.564e-02, -2.620e-02, 3.742e-02, 8.659e-02, -4.440e-02, 6.007e-03, -9.585e-02, -9.425e-02, -1.517e-01, 3.701e-01, -1.332e-01, -1.860e-01, -5.436e-02, 3.781e-01), r);
r = MulAdd(s0_8, M4(-1.145e-02, 6.045e-02, -4.676e-02, -5.604e-02, -1.576e-02, -3.528e-02, 2.252e-02, 1.997e-02, -2.546e-02, -6.894e-02, 7.238e-02, -3.495e-01, -6.323e-02, -1.042e-01, 1.091e-01, -4.170e-01), r);
// Contributions of the non-positive tap parts (s1_*).
r = MulAdd(s1_0, M4(-5.215e-01, 6.255e-01, 5.587e-02, -5.362e-02, 9.895e-02, -8.743e-03, 1.058e-01, -3.585e-02, -1.594e-02, -1.034e-01, 3.848e-02, -5.432e-02, -1.796e-02, 5.838e-02, 1.304e-01, -2.122e-02), r);
r = MulAdd(s1_1, M4(-6.987e-02, 8.696e-01, -1.130e+00, 5.558e-03, -1.080e-01, 4.195e-02, -1.323e-01, 2.270e-01, 3.451e-02, -1.616e-02, 4.251e-03, 1.470e-01, 2.442e-01, -5.904e-02, -3.467e-01, -2.056e-02), r);
r = MulAdd(s1_2, M4(4.884e-02, -1.034e-01, 5.823e-02, 1.131e-01, -4.126e-02, 6.519e-02, -1.532e-02, -2.420e-01, 1.092e-02, 1.869e-02, 1.913e-03, -1.787e-02, 1.122e-01, -1.481e-01, 1.843e-01, 3.454e-01), r);
r = MulAdd(s1_3, M4(-2.906e-01, -9.847e-01, 4.092e-01, 1.655e-01, 4.092e-02, 2.913e-01, 1.306e-01, -4.682e-02, 2.568e-01, -4.528e-02, 3.207e-02, 9.888e-02, -3.928e-01, -3.546e-01, -2.367e-01, -3.239e-01), r);
r = MulAdd(s1_4, M4(4.463e-01, -1.594e-01, 8.418e-01, -3.525e-01, 5.957e-01, 1.082e+00, -9.245e-01, 2.726e-01, 1.210e-01, 2.024e-01, -8.063e-03, -2.433e-01, -1.512e+00, 9.316e-01, 2.305e-01, -5.109e-01), r);
r = MulAdd(s1_5, M4(-2.393e-02, 1.286e-02, -9.453e-02, 3.071e-01, -1.402e-01, -2.436e-01, 1.202e-01, -1.409e-01, -1.857e-02, 2.421e-02, -2.642e-02, -7.415e-02, 8.786e-01, 5.260e-04, -9.212e-02, 1.849e-01), r);
r = MulAdd(s1_6, M4(8.958e-02, 9.057e-02, 1.712e-02, -2.838e-02, -1.405e-01, -6.455e-02, -2.695e-02, -1.110e-02, 8.731e-03, 6.531e-02, -3.752e-02, 1.194e-01, 4.585e-01, 6.270e-01, -1.367e-01, -2.529e-01), r);
r = MulAdd(s1_7, M4(-4.381e-02, -1.595e-02, -4.601e-02, 7.257e-02, -8.036e-02, -1.360e-01, 1.154e-01, -7.942e-02, -4.653e-02, -7.121e-02, 2.720e-02, 8.346e-02, -1.871e+00, -8.300e-01, -6.760e-01, 7.402e-01), r);
r = MulAdd(s1_8, M4(1.359e-02, -2.489e-02, 3.529e-02, -1.121e-01, -6.190e-02, -2.628e-02, -2.090e-03, 2.359e-01, -2.412e-02, -2.463e-02, 8.317e-03, -5.330e-02, 2.105e+00, 1.550e-01, 1.457e+00, -1.129e+00), r);
return r;
}
// Pass 3 entry point. One thread handles one texel: it gathers nine l0()
// taps of t1 around that texel, splits each tap into a non-negative and a
// non-positive part, and stores f0() of those 18 vectors into t0.
//   blockStart - added to the per-thread offset to form the texel coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the sampling macro behind l0()
	// is expected to reference both of them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Bail out unless the texel is strictly inside the input extent on both axes.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 4
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
#define l0(x, y) V4(O(t0, float2(x, y)))
// l0(x, y) (previous line): fetches texture t0 through the O() macro with the
// offset float2(x, y) and converts the result to V4.
//
// f0: weighted sum for pass 4 (DESC "out-shuffle").
//   s0_0..s0_8, s1_0..s1_8 - two sets of nine V4 inputs. The caller, Pass4,
//     passes max(tap_k, 0) as s0_k and -max(-tap_k, 0) as s1_k, where tap_k
//     are the nine l0() taps at offsets (-1,-1) .. (1,1).
//   Returns tanh(r), where r starts from the constant vector below and is
//   updated by 18 MulAdd calls, in the order s0_0..s0_8 then s1_0..s1_8.
//   Pass4 adds the four components of the result to the first YUV channel of
//   the four output pixels it writes.
// MulAdd is not defined in this file; presumably MulAdd(v, M, r) yields r plus
// the product of v with the 4x4 matrix M - TODO confirm against its definition.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Initial value of the accumulator.
V4 r = { -7.528e-04, -8.388e-04, -1.247e-03, -1.205e-03 };
// Contributions of the non-negative tap parts (s0_*).
r = MulAdd(s0_0, M4(8.642e-03, -1.295e-02, 1.998e-02, -1.289e-03, -4.147e-02, -4.021e-03, 1.491e-04, -7.275e-03, 1.574e-02, -4.122e-03, 1.126e-02, 8.962e-03, 5.174e-02, 3.405e-02, 4.993e-02, 4.529e-02), r);
r = MulAdd(s0_1, M4(-1.028e-01, -2.764e-02, -2.777e-02, -7.170e-03, -8.365e-02, 3.550e-02, 1.288e-01, 2.475e-02, 5.017e-02, 5.917e-02, 3.473e-02, 8.510e-03, 2.332e-02, 8.047e-02, 9.838e-02, 4.234e-02), r);
r = MulAdd(s0_2, M4(-2.319e-02, -4.432e-02, -1.679e-02, 8.855e-03, 3.259e-02, -1.974e-01, 5.938e-02, 1.616e-01, -5.605e-04, 3.183e-02, -3.356e-03, 3.138e-02, 9.572e-03, -3.887e-02, -2.632e-02, -1.161e-02), r);
r = MulAdd(s0_3, M4(-2.947e-02, -4.358e-02, 1.208e-03, -2.705e-02, -1.037e-02, -6.812e-02, -5.436e-02, -3.840e-02, 3.684e-02, 2.560e-02, 1.715e-02, -3.670e-02, -5.930e-02, -2.310e-02, -6.163e-02, -3.562e-02), r);
r = MulAdd(s0_4, M4(5.520e-01, 1.213e-01, 1.753e-01, 5.436e-02, 5.879e-01, 2.281e-01, -2.703e-01, 1.519e-01, 5.739e-01, 2.959e-01, 9.449e-02, 2.473e-02, -5.998e-01, -9.548e-02, -6.035e-01, -9.663e-02), r);
r = MulAdd(s0_5, M4(-9.740e-02, 2.744e-01, -1.522e-01, -7.204e-02, 1.178e-01, 6.112e-01, -4.801e-02, -5.176e-01, 1.480e-02, 8.323e-02, -6.764e-02, 4.138e-02, 1.121e-01, -8.141e-02, 1.211e-01, -8.737e-02), r);
r = MulAdd(s0_6, M4(6.315e-02, 6.323e-02, 1.146e-02, 3.378e-02, -9.598e-02, -1.089e-01, 2.780e-02, -6.091e-02, -1.194e-01, -1.038e-01, -2.147e-02, -4.236e-02, -2.300e-02, -3.184e-02, -1.560e-02, -2.206e-02), r);
r = MulAdd(s0_7, M4(-1.772e-01, -1.304e-01, 1.265e-01, -7.871e-02, 1.978e-01, 1.074e-01, 1.240e-02, 4.600e-02, 1.558e-02, -3.196e-02, 2.018e-01, 1.496e-01, 1.421e-01, 8.472e-02, 7.432e-02, 9.935e-02), r);
r = MulAdd(s0_8, M4(1.132e-02, -2.296e-03, 1.274e-01, 3.428e-01, -5.796e-02, -6.156e-02, -2.549e-01, -2.231e-01, -8.762e-02, -9.318e-02, -2.378e-01, -3.018e-01, 5.601e-03, -2.670e-02, 2.896e-02, -3.910e-02), r);
// Contributions of the non-positive tap parts (s1_*).
r = MulAdd(s1_0, M4(4.603e-02, -2.582e-02, -9.045e-03, 1.446e-02, -1.835e-02, -2.533e-02, 3.681e-03, -9.420e-03, -5.802e-02, 2.310e-02, 3.059e-02, 1.313e-03, 9.639e-02, 8.284e-02, 1.071e-01, -3.287e-02), r);
r = MulAdd(s1_1, M4(-2.480e-02, 2.321e-03, -3.594e-02, -1.101e-01, 2.850e-02, 2.912e-02, 2.597e-02, 2.777e-02, 5.701e-02, 9.536e-04, 2.533e-02, 1.102e-02, -3.714e-03, 7.838e-02, -1.716e-02, 1.723e-01), r);
r = MulAdd(s1_2, M4(-4.473e-03, 1.521e-02, -1.887e-02, 6.731e-03, 2.199e-03, 2.965e-02, -3.709e-03, 1.671e-02, 1.376e-02, -4.819e-02, -8.832e-04, 3.531e-02, -8.453e-03, -1.276e-02, -1.461e-02, 4.460e-03), r);
r = MulAdd(s1_3, M4(6.139e-02, -1.511e-01, 1.102e-01, -1.428e-01, -5.114e-02, -6.594e-02, -1.693e-02, -4.651e-02, 2.440e-01, 2.010e-02, -1.900e-01, -1.243e-03, -2.397e-01, 2.002e-01, -3.506e-01, 2.171e-01), r);
r = MulAdd(s1_4, M4(-6.189e-02, 5.137e-01, -8.132e-02, 4.526e-01, 3.263e-01, 2.134e-01, 1.027e-01, 2.067e-02, 2.407e-01, 2.591e-01, 4.489e-01, 2.042e-01, 1.932e-02, -4.463e-01, -1.479e-01, -6.843e-01), r);
r = MulAdd(s1_5, M4(-7.571e-03, -7.787e-02, 9.918e-03, -8.469e-02, 4.056e-02, -1.926e-02, -4.968e-02, 2.416e-02, 2.699e-02, 2.783e-01, -7.854e-02, -6.549e-02, 6.835e-03, 2.288e-02, 1.048e-02, -3.273e-02), r);
r = MulAdd(s1_6, M4(7.034e-02, 4.236e-02, 7.905e-02, -2.283e-03, -8.423e-02, -7.784e-02, -7.540e-03, -3.373e-02, -1.019e-01, -1.421e-01, 6.713e-02, -8.716e-02, -6.980e-02, -4.731e-02, -3.086e-02, -6.210e-03), r);
r = MulAdd(s1_7, M4(-1.597e-01, -2.036e-01, 5.194e-02, 8.457e-02, 1.387e-01, 7.910e-02, 2.030e-02, 5.848e-02, 2.154e-01, 1.382e-01, -8.617e-02, 7.552e-02, 3.127e-02, 5.899e-02, 1.733e-01, 1.657e-01), r);
r = MulAdd(s1_8, M4(3.595e-02, 3.243e-02, 1.450e-01, 2.046e-01, -2.939e-02, -1.306e-02, -1.587e-01, -2.607e-01, -8.980e-02, -5.350e-02, -2.627e-01, -2.861e-01, -1.585e-02, -2.032e-02, -1.662e-02, 1.560e-02), r);
return tanh(r);
}
// Pass 4 entry point. One thread produces four OUTPUT pixels (a 2x2 group):
// it evaluates f0() once from nine l0() taps of t0, then for each of the four
// pixels samples INPUT with the linear sampler, converts to YUV, adds one
// component of the f0() result to the first channel (clamped to [0, 1]) and
// writes the pixel back as RGB with alpha 1.
//   blockStart - added to the per-thread offset to form the output coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the sampling macro behind l0()
	// is expected to reference both of them by name.
	const float2 pt = float2(GetInputPt());
	// First of the four output pixels owned by this thread.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 outExtent = GetOutputSize();
	// Bail out unless that pixel is strictly inside the output extent on both axes.
	if (!(dst.x < outExtent.x && dst.y < outExtent.y)) {
		return;
	}
	// Halving the output coordinate gives the input-resolution texel to sample around.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	// One offset per output pixel: .x, .y, .w, .z are consumed in that order below.
	const V4 delta = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 outPt = float2(GetOutputPt());

	// The sampling position is walked step by step around the 2x2 group
	// (instead of being recomputed) so the arithmetic matches pixel for pixel.

	// Pixel (x, y): shift back by half an output texel from the input texel centre.
	pos -= 0.5f * outPt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Pixel (x + 1, y).
	++dst.x;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Pixel (x + 1, y + 1).
	++dst.y;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Pixel (x, y + 1).
	--dst.x;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}

View file

@ -1,340 +0,0 @@
// CuNNy 2x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N02
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// O(t, p) (previous line): samples mip level 0 of texture t with the
// point-filter sampler SP at pos + p * pt. The macro expands in place, so the
// function using it must have locals named `pos` and `pt` in scope.
//
// V4 / M4 (next two lines): short aliases for MF4 and MF4x4 used by the
// per-pass code. MF4 and MF4x4 are not declared in this file - presumably
// they are provided by the effect framework or StubDefs.hlsli; TODO confirm.
#define V4 MF4
#define M4 MF4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
#define l0(x, y) (dot(MF3(-6.049e-01, -1.145e+00, -2.540e-01), O(INPUT, float2(x, y)).rgb) + MF(1.794e+00))
// l0(x, y) (previous line): samples INPUT through the O() macro with the
// offset float2(x, y), takes the dot product of its rgb with a fixed MF3 and
// adds a constant, giving one scalar per tap.
//
// f0: weighted sum for pass 1 (DESC "in").
//   s0_0..s0_8 - the nine scalar l0() taps at offsets (-1,-1) .. (1,1), as
//                passed by Pass1.
//   Returns r: it starts from the constant vector below, and each tap adds
//   its scalar value times a fixed V4 via mad(), in the order s0_0..s0_8.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
// Initial value of the accumulator.
V4 r = { 4.440e-03, -1.956e-04, 1.215e-03, 1.790e-03 };
r = mad(s0_0, V4(1.411e-01, -9.763e-03, -1.361e-01, -9.610e-04), r);
r = mad(s0_1, V4(6.068e-02, 7.238e-03, -1.182e-01, -1.535e-02), r);
r = mad(s0_2, V4(-8.549e-02, -2.876e-03, -8.740e-03, 1.652e-02), r);
r = mad(s0_3, V4(-3.249e-01, 5.392e-02, -8.518e-02, -7.437e-03), r);
r = mad(s0_4, V4(2.435e-02, -6.191e-01, 7.147e-01, 5.862e-01), r);
r = mad(s0_5, V4(1.968e-01, 1.868e-02, -1.723e-01, -5.801e-01), r);
r = mad(s0_6, V4(1.528e-01, -4.489e-02, 5.871e-03, 4.528e-03), r);
r = mad(s0_7, V4(-4.619e-01, 6.152e-01, -1.313e-01, -5.326e-02), r);
r = mad(s0_8, V4(2.902e-01, -1.801e-02, -6.907e-02, 5.105e-02), r);
return r;
}
// Pass 1 entry point. One thread handles one texel: it gathers nine scalar
// l0() taps of INPUT around that texel and stores f0() of them into t0.
//   blockStart - added to the per-thread offset to form the texel coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro behind l0()
	// references both of them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Bail out unless the texel is strictly inside the input extent on both axes.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const MF tap0 = l0(-1.0, -1.0);
	const MF tap1 = l0(0.0, -1.0);
	const MF tap2 = l0(1.0, -1.0);
	const MF tap3 = l0(-1.0, 0.0);
	const MF tap4 = l0(0.0, 0.0);
	const MF tap5 = l0(1.0, 0.0);
	const MF tap6 = l0(-1.0, 1.0);
	const MF tap7 = l0(0.0, 1.0);
	const MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// l0(x, y) (previous line): fetches texture t0 through the O() macro with the
// offset float2(x, y) and converts the result to V4.
//
// f0: weighted sum for pass 2 (DESC "conv1").
//   s0_0..s0_8, s1_0..s1_8 - two sets of nine V4 inputs. The caller, Pass2,
//     passes max(tap_k, 0) as s0_k and -max(-tap_k, 0) as s1_k, where tap_k
//     are the nine l0() taps at offsets (-1,-1) .. (1,1).
//   Returns r: it starts from the constant vector below and is updated by 18
//   MulAdd calls, in the order s0_0..s0_8 then s1_0..s1_8.
// MulAdd is pulled in by the "//!USE MulAdd" directive; presumably
// MulAdd(v, M, r) yields r plus the product of v with the 4x4 matrix M -
// TODO confirm against its definition.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Initial value of the accumulator.
V4 r = { 3.566e-03, 2.403e-03, -1.451e-03, 4.304e-03 };
// Contributions of the non-negative tap parts (s0_*).
r = MulAdd(s0_0, M4(1.120e-01, 8.150e-03, 7.146e-02, -4.942e-02, 3.623e-01, -1.678e-01, 1.189e-01, 1.372e-01, 1.225e-01, -2.568e-02, 6.959e-02, 1.788e-02, 1.962e-01, -1.870e-01, -6.548e-03, -4.334e-02), r);
r = MulAdd(s0_1, M4(1.805e-01, 4.881e-02, -2.342e-03, 2.035e-02, -2.427e-01, -2.197e-02, -2.036e-02, 3.919e-01, -3.037e-01, 7.047e-02, 3.426e-02, -8.694e-02, 2.144e-01, 1.431e-01, -7.851e-02, 2.247e-01), r);
r = MulAdd(s0_2, M4(6.328e-02, -4.140e-02, 3.362e-02, 5.204e-02, -1.052e-01, 1.698e-01, -2.727e-03, 1.110e-01, 7.156e-02, -1.108e-02, -2.717e-02, 5.680e-02, -6.118e-02, 2.435e-02, 1.743e-02, 8.179e-02), r);
r = MulAdd(s0_3, M4(1.557e-01, 1.189e-01, 8.836e-02, 2.178e-02, -3.954e-01, 2.466e-01, -2.166e-01, -7.051e-02, -2.857e-01, -1.611e-02, -8.667e-02, 1.895e-04, 2.744e-01, 1.499e-01, 8.228e-02, 2.938e-02), r);
r = MulAdd(s0_4, M4(2.441e-01, -3.694e-01, 1.751e-01, 6.833e-01, -1.087e-01, -2.065e-01, -1.557e-01, -6.945e-02, -1.403e-02, 2.171e-02, 3.748e-02, 2.646e-01, -3.718e-01, -1.188e-01, 1.569e-01, 8.554e-02), r);
r = MulAdd(s0_5, M4(-5.069e-02, 2.646e-01, -5.754e-02, -3.545e-01, 1.404e-01, 1.123e-01, 4.577e-02, -1.465e-01, -2.119e-02, -1.115e-02, 1.661e-01, -4.029e-01, -2.123e-01, 2.774e-01, -1.905e-02, -1.093e-02), r);
r = MulAdd(s0_6, M4(2.593e-02, -1.801e-02, 9.053e-02, -2.721e-02, 6.658e-03, 3.802e-02, -3.282e-02, -1.116e-01, 1.201e-01, 2.095e-02, -2.061e-02, 2.498e-03, -1.831e-01, -1.743e-01, 1.062e-01, -6.113e-01), r);
r = MulAdd(s0_7, M4(-1.172e-01, -1.130e-02, -6.727e-02, 7.753e-02, -3.958e-03, -9.790e-02, -1.635e-01, 1.049e-01, 2.862e-01, -2.733e-02, -1.566e-01, -2.900e-01, -1.050e-01, -3.441e-01, -8.690e-02, 8.659e-02), r);
r = MulAdd(s0_8, M4(2.145e-01, 4.613e-02, 1.590e-02, -4.749e-02, 3.291e-01, 1.012e-01, 8.647e-03, -2.282e-01, 2.215e-01, 1.713e-01, 1.414e-01, -3.916e-01, -2.488e-01, 1.458e-01, 2.518e-02, -9.979e-02), r);
// Contributions of the non-positive tap parts (s1_*).
r = MulAdd(s1_0, M4(-2.127e-02, 3.575e-02, 9.372e-02, -2.662e-02, 4.467e-02, 1.304e-02, 3.849e-02, 5.186e-02, 7.417e-02, 3.647e-02, 4.960e-02, -3.988e-02, -3.998e-02, 1.173e-01, 7.752e-03, -2.263e-02), r);
r = MulAdd(s1_1, M4(-1.283e-01, -1.460e-01, 1.963e-02, -1.108e-01, -4.171e-01, 2.397e-01, -5.886e-02, 7.788e-02, -2.820e-02, -1.719e-01, 9.334e-03, -1.255e-01, 1.392e-01, 9.532e-03, -5.163e-02, 8.641e-02), r);
r = MulAdd(s1_2, M4(-1.889e-01, 1.933e-01, 5.574e-02, 6.723e-02, -1.015e-01, -3.316e-01, -1.460e-02, -1.606e-01, 1.052e-01, 1.027e-02, -4.626e-02, 5.368e-02, -9.160e-03, -9.514e-02, 2.577e-02, 7.122e-02), r);
r = MulAdd(s1_3, M4(-1.958e-01, 1.276e-01, 7.303e-02, -1.135e-01, -2.277e-01, 2.017e-01, -5.223e-02, 1.379e-01, -1.737e-01, 4.871e-02, -8.142e-02, 1.392e-01, 8.113e-02, 4.415e-01, -1.174e-01, 1.910e-02), r);
r = MulAdd(s1_4, M4(-3.233e-01, -4.158e-01, 8.391e-02, 2.017e-01, 9.790e-02, -4.865e-02, -2.172e-01, 2.607e-01, -2.458e-01, -4.931e-01, 3.016e-01, 2.198e-01, -7.173e-02, -5.683e-01, -7.447e-02, -1.264e-01), r);
r = MulAdd(s1_5, M4(-4.189e-01, 3.271e-01, 8.844e-02, -5.295e-01, 6.365e-02, -1.513e-01, 1.246e-02, -2.005e-01, 1.764e-01, 5.796e-01, 7.286e-02, -1.428e-01, -1.130e-01, -6.883e-02, -1.303e-02, -1.091e-01), r);
r = MulAdd(s1_6, M4(-6.621e-02, 9.901e-03, 9.472e-02, -3.568e-02, 1.067e-01, -3.318e-02, 3.152e-01, -5.261e-02, 1.108e-01, 7.081e-02, -1.289e-01, 6.477e-03, 1.036e-01, -1.477e-03, 1.035e+00, -9.204e-02), r);
r = MulAdd(s1_7, M4(-2.721e-01, -5.458e-02, -1.707e-01, -1.096e-02, -1.302e-01, -9.074e-02, 1.694e-01, 6.307e-02, 4.233e-01, -5.112e-02, -3.545e-01, -2.589e-01, 8.276e-02, -3.975e-01, 7.705e-02, 4.482e-01), r);
r = MulAdd(s1_8, M4(1.175e-01, 2.212e-03, 5.751e-02, -8.666e-02, 2.532e-01, 1.303e-01, 7.291e-02, -2.126e-01, 4.815e-01, 1.649e-01, -4.748e-02, -3.330e-01, -1.252e-01, -8.987e-03, -4.285e-03, -1.106e-01), r);
return r;
}
// Pass 2 entry point. One thread handles one texel: it gathers nine l0()
// taps of t0 around that texel, splits each tap into a non-negative and a
// non-positive part, and stores f0() of those 18 vectors into t1.
//   blockStart - added to the per-thread offset to form the texel coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro behind l0()
	// references both of them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Bail out unless the texel is strictly inside the input extent on both axes.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// l0(x, y) (previous line): fetches texture t1 through the O() macro with the
// offset float2(x, y) and converts the result to V4.
//
// f0: weighted sum for pass 3 (DESC "conv2").
//   s0_0..s0_8, s1_0..s1_8 - two sets of nine V4 inputs. The caller, Pass3,
//     passes max(tap_k, 0) as s0_k and -max(-tap_k, 0) as s1_k, where tap_k
//     are the nine l0() taps at offsets (-1,-1) .. (1,1).
//   Returns r: it starts from the constant vector below and is updated by 18
//   MulAdd calls, in the order s0_0..s0_8 then s1_0..s1_8.
// MulAdd is pulled in by the "//!USE MulAdd" directive; presumably
// MulAdd(v, M, r) yields r plus the product of v with the 4x4 matrix M -
// TODO confirm against its definition.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Initial value of the accumulator.
V4 r = { 5.508e-03, 4.690e-03, -5.708e-04, -7.674e-03 };
// Contributions of the non-negative tap parts (s0_*).
r = MulAdd(s0_0, M4(-1.173e-02, 2.762e-03, -2.225e-03, -6.814e-03, 8.328e-02, -1.275e-02, 6.091e-02, -6.470e-02, -6.067e-02, -1.086e-01, 7.501e-02, 1.227e-01, -1.551e-02, -1.728e-02, -2.694e-02, 7.490e-02), r);
r = MulAdd(s0_1, M4(5.326e-02, 1.003e-02, 3.989e-02, -1.908e-03, -4.580e-02, -4.303e-03, 4.333e-02, 8.324e-02, 8.170e-01, 8.040e-01, -3.975e-01, -1.034e+00, 1.362e-01, 3.776e-04, -1.102e-02, -5.030e-02), r);
r = MulAdd(s0_2, M4(-6.068e-02, 6.212e-02, -4.979e-02, 9.626e-03, 1.301e-02, -2.045e-02, 1.798e-02, 2.091e-02, -2.290e-01, 3.612e-01, -7.014e-02, 1.669e-01, -5.191e-03, 1.304e-02, 9.444e-05, -2.137e-02), r);
r = MulAdd(s0_3, M4(-3.235e-02, -6.238e-02, 3.894e-02, 5.893e-02, -3.530e-02, -1.063e-01, 8.668e-02, 1.232e-02, -3.851e-02, 2.952e-02, 6.132e-02, -5.755e-02, 8.317e-02, 8.340e-02, -8.227e-02, 6.481e-03), r);
r = MulAdd(s0_4, M4(2.118e-02, 2.725e-01, -1.393e-01, -2.377e-01, 4.872e-01, 2.235e-01, -1.746e-02, -3.662e-01, -3.945e-01, -1.862e-01, -9.132e-02, 8.777e-02, -5.084e-01, -3.300e-01, -3.443e-02, 4.203e-01), r);
r = MulAdd(s0_5, M4(1.165e-01, -1.743e-01, 4.169e-03, -1.518e-01, 1.174e-01, -3.314e-02, 2.295e-02, -9.160e-02, -1.854e-01, -6.999e-02, -6.985e-02, 4.875e-04, -1.147e-01, 1.722e-01, -2.588e-02, 1.185e-01), r);
r = MulAdd(s0_6, M4(-8.881e-03, 1.907e-03, 9.002e-03, 8.085e-03, -8.728e-03, -1.074e-01, 7.035e-02, 6.519e-02, 4.323e-02, -4.675e-02, 4.382e-02, 1.091e-02, 3.357e-02, 4.384e-02, -8.031e-03, -1.945e-02), r);
r = MulAdd(s0_7, M4(-7.981e-02, 1.492e-02, -9.399e-02, -3.750e-02, -1.274e-01, -3.235e-02, -3.169e-02, 6.420e-02, 4.304e-02, 9.302e-02, 1.250e-02, 3.906e-03, 1.752e-01, -1.211e-02, 9.058e-02, -6.273e-02), r);
r = MulAdd(s0_8, M4(-1.290e-02, -4.309e-02, 3.384e-02, 3.819e-02, -3.309e-02, 3.986e-02, 3.783e-03, 5.361e-02, 5.473e-02, 1.574e-02, -2.385e-02, -7.630e-02, -1.778e-02, 1.375e-02, -2.936e-02, -1.778e-02), r);
// Contributions of the non-positive tap parts (s1_*).
r = MulAdd(s1_0, M4(1.219e-01, 1.166e-02, -5.932e-02, 1.191e-02, -2.487e-03, -5.945e-02, 6.637e-02, 5.775e-02, -1.705e-02, 5.538e-02, -5.130e-02, -3.602e-02, 5.461e-02, -1.253e-01, 6.953e-02, 1.066e-01), r);
r = MulAdd(s1_1, M4(6.504e-01, -9.638e-01, 1.371e+00, 5.682e-02, 1.583e-02, -2.371e-02, 5.201e-02, 3.845e-02, 3.478e-02, -1.477e-01, 1.763e-01, 5.129e-02, 2.992e-01, -3.335e-01, 2.490e-02, 4.873e-01), r);
r = MulAdd(s1_2, M4(2.415e-02, 8.838e-02, -1.519e-01, 9.012e-02, -6.676e-02, 3.422e-02, -2.380e-02, 5.608e-02, -1.744e-01, -9.595e-02, -7.627e-02, -5.823e-02, -9.466e-02, 5.554e-02, -1.024e-01, -1.763e-01), r);
r = MulAdd(s1_3, M4(8.380e-02, -7.972e-02, 8.813e-02, 3.371e-02, 5.392e-03, 4.385e-02, 1.207e-02, -5.728e-02, -3.427e-03, -2.027e-03, 1.211e-03, -7.897e-03, 3.360e-02, 4.603e-02, -1.240e-02, -2.219e-02), r);
r = MulAdd(s1_4, M4(-6.699e-01, -3.512e-01, -2.153e-01, 3.218e-01, -5.100e-01, 4.324e-03, 2.713e-01, -2.073e-01, 1.547e-01, -2.123e-03, 7.928e-02, -5.698e-02, 2.450e-02, -4.866e-02, 9.436e-02, 7.900e-02), r);
r = MulAdd(s1_5, M4(1.609e-01, -7.910e-02, 1.112e-01, -2.959e-02, -3.877e-01, -2.803e-01, -1.071e-01, -6.881e-03, 1.922e-02, 2.433e-02, -3.581e-02, -5.264e-02, -3.287e-01, -1.037e-02, -6.159e-02, 8.219e-02), r);
r = MulAdd(s1_6, M4(-4.263e-02, -6.372e-02, 2.607e-02, 5.285e-02, -6.156e-02, -7.837e-02, 7.299e-03, 8.959e-02, -8.706e-03, -1.642e-02, 1.825e-02, 1.850e-02, 2.735e-02, 2.413e-02, -3.236e-02, -9.612e-03), r);
r = MulAdd(s1_7, M4(-5.849e-02, 1.530e-01, -6.767e-02, -1.392e-02, -3.430e-01, -1.851e-01, -1.013e-01, 2.465e-01, -1.715e-02, 4.970e-03, -1.850e-02, -4.214e-03, 1.889e-02, -5.787e-02, 7.154e-02, 9.237e-02), r);
r = MulAdd(s1_8, M4(-2.084e-02, -2.484e-01, 5.767e-02, -2.550e-02, -9.126e-02, 4.292e-01, 1.983e-02, 2.979e-01, -3.807e-03, -3.367e-03, 1.835e-03, 8.694e-03, -9.074e-02, 4.820e-02, -2.886e-02, 5.975e-02), r);
return r;
}
// Pass 3 entry point. One thread handles one texel: it gathers nine l0()
// taps of t1 around that texel, splits each tap into a non-negative and a
// non-positive part, and stores f0() of those 18 vectors into t0.
//   blockStart - added to the per-thread offset to form the texel coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro behind l0()
	// references both of them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Bail out unless the texel is strictly inside the input extent on both axes.
	if (!(texel.x < extent.x && texel.y < extent.y)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 4
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
#define l0(x, y) V4(O(t0, float2(x, y)))
// l0(x, y) (previous line): fetches texture t0 through the O() macro with the
// offset float2(x, y) and converts the result to V4.
//
// f0: weighted sum for pass 4 (DESC "out-shuffle").
//   s0_0..s0_8, s1_0..s1_8 - two sets of nine V4 inputs. The caller, Pass4,
//     passes max(tap_k, 0) as s0_k and -max(-tap_k, 0) as s1_k, where tap_k
//     are the nine l0() taps at offsets (-1,-1) .. (1,1).
//   Returns tanh(r), where r starts from the constant vector below and is
//   updated by 18 MulAdd calls, in the order s0_0..s0_8 then s1_0..s1_8.
//   Pass4 adds the four components of the result to the first YUV channel of
//   the four output pixels it writes.
// MulAdd is pulled in by the "//!USE MulAdd" directive; presumably
// MulAdd(v, M, r) yields r plus the product of v with the 4x4 matrix M -
// TODO confirm against its definition.
// NOTE(review): the literals look machine-generated (trained weights); avoid
// hand edits and keep the accumulation order as is.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Initial value of the accumulator.
V4 r = { -1.734e-03, -1.825e-03, -1.635e-03, -1.665e-03 };
// Contributions of the non-negative tap parts (s0_*).
r = MulAdd(s0_0, M4(-1.841e-04, -5.677e-02, 9.249e-03, -8.726e-03, 4.041e-02, -1.295e-01, 1.154e-01, 2.765e-02, 1.833e-01, -8.427e-02, 1.078e-01, -1.432e-01, 1.068e-01, -1.222e-01, 2.535e-02, 5.316e-02), r);
r = MulAdd(s0_1, M4(-3.609e-03, 5.812e-02, -4.650e-02, -2.093e-02, -3.442e-02, 7.643e-02, 1.424e-02, 7.195e-02, 1.552e-01, -8.291e-01, 1.547e-01, 4.354e-01, -2.851e-02, 1.023e-01, -8.481e-03, -6.567e-02), r);
r = MulAdd(s0_2, M4(1.724e-02, -1.165e-02, 1.007e-02, -3.008e-02, -9.814e-04, -2.007e-02, -5.905e-03, 6.714e-03, -1.736e-01, 2.035e-01, -1.333e-01, 1.250e-01, -9.118e-03, -4.989e-02, 2.142e-02, -4.038e-03), r);
r = MulAdd(s0_3, M4(7.885e-02, -8.350e-02, -6.025e-03, -1.139e-01, -8.380e-02, -6.836e-02, -5.589e-01, -4.614e-01, -6.742e-01, 2.118e-01, -4.442e-01, 2.197e-01, -5.873e-02, 1.902e-01, -4.687e-01, -4.712e-01), r);
r = MulAdd(s0_4, M4(-4.506e-01, 2.396e-01, -1.350e-02, 4.072e-01, 3.249e-01, 9.930e-02, 1.576e-02, -2.456e-01, 1.506e+00, 6.047e-02, 8.841e-01, -1.927e+00, -4.337e-01, -5.801e-01, 3.334e-01, 8.276e-02), r);
r = MulAdd(s0_5, M4(5.049e-02, -1.870e-01, 7.413e-02, -2.569e-02, -2.152e-02, 1.139e-01, -3.874e-02, 1.634e-02, -1.325e-01, 4.002e-02, -1.874e-01, 1.204e-01, 2.267e-02, 1.380e-02, -1.055e-02, 5.504e-02), r);
r = MulAdd(s0_6, M4(-2.855e-02, 1.255e-02, 3.941e-02, 4.466e-03, 4.814e-05, -9.003e-03, 1.231e-01, 5.676e-02, 5.020e-02, -5.407e-02, -1.951e-01, 4.240e-02, 3.525e-02, -1.021e-01, 4.517e-01, 2.399e-01), r);
r = MulAdd(s0_7, M4(-5.781e-02, -4.964e-02, -3.981e-01, -1.716e-01, 3.430e-02, -1.644e-02, 2.352e-01, 1.938e-01, 1.266e-01, -1.061e-01, 7.754e-01, 5.337e-01, 2.664e-01, 3.669e-01, -1.113e+00, -1.742e-01), r);
r = MulAdd(s0_8, M4(2.948e-02, 3.723e-02, 2.739e-02, -5.215e-02, -1.542e-02, -2.173e-02, -1.944e-02, 1.856e-02, -4.535e-02, 1.163e-02, -5.014e-02, 8.660e-02, 1.421e-01, 2.314e-01, 1.171e-02, -4.975e-01), r);
// Contributions of the non-positive tap parts (s1_*).
r = MulAdd(s1_0, M4(-4.408e-02, -3.573e-02, 3.842e-02, 2.571e-02, 2.872e-01, -4.960e-01, 2.569e-01, -6.254e-02, 2.158e-02, -6.452e-02, 7.495e-02, 1.997e-02, 4.094e-02, -9.741e-02, 3.542e-02, -8.115e-03), r);
r = MulAdd(s1_1, M4(3.480e-02, 1.949e-04, 1.780e-02, 4.483e-02, -2.814e-01, 4.229e-01, -5.482e-02, 1.512e-02, -3.120e-02, 3.945e-02, 4.626e-02, 7.013e-02, -6.686e-03, 5.832e-02, -4.408e-02, -1.262e-02), r);
r = MulAdd(s1_2, M4(-9.847e-03, 1.973e-03, 1.457e-02, 2.290e-02, 4.741e-02, 2.270e-02, 8.902e-04, 1.152e-02, -2.473e-02, -1.948e-02, -3.475e-03, 4.431e-02, 2.044e-02, 1.571e-04, 9.470e-03, -2.825e-02), r);
r = MulAdd(s1_3, M4(5.918e-02, -1.939e-02, -4.628e-02, -7.774e-02, -3.040e-01, 8.634e-02, -5.254e-01, -6.906e-01, -1.218e-01, -6.178e-02, -3.115e-01, -2.697e-01, -2.402e-02, -2.149e-02, -3.878e-01, -3.453e-01), r);
r = MulAdd(s1_4, M4(2.920e-01, 3.711e-01, -2.753e-01, -4.654e-02, 1.379e-01, 3.908e-01, -4.798e-01, 6.668e-01, 4.870e-01, -1.634e-01, -7.790e-02, -2.683e-01, -4.834e-01, -1.822e-02, -8.492e-03, 7.620e-02), r);
r = MulAdd(s1_5, M4(-4.786e-02, 2.412e-02, 4.992e-02, -1.913e-01, 9.058e-02, -4.485e-02, 8.249e-02, -9.418e-02, 3.555e-02, 3.543e-01, -1.140e-01, -1.358e-01, 5.079e-02, -2.007e-01, 6.132e-02, -2.373e-03), r);
r = MulAdd(s1_6, M4(6.553e-03, -7.804e-03, 8.569e-02, 4.875e-02, 5.085e-02, 1.728e-02, 6.949e-02, 1.313e-01, 1.825e-02, -5.557e-02, -7.548e-03, -5.534e-02, 7.059e-02, 4.382e-02, 2.807e-01, 1.919e-01), r);
r = MulAdd(s1_7, M4(-1.071e-01, -3.709e-02, -4.757e-01, -1.943e-01, 8.182e-02, -3.334e-02, 4.170e-01, 6.716e-02, 1.563e-01, 1.382e-01, 7.441e-01, 4.082e-01, -9.101e-02, -3.943e-02, -5.142e-01, -1.910e-01), r);
r = MulAdd(s1_8, M4(4.255e-03, 4.204e-02, 5.834e-02, -6.508e-02, -3.675e-02, 1.165e-02, -2.694e-02, -2.212e-02, -3.036e-02, -4.393e-02, 1.855e-03, 1.909e-01, 3.812e-02, 3.309e-02, 3.942e-02, -7.422e-02), r);
return tanh(r);
}
// Pass 4 entry point. One thread produces four OUTPUT pixels (a 2x2 group):
// it evaluates f0() once from nine l0() taps of t0, then for each of the four
// pixels samples INPUT with the linear sampler SL, converts to YUV, adds one
// component of the f0() result to the first channel (clamped to [0, 1]) and
// writes the pixel back as RGB with alpha 1.
//   blockStart - added to the per-thread offset to form the output coordinate
//                (presumably the origin of this thread group's block).
//   tid        - thread id; only tid.x is read and it is passed to Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro behind l0()
	// references both of them by name.
	const float2 pt = float2(GetInputPt());
	// First of the four output pixels owned by this thread.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 outExtent = GetOutputSize();
	// Bail out unless that pixel is strictly inside the output extent on both axes.
	if (!(dst.x < outExtent.x && dst.y < outExtent.y)) {
		return;
	}
	// Halving the output coordinate gives the input-resolution texel to sample around.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// The nine l0() taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of each tap, max(x, 0); these become f0's s0_* inputs.
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive part of each tap, -max(-x, 0); these become f0's s1_* inputs.
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	// One offset per output pixel: .x, .y, .w, .z are consumed in that order below.
	const V4 delta = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
	const float2 outPt = float2(GetOutputPt());

	// The sampling position is walked step by step around the 2x2 group
	// (instead of being recomputed) so the arithmetic matches pixel for pixel.

	// Pixel (x, y): shift back by half an output texel from the input texel centre.
	pos -= 0.5f * outPt;
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Pixel (x + 1, y).
	++dst.x;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Pixel (x + 1, y + 1).
	++dst.y;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Pixel (x, y + 1).
	--dst.x;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}

View file

@ -1,412 +0,0 @@
// CuNNy 3x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N03
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
// Source image. It is listed as an input of passes 1 and 5 (see their //!IN lines).
//!TEXTURE
Texture2D INPUT;
// Final image: twice the width and twice the height of INPUT.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// Point-filtered sampler; used by the O() macro below.
//!SAMPLER
//!FILTER POINT
SamplerState SP;
// Linear-filtered sampler; used by pass 5 when it re-reads INPUT.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): samples texture `t` with SP at mip level 0, at `pos + p * pt`.
// `pos` and `pt` are NOT macro parameters: every function that uses O() (or the
// per-pass l0() wrappers) must have locals with exactly these names in scope.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the 4-component vector and 4x4 matrix types used by the weight tables.
// MF4 / MF4x4 are not defined in this file (presumably they come from StubDefs.hlsli
// or the host, and select FP16 when available -- confirm).
#define V4 MF4
#define M4 MF4x4
// Intermediate textures, same size as INPUT, signed-normalized 8-bit RGBA.
// The passes alternate between them: each pass reads one and writes the other
// (see the //!IN and //!OUT lines of each pass).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): point-samples INPUT at the offset (x, y) * pt from `pos` (via O) and
// reduces its RGB to one scalar: a fixed 3-component dot product plus a constant.
// f0: pass-1 weighted sum. Starts from a constant 4-component vector and adds, for
// each of the nine scalar inputs s0_0..s0_8 (the 3x3 neighbourhood gathered by Pass1),
// that scalar times a fixed 4-component weight vector. Returns the accumulated vector.
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) (dot(MF3(-2.683e-01, -5.217e-01, -1.382e-01), O(INPUT, float2(x, y)).rgb) + MF(7.973e-01))
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
V4 r = { 3.156e-02, 7.379e-02, 1.078e-02, -5.510e-04 };
r = mad(s0_0, V4(1.850e-01, -2.860e-02, -5.321e-01, 2.390e-03), r);
r = mad(s0_1, V4(-4.299e-01, -2.946e-02, -1.180e-01, -5.652e-02), r);
r = mad(s0_2, V4(-4.798e-01, -2.276e-02, 3.201e-02, 4.870e-02), r);
r = mad(s0_3, V4(2.783e-01, -2.262e-03, -1.864e-01, 1.793e-01), r);
r = mad(s0_4, V4(9.435e-04, 8.115e-01, 7.806e-01, -7.793e-01), r);
r = mad(s0_5, V4(2.180e-01, -2.564e-05, 2.774e-03, -7.015e-02), r);
r = mad(s0_6, V4(1.479e-03, -4.675e-02, 3.323e-02, 3.392e-01), r);
r = mad(s0_7, V4(1.203e-01, 1.509e-02, 5.239e-02, 3.194e-01), r);
r = mad(s0_8, V4(7.680e-02, -4.310e-02, -7.203e-02, 1.255e-02), r);
return r;
}
// Pass 1 ("in"): one thread per INPUT-sized texel, writing t0.
// Gathers the 3x3 neighbourhood around the texel through l0() (one scalar per tap)
// and stores the 4-component result of f0().
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const MF tap0 = l0(-1.0, -1.0);
	const MF tap1 = l0(0.0, -1.0);
	const MF tap2 = l0(1.0, -1.0);
	const MF tap3 = l0(-1.0, 0.0);
	const MF tap4 = l0(0.0, 0.0);
	const MF tap5 = l0(1.0, 0.0);
	const MF tap6 = l0(-1.0, 1.0);
	const MF tap7 = l0(0.0, 1.0);
	const MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at the offset (x, y) * pt from `pos` (via O) and converts it to V4.
// f0: pass-2 weighted sum. Starts from a constant 4-component vector and folds in
// eighteen MulAdd terms, one 4x4 weight matrix per input. Per Pass2, s0_0..s0_8 are
// the 3x3 taps clamped to >= 0 and s1_0..s1_8 are the same taps clamped to <= 0.
// MulAdd is pulled in by `//!USE MulAdd`; its definition is not visible in this file
// (presumably r + mul(s, M) -- confirm).
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) V4(O(t0, float2(x, y)))
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { 1.427e-02, -1.982e-02, 4.114e-03, -2.883e-02 };
r = MulAdd(s0_0, M4(1.949e-01, -1.247e-01, -7.307e-02, 8.783e-02, -4.773e-02, 6.012e-02, 8.043e-02, -8.489e-02, 6.760e-02, -7.809e-02, -4.745e-02, -1.304e-02, -1.402e-01, -1.248e-01, 3.334e-01, -1.498e-01), r);
r = MulAdd(s0_1, M4(7.053e-02, 9.895e-02, 1.655e-01, 2.251e-01, 3.511e-02, -1.010e-01, -2.736e-01, 1.174e-01, -2.551e-01, 1.100e-01, 1.518e-01, -4.343e-02, -9.293e-01, 5.327e-01, -2.723e-01, 4.006e-01), r);
r = MulAdd(s0_2, M4(-2.390e-02, 8.154e-03, -2.332e-02, -3.708e-02, 2.814e-02, 5.506e-02, -2.627e-01, -8.081e-02, -1.062e-01, -6.819e-02, -9.498e-02, -2.749e-01, -2.457e-01, 6.868e-01, 6.527e-03, 7.676e-01), r);
r = MulAdd(s0_3, M4(2.704e-01, 4.055e-02, -4.756e-01, 2.506e-01, -9.498e-02, 5.838e-02, 1.733e-01, 3.420e-03, -7.051e-02, -8.233e-02, -3.006e-01, 6.824e-02, -1.308e-01, 1.196e-01, 2.560e-01, 8.304e-02), r);
r = MulAdd(s0_4, M4(4.190e-01, -1.207e-01, 2.708e-01, -6.375e-01, 1.740e-01, 1.955e-03, -1.816e-01, -7.933e-02, -9.308e-01, 1.333e-01, -1.335e-01, -1.401e-01, 3.447e-01, 3.389e-01, 6.660e-01, -3.387e-01), r);
r = MulAdd(s0_5, M4(7.310e-02, 1.403e-02, 8.114e-02, 7.400e-02, -2.552e-02, -1.607e-01, -1.208e-01, -3.943e-02, -2.743e-02, -7.229e-03, -1.749e-03, 3.062e-01, 1.429e-01, 8.105e-01, 3.562e-01, 4.580e-01), r);
r = MulAdd(s0_6, M4(2.115e-01, -1.686e-01, -1.948e-01, -1.191e-01, -5.798e-02, 3.493e-02, 8.264e-02, 1.579e-01, -1.081e-01, -1.775e-01, -8.196e-02, -2.085e-01, 6.791e-02, 1.652e-02, -4.933e-03, 2.833e-02), r);
r = MulAdd(s0_7, M4(-2.160e-01, -3.858e-01, -8.407e-01, -1.091e-01, 8.415e-03, 8.626e-02, 2.340e-01, 9.177e-02, -4.697e-01, -6.623e-02, -5.176e-01, 6.762e-02, -3.437e-03, 6.570e-02, 7.630e-02, 8.988e-02), r);
r = MulAdd(s0_8, M4(6.527e-02, -6.320e-02, 1.192e-02, -1.196e-01, -1.605e-02, -9.294e-03, 1.955e-01, -2.356e-02, -3.582e-02, 1.377e-02, 9.253e-02, -2.362e-02, 3.578e-02, 1.822e-01, 3.329e-01, 1.489e-01), r);
r = MulAdd(s1_0, M4(1.154e-01, -1.822e-01, -2.122e-01, 3.031e-02, 6.550e-01, -4.855e-02, 6.554e-02, 4.432e-02, 1.671e-02, -4.477e-02, -9.428e-03, 4.413e-03, -3.185e-02, -1.529e-01, -1.222e-01, 6.523e-02), r);
r = MulAdd(s1_1, M4(-4.920e-02, -1.697e-02, 4.141e-02, 1.997e-01, 6.972e-01, -5.157e-01, 2.031e-01, 2.829e-02, -5.005e-02, 2.335e-01, 2.985e-01, 6.871e-02, -5.232e-01, 2.146e-02, -1.418e+00, 2.193e-01), r);
r = MulAdd(s1_2, M4(-6.472e-02, 2.595e-02, -2.610e-02, -2.279e-02, 4.165e-01, -7.745e-01, 1.261e-01, -3.845e-01, 3.279e-02, 2.445e-02, 1.796e-01, -2.581e-01, -3.838e-01, 6.280e-02, -4.893e-01, -1.475e-01), r);
r = MulAdd(s1_3, M4(9.330e-02, 1.742e-01, -1.685e-01, 2.376e-02, -9.586e-01, -1.236e+00, -7.271e-01, -7.674e-01, 2.500e-01, -3.709e-02, -1.303e-01, 1.490e-01, -2.746e-01, -1.376e-01, -2.321e-02, -1.967e-02), r);
r = MulAdd(s1_4, M4(3.660e-01, 4.772e-02, 5.524e-01, -2.804e-01, -2.756e+00, -1.336e+00, 2.038e-01, 2.593e+00, 2.156e-01, 3.281e-01, 3.152e-01, 8.064e-01, 3.970e-01, -1.379e-01, -7.518e-02, -2.723e-01), r);
r = MulAdd(s1_5, M4(5.214e-03, 1.695e-02, 1.024e-01, 1.333e-01, -2.250e-01, -1.298e+00, 4.673e-01, 1.317e+00, 3.036e-01, -1.273e-01, 2.900e-01, 2.249e-02, -1.870e-01, -1.124e-01, -5.879e-01, 6.314e-02), r);
r = MulAdd(s1_6, M4(-8.225e-02, -1.149e-01, 1.598e-04, -3.662e-01, -8.572e-02, -8.909e-01, 9.891e-02, 1.818e-01, 1.715e-01, -2.348e-01, 1.178e-01, -6.289e-02, 1.522e-02, 1.973e-02, 3.707e-02, 2.911e-02), r);
r = MulAdd(s1_7, M4(-6.380e-02, 8.661e-02, -2.666e-01, 9.586e-02, -1.257e+00, -2.231e+00, -1.232e+00, 5.642e-01, 5.730e-02, -3.294e-01, -1.151e-01, 2.382e-01, 4.529e-02, 4.927e-02, 9.893e-02, 8.365e-02), r);
r = MulAdd(s1_8, M4(1.906e-02, -8.920e-02, 8.931e-02, -6.752e-02, -3.680e-01, -1.282e+00, -1.388e-01, -7.545e-02, 6.262e-02, -1.695e-01, 2.278e-01, -3.066e-01, -7.412e-02, 1.145e-02, 4.667e-02, -4.205e-04), r);
return r;
}
// Pass 2 ("conv1"): one thread per INPUT-sized texel, reading t0 and writing t1.
// Gathers the 3x3 neighbourhood around the texel, splits every tap into its
// non-negative part max(v, 0) and its non-positive part -max(-v, 0), and stores
// the result of f0() over those eighteen vectors.
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts. Last nine: non-positive parts.
	t1[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-samples t1 at the offset (x, y) * pt from `pos` (via O) and converts it to V4.
// f0: pass-3 weighted sum. Starts from a constant 4-component vector and folds in
// eighteen MulAdd terms, one 4x4 weight matrix per input. Per Pass3, s0_0..s0_8 are
// the 3x3 taps clamped to >= 0 and s1_0..s1_8 are the same taps clamped to <= 0.
// MulAdd is pulled in by `//!USE MulAdd`; its definition is not visible in this file
// (presumably r + mul(s, M) -- confirm).
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) V4(O(t1, float2(x, y)))
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { -1.292e-02, 8.156e-04, -2.055e-03, -3.100e-03 };
r = MulAdd(s0_0, M4(2.965e-01, -1.919e-01, 9.202e-02, 8.775e-03, -4.948e-02, 1.061e-01, -3.754e-02, -1.900e-01, -2.114e-01, 1.267e-01, 1.989e-02, 2.570e-02, 4.634e-03, -2.718e-01, 2.171e-01, 1.512e-01), r);
r = MulAdd(s0_1, M4(-5.527e-01, -4.825e-01, 4.325e-01, 4.447e-01, -6.577e-02, 5.161e-01, 3.286e-02, -3.800e-01, 2.625e-02, 3.835e-01, -7.794e-02, -5.489e-02, -2.647e-01, -4.952e-01, 1.587e-01, 1.471e-01), r);
r = MulAdd(s0_2, M4(-3.687e-01, -1.096e-01, 1.849e-01, -6.915e-02, 2.257e-01, 2.760e-01, -8.875e-02, -8.871e-02, -8.394e-02, -6.714e-02, 5.322e-03, -3.252e-01, -7.885e-02, -2.723e-01, 6.149e-02, 2.998e-01), r);
r = MulAdd(s0_3, M4(1.606e-01, -1.199e-01, 3.573e-01, 2.833e-02, 6.514e-03, -2.242e-02, -6.231e-02, 6.702e-02, -8.717e-02, -2.227e-01, -1.626e-01, 5.313e-02, -1.411e-01, -2.445e-02, 1.194e-01, -1.101e-01), r);
r = MulAdd(s0_4, M4(-1.127e+00, 1.823e-01, 1.358e-01, -1.618e-01, -4.171e-04, -7.771e-02, 2.147e-01, 6.493e-01, 4.989e-01, 3.955e-01, -1.017e-01, -2.861e-01, 3.878e-01, -6.653e-01, -4.968e-01, -5.063e-01), r);
r = MulAdd(s0_5, M4(-2.270e-01, -3.965e-01, -2.794e-02, 1.487e-01, -2.667e-01, -1.410e-02, 1.475e-01, -4.992e-01, -1.071e-01, 2.096e-01, 1.159e-01, -6.073e-02, -7.157e-02, -2.446e-01, -4.807e-02, 1.968e-01), r);
r = MulAdd(s0_6, M4(8.199e-02, 8.336e-02, -3.090e-02, -1.287e-02, -6.954e-02, -7.544e-02, 1.272e-01, 7.930e-02, -3.647e-02, -2.685e-02, -4.235e-02, 3.214e-02, -4.526e-02, 1.479e-01, -4.963e-02, -3.035e-02), r);
r = MulAdd(s0_7, M4(-2.012e-02, -1.497e-02, -2.952e-01, -6.026e-02, 2.135e-03, 2.979e-02, -2.713e-02, 7.951e-03, -8.069e-02, -2.374e-01, 1.865e-01, 1.048e-01, -9.076e-02, 6.683e-02, 9.576e-02, -2.432e-02), r);
r = MulAdd(s0_8, M4(1.455e-01, 2.613e-01, -1.616e-01, -3.564e-01, 1.229e-01, -3.778e-02, 3.316e-02, 5.927e-02, -1.831e-01, -1.388e-01, 5.986e-02, 2.083e-02, -1.368e-03, 2.394e-01, -1.623e-01, -2.768e-02), r);
r = MulAdd(s1_0, M4(7.711e-03, -6.696e-04, -3.229e-02, 1.549e-02, -1.596e-01, 2.068e-01, -6.162e-02, -9.571e-02, -1.500e-01, 1.743e-01, 2.746e-02, -5.845e-02, -7.649e-03, -4.265e-03, 4.154e-03, 3.950e-03), r);
r = MulAdd(s1_1, M4(2.764e-01, -4.505e-02, 4.280e-02, 6.044e-02, 3.396e-02, 2.750e-01, -1.910e-01, -2.153e-01, 9.633e-02, -2.194e-02, -2.131e-01, -1.181e-01, -1.343e-01, 6.123e-02, 1.904e-02, -6.568e-02), r);
r = MulAdd(s1_2, M4(-3.643e-01, -1.709e-02, 1.528e-01, -1.405e-01, 3.307e-01, -1.979e-03, -1.819e-01, 7.635e-02, 1.266e-01, 2.162e-01, -7.492e-02, -9.075e-02, 4.120e-02, 1.521e-01, -2.790e-03, -4.330e-02), r);
r = MulAdd(s1_3, M4(1.913e-02, -5.373e-02, 5.748e-02, -1.443e-02, -2.776e-01, -1.162e-01, -1.994e-01, 1.430e-01, 9.058e-02, -3.720e-02, -3.585e-02, -8.516e-02, -2.228e-02, 7.507e-02, -9.620e-02, -1.013e-01), r);
r = MulAdd(s1_4, M4(-3.592e-01, 1.415e-01, 1.018e+00, -1.555e-01, 5.378e-01, 8.818e-02, 2.190e-01, 1.997e-01, -1.128e-01, 3.331e-02, -1.410e-01, 2.844e-01, 4.756e-01, -5.850e-02, -3.757e-01, -1.716e-01), r);
r = MulAdd(s1_5, M4(2.636e-02, -3.596e-01, -3.280e-01, 2.027e-01, 3.000e-01, -2.297e-01, 4.282e-02, 1.776e-01, 5.222e-02, 1.751e-01, 4.529e-02, -8.347e-02, -3.409e-01, -2.640e-01, 1.753e-01, -5.672e-01), r);
r = MulAdd(s1_6, M4(-1.699e-02, 4.941e-02, -2.642e-02, -1.406e-04, -1.655e-01, -1.464e-02, -4.353e-02, 1.946e-01, 6.067e-02, -1.429e-01, 1.170e-01, -4.644e-02, -6.567e-02, -2.264e-02, 6.666e-02, 9.009e-02), r);
r = MulAdd(s1_7, M4(7.805e-02, 2.173e-02, -3.276e-01, 2.004e-03, -7.789e-02, -1.466e-02, -1.560e-01, -1.126e-01, -3.823e-02, -2.446e-03, 1.465e-01, -2.744e-01, -2.129e-01, -2.141e-02, 4.456e-01, 1.240e-01), r);
r = MulAdd(s1_8, M4(1.315e-02, 2.686e-01, -1.987e-01, -2.093e-01, 3.184e-02, -8.723e-02, 3.012e-01, 3.580e-01, 1.198e-02, -2.655e-01, 1.455e-01, 7.602e-02, -4.605e-02, 3.276e-01, -2.036e-01, -2.590e-01), r);
return r;
}
// Pass 3 ("conv2"): one thread per INPUT-sized texel, reading t1 and writing t0.
// Gathers the 3x3 neighbourhood around the texel, splits every tap into its
// non-negative part max(v, 0) and its non-positive part -max(-v, 0), and stores
// the result of f0() over those eighteen vectors.
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts. Last nine: non-positive parts.
	t0[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at the offset (x, y) * pt from `pos` (via O) and converts it to V4.
// f0: pass-4 weighted sum. Starts from a constant 4-component vector and folds in
// eighteen MulAdd terms, one 4x4 weight matrix per input. Per Pass4, s0_0..s0_8 are
// the 3x3 taps clamped to >= 0 and s1_0..s1_8 are the same taps clamped to <= 0.
// MulAdd is pulled in by `//!USE MulAdd`; its definition is not visible in this file
// (presumably r + mul(s, M) -- confirm).
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) V4(O(t0, float2(x, y)))
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { 3.045e-03, 3.707e-03, -6.011e-03, -5.162e-03 };
r = MulAdd(s0_0, M4(2.151e-02, -4.754e-02, 3.454e-02, -1.338e-03, -4.337e-02, 4.608e-02, -1.116e-01, -2.296e-02, -2.839e-02, -3.878e-01, -2.317e-02, 5.774e-02, 4.317e-03, 6.680e-02, 6.325e-02, -1.449e-01), r);
r = MulAdd(s0_1, M4(-1.173e-01, -8.942e-02, -1.017e-01, 6.496e-02, 5.558e-02, 2.788e-02, 2.184e-02, -2.837e-03, -1.057e-01, -2.075e-01, -3.255e-02, -1.297e-02, -2.643e-02, -1.695e-02, -9.425e-02, 3.942e-02), r);
r = MulAdd(s0_2, M4(-1.773e-02, -4.118e-02, -2.141e-02, 4.282e-02, 4.234e-02, -1.221e-02, -3.375e-03, 4.469e-02, -2.586e-01, -1.112e-01, -7.688e-02, 3.426e-02, 8.170e-02, -2.355e-02, -3.737e-02, 3.004e-02), r);
r = MulAdd(s0_3, M4(2.192e-01, 1.955e+00, 2.012e-01, -2.598e-02, -7.453e-02, 5.510e-02, -1.517e-01, -2.571e-01, -2.182e-02, -2.345e-02, -5.767e-02, -5.534e-02, -1.996e-02, 2.329e-01, 4.447e-04, -1.111e-01), r);
r = MulAdd(s0_4, M4(3.476e-01, -4.368e-01, -1.180e-01, 5.371e-01, 5.294e-01, 1.509e-01, 2.456e-01, -7.875e-02, 2.055e-01, 9.732e-02, 1.285e-01, 5.178e-01, 3.256e-01, -2.842e-01, 4.421e-02, 3.426e-01), r);
r = MulAdd(s0_5, M4(6.119e-01, -1.393e-01, -1.144e-02, 2.438e-01, -5.126e-02, -1.049e-01, -7.847e-02, 9.942e-02, 5.371e-01, 9.985e-02, 9.193e-02, -3.067e-02, -1.962e-01, -4.272e-02, -7.821e-03, 2.557e-02), r);
r = MulAdd(s0_6, M4(1.224e-02, -5.098e-01, 3.052e-01, 5.332e-01, 2.249e-01, 4.201e-02, 5.423e-01, 1.106e-01, -1.056e-02, -4.091e-03, -1.267e-02, -5.280e-02, 1.898e-02, 9.430e-03, 1.470e-02, 7.235e-02), r);
r = MulAdd(s0_7, M4(-4.342e-01, 2.385e-01, -3.834e-02, -7.654e-02, -9.043e-01, -3.139e-01, -1.511e-01, 3.800e-01, -8.848e-02, -3.911e-02, -7.025e-03, -1.196e-02, -3.322e-03, -1.455e-01, 2.084e-02, 1.106e-01), r);
r = MulAdd(s0_8, M4(1.382e-01, -1.894e-01, -8.814e-02, 1.373e-01, 1.362e-01, -1.298e-01, -1.007e-01, 1.166e-01, -1.553e-02, 8.530e-02, 2.744e-02, -1.083e-01, -5.606e-02, 5.965e-02, 1.406e-02, -4.496e-02), r);
r = MulAdd(s1_0, M4(-4.828e-03, -1.035e-01, -5.021e-02, 1.972e-02, -9.942e-03, -3.057e-01, -7.373e-03, 4.274e-02, -3.475e-03, 4.653e-02, 9.115e-03, -5.794e-02, 1.170e-02, 1.322e-01, 1.195e-01, -2.535e-02), r);
r = MulAdd(s1_1, M4(-5.424e-02, -1.541e-01, -9.945e-02, 8.862e-02, -1.198e-01, -3.591e-05, 4.305e-02, -1.079e-01, 1.605e-02, -3.377e-02, -5.398e-02, 1.201e-02, 3.432e-02, 1.090e-02, 8.871e-02, 3.186e-02), r);
r = MulAdd(s1_2, M4(-1.108e-01, -3.481e-02, -1.616e-02, -4.136e-03, -3.382e-02, 1.836e-02, -3.071e-02, -3.186e-02, -1.014e-01, -1.412e-01, -7.790e-02, 9.763e-02, -1.624e-02, -2.520e-02, -2.152e-02, 2.524e-02), r);
r = MulAdd(s1_3, M4(3.337e-03, -1.439e-02, 2.317e-03, 2.097e-01, 5.091e-03, 4.138e-02, -5.988e-02, -2.348e-02, -5.626e-03, 1.695e-02, 2.371e-02, -1.652e-02, 8.541e-02, -1.851e-01, 1.130e+00, -1.181e-01), r);
r = MulAdd(s1_4, M4(1.184e-01, -3.385e-02, 2.659e-02, 3.233e-01, 2.333e-01, 1.694e-01, 1.915e-01, 1.162e-01, 4.309e-02, -3.793e-02, 1.412e-01, -1.345e-02, -6.074e-01, -2.408e-01, -1.306e-01, 1.033e-01), r);
r = MulAdd(s1_5, M4(3.452e-01, 1.401e-01, 3.650e-02, -4.950e-02, 1.755e-01, -1.210e-01, -1.041e-02, 1.281e-01, 4.262e-01, 2.166e-02, 3.851e-02, 1.295e-01, -1.910e-01, -2.029e-02, -2.151e-02, -1.537e-02), r);
r = MulAdd(s1_6, M4(4.989e-03, -5.730e-02, 5.803e-02, 2.946e-02, 1.825e-02, 2.660e-02, -4.900e-03, 3.848e-03, 1.078e-02, 1.823e-02, -4.751e-03, 4.219e-02, -1.024e-01, 7.721e-02, -6.709e-01, 8.423e-02), r);
r = MulAdd(s1_7, M4(-1.567e-01, 4.125e-02, -2.721e-02, -1.831e-01, 9.470e-03, -1.205e-01, 1.793e-02, 1.160e-01, -4.874e-02, -4.902e-02, -1.479e-01, 7.102e-02, 6.699e-01, -1.383e-01, 1.314e-01, 2.999e-01), r);
r = MulAdd(s1_8, M4(-2.625e-01, -9.735e-02, -6.038e-02, 3.588e-03, 2.247e-02, 4.993e-02, 1.171e-02, -2.071e-02, 2.066e-01, 2.852e-01, -5.781e-02, -3.231e-01, 6.922e-02, 8.960e-02, 9.107e-02, -2.880e-02), r);
return r;
}
// Pass 4 ("conv3"): one thread per INPUT-sized texel, reading t0 and writing t1.
// Gathers the 3x3 neighbourhood around the texel, splits every tap into its
// non-negative part max(v, 0) and its non-positive part -max(-v, 0), and stores
// the result of f0() over those eighteen vectors.
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts. Last nine: non-positive parts.
	t1[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT
// l0(x, y): point-samples t1 at the offset (x, y) * pt from `pos` (via O) and converts it to V4.
// f0: pass-5 weighted sum. Starts from a constant 4-component vector, folds in
// eighteen MulAdd terms (one 4x4 weight matrix per input) and returns tanh() of the
// sum, so every component lies in [-1, 1]. Per Pass5, s0_0..s0_8 are the 3x3 taps
// clamped to >= 0 and s1_0..s1_8 are the same taps clamped to <= 0; Pass5 adds the
// four returned components to the luma of the four pixels of a 2x2 output quad.
// MulAdd is pulled in by `//!USE MulAdd`; its definition is not visible in this file
// (presumably r + mul(s, M) -- confirm).
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) V4(O(t1, float2(x, y)))
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { 1.708e-05, 2.435e-04, 1.267e-03, 1.926e-03 };
r = MulAdd(s0_0, M4(1.116e-01, 1.402e-01, 1.439e-02, 5.091e-02, -1.526e-02, -2.562e-02, -1.193e-02, -1.365e-02, -6.156e-02, -3.463e-02, 2.155e-02, -2.192e-02, -2.937e-02, -1.072e-01, -4.538e-02, -3.302e-02), r);
r = MulAdd(s0_1, M4(-1.192e-02, -1.724e-02, 9.899e-03, -5.861e-03, -1.552e-02, 2.422e-02, 4.929e-03, 7.339e-03, 4.700e-02, 1.993e-01, -6.323e-02, 5.778e-02, 1.499e-01, 3.916e-01, -4.578e-02, -2.026e-02), r);
r = MulAdd(s0_2, M4(5.431e-03, 1.916e-03, -2.064e-03, -6.545e-04, -1.731e-02, -8.081e-02, 1.391e-02, -7.036e-03, 7.739e-02, -1.588e-01, 2.970e-02, 3.357e-02, 3.869e-02, -7.824e-02, 1.813e-02, -6.252e-02), r);
r = MulAdd(s0_3, M4(5.283e-01, 8.076e-02, 3.430e-01, 2.332e-01, -3.540e-02, 1.903e-02, -1.354e-02, -1.415e-02, -1.644e-01, -1.319e-02, -9.781e-02, -3.256e-02, 2.768e-02, -3.914e-02, 1.596e-01, -1.067e-01), r);
r = MulAdd(s0_4, M4(-1.638e-02, 4.385e-01, -1.479e-01, -1.789e-02, -1.399e-01, -5.884e-02, -7.306e-02, -2.036e-03, 5.196e-01, -1.849e-01, 8.771e-01, 3.595e-01, -7.094e-01, 2.485e-02, -3.977e-02, 7.246e-01), r);
r = MulAdd(s0_5, M4(-1.647e-03, -6.027e-03, -3.787e-03, -1.975e-02, -4.810e-02, -4.557e-01, 4.921e-02, -1.313e-01, -2.044e-02, 3.533e-01, -7.591e-02, 1.249e-02, 2.648e-02, -5.215e-01, 1.204e-01, -2.254e-01), r);
r = MulAdd(s0_6, M4(-2.852e-02, -1.630e-02, 1.249e-01, -1.758e-02, 4.285e-02, 1.425e-02, -1.595e-02, 2.618e-02, 4.460e-03, 1.266e-02, -3.914e-02, 1.111e-02, 5.378e-02, 2.199e-02, 2.561e-03, 2.125e-02), r);
r = MulAdd(s0_7, M4(-6.567e-02, -4.333e-02, -4.153e-03, 1.692e-01, 5.376e-02, 5.736e-02, -1.860e-01, -9.094e-02, 3.357e-02, -3.186e-02, 1.244e-01, -9.606e-02, 6.227e-02, 6.827e-02, -2.086e-01, -6.625e-02), r);
r = MulAdd(s0_8, M4(4.553e-05, -3.116e-02, 1.023e-02, 2.322e-02, 8.623e-02, 1.125e-01, 2.802e-02, -2.768e-01, -1.003e-01, -2.143e-02, -2.413e-02, 1.460e-01, 5.421e-02, 5.798e-02, 3.478e-03, -1.421e-01), r);
r = MulAdd(s1_0, M4(2.165e-01, 1.123e-01, -3.653e-02, -6.070e-03, -1.021e-01, -6.901e-04, 6.256e-03, -3.182e-03, -4.285e-02, -6.763e-02, 2.278e-02, -1.860e-02, -2.689e-02, 2.567e-02, 2.634e-03, 3.600e-02), r);
r = MulAdd(s1_1, M4(-1.159e-01, -1.198e-01, 2.991e-02, -6.143e-02, 1.038e-01, -5.076e-02, -1.785e-02, -3.611e-02, 6.860e-02, 9.302e-02, -1.125e-02, 3.332e-02, 6.457e-02, -3.919e-02, 4.158e-03, -1.201e-02), r);
r = MulAdd(s1_2, M4(-6.554e-03, 3.359e-02, -2.003e-02, -2.227e-04, 3.354e-02, -3.700e-02, -9.588e-03, -3.740e-02, -1.336e-02, -2.556e-04, -4.733e-03, -1.636e-02, 1.127e-02, 1.421e-02, -1.019e-02, -2.731e-02), r);
r = MulAdd(s1_3, M4(3.642e-01, -3.756e-03, 6.584e-01, 1.773e-01, -1.638e-02, 1.109e-02, -7.427e-02, -1.572e-02, -1.869e-01, -3.059e-02, -8.088e-02, -5.092e-02, -5.794e-02, -4.431e-02, -7.912e-02, -9.767e-02), r);
r = MulAdd(s1_4, M4(-3.255e-02, 3.115e-01, -2.109e-01, 2.804e-01, -6.504e-01, -1.342e-02, 1.355e-01, 3.623e-01, 5.142e-01, 2.124e-01, 1.866e-01, 2.268e-01, -2.470e-02, 1.629e-01, 1.163e-01, 1.663e-01), r);
r = MulAdd(s1_5, M4(-1.093e-02, -1.640e-04, -3.502e-02, -3.746e-02, 1.836e-02, -5.959e-01, 1.323e-01, -2.388e-01, 3.482e-02, 1.823e-01, -3.895e-02, 5.164e-03, -7.314e-02, -3.897e-01, 6.275e-02, -3.974e-02), r);
r = MulAdd(s1_6, M4(7.922e-03, -3.284e-02, 1.274e-01, -2.930e-02, 6.307e-02, 2.548e-02, -4.094e-02, 2.130e-02, -1.123e-02, 1.824e-03, -9.595e-02, 1.808e-02, 7.955e-02, 3.285e-02, 4.592e-02, 7.153e-02), r);
r = MulAdd(s1_7, M4(-6.410e-02, -1.423e-02, -4.912e-02, 1.461e-01, 6.612e-02, 9.838e-02, -2.153e-01, -1.067e-01, -1.108e-02, -1.048e-01, 2.778e-01, -1.116e-01, 4.569e-02, 2.955e-02, -1.440e-01, -3.364e-02), r);
r = MulAdd(s1_8, M4(1.721e-02, 1.171e-02, 1.096e-02, -2.832e-02, 7.446e-02, 4.785e-02, 8.270e-03, -1.640e-01, -8.912e-02, -6.617e-02, 3.225e-03, 9.894e-02, 4.367e-02, 8.102e-02, -1.779e-02, -2.410e-01), r);
return tanh(r);
}
// Pass 5 ("out-shuffle"): reads INPUT and t1, writes OUTPUT.
// Each thread owns a 2x2 quad of OUTPUT pixels that maps to a single t1 texel.
// It evaluates f0() once over the 3x3 neighbourhood of that texel, then for each
// of the four pixels: samples INPUT with the linear sampler, converts RGB to YUV,
// adds one component of the f0 result to Y (clamped to [0, 1]), keeps U/V, converts
// back to RGB and writes it with alpha 1.
//   blockStart - origin of this thread group's block, in OUTPUT pixels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass5(uint2 blockStart, uint3 tid) {
	// First pixel (x, y) of this thread's 2x2 quad.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = ((dst >> 1) + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts. Last nine: non-positive parts.
	const V4 r = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };

	// Step back half an output step from the t1 texel position
	// (presumably landing on the centre of the quad's first pixel -- confirm).
	const float2 opt = float2(GetOutputPt());
	float2 uv = pos - 0.5f * opt;

	// Pixel (x, y) takes r.x.
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);

	// Pixel (x + 1, y) takes r.y.
	dst.x += 1;
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);

	// Pixel (x + 1, y + 1) takes r.w.
	dst.y += 1;
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);

	// Pixel (x, y + 1) takes r.z.
	dst.x -= 1;
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

View file

@ -1,412 +0,0 @@
// CuNNy 3x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
//
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N03
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
// Source image. It is listed as the input of pass 1 (see its //!IN line).
//!TEXTURE
Texture2D INPUT;
// Final image: twice the width and twice the height of INPUT.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// Point-filtered sampler; used by the O() macro below.
//!SAMPLER
//!FILTER POINT
SamplerState SP;
// Linear-filtered sampler.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): samples texture `t` with SP at mip level 0, at `pos + p * pt`.
// `pos` and `pt` are NOT macro parameters: every function that uses O() (or the
// per-pass l0() wrappers) must have locals with exactly these names in scope.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the 4-component vector and 4x4 matrix types used by the weight tables.
// MF4 / MF4x4 are not defined in this file (presumably they come from StubDefs.hlsli
// or the host, and select FP16 when available -- confirm).
#define V4 MF4
#define M4 MF4x4
// Intermediate textures, same size as INPUT, signed-normalized 8-bit RGBA.
// The passes alternate between them: each pass reads one and writes the other
// (see the //!IN and //!OUT lines of each pass).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): point-samples INPUT at the offset (x, y) * pt from `pos` (via O) and
// reduces its RGB to one scalar: a fixed 3-component dot product plus a constant,
// evaluated with float3 weights and then converted to MF.
// f0: pass-1 weighted sum. Starts from a constant 4-component vector and adds, for
// each of the nine scalar inputs s0_0..s0_8 (the 3x3 neighbourhood gathered by Pass1),
// that scalar times a fixed 4-component weight vector. Returns the accumulated vector.
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) MF((dot(float3(6.094e-01, 1.148e+00, 2.568e-01), O(INPUT, float2(x, y)).rgb) + -1.542e+00))
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
V4 r = { -4.952e-03, -2.750e-03, -9.137e-04, 6.736e-02 };
r = mad(s0_0, V4(-6.372e-02, 1.685e-01, -2.573e-02, -2.185e-02), r);
r = mad(s0_1, V4(-3.502e-02, -2.984e-03, 5.048e-02, -2.445e-01), r);
r = mad(s0_2, V4(9.644e-02, -7.557e-03, -1.770e-02, 3.162e-02), r);
r = mad(s0_3, V4(7.199e-02, -6.233e-01, -4.180e-01, 1.392e-01), r);
r = mad(s0_4, V4(-5.683e-01, 1.451e-01, -8.148e-02, 9.768e-02), r);
r = mad(s0_5, V4(4.702e-01, -1.319e-03, 3.745e-03, -4.204e-02), r);
r = mad(s0_6, V4(9.855e-03, 3.213e-01, 5.098e-01, 4.001e-02), r);
r = mad(s0_7, V4(8.216e-02, -1.219e-02, -3.347e-02, 5.017e-02), r);
r = mad(s0_8, V4(-6.691e-02, 5.417e-03, 1.235e-02, -9.640e-03), r);
return r;
}
// Pass 1 ("in"): one thread per INPUT-sized texel, writing t0.
// Gathers the 3x3 neighbourhood around the texel through l0() (one scalar per tap)
// and stores the 4-component result of f0().
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const MF tap0 = l0(-1.0, -1.0);
	const MF tap1 = l0(0.0, -1.0);
	const MF tap2 = l0(1.0, -1.0);
	const MF tap3 = l0(-1.0, 0.0);
	const MF tap4 = l0(0.0, 0.0);
	const MF tap5 = l0(1.0, 0.0);
	const MF tap6 = l0(-1.0, 1.0);
	const MF tap7 = l0(0.0, 1.0);
	const MF tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at the offset (x, y) * pt from `pos` (via O) and converts it to V4.
// f0: pass-2 weighted sum. Starts from a constant 4-component vector and folds in
// eighteen MulAdd terms, one 4x4 weight matrix per input. Per Pass2, s0_0..s0_8 are
// the 3x3 taps clamped to >= 0 and s1_0..s1_8 are the same taps clamped to <= 0.
// MulAdd is pulled in by `//!USE MulAdd`; its definition is not visible in this file
// (presumably r + mul(s, M) -- confirm).
// The numeric literals are model weights (presumably exported by the CuNNy tooling
// named in the file header); changing or reordering them changes the output.
#define l0(x, y) V4(O(t0, float2(x, y)))
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = { -1.511e-02, -2.848e-03, 7.160e-03, -2.555e-03 };
r = MulAdd(s0_0, M4(3.169e-01, 3.467e-01, -2.365e-01, 2.253e-01, 6.307e-02, 1.727e-01, -1.053e-01, 9.324e-02, -4.901e-02, -2.112e-01, 8.983e-02, -1.851e-01, -1.987e-01, 6.645e-02, 2.188e-02, 1.988e-02), r);
r = MulAdd(s0_1, M4(4.393e-02, 2.078e-01, -1.967e-01, 4.673e-02, -7.991e-02, 2.461e-01, -6.028e-02, 9.252e-02, 3.871e-01, 6.138e-02, -3.603e-01, -1.485e-01, 2.466e-01, 5.251e-02, -6.181e-02, 8.932e-02), r);
r = MulAdd(s0_2, M4(-1.707e-02, 2.598e-02, 1.641e-02, 2.780e-02, 2.425e-02, 1.769e-01, -8.461e-02, 1.067e-01, -2.503e-01, 6.051e-01, -2.782e-01, 1.311e-01, -8.456e-03, -1.370e-02, -6.391e-02, 6.935e-02), r);
r = MulAdd(s0_3, M4(-8.251e-01, -4.981e-01, -1.726e-01, -1.815e-01, 1.411e-01, 2.889e-02, -3.115e-01, -3.255e-01, 1.812e-03, -4.529e-02, 2.350e-01, 1.999e-01, -1.993e-01, -1.868e-02, 4.249e-02, -1.117e-01), r);
r = MulAdd(s0_4, M4(-4.732e-02, -5.673e-02, 1.274e-01, 4.894e-02, 9.126e-02, 1.717e-01, -3.294e-01, -2.378e-01, -7.089e-02, -8.116e-02, 2.510e-01, 7.381e-02, 1.275e-01, 8.030e-02, -1.671e-01, -1.824e-02), r);
r = MulAdd(s0_5, M4(3.373e-02, -4.163e-02, -4.077e-02, -2.085e-02, 1.265e-01, -4.133e-01, 7.433e-02, 7.763e-02, -1.466e-01, 3.291e-01, -7.784e-02, 9.472e-02, 2.725e-01, -2.393e-01, -6.913e-02, -9.445e-02), r);
r = MulAdd(s0_6, M4(3.043e-02, -9.985e-02, 1.538e-01, -2.529e-01, 2.379e-01, 1.079e-01, -1.517e-01, -9.289e-02, -1.396e-01, -4.354e-02, 8.463e-02, 7.052e-02, 5.629e-02, 3.293e-03, 5.342e-02, -1.606e-01), r);
r = MulAdd(s0_7, M4(3.626e-02, -1.421e-01, 4.017e-02, -3.963e-02, 2.148e-03, 5.522e-02, 3.174e-01, 2.270e-02, -5.590e-02, -9.875e-02, -1.683e-01, 5.415e-02, 1.509e-01, 7.709e-02, -1.161e-01, 1.440e-01), r);
r = MulAdd(s0_8, M4(-1.132e-02, 2.337e-02, 1.264e-02, 2.638e-03, -6.582e-02, -1.965e-01, 2.803e-01, 1.333e-01, 9.171e-02, 1.567e-01, -2.419e-01, -1.602e-01, -2.271e-01, 3.614e-02, 2.179e-01, 4.826e-02), r);
r = MulAdd(s1_0, M4(1.452e-01, 1.313e-01, -6.140e-02, 2.412e-01, -3.691e-02, 7.355e-02, -4.209e-02, 1.343e-01, -2.509e-02, -1.266e-01, 9.017e-02, -1.854e-02, -4.280e-01, -1.004e-01, 2.319e-01, 4.211e-02), r);
r = MulAdd(s1_1, M4(4.894e-02, 7.564e-02, -9.350e-02, 5.422e-02, -6.111e-02, 6.969e-02, -4.398e-02, 6.622e-02, 7.113e-01, 3.461e-01, -5.254e-01, -8.808e-02, 4.481e-01, 3.171e-01, -2.198e-01, 1.048e-01), r);
r = MulAdd(s1_2, M4(-3.483e-02, 3.150e-03, 2.215e-02, 2.616e-02, 1.468e-01, -1.295e-01, -1.470e-01, 3.371e-02, -4.514e-02, 4.677e-02, -1.313e-01, -1.176e-01, 1.507e-03, 2.290e-01, -2.163e-01, 3.895e-02), r);
r = MulAdd(s1_3, M4(-2.258e-01, -1.353e-01, -4.873e-01, -1.236e+00, 1.660e-01, -1.803e-02, -2.797e-01, -4.092e-01, -1.525e-01, -8.178e-02, 2.665e-01, 3.652e-01, -1.853e-01, -3.819e-02, 1.627e-01, -3.896e-01), r);
r = MulAdd(s1_4, M4(-1.005e-01, -3.821e-02, 9.917e-02, -1.324e-01, -2.040e-01, -3.586e-01, 9.776e-02, -1.376e-01, 2.065e-01, 2.017e-01, -1.320e-01, -2.225e-02, 2.944e-01, 5.393e-02, -4.301e-01, -7.240e-02), r);
r = MulAdd(s1_5, M4(5.353e-02, -4.257e-02, -4.131e-02, -3.943e-02, -6.151e-02, 3.059e-01, -1.481e-02, 3.662e-01, 3.098e-02, -8.774e-02, 1.790e-02, -1.332e-01, 8.670e-02, -6.985e-02, -1.359e-01, 2.063e-01), r);
r = MulAdd(s1_6, M4(-9.271e-02, 2.259e-01, 2.200e-02, -2.390e-01, 3.258e-01, 1.082e-01, -1.499e-01, -3.063e-02, -2.775e-01, -9.008e-02, 1.294e-01, 3.533e-02, 1.011e-02, 4.294e-02, 4.935e-02, -1.005e-01), r);
r = MulAdd(s1_7, M4(1.321e-02, -7.160e-02, 7.229e-02, -3.050e-02, 4.303e-02, -1.518e-01, 5.137e-01, 4.029e-02, 4.896e-02, 5.334e-02, -3.545e-01, 2.370e-02, 1.645e-01, 3.433e-02, -9.552e-03, 1.032e-01), r);
r = MulAdd(s1_8, M4(8.370e-03, -2.408e-02, 2.693e-02, -8.183e-03, -2.375e-02, -2.973e-01, 1.889e-01, 1.096e-01, 1.093e-02, 2.310e-01, -1.613e-01, -1.343e-01, -1.718e-01, -2.165e-02, 1.384e-01, 9.956e-02), r);
return r;
}
// Pass 2 ("conv1"): one thread per INPUT-sized texel, reading t0 and writing t1.
// Gathers the 3x3 neighbourhood around the texel, splits every tap into its
// non-negative part max(v, 0) and its non-positive part -max(-v, 0), and stores
// the result of f0() over those eighteen vectors.
//   blockStart - origin of this thread group's block, in texels.
//   tid        - thread id; only tid.x is used, mapped to an in-block offset by Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads that fall outside the texture have nothing to write.
	if (any(texel >= extent)) {
		return;
	}

	// NOTE: `pt` and `pos` are referenced by name inside the O()/l0() macros.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 taps, row by row: y = -1, 0, +1 and x = -1, 0, +1 within each row.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: non-negative parts. Last nine: non-positive parts.
	t1[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 for pass 3 (conv2): folds 18 four-component inputs into a single V4.
// Pass3 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd, V4 and M4 are declared outside this excerpt; MulAdd is
// presumably "input * matrix + accumulator" - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { -3.514e-03, 2.350e-03, 2.221e-03, 1.089e-03 };
r = MulAdd(s0_0, M4(6.983e-02, 8.935e-03, -1.644e-01, -4.232e-04, -1.981e-01, 9.265e-02, 1.769e-01, 1.705e-01, -2.300e-02, -7.408e-03, -4.221e-02, -1.617e-02, -6.026e-02, -9.185e-03, -7.420e-02, -4.238e-02), r);
r = MulAdd(s0_1, M4(1.832e-01, -1.117e-01, 1.784e-02, 6.345e-02, -9.651e-02, 5.753e-02, 1.480e-01, 1.284e-01, 3.957e-01, -2.684e-01, 2.853e-02, -5.823e-02, -8.184e-02, 1.062e-01, -2.604e-02, -7.579e-02), r);
r = MulAdd(s0_2, M4(-1.753e-01, 5.019e-03, -1.285e-01, 8.470e-02, -2.566e-01, 6.556e-02, -9.751e-02, 7.653e-03, -9.466e-02, 3.098e-02, -9.617e-02, -4.826e-02, 3.951e-02, -5.446e-02, 1.297e-01, 1.076e-01), r);
r = MulAdd(s0_3, M4(-7.377e-02, -2.183e-01, 9.806e-02, 1.735e-01, 2.795e-01, 3.730e-01, 1.906e-01, 1.313e-01, 2.115e-01, 2.222e-01, 1.880e-01, 2.427e-01, -1.177e-01, 2.587e-02, -1.928e-01, -1.489e-01), r);
r = MulAdd(s0_4, M4(-3.487e-01, -3.194e-01, 7.963e-01, -1.044e-01, 3.136e-01, -5.467e-02, 5.059e-01, -4.801e-02, -4.943e-01, -1.466e-01, -5.938e-02, -9.473e-01, 2.661e-01, -1.545e-01, 1.986e-01, -2.172e-02), r);
r = MulAdd(s0_5, M4(-3.450e-01, 1.931e-01, -2.303e-01, -1.880e-01, -1.323e-01, 1.839e-01, -1.130e-01, -5.181e-02, 3.049e-02, 9.834e-02, -1.342e-01, -1.072e-01, 1.925e-02, -9.652e-02, 1.169e-01, 2.084e-01), r);
r = MulAdd(s0_6, M4(1.543e-02, 2.202e-01, 4.809e-02, 1.085e-01, 3.076e-02, -4.127e-01, 4.606e-02, 9.444e-02, 7.886e-02, -1.314e-01, -1.638e-02, 4.353e-02, 9.790e-02, -6.783e-02, -1.008e-01, -1.558e-01), r);
r = MulAdd(s0_7, M4(-4.453e-02, 3.133e-01, -2.217e-01, -5.271e-02, -2.055e-01, -1.000e-01, 8.374e-02, 6.141e-02, 2.147e-02, -3.844e-01, -2.203e-01, -1.105e-01, -3.596e-02, 2.026e-01, 3.174e-01, 1.519e-01), r);
r = MulAdd(s0_8, M4(-5.107e-03, 2.380e-01, 2.147e-02, -8.032e-02, -9.743e-02, 6.943e-02, 9.403e-02, 3.742e-02, -1.822e-02, -4.950e-02, 7.963e-02, -1.338e-01, -1.491e-01, 1.655e-02, -5.817e-02, 1.164e-01), r);
r = MulAdd(s1_0, M4(8.679e-02, -7.335e-02, -5.999e-02, -4.504e-02, -3.329e-02, 4.349e-03, -4.883e-02, 3.159e-02, -7.948e-02, 3.308e-02, 6.579e-02, 1.607e-01, 1.336e-01, -1.042e-01, -2.368e-01, -1.546e-01), r);
r = MulAdd(s1_1, M4(2.764e-01, -6.665e-02, 1.661e-02, -4.103e-02, 1.095e-01, -1.159e-01, -1.142e-01, -1.412e-01, 4.033e-01, -8.697e-02, 2.387e-01, 1.762e-01, 4.948e-01, -1.533e-01, 7.816e-02, 5.700e-02), r);
r = MulAdd(s1_2, M4(1.187e-01, -6.571e-02, 4.698e-02, 4.931e-02, -5.523e-02, 3.925e-02, -7.453e-02, -8.429e-02, -2.202e-01, 6.090e-02, -1.460e-01, 2.777e-02, 4.405e-01, 6.445e-03, 3.494e-01, 3.311e-01), r);
r = MulAdd(s1_3, M4(-4.333e-02, -8.517e-02, 1.372e-01, 2.066e-01, 4.728e-01, 1.195e-01, -2.627e-01, -2.280e-01, 1.606e-01, 2.216e-01, 2.269e-01, 3.505e-01, -2.499e-01, -3.977e-01, -3.659e-02, 1.460e-02), r);
r = MulAdd(s1_4, M4(-4.640e-01, -7.221e-01, -2.524e-01, -6.513e-01, 6.699e-01, -1.727e-01, 4.444e-01, -3.115e-01, -6.748e-01, 1.063e-01, 6.487e-01, -3.195e-01, -5.136e-01, -8.272e-01, 4.014e-01, 4.914e-01), r);
r = MulAdd(s1_5, M4(-1.112e-03, -1.293e-02, 1.567e-02, -1.266e-01, 1.185e-01, 4.940e-02, -9.925e-02, -1.034e-01, -1.041e-01, 1.822e-01, -4.277e-02, 1.313e-01, -6.459e-01, -1.562e-01, -3.961e-01, -7.262e-02), r);
r = MulAdd(s1_6, M4(1.499e-02, 3.135e-01, 2.187e-01, 2.386e-01, 1.171e-01, -4.899e-01, -1.987e-01, -1.717e-01, 5.232e-02, -1.984e-01, 9.338e-04, 1.092e-01, 1.545e-01, 4.183e-01, 1.180e-01, 1.102e-01), r);
r = MulAdd(s1_7, M4(-1.411e-01, 2.619e-01, -2.549e-01, -2.113e-01, -1.109e-01, -3.038e-01, 7.579e-02, -3.585e-02, -1.373e-03, -2.713e-01, -5.527e-02, 7.052e-02, -1.648e-01, 7.324e-01, 3.974e-01, 2.306e-01), r);
r = MulAdd(s1_8, M4(-1.861e-02, 9.414e-02, -6.739e-02, -8.921e-02, -2.337e-02, -2.657e-02, -3.376e-03, -7.209e-02, -1.042e-01, -2.504e-02, 1.287e-01, -1.459e-02, -1.617e-01, 2.384e-01, -6.969e-01, -3.760e-01), r);
return r;
}
// Pass 3 entry point (conv2, reads t1 through l0(), writes t0).
// One invocation produces one texel: the nine l0() taps at offsets -1..1 in
// x and y are split into max(v, 0) and -max(-v, 0), and f0() of the resulting
// 18 vectors is stored to t0[gxy].
//   blockStart - offset added to the coordinate returned by Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass3(uint2 blockStart, uint3 tid) {
	const uint2 extent = GetInputSize();
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	// Invocations outside the GetInputSize() extent have nothing to write.
	if (any(gxy >= extent)) {
		return;
	}
	// `pt` and `pos` are not used directly below; presumably the O() macro
	// behind l0() (defined outside this excerpt) picks them up by name, so the
	// identifiers are kept as they are.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 within each row).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative half of every tap (first nine arguments of f0).
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive half of every tap (last nine arguments of f0).
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 for pass 4 (conv3): folds 18 four-component inputs into a single V4.
// Pass4 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd, V4 and M4 are declared outside this excerpt; MulAdd is
// presumably "input * matrix + accumulator" - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { -4.349e-03, -3.760e-03, 4.684e-03, 4.745e-03 };
r = MulAdd(s0_0, M4(1.869e-01, 8.774e-02, -6.451e-02, 6.682e-02, 8.374e-02, 1.313e-02, -2.649e-02, 2.741e-02, -3.609e-02, -9.330e-02, -8.233e-02, 1.117e-01, -1.203e-01, 1.719e-02, 1.288e-01, -9.851e-02), r);
r = MulAdd(s0_1, M4(3.100e-01, 5.063e-02, 1.169e-01, -3.828e-02, 3.428e-01, 4.869e-02, -1.232e-02, -1.003e-02, 2.756e-01, 3.916e-01, 1.450e-01, 1.078e-01, -2.568e-01, -2.157e-01, -1.057e-01, -1.338e-01), r);
r = MulAdd(s0_2, M4(1.199e-01, -1.890e-01, 5.870e-03, -5.995e-03, 2.255e-01, -2.325e-03, 7.916e-03, -2.038e-02, 1.353e-01, -9.590e-02, -2.119e-02, -5.860e-02, -7.698e-02, -3.608e-02, -3.571e-02, 2.010e-02), r);
r = MulAdd(s0_3, M4(9.889e-02, -2.665e-02, -2.627e-01, 3.583e-01, 7.891e-02, 8.737e-02, 5.322e-02, 5.246e-04, -5.188e-02, -8.491e-02, -4.991e-02, -3.735e-02, 5.711e-02, 4.482e-02, 5.660e-02, -1.322e-01), r);
r = MulAdd(s0_4, M4(-5.488e-01, 2.898e-01, 1.046e+00, 6.036e-01, -3.180e-01, -6.309e-01, -2.627e-01, 1.734e-01, -2.067e-01, 3.775e-02, -2.881e-01, -9.242e-02, 3.369e-01, 2.554e-02, -1.645e-01, 4.973e-01), r);
r = MulAdd(s0_5, M4(6.976e-03, -1.830e-01, 2.842e-01, 2.570e-02, -2.902e-01, 5.059e-01, 1.944e-01, 1.794e-02, -1.333e-01, 2.341e-01, 4.161e-01, -5.179e-02, 8.176e-02, -2.435e-02, -1.598e-02, 6.211e-02), r);
r = MulAdd(s0_6, M4(-2.668e-02, -6.958e-02, -5.015e-02, 8.035e-02, 4.451e-02, -1.290e-03, -7.688e-02, 1.708e-01, -5.133e-02, -2.768e-02, -1.780e-02, -6.317e-02, -9.692e-03, -2.748e-03, 9.070e-03, -1.314e-01), r);
r = MulAdd(s0_7, M4(1.402e-01, 4.997e-02, -4.973e-02, 6.839e-01, 2.079e-02, -2.511e-02, 3.403e-01, -3.077e-01, -2.831e-02, 4.816e-02, -9.142e-02, -8.176e-02, -2.999e-02, -5.749e-03, -5.579e-02, -2.355e-01), r);
r = MulAdd(s0_8, M4(-1.783e-02, -2.882e-02, 9.841e-02, 4.473e-02, 4.128e-02, -3.071e-02, -2.378e-01, 1.347e-01, -2.285e-02, 1.317e-02, -1.632e-02, 1.058e-01, -3.696e-02, -6.864e-03, -8.989e-02, -7.315e-02), r);
r = MulAdd(s1_0, M4(8.857e-02, 3.169e-02, -1.896e-02, 1.258e-02, 7.086e-02, 5.699e-02, 1.550e-02, -1.836e-02, 1.209e-01, 5.334e-02, -1.557e-02, -2.374e-02, -1.411e-02, 1.543e-02, 1.769e-02, -4.332e-02), r);
r = MulAdd(s1_1, M4(1.199e-01, -8.203e-03, -1.695e-02, -3.214e-02, 5.918e-01, 3.458e-01, 7.684e-02, -5.137e-01, 2.827e-01, -2.008e-02, -1.848e-01, 2.147e-01, 7.212e-02, -3.906e-03, -2.220e-01, -1.918e-01), r);
r = MulAdd(s1_2, M4(4.464e-02, 4.035e-02, 4.265e-03, 1.350e-02, -4.623e-01, -1.882e-01, 9.929e-02, -2.295e-01, 2.010e-01, 6.059e-01, 3.648e-01, -1.670e-02, -6.763e-02, -2.588e-01, -1.741e-01, 3.358e-02), r);
r = MulAdd(s1_3, M4(1.003e-01, -2.961e-02, -1.715e-01, 1.057e-01, 3.275e-03, 1.877e-02, -4.995e-02, 1.181e-01, 3.600e-02, 2.101e-02, -1.050e-01, 8.035e-02, -8.107e-02, -1.067e-01, -5.457e-02, 5.339e-02), r);
r = MulAdd(s1_4, M4(3.875e-01, 3.638e-01, 1.178e-01, -4.404e-02, 6.128e-02, -1.193e-01, -3.161e-01, 3.510e-01, -3.482e-02, -2.842e-01, -3.917e-01, 4.525e-01, 1.969e-01, 5.299e-01, 4.720e-01, -2.266e-01), r);
r = MulAdd(s1_5, M4(-1.420e-02, 2.325e-02, -8.697e-02, -4.296e-03, 8.697e-02, 7.490e-02, 1.773e-01, 4.010e-01, 2.380e-01, -1.182e-01, 9.121e-01, 2.252e-01, 1.348e-01, -7.448e-02, -8.496e-01, -3.335e-01), r);
r = MulAdd(s1_6, M4(-7.923e-02, -2.533e-02, -4.896e-02, -5.473e-02, -5.329e-03, 1.285e-02, -1.763e-02, 7.009e-02, 9.670e-04, -1.889e-02, -1.008e-01, 1.149e-01, 7.259e-03, 4.080e-02, 1.042e-01, -2.627e-01), r);
r = MulAdd(s1_7, M4(-9.746e-02, 6.679e-02, -1.421e-01, -2.202e-01, -9.918e-03, -2.413e-02, -1.554e-02, 7.011e-03, -3.226e-02, -3.024e-02, -5.431e-02, 7.446e-02, 5.860e-02, 2.851e-02, -2.367e-01, 2.562e-02), r);
r = MulAdd(s1_8, M4(-4.627e-02, 4.226e-02, -8.654e-02, -3.312e-02, 1.600e-02, 2.983e-02, 8.834e-03, -3.871e-02, -4.137e-03, 1.767e-02, 2.492e-02, -5.391e-02, 8.133e-03, 1.430e-02, -2.428e-02, -1.132e-01), r);
return r;
}
// Pass 4 entry point (conv3, reads t0 through l0(), writes t1).
// One invocation produces one texel: the nine l0() taps at offsets -1..1 in
// x and y are split into max(v, 0) and -max(-v, 0), and f0() of the resulting
// 18 vectors is stored to t1[gxy].
//   blockStart - offset added to the coordinate returned by Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass4(uint2 blockStart, uint3 tid) {
	const uint2 extent = GetInputSize();
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	// Invocations outside the GetInputSize() extent have nothing to write.
	if (any(gxy >= extent)) {
		return;
	}
	// `pt` and `pos` are not used directly below; presumably the O() macro
	// behind l0() (defined outside this excerpt) picks them up by name, so the
	// identifiers are kept as they are.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 within each row).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative half of every tap (first nine arguments of f0).
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive half of every tap (last nine arguments of f0).
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t1[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 for pass 5 (out-shuffle): folds 18 four-component inputs into a single V4
// and returns tanh() of the accumulated value.
// Pass5 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order; Pass5 then adds one
// component of the result to each of four output pixels.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd, V4 and M4 are declared outside this excerpt; MulAdd is
// presumably "input * matrix + accumulator" - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { 7.204e-05, -6.226e-05, 2.867e-04, -3.251e-05 };
r = MulAdd(s0_0, M4(-4.783e-03, 7.235e-03, -7.275e-03, -2.802e-03, 4.921e-02, 7.543e-02, -3.357e-02, 1.213e-02, 2.900e-02, 2.380e-03, -9.028e-03, -2.594e-02, 1.576e-03, 3.334e-04, -2.460e-02, -1.285e-02), r);
r = MulAdd(s0_1, M4(4.582e-02, 9.378e-04, 2.217e-02, 5.083e-02, -1.054e-02, 8.518e-02, -1.884e-02, -5.149e-02, 1.983e-02, -1.106e-02, -4.317e-03, 5.384e-02, -5.193e-02, 1.089e-02, -9.384e-03, 3.137e-02), r);
r = MulAdd(s0_2, M4(-5.241e-03, 3.821e-02, -1.136e-02, -3.033e-02, 3.186e-02, -3.270e-03, 1.422e-02, 2.401e-02, -1.360e-02, 1.024e-01, -6.042e-02, -2.325e-02, -1.248e-01, -1.377e-01, 1.654e-02, -1.347e-02), r);
r = MulAdd(s0_3, M4(-3.552e-02, -3.211e-02, -2.282e-03, 1.775e-02, 1.360e-01, 2.808e-02, 1.082e-01, -1.311e-02, -1.699e-02, -2.628e-02, 3.430e-02, -3.880e-03, 2.514e-02, -3.171e-02, 4.675e-02, -2.711e-02), r);
r = MulAdd(s0_4, M4(4.756e-01, 2.686e-01, 4.514e-02, -8.813e-02, 2.636e-01, -4.893e-01, 1.301e-01, 1.304e-01, 3.778e-01, 2.765e-01, 3.369e-01, 8.811e-02, 5.080e-02, 2.783e-01, -1.131e-01, 2.487e-01), r);
r = MulAdd(s0_5, M4(-2.961e-02, 7.757e-02, -8.471e-02, -4.636e-02, -6.862e-02, 1.733e-01, -7.301e-02, -1.408e-02, 1.636e-02, 9.982e-02, 5.704e-02, 2.568e-01, -2.224e-02, -2.588e-01, -2.202e-01, -4.898e-01), r);
r = MulAdd(s0_6, M4(1.058e-01, -2.810e-02, -2.960e-02, -8.398e-02, -9.106e-02, 6.642e-02, -2.574e-02, 7.841e-02, -1.978e-02, -3.700e-02, -1.504e-02, -3.186e-02, 2.438e-03, 6.191e-03, -1.155e-02, -1.161e-02), r);
r = MulAdd(s0_7, M4(-6.316e-01, -7.748e-02, 8.006e-01, 3.936e-01, 1.300e-01, -1.999e-01, 2.351e-01, -7.485e-01, -7.151e-02, -4.285e-02, -2.277e-02, 2.849e-02, -2.207e-02, -2.585e-02, -2.498e-02, -3.308e-02), r);
r = MulAdd(s0_8, M4(-2.002e-01, -6.934e-01, -1.093e-01, 3.325e-01, -5.778e-02, 2.138e-02, -2.930e-02, 1.794e-01, -3.028e-03, 2.300e-03, 5.845e-03, -1.959e-02, 1.403e-02, 1.565e-02, 1.840e-02, -6.027e-04), r);
r = MulAdd(s1_0, M4(2.228e-02, -8.352e-03, -1.007e-02, -1.911e-02, -1.489e-02, 2.785e-03, -9.190e-03, 5.858e-03, 2.420e-02, -7.701e-03, -2.327e-02, -2.494e-02, -8.526e-03, -2.384e-02, -2.601e-02, -4.833e-02), r);
r = MulAdd(s1_1, M4(5.671e-02, 3.666e-02, 3.309e-02, 1.011e-02, -8.053e-03, 4.673e-02, -5.358e-02, -2.451e-02, 3.779e-01, 5.642e-02, -2.324e-01, -3.499e-02, -3.479e-01, 1.179e-01, -4.630e-02, 1.118e-01), r);
r = MulAdd(s1_2, M4(-1.650e-02, 6.203e-04, -1.322e-02, -1.996e-02, 2.118e-02, -9.244e-03, 2.813e-02, 9.773e-03, -2.654e-02, -8.373e-02, 6.663e-04, -6.860e-02, -3.436e-02, -7.207e-01, 2.389e-01, 1.903e-01), r);
r = MulAdd(s1_3, M4(-8.045e-02, -2.073e-02, 3.380e-02, 1.327e-02, 1.247e-01, 1.129e-02, 6.421e-02, -8.326e-03, -4.675e-02, 4.920e-02, -3.699e-02, 4.601e-02, 3.389e-02, -4.151e-02, 3.012e-02, -2.241e-02), r);
r = MulAdd(s1_4, M4(5.223e-01, 1.394e-01, 1.222e-01, -7.687e-03, -3.115e-01, 3.989e-02, -1.679e-01, 2.607e-01, 4.393e-01, -1.821e-01, 1.006e+00, -2.920e-01, 8.062e-02, 2.231e-01, -1.282e-02, 2.495e-01), r);
r = MulAdd(s1_5, M4(-1.146e-01, 6.738e-02, -1.655e-02, 1.178e-02, -3.058e-02, 1.093e-01, 9.367e-03, 1.382e-02, -7.397e-02, 2.300e-01, -4.202e-02, 1.765e-01, -4.671e-02, -1.375e-02, -3.662e-01, -5.254e-01), r);
r = MulAdd(s1_6, M4(5.090e-02, 8.633e-03, -1.128e-02, -3.186e-02, -6.263e-02, 4.143e-02, -2.214e-02, 5.270e-02, -1.370e-02, -1.692e-02, -2.644e-02, -9.847e-03, -2.147e-03, -7.941e-03, -1.323e-04, -5.173e-03), r);
r = MulAdd(s1_7, M4(-9.353e-02, 6.696e-02, 2.744e-01, 2.743e-01, 9.809e-02, -1.439e-01, -2.583e-02, -3.717e-01, -5.135e-02, -1.889e-02, -1.775e-02, 9.383e-03, -2.496e-02, -2.936e-02, -2.578e-02, -1.586e-02), r);
r = MulAdd(s1_8, M4(-1.565e-02, -1.635e-01, -1.800e-01, -2.607e-01, 1.975e-02, 1.594e-02, -4.568e-02, 1.218e-01, -6.668e-03, 7.923e-03, -4.625e-02, 1.324e-02, -6.838e-03, 2.045e-02, 1.141e-02, 2.717e-02), r);
// The accumulated value is passed through tanh before being returned.
return tanh(r);
}
// Pass 5 entry point (out-shuffle, reads INPUT and t1, writes OUTPUT).
// One invocation evaluates f0() once for the source texel at gxy >> 1 and
// writes the 2x2 block of OUTPUT pixels whose top-left corner is gxy. For each
// of the four pixels INPUT is sampled with SL at that pixel's position,
// converted with rgb2yuv, one component of the f0() result is added to the
// first YUV component (then saturated), the other two components pass through
// unchanged, and the result is converted back with yuv2rgb and stored with
// alpha 1.
//   blockStart - offset added to the doubled coordinate from Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass5(uint2 blockStart, uint3 tid) {
	const uint2 outExtent = GetOutputSize();
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	// Only the top-left pixel of the 2x2 block is tested against the extent.
	if (any(gxy >= outExtent)) {
		return;
	}
	// `pt` and `pos` are presumably picked up by name inside the O() macro
	// behind l0() (defined outside this excerpt), so the identifiers are kept.
	float2 pt = float2(GetInputPt());
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 within each row).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative half of every tap (first nine arguments of f0).
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive half of every tap (last nine arguments of f0).
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	// One component per output pixel: x, y, w, z in the write order below.
	const V4 res = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const MF3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081
	};
	static const MF3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099
	};

	const float2 opt = float2(GetOutputPt());
	// Step back by half an output texel from the source texel centre, then walk
	// the block with incremental steps. The add/subtract sequence is kept as is
	// so the sample coordinates stay bit-for-bit the same.
	float2 uv = pos - 0.5f * opt;

	// (x, y): first component.
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.x), yuv.yz)), 1);

	// (x + 1, y): second component.
	gxy.x += 1;
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.y), yuv.yz)), 1);

	// (x + 1, y + 1): fourth component.
	gxy.y += 1;
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.w), yuv.yz)), 1);

	// (x, y + 1): third component.
	gxy.x -= 1;
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + res.z), yuv.yz)), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,484 +0,0 @@
// CuNNy 4x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N04
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 MF4
#define M4 MF4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
#define l0(x, y) (dot(MF3(2.428e-01, 4.714e-01, 1.229e-01), O(INPUT, float2(x, y)).rgb) + MF(-7.696e-02))
// f0 for pass 1 (in): turns nine scalar inputs into one V4 (V4 is MF4).
// Pass1 calls this with s0_0..s0_8 = the nine l0() values taken at offsets
// (-1,-1), (0,-1), (1,-1), (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1); for
// this pass l0() is a weighted sum of the sampled INPUT rgb plus a constant.
// Starting from a constant vector, each scalar scales its own constant V4 and
// is accumulated with mad(). The chain is strictly sequential, so the statement
// order determines the floating-point rounding of the result.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { 1.690e-02, 8.856e-03, -9.136e-04, 2.267e-02 };
r = mad(s0_0, V4(9.154e-02, 3.758e-01, 2.353e-02, -5.798e-02), r);
r = mad(s0_1, V4(-5.382e-01, 1.688e-01, -1.190e-01, 4.082e-02), r);
r = mad(s0_2, V4(2.460e-02, -5.810e-02, 7.788e-02, 3.018e-02), r);
r = mad(s0_3, V4(1.211e-01, -1.552e-01, -9.990e-02, 3.963e-02), r);
r = mad(s0_4, V4(-2.611e-01, -4.835e-01, -6.965e-01, -4.893e-01), r);
r = mad(s0_5, V4(-3.017e-01, -4.435e-02, 1.836e-01, 4.600e-01), r);
r = mad(s0_6, V4(1.275e-01, 2.485e-01, 7.354e-02, -4.648e-02), r);
r = mad(s0_7, V4(2.527e-01, 1.279e-01, 3.053e-01, 3.957e-02), r);
r = mad(s0_8, V4(1.003e-02, 1.193e-01, 2.476e-01, -2.051e-02), r);
return r;
}
// Pass 1 entry point (in, reads INPUT through l0(), writes t0).
// One invocation produces one texel: the nine scalar l0() values at offsets
// -1..1 in x and y are handed to f0() and the result is stored to t0[gxy].
//   blockStart - offset added to the coordinate returned by Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 extent = GetInputSize();
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	// Invocations outside the GetInputSize() extent have nothing to write.
	if (any(gxy >= extent)) {
		return;
	}
	// The O() macro behind l0() expands to `pos + p * pt`, so these two locals
	// must keep exactly these names.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// Top row (y = -1).
	const MF v0 = l0(-1.0, -1.0);
	const MF v1 = l0(0.0, -1.0);
	const MF v2 = l0(1.0, -1.0);
	// Middle row (y = 0).
	const MF v3 = l0(-1.0, 0.0);
	const MF v4 = l0(0.0, 0.0);
	const MF v5 = l0(1.0, 0.0);
	// Bottom row (y = 1).
	const MF v6 = l0(-1.0, 1.0);
	const MF v7 = l0(0.0, 1.0);
	const MF v8 = l0(1.0, 1.0);

	t0[gxy] = f0(v0, v1, v2, v3, v4, v5, v6, v7, v8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 for pass 2 (conv1): folds 18 four-component inputs into a single V4
// (V4 is MF4, M4 is MF4x4).
// Pass2 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd is pulled in by the USE directive and not defined in
// this file; presumably "input * matrix + accumulator" - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { -4.697e-03, -2.213e-02, 3.898e-01, -1.481e-02 };
r = MulAdd(s0_0, M4(-4.540e-03, -2.499e-01, 4.202e-02, 1.132e-02, 2.910e-02, -3.788e-02, 3.330e-02, -2.254e-02, -1.953e-01, 1.226e-01, -1.907e-01, -1.378e-01, 9.555e-02, -2.443e-01, 6.124e-02, -7.256e-03), r);
r = MulAdd(s0_1, M4(-1.225e-01, -1.812e-01, -1.238e-02, 4.088e-01, -9.977e-02, 4.395e-02, -2.394e-02, -5.584e-03, 2.939e-01, 4.102e-01, 6.228e-02, 3.822e-01, 8.618e-02, -1.109e-01, 1.776e-01, -7.505e-02), r);
r = MulAdd(s0_2, M4(2.047e-01, -6.853e-02, 1.880e-02, -9.030e-03, 1.505e-01, 7.782e-02, 1.347e-02, 5.566e-01, -6.951e-02, -1.352e-01, 1.941e-03, 3.975e-02, 1.637e-01, 6.708e-02, 1.501e-02, 1.373e-01), r);
r = MulAdd(s0_3, M4(-1.974e-01, 1.068e-01, -1.102e-01, 5.909e-02, 2.355e-03, 1.275e-01, -5.986e-02, -5.288e-02, 8.785e-04, -1.440e-01, -3.369e-01, -9.128e-02, 2.030e-01, 4.937e-01, -1.637e-01, 4.814e-02), r);
r = MulAdd(s0_4, M4(-3.954e-01, 4.772e-01, -5.841e-01, -8.070e-02, -2.056e-01, -2.335e-01, -2.091e-01, 1.223e-01, -2.686e-01, 1.240e+00, 7.095e-02, 6.502e-01, 1.044e-01, -3.071e-01, -2.892e-01, 4.861e-01), r);
r = MulAdd(s0_5, M4(5.943e-02, 2.245e-01, 4.014e-01, -1.063e-01, -1.869e-01, 1.384e-01, 2.996e-01, -1.928e-01, 1.212e-01, 2.849e-01, 2.093e-01, -3.821e-01, -8.705e-02, 1.976e-01, 5.176e-01, -7.461e-02), r);
r = MulAdd(s0_6, M4(1.048e-01, 2.374e-02, 2.730e-01, 1.446e-01, -5.406e-02, -1.587e-02, -2.014e-01, -3.422e-02, -2.114e-01, -5.198e-01, 2.674e-02, -6.078e-02, -2.293e-01, -9.914e-02, -2.110e-01, 7.008e-02), r);
r = MulAdd(s0_7, M4(5.799e-02, 4.932e-01, 4.559e-01, -3.118e-02, 4.706e-02, -2.242e-01, -3.165e-01, -9.912e-02, 4.041e-01, 7.241e-01, -1.696e-01, 1.990e-01, 4.697e-01, 9.965e-03, -1.141e-02, -1.365e-02), r);
r = MulAdd(s0_8, M4(-1.744e-01, -7.119e-02, 3.632e-01, -2.802e-01, -3.155e-01, 4.455e-01, -1.866e-02, -2.667e-02, 1.255e-01, -5.762e-01, -2.226e-02, 2.812e-02, -2.349e-01, 1.552e-01, -6.424e-03, 7.450e-02), r);
r = MulAdd(s1_0, M4(6.159e-02, -4.426e-02, 2.277e-02, 1.040e-01, -6.306e-04, -1.704e-01, 3.807e-02, -8.670e-02, -1.403e-01, 1.644e-01, -9.679e-02, -1.055e-01, 2.394e-01, -5.504e-02, 8.006e-02, 6.312e-02), r);
r = MulAdd(s1_1, M4(-1.134e-01, -1.030e-01, -2.777e-02, 2.955e-01, -1.225e-01, -4.096e-02, -2.748e-02, 9.404e-02, 2.890e-01, -2.441e-01, 1.560e-01, 1.694e-01, 1.853e-01, 3.311e-01, 3.408e-01, -8.678e-02), r);
r = MulAdd(s1_2, M4(1.821e-01, 3.898e-02, -2.560e-02, 1.160e-01, 2.382e-01, -1.638e-01, -1.345e-01, 3.193e-01, -1.839e-01, -2.638e-01, 5.265e-02, 2.415e-01, 2.803e-01, 1.919e-01, -7.340e-02, 1.762e-02), r);
r = MulAdd(s1_3, M4(-2.606e-01, -1.263e-01, -3.067e-02, -1.695e-02, 4.665e-03, 2.947e-02, -1.965e-02, -2.658e-02, -7.935e-02, -1.566e-01, -3.246e-01, -1.075e-03, 1.896e-01, -2.937e-01, -1.020e-01, -1.513e-01), r);
r = MulAdd(s1_4, M4(-3.696e-01, 8.901e-02, -1.890e-01, -2.804e-02, -2.998e-01, -6.597e-02, -2.613e-01, 3.877e-01, -1.032e+00, -2.328e-01, 7.941e-02, 5.733e-01, 8.618e-02, 4.213e-02, -1.242e+00, 5.861e-01), r);
r = MulAdd(s1_5, M4(1.919e-02, -5.609e-02, 3.295e-01, -2.364e-01, -4.238e-01, -6.041e-01, 3.389e-01, -4.460e-01, 4.482e-02, 1.077e-03, 8.990e-02, -2.725e-01, -4.829e-02, 1.184e-01, 1.941e-01, -3.646e-01), r);
r = MulAdd(s1_6, M4(2.968e-01, 2.018e-01, 2.695e-01, 8.891e-02, -5.857e-02, 6.005e-02, -2.440e-01, -1.349e-02, -7.572e-02, -3.213e-01, 6.274e-02, -1.229e-02, -7.589e-01, -2.313e-01, -1.627e-01, 2.538e-01), r);
r = MulAdd(s1_7, M4(-5.728e-02, 1.333e-01, 2.492e-01, -3.609e-02, 1.936e-01, -1.276e-01, -3.034e-01, -1.091e-01, 1.390e-01, 3.356e-01, -1.183e-01, 2.047e-01, 3.779e-01, -3.353e-01, 2.019e-01, 4.337e-02), r);
r = MulAdd(s1_8, M4(-1.386e-01, 1.179e-01, 2.340e-01, -1.604e-01, -4.890e-01, -5.407e-01, -1.546e-01, -1.826e-01, 1.596e-01, -1.784e-01, 5.777e-02, 3.961e-02, -2.290e-01, 2.752e-01, -4.260e-02, 9.649e-02), r);
return r;
}
// Pass 2 entry point (conv1, reads t0 through l0(), writes t1).
// One invocation produces one texel: the nine l0() taps at offsets -1..1 in
// x and y are split into max(v, 0) and -max(-v, 0), and f0() of the resulting
// 18 vectors is stored to t1[gxy].
//   blockStart - offset added to the coordinate returned by Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 extent = GetInputSize();
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	// Invocations outside the GetInputSize() extent have nothing to write.
	if (any(gxy >= extent)) {
		return;
	}
	// The O() macro behind l0() expands to `pos + p * pt`, so these two locals
	// must keep exactly these names.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 within each row).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative half of every tap (first nine arguments of f0).
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive half of every tap (last nine arguments of f0).
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t1[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 for pass 3 (conv2): folds 18 four-component inputs into a single V4
// (V4 is MF4, M4 is MF4x4).
// Pass3 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd is pulled in by the USE directive and not defined in
// this file; presumably "input * matrix + accumulator" - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { -8.341e-03, 1.434e-02, 5.791e-03, -1.033e-02 };
r = MulAdd(s0_0, M4(-1.362e-01, -5.847e-02, 2.766e-02, 2.969e-02, 9.796e-02, 6.555e-02, -3.067e-02, -5.139e-02, 1.512e-01, 1.401e-01, -3.820e-03, 2.649e-02, -1.802e-01, -2.099e-02, -6.604e-02, 4.042e-02), r);
r = MulAdd(s0_1, M4(-2.144e-01, -1.437e-01, 4.670e-02, -2.348e-01, 9.990e-02, -5.186e-02, 1.658e-01, 9.557e-02, -1.353e-01, -1.146e-01, -9.837e-02, -8.956e-02, 1.229e-01, 2.354e-01, -2.342e-01, -1.343e-01), r);
r = MulAdd(s0_2, M4(5.918e-01, 2.130e-02, 5.753e-01, -6.941e-02, -3.156e-02, -4.438e-02, -6.348e-02, 2.682e-02, -1.078e-02, 9.727e-03, 8.472e-02, 1.460e-01, -1.921e-01, 1.872e-01, 6.067e-02, 3.762e-02), r);
r = MulAdd(s0_3, M4(1.341e-01, 1.082e-01, -4.460e-02, -1.008e-02, -1.262e-01, -7.942e-02, 5.610e-02, 4.418e-02, -1.725e-01, -1.158e-01, 6.377e-03, -1.171e-01, -3.447e-02, 4.459e-02, 2.822e-04, -7.623e-02), r);
r = MulAdd(s0_4, M4(1.994e-01, -2.251e-01, -2.432e-01, 2.467e-02, 3.717e-02, 3.275e-01, 2.005e-01, 1.427e-01, 1.122e-01, 2.864e-01, 1.478e-01, 3.701e-01, 3.111e-01, -1.704e-01, -1.410e-01, -7.490e-01), r);
r = MulAdd(s0_5, M4(-1.392e-01, -2.284e-02, 2.819e-01, -5.560e-02, -2.624e-01, 7.282e-02, -2.417e-01, -5.534e-02, -6.351e-03, -1.714e-01, -1.505e-01, -3.035e-01, -3.580e-02, 4.429e-02, 1.628e-01, -1.101e-01), r);
r = MulAdd(s0_6, M4(8.306e-04, 3.258e-02, -2.746e-02, -3.143e-02, -1.301e-02, -5.828e-02, 2.411e-03, 1.395e-02, 3.728e-02, -8.319e-02, 3.326e-02, 1.294e-01, -6.226e-02, 5.103e-02, -1.218e-02, 2.411e-01), r);
r = MulAdd(s0_7, M4(-6.323e-02, -1.343e-02, 3.400e-02, -1.727e-02, 3.683e-02, 6.325e-02, 4.834e-04, 3.849e-02, 9.424e-03, -2.010e-02, -3.447e-02, -1.330e-01, -4.107e-01, -7.682e-02, 4.138e-01, 5.994e-02), r);
r = MulAdd(s0_8, M4(7.556e-02, 1.846e-02, 1.847e-02, 1.057e-01, -1.140e-01, -2.834e-02, -3.141e-02, -1.045e-01, -2.025e-02, 4.729e-02, -2.822e-02, -4.072e-02, 3.368e-01, 6.871e-02, 1.184e-01, 1.536e-01), r);
r = MulAdd(s1_0, M4(-6.688e-02, 2.483e-02, 1.598e-01, -4.834e-02, 2.141e-01, -4.911e-02, -4.452e-02, -4.879e-02, -9.473e-01, 6.527e-01, -6.118e-01, -2.436e-01, -3.017e-02, -3.402e-01, 1.343e-01, 9.397e-02), r);
r = MulAdd(s1_1, M4(-1.330e-01, 2.557e-01, 6.838e-02, -3.936e-01, 4.806e-01, 1.828e-01, 5.073e-01, 4.502e-01, -1.404e+00, -2.954e-01, -6.745e-02, 5.594e-02, 2.640e-01, 2.330e-02, 1.331e-02, -2.700e-02), r);
r = MulAdd(s1_2, M4(2.695e-01, -1.004e-01, 9.104e-02, -4.919e-01, 3.357e-01, 4.895e-02, 4.062e-01, -3.494e-02, -4.352e-01, -1.232e-01, 8.889e-03, 3.472e-01, -1.174e-01, 7.690e-02, 6.341e-02, 9.255e-02), r);
r = MulAdd(s1_3, M4(1.805e-01, 2.494e-01, 3.474e-02, 3.930e-02, 2.671e-02, -1.438e-02, 7.294e-02, 4.854e-02, -2.864e+00, -5.832e-01, 4.350e-01, -4.265e-01, -2.643e-02, -6.234e-01, 1.283e-01, 5.168e-02), r);
r = MulAdd(s1_4, M4(-2.192e-01, 2.982e-01, -2.860e-01, -4.050e-01, 8.612e-02, 5.008e-02, 5.366e-01, 5.256e-01, -6.222e-01, 1.169e+00, 1.897e+00, 3.009e+00, 9.105e-02, -2.369e-01, -4.718e-01, -2.725e-01), r);
r = MulAdd(s1_5, M4(-7.441e-01, -1.820e-01, -5.828e-02, -6.348e-01, 5.721e-01, 1.143e-01, 2.871e-01, 3.254e-01, -1.446e-01, 1.446e-01, -8.526e-02, 7.228e-01, -9.749e-02, -1.665e-01, -1.116e-01, -2.705e-01), r);
r = MulAdd(s1_6, M4(-6.357e-02, -2.576e-02, 1.277e-02, -3.956e-02, 2.724e-02, -2.141e-02, 9.778e-02, 7.199e-03, -1.153e+00, -6.945e-01, -4.788e-01, -1.246e+00, 1.909e-01, 1.315e-01, 4.454e-02, 2.678e-01), r);
r = MulAdd(s1_7, M4(-1.022e-01, 1.572e-01, 9.404e-02, 6.768e-02, 2.191e-01, -3.163e-02, 1.257e-01, 1.058e-01, -6.394e-01, 7.223e-03, -6.930e-01, -2.963e-01, -2.666e-01, 3.461e-03, 2.203e-01, -1.212e-01), r);
r = MulAdd(s1_8, M4(-1.179e-01, 7.311e-02, 1.371e-01, -4.039e-02, 2.171e-01, 3.131e-02, 2.219e-01, 1.564e-02, -4.895e-01, -5.067e-03, -4.528e-01, 5.694e-02, 6.858e-02, 6.808e-03, -1.017e-01, 6.675e-03), r);
return r;
}
// Pass 3 entry point (conv2, reads t1 through l0(), writes t0).
// One invocation produces one texel: the nine l0() taps at offsets -1..1 in
// x and y are split into max(v, 0) and -max(-v, 0), and f0() of the resulting
// 18 vectors is stored to t0[gxy].
//   blockStart - offset added to the coordinate returned by Rmp8x8().
//   tid        - thread id; only tid.x is read.
void Pass3(uint2 blockStart, uint3 tid) {
	const uint2 extent = GetInputSize();
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	// Invocations outside the GetInputSize() extent have nothing to write.
	if (any(gxy >= extent)) {
		return;
	}
	// The O() macro behind l0() expands to `pos + p * pt`, so these two locals
	// must keep exactly these names.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 within each row).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative half of every tap (first nine arguments of f0).
	const V4 p0 = max(tap0, 0.0);
	const V4 p1 = max(tap1, 0.0);
	const V4 p2 = max(tap2, 0.0);
	const V4 p3 = max(tap3, 0.0);
	const V4 p4 = max(tap4, 0.0);
	const V4 p5 = max(tap5, 0.0);
	const V4 p6 = max(tap6, 0.0);
	const V4 p7 = max(tap7, 0.0);
	const V4 p8 = max(tap8, 0.0);

	// Non-positive half of every tap (last nine arguments of f0).
	const V4 n0 = -max(-tap0, 0.0);
	const V4 n1 = -max(-tap1, 0.0);
	const V4 n2 = -max(-tap2, 0.0);
	const V4 n3 = -max(-tap3, 0.0);
	const V4 n4 = -max(-tap4, 0.0);
	const V4 n5 = -max(-tap5, 0.0);
	const V4 n6 = -max(-tap6, 0.0);
	const V4 n7 = -max(-tap7, 0.0);
	const V4 n8 = -max(-tap8, 0.0);

	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 for pass 4 (conv3): folds 18 four-component inputs into a single V4
// (V4 is MF4, M4 is MF4x4).
// Pass4 calls this with s0_0..s0_8 = max(tap, 0) and s1_0..s1_8 = -max(-tap, 0)
// for the nine l0() taps taken at offsets (-1,-1), (0,-1), (1,-1), (-1,0),
// (0,0), (1,0), (-1,1), (0,1), (1,1), in that order.
// Starting from a constant vector, each input is folded in through MulAdd with
// its own constant M4. The chain is strictly sequential, so the statement order
// determines the floating-point rounding of the result.
// NOTE(review): MulAdd is pulled in by the USE directive and not defined in
// this file; presumably "input * matrix + accumulator" - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting term of the accumulation (presumably the layer bias).
V4 r = { 1.370e-02, 1.151e-02, 2.567e-03, -1.881e-03 };
r = MulAdd(s0_0, M4(-6.123e-02, 9.666e-03, 4.969e-02, 3.030e-02, 1.714e-02, -3.117e-02, -9.470e-02, 2.078e-03, 4.109e-02, -5.560e-02, 3.757e-02, -3.667e-03, -3.500e-02, -8.151e-02, 1.104e-01, -1.219e-01), r);
r = MulAdd(s0_1, M4(9.596e-02, -6.361e-02, 1.162e-02, -3.138e-02, -1.277e-02, -4.005e-02, 1.805e-02, -1.459e-02, -7.903e-03, 1.138e-02, 1.542e-02, -2.357e-02, -1.421e-01, -2.953e-01, 1.322e-01, 6.480e-03), r);
r = MulAdd(s0_2, M4(1.571e-01, -1.081e-01, 1.345e-01, -5.616e-02, -1.211e-02, 4.515e-02, 1.797e-02, 6.143e-02, -9.605e-02, 7.782e-02, -1.421e-01, 3.195e-02, 1.841e-01, -7.735e-02, 1.082e-01, 1.785e-02), r);
r = MulAdd(s0_3, M4(1.739e-03, -4.187e-02, 1.093e-01, 1.042e-01, -6.538e-03, 5.025e-02, -7.052e-03, -1.033e-01, -1.394e-01, -4.638e-01, 4.354e-02, -1.188e-02, 7.809e-04, 2.484e-01, -8.330e-01, -2.787e-01), r);
r = MulAdd(s0_4, M4(-6.489e-03, -6.309e-01, 7.169e-01, 1.557e-01, 1.478e-01, 2.977e-01, -2.818e-01, 5.129e-02, 7.598e-01, 8.124e-01, -1.262e-02, -1.325e-01, -2.764e-01, 3.485e-01, 4.717e-01, -2.467e-01), r);
r = MulAdd(s0_5, M4(2.022e-02, -1.396e-01, 1.865e-01, 1.568e-02, 3.924e-01, -2.466e-01, 4.990e-01, 3.971e-02, -1.176e-01, 1.792e-01, -2.861e-01, 3.555e-02, -1.428e-01, 2.528e-01, -2.085e-01, -1.311e-01), r);
r = MulAdd(s0_6, M4(3.340e-02, -1.203e-01, 1.014e-01, 1.154e-01, -9.031e-03, -5.586e-02, -5.700e-03, 2.391e-02, -3.509e-01, 6.729e-02, 1.004e-01, -3.277e-01, 1.026e-01, 3.286e-03, -6.603e-02, -3.238e-03), r);
r = MulAdd(s0_7, M4(-6.854e-01, 1.013e-01, -6.298e-02, -5.464e-01, 2.486e-01, -2.186e-01, 3.986e-02, 3.800e-01, -1.267e-01, 1.037e-01, 1.538e-01, -2.069e-01, 9.431e-02, 5.337e-02, -8.507e-02, 2.015e-01), r);
r = MulAdd(s0_8, M4(-5.009e-03, 1.493e-01, -3.010e-02, -2.429e-02, -3.137e-01, -2.276e-01, 1.556e-01, 1.452e-02, 2.063e-01, 3.699e-02, -1.675e-03, 8.221e-02, -6.732e-02, 8.296e-02, -8.474e-02, -1.458e-01), r);
r = MulAdd(s1_0, M4(-3.003e-02, -9.777e-03, 1.239e-02, -3.907e-02, 1.841e-01, -8.959e-02, 9.257e-02, 1.333e-01, 5.703e-04, -1.367e-01, -1.026e-01, 6.398e-02, 1.262e-02, 1.101e-02, 4.291e-02, -4.238e-02), r);
r = MulAdd(s1_1, M4(5.516e-02, 9.884e-04, -5.383e-02, -1.048e-02, 2.529e-01, 9.819e-02, 1.255e-01, 3.149e-02, -8.249e-02, -1.386e-02, 6.214e-02, 2.957e-02, 1.001e-01, 1.590e-01, 1.159e-02, 5.273e-02), r);
r = MulAdd(s1_2, M4(4.571e-02, -6.277e-03, 1.496e-01, -4.044e-02, 4.089e-02, -3.801e-02, -3.690e-02, -1.037e-01, -6.031e-02, 2.117e-03, -9.644e-02, 6.392e-02, 5.093e-02, -2.512e-02, 1.131e-01, 1.304e-01), r);
r = MulAdd(s1_3, M4(-3.118e-02, 2.185e-02, 1.763e-01, 8.327e-02, 6.337e-02, 8.724e-02, 6.808e-02, -4.070e-01, -6.922e-02, -2.417e-01, -1.175e-01, -1.845e-01, -3.773e-03, -1.869e-01, -9.345e-02, -2.340e-01), r);
r = MulAdd(s1_4, M4(-1.159e-01, -4.476e-01, 2.989e-01, 2.794e-01, 5.756e-01, -4.803e-01, -5.979e-02, -1.959e-01, 5.261e-02, -2.399e-01, -6.616e-02, -9.243e-01, 4.622e-01, 1.139e-01, 2.482e-01, 2.254e-01), r);
r = MulAdd(s1_5, M4(1.064e-01, -1.989e-02, 8.581e-02, 3.218e-02, 3.344e-01, -5.684e-01, 4.009e-01, 4.482e-01, 7.737e-02, 8.716e-02, -1.382e-01, -7.145e-02, -1.225e-01, 1.471e-01, -1.866e-01, 3.674e-02), r);
r = MulAdd(s1_6, M4(5.376e-02, -6.192e-03, -1.760e-01, 7.590e-02, -3.279e-02, -1.888e-01, 2.057e-01, 2.114e-01, -3.941e-01, 5.584e-03, 9.400e-03, -4.289e-01, -2.289e-01, 1.880e-01, 3.184e-02, -4.442e-01), r);
r = MulAdd(s1_7, M4(-4.174e-01, -1.344e-01, 3.866e-02, 4.521e-02, -4.215e-01, 1.479e-01, 2.476e-01, -7.051e-01, -4.153e-01, 3.373e-01, 8.098e-02, -6.680e-01, 3.920e-01, -1.023e-01, -2.166e-02, 3.816e-01), r);
r = MulAdd(s1_8, M4(-3.441e-02, 3.404e-03, -4.958e-02, 9.652e-03, -1.930e-02, -2.470e-01, 1.610e-01, 1.112e-01, 2.574e-02, 2.310e-01, 3.643e-02, -5.044e-02, 7.788e-02, 1.923e-03, -7.115e-02, -6.575e-03), r);
return r;
}
// Pass 4 entry point. Each invocation handles one pixel: it reads the 3x3
// neighbourhood through l0(), splits every sample into a non-negative and a
// non-positive part, and stores the f0() result to t1.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass4(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are not named again below; presumably the l0()/O() macro
	// expansion refers to them, so the identifiers must stay as they are.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw neighbourhood, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	t1[gxy] = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): fetch t1 at offset (x, y) from the current pixel via the O() macro,
// converted to V4. Expands in place at the call site.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 5 kernel. Pass5() calls it with the 3x3 neighbourhood of t1, row by row
// from offset (-1, -1) to (1, 1):
//   s0_0..s0_8 - samples clamped to >= 0,
//   s1_0..s1_8 - the same samples clamped to <= 0.
// Returns the accumulator after all 18 MulAdd steps; Pass5() stores it to t0.
// MulAdd is defined outside this block — presumably vector * 4x4 matrix + accumulator; confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { 7.249e-03, 2.949e-03, 5.297e-03, 3.693e-03 };
// One constant 4x4 matrix per neighbour for the non-negative inputs.
r = MulAdd(s0_0, M4(2.376e-02, 2.931e-02, 7.304e-02, -5.238e-02, -6.500e-03, -3.887e-02, 2.506e-02, 5.201e-03, 5.599e-02, -1.951e-01, -3.847e-01, 8.685e-02, -1.106e-01, -3.954e-02, 1.571e-01, 2.293e-02), r);
r = MulAdd(s0_1, M4(-2.738e-02, 1.554e-01, 1.120e-01, 1.856e-02, 9.513e-03, -2.222e-01, -2.174e-01, -1.065e-02, 3.001e-02, 7.638e-02, -7.497e-02, -2.727e-02, -1.521e-02, 1.843e-01, 3.547e-01, -1.642e-02), r);
r = MulAdd(s0_2, M4(-2.533e-02, -1.959e-02, -6.274e-02, 8.121e-03, -8.703e-03, 5.091e-02, 6.548e-02, 1.988e-02, 4.089e-02, -4.827e-02, -4.089e-02, -4.361e-02, -1.112e-02, -1.101e-02, 2.968e-02, -2.196e-03), r);
r = MulAdd(s0_3, M4(1.813e-02, -2.087e-01, -2.474e-01, -1.066e-01, 2.549e-01, 6.466e-01, 3.169e-01, -1.109e-01, -1.551e-02, -3.119e-01, -3.959e-01, 2.141e-01, 1.121e-01, 3.268e-01, 1.038e-01, -5.818e-02), r);
r = MulAdd(s0_4, M4(-3.147e-01, 2.716e-01, 1.304e-01, 3.887e-01, 9.396e-02, -9.787e-02, -1.596e-01, -7.138e-02, -2.462e-01, -3.027e-01, 6.980e-01, -1.546e-01, 3.730e-02, -7.502e-02, -4.408e-02, 3.814e-02), r);
r = MulAdd(s0_5, M4(-4.177e-02, -1.326e-02, -7.497e-02, 1.168e-03, 5.595e-03, 3.603e-02, 2.589e-02, -2.179e-02, 1.998e-02, -3.544e-03, 1.125e-01, 2.648e-03, -2.417e-02, -1.876e-02, 4.009e-02, 5.481e-02), r);
r = MulAdd(s0_6, M4(-7.181e-02, -2.968e-02, -3.169e-02, -1.899e-02, -3.692e-02, -2.156e-02, 9.595e-02, 1.055e-01, -1.274e-01, -2.576e-02, 8.706e-02, 1.895e-01, 6.316e-04, -4.574e-02, 2.201e-02, 1.199e-01), r);
r = MulAdd(s0_7, M4(-2.193e-01, 1.563e-02, 1.287e-01, 2.403e-01, 2.222e-01, -1.748e-02, 1.486e-02, -7.685e-02, 4.971e-01, 2.920e-01, -2.253e-01, -8.145e-01, 3.018e-01, -4.559e-02, -1.509e-01, -3.003e-01), r);
r = MulAdd(s0_8, M4(1.685e-02, -1.082e-02, 3.539e-03, -2.765e-02, -5.968e-03, -4.628e-03, 3.847e-02, 6.426e-02, -6.284e-02, 5.455e-02, -3.291e-02, 1.636e-01, 5.828e-02, -5.613e-02, -4.404e-02, -1.715e-02), r);
// Separate matrices for the non-positive inputs.
r = MulAdd(s1_0, M4(1.875e-02, 7.150e-02, 3.015e-02, -4.917e-02, 9.333e-03, -1.519e-01, -1.153e-01, 4.344e-02, -1.603e-02, -4.775e-02, -4.484e-02, 6.567e-02, -6.714e-02, 2.569e-01, 4.638e-01, 3.038e-02), r);
r = MulAdd(s1_1, M4(-4.046e-02, 1.372e-01, 2.476e-01, 6.565e-02, 6.481e-04, -1.529e-02, 1.376e-02, 1.367e-02, 2.941e-04, 1.423e-01, 2.311e-01, 7.538e-03, -6.762e-02, -3.992e-01, -1.160e-02, 3.123e-02), r);
r = MulAdd(s1_2, M4(-3.926e-02, 1.709e-04, -4.761e-02, -8.731e-03, 5.123e-03, 7.039e-02, 1.061e-01, -1.322e-03, 4.069e-02, -1.182e-01, -3.698e-04, -7.746e-02, -3.827e-02, 9.957e-02, 9.991e-02, 5.215e-02), r);
r = MulAdd(s1_3, M4(-1.865e-01, -9.784e-01, -5.871e-01, 1.384e-01, 2.097e-01, -1.229e-01, -4.912e-01, -4.254e-02, 3.395e-04, -8.968e-02, -6.923e-02, -4.916e-02, 2.424e-01, 7.730e-01, 2.573e-01, -2.380e-01), r);
r = MulAdd(s1_4, M4(-9.293e-01, 6.176e-01, 1.970e-01, 3.467e-01, 4.341e-01, 9.866e-01, 3.035e-01, -1.062e-01, -1.501e-01, 2.709e-01, 1.991e-01, -2.164e-01, 2.881e-01, -1.696e-01, -4.141e-01, -1.004e+00), r);
r = MulAdd(s1_5, M4(-8.323e-02, -1.285e-02, -3.468e-02, 1.551e-01, 1.330e-01, -1.238e-01, -1.675e-03, 5.588e-02, 2.128e-01, -2.327e-01, -2.891e-02, 1.567e-01, -1.448e-01, 8.781e-02, 3.254e-02, 7.142e-02), r);
r = MulAdd(s1_6, M4(1.231e-01, 5.139e-02, -9.426e-02, -2.822e-01, 1.761e-03, 6.853e-03, 1.165e-01, 7.861e-02, -9.715e-03, 5.489e-03, -1.066e-02, -8.332e-03, -9.111e-02, 3.911e-02, 1.757e-01, 2.222e-01), r);
r = MulAdd(s1_7, M4(2.275e-02, 1.199e-01, 5.904e-02, -2.051e-01, 6.950e-01, 1.592e-02, -9.888e-02, -6.701e-01, -9.096e-02, 3.203e-02, 1.204e-01, 2.153e-01, 1.448e-01, -5.225e-03, 6.786e-02, 2.005e-02), r);
r = MulAdd(s1_8, M4(-3.290e-02, -3.758e-02, -3.158e-02, 8.713e-02, 3.917e-02, 4.275e-02, -2.450e-02, 3.970e-02, 1.928e-01, 5.498e-02, -5.673e-02, -3.743e-01, 4.981e-02, -1.785e-02, 1.958e-02, 3.487e-02), r);
return r;
}
// Pass 5 entry point. Each invocation handles one pixel: it reads the 3x3
// neighbourhood of t1 through l0(), splits every sample into a non-negative and
// a non-positive part, and stores the f0() result to t0.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass5(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are not named again below; presumably the l0()/O() macro
	// expansion refers to them, so the identifiers must stay as they are.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw neighbourhood, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	t0[gxy] = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// l0(x, y): fetch t0 at offset (x, y) from the current pixel via the O() macro,
// converted to V4. Expands in place at the call site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 6 kernel. Pass6() calls it with the 3x3 neighbourhood of t0, row by row
// from offset (-1, -1) to (1, 1):
//   s0_0..s0_8 - samples clamped to >= 0,
//   s1_0..s1_8 - the same samples clamped to <= 0.
// Returns tanh of the accumulator; Pass6() adds one component to the luma of
// each pixel in a 2x2 output quad.
// MulAdd is defined outside this block — presumably vector * 4x4 matrix + accumulator; confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { 2.510e-03, 4.409e-03, 2.891e-03, 4.977e-03 };
// One constant 4x4 matrix per neighbour for the non-negative inputs.
r = MulAdd(s0_0, M4(2.340e-02, 8.171e-02, -1.124e-01, -5.065e-02, -5.505e-02, -5.540e-02, -3.000e-03, -1.346e-02, 3.800e-02, 4.944e-02, -2.084e-02, 6.388e-03, 8.566e-02, 2.480e-02, 1.184e-01, -1.075e-04), r);
r = MulAdd(s0_1, M4(-2.188e-02, -2.056e-01, 1.480e-02, -7.451e-02, 5.240e-02, 4.098e-02, -4.668e-03, 1.810e-02, -2.533e-02, -6.403e-02, 1.984e-02, -5.716e-02, -3.356e-03, -2.173e-01, 1.218e-01, 1.179e-01), r);
r = MulAdd(s0_2, M4(7.330e-03, 2.521e-02, 1.372e-02, 3.411e-02, -1.438e-02, -1.009e-02, 7.676e-03, -1.712e-02, 5.980e-03, 2.040e-02, -8.766e-03, 3.442e-02, -1.623e-02, -2.557e-02, -6.086e-03, 5.413e-04), r);
r = MulAdd(s0_3, M4(1.754e-01, 6.364e-02, 2.842e-01, 2.378e-01, -1.684e-01, -1.911e-02, -3.838e-01, -2.622e-02, 2.065e-01, 3.951e-02, 4.217e-01, 4.374e-02, -1.028e-02, 2.417e-02, -1.595e-02, 6.305e-02), r);
r = MulAdd(s0_4, M4(-5.620e-02, -8.609e-02, -1.256e-01, -3.166e-01, -1.712e-01, -1.602e-01, -1.577e-01, -4.901e-01, -5.012e-02, 1.082e-01, -7.271e-02, 4.072e-01, -7.789e-02, -1.725e-01, -1.397e-01, -4.507e-01), r);
r = MulAdd(s0_5, M4(1.401e-02, 4.716e-02, 1.486e-02, 4.642e-02, 1.131e-02, 3.865e-02, -9.865e-03, 9.301e-02, 3.441e-03, -8.098e-03, -6.012e-03, -1.549e-01, 1.486e-02, 1.872e-02, -2.469e-03, 1.294e-02), r);
r = MulAdd(s0_6, M4(-3.894e-02, -4.136e-05, -3.022e-02, 1.045e-03, -3.730e-02, -1.838e-02, -5.573e-02, -2.760e-02, 3.516e-02, 1.602e-02, 6.358e-02, 3.111e-02, -3.045e-02, -7.728e-03, -4.189e-02, -1.102e-02), r);
r = MulAdd(s0_7, M4(-1.184e-02, 1.728e-02, 7.925e-03, 6.763e-02, 2.590e-03, -9.456e-03, -4.407e-02, -2.044e-02, 4.472e-02, 2.228e-02, 7.233e-02, 4.863e-02, -1.814e-02, -2.034e-03, -4.994e-02, -2.460e-02), r);
r = MulAdd(s0_8, M4(-3.292e-03, -9.015e-03, -3.171e-03, -2.504e-02, 2.120e-03, 3.064e-02, 2.108e-02, 4.592e-02, 2.258e-03, -2.192e-04, -3.576e-03, 3.733e-02, -1.931e-03, -5.083e-03, 5.877e-03, -1.764e-02), r);
// Separate matrices for the non-positive inputs.
r = MulAdd(s1_0, M4(4.321e-02, -8.135e-02, -1.567e-01, -6.888e-03, -6.542e-02, -1.656e-02, 1.236e-02, -7.563e-03, 4.657e-02, 9.222e-03, -6.696e-03, -3.545e-03, -6.401e-01, 1.189e-01, 1.509e-01, 2.417e-01), r);
r = MulAdd(s1_1, M4(-2.058e-02, 1.174e-01, -2.482e-02, -8.423e-02, -1.692e-02, -1.094e-02, 3.530e-02, 1.780e-02, -9.937e-02, -9.030e-02, 2.304e-02, 1.294e-02, 7.976e-02, -3.096e-01, 1.382e-01, 2.456e-01), r);
r = MulAdd(s1_2, M4(4.491e-02, -1.336e-02, 3.593e-02, -3.503e-02, -8.630e-03, -4.295e-03, -1.356e-02, 3.843e-02, 9.887e-03, 1.913e-03, 2.247e-03, 1.113e-02, -7.234e-04, -3.058e-02, 2.833e-03, -1.707e-02), r);
r = MulAdd(s1_3, M4(2.007e-01, 6.756e-02, 9.393e-01, 9.057e-02, -3.701e-01, -1.729e-02, -4.136e-01, 2.233e-02, 2.783e-01, 3.590e-02, 3.564e-01, 8.342e-03, 1.333e-01, 7.944e-02, -2.312e-01, 8.354e-02), r);
r = MulAdd(s1_4, M4(-3.334e-01, -2.705e-01, -4.072e-01, 3.946e-01, 5.159e-03, -5.860e-01, 1.578e-01, -3.614e-01, 5.366e-01, 4.699e-01, -3.700e-01, 9.463e-02, -4.090e-02, -9.767e-02, -7.999e-02, -4.859e-01), r);
r = MulAdd(s1_5, M4(5.700e-02, 6.092e-02, 4.114e-02, -1.564e-02, -1.345e-02, 9.692e-02, 1.456e-03, 9.371e-02, -3.845e-02, -4.751e-02, -2.509e-02, -2.842e-01, 2.938e-03, 2.387e-02, -6.191e-04, -3.120e-04), r);
r = MulAdd(s1_6, M4(3.888e-02, 4.969e-02, -1.851e-01, -9.866e-03, -3.527e-02, -1.377e-02, -7.594e-02, -2.619e-02, 3.259e-02, 9.636e-03, 8.622e-03, 1.788e-02, -3.505e-02, -1.048e-03, -1.329e-02, 1.425e-02), r);
r = MulAdd(s1_7, M4(6.891e-03, 8.118e-02, -6.443e-02, -1.487e-01, 2.183e-02, 1.106e-03, 6.656e-02, -9.506e-02, 7.418e-04, -6.015e-02, 3.594e-01, 1.039e-02, -3.600e-02, -7.771e-03, -3.406e-02, 2.935e-02), r);
r = MulAdd(s1_8, M4(-4.598e-03, -4.678e-03, 1.595e-02, -8.273e-03, 6.740e-03, 1.175e-02, -2.997e-02, -6.116e-03, -3.788e-02, -9.471e-02, -2.149e-02, 4.139e-02, -9.614e-03, -5.573e-03, -1.643e-02, -1.712e-02), r);
// Squash the result into (-1, 1) before it is used as a luma offset.
return tanh(r);
}
// Pass 6 entry point ("out-shuffle"). Each invocation produces a 2x2 quad of
// OUTPUT pixels from one t0 texel neighbourhood: f0() yields four luma offsets,
// and each is added to the luma of INPUT sampled at the matching output pixel.
//   blockStart - added to the remapped (and doubled) thread offset to form the
//                top-left output coordinate of the quad.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass6(uint2 blockStart, uint3 tid) {
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	if (any(gxy >= GetOutputSize())) {
		// Quad origin is outside the output image.
		return;
	}

	// `pt` is not named again below; presumably the l0()/O() macro expansion
	// refers to it together with `pos`, so both identifiers must stay as they are.
	const float2 pt = float2(GetInputPt());
	// Centre of the texel at half the output coordinate, scaled by pt.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw neighbourhood of t0, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	const V4 lumaDelta = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };

	// Step back by half of GetOutputPt() to reach the quad's first pixel; the
	// position is then walked around the quad in the same order as gxy.
	const float2 opt = float2(GetOutputPt());
	pos -= 0.5f * opt;

	// (x, y): offset .x
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + lumaDelta.x);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// (x + 1, y): offset .y
	gxy.x += 1;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + lumaDelta.y);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// (x + 1, y + 1): offset .w
	gxy.y += 1;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + lumaDelta.w);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

	// (x, y + 1): offset .z
	gxy.x -= 1;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.r = saturate(yuv.r + lumaDelta.z);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);
}

View file

@ -1,484 +0,0 @@
// CuNNy 4x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N04
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
// Source image; pass 1 samples it through O().
//!TEXTURE
Texture2D INPUT;
// Result image, declared at twice the input width and height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// Point-filtered sampler; this is the one O() uses.
//!SAMPLER
//!FILTER POINT
SamplerState SP;
// Linear-filtered sampler.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with SP at mip 0, at `pos + p * pt`.
// Expands in place, so the calling function must define locals named `pos`
// and `pt` before using it (the PassN functions compute pos = (gxy + 0.5) * pt).
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Shorthand for the vector and matrix types used by the f0 kernels.
#define V4 MF4
#define M4 MF4x4
// t0 and t1: two input-sized R8G8B8A8_SNORM intermediates. The passes alternate
// between them (pass 1 writes t0, pass 2 reads t0 and writes t1, and so on).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): sample INPUT at offset (x, y) via O() and reduce its RGB to one
// scalar: a fixed weighted sum of the three channels plus a constant.
#define l0(x, y) (dot(MF3(-4.174e-01, -7.873e-01, -1.763e-01), O(INPUT, float2(x, y)).rgb) + MF(1.011e+00))
// Pass 1 kernel. Pass1() calls it with the nine l0() scalars of the 3x3
// neighbourhood, row by row from offset (-1, -1) to (1, 1). Each scalar scales
// its own constant 4-vector, and the products are summed onto the initial
// value of r. Pass1() stores the result to t0.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { 3.566e-02, -1.308e-03, -5.595e-03, -5.246e-03 };
r = mad(s0_0, V4(1.222e-01, 7.038e-03, 1.179e-01, 1.876e-01), r);
r = mad(s0_1, V4(1.025e-01, -2.993e-01, 3.154e-01, -1.050e-01), r);
r = mad(s0_2, V4(5.656e-02, -3.117e-03, -6.665e-02, -2.044e-01), r);
r = mad(s0_3, V4(-5.045e-01, -4.189e-01, -3.076e-01, -3.691e-01), r);
r = mad(s0_4, V4(1.365e-01, 6.699e-01, 3.389e-01, 4.561e-01), r);
r = mad(s0_5, V4(-7.690e-02, 2.655e-02, -1.044e-02, 7.271e-02), r);
r = mad(s0_6, V4(1.358e-02, 3.378e-03, -1.802e-01, -1.936e-01), r);
r = mad(s0_7, V4(8.227e-02, 1.550e-02, -1.820e-01, -1.670e-01), r);
r = mad(s0_8, V4(9.988e-03, 1.413e-03, -2.486e-02, 3.258e-01), r);
return r;
}
// Pass 1 entry point ("in"). Each invocation handles one pixel: it reduces the
// 3x3 neighbourhood of INPUT to nine scalars through l0() and stores the f0()
// result to t0.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are consumed by the O() macro inside l0(); the names must not change.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Arguments run row by row from offset (-1, -1) to (1, 1).
	t0[gxy] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-sample t0 at offset (x, y) from the current pixel via O(),
// converted to V4. Expands in place at the call site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 2 kernel. Pass2() calls it with the 3x3 neighbourhood of t0, row by row
// from offset (-1, -1) to (1, 1):
//   s0_0..s0_8 - samples clamped to >= 0,
//   s1_0..s1_8 - the same samples clamped to <= 0.
// Returns the accumulator after all 18 MulAdd steps; Pass2() stores it to t1.
// MulAdd comes from the `//!USE MulAdd` header directive — presumably
// vector * 4x4 matrix + accumulator; confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { -4.317e-03, 2.687e-03, -1.530e-03, 4.681e-04 };
// One constant 4x4 matrix per neighbour for the non-negative inputs.
r = MulAdd(s0_0, M4(1.282e-01, 1.199e-01, 1.156e-01, -4.091e-02, -1.771e-02, -1.431e-01, -1.478e-02, 4.041e-02, -1.559e-01, 1.231e-02, -8.571e-02, 2.159e-02, -6.484e-02, 3.819e-02, -3.386e-02, -3.344e-02), r);
r = MulAdd(s0_1, M4(6.131e-02, 1.493e-01, 1.954e-01, -2.565e-01, 1.570e-01, -3.852e-01, -2.313e-01, 9.262e-02, 1.038e-01, -4.169e-01, -2.446e-01, 9.953e-02, -1.830e-01, -9.774e-02, -1.498e-01, 8.626e-02), r);
r = MulAdd(s0_2, M4(9.908e-02, 1.372e-01, -1.254e-02, 4.486e-03, 1.023e-01, 6.484e-02, 1.645e-01, -4.932e-02, -4.221e-02, -1.919e-01, -2.135e-02, 6.955e-02, -1.406e-01, 8.082e-02, -7.935e-02, 3.010e-02), r);
r = MulAdd(s0_3, M4(-7.203e-02, -1.210e-01, 1.084e-01, -6.958e-03, 1.303e-01, 1.030e-01, -2.392e-01, -1.084e-01, 2.173e-01, -7.864e-02, -2.983e-01, -3.510e-01, -3.076e-01, 4.533e-02, 1.940e-01, 4.051e-01), r);
r = MulAdd(s0_4, M4(9.270e-02, -4.072e-01, 2.338e-01, 4.098e-01, -1.440e-01, 6.971e-01, 5.515e-01, 2.682e-01, -1.401e-01, 3.504e-02, 1.366e-01, 6.149e-01, -3.330e-01, 1.880e-01, -4.170e-01, 3.244e-01), r);
r = MulAdd(s0_5, M4(-5.380e-01, -7.843e-02, -1.293e-01, -9.225e-02, 1.393e-01, -2.588e-01, 4.618e-01, -2.264e-02, -5.369e-02, 1.321e-01, -3.029e-02, 7.983e-02, -1.048e-01, 3.279e-02, -5.969e-02, -3.766e-03), r);
r = MulAdd(s0_6, M4(3.432e-02, 1.518e-02, 1.940e-02, -1.086e-01, 1.052e-01, -5.430e-02, -3.343e-02, 1.824e-01, -9.831e-02, 1.097e-02, 6.281e-02, 1.194e-01, 3.253e-02, 4.046e-02, -2.183e-02, -1.328e-01), r);
r = MulAdd(s0_7, M4(1.538e-01, 6.796e-02, -4.870e-01, 7.139e-02, -2.497e-01, 2.916e-02, 6.191e-01, -2.650e-01, -4.194e-02, 1.782e-01, -3.431e-01, -9.707e-02, 2.173e-02, -1.150e-01, -8.162e-03, 4.551e-02), r);
r = MulAdd(s0_8, M4(5.804e-02, 5.436e-02, -1.604e-01, 8.077e-02, 2.685e-01, 4.741e-02, 1.225e-01, -1.033e-01, -4.358e-02, -1.091e-01, 8.815e-02, -3.121e-02, -2.569e-02, -1.093e-02, -2.550e-02, -1.571e-02), r);
// Separate matrices for the non-positive inputs.
r = MulAdd(s1_0, M4(8.760e-02, 1.254e-01, 9.299e-02, -1.140e-02, 4.179e-02, -1.333e-01, 3.048e-03, -3.111e-02, -6.091e-02, 6.563e-03, 4.609e-03, -4.717e-02, -6.470e-02, -5.791e-02, -5.529e-03, 8.697e-02), r);
r = MulAdd(s1_1, M4(6.935e-02, 9.805e-02, 1.851e-01, -2.726e-01, 1.731e-01, -2.863e-01, -2.267e-01, -3.813e-02, 1.104e-01, -3.193e-01, -1.958e-01, 9.567e-02, 1.819e-01, -2.054e-01, 1.228e-01, 3.906e-02), r);
r = MulAdd(s1_2, M4(-1.957e-01, 7.733e-02, -2.023e-01, 1.297e-01, -1.646e-01, 1.304e-01, -1.728e-02, -4.396e-02, 7.828e-02, -2.639e-01, 3.389e-02, 1.101e-01, 1.388e-01, -4.075e-03, 1.023e-01, -7.785e-03), r);
r = MulAdd(s1_3, M4(-2.828e-02, -7.018e-02, 4.269e-02, -1.386e-01, 2.143e-02, 2.504e-01, -2.134e-01, -2.483e-01, 1.075e-01, -2.671e-02, -2.588e-01, -3.271e-01, 1.173e-01, -6.103e-02, 5.539e-01, 5.341e-01), r);
r = MulAdd(s1_4, M4(-2.415e-01, -2.975e-01, -6.622e-02, 4.027e-01, -5.871e-01, 7.506e-01, 1.939e-02, -1.680e-01, 4.796e-01, -2.840e-01, 5.077e-01, 9.122e-02, 1.463e-01, 2.124e-01, 6.358e-02, 2.993e-01), r);
r = MulAdd(s1_5, M4(4.298e-01, -1.754e-01, 5.357e-01, -1.440e-01, -4.439e-01, -3.819e-01, -1.009e-01, 2.113e-02, -2.275e-02, -1.842e-02, 1.441e-01, 6.590e-03, 2.627e-02, 3.381e-02, 9.956e-02, -1.935e-02), r);
r = MulAdd(s1_6, M4(-5.557e-02, 3.378e-02, -2.451e-02, -1.718e-01, -2.037e-01, 1.631e-02, -2.822e-01, -7.724e-02, -6.657e-02, -2.282e-02, 2.673e-02, 8.716e-02, 1.291e-01, 9.472e-03, 3.810e-02, -1.134e-01), r);
r = MulAdd(s1_7, M4(1.441e-01, 4.331e-02, -4.741e-01, 2.165e-01, -5.974e-01, -2.669e-02, -4.949e-02, -3.179e-01, 1.007e-01, 1.512e-01, -4.138e-02, -7.470e-02, 8.828e-02, -1.400e-01, 5.797e-02, -4.988e-03), r);
r = MulAdd(s1_8, M4(-2.478e-01, 1.392e-01, -8.663e-02, -3.629e-02, 1.823e-01, 7.573e-03, -2.445e-01, -1.641e-02, -5.197e-02, -8.804e-02, 1.244e-01, 2.095e-02, 1.683e-02, -4.073e-02, -5.207e-03, -3.854e-03), r);
return r;
}
// Pass 2 entry point ("conv1"). Each invocation handles one pixel: it
// point-samples the 3x3 neighbourhood of t0, splits every sample into a
// non-negative and a non-positive part, and stores the f0() result to t1.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are consumed by the O() macro inside l0(); the names must not change.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw neighbourhood, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	t1[gxy] = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-sample t1 at offset (x, y) from the current pixel via O(),
// converted to V4. Expands in place at the call site.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 3 kernel. Pass3() calls it with the 3x3 neighbourhood of t1, row by row
// from offset (-1, -1) to (1, 1):
//   s0_0..s0_8 - samples clamped to >= 0,
//   s1_0..s1_8 - the same samples clamped to <= 0.
// Returns the accumulator after all 18 MulAdd steps; Pass3() stores it to t0.
// MulAdd comes from the `//!USE MulAdd` header directive — presumably
// vector * 4x4 matrix + accumulator; confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { -1.043e-03, 3.601e-03, 5.622e-03, -7.848e-04 };
// One constant 4x4 matrix per neighbour for the non-negative inputs.
r = MulAdd(s0_0, M4(1.921e-01, -2.132e-02, -5.460e-03, -6.681e-02, 9.988e-02, -2.228e-02, 4.719e-02, 9.124e-03, -1.072e-01, 1.506e-01, 2.070e-02, -4.671e-02, 2.244e-01, -4.895e-02, -8.150e-03, -9.520e-02), r);
r = MulAdd(s0_1, M4(8.226e-02, 4.651e-02, -1.842e-01, -3.376e-02, 1.349e-01, 2.148e-02, -1.746e-01, 1.671e-02, 9.761e-02, 7.581e-02, 1.470e-01, -8.582e-02, -1.149e-01, 2.143e-02, -1.597e-01, 1.626e-01), r);
r = MulAdd(s0_2, M4(-5.810e-04, -3.566e-02, 4.708e-02, -3.068e-02, 1.578e-02, 5.503e-03, 3.081e-02, -4.174e-02, 3.394e-01, 7.398e-02, -9.467e-02, -1.127e-01, -1.314e-01, 1.511e-02, 1.538e-01, -5.695e-03), r);
r = MulAdd(s0_3, M4(2.959e-01, 3.316e-02, -5.716e-02, -2.233e-01, 5.020e-01, -1.416e-01, -6.082e-02, -3.393e-01, 3.292e-01, -6.813e-02, 9.009e-02, -1.638e-01, 1.190e-01, -2.728e-02, -6.042e-02, -1.360e-01), r);
r = MulAdd(s0_4, M4(5.902e-01, 3.040e-01, -2.870e-01, 2.228e-02, -1.646e-01, 2.078e-02, -1.480e-01, 2.083e-01, -4.397e-01, -2.549e-01, -1.168e-01, -4.199e-01, 2.199e-01, 2.596e-02, 2.598e-02, -1.313e-01), r);
r = MulAdd(s0_5, M4(1.043e-01, 1.050e-02, -5.654e-02, -1.265e-01, -1.978e-01, 3.772e-02, 2.474e-01, 1.395e-01, 2.041e-01, 6.617e-02, -2.602e-01, -1.601e-01, -5.577e-02, -1.591e-02, 2.096e-01, 2.594e-02), r);
r = MulAdd(s0_6, M4(7.245e-02, 6.156e-02, 5.317e-02, -3.912e-01, 1.871e-01, -2.079e-02, -2.552e-02, -6.961e-02, 2.686e-01, 8.518e-02, -1.026e-01, -4.040e-01, -6.324e-02, 7.999e-03, 1.317e-02, 1.619e-02), r);
r = MulAdd(s0_7, M4(1.240e-01, -8.349e-02, -1.258e-01, -3.269e-01, 6.624e-01, -1.357e-01, -6.738e-01, -5.998e-01, -8.375e-04, 2.226e-01, -1.880e-01, 5.678e-02, -8.383e-02, -3.455e-02, -1.399e-02, 4.540e-02), r);
r = MulAdd(s0_8, M4(-3.130e-02, 9.691e-02, 1.763e-01, -1.847e-02, -1.193e-01, -7.494e-03, 1.485e-02, 1.244e-02, 9.559e-02, 3.116e-02, 8.046e-03, -1.264e-01, -2.403e-01, 6.389e-02, 2.999e-01, 1.484e-01), r);
// Separate matrices for the non-positive inputs.
r = MulAdd(s1_0, M4(2.569e-01, -8.689e-03, -1.806e-02, -3.993e-02, 9.155e-02, -2.022e-02, 1.034e-02, -3.455e-02, -1.534e-01, 1.836e-02, -1.176e-03, 3.593e-03, 2.642e-01, -6.587e-02, -4.169e-02, -2.237e-01), r);
r = MulAdd(s1_1, M4(1.398e-01, 1.020e-02, -2.478e-01, 2.747e-02, 7.152e-02, 1.835e-02, -2.013e-01, 1.151e-02, -2.586e-01, -3.622e-02, 2.529e-01, 1.465e-01, -3.973e-01, 5.907e-02, -9.450e-02, 3.761e-02), r);
r = MulAdd(s1_2, M4(3.157e-02, 7.847e-03, 8.109e-03, -3.333e-02, -3.333e-02, -6.401e-03, -6.632e-03, 3.296e-02, -1.433e-02, 2.167e-02, 1.194e-01, -1.028e-01, -2.104e-01, 1.352e-02, -6.835e-02, 1.901e-01), r);
r = MulAdd(s1_3, M4(3.443e-01, -1.004e-01, -6.176e-02, -3.047e-01, 4.779e-01, -7.928e-02, -8.134e-02, -4.873e-01, -1.421e-01, 3.972e-02, 7.459e-02, 2.099e-01, 1.118e-01, -1.022e-02, -8.584e-02, -1.657e-01), r);
r = MulAdd(s1_4, M4(-1.721e-01, 2.625e-02, -7.292e-03, 2.646e-01, 2.505e-02, 1.479e-01, -3.357e-01, 1.088e-01, 1.016e-01, -1.902e-01, -1.622e-01, -6.326e-02, -4.305e-01, 4.763e-01, -1.357e-03, -5.685e-01), r);
r = MulAdd(s1_5, M4(3.324e-03, 1.692e-02, -5.726e-02, 2.853e-02, -3.135e-01, -4.534e-03, 2.549e-01, 1.183e-01, -1.277e-01, -5.030e-02, 9.190e-02, 1.145e-01, 3.445e-01, 6.425e-02, -2.707e-01, -1.701e-01), r);
r = MulAdd(s1_6, M4(2.164e-02, 1.998e-02, 1.667e-02, -6.126e-02, 2.400e-01, -9.253e-02, -4.525e-02, 8.615e-03, 5.148e-02, -1.803e-02, -7.495e-02, -7.102e-02, -2.646e-02, 6.819e-02, 1.465e-01, 1.904e-01), r);
r = MulAdd(s1_7, M4(-2.339e-02, 3.350e-02, -1.274e-01, 5.525e-02, 9.120e-01, -9.074e-01, -6.856e-01, -7.422e-02, 4.849e-02, -1.377e-02, -1.409e-01, -5.792e-02, -1.044e-01, 9.079e-02, 2.520e-01, 2.053e-01), r);
r = MulAdd(s1_8, M4(1.891e-02, -1.562e-02, -1.024e-02, -2.686e-02, -1.038e-01, -3.210e-02, 4.222e-01, -2.084e-01, -1.841e-01, 3.231e-02, 7.320e-02, 1.727e-01, 2.861e-01, 2.506e-02, -2.266e-01, -3.940e-01), r);
return r;
}
// Pass 3 entry point ("conv2"). Each invocation handles one pixel: it
// point-samples the 3x3 neighbourhood of t1, splits every sample into a
// non-negative and a non-positive part, and stores the f0() result to t0.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass3(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are consumed by the O() macro inside l0(); the names must not change.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw neighbourhood, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	t0[gxy] = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-sample t0 at offset (x, y) from the current pixel via O(),
// converted to V4. Expands in place at the call site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 4 kernel. Pass4() calls it with the 3x3 neighbourhood of t0, row by row
// from offset (-1, -1) to (1, 1):
//   s0_0..s0_8 - samples clamped to >= 0,
//   s1_0..s1_8 - the same samples clamped to <= 0.
// Returns the accumulator after all 18 MulAdd steps; Pass4() stores it to t1.
// MulAdd comes from the `//!USE MulAdd` header directive — presumably
// vector * 4x4 matrix + accumulator; confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Starting value of the accumulator (constant per-channel offset).
V4 r = { -6.562e-04, 7.371e-04, -4.319e-03, -8.757e-04 };
// One constant 4x4 matrix per neighbour for the non-negative inputs.
r = MulAdd(s0_0, M4(-7.801e-03, 7.517e-03, 5.348e-02, 7.686e-02, -8.770e-03, 1.144e-02, -2.398e-02, 1.355e-02, -4.642e-02, 5.880e-02, 3.263e-02, 1.860e-01, -4.443e-02, -2.732e-02, -2.133e-02, -1.166e-01), r);
r = MulAdd(s0_1, M4(-1.751e-02, -1.230e-02, -1.218e-01, -1.231e-01, 4.092e-03, -8.769e-03, -2.251e-03, 5.142e-02, 4.354e-03, -4.445e-02, -2.369e-01, -1.616e-01, 4.495e-03, -1.326e-01, -5.371e-01, -5.119e-01), r);
r = MulAdd(s0_2, M4(3.143e-02, 2.366e-02, 8.884e-02, -1.819e-02, 2.358e-03, 3.812e-04, -4.972e-02, -5.311e-02, 1.729e-02, 1.523e-02, 7.798e-02, -1.705e-05, -2.295e-02, 6.567e-02, 1.422e-01, 1.890e-01), r);
r = MulAdd(s0_3, M4(2.363e-02, 1.555e-02, -1.307e-01, -8.190e-02, 1.026e-02, 9.724e-03, 5.358e-02, -2.783e-01, 7.268e-03, 1.659e-01, -5.801e-02, 3.076e-01, -1.575e-01, -9.567e-02, 3.294e-02, -7.694e-01), r);
r = MulAdd(s0_4, M4(1.677e-02, -1.324e-01, 4.019e-01, -2.902e-01, -6.051e-02, -4.625e-02, 8.409e-01, 4.756e-01, -1.135e-01, -3.213e-01, 6.389e-02, -2.083e-01, -1.219e+00, 2.280e-01, 9.667e-01, -3.604e-01), r);
r = MulAdd(s0_5, M4(-5.948e-02, 1.567e-01, 3.883e-02, -4.843e-03, -2.153e-02, 3.439e-02, -1.160e-01, -1.325e-02, -5.312e-02, 1.136e-01, -5.260e-02, -3.524e-02, 7.315e-02, 3.527e-01, 6.186e-01, -7.505e-02), r);
r = MulAdd(s0_6, M4(-3.841e-02, 1.620e-03, 9.449e-02, -8.648e-02, -2.656e-02, -1.676e-03, 2.364e-03, -7.221e-02, -9.590e-02, 4.160e-02, -1.278e-02, -3.171e-02, 6.213e-02, 2.673e-02, -7.931e-02, 2.588e-01), r);
r = MulAdd(s0_7, M4(-3.636e-02, -1.558e-01, 2.151e-01, 1.188e-01, 1.275e-01, -8.114e-02, -8.376e-02, -3.690e-02, -1.968e-02, -1.038e-01, 8.994e-02, 3.846e-02, -1.499e-01, 6.457e-01, -8.201e-02, -3.935e-01), r);
r = MulAdd(s0_8, M4(-2.833e-03, 2.529e-01, -3.350e-03, -3.433e-02, 1.943e-02, -2.796e-02, 3.313e-02, 1.582e-02, 1.702e-02, 5.663e-02, -1.647e-02, -2.229e-02, -4.865e-01, 3.285e-01, -4.462e-01, -4.307e-01), r);
// Separate matrices for the non-positive inputs.
r = MulAdd(s1_0, M4(-6.004e-02, 4.898e-03, 3.591e-02, 1.900e-01, -3.816e-02, -3.269e-02, 1.459e-01, -3.464e-03, -1.235e-02, -3.737e-02, 1.569e-02, 2.559e-01, -3.173e-04, 1.268e-02, 8.886e-03, 2.960e-02), r);
r = MulAdd(s1_1, M4(-1.582e-02, -7.507e-02, -2.026e-01, 2.027e-01, -6.107e-02, 2.055e-02, -5.811e-02, 5.420e-03, 1.028e-02, -1.374e-02, -6.152e-01, -2.259e-01, -3.408e-03, -1.800e-02, 4.574e-02, -9.590e-02), r);
r = MulAdd(s1_2, M4(4.210e-02, 2.126e-02, 8.277e-02, 2.079e-02, -1.733e-01, -2.483e-02, 2.686e-01, 1.498e-01, 7.352e-02, -2.511e-02, 3.159e-02, 5.775e-02, 5.942e-02, 3.383e-02, 1.274e-01, -5.928e-02), r);
r = MulAdd(s1_3, M4(5.614e-02, 7.561e-02, -8.328e-02, 2.427e-01, 7.214e-02, -1.122e-01, 9.434e-02, -2.602e-01, -1.052e-02, -6.944e-02, -3.023e-02, -1.655e-01, 1.236e-03, 4.025e-03, -3.082e-02, -1.533e-01), r);
r = MulAdd(s1_4, M4(6.675e-01, -2.254e-01, 1.173e+00, -8.261e-02, 5.655e-01, -2.000e-01, 8.301e-01, 1.458e+00, -2.497e-01, -1.091e+00, -4.698e-01, -1.876e-01, -3.358e-02, -2.854e-01, 5.032e-01, -1.558e-01), r);
r = MulAdd(s1_5, M4(-1.444e-02, 1.502e-01, -4.221e-02, -4.864e-02, 3.236e-01, -2.572e-01, 1.344e-01, 8.562e-02, -1.030e-01, 2.690e-01, 1.238e-01, 3.309e-02, -3.849e-02, 1.860e-01, 6.528e-03, 2.840e-02), r);
r = MulAdd(s1_6, M4(-1.161e-01, 5.405e-02, -3.101e-02, -1.009e-01, -9.594e-02, -1.207e-02, -3.836e-02, -6.894e-02, -1.770e-02, -2.958e-02, 8.484e-02, -2.284e-02, 2.585e-04, -2.764e-02, 4.972e-02, -5.968e-02), r);
r = MulAdd(s1_7, M4(-4.113e-02, -1.948e-01, -2.728e-02, -3.142e-02, -2.894e-01, -1.111e-01, 7.492e-02, -2.892e-02, 9.054e-02, 4.350e-02, 2.183e-01, 1.489e-01, 1.167e-02, -6.678e-02, 3.696e-02, -1.315e-02), r);
r = MulAdd(s1_8, M4(2.532e-02, 4.585e-02, -3.694e-02, -6.244e-02, -1.673e-01, 6.180e-02, -4.475e-02, 1.028e-02, -1.658e-02, 8.923e-02, 1.711e-02, 3.037e-03, 4.651e-02, 1.652e-01, 7.863e-03, -3.387e-02), r);
return r;
}
// Pass 4 entry point ("conv3"). Each invocation handles one pixel: it
// point-samples the 3x3 neighbourhood of t0, splits every sample into a
// non-negative and a non-positive part, and stores the f0() result to t1.
//   blockStart - added to the remapped thread offset to form the pixel coordinate.
//   tid        - only tid.x is used; Rmp8x8() (defined elsewhere) turns it into a 2D offset.
void Pass4(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	if (any(gxy >= GetInputSize())) {
		// Outside the image: nothing to compute or write.
		return;
	}

	// `pt` and `pos` are consumed by the O() macro inside l0(); the names must not change.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw neighbourhood, row by row from (-1, -1) to (1, 1).
	const V4 nTL = l0(-1.0, -1.0);
	const V4 nT = l0(0.0, -1.0);
	const V4 nTR = l0(1.0, -1.0);
	const V4 nL = l0(-1.0, 0.0);
	const V4 nC = l0(0.0, 0.0);
	const V4 nR = l0(1.0, 0.0);
	const V4 nBL = l0(-1.0, 1.0);
	const V4 nB = l0(0.0, 1.0);
	const V4 nBR = l0(1.0, 1.0);

	// First nine arguments: samples clamped to >= 0.
	// Last nine arguments: samples clamped to <= 0 (written as -max(-v, 0)).
	t1[gxy] = f0(
		max(nTL, 0.0), max(nT, 0.0), max(nTR, 0.0),
		max(nL, 0.0), max(nC, 0.0), max(nR, 0.0),
		max(nBL, 0.0), max(nB, 0.0), max(nBR, 0.0),
		-max(-nTL, 0.0), -max(-nT, 0.0), -max(-nTR, 0.0),
		-max(-nL, 0.0), -max(-nC, 0.0), -max(-nR, 0.0),
		-max(-nBL, 0.0), -max(-nB, 0.0), -max(-nBR, 0.0));
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv4 kernel for pass 5: reduces nine taps (read from t1 through l0) to one
// 4-component result.
// s0_0..s0_8 are the non-negative parts and s1_0..s1_8 the non-positive parts of the
// same nine taps, as prepared by Pass5; index 0 is offset (-1,-1), 4 the centre and
// 8 is (1,1). Every argument is paired with its own 4x4 coefficient matrix and folded
// into the accumulator r, which starts from a constant 4-component vector.
// NOTE(review): MulAdd is defined outside this block; presumably it evaluates
// mul(v, M) + r - confirm against its definition before reordering anything here.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { -3.709e-04, 2.029e-04, -3.042e-03, -2.970e-04 };
// Terms for the non-negative parts of the taps.
r = MulAdd(s0_0, M4(-1.087e-01, -5.083e-02, 3.146e-01, -4.241e-02, 4.462e-02, -4.358e-02, -1.562e-01, -2.609e-03, 5.918e-02, -2.526e-02, -3.132e-02, -1.150e-02, -8.799e-03, 3.070e-02, -1.680e-02, -1.046e-02), r);
r = MulAdd(s0_1, M4(1.762e-01, 8.784e-01, -2.704e+00, -1.565e+00, -1.473e-01, -5.723e-01, 7.838e-02, -7.420e-03, -1.769e-01, -2.041e-01, -1.783e-03, -4.944e-03, 1.304e-02, 2.646e-01, -1.708e-01, 7.483e-03), r);
r = MulAdd(s0_2, M4(-1.907e-01, 1.514e-01, -3.657e-01, -5.840e-01, -4.943e-02, -1.014e-02, -2.869e-03, 6.488e-03, 2.266e-02, -3.850e-02, 6.125e-03, 1.899e-02, -3.541e-02, -2.011e-01, 1.567e-01, 1.008e-02), r);
r = MulAdd(s0_3, M4(-3.061e-01, -1.768e-01, 9.163e-02, -2.243e-01, 4.945e-02, 1.106e-01, -1.137e-01, 1.755e-02, 2.640e-01, -9.298e-02, -1.704e-01, 3.935e-02, 1.506e-01, -3.284e-02, 4.719e-02, 5.543e-02), r);
r = MulAdd(s0_4, M4(-4.579e-01, -6.198e-02, -9.889e-01, -4.446e-01, -1.612e-01, 1.518e-01, 2.588e-01, 1.075e-02, -1.527e+00, -7.923e-01, 8.120e-02, -1.116e-01, -2.079e-01, -1.206e-01, -4.422e-01, -1.951e-01), r);
r = MulAdd(s0_5, M4(1.064e-01, -1.684e-01, 2.316e-01, 4.211e-01, -9.153e-02, 9.155e-02, -7.649e-02, -1.385e-01, 9.422e-02, -1.631e-01, 8.278e-02, 3.318e-01, 7.284e-02, 3.489e-01, -2.303e-02, -6.554e-01), r);
r = MulAdd(s0_6, M4(-6.320e-02, -4.390e-02, 1.453e-02, 3.187e-02, 2.166e-02, 2.423e-03, 1.573e-03, -2.226e-02, 1.401e-01, 2.026e-01, -2.249e-01, 6.471e-02, 3.593e-02, -1.575e-02, -3.186e-02, 1.339e-02), r);
r = MulAdd(s0_7, M4(2.778e-02, 7.495e-02, -1.086e-01, 8.862e-02, -2.352e-02, 1.477e-02, 2.741e-02, 4.345e-02, -2.865e-01, 9.405e-02, 1.880e-01, -3.610e-01, -7.797e-02, -5.710e-03, 3.386e-02, 2.830e-02), r);
r = MulAdd(s0_8, M4(-3.734e-02, 3.357e-02, 5.657e-03, -1.596e-01, -7.661e-03, 1.603e-02, -3.137e-02, -7.023e-03, 6.522e-03, -2.715e-02, 2.765e-02, 4.724e-02, 1.922e-02, 3.944e-02, -8.276e-02, -1.915e-02), r);
// Terms for the non-positive parts of the same taps.
r = MulAdd(s1_0, M4(-7.121e-02, -2.276e-02, 7.266e-02, -4.411e-03, -5.600e-01, 4.502e-01, -1.817e-01, -2.906e-01, -5.675e-02, 2.653e-02, 3.284e-02, -1.925e-03, -4.729e-03, -1.554e-03, -6.081e-03, -2.195e-02), r);
r = MulAdd(s1_1, M4(2.212e-01, 3.154e-01, -2.765e-01, 4.432e-02, 1.402e+00, 2.159e-01, 4.402e-01, 2.537e-01, 6.697e-02, 1.207e-01, -5.192e-02, 2.638e-02, 5.366e-02, 5.855e-02, -3.687e-02, 4.389e-03), r);
r = MulAdd(s1_2, M4(3.137e-02, -1.157e-01, 9.497e-02, -3.724e-02, 5.241e-02, 7.793e-02, 2.277e-04, -4.033e-01, 1.432e-02, 4.622e-02, -1.636e-02, -5.840e-03, -1.593e-02, -7.447e-02, 3.943e-02, -3.517e-03), r);
r = MulAdd(s1_3, M4(-1.209e-02, -1.350e-01, 3.018e-01, 1.233e-01, -1.262e-03, 2.194e-01, -2.919e-01, -8.031e-03, 4.620e-03, 5.318e-02, 1.247e-02, -4.260e-02, 7.155e-02, 3.256e-02, -9.839e-02, -6.741e-04), r);
r = MulAdd(s1_4, M4(3.291e-01, 2.397e-01, -2.820e-01, 5.703e-01, 7.831e-03, 5.816e-02, -1.696e-02, -1.957e-01, -1.851e-01, 3.696e-02, -2.611e-01, 7.039e-03, -1.562e-01, -7.676e-01, 9.080e-01, 7.823e-02), r);
r = MulAdd(s1_5, M4(9.918e-03, 6.364e-02, 3.364e-02, -3.291e-01, 1.393e-02, 3.139e-02, 1.701e-02, -5.675e-02, 5.085e-02, -2.050e-01, 1.160e-01, 4.875e-02, -1.189e-01, 2.310e-01, -1.353e-01, 2.046e-02), r);
r = MulAdd(s1_6, M4(-5.477e-03, -1.704e-02, 9.510e-03, -1.701e-02, 1.391e-02, -8.760e-03, -3.355e-02, -6.898e-03, -9.203e-03, -2.442e-02, 7.547e-03, 1.817e-02, 1.871e-02, -1.149e-02, 6.458e-02, 1.403e-02), r);
r = MulAdd(s1_7, M4(-5.073e-03, -5.454e-02, -2.710e-02, 1.292e-02, 2.458e-02, 1.739e-02, -2.319e-03, 3.865e-02, 5.399e-02, -1.176e-02, -1.315e-01, 1.489e-01, -7.903e-02, 8.120e-02, 4.749e-02, 1.961e-01), r);
r = MulAdd(s1_8, M4(4.163e-02, -1.603e-02, 8.659e-03, 1.023e-01, 5.233e-03, -2.900e-03, -5.293e-03, -5.829e-03, -1.453e-02, 2.467e-02, 7.198e-02, -2.407e-01, -4.023e-02, 1.009e-01, -1.560e-01, -1.567e-01), r);
return r;
}
// Pass 5 driver (conv4): handles one input texel per thread. It reads the 3x3
// neighbourhood of t1 through l0, splits each tap into max(tap, 0) and
// -max(-tap, 0), and stores the f0 result for that texel in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are never named again in this function; presumably the O() macro
	// behind l0 (defined outside this block) expands to them, so the names must stay.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps in row-major order: tap0 = (-1,-1), tap4 = centre, tap8 = (1,1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// f0 takes the nine non-negative parts first, then the nine non-positive parts.
	t0[texel] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
#define l0(x, y) V4(O(t0, float2(x, y)))
// Output kernel for pass 6: reduces nine taps (read from t0 through l0) to one
// 4-component result and squashes it with tanh, so each component lies in (-1, 1).
// s0_0..s0_8 are the non-negative parts and s1_0..s1_8 the non-positive parts of the
// same nine taps, as prepared by Pass6; index 0 is offset (-1,-1), 4 the centre and
// 8 is (1,1). Pass6 adds the four returned components to the first YUV component of
// the four pixels of a 2x2 output quad.
// NOTE(review): MulAdd is defined outside this block; presumably it evaluates
// mul(v, M) + r - confirm against its definition before reordering anything here.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { -3.259e-04, -3.197e-04, 4.954e-04, 4.568e-04 };
// Terms for the non-negative parts of the taps.
r = MulAdd(s0_0, M4(-6.857e-02, -6.042e-02, 3.293e-03, -2.389e-03, -1.606e-01, -1.556e-02, -5.115e-02, -4.602e-02, -3.762e-02, 1.994e-02, -2.370e-02, 3.558e-02, -7.142e-01, 8.184e-01, -1.361e-01, 1.228e-01), r);
r = MulAdd(s0_1, M4(-1.887e-01, -2.260e-01, 1.293e-02, -1.757e-02, 1.257e-01, 1.304e-01, -4.525e-02, 4.471e-02, 6.895e-01, -4.096e-01, 4.096e-02, 1.817e-02, -1.343e-01, -4.170e-01, 3.991e-03, 1.516e-03), r);
r = MulAdd(s0_2, M4(-2.667e-01, -8.692e-02, 1.481e-01, -1.466e-01, 6.142e-02, -2.084e-02, 1.942e-02, 6.700e-04, 3.942e-02, 3.109e-01, -1.323e-02, 2.240e-02, -2.306e-02, -4.749e-02, -1.155e-02, 1.843e-03), r);
r = MulAdd(s0_3, M4(-1.004e-01, -1.184e-02, -8.590e-02, -1.018e-01, 6.862e-02, -4.700e-02, -1.537e-01, -1.096e-01, -1.228e-01, 1.462e-02, -1.715e-01, 1.862e-02, 3.668e-01, -1.138e-01, 8.494e-04, 6.113e-01), r);
r = MulAdd(s0_4, M4(4.389e-01, -5.527e-01, -4.972e-01, -7.620e-01, 1.684e-01, 5.375e-02, 1.032e+00, 5.723e-01, 4.427e-02, -2.447e-01, 1.132e+00, -5.297e-01, 1.150e-01, 3.877e-01, 1.224e-01, 1.294e-01), r);
r = MulAdd(s0_5, M4(-1.023e+00, 1.567e+00, -9.747e-01, 1.051e+00, 1.537e-02, 1.993e-01, -1.679e-01, 1.139e-01, -7.358e-02, -1.782e-01, -1.938e-01, 4.419e-02, 2.001e-02, 5.881e-02, 8.971e-03, 3.368e-03), r);
r = MulAdd(s0_6, M4(-5.126e-03, 1.449e-02, -7.018e-02, 2.929e-02, 4.748e-02, -4.443e-03, -5.791e-02, -3.490e-02, 3.817e-02, 1.007e-02, -5.501e-02, -1.488e-02, -8.848e-03, 4.884e-02, -6.548e-02, 3.392e-02), r);
r = MulAdd(s0_7, M4(-4.449e-02, 7.313e-02, 3.311e-01, 3.138e-02, -6.466e-02, 5.666e-02, 1.929e-01, 8.274e-02, 3.994e-02, 2.105e-02, -1.821e-01, -1.539e-02, -9.333e-03, -4.728e-02, 6.975e-03, -3.292e-03), r);
r = MulAdd(s0_8, M4(2.038e-01, -2.356e-01, -1.987e-01, -3.746e-02, -1.499e-02, -7.007e-02, -9.546e-02, 1.905e-02, -9.802e-03, 1.990e-02, 2.140e-02, -8.164e-03, 5.109e-03, -2.081e-02, -2.386e-02, 1.183e-02), r);
// Terms for the non-positive parts of the same taps.
r = MulAdd(s1_0, M4(-7.067e-02, -4.613e-02, -5.433e-04, -2.191e-02, -1.125e-01, -3.650e-02, -1.298e-02, -3.479e-02, -1.118e-01, -1.521e-02, -4.731e-03, -7.478e-03, 1.802e-01, 4.872e-02, -1.599e-03, -1.452e-02), r);
r = MulAdd(s1_1, M4(-2.920e-01, -1.831e-01, -1.305e-02, 4.031e-02, 1.989e-01, 3.120e-03, 2.025e-02, 5.432e-02, 2.607e-01, 2.403e-02, 1.863e-02, 8.423e-02, -3.372e-01, -1.327e-01, -1.248e-01, -1.247e-01), r);
r = MulAdd(s1_2, M4(-9.286e-02, -1.948e-01, -8.532e-03, 7.416e-03, 4.578e-02, 1.581e-01, 1.473e-03, -3.796e-02, 1.011e-01, 2.393e-01, 2.742e-02, -4.224e-02, -9.579e-03, -9.888e-02, -2.065e-03, 7.685e-03), r);
r = MulAdd(s1_3, M4(-2.056e-01, -3.479e-02, -2.666e-01, -5.344e-02, 1.579e-01, -6.091e-02, -1.655e-01, -1.575e-01, -8.230e-02, -4.748e-02, -1.304e-01, -7.186e-02, 2.953e-01, 6.950e-02, 1.865e-01, 7.567e-02), r);
r = MulAdd(s1_4, M4(3.408e-01, -1.054e-01, -2.613e-01, -6.084e-01, 3.193e-01, 6.366e-01, 4.251e-01, 4.066e-01, -3.742e-01, -8.521e-02, 5.906e-01, 1.870e-01, 2.044e-02, 2.495e-01, 1.046e-01, 3.018e-01), r);
r = MulAdd(s1_5, M4(4.748e-03, 2.086e-01, 4.231e-03, -7.764e-03, 3.933e-02, 3.446e-03, -3.431e-02, 8.415e-02, -3.798e-02, -3.428e-01, -7.206e-02, 2.392e-01, 2.157e-02, 2.692e-02, 3.313e-02, 1.841e-02), r);
r = MulAdd(s1_6, M4(1.813e-02, 2.306e-03, -3.402e-02, 1.009e-03, 4.408e-02, -2.307e-02, -3.394e-02, -3.912e-02, 3.822e-02, -1.051e-02, -1.023e-01, -4.626e-02, -4.871e-02, 6.250e-03, 1.367e-01, 3.674e-02), r);
r = MulAdd(s1_7, M4(-1.170e-02, 3.747e-02, 1.548e-01, 1.243e-01, -1.074e-01, -9.848e-03, 2.627e-01, 1.132e-01, 4.550e-02, 5.050e-02, -1.194e-01, -6.091e-02, -2.180e-02, -6.381e-02, -5.949e-02, 1.580e-02), r);
r = MulAdd(s1_8, M4(-1.146e-04, -1.852e-02, -1.515e-02, 2.488e-02, -1.877e-02, -7.739e-02, -6.812e-02, 7.656e-03, 2.688e-02, 5.650e-02, 4.285e-02, -3.270e-02, 1.163e-03, 8.328e-04, -1.998e-02, -2.282e-02), r);
return tanh(r);
}
// Pass 6 driver (out-shuffle): each thread owns one texel of t0 and the matching
// 2x2 quad of OUTPUT. It evaluates f0 on the 3x3 neighbourhood of t0, then for each
// of the four output pixels samples INPUT with the linear sampler, converts to YUV,
// adds one component of the f0 result to the first YUV component (clamped by
// saturate), and converts back to RGB. The other two YUV components pass through.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: presumably the O() macro behind l0 (defined
	// outside this block) expands to them.
	const float2 pt = float2(GetInputPt());
	// Top-left pixel of this thread's 2x2 output quad.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}
	// Centre of the source texel that covers the quad.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps in row-major order: tap0 = (-1,-1), tap4 = centre, tap8 = (1,1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// f0 takes the nine non-negative parts first, then the nine non-positive parts.
	const V4 residual = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };

	// Move from the source texel centre to the centre of the quad's top-left pixel.
	// The pixels are then visited clockwise by stepping `pos` and `dst` in place; the
	// stepping order is kept exactly so the sample coordinates stay bit-identical.
	const float2 outPt = float2(GetOutputPt());
	pos -= 0.5f * outPt;

	// Top-left pixel takes residual.x.
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + residual.x), yuv.yz)), 1);

	// Top-right pixel takes residual.y.
	dst.x += 1;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + residual.y), yuv.yz)), 1);

	// Bottom-right pixel takes residual.w.
	dst.y += 1;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + residual.w), yuv.yz)), 1);

	// Bottom-left pixel takes residual.z.
	dst.x -= 1;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + residual.z), yuv.yz)), 1);
}

View file

@ -1,914 +0,0 @@
// CuNNy 4x8C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D08N04
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 MF4
#define M4 MF4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1
#define l0(x, y) (dot(MF3(2.214e-01, 4.385e-01, 1.006e-01), O(INPUT, float2(x, y)).rgb) + MF(-6.858e-01))
// First input kernel of pass 1: maps nine scalar taps (produced by l0 as a weighted
// sum of the INPUT rgb channels plus a constant offset) to a 4-component result.
// Tap index 0 is offset (-1,-1), 4 the centre and 8 is (1,1). Each tap scales its own
// 4-component weight vector, and the products are summed onto a constant start value
// with mad, in tap order. Pass1 stores the result in t0.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(2.880e-04, 1.418e-02, 1.413e-02, -1.036e-01);
	// Row y-1
	acc = mad(s0_0, V4(-2.401e-02, 1.817e-03, -1.218e-01, 2.796e-02), acc);
	acc = mad(s0_1, V4(3.256e-02, 3.929e-03, -5.850e-02, -5.602e-02), acc);
	acc = mad(s0_2, V4(4.497e-04, -1.812e-02, 5.241e-02, 3.698e-02), acc);
	// Row y
	acc = mad(s0_3, V4(5.371e-01, -2.302e-01, -1.373e-01, -4.038e-03), acc);
	acc = mad(s0_4, V4(1.565e-01, -6.067e-02, 3.397e-01, -3.741e-01), acc);
	acc = mad(s0_5, V4(-2.095e-03, 4.044e-02, -3.770e-02, 5.665e-02), acc);
	// Row y+1
	acc = mad(s0_6, V4(-1.993e-01, -2.645e-01, -8.892e-02, 1.948e-02), acc);
	acc = mad(s0_7, V4(-4.865e-01, 5.400e-01, -1.396e-01, 1.270e-01), acc);
	acc = mad(s0_8, V4(-1.667e-02, -9.433e-03, -1.324e-02, -1.803e-03), acc);
	return acc;
}
// Second input kernel of pass 1: same shape as f0 with a different set of weights.
// It maps nine scalar taps (index 0 = offset (-1,-1), 4 = centre, 8 = (1,1)) to a
// 4-component result by scaling one weight vector per tap and summing the products
// onto a constant start value with mad, in tap order. Pass1 stores the result in t1.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(-4.485e-04, -2.620e-04, 2.449e-02, -7.403e-04);
	// Row y-1
	acc = mad(s0_0, V4(-4.610e-02, -6.199e-01, 8.493e-03, -1.532e-02), acc);
	acc = mad(s0_1, V4(-7.178e-02, 5.957e-01, 1.575e-03, 1.807e-02), acc);
	acc = mad(s0_2, V4(1.106e-01, 3.625e-03, 3.713e-02, -4.124e-03), acc);
	// Row y
	acc = mad(s0_3, V4(1.288e-01, -5.582e-02, 5.082e-02, 1.674e-02), acc);
	acc = mad(s0_4, V4(-6.074e-01, 8.818e-02, -3.371e-01, -6.663e-01), acc);
	acc = mad(s0_5, V4(-8.030e-02, -4.780e-03, -3.421e-01, 5.358e-02), acc);
	// Row y+1
	acc = mad(s0_6, V4(4.990e-01, 7.623e-03, 1.778e-03, 2.401e-02), acc);
	acc = mad(s0_7, V4(9.546e-02, -1.656e-02, 6.935e-04, 6.387e-01), acc);
	acc = mad(s0_8, V4(-2.302e-02, 5.209e-03, 5.835e-02, -6.361e-02), acc);
	return acc;
}
// Pass 1 driver (in): handles one input texel per thread. It reads nine scalar taps
// from the 3x3 neighbourhood of INPUT through l0 and writes the f0 result to t0 and
// the f1 result to t1 for that texel.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0 expands to
	// `pos + p * pt`.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Scalar taps in row-major order: in0 = (-1,-1), in4 = centre, in8 = (1,1).
	const MF in0 = l0(-1.0, -1.0);
	const MF in1 = l0(0.0, -1.0);
	const MF in2 = l0(1.0, -1.0);
	const MF in3 = l0(-1.0, 0.0);
	const MF in4 = l0(0.0, 0.0);
	const MF in5 = l0(1.0, 0.0);
	const MF in6 = l0(-1.0, 1.0);
	const MF in7 = l0(0.0, 1.0);
	const MF in8 = l0(1.0, 1.0);

	// Both kernels consume the same nine taps.
	t0[texel] = f0(in0, in1, in2, in3, in4, in5, in6, in7, in8);
	t1[texel] = f1(in0, in1, in2, in3, in4, in5, in6, in7, in8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// First conv1 kernel of pass 2: reduces 36 4-component inputs to one 4-component
// result, which Pass2 stores in t2.
// As prepared by Pass2: s0_* / s1_* are the non-negative / non-positive parts of the
// nine taps read from t0 (l0), and s2_* / s3_* the non-negative / non-positive parts
// of the nine taps read from t1 (l1). Suffix 0 is offset (-1,-1), 4 the centre and
// 8 is (1,1). Every argument is paired with its own 4x4 coefficient matrix and folded
// into the accumulator r, which starts from a constant 4-component vector.
// NOTE(review): MulAdd is defined outside this block; presumably it evaluates
// mul(v, M) + r - confirm against its definition before reordering anything here.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { -3.046e-02, 3.515e-02, 4.880e-02, 4.740e-03 };
// t0 taps, non-negative parts.
r = MulAdd(s0_0, M4(7.103e-02, 1.495e-01, -1.731e-02, -9.952e-02, -1.539e-01, -1.103e-01, 7.099e-02, 2.023e-01, 2.681e-02, 5.202e-03, 1.954e-02, -6.822e-02, -1.650e-01, 3.710e-01, -6.020e-01, 4.879e-01), r);
r = MulAdd(s0_1, M4(-1.168e-02, 2.587e-01, -4.670e-01, -3.986e-02, -1.268e-01, 3.619e-02, 5.712e-02, 1.722e-01, 4.473e-02, -1.224e-01, 8.228e-02, -3.981e-02, 4.044e-01, -3.039e-01, -3.390e-01, 5.925e-02), r);
r = MulAdd(s0_2, M4(4.083e-02, 7.140e-02, -5.864e-01, 1.188e-01, 2.214e-01, -2.826e-01, 2.294e-01, -2.199e-01, -9.048e-02, 1.787e-01, -6.887e-02, -6.645e-02, -1.285e-01, -8.261e-02, -1.975e-01, 2.428e-01), r);
r = MulAdd(s0_3, M4(-5.801e-02, -3.381e-02, -2.285e-01, 9.377e-02, 1.878e-01, 9.285e-02, -1.001e-01, -5.059e-02, -2.155e-02, -9.098e-02, -1.279e-02, 9.801e-02, 1.178e-01, -1.967e-01, -4.792e-02, -1.106e-01), r);
r = MulAdd(s0_4, M4(3.048e-01, 2.731e-01, -2.351e-01, -1.516e-01, -1.382e-02, 1.296e-01, -9.530e-02, 2.975e-02, 2.411e-01, 2.343e-02, 1.731e-02, -2.331e-01, -2.161e-01, 4.114e-01, 4.417e-01, 1.225e+00), r);
r = MulAdd(s0_5, M4(3.337e-01, 2.844e-01, 1.065e-01, -2.391e-01, -1.265e-01, -3.625e-02, -7.062e-02, 3.529e-02, 2.208e-02, -8.459e-03, -1.366e-01, -1.563e-02, -1.648e-01, -5.919e-01, 4.061e-01, -4.975e-02), r);
r = MulAdd(s0_6, M4(6.213e-03, -2.020e-02, 2.520e-03, 2.167e-02, -2.361e-01, -1.421e-01, -4.579e-02, -1.353e-01, -2.883e-01, -5.900e-04, 2.720e-02, 1.591e-01, -5.120e-01, -4.253e-01, -3.397e-02, -4.633e-01), r);
r = MulAdd(s0_7, M4(2.456e-01, -6.978e-02, 5.668e-02, -9.795e-03, -1.925e-01, -4.841e-02, -1.273e-02, 1.282e-02, -1.223e-01, -4.080e-02, 2.975e-02, 1.595e-01, -3.345e-01, -1.504e-01, 1.080e-01, 8.549e-01), r);
r = MulAdd(s0_8, M4(8.700e-02, 1.611e-02, 8.589e-02, -3.284e-02, -1.637e-01, 2.627e-01, 1.851e-02, 2.843e-02, 1.224e-01, 6.163e-02, 4.991e-02, -1.510e-01, 1.885e-01, -5.951e-02, -3.463e-02, 2.172e-01), r);
// t0 taps, non-positive parts.
r = MulAdd(s1_0, M4(1.856e-01, -1.041e-01, 1.900e-01, 8.420e-02, -3.223e-01, 6.258e-02, -9.766e-02, -6.517e-01, 3.066e-02, -7.562e-02, 1.015e-02, -1.139e-01, 1.569e-02, -3.684e-02, -2.813e-02, 8.835e-02), r);
r = MulAdd(s1_1, M4(-7.107e-02, -1.146e-01, 5.488e-01, -2.960e-01, 3.743e-01, -5.368e-01, -2.219e-01, -3.122e-01, 2.468e-02, -7.477e-01, 1.858e-01, 3.498e-01, 1.771e-03, 4.215e-03, 8.478e-02, 9.318e-02), r);
r = MulAdd(s1_2, M4(-2.350e-03, -3.382e-01, 5.964e-01, -2.321e-01, 2.011e-01, 1.890e-01, -2.062e-01, -3.725e-02, -1.003e-01, -1.464e-01, 1.040e-01, 9.994e-02, -7.113e-02, -3.827e-02, -1.258e-01, -1.584e-01), r);
r = MulAdd(s1_3, M4(-1.609e-01, -1.460e-01, -4.804e-03, 5.503e-02, 2.784e-01, -1.475e-02, 9.395e-02, -1.128e-01, 1.032e-02, -1.969e-01, 2.170e-01, 2.335e-01, -1.371e-01, 4.853e-02, 8.945e-03, -2.698e-01), r);
r = MulAdd(s1_4, M4(7.739e-02, -1.105e-01, 3.348e-01, 1.093e-01, -7.745e-02, -1.642e-01, -2.191e-01, -2.674e-02, 4.199e-01, -3.302e-01, 1.445e-01, -2.815e-01, -3.154e-01, 6.646e-02, 8.520e-02, -1.053e-01), r);
r = MulAdd(s1_5, M4(-4.165e-01, -8.545e-02, 2.291e-01, -1.042e-01, 3.791e-01, -7.209e-02, -6.332e-02, -3.174e-01, 1.038e-01, 8.122e-03, -9.715e-02, 6.808e-01, -9.362e-02, -4.634e-02, 5.184e-03, 1.295e-01), r);
r = MulAdd(s1_6, M4(-8.179e-02, -8.513e-02, 4.470e-02, -7.799e-02, -1.092e-01, -1.851e-01, -1.025e-01, -4.220e-02, -3.853e-01, 3.040e-02, -9.081e-02, 1.439e-01, -2.730e-02, -5.086e-02, 5.352e-03, -5.102e-03), r);
r = MulAdd(s1_7, M4(7.601e-02, -1.423e-01, 3.421e-01, 2.574e-03, 1.165e-01, 6.863e-03, 1.250e-02, -4.862e-02, -3.859e-01, -1.108e-01, 2.515e-02, 5.564e-01, 2.485e-01, 2.230e-01, -3.839e-02, 3.605e-02), r);
r = MulAdd(s1_8, M4(-9.424e-02, 1.248e-01, 1.980e-01, -1.671e-01, 1.098e-01, 6.555e-02, -7.194e-02, -1.626e-01, -1.439e-01, -2.086e-01, -1.925e-02, 1.520e-01, 2.139e-01, -7.764e-02, 6.469e-02, 7.875e-03), r);
// t1 taps, non-negative parts.
r = MulAdd(s2_0, M4(4.572e-02, 3.661e-02, -3.845e-01, -1.383e-01, 1.729e-02, 1.780e-02, 3.664e-02, -6.961e-02, -9.001e-03, -1.853e-02, -6.735e-02, -1.864e-02, 1.695e-01, -1.420e-01, 2.679e-01, -1.525e-01), r);
r = MulAdd(s2_1, M4(9.967e-02, -2.869e-01, -2.251e-01, 8.470e-02, 3.178e-02, -9.701e-03, 9.260e-02, 4.087e-04, -8.081e-02, 1.341e-01, 5.882e-03, 1.043e-02, 8.559e-03, 6.534e-02, -4.619e-01, -3.010e-01), r);
r = MulAdd(s2_2, M4(-1.676e-02, -3.339e-01, 1.848e-01, -2.562e-01, -8.563e-02, 2.487e-02, 2.495e-01, 9.448e-02, 2.189e-02, -3.018e-02, 5.698e-02, 6.041e-02, -4.869e-02, -2.627e-02, 1.602e-01, 1.092e-01), r);
r = MulAdd(s2_3, M4(6.867e-02, -1.693e-01, -1.614e-01, -1.944e-01, 1.992e-01, 1.720e-01, 2.393e-01, 1.219e-02, 4.866e-02, -1.165e-01, -1.285e-01, 2.929e-01, 2.043e-01, -1.399e-02, 1.595e-02, -2.746e-01), r);
r = MulAdd(s2_4, M4(-4.477e-01, -5.696e-01, -1.760e-02, 1.362e-01, 1.472e-01, 3.113e-01, -2.419e-01, 8.650e-02, -8.358e-02, 1.081e-01, 3.881e-02, -1.400e-01, -2.071e-01, 3.977e-02, -3.149e-01, 2.525e-01), r);
r = MulAdd(s2_5, M4(5.496e-02, -9.963e-02, -1.227e-01, -1.892e-01, 4.361e-02, -3.776e-01, -6.576e-01, 2.628e-01, -8.215e-02, -8.123e-02, 2.248e-03, 1.261e-01, 1.193e-01, 2.608e-01, 2.567e-01, 8.120e-02), r);
r = MulAdd(s2_6, M4(-1.587e-01, -9.849e-02, 1.122e-01, -5.963e-02, -9.176e-02, 7.341e-03, 1.164e-03, -5.660e-02, 1.567e-01, -6.958e-02, -3.780e-02, 4.238e-04, -6.186e-02, 1.777e-01, 2.398e-01, 6.853e-03), r);
r = MulAdd(s2_7, M4(1.062e-01, -1.498e-01, 5.492e-02, 1.108e-01, -3.248e-01, -2.901e-01, -4.360e-01, 1.128e-01, 7.346e-02, 8.659e-02, 9.740e-02, -1.434e-01, 1.538e-01, 1.349e-01, 1.408e-01, -1.367e-01), r);
r = MulAdd(s2_8, M4(1.412e-01, -8.889e-02, 2.029e-02, -1.523e-01, 4.847e-01, -7.432e-01, -1.181e-01, 4.132e-01, 3.119e-02, -5.840e-02, -2.292e-02, -3.125e-02, 2.440e-02, 2.815e-02, 2.759e-01, -8.781e-02), r);
// t1 taps, non-positive parts.
r = MulAdd(s3_0, M4(2.147e-02, 2.192e-01, 2.489e-01, -3.436e-02, 1.086e-02, -2.680e-02, -9.925e-02, 3.978e-02, 1.239e-01, 3.645e-02, 5.463e-01, 5.005e-01, 1.039e-01, -1.694e-01, -3.816e-02, 3.834e-01), r);
r = MulAdd(s3_1, M4(1.418e-01, 5.806e-02, 1.317e-01, 2.227e-01, 1.486e-02, -4.235e-03, -5.750e-02, -1.548e-01, -7.700e-01, 3.263e-01, -1.193e-02, 3.537e-01, -2.841e-01, 4.657e-01, -1.576e-01, -9.526e-02), r);
r = MulAdd(s3_2, M4(7.641e-02, 8.195e-01, 1.080e-01, 1.814e-01, -5.471e-02, 2.211e-02, -4.212e-02, -1.249e-02, 2.469e-02, 5.436e-01, 3.805e-01, -9.622e-02, -6.358e-02, -3.739e-01, -3.504e-01, -2.627e-01), r);
r = MulAdd(s3_3, M4(-9.359e-02, -1.830e-02, -7.015e-02, -7.774e-02, 2.286e-01, -6.321e-02, -5.124e-02, -2.799e-03, -5.063e-01, -1.835e-01, 3.716e-01, 1.130e+00, 3.259e-01, -2.045e-01, -1.792e-01, 4.892e-01), r);
r = MulAdd(s3_4, M4(-7.478e-01, -1.192e-01, 1.022e-01, 8.111e-01, 7.253e-02, 2.280e-01, -1.116e-01, -2.828e-01, -2.364e-01, -1.233e+00, -1.125e+00, 1.750e+00, -1.215e+00, 4.973e-02, 2.070e-01, 6.996e-01), r);
r = MulAdd(s3_5, M4(-4.115e-02, 3.613e-01, 2.694e-01, 4.126e-02, 7.046e-02, 6.242e-02, 9.300e-02, -1.965e-01, -3.211e-01, 8.504e-01, 2.518e-01, -5.622e-01, 5.663e-02, -1.139e-01, 1.150e-01, -1.954e-01), r);
r = MulAdd(s3_6, M4(-1.870e-01, -9.168e-02, -8.947e-02, 6.127e-03, 1.163e-02, 3.733e-04, -3.330e-01, 1.935e-01, 3.424e-01, 1.313e-01, -6.732e-01, 8.256e-02, 6.713e-02, 2.980e-02, -6.912e-02, 1.715e-01), r);
r = MulAdd(s3_7, M4(1.636e-01, 1.212e-01, 2.280e-02, 1.552e-01, -4.955e-01, 8.376e-01, 1.476e-01, 2.192e-01, 9.746e-01, -3.148e-01, 8.206e-01, -8.104e-01, -7.918e-02, -1.604e-01, 5.505e-02, 7.640e-02), r);
r = MulAdd(s3_8, M4(1.248e-01, 2.878e-01, -4.182e-02, -9.214e-02, -1.210e-01, 4.382e-01, 8.062e-02, -3.051e-01, -1.803e-01, -3.041e-01, 1.368e-01, -1.030e-01, 2.941e-02, -2.724e-01, 3.480e-02, 1.396e-02), r);
return r;
}
// Second conv1 kernel of pass 2: same inputs and structure as f0 with a different set
// of coefficients; Pass2 stores its result in t3.
// As prepared by Pass2: s0_* / s1_* are the non-negative / non-positive parts of the
// nine taps read from t0 (l0), and s2_* / s3_* the non-negative / non-positive parts
// of the nine taps read from t1 (l1). Suffix 0 is offset (-1,-1), 4 the centre and
// 8 is (1,1). Every argument is paired with its own 4x4 coefficient matrix and folded
// into the accumulator r, which starts from a constant 4-component vector.
// NOTE(review): MulAdd is defined outside this block; presumably it evaluates
// mul(v, M) + r - confirm against its definition before reordering anything here.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { 3.015e-03, -4.690e-02, 3.573e-02, -1.486e-02 };
// t0 taps, non-negative parts.
r = MulAdd(s0_0, M4(1.574e-01, -6.104e-03, -2.288e-01, 5.024e-03, -2.149e-03, -8.674e-02, 1.209e-01, 7.107e-02, 1.242e-01, 2.312e-03, -5.300e-02, -2.285e-01, -8.824e-02, 7.402e-02, -4.447e-01, 1.117e+00), r);
r = MulAdd(s0_1, M4(-5.617e-02, 3.613e-01, -4.666e-01, 1.795e-01, 1.718e-01, -1.005e-01, -2.593e-01, 4.103e-01, 2.477e-01, 1.883e-01, 3.928e-02, -3.635e-01, -7.353e-01, 3.209e-01, 2.171e-01, 3.924e-01), r);
r = MulAdd(s0_2, M4(-3.304e-01, 6.332e-01, -3.898e-01, 2.704e-01, 4.110e-02, -2.786e-01, -2.513e-01, 1.800e-01, 9.402e-03, -1.975e-01, -4.040e-02, -2.047e-01, -3.239e-01, -1.623e-01, 1.001e-01, 3.053e-02), r);
r = MulAdd(s0_3, M4(3.935e-01, 5.218e-02, 4.630e-02, 2.202e-02, -2.172e-01, -1.530e-02, -1.782e-01, -9.327e-02, -6.425e-02, -2.402e-02, -2.919e-02, -4.034e-02, 6.589e-01, -4.900e-02, 7.783e-02, 6.334e-01), r);
r = MulAdd(s0_4, M4(-5.475e-02, 1.543e-01, -1.597e-01, -2.500e-01, 4.990e-02, 5.780e-02, 1.162e-01, 1.140e-01, -2.980e-01, -2.524e-02, -2.103e-01, 4.297e-01, 4.528e-01, -3.098e-01, -1.415e-01, 7.565e-01), r);
r = MulAdd(s0_5, M4(-1.097e-01, 3.376e-01, -5.685e-01, 1.347e-01, 1.155e-01, -1.396e-01, -2.840e-01, -1.373e-01, 1.442e-01, 8.711e-02, 1.357e-01, -1.110e-01, 2.095e-01, -2.901e-01, -1.007e-01, -2.473e-01), r);
r = MulAdd(s0_6, M4(-7.405e-02, 9.320e-02, -5.870e-02, -2.569e-01, 6.017e-03, -8.078e-02, -3.798e-02, 2.334e-01, 1.440e-01, -1.852e-01, -6.627e-03, 3.514e-03, -1.499e-02, -6.237e-02, 3.665e-01, 3.270e-01), r);
r = MulAdd(s0_7, M4(2.443e-01, 8.076e-02, -2.143e-01, 1.120e-01, 1.187e-01, 1.317e-01, 1.811e-01, 1.918e-01, -2.164e-02, -1.829e-01, 2.105e-01, 3.085e-01, 3.155e-01, 2.801e-01, -6.834e-01, 2.861e-01), r);
r = MulAdd(s0_8, M4(9.974e-03, 9.704e-02, -2.363e-01, 1.829e-01, 1.844e-02, 9.298e-02, -5.319e-02, -5.899e-02, -2.154e-01, 2.555e-02, -8.374e-02, 1.254e-01, -2.736e-01, -4.065e-02, 4.838e-02, 3.338e-02), r);
// t0 taps, non-positive parts.
r = MulAdd(s1_0, M4(-1.239e-02, -1.316e-01, 8.694e-02, -8.443e-02, -1.143e-01, -6.018e-02, -9.054e-02, 7.381e-02, 2.722e-01, 1.030e-01, -8.583e-02, -4.433e-01, -1.339e-01, 1.264e-01, 8.581e-02, -1.947e-01), r);
r = MulAdd(s1_1, M4(3.030e-01, -3.527e-02, 4.665e-01, -3.372e-02, -2.301e-02, 7.308e-01, 5.938e-01, -5.901e-01, 4.766e-01, 1.081e-01, 8.809e-02, 3.482e-01, -1.938e-01, -8.091e-02, 3.649e-02, 9.321e-02), r);
r = MulAdd(s1_2, M4(1.376e-01, -4.460e-01, 4.298e-01, -4.809e-02, -3.819e-01, 5.216e-01, 2.687e-01, 1.359e-01, 2.936e-01, 1.222e-02, 3.706e-01, 2.481e-01, -4.716e-02, -1.798e-02, 2.731e-02, -7.140e-02), r);
r = MulAdd(s1_3, M4(1.657e-01, -3.624e-02, 1.541e-01, -5.006e-03, -4.051e-01, -9.782e-02, 3.008e-02, 1.962e-01, -6.146e-02, 1.866e-03, -3.052e-01, -2.202e-01, 1.057e-01, -1.151e-01, -6.310e-02, 3.914e-01), r);
r = MulAdd(s1_4, M4(-2.629e-01, 1.029e-01, 1.812e-02, -2.950e-01, -1.191e-01, 2.580e-01, -4.833e-01, 1.095e-01, 2.309e-02, 4.519e-02, 1.086e-01, 5.362e-01, -1.349e-01, -1.278e-01, 7.109e-02, -1.992e-01), r);
r = MulAdd(s1_5, M4(-1.815e-01, 2.898e-01, 3.446e-01, -1.587e-01, -6.360e-02, 1.662e-01, 5.187e-01, 1.701e-01, -2.770e-02, -5.932e-01, 2.467e-01, 3.940e-01, 1.022e-01, 1.033e-01, -5.084e-02, -6.520e-02), r);
r = MulAdd(s1_6, M4(-1.494e-01, 3.180e-02, 9.864e-02, -3.409e-01, 1.397e-02, 9.932e-03, -2.110e-01, 2.636e-01, 1.353e-01, -8.495e-02, -2.680e-03, -2.287e-01, 1.136e-01, -1.047e-01, 2.910e-02, 9.922e-02), r);
r = MulAdd(s1_7, M4(1.533e-01, 4.819e-04, 1.735e-01, 2.027e-01, 1.316e-01, 1.029e-01, 1.446e-01, 1.737e-01, 4.855e-02, 4.781e-02, 2.025e-01, 1.587e-01, 1.661e-01, 7.134e-02, 5.853e-02, -1.530e-01), r);
r = MulAdd(s1_8, M4(-1.476e-01, -4.916e-02, 1.989e-01, 1.159e-01, 4.753e-02, 1.694e-01, 4.343e-02, -6.974e-03, 3.382e-02, 2.275e-01, 3.466e-01, -7.178e-03, -1.104e-01, 2.059e-03, -7.101e-02, 8.934e-02), r);
// t1 taps, non-negative parts.
r = MulAdd(s2_0, M4(-3.467e-01, 8.471e-04, 1.580e-01, 2.685e-01, -2.680e-02, -6.444e-02, 8.843e-02, 5.232e-03, 2.576e-02, -3.756e-02, -7.913e-03, -3.871e-02, -5.374e-02, -6.060e-02, -7.688e-02, 6.738e-01), r);
r = MulAdd(s2_1, M4(-3.963e-01, 1.295e-01, 2.623e-01, 2.565e-01, -1.831e-01, -6.054e-02, 1.817e-01, -8.944e-02, 1.974e-01, -2.800e-04, -3.964e-02, 1.232e-01, -3.477e-01, 3.791e-01, 1.438e-01, -7.862e-02), r);
r = MulAdd(s2_2, M4(2.540e-02, 1.123e-01, 6.461e-01, -3.856e-03, 3.373e-02, -5.719e-02, 1.556e-01, -1.100e-01, -3.499e-02, 9.146e-02, -4.624e-02, 9.774e-02, -1.148e-01, -2.280e-01, 4.977e-01, -1.568e-01), r);
r = MulAdd(s2_3, M4(5.352e-02, -1.293e-01, -6.991e-03, 4.190e-01, -2.334e-03, -4.433e-02, -8.470e-02, 1.162e-01, -1.045e-01, -7.444e-02, 8.951e-02, -1.124e-01, 4.295e-01, 1.086e-01, 1.336e-01, 2.645e-01), r);
r = MulAdd(s2_4, M4(-4.062e-01, -6.781e-02, 4.629e-01, -4.931e-01, -1.875e-01, 1.958e-01, -4.560e-01, -2.286e-02, -2.066e-01, 1.151e-01, -5.924e-02, 1.350e-01, -1.752e-01, 2.244e-01, -3.564e-02, -6.129e-01), r);
r = MulAdd(s2_5, M4(6.644e-02, 4.611e-01, 9.200e-02, 6.845e-03, -1.628e-02, 8.352e-02, -1.119e-01, -4.386e-02, -5.822e-02, -4.769e-02, -3.224e-02, -1.235e-01, -3.296e-01, 5.835e-03, 2.231e-01, 5.535e-02), r);
r = MulAdd(s2_6, M4(-2.961e-02, -5.230e-02, 5.124e-02, 6.542e-02, 2.004e-01, 1.189e-01, -1.797e-01, -1.535e-02, 6.469e-02, 1.134e-01, -1.204e-04, -7.606e-02, 2.436e-02, -1.630e-02, 1.841e-01, -2.529e-01), r);
r = MulAdd(s2_7, M4(-1.147e-02, 3.246e-02, 7.626e-02, -1.013e-01, 1.075e-01, 5.871e-01, -5.227e-01, -3.076e-01, 1.609e-01, 5.768e-02, -1.912e-02, 5.898e-02, -7.530e-02, -1.307e-01, 5.828e-02, -1.456e-02), r);
r = MulAdd(s2_8, M4(-7.053e-02, 8.728e-02, 1.211e-01, 1.410e-01, -2.160e-01, 9.970e-02, -5.345e-01, 1.141e-01, 8.112e-04, -4.348e-02, 9.858e-02, 2.780e-02, -1.116e-01, -2.331e-01, 1.545e-01, 7.984e-02), r);
// t1 taps, non-positive parts.
r = MulAdd(s3_0, M4(-5.412e-02, 6.012e-03, -2.395e-01, -1.209e-02, -5.734e-02, 3.058e-02, -7.202e-02, -7.514e-02, 7.241e-03, -1.702e-01, 1.020e+00, 2.997e-01, -2.173e-01, 4.518e-02, -2.703e-02, -4.087e-02), r);
r = MulAdd(s3_1, M4(-5.670e-02, -9.713e-03, -2.091e-01, -1.621e-01, -5.370e-03, -5.579e-02, 1.042e-01, 2.220e-02, 4.788e-01, -6.623e-01, 5.548e-01, 8.186e-01, 2.462e-01, -7.624e-01, -9.065e-02, -1.105e-02), r);
r = MulAdd(s3_2, M4(4.043e-02, -1.577e-01, -3.166e-01, -1.256e-01, -9.515e-02, -8.852e-02, -4.960e-02, 1.129e-01, 1.690e-01, 2.314e-01, -5.134e-01, 9.584e-02, -3.085e-02, 2.399e-01, -3.381e-01, -7.233e-02), r);
r = MulAdd(s3_3, M4(1.750e-01, -9.450e-02, -2.230e-01, 4.190e-01, 8.900e-02, 2.306e-02, 2.783e-01, -3.295e-01, 2.697e+00, 8.855e-02, 5.728e-01, -8.682e-01, 6.085e-02, 5.010e-02, 1.343e-01, 1.137e-01), r);
r = MulAdd(s3_4, M4(9.857e-02, 3.310e-01, -3.584e-01, -5.586e-01, 5.751e-01, -4.023e-01, 3.838e-01, 1.240e-01, -1.482e-01, -1.233e-01, -5.953e-01, 1.534e+00, 3.390e-01, -2.022e-02, 1.619e-01, -2.959e-01), r);
r = MulAdd(s3_5, M4(1.528e-01, 1.593e-01, -1.886e-01, 2.281e-02, 2.174e-01, -8.846e-01, 5.726e-02, 7.369e-03, -1.490e-01, 3.377e-01, -4.669e-02, 1.206e-01, -1.251e-01, 2.600e-01, -2.439e-01, 2.067e-01), r);
r = MulAdd(s3_6, M4(4.090e-02, -2.118e-02, -9.012e-02, -8.624e-03, 1.464e-01, 6.929e-02, 1.492e-01, -4.039e-01, 6.123e-01, 2.679e-01, -2.284e-01, -3.609e-01, -6.598e-02, 1.341e-01, -2.371e-02, -2.899e-01), r);
r = MulAdd(s3_7, M4(5.189e-02, -3.928e-02, 1.670e-01, -1.536e-01, 5.066e-01, -3.768e-01, 6.577e-01, 1.140e-01, -1.537e-01, -1.941e-01, -9.152e-02, -3.571e-02, 1.068e-01, 4.803e-02, -3.180e-01, 4.361e-02), r);
r = MulAdd(s3_8, M4(-8.453e-02, -1.454e-02, 3.613e-02, 8.974e-03, -1.258e-01, -5.842e-01, 3.264e-01, 2.910e-01, 1.306e-01, 4.552e-01, 4.524e-01, 1.065e-02, -1.792e-02, 1.875e-02, -2.206e-01, 2.028e-01), r);
return r;
}
// Pass 2 driver (conv1): handles one input texel per thread. It reads the 3x3
// neighbourhoods of t0 (through l0) and t1 (through l1), splits every tap into
// max(tap, 0) and -max(-tap, 0), and writes the f0 result to t2 and the f1 result
// to t3 for that texel.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0/l1 expands to
	// `pos + p * pt`.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw t0 taps in row-major order: a0 = (-1,-1), a4 = centre, a8 = (1,1).
	const V4 a0 = l0(-1.0, -1.0);
	const V4 a1 = l0(0.0, -1.0);
	const V4 a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0);
	const V4 a4 = l0(0.0, 0.0);
	const V4 a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0);
	const V4 a7 = l0(0.0, 1.0);
	const V4 a8 = l0(1.0, 1.0);

	// Raw t1 taps, same ordering.
	const V4 b0 = l1(-1.0, -1.0);
	const V4 b1 = l1(0.0, -1.0);
	const V4 b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0);
	const V4 b4 = l1(0.0, 0.0);
	const V4 b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0);
	const V4 b7 = l1(0.0, 1.0);
	const V4 b8 = l1(1.0, 1.0);

	// Non-negative (Pos) and non-positive (Neg) parts of the t0 taps.
	const V4 aPos0 = max(a0, 0.0);
	const V4 aPos1 = max(a1, 0.0);
	const V4 aPos2 = max(a2, 0.0);
	const V4 aPos3 = max(a3, 0.0);
	const V4 aPos4 = max(a4, 0.0);
	const V4 aPos5 = max(a5, 0.0);
	const V4 aPos6 = max(a6, 0.0);
	const V4 aPos7 = max(a7, 0.0);
	const V4 aPos8 = max(a8, 0.0);
	const V4 aNeg0 = -max(-a0, 0.0);
	const V4 aNeg1 = -max(-a1, 0.0);
	const V4 aNeg2 = -max(-a2, 0.0);
	const V4 aNeg3 = -max(-a3, 0.0);
	const V4 aNeg4 = -max(-a4, 0.0);
	const V4 aNeg5 = -max(-a5, 0.0);
	const V4 aNeg6 = -max(-a6, 0.0);
	const V4 aNeg7 = -max(-a7, 0.0);
	const V4 aNeg8 = -max(-a8, 0.0);

	// Non-negative (Pos) and non-positive (Neg) parts of the t1 taps.
	const V4 bPos0 = max(b0, 0.0);
	const V4 bPos1 = max(b1, 0.0);
	const V4 bPos2 = max(b2, 0.0);
	const V4 bPos3 = max(b3, 0.0);
	const V4 bPos4 = max(b4, 0.0);
	const V4 bPos5 = max(b5, 0.0);
	const V4 bPos6 = max(b6, 0.0);
	const V4 bPos7 = max(b7, 0.0);
	const V4 bPos8 = max(b8, 0.0);
	const V4 bNeg0 = -max(-b0, 0.0);
	const V4 bNeg1 = -max(-b1, 0.0);
	const V4 bNeg2 = -max(-b2, 0.0);
	const V4 bNeg3 = -max(-b3, 0.0);
	const V4 bNeg4 = -max(-b4, 0.0);
	const V4 bNeg5 = -max(-b5, 0.0);
	const V4 bNeg6 = -max(-b6, 0.0);
	const V4 bNeg7 = -max(-b7, 0.0);
	const V4 bNeg8 = -max(-b8, 0.0);

	// Both kernels take the same 36 inputs: t0 Pos, t0 Neg, t1 Pos, t1 Neg.
	t2[texel] = f0(
		aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
		aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
		bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
		bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
	t3[texel] = f1(
		aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
		aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
		bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
		bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Input fetch helpers for this pass: l0 reads t2 and l1 reads t3, each through the O macro,
// with the result converted to V4. (x, y) is forwarded to O as a float2.
// NOTE(review): O and V4 are defined outside this chunk; presumably O samples the texture at
// the current pixel position shifted by (x, y) -- confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// conv2, first output vector: folds the 36 V4 inputs prepared by Pass3 into one V4,
// which Pass3 stores to t0.
// Parameter layout as passed by Pass3:
//   s0_k = max(l0 sample k, 0)      s1_k = -max(-(l0 sample k), 0)   (l0 reads t2)
//   s2_k = max(l1 sample k, 0)      s3_k = -max(-(l1 sample k), 0)   (l1 reads t3)
//   k = 0..8 walks the nine offsets row by row, from (-1, -1) to (1, 1).
// NOTE(review): V4, M4 and MulAdd are defined outside this chunk; presumably
// MulAdd(v, m, a) evaluates mul(v, m) + a, which would make the initial constants
// a bias term -- confirm.
// Each statement feeds the previous r into the next MulAdd, so the statement order is
// also the floating-point accumulation order; do not reorder.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { 3.575e-03, 3.041e-03, 1.241e-02, -2.230e-03 };
// One MulAdd per input vector, each with its own 16-constant M4.
r = MulAdd(s0_0, M4(8.130e-02, -1.243e-01, -7.648e-02, -2.424e-01, -4.742e-02, -5.420e-02, 4.117e-02, 1.568e-01, -3.621e-02, 2.032e-01, 4.484e-02, 1.249e-02, -1.505e-01, 7.294e-02, 4.943e-02, -6.336e-02), r);
r = MulAdd(s0_1, M4(-1.474e-01, -3.366e-01, -5.670e-01, 4.113e-02, -1.260e-01, -1.539e-01, -5.421e-02, 1.779e-01, -1.072e-01, 1.209e-01, 4.423e-02, 2.454e-01, -5.430e-02, -1.442e-01, -1.501e-02, -4.731e-02), r);
r = MulAdd(s0_2, M4(6.444e-02, 1.509e-01, 1.452e-01, -2.840e-02, 9.365e-02, 2.016e-01, 1.002e-01, -3.226e-02, -1.186e-01, 1.535e-01, -1.652e-01, -1.104e-02, 4.170e-02, -4.404e-02, 1.189e-01, 1.007e-02), r);
r = MulAdd(s0_3, M4(-4.618e-02, 1.024e-01, -1.723e-01, -1.354e-01, 1.981e-01, -1.992e-01, 1.670e-01, 3.857e-01, -6.927e-03, 9.087e-02, 1.176e-01, 3.314e-01, 9.860e-02, 4.009e-04, 1.061e-01, -6.930e-02), r);
r = MulAdd(s0_4, M4(4.923e-01, -9.248e-02, 8.616e-03, 4.541e-02, -1.148e-01, 3.990e-03, -3.218e-02, 8.942e-02, 3.219e-02, -9.786e-02, 6.813e-02, 2.492e-01, -3.165e-01, 6.925e-02, -9.826e-02, 4.518e-01), r);
r = MulAdd(s0_5, M4(2.222e-02, 1.046e-01, -2.327e-02, -7.823e-02, 3.540e-01, 3.363e-01, -4.089e-02, 1.292e-02, -2.530e-01, 4.606e-01, -6.191e-02, -3.673e-02, -2.764e-01, -1.360e-01, -2.947e-03, 2.534e-02), r);
r = MulAdd(s0_6, M4(-2.067e-02, -1.566e-01, -8.968e-02, 1.386e-03, -8.841e-02, -1.077e-01, 1.646e-01, 1.987e-01, -3.098e-01, 2.764e-01, 1.935e-01, 1.847e-01, 1.116e-01, -1.514e-01, -5.175e-02, 8.710e-02), r);
r = MulAdd(s0_7, M4(1.266e-02, -2.119e-01, -1.610e-01, -6.512e-02, -1.679e-01, 2.247e-01, -5.854e-02, 1.200e-02, -1.406e-01, 4.393e-01, -8.517e-02, 3.281e-02, -1.177e-01, -1.861e-01, -3.241e-01, -2.918e-02), r);
r = MulAdd(s0_8, M4(-3.015e-02, -1.605e-01, -1.001e-01, 7.795e-03, -5.873e-02, -7.686e-02, -1.448e-01, -1.851e-02, -2.172e-01, 1.977e-01, -1.333e-01, -8.894e-02, -8.939e-03, 1.675e-01, -7.976e-03, 4.020e-02), r);
r = MulAdd(s1_0, M4(1.165e-01, -4.833e-02, 4.750e-02, -4.032e-02, -2.287e-02, -4.825e-02, 9.058e-02, 2.136e-01, 1.009e-01, -2.133e-02, 4.162e-02, -6.816e-02, -9.863e-02, -4.160e-03, -2.467e-02, -9.096e-02), r);
r = MulAdd(s1_1, M4(8.597e-02, -2.205e-01, 1.515e-01, -2.918e-02, -1.099e-01, -4.171e-02, 3.893e-04, -5.273e-03, -2.046e-02, -3.905e-03, 7.793e-04, 5.930e-02, 2.653e-02, -2.546e-01, -8.456e-02, -6.554e-02), r);
r = MulAdd(s1_2, M4(-1.058e-01, 3.302e-01, 1.812e-01, 6.427e-02, -4.601e-02, -1.589e-02, 4.405e-02, -1.366e-02, -5.996e-03, -5.402e-04, 3.237e-02, -5.725e-02, -7.486e-02, 1.358e-01, 4.739e-02, -2.432e-02), r);
r = MulAdd(s1_3, M4(3.333e-02, 5.179e-01, -1.939e-03, 7.798e-02, 2.011e-02, -2.959e-01, 1.135e-01, 3.122e-01, 8.651e-02, -2.708e-02, 7.183e-03, 4.554e-02, -3.342e-02, 9.136e-03, -7.067e-02, -1.867e-01), r);
r = MulAdd(s1_4, M4(6.231e-01, 9.512e-01, 3.523e-01, 3.744e-01, 2.388e-01, -2.827e-01, 9.968e-02, -5.306e-02, -4.498e-02, -2.222e-01, -5.865e-02, 2.967e-02, -3.029e-01, -2.137e-01, -5.363e-01, 8.872e-02), r);
r = MulAdd(s1_5, M4(-4.862e-02, 7.326e-01, 1.354e-01, 5.607e-02, 1.667e-01, -1.184e-01, -1.304e-01, 6.817e-02, 3.287e-02, 3.310e-01, 1.521e-01, -3.212e-02, -8.947e-02, 4.250e-02, -9.770e-02, -8.344e-02), r);
r = MulAdd(s1_6, M4(-9.242e-04, 4.835e-03, 1.322e-01, 3.745e-02, 9.613e-02, -8.310e-03, 4.718e-02, 2.763e-02, -1.616e-02, 6.167e-02, -3.382e-02, 3.624e-02, 1.213e-02, -2.014e-01, -2.776e-03, 4.360e-02), r);
r = MulAdd(s1_7, M4(-6.861e-02, 4.772e-02, -3.779e-02, 7.567e-02, -8.548e-02, -1.028e-02, 1.881e-02, 2.421e-03, 1.378e-01, 1.305e-01, 2.177e-02, -1.118e-03, 5.861e-02, -1.416e-01, -3.140e-01, -9.031e-02), r);
r = MulAdd(s1_8, M4(-4.147e-02, 1.546e-01, 5.650e-02, 4.098e-02, -1.460e-01, -5.779e-02, -1.959e-02, -2.318e-02, 3.538e-02, -5.044e-02, 3.304e-02, -3.517e-03, -1.176e-01, -3.185e-01, -1.738e-01, -4.349e-02), r);
r = MulAdd(s2_0, M4(-3.428e-03, 6.059e-02, 7.024e-02, 2.739e-02, 1.313e-02, -5.748e-02, 9.005e-03, -7.139e-03, 1.165e-01, -1.541e-01, 1.493e-01, 2.725e-01, 3.254e-02, -2.934e-02, 1.115e-02, -2.844e-02), r);
r = MulAdd(s2_1, M4(-8.601e-03, -3.177e-03, 1.878e-01, 1.106e-01, 1.951e-02, 8.194e-02, 4.971e-02, 5.805e-02, 2.515e-02, -2.529e-01, -2.250e-01, 3.498e-02, 7.183e-02, -8.617e-02, -8.616e-02, 1.623e-01), r);
r = MulAdd(s2_2, M4(-8.072e-02, -1.234e-01, 3.482e-02, -2.873e-02, -4.049e-02, 4.828e-03, 1.940e-02, 3.828e-02, -5.156e-03, 4.585e-03, 2.326e-02, 2.346e-02, -8.908e-02, -1.384e-03, -2.366e-02, 1.290e-02), r);
r = MulAdd(s2_3, M4(4.921e-02, 1.726e-01, 3.832e-02, -2.490e-01, -1.152e-01, -1.722e-01, -1.705e-01, 4.228e-01, -8.215e-02, -1.478e-02, 1.554e-01, 3.701e-01, -8.863e-02, 1.068e-01, 8.890e-03, 6.324e-02), r);
r = MulAdd(s2_4, M4(1.307e-01, 2.312e-01, -1.734e-01, 2.083e-02, -1.966e-01, -3.991e-01, -8.681e-02, 1.976e-03, -3.177e-01, 1.528e-01, -2.329e-01, 2.569e-01, -6.230e-03, 6.020e-02, 4.969e-02, -2.039e-01), r);
r = MulAdd(s2_5, M4(1.660e-01, 1.642e-02, 7.203e-02, -1.613e-01, 6.225e-02, 6.470e-02, 3.305e-03, 2.230e-02, -2.455e-02, 6.599e-02, -1.740e-01, 7.887e-02, 3.463e-03, 1.003e-01, -1.850e-01, 7.885e-02), r);
r = MulAdd(s2_6, M4(-2.170e-02, 1.372e-01, 7.445e-02, -9.419e-02, -1.851e-01, 4.957e-02, -2.454e-01, 5.879e-02, -5.800e-02, -1.122e-01, 7.445e-02, 1.190e-01, 2.695e-02, -5.701e-02, -5.166e-02, -5.058e-02), r);
r = MulAdd(s2_7, M4(5.390e-01, 1.674e-01, 1.213e-01, -1.147e-01, -6.939e-02, -1.218e-01, -2.891e-01, 2.682e-02, -2.636e-01, -1.104e-01, -1.556e-01, 3.774e-02, -4.121e-02, -2.431e-01, -1.248e-01, 1.275e-01), r);
r = MulAdd(s2_8, M4(1.053e-01, 2.238e-01, -1.104e-01, 5.372e-02, 6.179e-02, -2.431e-03, -4.843e-02, 3.820e-02, -7.539e-02, 7.898e-02, 7.562e-03, 1.596e-02, 7.298e-02, -1.553e-01, -3.545e-01, 1.990e-02), r);
r = MulAdd(s3_0, M4(8.232e-02, -6.815e-02, -7.421e-02, -3.191e-02, -1.592e-01, 2.814e-01, 5.009e-02, 3.669e-02, -5.908e-02, -5.445e-02, 4.873e-02, 1.538e-01, 1.065e-01, -2.194e-01, -2.612e-02, -2.297e-02), r);
r = MulAdd(s3_1, M4(1.431e-02, -7.835e-02, -2.790e-03, 9.305e-02, -2.975e-01, 1.527e-01, 1.888e-01, -1.279e-02, -1.938e-02, -1.022e-01, -2.197e-02, -2.919e-02, 2.192e-01, -8.056e-02, 1.328e-03, 3.478e-02), r);
r = MulAdd(s3_2, M4(4.920e-03, -6.286e-02, -7.779e-02, 1.075e-01, -1.092e-01, 2.909e-01, 3.056e-01, -9.017e-02, -3.625e-02, 1.079e-01, 1.107e-01, 6.613e-02, 1.696e-01, -1.852e-01, -1.253e-01, -9.675e-02), r);
r = MulAdd(s3_3, M4(-6.350e-02, 1.137e-01, -3.559e-02, -1.684e-01, -2.044e-01, -9.368e-02, 2.283e-01, 8.052e-01, 4.476e-03, -1.599e-01, 2.594e-02, 1.582e-01, -2.483e-02, 9.216e-02, 5.719e-02, 2.237e-01), r);
r = MulAdd(s3_4, M4(-1.694e-01, 1.597e-01, -3.311e-01, 1.880e-01, 2.614e-01, -2.584e-01, 5.296e-02, 9.726e-02, -3.932e-02, -7.518e-02, -1.749e-01, 1.604e-01, 1.008e-01, 2.920e-01, 5.358e-01, -6.383e-01), r);
r = MulAdd(s3_5, M4(-2.706e-01, -2.716e-01, -4.196e-01, 1.023e-01, 2.201e-01, -1.412e-01, 1.003e-01, -6.972e-02, 3.727e-02, -8.424e-02, -7.870e-02, 2.294e-02, 2.836e-01, -4.165e-01, -2.974e-01, -3.567e-02), r);
r = MulAdd(s3_6, M4(-3.434e-02, 6.420e-02, -8.729e-02, -8.600e-02, -2.041e-01, 1.646e-02, 9.025e-02, 1.724e-01, -4.951e-02, -3.894e-02, -7.985e-02, 1.580e-02, 2.554e-01, -3.100e-01, -2.769e-01, 8.336e-05), r);
r = MulAdd(s3_7, M4(-6.557e-02, 3.865e-02, -3.263e-02, 4.621e-02, -2.077e-01, 2.705e-02, -3.354e-01, 1.480e-01, 4.155e-02, -2.143e-01, -2.626e-01, 1.091e-02, 1.382e-01, -1.706e-01, -1.355e-01, -7.700e-02), r);
r = MulAdd(s3_8, M4(-2.004e-01, 4.575e-01, -1.812e-01, 6.102e-02, 3.469e-01, -6.634e-02, 1.302e-01, -9.621e-02, 4.023e-02, 1.048e-01, -9.194e-02, 5.130e-03, 4.272e-01, -5.971e-01, -2.025e-01, -1.364e-01), r);
return r;
}
// conv2, second output vector: same inputs and structure as f0 of this pass but with a
// different set of constants; Pass3 stores the result to t1.
// Parameter layout as passed by Pass3:
//   s0_k = max(l0 sample k, 0)      s1_k = -max(-(l0 sample k), 0)   (l0 reads t2)
//   s2_k = max(l1 sample k, 0)      s3_k = -max(-(l1 sample k), 0)   (l1 reads t3)
//   k = 0..8 walks the nine offsets row by row, from (-1, -1) to (1, 1).
// NOTE(review): V4, M4 and MulAdd are defined outside this chunk; presumably
// MulAdd(v, m, a) evaluates mul(v, m) + a -- confirm.
// The statement order is the floating-point accumulation order; do not reorder.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { -8.611e-03, -6.529e-03, -1.098e-03, 4.669e-03 };
// One MulAdd per input vector, each with its own 16-constant M4.
r = MulAdd(s0_0, M4(-1.437e-01, -9.784e-02, 2.649e-01, -8.638e-02, -1.746e-01, 2.031e-01, 1.203e-01, -8.812e-02, -2.317e-01, 2.311e-01, 3.171e-02, -3.619e-02, -7.798e-02, -2.507e-02, 1.902e-01, 5.780e-02), r);
r = MulAdd(s0_1, M4(2.504e-01, 1.577e-01, -5.397e-02, 4.599e-01, -1.392e-01, 2.560e-01, 1.018e-01, 7.968e-02, 2.247e-01, -2.962e-03, 1.421e-03, -1.201e-01, -3.622e-01, 1.378e-01, 1.392e-01, 1.641e-01), r);
r = MulAdd(s0_2, M4(-6.143e-02, -6.336e-02, 1.131e-01, 6.811e-02, -5.817e-02, 7.362e-02, 1.407e-01, 1.823e-02, 4.880e-01, -2.282e-01, -2.704e-01, -4.287e-01, -2.741e-01, 3.163e-02, 1.098e-01, 1.514e-01), r);
r = MulAdd(s0_3, M4(1.794e-01, 1.720e-01, -4.092e-01, 1.277e-01, -1.938e-01, 3.107e-01, 2.915e-01, 2.279e-01, 2.259e-01, 2.136e-01, 5.867e-02, 2.359e-01, -1.589e-01, 1.132e-01, 6.871e-02, 2.837e-01), r);
r = MulAdd(s0_4, M4(-3.070e-01, -4.494e-01, 5.817e-02, 5.153e-01, 5.215e-01, 5.410e-01, 1.286e-01, -5.596e-01, 4.287e-01, 1.821e-01, 1.542e-01, 3.755e-01, 3.820e-01, 2.953e-01, -2.768e-01, -6.977e-02), r);
r = MulAdd(s0_5, M4(-4.881e-02, 2.327e-02, 9.209e-02, -2.102e-02, -1.394e-01, -8.093e-03, 2.263e-01, -4.307e-01, 1.998e-01, -8.793e-02, -1.057e-01, -1.899e-01, 1.577e-01, 3.435e-01, 6.721e-02, 3.093e-01), r);
r = MulAdd(s0_6, M4(7.516e-02, -1.224e-01, 1.257e-02, -6.769e-02, -8.618e-02, 1.283e-01, 2.060e-01, -1.966e-01, 8.166e-02, -1.263e-01, -2.269e-01, -3.272e-01, -3.439e-02, -2.849e-01, 2.105e-01, -3.015e-03), r);
r = MulAdd(s0_7, M4(7.447e-02, -8.731e-02, 2.804e-02, -4.819e-02, -3.311e-01, 3.824e-01, 7.766e-02, 5.672e-02, 4.014e-01, -4.037e-03, 2.287e-01, 5.626e-02, 3.481e-01, -1.010e-01, -1.156e-01, -2.865e-01), r);
r = MulAdd(s0_8, M4(5.454e-02, -5.590e-02, 3.408e-02, 3.551e-03, 1.262e-02, 8.638e-02, 1.222e-01, 3.418e-01, -2.154e-01, 1.868e-01, 1.210e-01, -2.330e-01, -4.810e-02, -5.190e-02, -8.587e-02, -2.145e-01), r);
r = MulAdd(s1_0, M4(-3.063e-01, -1.830e-02, 5.167e-01, 4.813e-02, -7.310e-02, 1.443e-01, 1.654e-01, 1.158e-01, 4.789e-02, -3.030e-02, -1.358e-01, 2.986e-02, -4.855e-02, -7.736e-02, 4.514e-01, -1.797e-02), r);
r = MulAdd(s1_1, M4(4.322e-01, -1.369e-01, 9.431e-02, 3.921e-01, 2.708e-02, -1.218e-02, -9.091e-02, 1.871e-01, 3.763e-02, -9.213e-02, -1.209e-01, -1.587e-01, 3.014e-03, 1.816e-01, 3.099e-01, 3.210e-01), r);
r = MulAdd(s1_2, M4(-7.234e-02, 1.685e-02, 4.444e-01, -1.886e-01, -9.543e-03, 3.966e-02, 1.105e-01, 4.870e-02, 9.471e-02, -5.263e-02, -1.085e-01, 4.226e-02, -1.565e-01, -3.812e-02, 1.708e-01, 1.457e-01), r);
r = MulAdd(s1_3, M4(2.370e-01, -3.354e-02, -9.648e-02, 1.531e-01, -3.468e-01, -3.957e-02, 3.152e-01, 3.402e-02, 3.762e-02, 9.507e-02, 7.836e-02, 9.088e-03, -1.614e-01, 4.377e-02, 4.748e-02, 1.055e-01), r);
r = MulAdd(s1_4, M4(2.342e-01, -5.059e-01, 2.781e-01, 2.906e-01, 1.656e-01, 1.268e-01, 1.183e-01, -2.458e-02, 2.290e-01, 1.779e-01, -8.310e-02, 1.389e-01, 7.282e-02, 1.050e-01, -3.525e-01, 6.810e-02), r);
r = MulAdd(s1_5, M4(1.078e-01, -4.451e-02, 7.031e-02, -2.977e-01, 3.596e-02, 3.359e-02, 9.589e-03, 9.070e-02, -1.862e-01, -1.863e-01, -9.652e-02, -5.039e-02, 1.004e-01, 1.598e-01, 1.466e-01, 2.349e-01), r);
r = MulAdd(s1_6, M4(1.109e-02, -1.607e-01, 1.578e-02, -1.971e-01, 5.020e-02, -7.597e-02, 7.238e-02, 7.241e-02, 2.025e-02, -2.246e-02, 4.652e-02, -8.760e-02, -1.111e-02, 1.890e-02, 1.046e-01, -2.233e-03), r);
r = MulAdd(s1_7, M4(1.252e-01, -8.046e-02, -1.321e-01, -3.724e-01, -1.383e-01, 1.151e-01, 5.397e-02, -1.422e-01, 8.319e-02, 9.089e-02, -2.620e-02, 1.662e-01, 2.847e-02, -1.255e-01, 6.933e-02, -1.636e-01), r);
r = MulAdd(s1_8, M4(-1.517e-01, 3.661e-02, -3.135e-01, -3.395e-01, -1.139e-01, 1.973e-01, 8.547e-03, -3.118e-02, -8.869e-02, -1.209e-01, 1.867e-02, -4.531e-02, 1.016e-01, -6.909e-02, 1.436e-01, 1.663e-01), r);
r = MulAdd(s2_0, M4(-9.314e-02, 1.395e-02, -1.741e-02, -7.208e-02, -5.164e-02, -5.743e-02, 5.702e-02, 1.342e-01, 6.011e-03, 1.626e-01, 1.101e-01, -1.130e-01, 6.127e-02, -8.956e-03, -7.149e-02, -6.488e-03), r);
r = MulAdd(s2_1, M4(-2.534e-01, 1.086e-01, -1.007e-01, -3.067e-02, -1.074e-01, 7.219e-03, 6.768e-02, -1.012e-01, 2.019e-01, 4.263e-03, -7.411e-02, -1.173e-01, 1.961e-01, -5.619e-02, -2.390e-01, -1.323e-01), r);
r = MulAdd(s2_2, M4(-1.039e-01, -9.899e-02, -2.206e-01, -2.187e-01, -8.739e-03, 6.607e-02, 4.125e-02, 5.363e-02, -6.572e-03, 3.014e-02, 1.314e-01, -9.560e-02, 2.106e-01, 1.237e-02, -8.354e-02, -4.939e-03), r);
r = MulAdd(s2_3, M4(-4.682e-02, -1.357e-01, 3.481e-02, -2.187e-01, 1.113e-01, 8.812e-02, -1.211e-01, -2.011e-02, 1.567e-01, -2.216e-02, -4.920e-03, -2.458e-01, 2.263e-02, 6.741e-02, -1.234e-02, 2.338e-02), r);
r = MulAdd(s2_4, M4(5.105e-02, -3.845e-01, 1.812e-01, -1.927e-01, 2.840e-01, -2.094e-01, 5.673e-02, 4.405e-02, 5.957e-01, 1.734e-02, -1.158e-01, -6.956e-01, -2.077e-01, 5.130e-03, 4.744e-01, -1.540e-02), r);
r = MulAdd(s2_5, M4(1.601e-01, -2.680e-01, -1.678e-01, -1.207e-01, -4.648e-02, -6.454e-02, 1.122e-01, -6.567e-02, 1.638e-01, -1.259e-01, -2.470e-02, -3.547e-01, -1.333e-01, -1.219e-02, -7.710e-02, -3.881e-01), r);
r = MulAdd(s2_6, M4(-6.060e-02, 1.662e-01, -2.082e-01, 3.193e-01, -1.317e-01, 1.395e-04, 2.436e-01, -1.480e-01, 6.104e-03, -2.009e-01, -6.729e-02, -2.207e-01, -7.784e-02, -7.589e-02, 7.569e-02, 3.261e-03), r);
r = MulAdd(s2_7, M4(-2.951e-01, -2.050e-01, 2.827e-02, 3.739e-01, 1.947e-01, 5.411e-01, -2.262e-01, -8.808e-03, 2.262e-01, -9.010e-02, -1.476e-01, -3.582e-01, -1.718e-01, 2.844e-02, 7.832e-02, 1.414e-03), r);
r = MulAdd(s2_8, M4(3.534e-01, 1.695e-01, -1.247e-01, 4.750e-01, 4.171e-02, 2.338e-02, -4.525e-02, -4.955e-02, 2.934e-01, -3.865e-02, -1.125e-01, -2.127e-01, 1.326e-01, 5.967e-02, 6.215e-02, 1.048e-01), r);
r = MulAdd(s3_0, M4(4.186e-02, -5.378e-02, 7.641e-02, -3.524e-02, -2.447e-01, -5.374e-02, -1.380e-01, -4.221e-01, -3.797e-02, -7.623e-03, -4.826e-02, 1.791e-02, -1.390e-01, 1.115e-01, 2.252e-01, -9.103e-03), r);
r = MulAdd(s3_1, M4(1.339e-01, 3.093e-01, -3.615e-02, 8.684e-02, -4.098e-01, -1.216e-01, 2.372e-01, -1.247e-01, -5.358e-02, -1.660e-01, -8.435e-02, 3.871e-02, 2.722e-01, -1.145e-01, -3.944e-01, -5.003e-02), r);
r = MulAdd(s3_2, M4(-4.430e-02, -3.135e-02, 1.019e-01, -1.129e-01, -2.647e-01, -1.317e-01, 8.715e-02, -5.466e-02, -3.946e-02, 7.216e-02, 1.677e-01, 9.349e-02, 8.069e-02, -1.097e-01, -9.659e-03, -8.460e-02), r);
r = MulAdd(s3_3, M4(-5.036e-03, 4.992e-02, 1.086e-01, -1.339e-02, 2.792e-01, 3.294e-01, -1.578e-01, 4.592e-01, -7.749e-02, 4.384e-02, -4.212e-02, 2.287e-02, 1.456e-01, 4.774e-02, -1.264e-01, 7.437e-02), r);
r = MulAdd(s3_4, M4(3.022e-01, -2.197e-01, -4.347e-02, -2.198e-01, 3.922e-02, 8.609e-02, 8.862e-02, 3.418e-01, 8.117e-02, -2.026e-02, -3.236e-01, -2.539e-01, -6.030e-02, -2.409e-01, 7.879e-02, -8.457e-02), r);
r = MulAdd(s3_5, M4(3.525e-01, 2.622e-01, -4.994e-02, -1.932e-01, -1.508e-01, 1.229e-01, 1.359e-01, 1.613e-01, 1.830e-01, -4.473e-02, -5.438e-02, -1.041e-01, 4.534e-01, -4.660e-01, -7.405e-02, -1.001e-01), r);
r = MulAdd(s3_6, M4(-1.224e-02, -5.840e-03, 8.031e-02, -2.279e-02, -2.128e-01, 1.477e-01, -9.937e-03, 4.142e-02, -3.726e-02, -1.013e-01, -2.940e-03, -1.333e-01, 1.353e-01, -2.192e-01, -3.858e-01, -1.100e-01), r);
r = MulAdd(s3_7, M4(-8.882e-02, 1.341e-01, 2.707e-01, 2.212e-01, 2.628e-01, 3.454e-01, -3.703e-01, 4.902e-01, 1.527e-01, 8.567e-03, -1.742e-01, -1.884e-01, -7.710e-01, 1.028e-01, 3.233e-01, -3.897e-01), r);
r = MulAdd(s3_8, M4(3.715e-02, 2.936e-01, -1.195e-01, -1.295e-01, -1.313e-01, -1.222e-01, -2.876e-01, 5.694e-02, 6.813e-02, -1.738e-02, -1.154e-01, 1.649e-02, 1.755e-01, -1.639e-01, 3.212e-02, 3.504e-01), r);
return r;
}
// Entry point of pass 3 (conv2). For one pixel it evaluates l0 (t2) and l1 (t3) at the nine
// offsets (-1..1, -1..1), splits every sample into a non-negative and a non-positive part,
// and writes f0/f1 of the resulting 36 vectors to t0[gxy] and t1[gxy].
//   blockStart: added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid:        only tid.x is read, as the argument of Rmp8x8.
// NOTE(review): Rmp8x8, GetInputPt, GetInputSize and the O macro behind l0/l1 are defined
// outside this chunk. `pt` and `pos` are never referenced by name below, so presumably O
// picks them up from this scope -- confirm before renaming or removing either.
void Pass3(uint2 blockStart, uint3 tid) {
float2 pt = float2(GetInputPt());
uint2 gxy = Rmp8x8(tid.x) + blockStart;
uint2 size = GetInputSize();
// Nothing to do for pixels outside the input size.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// Centre of pixel gxy scaled by pt (presumably a normalized texture coordinate -- confirm).
float2 pos = (gxy + 0.5) * pt;
// t2 samples, row by row from offset (-1, -1) to (1, 1).
V4 s0_0 = l0(-1.0, -1.0);
V4 s0_1 = l0(0.0, -1.0);
V4 s0_2 = l0(1.0, -1.0);
V4 s0_3 = l0(-1.0, 0.0);
V4 s0_4 = l0(0.0, 0.0);
V4 s0_5 = l0(1.0, 0.0);
V4 s0_6 = l0(-1.0, 1.0);
V4 s0_7 = l0(0.0, 1.0);
V4 s0_8 = l0(1.0, 1.0);
// Non-positive part of each t2 sample: -max(-s, 0) keeps s where it is negative, else zero.
// Must be taken before s0_* is overwritten below.
V4 s1_0 = -max(-s0_0, 0.0);
V4 s1_1 = -max(-s0_1, 0.0);
V4 s1_2 = -max(-s0_2, 0.0);
V4 s1_3 = -max(-s0_3, 0.0);
V4 s1_4 = -max(-s0_4, 0.0);
V4 s1_5 = -max(-s0_5, 0.0);
V4 s1_6 = -max(-s0_6, 0.0);
V4 s1_7 = -max(-s0_7, 0.0);
V4 s1_8 = -max(-s0_8, 0.0);
// Non-negative part of each t2 sample, stored back in place.
s0_0 = max(s0_0, 0.0);
s0_1 = max(s0_1, 0.0);
s0_2 = max(s0_2, 0.0);
s0_3 = max(s0_3, 0.0);
s0_4 = max(s0_4, 0.0);
s0_5 = max(s0_5, 0.0);
s0_6 = max(s0_6, 0.0);
s0_7 = max(s0_7, 0.0);
s0_8 = max(s0_8, 0.0);
// Same three steps for the t3 samples.
V4 s2_0 = l1(-1.0, -1.0);
V4 s2_1 = l1(0.0, -1.0);
V4 s2_2 = l1(1.0, -1.0);
V4 s2_3 = l1(-1.0, 0.0);
V4 s2_4 = l1(0.0, 0.0);
V4 s2_5 = l1(1.0, 0.0);
V4 s2_6 = l1(-1.0, 1.0);
V4 s2_7 = l1(0.0, 1.0);
V4 s2_8 = l1(1.0, 1.0);
V4 s3_0 = -max(-s2_0, 0.0);
V4 s3_1 = -max(-s2_1, 0.0);
V4 s3_2 = -max(-s2_2, 0.0);
V4 s3_3 = -max(-s2_3, 0.0);
V4 s3_4 = -max(-s2_4, 0.0);
V4 s3_5 = -max(-s2_5, 0.0);
V4 s3_6 = -max(-s2_6, 0.0);
V4 s3_7 = -max(-s2_7, 0.0);
V4 s3_8 = -max(-s2_8, 0.0);
s2_0 = max(s2_0, 0.0);
s2_1 = max(s2_1, 0.0);
s2_2 = max(s2_2, 0.0);
s2_3 = max(s2_3, 0.0);
s2_4 = max(s2_4, 0.0);
s2_5 = max(s2_5, 0.0);
s2_6 = max(s2_6, 0.0);
s2_7 = max(s2_7, 0.0);
s2_8 = max(s2_8, 0.0);
// Both outputs consume the same 36 inputs; they differ only in the constants inside f0/f1.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// Input fetch helpers for this pass: l0 reads t0 and l1 reads t1, each through the O macro,
// with the result converted to V4. (x, y) is forwarded to O as a float2.
// NOTE(review): O and V4 are defined outside this chunk; presumably O samples the texture at
// the current pixel position shifted by (x, y) -- confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// conv3, first output vector: folds the 36 V4 inputs prepared by Pass4 into one V4,
// which Pass4 stores to t2.
// Parameter layout as passed by Pass4:
//   s0_k = max(l0 sample k, 0)      s1_k = -max(-(l0 sample k), 0)   (l0 reads t0)
//   s2_k = max(l1 sample k, 0)      s3_k = -max(-(l1 sample k), 0)   (l1 reads t1)
//   k = 0..8 walks the nine offsets row by row, from (-1, -1) to (1, 1).
// NOTE(review): V4, M4 and MulAdd are defined outside this chunk; presumably
// MulAdd(v, m, a) evaluates mul(v, m) + a, which would make the initial constants
// a bias term -- confirm.
// Each statement feeds the previous r into the next MulAdd, so the statement order is
// also the floating-point accumulation order; do not reorder.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { -1.057e-02, -1.114e-02, 1.597e-04, 1.132e-02 };
// One MulAdd per input vector, each with its own 16-constant M4.
r = MulAdd(s0_0, M4(7.356e-02, 8.402e-03, 1.287e-01, 6.762e-02, 2.134e-01, -6.620e-02, -2.788e-01, -5.744e-02, -3.896e-02, -3.993e-02, -7.161e-02, -1.982e-01, -6.734e-02, 8.804e-03, -4.739e-02, 6.502e-02), r);
r = MulAdd(s0_1, M4(2.249e-01, 4.958e-02, 1.138e-01, 3.152e-01, 2.008e-01, 1.703e-01, 5.817e-02, -9.482e-02, -2.371e-01, 3.975e-02, -1.755e-01, -2.666e-01, 2.819e-01, -2.640e-02, 1.405e-01, -6.009e-02), r);
r = MulAdd(s0_2, M4(2.065e-01, -3.027e-02, -3.447e-02, 3.226e-03, -1.252e-02, -7.589e-03, 2.344e-03, -1.704e-02, -8.894e-02, 3.136e-02, -1.517e-01, -2.176e-02, 8.920e-02, -5.322e-02, -9.529e-02, 8.355e-02), r);
r = MulAdd(s0_3, M4(1.136e-01, 1.015e-01, -2.730e-02, -2.144e-01, -9.526e-02, -2.857e-01, 2.711e-01, -1.991e-01, 2.596e-01, 1.602e-01, -2.169e-01, -1.097e-01, 3.353e-02, 6.231e-02, 8.753e-03, 3.707e-01), r);
r = MulAdd(s0_4, M4(-1.945e-01, 3.081e-01, -2.270e-01, -5.963e-02, -1.666e-01, -3.408e-01, 1.161e-01, -6.384e-02, -6.823e-01, -4.014e-01, -6.276e-01, -1.672e-01, 2.986e-03, -1.351e-01, 1.668e-01, -3.133e-01), r);
r = MulAdd(s0_5, M4(4.802e-02, -4.275e-02, 1.978e-03, -7.602e-02, -4.082e-03, 5.572e-02, -3.341e-02, 9.101e-03, -1.038e-01, 1.622e-01, 2.334e-02, 1.768e-01, 9.416e-03, -2.287e-01, 1.048e-01, -2.926e-01), r);
r = MulAdd(s0_6, M4(5.333e-04, 3.089e-02, 2.721e-02, -3.601e-02, -5.081e-02, -1.152e-01, 6.752e-02, 1.701e-01, -2.951e-02, 2.450e-01, -1.684e-01, -4.702e-02, -1.580e-02, 1.200e-02, -1.266e-02, 4.937e-02), r);
r = MulAdd(s0_7, M4(-1.351e-02, -6.248e-02, -3.060e-03, 4.140e-02, -2.090e-01, -6.831e-01, -8.857e-02, 2.536e-01, -2.333e-02, 1.521e-01, -8.033e-02, 2.124e-01, -6.615e-02, 1.317e-01, 1.847e-01, -2.150e-01), r);
r = MulAdd(s0_8, M4(4.605e-02, 1.013e-01, 6.834e-03, -6.411e-02, -1.476e-02, -2.845e-01, -4.312e-02, -1.171e-02, 6.985e-02, -6.859e-02, -2.785e-02, -3.226e-02, 5.186e-02, 1.102e-01, -2.071e-02, -1.250e-01), r);
r = MulAdd(s1_0, M4(1.952e-01, -3.342e-02, -3.770e-02, -2.026e-01, 4.850e-02, -3.174e-02, -1.987e-01, -2.886e-02, -1.298e-01, 1.994e-02, 1.131e-01, 2.950e-02, -1.791e-02, -4.533e-02, 4.695e-02, -6.907e-02), r);
r = MulAdd(s1_1, M4(2.401e-01, 1.809e-01, -5.151e-02, -6.271e-02, -1.409e-01, 9.215e-03, 1.176e-01, 2.717e-02, 1.130e-01, -3.228e-02, -9.086e-02, -1.202e-03, 1.642e-03, -7.943e-03, 1.097e-01, 1.842e-01), r);
r = MulAdd(s1_2, M4(8.774e-02, -1.486e-02, -4.808e-02, 4.089e-02, 6.244e-02, -7.645e-02, 5.614e-02, -5.706e-02, -2.386e-02, 4.407e-02, -1.378e-01, -5.880e-02, 2.936e-02, 2.285e-02, -3.924e-02, 5.724e-02), r);
r = MulAdd(s1_3, M4(2.603e-01, -1.455e-01, 1.429e-01, -2.992e-02, -6.288e-02, -5.216e-02, -1.802e-01, 1.060e-01, -2.473e-02, -6.795e-03, 2.843e-02, 7.745e-02, -4.868e-03, -9.998e-02, -7.961e-02, 5.068e-02), r);
r = MulAdd(s1_4, M4(2.018e-01, -1.293e-01, -5.291e-02, -4.763e-02, 3.484e-02, -1.648e-01, 8.786e-02, -6.101e-02, -1.083e-01, 5.522e-02, -1.814e-01, -2.392e-01, 6.427e-02, -1.908e-02, 2.643e-01, 1.294e-01), r);
r = MulAdd(s1_5, M4(-7.897e-02, -5.967e-02, -2.620e-01, 1.274e-02, -2.583e-02, 5.654e-02, -7.639e-02, -7.534e-03, -5.812e-02, -7.887e-02, -3.738e-03, 7.664e-02, 1.753e-02, -2.842e-01, -3.237e-01, 2.077e-02), r);
r = MulAdd(s1_6, M4(6.558e-02, -9.890e-02, 1.849e-02, 3.242e-04, 1.021e-02, 1.234e-01, 1.224e-02, -4.322e-02, -2.778e-02, 3.860e-02, -5.257e-02, -1.466e-02, -1.001e-02, -1.291e-03, 1.724e-01, -9.167e-02), r);
r = MulAdd(s1_7, M4(-5.291e-02, -2.764e-01, -6.402e-02, 4.327e-02, 1.921e-02, -1.484e-01, 3.286e-02, 4.051e-02, 1.636e-02, 3.932e-01, -5.432e-02, 4.540e-02, 3.947e-02, -1.385e-01, -1.065e-01, 1.569e-01), r);
r = MulAdd(s1_8, M4(-1.729e-02, 8.177e-02, -4.479e-02, -1.275e-01, -3.302e-03, -1.265e-01, -2.922e-02, 3.720e-02, 1.560e-02, 5.266e-02, -1.572e-02, -4.840e-02, 3.991e-03, 1.003e-01, -1.423e-01, 7.414e-02), r);
r = MulAdd(s2_0, M4(-1.207e-02, -2.418e-02, -7.769e-03, -1.401e-01, 1.660e-01, -6.347e-03, -1.092e-02, -1.830e-02, -1.252e-01, -5.217e-02, 9.898e-03, 1.461e-02, 2.654e-02, 1.219e-02, -3.769e-02, 1.897e-02), r);
r = MulAdd(s2_1, M4(-3.650e-02, 1.317e-01, 1.299e-02, -5.512e-02, -1.287e-01, 2.438e-02, -1.609e-03, 1.759e-01, 1.824e-02, 6.477e-03, 2.905e-02, -8.644e-02, 7.496e-02, -9.920e-02, 1.147e-02, 1.889e-01), r);
r = MulAdd(s2_2, M4(-7.005e-03, -4.482e-02, -1.853e-02, 3.441e-02, 1.251e-01, -3.162e-02, -1.701e-01, -5.231e-02, -1.647e-01, 2.261e-02, 8.255e-02, -3.730e-02, 1.811e-01, -9.052e-02, 1.728e-02, 1.911e-02), r);
r = MulAdd(s2_3, M4(2.359e-02, -1.334e-01, 2.761e-02, -1.251e-01, 1.455e-01, 4.076e-02, -3.260e-02, -1.782e-01, -3.575e-02, 1.411e-02, 1.322e-01, -9.592e-02, 5.423e-02, 7.989e-03, -1.460e-01, 8.895e-02), r);
r = MulAdd(s2_4, M4(1.304e-01, 1.296e-01, -7.250e-02, -6.647e-02, 8.382e-02, 1.111e-01, 8.976e-02, -5.914e-02, -2.228e-01, -4.772e-02, -1.931e-03, 8.499e-02, 4.483e-01, 1.327e-01, 5.086e-02, -4.795e-01), r);
r = MulAdd(s2_5, M4(4.674e-02, 7.104e-02, -5.312e-02, -7.730e-02, 2.647e-03, 8.893e-03, -8.889e-02, -5.714e-02, -4.546e-02, -4.002e-02, -1.514e-01, -2.989e-02, -8.669e-02, -5.441e-03, 1.460e-02, -2.327e-02), r);
r = MulAdd(s2_6, M4(1.146e-01, -1.154e-01, 8.289e-03, 7.655e-02, -2.194e-02, -3.908e-02, -2.191e-02, 2.363e-03, 4.527e-02, -7.852e-02, -4.728e-02, 1.066e-01, 4.023e-02, -5.192e-02, -4.180e-02, -3.879e-02), r);
r = MulAdd(s2_7, M4(2.446e-01, -2.295e-01, -5.819e-02, -2.646e-02, 8.106e-02, -8.799e-02, -3.455e-02, 6.900e-02, 5.579e-02, -1.551e-01, 1.609e-01, 9.954e-02, -1.499e-01, 8.628e-02, 1.114e-01, 1.313e-02), r);
r = MulAdd(s2_8, M4(1.028e-02, 9.150e-02, -6.161e-02, 5.124e-03, 3.822e-02, 1.533e-02, 2.329e-02, -1.106e-01, -1.541e-03, -1.818e-01, -9.577e-02, -3.402e-02, 1.784e-02, -1.152e-01, 6.896e-02, -1.111e-01), r);
r = MulAdd(s3_0, M4(-7.349e-02, -4.782e-02, 3.080e-02, -1.668e-01, 9.572e-02, 5.307e-02, 5.573e-03, 6.483e-02, 1.104e-01, -5.707e-02, -8.579e-02, -1.754e-02, 1.038e-01, 1.706e-02, -1.185e-01, 5.863e-02), r);
r = MulAdd(s3_1, M4(-1.639e-01, -6.808e-03, 1.836e-02, -1.482e-01, 1.032e-01, 2.612e-02, -1.751e-01, -1.527e-01, 3.169e-03, 5.272e-02, 7.983e-02, 5.066e-02, 1.191e-01, 3.658e-02, 3.275e-02, -1.122e-01), r);
r = MulAdd(s3_2, M4(-8.279e-02, -1.068e-02, 3.848e-02, -8.857e-03, -3.783e-02, 9.934e-02, -7.181e-02, 2.801e-02, -1.524e-01, -7.166e-02, 1.038e-01, -9.840e-04, -7.254e-03, -3.252e-02, -1.435e-02, 6.052e-03), r);
r = MulAdd(s3_3, M4(-3.534e-02, -2.891e-02, 3.778e-01, -2.472e-01, -4.015e-02, -5.651e-02, 2.006e-01, 1.249e-02, -8.408e-02, -1.160e-02, 2.881e-01, -6.805e-03, 1.340e-02, -1.237e-01, -1.617e-01, 1.894e-02), r);
r = MulAdd(s3_4, M4(-1.512e-02, 3.232e-01, -1.441e-01, -3.778e-01, -1.475e-01, -2.644e-03, -3.149e-01, 3.225e-02, 1.227e-01, -3.620e-02, -1.175e-01, -3.857e-01, 4.834e-02, -1.567e-01, 1.632e-01, -1.292e-01), r);
r = MulAdd(s3_5, M4(-1.592e-01, 3.426e-02, -1.506e-01, 1.215e-01, 1.314e-01, -7.432e-02, -8.767e-02, 1.685e-01, 6.875e-02, 2.804e-01, -3.279e-02, -1.870e-01, 1.049e-01, -9.061e-02, 8.573e-02, -9.407e-02), r);
r = MulAdd(s3_6, M4(5.310e-02, -1.089e-01, -1.496e-01, 2.134e-01, 5.599e-02, -1.565e-01, -6.842e-02, -1.362e-02, 6.861e-02, -2.548e-02, -1.614e-01, -3.698e-02, -2.731e-02, 1.138e-02, 1.288e-02, -1.789e-02), r);
r = MulAdd(s3_7, M4(-7.967e-02, -2.461e-01, -2.139e-01, 3.193e-01, 1.377e-01, -1.213e-01, 8.415e-02, 1.224e-02, 1.192e-01, 1.785e-01, 1.978e-01, 1.008e-01, 3.016e-02, 9.868e-02, 3.118e-03, -3.294e-02), r);
r = MulAdd(s3_8, M4(1.121e-01, -4.625e-02, 3.331e-02, -7.687e-02, 5.520e-02, 6.326e-02, 1.369e-02, 1.850e-02, 4.062e-02, -1.561e-01, -8.640e-02, 1.105e-01, 8.446e-03, -1.746e-03, 4.572e-02, -1.015e-01), r);
return r;
}
// conv3, second output vector: same inputs and structure as f0 of this pass but with a
// different set of constants; Pass4 stores the result to t3.
// Parameter layout as passed by Pass4:
//   s0_k = max(l0 sample k, 0)      s1_k = -max(-(l0 sample k), 0)   (l0 reads t0)
//   s2_k = max(l1 sample k, 0)      s3_k = -max(-(l1 sample k), 0)   (l1 reads t1)
//   k = 0..8 walks the nine offsets row by row, from (-1, -1) to (1, 1).
// NOTE(review): V4, M4 and MulAdd are defined outside this chunk; presumably
// MulAdd(v, m, a) evaluates mul(v, m) + a -- confirm.
// The statement order is the floating-point accumulation order; do not reorder.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator.
V4 r = { -4.712e-03, -1.187e-02, 1.287e-02, -6.625e-03 };
// One MulAdd per input vector, each with its own 16-constant M4.
r = MulAdd(s0_0, M4(2.399e-01, 1.190e-01, 9.941e-02, -5.908e-03, 2.176e-01, -3.861e-02, -4.997e-02, -3.036e-02, -6.079e-02, 2.294e-02, -1.260e-01, 6.001e-02, -7.690e-02, -4.805e-02, 6.117e-03, 4.358e-02), r);
r = MulAdd(s0_1, M4(-4.669e-02, -1.150e-01, 9.700e-03, 2.351e-02, 3.215e-01, -1.737e-03, 2.091e-01, -1.245e-01, -8.592e-02, 1.866e-01, 2.826e-01, -6.728e-01, 1.528e-01, 5.511e-02, -4.930e-02, -1.959e-02), r);
r = MulAdd(s0_2, M4(-2.182e-02, -4.512e-02, 6.864e-02, 8.299e-02, 8.483e-03, -4.855e-02, -1.500e-01, 1.325e-02, 6.098e-02, -1.867e-02, 1.276e-02, 1.721e-02, 5.918e-03, -1.130e-01, 7.066e-04, -1.824e-03), r);
r = MulAdd(s0_3, M4(4.594e-02, 1.518e-01, -2.067e-01, 1.546e-02, -1.548e-02, 1.126e-01, -4.502e-03, -2.014e-02, 2.417e-01, -1.530e-01, -1.095e-01, -4.966e-02, 2.291e-01, -4.598e-03, 2.836e-01, 5.562e-02), r);
r = MulAdd(s0_4, M4(5.432e-02, -3.003e-01, 7.389e-01, -1.497e-01, -2.439e-01, -3.298e-01, 4.081e-01, -2.105e-01, -4.267e-01, 3.913e-01, 5.470e-01, 5.594e-01, -1.221e-01, -5.444e-02, -4.180e-01, 1.515e-01), r);
r = MulAdd(s0_5, M4(2.205e-01, -5.813e-03, 7.451e-03, 8.130e-02, -3.312e-02, -9.387e-02, -9.824e-02, 4.493e-02, 8.187e-02, -2.042e-01, 1.644e-01, 1.562e-01, -8.427e-02, 2.057e-01, -1.668e-02, -2.356e-01), r);
r = MulAdd(s0_6, M4(1.150e-02, 1.442e-02, -1.973e-02, -4.599e-02, -9.680e-02, 3.962e-02, 1.731e-02, -2.402e-02, 3.936e-02, 6.512e-03, 2.103e-02, 2.025e-03, -1.308e-02, -5.259e-02, 5.631e-02, 3.037e-02), r);
r = MulAdd(s0_7, M4(-1.306e-02, -3.164e-02, 1.196e-01, 2.798e-02, -2.533e-01, -1.204e-01, 1.860e-01, 1.564e-01, -4.731e-02, -7.323e-02, 1.441e-03, -9.049e-02, -3.371e-02, -2.801e-04, 2.952e-02, -2.632e-02), r);
r = MulAdd(s0_8, M4(3.024e-02, -1.034e-02, -7.595e-02, -7.550e-02, 3.562e-02, -4.589e-02, -3.066e-02, 7.995e-02, -1.866e-02, 1.022e-01, -2.624e-02, -1.074e-01, 2.176e-02, 1.434e-01, -5.664e-02, -3.473e-02), r);
r = MulAdd(s1_0, M4(2.252e-01, 9.801e-02, -5.786e-02, -6.661e-02, 7.599e-02, -9.244e-02, 4.437e-02, -1.203e-01, -1.577e-01, -3.797e-02, -1.335e-02, 4.540e-02, -3.540e-03, -9.094e-03, -4.076e-02, -8.099e-02), r);
r = MulAdd(s1_1, M4(2.557e-01, -2.549e-01, 2.306e-01, -4.389e-02, -3.677e-02, 5.796e-02, 4.505e-02, -1.209e-01, -4.484e-02, 1.229e-01, -5.686e-02, 2.778e-02, 9.876e-02, -6.893e-04, 9.771e-02, 1.264e-01), r);
r = MulAdd(s1_2, M4(-5.324e-02, -9.632e-02, -1.092e-02, -1.426e-02, 3.082e-02, 9.196e-02, -1.381e-01, -1.013e-01, 7.758e-03, -3.290e-02, 1.630e-02, -4.979e-03, -7.297e-02, -7.534e-02, 2.040e-02, -1.983e-01), r);
r = MulAdd(s1_3, M4(1.951e-01, 3.566e-02, 4.220e-02, 8.086e-02, -5.114e-02, -5.626e-02, -6.912e-02, 1.462e-01, 2.268e-03, -2.592e-02, 3.527e-02, -3.832e-02, 4.756e-02, 1.234e-01, -5.494e-03, 4.695e-02), r);
r = MulAdd(s1_4, M4(4.147e-01, -2.431e-01, 2.372e-01, 2.574e-04, -4.485e-02, 5.014e-02, 3.928e-02, -2.817e-02, 3.512e-01, 2.983e-01, -1.260e-01, 4.326e-01, -2.366e-01, -6.912e-02, 2.259e-01, -4.534e-01), r);
r = MulAdd(s1_5, M4(1.323e-01, 5.260e-03, 2.693e-02, 1.841e-01, -1.105e-01, 6.002e-02, -1.233e-01, 1.012e-02, -9.410e-02, -1.260e-01, 1.264e-02, -3.910e-02, 3.656e-01, -1.103e-01, 5.059e-01, 4.280e-01), r);
r = MulAdd(s1_6, M4(-7.537e-02, -2.153e-02, -4.511e-02, -5.184e-02, -1.745e-02, -1.165e-02, 1.352e-02, -1.951e-02, -4.888e-02, 2.249e-02, -3.915e-02, -4.557e-03, -9.946e-03, -1.633e-04, -3.200e-02, -1.356e-02), r);
r = MulAdd(s1_7, M4(-1.509e-01, -2.227e-02, 1.640e-01, 2.693e-02, 4.846e-02, 3.303e-02, -5.390e-02, 3.607e-02, -2.818e-02, -7.170e-02, 3.311e-02, -9.203e-02, -1.946e-03, -8.577e-02, -2.925e-02, 1.238e-01), r);
r = MulAdd(s1_8, M4(-3.295e-02, 1.995e-02, -1.689e-01, -4.353e-02, -4.138e-02, -7.439e-03, -2.343e-02, 6.997e-02, 8.031e-02, 1.117e-01, 4.894e-02, -6.214e-02, -1.960e-01, -1.630e-01, 8.586e-02, -8.213e-02), r);
r = MulAdd(s2_0, M4(-9.883e-02, -1.168e-02, -1.110e-01, -2.148e-01, 1.452e-01, 3.417e-03, -4.513e-02, 8.845e-02, -7.791e-02, 2.326e-02, -4.188e-02, -3.659e-02, 3.105e-02, -1.318e-02, -4.552e-03, 7.109e-02), r);
r = MulAdd(s2_1, M4(1.958e-02, -6.995e-02, 2.588e-01, -6.431e-02, -2.211e-01, 5.281e-02, 5.399e-02, 8.884e-02, -5.135e-02, -4.768e-02, 1.363e-01, -2.064e-01, -1.391e-01, 1.106e-01, -2.611e-01, 2.038e-01), r);
r = MulAdd(s2_2, M4(-6.883e-02, -1.360e-03, -1.628e-01, 7.301e-02, 1.213e-01, -5.159e-03, 1.194e-01, -1.148e-02, -1.285e-01, -1.448e-01, 1.776e-02, -1.414e-01, -3.022e-02, 1.382e-01, 6.695e-02, -4.201e-02), r);
r = MulAdd(s2_3, M4(-1.194e-01, 1.524e-03, -1.945e-01, -1.496e-01, 1.413e-03, -8.697e-04, -1.542e-01, -1.798e-03, -4.991e-02, -7.944e-03, -1.094e-01, -5.578e-02, 1.526e-01, -6.170e-02, 1.598e-01, 1.306e-01), r);
r = MulAdd(s2_4, M4(3.583e-02, -1.213e-01, 2.087e-01, -4.616e-02, 2.125e-01, -1.242e-01, 2.776e-01, -8.100e-02, -1.733e-01, 1.016e-01, 2.949e-01, 1.489e-01, 5.059e-01, 3.526e-01, -4.764e-01, -1.105e-02), r);
r = MulAdd(s2_5, M4(7.240e-02, 1.034e-01, -1.103e-01, 2.351e-02, -2.711e-02, 1.506e-02, -1.534e-01, 1.093e-01, 5.065e-02, -2.686e-01, 1.423e-01, -4.993e-02, 7.167e-02, 1.084e-01, -8.139e-03, 4.460e-02), r);
r = MulAdd(s2_6, M4(1.243e-01, 1.281e-02, 7.048e-02, 1.117e-01, -1.145e-01, -1.703e-02, -1.470e-02, -3.647e-02, 3.796e-03, 2.441e-02, -8.422e-02, 1.955e-02, -2.861e-02, -6.963e-02, 6.894e-02, -4.071e-02), r);
r = MulAdd(s2_7, M4(2.315e-01, 7.446e-02, -7.632e-02, 1.319e-01, -2.392e-02, 2.525e-02, 4.687e-02, 7.645e-02, 4.250e-02, -4.733e-02, 2.179e-01, -3.843e-02, -3.526e-01, 9.675e-02, -1.837e-01, -1.563e-01), r);
r = MulAdd(s2_8, M4(5.933e-02, 1.490e-01, -5.844e-02, 9.363e-02, 7.616e-04, -1.075e-02, -1.365e-01, -6.094e-02, 7.094e-03, -1.218e-01, 7.021e-02, 3.101e-02, -4.184e-02, 3.989e-02, -7.167e-02, -1.179e-01), r);
r = MulAdd(s3_0, M4(-7.835e-02, 6.392e-02, -5.802e-02, -1.483e-01, 1.374e-01, 3.699e-02, 2.043e-03, 1.554e-01, -6.873e-02, -1.174e-02, -1.518e-01, -1.405e-02, 4.783e-03, -1.131e-01, 4.121e-02, -8.849e-02), r);
r = MulAdd(s3_1, M4(-1.463e-01, 5.240e-02, -1.651e-02, -2.410e-01, 1.092e-01, -3.146e-02, -1.629e-02, -2.974e-02, -7.838e-02, -7.374e-03, 2.745e-01, -1.408e-01, 1.335e-01, 8.634e-02, 1.073e-02, -1.407e-02), r);
r = MulAdd(s3_2, M4(-7.340e-02, 2.321e-02, 1.922e-02, -1.112e-01, 2.932e-02, -2.587e-02, 1.333e-01, 4.721e-02, -1.514e-01, -3.395e-02, -1.264e-01, 1.777e-02, -8.692e-02, 1.186e-02, -7.424e-02, -2.402e-02), r);
r = MulAdd(s3_3, M4(5.052e-02, 2.790e-03, 3.121e-02, -1.839e-01, 3.910e-02, 2.279e-02, 6.041e-02, -8.205e-03, -5.819e-02, 5.701e-04, 5.763e-02, -1.835e-02, -7.273e-02, -1.017e-01, -4.708e-02, 3.331e-02), r);
r = MulAdd(s3_4, M4(-4.521e-02, -3.700e-02, -1.199e-01, -3.863e-01, -4.641e-01, -2.451e-01, 1.512e-03, -3.424e-01, -1.194e-01, 1.119e-01, -1.183e-01, 1.918e-01, 8.865e-02, 1.866e-01, -4.503e-02, 2.355e-03), r);
r = MulAdd(s3_5, M4(5.461e-02, -1.461e-01, 2.827e-01, 2.041e-01, -8.786e-03, 1.079e-02, 1.593e-01, 2.173e-01, 4.916e-01, -1.773e-01, 2.149e-02, -1.461e-01, -5.435e-02, 1.909e-01, -2.171e-01, -7.547e-02), r);
r = MulAdd(s3_6, M4(2.543e-02, 5.455e-02, -6.107e-02, 5.194e-03, 9.984e-02, 8.664e-02, -7.757e-04, 3.957e-02, 1.432e-01, 3.805e-02, -1.005e-03, 7.600e-02, -4.304e-02, -6.326e-02, 3.996e-02, 3.872e-03), r);
r = MulAdd(s3_7, M4(-1.234e-01, -1.276e-01, 1.312e-01, 8.454e-02, 1.539e-01, 6.822e-02, -1.455e-02, 1.223e-01, -1.060e-01, -3.708e-02, -1.480e-01, -7.922e-02, 6.503e-02, 1.105e-01, -1.249e-01, -3.210e-02), r);
r = MulAdd(s3_8, M4(1.676e-01, -7.072e-03, 4.581e-02, -1.006e-01, -1.056e-02, -8.209e-02, -4.804e-02, 2.427e-02, -1.165e-01, -8.224e-02, 2.940e-01, -9.220e-03, 5.420e-02, 1.802e-01, -1.190e-01, 1.433e-02), r);
return r;
}
// Pass 4 entry point: each thread produces one texel of t2 and one texel of t3.
//   blockStart - origin of the block being processed; it is added to the in-block
//                offset returned by Rmp8x8(tid.x) to get this thread's texel.
//   tid        - thread id; only tid.x is read.
// For both sources (the l0 and l1 macros, defined above this pass) the 3x3 window
// around the texel is read and every tap is split into max(x, 0) and -max(-x, 0).
// The 36 resulting vectors are passed to f0 and f1.
void Pass4(uint2 blockStart, uint3 tid) {
	// pt, gxy, size and pos keep these exact names: the sampling macros are defined
	// outside this function and may refer to locals by name.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// ---- source l0: s0_i = max(tap, 0), s1_i = -max(-tap, 0) ----
	// row y = -1
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);
	// row y = 0
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);
	// row y = +1
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	// ---- source l1: s2_i = max(tap, 0), s3_i = -max(-tap, 0) ----
	// row y = -1
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	s2_0 = max(s2_0, 0.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	s2_1 = max(s2_1, 0.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	s2_2 = max(s2_2, 0.0);
	// row y = 0
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	s2_3 = max(s2_3, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	s2_4 = max(s2_4, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	s2_5 = max(s2_5, 0.0);
	// row y = +1
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	s2_6 = max(s2_6, 0.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	s2_7 = max(s2_7, 0.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both outputs are computed from the same 36 inputs.
	t2[gxy] = f0(
		s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
		s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
		s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
		s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[gxy] = f1(
		s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
		s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
		s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
		s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Pass 5 input taps: l0 reads texture t2 and l1 reads texture t3 through the O
// macro at offset (x, y), converting the result to V4.
// NOTE(review): O is defined outside this section; presumably it samples relative
// to the current texel - confirm against its definition.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// Pass 5 ("conv4"), first of two output vectors; Pass5 stores the result in t0.
// Starts from a constant 4-component value, adds one MulAdd term per input vector
// (each with its own constant 4x4 matrix) and returns the accumulated value.
// No activation is applied here.
// Inputs as passed by Pass5 (index i = 0..8 walks the 3x3 window row by row, from
// offset (-1,-1) to (1,1)):
//   s0_i = max(x, 0) of the l0 taps     s1_i = -max(-x, 0) of the l0 taps
//   s2_i = max(x, 0) of the l1 taps     s3_i = -max(-x, 0) of the l1 taps
// NOTE(review): MulAdd is declared outside this section; it is assumed to compute
// mul(s, M) + r - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator (presumably the layer bias).
V4 r = { -5.270e-03, 1.390e-02, 8.622e-03, 1.255e-02 };
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(1.060e-02, 9.173e-03, 9.548e-04, -7.886e-02, -1.324e-02, 4.660e-02, -4.997e-02, -5.676e-02, -3.290e-02, 6.253e-02, -5.777e-02, 1.265e-02, 6.136e-03, 7.179e-02, 3.102e-02, 4.961e-02), r);
r = MulAdd(s0_1, M4(5.787e-03, -2.090e-03, -1.489e-01, 4.380e-02, 1.259e-01, 5.508e-01, 1.211e-01, 3.385e-01, 2.399e-02, -1.436e-01, 2.987e-03, -2.839e-02, -3.021e-02, -8.641e-03, 1.716e-01, -1.328e-02), r);
r = MulAdd(s0_2, M4(-3.284e-02, -5.196e-02, -2.983e-02, -2.858e-02, 1.729e-02, 7.665e-02, 1.387e-01, 1.037e-01, 4.289e-02, 1.274e-01, 3.348e-02, 1.911e-02, -1.786e-02, -4.888e-02, 6.323e-02, -2.989e-02), r);
r = MulAdd(s0_3, M4(2.473e-02, -6.550e-02, -1.373e-01, 3.680e-02, 1.575e-01, -8.270e-02, 3.186e-02, -3.836e-02, 4.508e-02, 4.254e-02, 5.656e-03, -9.132e-02, 1.334e-01, -5.076e-02, -2.445e-02, -4.735e-02), r);
r = MulAdd(s0_4, M4(-5.346e-01, 1.950e-01, 2.121e-01, -3.694e-01, 5.004e-02, 1.610e-02, 2.249e-01, -5.962e-02, -6.243e-02, -3.270e-01, 1.851e-01, 4.051e-02, -2.310e-01, -2.300e-01, -1.314e-01, 3.374e-01), r);
r = MulAdd(s0_5, M4(-4.686e-02, -3.968e-01, 2.772e-02, 2.495e-02, 4.541e-02, 8.724e-02, 4.401e-02, -1.515e-02, -6.453e-02, -7.210e-02, -1.250e-02, 4.044e-02, 3.057e-02, 2.485e-01, 2.228e-02, 6.774e-02), r);
r = MulAdd(s0_6, M4(-1.518e-01, -6.862e-02, -8.148e-02, -2.030e-01, -4.453e-02, -2.133e-03, -6.081e-02, -8.941e-02, -5.417e-02, 1.564e-02, -5.425e-02, 5.875e-02, -8.805e-02, -1.910e-02, 2.099e-02, -1.402e-02), r);
r = MulAdd(s0_7, M4(-1.730e-02, -6.152e-02, -2.764e-01, -8.728e-02, 9.519e-03, -2.799e-02, -5.662e-02, 3.249e-02, 8.716e-02, 2.809e-02, -7.241e-02, 3.046e-02, 1.368e-01, 2.723e-02, 1.130e-01, -4.615e-02), r);
r = MulAdd(s0_8, M4(-5.021e-02, -3.352e-02, 5.072e-02, -1.434e-02, 6.511e-02, 6.519e-02, -8.987e-02, 2.193e-02, 1.583e-04, 2.714e-02, -2.315e-02, -3.077e-02, 7.792e-03, 2.782e-02, 9.282e-02, 5.011e-02), r);
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(-2.541e-02, -9.530e-03, -2.089e-01, -2.421e-02, 1.340e-02, 1.228e-01, 8.861e-02, -1.063e-02, -7.461e-02, 5.226e-02, -7.276e-02, 3.544e-02, -1.591e-02, 1.851e-02, 9.562e-03, 4.559e-02), r);
r = MulAdd(s1_1, M4(2.747e-02, -7.982e-02, -1.475e-01, 4.885e-02, -1.175e-02, -9.209e-02, -9.273e-02, -7.428e-02, 3.696e-02, -2.012e-01, 4.627e-02, 3.609e-02, 1.096e-01, -5.087e-02, 2.170e-01, 5.311e-02), r);
r = MulAdd(s1_2, M4(2.410e-02, 6.970e-02, 2.315e-02, 2.908e-02, 2.961e-05, 1.661e-02, 8.374e-02, 5.064e-02, 2.637e-02, 1.330e-01, 5.175e-02, -5.518e-02, -4.871e-03, 1.162e-01, 8.451e-02, 1.741e-02), r);
r = MulAdd(s1_3, M4(4.863e-02, -7.095e-02, 3.927e-03, -9.085e-02, 2.639e-02, -8.297e-02, -1.865e-01, -9.647e-02, 6.967e-02, 1.376e-02, 1.222e-01, -2.819e-01, 1.563e-01, -1.399e-02, -4.367e-02, -5.187e-02), r);
r = MulAdd(s1_4, M4(9.322e-02, 9.848e-02, 1.680e-01, -2.298e-01, -6.183e-02, -4.167e-02, -1.103e-02, -9.856e-03, -2.983e-03, -3.805e-01, -3.115e-01, -4.107e-01, -1.341e-01, -3.703e-01, -3.661e-01, -4.633e-01), r);
r = MulAdd(s1_5, M4(-2.785e-03, -2.188e-02, -2.790e-03, -4.276e-04, 7.082e-02, 1.004e-01, -3.532e-03, 1.740e-03, 6.693e-03, -5.230e-01, 2.119e-01, 2.878e-02, 3.915e-03, 1.842e-01, -1.630e-02, -3.874e-02), r);
r = MulAdd(s1_6, M4(2.313e-02, -6.545e-02, 1.631e-02, -1.278e-01, -4.216e-02, -4.147e-02, 6.827e-02, -1.725e-02, -5.254e-02, -3.942e-02, -2.400e-02, -8.124e-02, -3.250e-02, -1.806e-03, -3.947e-02, -7.056e-02), r);
r = MulAdd(s1_7, M4(8.445e-03, 1.147e-01, -7.772e-02, 1.091e-01, 1.842e-02, -6.040e-03, -7.053e-02, 1.824e-02, 2.212e-01, -8.777e-02, -1.003e-01, 6.533e-03, 2.090e-01, 4.588e-02, 9.886e-02, 6.176e-02), r);
r = MulAdd(s1_8, M4(4.046e-02, 1.872e-02, -5.723e-02, -4.997e-02, 5.232e-03, 1.795e-02, -2.747e-02, -1.507e-02, -1.704e-01, 7.849e-02, -1.475e-01, -4.255e-02, 7.807e-02, 4.185e-02, 3.849e-02, 3.137e-02), r);
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(-8.062e-03, 6.677e-02, 6.217e-02, 1.833e-01, -1.475e-01, 2.782e-01, 3.524e-02, -6.275e-02, 4.315e-02, 1.484e-02, 3.820e-02, -3.304e-02, 1.659e-03, -9.567e-03, -3.360e-02, -2.623e-02), r);
r = MulAdd(s2_1, M4(9.928e-02, -2.526e-01, -2.613e-02, 2.043e-01, 1.710e-02, -1.137e-01, 1.798e-01, -1.427e-01, 4.676e-03, 1.728e-01, 8.082e-02, -5.413e-02, -1.710e-02, -3.169e-02, -6.860e-02, 1.496e-02), r);
r = MulAdd(s2_2, M4(1.785e-02, 1.092e-01, -7.685e-02, 7.691e-02, 5.271e-03, -5.168e-02, 3.395e-02, 1.726e-02, 2.936e-02, -1.321e-02, 5.364e-02, -6.785e-03, 2.429e-02, -4.442e-02, -6.348e-02, 3.035e-02), r);
r = MulAdd(s2_3, M4(2.676e-01, 4.022e-03, -5.435e-02, -2.723e-01, -1.412e-01, -6.091e-01, 1.576e-02, 6.829e-02, -1.410e-01, 5.578e-03, 3.833e-03, 1.863e-01, -2.274e-02, 6.034e-03, 1.518e-01, -5.434e-02), r);
r = MulAdd(s2_4, M4(7.884e-02, 5.377e-01, -4.655e-02, -3.752e-01, 1.490e-01, -4.235e-02, -5.390e-02, 2.610e-01, 1.979e-01, -5.718e-02, 1.773e-02, 5.727e-02, 1.703e-02, 7.533e-01, -3.023e-02, 5.456e-02), r);
r = MulAdd(s2_5, M4(-4.898e-02, 4.237e-02, 6.311e-02, -4.635e-02, 3.660e-03, 2.139e-01, -3.722e-02, -6.738e-02, -3.009e-02, -6.140e-02, 2.777e-02, 3.917e-02, -1.421e-01, -4.041e-01, -1.524e-01, -9.837e-02), r);
r = MulAdd(s2_6, M4(6.071e-02, 1.084e-01, -6.370e-02, 1.323e-01, -7.251e-02, -1.079e-01, 1.208e-01, -4.495e-02, -2.115e-03, -4.107e-02, 2.465e-02, -1.230e-01, -6.064e-02, -4.263e-02, -1.388e-01, 6.519e-02), r);
r = MulAdd(s2_7, M4(9.042e-02, -8.032e-02, 1.186e-01, -1.537e-02, -6.566e-03, -3.216e-02, 3.412e-02, -3.207e-02, -1.586e-01, -2.988e-03, -2.358e-03, 2.172e-02, 6.775e-02, -3.590e-01, -4.123e-01, -3.506e-01), r);
r = MulAdd(s2_8, M4(8.486e-02, 4.731e-02, 5.779e-02, 1.000e-01, 9.121e-03, -3.421e-02, 4.891e-02, 4.916e-02, 3.343e-03, 4.437e-03, -2.002e-02, -3.856e-02, -1.319e-01, -4.022e-02, -1.752e-01, -9.250e-02), r);
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(-6.652e-03, 6.416e-02, 9.292e-03, 6.520e-02, 1.213e-02, 4.177e-02, 7.038e-02, -3.160e-02, 2.146e-02, -9.523e-02, -1.436e-01, -8.325e-02, -1.234e-02, -1.222e-02, -3.877e-02, -4.175e-02), r);
r = MulAdd(s3_1, M4(-5.171e-02, 1.011e-01, 7.998e-02, -8.804e-02, 1.067e-02, 1.516e-01, 6.508e-02, -7.724e-02, 2.717e-02, -4.901e-02, -6.059e-03, 4.013e-02, -3.833e-02, 1.538e-01, 5.948e-02, -4.945e-02), r);
r = MulAdd(s3_2, M4(1.023e-02, -1.230e-01, -1.861e-02, -2.570e-02, 2.512e-02, -4.630e-02, 6.354e-02, 3.897e-02, -2.146e-02, 2.446e-01, 1.906e-03, -9.068e-03, 1.754e-02, -7.082e-02, 1.107e-04, -9.604e-03), r);
r = MulAdd(s3_3, M4(7.380e-02, 2.216e-02, -2.608e-02, -6.491e-02, 4.018e-02, -6.657e-02, 1.116e-01, 9.405e-02, -7.168e-02, -3.646e-01, 8.387e-02, 1.352e-02, -4.589e-02, 2.235e-02, 1.881e-01, 1.759e-01), r);
r = MulAdd(s3_4, M4(-7.735e-02, -8.574e-02, -6.380e-02, 1.221e-01, 5.556e-02, -1.281e-01, 1.461e-01, 2.757e-01, 8.144e-01, -1.075e-01, 3.165e-03, -2.036e-01, 1.814e-01, 1.744e-01, -1.745e-01, 3.724e-02), r);
r = MulAdd(s3_5, M4(-6.864e-02, 1.273e-02, 7.502e-02, 4.164e-02, 1.301e-02, 1.407e-01, -9.985e-02, -8.079e-02, 1.428e-01, 3.034e-01, -1.564e-02, 6.091e-02, -1.271e-02, -2.153e-01, -7.843e-02, -4.063e-02), r);
r = MulAdd(s3_6, M4(-5.115e-02, 6.016e-02, -2.719e-02, 4.668e-02, -3.214e-02, -2.274e-02, -6.954e-03, -9.099e-03, 4.861e-02, 1.007e-01, -2.150e-01, -1.607e-01, -3.578e-02, 1.230e-02, -5.095e-02, 1.622e-02), r);
r = MulAdd(s3_7, M4(9.498e-02, -6.763e-02, 1.451e-01, 3.408e-03, -3.253e-02, 1.145e-01, 8.122e-03, -9.192e-02, 5.071e-02, -6.317e-02, 1.097e-01, 5.913e-02, 8.494e-02, 2.731e-04, -3.736e-01, -6.110e-03), r);
r = MulAdd(s3_8, M4(1.881e-02, 1.750e-02, 5.956e-02, 4.179e-02, -4.554e-02, -9.824e-02, 8.917e-03, 3.348e-02, 4.160e-02, 6.525e-02, 1.484e-02, -2.331e-02, -8.092e-02, -2.834e-02, -1.284e-01, -7.521e-02), r);
return r;
}
// Pass 5 ("conv4"), second of two output vectors; Pass5 stores the result in t1.
// Same structure as the f0 of this pass, with a different starting value and
// different 4x4 matrices: one MulAdd term per input vector, accumulated into r
// and returned without an activation.
// Inputs as passed by Pass5 (index i = 0..8 walks the 3x3 window row by row, from
// offset (-1,-1) to (1,1)):
//   s0_i = max(x, 0) of the l0 taps     s1_i = -max(-x, 0) of the l0 taps
//   s2_i = max(x, 0) of the l1 taps     s3_i = -max(-x, 0) of the l1 taps
// NOTE(review): MulAdd is declared outside this section; it is assumed to compute
// mul(s, M) + r - confirm against its definition.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator (presumably the layer bias).
V4 r = { -7.279e-03, 1.016e-02, -7.400e-03, 4.979e-03 };
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(-4.868e-02, -7.333e-02, -1.029e-02, -7.011e-04, 2.404e-02, -9.301e-02, 1.457e-01, 2.242e-02, 6.850e-02, -1.328e-03, -2.557e-02, -4.854e-04, 1.071e-01, 3.788e-04, 1.408e-01, 5.354e-03), r);
r = MulAdd(s0_1, M4(7.892e-03, 5.832e-02, -1.077e-01, -6.140e-02, -1.003e-02, -4.887e-01, 8.263e-01, 2.416e-01, -3.434e-02, 1.089e-02, -1.984e-02, -3.615e-02, -5.692e-03, 1.615e-02, -5.680e-02, 2.041e-02), r);
r = MulAdd(s0_2, M4(2.136e-02, -2.731e-02, -1.742e-02, 2.592e-02, -4.319e-02, 6.426e-03, 2.110e-02, -6.338e-02, -6.921e-03, -1.288e-03, 4.579e-02, -2.155e-03, 3.041e-02, 1.946e-02, 1.238e-02, 6.906e-02), r);
r = MulAdd(s0_3, M4(-2.058e-02, 3.187e-02, -1.057e-01, 2.407e-01, -3.813e-02, -2.640e-02, 3.941e-02, 1.362e-01, 2.406e-02, 1.518e-02, -4.224e-02, 3.455e-02, 5.443e-02, -6.617e-02, -8.858e-02, -1.949e-02), r);
r = MulAdd(s0_4, M4(3.205e-01, -6.490e-01, -3.962e-01, -1.142e-01, -3.091e-02, 4.755e-01, -2.822e-01, -1.328e-01, -5.487e-01, 5.932e-02, -2.439e-02, -1.689e-01, -3.681e-02, -8.227e-02, 3.967e-02, -8.989e-02), r);
r = MulAdd(s0_5, M4(-5.668e-02, 3.658e-02, 1.227e-02, 8.117e-02, 1.161e-01, 9.350e-02, 9.971e-02, -1.220e-01, 7.876e-02, 5.186e-02, -4.261e-02, 1.436e-01, -2.114e-02, 6.113e-02, 2.251e-02, 2.534e-02), r);
r = MulAdd(s0_6, M4(-5.365e-02, -2.678e-02, -2.565e-02, 7.923e-02, -2.138e-02, -4.932e-02, -6.107e-03, 1.685e-02, 5.425e-02, -1.012e-02, -9.037e-03, 8.218e-04, -1.210e-02, 5.623e-02, -2.094e-02, -2.325e-02), r);
r = MulAdd(s0_7, M4(2.031e-02, -3.187e-02, 8.229e-02, 1.457e-01, 1.044e-01, -4.475e-02, 2.858e-02, -7.345e-02, -3.919e-02, -5.753e-02, 1.684e-02, -1.669e-01, 9.680e-03, 1.254e-01, 2.022e-03, -9.900e-02), r);
r = MulAdd(s0_8, M4(-1.164e-02, 5.171e-02, -5.704e-02, -1.643e-01, 2.554e-02, -9.988e-02, 3.699e-02, -3.752e-02, -8.076e-04, -2.527e-02, -2.081e-02, 3.110e-02, 1.484e-03, 4.064e-02, 2.481e-02, 2.225e-01), r);
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(4.384e-02, -1.401e-01, -4.071e-02, -1.137e-02, -4.979e-03, 6.159e-02, 1.275e-01, 6.544e-02, 1.288e-01, -4.421e-02, -4.471e-02, 2.682e-02, 5.621e-02, -4.062e-02, 1.034e-01, 6.606e-02), r);
r = MulAdd(s1_1, M4(2.799e-02, -1.333e-01, 1.521e-01, -5.025e-02, -1.895e-01, -7.913e-02, -2.321e-01, -6.526e-02, -1.330e-02, 1.499e-02, 1.620e-01, 6.936e-02, -6.816e-02, 1.353e-01, 1.107e-01, 5.514e-02), r);
r = MulAdd(s1_2, M4(5.826e-03, -5.941e-03, -2.338e-02, -2.826e-02, 9.265e-02, 3.608e-02, 1.114e-01, 1.274e-01, -1.291e-01, 1.284e-02, -7.540e-02, -3.458e-02, -1.006e-02, 2.083e-02, -8.393e-02, 8.186e-02), r);
r = MulAdd(s1_3, M4(-3.893e-02, -1.137e-02, 1.243e-01, 1.118e-01, 7.397e-02, -1.316e-01, -1.303e-01, -7.808e-05, 1.468e-02, -4.172e-03, -5.014e-02, -2.610e-02, 8.366e-02, -2.755e-02, 1.646e-04, -2.938e-02), r);
r = MulAdd(s1_4, M4(-1.114e-01, -2.017e-01, 2.898e-03, -7.984e-02, -8.403e-02, 2.626e-02, 1.563e-01, -1.397e-02, 2.724e-02, -6.698e-01, 2.358e-01, -6.466e-01, -9.650e-03, -6.742e-01, 1.411e-01, -3.343e-01), r);
r = MulAdd(s1_5, M4(7.380e-02, 6.420e-02, 7.990e-02, 6.014e-02, 5.950e-02, 6.212e-02, -7.881e-02, -2.782e-02, 1.087e-01, -3.347e-02, 3.819e-01, 1.988e-01, 5.813e-02, 2.239e-02, 3.012e-01, 1.275e-01), r);
r = MulAdd(s1_6, M4(-9.473e-02, -2.417e-02, -2.870e-02, 7.718e-02, -2.223e-02, 2.306e-02, 8.255e-03, -1.818e-02, -2.983e-02, -3.495e-02, 1.540e-02, 8.013e-02, 1.651e-02, -1.298e-02, 2.377e-02, 5.523e-02), r);
r = MulAdd(s1_7, M4(1.414e-01, 1.346e-01, 4.336e-03, -7.594e-02, -2.044e-02, -9.596e-03, -1.087e-03, 5.324e-02, -2.041e-02, -6.328e-02, 7.533e-02, -3.971e-01, 5.408e-04, 1.087e-01, 9.749e-03, -2.047e-01), r);
r = MulAdd(s1_8, M4(4.656e-02, -4.771e-02, -2.210e-02, -2.060e-02, -6.953e-03, -3.366e-02, -7.290e-03, -3.300e-02, -1.354e-01, -5.015e-02, -2.887e-02, 2.802e-01, 2.605e-02, -1.972e-02, 1.168e-03, 1.422e-01), r);
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(8.826e-02, -4.751e-02, 2.493e-01, 4.446e-02, 1.752e-01, 5.741e-03, -1.820e-01, 1.371e-02, -6.855e-02, 1.164e-02, -5.215e-02, -7.373e-04, -1.491e-02, 7.033e-03, -5.440e-02, -9.302e-05), r);
r = MulAdd(s2_1, M4(-6.871e-02, -9.419e-03, 3.276e-01, 2.826e-02, 5.675e-02, -3.974e-03, 1.104e-01, -2.975e-02, 3.281e-02, 8.429e-03, 1.129e-01, -4.830e-02, -4.374e-02, -6.905e-02, 8.143e-02, 3.180e-03), r);
r = MulAdd(s2_2, M4(-7.197e-02, -1.804e-02, -9.024e-02, -1.527e-03, 2.403e-02, 6.062e-02, 3.346e-02, 4.784e-02, -1.462e-02, 4.216e-02, 2.800e-02, 4.034e-04, -4.216e-02, -4.431e-03, -3.496e-02, -3.005e-02), r);
r = MulAdd(s2_3, M4(2.710e-02, -6.523e-02, 1.559e-01, -6.059e-02, 1.965e-01, -1.608e-01, -9.293e-03, -2.404e-01, -4.061e-02, 8.819e-02, 2.112e-02, 2.398e-01, -1.463e-01, 5.373e-02, -1.346e-02, 3.025e-02), r);
r = MulAdd(s2_4, M4(1.846e-02, 3.857e-01, -4.128e-01, -2.530e-01, -2.312e-01, 3.354e-02, -3.948e-01, -1.465e-01, 1.072e-01, -8.544e-02, -7.428e-02, 4.751e-02, 2.139e-01, 3.097e-01, -3.761e-01, 5.621e-02), r);
r = MulAdd(s2_5, M4(-1.203e-02, -9.598e-02, 4.101e-01, 1.578e-01, -5.394e-02, -6.714e-02, -6.320e-02, 8.249e-03, 5.620e-02, -3.219e-02, 9.398e-03, 6.809e-02, -7.400e-02, -1.431e-01, -1.425e-01, -2.358e-02), r);
r = MulAdd(s2_6, M4(4.035e-02, 5.655e-02, -3.307e-03, -3.497e-02, 6.522e-02, 1.103e-01, -9.802e-02, -2.655e-01, -5.802e-02, -4.359e-02, 3.459e-03, 1.592e-01, -2.566e-02, -1.156e-01, 2.646e-02, 3.806e-02), r);
r = MulAdd(s2_7, M4(-3.211e-02, -1.509e-01, 2.028e-03, -1.702e-01, 4.576e-02, -5.340e-02, 5.503e-02, 1.257e-02, -5.581e-02, 9.818e-02, -2.745e-02, 1.486e-01, 1.063e-01, -3.707e-01, 1.116e-01, -6.709e-02), r);
r = MulAdd(s2_8, M4(9.062e-04, -7.371e-03, 8.420e-02, 1.629e-01, -2.707e-02, -5.219e-03, 6.567e-02, 1.766e-01, 4.554e-04, 1.178e-02, -1.124e-02, 3.477e-02, -5.473e-02, -7.643e-02, 9.083e-03, -4.250e-02), r);
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(8.537e-02, 7.246e-02, 5.043e-02, 3.850e-02, -3.951e-02, 9.224e-03, 1.640e-02, 1.906e-02, -1.333e-01, -8.517e-02, -1.410e-01, 4.781e-02, -1.641e-02, 2.463e-03, -7.445e-02, -4.602e-02), r);
r = MulAdd(s3_1, M4(7.934e-02, 7.380e-03, -1.062e-01, -4.154e-03, -2.611e-02, -3.119e-02, 9.679e-02, -1.394e-02, -1.108e-01, 1.158e-02, 1.850e-01, -6.765e-02, 5.765e-02, 3.392e-02, -1.560e-02, -6.052e-02), r);
r = MulAdd(s3_2, M4(-3.056e-02, -8.450e-03, 1.524e-02, -1.007e-02, 1.030e-02, 4.032e-02, 9.837e-02, 9.371e-03, 4.740e-02, -4.795e-02, -3.356e-02, 8.602e-03, 2.484e-02, -9.889e-03, 6.734e-02, -2.287e-02), r);
r = MulAdd(s3_3, M4(6.303e-02, -7.328e-02, -5.082e-02, -5.070e-02, -8.129e-03, 7.948e-03, 7.351e-02, 5.601e-02, -4.169e-01, -8.219e-03, 3.373e-01, 1.781e-01, -5.139e-02, 1.471e-01, 3.415e-02, 7.690e-02), r);
r = MulAdd(s3_4, M4(1.455e-01, 6.664e-02, 5.792e-02, -1.276e-01, 2.360e-01, 5.978e-02, 5.147e-02, 2.707e-01, -7.581e-01, -2.740e-01, -3.000e-01, -2.017e-02, 2.005e-01, 6.934e-02, 2.996e-02, -3.036e-01), r);
r = MulAdd(s3_5, M4(-1.708e-01, -6.628e-02, 9.170e-02, 3.461e-02, -1.028e-01, -4.244e-02, -1.955e-01, -1.875e-01, 8.215e-02, 2.976e-02, -4.637e-03, 1.512e-01, -4.626e-02, 3.819e-03, -2.222e-01, -1.078e-01), r);
r = MulAdd(s3_6, M4(9.585e-02, 3.354e-02, -5.140e-03, -2.883e-02, -9.057e-02, 3.950e-02, -5.781e-02, -2.509e-02, -7.106e-02, -1.351e-01, -4.933e-02, 8.332e-02, -2.922e-02, -3.890e-02, 3.291e-03, 7.054e-02), r);
r = MulAdd(s3_7, M4(-8.208e-02, 1.377e-02, -2.475e-02, -1.353e-01, 9.728e-02, 7.136e-02, -3.984e-02, 1.374e-01, -1.160e-01, 4.362e-02, 6.714e-02, 5.038e-03, 1.866e-01, -2.349e-01, 8.599e-02, -1.510e-01), r);
r = MulAdd(s3_8, M4(-3.279e-02, 4.634e-02, 1.698e-02, 1.410e-01, -2.615e-02, 1.178e-02, 5.268e-02, 5.209e-02, 1.624e-02, 2.431e-02, -3.280e-02, 7.160e-02, -6.704e-02, -1.195e-01, -4.136e-02, -2.048e-01), r);
return r;
}
// Pass 5 ("conv4") entry point: each thread produces one texel of t0 and one of t1.
//   blockStart - origin of the block being processed; it is added to the in-block
//                offset returned by Rmp8x8(tid.x) to get this thread's texel.
//   tid        - thread id; only tid.x is read.
// The 3x3 windows of t2 (via l0) and t3 (via l1) around the texel are read and
// every tap is split into max(x, 0) and -max(-x, 0). The 36 resulting vectors are
// passed to f0 (stored in t0) and f1 (stored in t1).
void Pass5(uint2 blockStart, uint3 tid) {
	// pt, gxy, size and pos keep these exact names: the sampling macros are defined
	// outside this function and may refer to locals by name.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// ---- t2 window: s0_i = max(tap, 0), s1_i = -max(-tap, 0) ----
	// row y = -1
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);
	// row y = 0
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);
	// row y = +1
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	// ---- t3 window: s2_i = max(tap, 0), s3_i = -max(-tap, 0) ----
	// row y = -1
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	s2_0 = max(s2_0, 0.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	s2_1 = max(s2_1, 0.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	s2_2 = max(s2_2, 0.0);
	// row y = 0
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	s2_3 = max(s2_3, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	s2_4 = max(s2_4, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	s2_5 = max(s2_5, 0.0);
	// row y = +1
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	s2_6 = max(s2_6, 0.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	s2_7 = max(s2_7, 0.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both outputs are computed from the same 36 inputs.
	t0[gxy] = f0(
		s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
		s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
		s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
		s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[gxy] = f1(
		s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
		s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
		s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
		s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT
// Pass 6 input taps: l0 reads texture t0 and l1 reads texture t1 through the O
// macro at offset (x, y), converting the result to V4.
// NOTE(review): O is defined outside this section; presumably it samples relative
// to the current texel - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Pass 6 ("out-shuffle") output layer.
// Starts from a constant 4-component value, adds one MulAdd term per input vector
// (each with its own constant 4x4 matrix) and returns tanh of the accumulated
// value. Pass6 adds the four returned components to the luma of the four pixels
// of a 2x2 output quad.
// Inputs as passed by Pass6 (index i = 0..8 walks the 3x3 window row by row, from
// offset (-1,-1) to (1,1)):
//   s0_i = max(x, 0) of the l0 taps     s1_i = -max(-x, 0) of the l0 taps
//   s2_i = max(x, 0) of the l1 taps     s3_i = -max(-x, 0) of the l1 taps
// NOTE(review): MulAdd is declared outside this section; it is assumed to compute
// mul(s, M) + r - confirm against its definition.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Constant starting value of the accumulator (presumably the layer bias).
V4 r = { -1.667e-03, -2.914e-03, -1.783e-03, -1.113e-03 };
// Terms for s0_0..s0_8.
r = MulAdd(s0_0, M4(1.838e-01, -1.901e-02, 9.627e-03, -5.113e-02, -2.616e-02, -2.850e-02, -1.739e-02, 9.125e-03, 1.563e-02, -1.253e-02, 1.902e-02, -1.512e-02, 3.495e-03, -1.497e-02, 4.974e-03, -1.115e-02), r);
r = MulAdd(s0_1, M4(-4.288e-01, 2.549e-01, -1.017e-01, -1.945e-01, 4.749e-02, 5.258e-02, -8.284e-03, -2.265e-02, -7.010e-02, -1.542e-02, 6.889e-03, 4.028e-02, -1.355e-02, 4.321e-03, 3.330e-02, 8.256e-02), r);
r = MulAdd(s0_2, M4(-1.045e-03, -2.140e-01, -3.928e-02, 1.364e-02, 1.787e-03, -9.427e-03, 1.927e-03, -2.145e-03, -4.617e-03, 2.814e-02, 2.045e-02, 1.776e-02, 3.188e-02, 3.187e-02, -1.498e-02, -2.180e-02), r);
r = MulAdd(s0_3, M4(1.059e-02, -5.522e-02, 6.447e-02, -4.395e-02, -4.846e-02, -2.209e-02, 1.866e-02, -5.920e-02, -2.419e-02, 1.032e-02, 7.736e-04, 1.092e-03, 1.854e-02, -4.388e-03, 3.893e-02, 9.549e-03), r);
r = MulAdd(s0_4, M4(3.358e-02, 7.431e-03, 1.780e-01, 5.353e-01, 2.674e-01, 2.227e-01, 1.450e-01, 2.085e-01, -8.104e-03, -3.561e-02, -1.231e-01, -1.932e-01, -6.538e-02, 3.378e-02, -1.314e-01, -2.862e-02), r);
r = MulAdd(s0_5, M4(-7.887e-02, 4.783e-02, -1.380e-01, -3.877e-01, 3.436e-03, 4.712e-02, -1.250e-02, 2.247e-02, 2.920e-02, 7.190e-02, -2.005e-02, 6.169e-02, 5.594e-03, -4.630e-02, 9.661e-02, 3.625e-02), r);
r = MulAdd(s0_6, M4(1.963e-02, -1.873e-02, 1.489e-02, -1.253e-02, -9.356e-03, 1.334e-02, -3.747e-02, -1.115e-02, 7.741e-04, -6.463e-03, 3.707e-03, -3.598e-03, 1.783e-02, -5.539e-03, 9.899e-03, -9.354e-03), r);
r = MulAdd(s0_7, M4(-2.952e-03, 3.132e-02, -6.679e-02, 2.883e-02, -3.721e-02, -3.573e-02, 1.204e-01, -6.785e-02, -1.208e-02, -1.355e-04, 2.872e-02, 2.196e-02, -1.655e-02, 3.784e-02, -5.921e-03, 2.494e-02), r);
r = MulAdd(s0_8, M4(-1.215e-02, -2.947e-02, -2.454e-03, -6.326e-02, 2.248e-03, 2.302e-02, -2.863e-03, 5.834e-02, 2.187e-02, 9.973e-03, 2.158e-02, 4.902e-02, -2.207e-02, -3.485e-02, -5.118e-02, -6.696e-02), r);
// Terms for s1_0..s1_8.
r = MulAdd(s1_0, M4(8.377e-02, -3.093e-02, 2.280e-02, -2.664e-02, -4.333e-02, -3.292e-02, -8.109e-03, 1.105e-02, 1.507e-02, 9.138e-03, 2.597e-02, -1.926e-02, 4.537e-02, -9.080e-03, -1.629e-02, -1.180e-02), r);
r = MulAdd(s1_1, M4(-7.478e-02, 1.238e-01, -5.092e-02, -3.473e-02, 4.269e-02, 4.444e-02, 7.295e-03, 1.274e-04, -1.646e-01, -1.551e-03, 3.424e-02, 4.906e-02, -2.056e-01, -5.847e-02, 5.262e-02, 1.049e-01), r);
r = MulAdd(s1_2, M4(-6.323e-02, -1.179e-01, -1.982e-02, -4.065e-02, 3.089e-03, -9.469e-03, 2.850e-03, 3.314e-03, -1.819e-03, -1.065e-01, 1.882e-02, 9.349e-03, 1.624e-02, -2.906e-02, -2.029e-02, -5.020e-02), r);
r = MulAdd(s1_3, M4(1.101e-01, -3.490e-02, 1.327e-01, -2.853e-02, -5.027e-03, -5.703e-02, 6.484e-03, -6.473e-02, -4.310e-02, 3.882e-02, -3.100e-02, -7.837e-04, -5.501e-02, -1.261e-02, 7.285e-02, 4.648e-02), r);
r = MulAdd(s1_4, M4(-6.214e-02, 1.841e-01, -9.546e-02, 3.700e-01, 2.824e-01, 3.400e-01, 2.309e-01, 2.237e-01, 3.482e-01, -1.294e-01, -4.546e-01, -3.556e-01, -4.730e-01, -1.392e-01, 4.776e-01, 1.210e-01), r);
r = MulAdd(s1_5, M4(-5.408e-02, -1.286e-01, -6.571e-02, -1.230e-01, 9.991e-03, 6.421e-02, 4.305e-03, 1.780e-02, 2.254e-02, 3.661e-01, 6.275e-02, 8.004e-02, -1.834e-02, -3.465e-01, 9.274e-02, 3.935e-01), r);
r = MulAdd(s1_6, M4(-1.620e-03, -1.423e-02, 2.785e-02, -1.252e-02, -1.218e-02, 2.842e-03, -3.496e-02, -2.927e-02, -2.106e-02, -7.099e-03, 2.545e-02, 2.484e-02, 2.973e-02, 4.563e-04, 2.010e-04, -1.839e-02), r);
r = MulAdd(s1_7, M4(1.400e-04, 3.080e-02, -6.992e-03, 8.032e-02, -2.280e-02, -4.436e-02, 7.600e-02, 1.165e-02, -9.494e-02, -2.207e-02, 2.783e-01, 2.095e-01, 2.645e-02, 5.203e-02, -7.492e-02, -1.303e-02), r);
r = MulAdd(s1_8, M4(-5.360e-03, -2.277e-02, -2.252e-02, -6.191e-02, 1.263e-02, 1.540e-02, 9.566e-03, 3.637e-02, -1.265e-02, -3.092e-02, -1.298e-02, -1.187e-02, 1.286e-02, 1.181e-02, -5.675e-02, -5.487e-02), r);
// Terms for s2_0..s2_8.
r = MulAdd(s2_0, M4(6.275e-02, 3.332e-02, 2.458e-02, -1.910e-02, -1.764e-02, 2.292e-02, -3.220e-02, -1.127e-02, 4.114e-02, 4.303e-02, -3.355e-02, -8.882e-03, 1.881e-02, 1.788e-02, -3.354e-03, -1.345e-02), r);
r = MulAdd(s2_1, M4(-1.562e-01, -7.407e-02, -5.684e-02, -3.194e-03, 1.150e-01, -2.700e-02, -9.666e-03, -3.629e-02, 5.862e-02, 6.747e-02, -1.085e-02, -3.454e-02, 7.263e-05, 2.167e-02, 5.491e-03, -6.472e-02), r);
r = MulAdd(s2_2, M4(5.068e-03, 4.899e-02, 1.480e-02, -2.153e-02, 1.102e-02, 2.831e-02, -5.931e-03, 1.021e-02, -1.267e-02, -1.569e-02, 5.418e-04, 1.030e-02, -3.280e-02, -3.072e-02, -2.688e-02, -2.208e-02), r);
r = MulAdd(s2_3, M4(-7.105e-02, 1.664e-03, -3.108e-02, 6.985e-02, 3.176e-02, 2.312e-02, -3.835e-02, 3.884e-02, -1.038e-01, 6.660e-02, -1.372e-01, 2.432e-02, -2.888e-04, -2.049e-02, 2.271e-02, 9.383e-03), r);
r = MulAdd(s2_4, M4(4.697e-01, -3.721e-01, 1.705e-01, -2.767e-01, -1.791e-02, -7.276e-02, 2.503e-01, 1.040e-01, 1.180e-02, -5.212e-01, 4.014e-01, 1.946e-01, -2.547e-02, -1.567e-02, -5.652e-02, 9.687e-02), r);
r = MulAdd(s2_5, M4(3.024e-02, 1.618e-02, 2.619e-02, 8.868e-02, -5.217e-02, -7.642e-02, -3.704e-02, -2.374e-02, -4.639e-02, 5.743e-02, -3.967e-02, -2.450e-02, 2.091e-02, -1.108e-02, 6.949e-03, -1.502e-02), r);
r = MulAdd(s2_6, M4(5.298e-03, -6.810e-03, -1.982e-02, 1.960e-04, 3.645e-03, 5.483e-03, 3.357e-03, 3.697e-02, -1.339e-02, 3.253e-02, 3.649e-02, 4.492e-03, 2.076e-02, -9.046e-03, 2.043e-02, -7.803e-03), r);
r = MulAdd(s2_7, M4(-2.707e-02, 7.878e-02, 1.816e-01, -1.506e-03, 9.060e-03, -1.418e-02, -6.983e-02, -5.833e-02, 3.309e-02, -2.537e-02, -3.298e-01, -1.735e-01, -2.132e-03, 5.241e-02, 1.155e-02, 4.817e-02), r);
r = MulAdd(s2_8, M4(-1.781e-02, 1.652e-02, -7.188e-03, 2.114e-03, -1.105e-02, -1.137e-02, -9.037e-03, -5.600e-02, -1.220e-02, 8.292e-03, -3.404e-03, -4.211e-02, -1.018e-02, -1.004e-02, 1.505e-03, -4.591e-03), r);
// Terms for s3_0..s3_8.
r = MulAdd(s3_0, M4(7.349e-02, 2.926e-02, 2.398e-02, -1.821e-02, -1.290e-02, 1.201e-02, 5.000e-03, 1.316e-02, -1.567e-02, 2.025e-02, -2.171e-02, -3.941e-04, -7.948e-03, 6.116e-02, -9.445e-03, 1.911e-02), r);
r = MulAdd(s3_1, M4(-1.294e-01, -6.121e-02, -4.576e-02, 9.211e-03, 1.371e-02, -1.964e-02, -3.133e-03, 4.701e-03, 9.544e-02, 6.692e-03, 4.665e-04, -2.056e-02, 3.455e-01, -2.495e-01, 1.027e-02, -7.393e-02), r);
r = MulAdd(s3_2, M4(1.532e-02, 9.402e-03, 6.812e-04, -3.241e-02, 1.245e-03, 6.504e-03, 3.970e-03, 7.168e-03, 9.435e-03, 1.574e-02, -5.118e-03, 5.232e-03, -2.659e-02, -6.011e-02, -2.446e-02, 8.062e-04), r);
r = MulAdd(s3_3, M4(-6.714e-02, 3.454e-03, -1.486e-02, 5.921e-02, 5.177e-02, 3.766e-02, -1.473e-01, 4.371e-02, -7.118e-02, 2.462e-02, -1.810e-02, 3.430e-02, -5.552e-02, 3.047e-02, -5.066e-02, 5.769e-02), r);
r = MulAdd(s3_4, M4(3.191e-02, -1.387e-01, -5.992e-02, -1.554e-01, 4.660e-01, 3.655e-01, 2.406e-02, -3.902e-01, 3.973e-02, -1.333e-01, 1.792e-01, 8.854e-02, 2.477e-01, -3.115e-01, 6.035e-01, -4.717e-01), r);
r = MulAdd(s3_5, M4(7.241e-02, 1.273e-01, 6.810e-02, 1.118e-01, -5.454e-02, -1.728e-02, -1.007e-01, -2.265e-02, -4.534e-02, -5.171e-02, -2.524e-02, -3.337e-02, -2.366e-03, -1.723e-02, 2.300e-02, -9.889e-02), r);
r = MulAdd(s3_6, M4(3.383e-02, -7.898e-03, 1.681e-02, -5.131e-03, 3.687e-02, 1.929e-02, -7.695e-03, 2.145e-03, -2.814e-02, 3.366e-02, -8.788e-02, 3.614e-02, 2.951e-02, -6.964e-03, 2.272e-02, 1.581e-02), r);
r = MulAdd(s3_7, M4(7.020e-03, 6.046e-02, 2.975e-02, 5.663e-02, 2.155e-02, -1.786e-02, -2.588e-01, -1.310e-01, -6.372e-02, -1.218e-01, -7.160e-02, -3.058e-01, 2.297e-03, 3.050e-02, -2.346e-02, 4.674e-02), r);
r = MulAdd(s3_8, M4(-1.071e-02, -1.089e-02, 9.286e-03, 5.202e-02, -2.291e-02, -2.655e-02, 2.386e-02, -3.231e-02, 4.599e-03, 1.114e-02, -1.630e-02, -1.693e-02, -2.194e-03, 1.842e-02, -2.522e-02, 5.265e-02), r);
// Squash each component into (-1, 1) before it is used as a luma offset.
return tanh(r);
}
// Pass 6 ("out-shuffle") entry point: each thread writes one 2x2 quad of OUTPUT.
//   blockStart - origin of the block being processed, in output pixels.
//   tid        - thread id; only tid.x is read.
// gxy is the top-left pixel of the quad (the in-block offset from Rmp8x8 is
// doubled); only that corner is tested against the output size. The 3x3 windows
// of t0 (l0) and t1 (l1) around the source texel gxy >> 1 are split into
// max(x, 0) / -max(-x, 0) halves and passed to f0. For each of the four output
// pixels INPUT is sampled with SL, converted to YUV, one component of the f0
// result is added to the luma (clamped by saturate), and the colour is converted
// back to RGB with alpha 1.
void Pass6(uint2 blockStart, uint3 tid) {
	// pt, gxy, size and pos keep these exact names: the sampling macros are defined
	// outside this function and may refer to locals by name.
	const float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// ---- t0 window: s0_i = max(tap, 0), s1_i = -max(-tap, 0) ----
	// row y = -1
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	s0_0 = max(s0_0, 0.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	s0_1 = max(s0_1, 0.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	s0_2 = max(s0_2, 0.0);
	// row y = 0
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	s0_3 = max(s0_3, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	s0_4 = max(s0_4, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	s0_5 = max(s0_5, 0.0);
	// row y = +1
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	s0_6 = max(s0_6, 0.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	s0_7 = max(s0_7, 0.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_8 = max(s0_8, 0.0);

	// ---- t1 window: s2_i = max(tap, 0), s3_i = -max(-tap, 0) ----
	// row y = -1
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	s2_0 = max(s2_0, 0.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	s2_1 = max(s2_1, 0.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	s2_2 = max(s2_2, 0.0);
	// row y = 0
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	s2_3 = max(s2_3, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	s2_4 = max(s2_4, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	s2_5 = max(s2_5, 0.0);
	// row y = +1
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	s2_6 = max(s2_6, 0.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	s2_7 = max(s2_7, 0.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Four luma offsets, one per pixel of the quad (x, y, w, z in write order below).
	const V4 r = f0(
		s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
		s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
		s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
		s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

	static const MF3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081
	};
	static const MF3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099
	};

	// Move from the source texel centre to the centre of the quad's top-left pixel.
	const float2 opt = float2(GetOutputPt());
	pos -= 0.5f * opt;

	// The quad is walked clockwise by stepping gxy and pos in place; the exact
	// sequence of += / -= on pos is kept so the sample coordinates are unchanged.

	// top-left <- r.x
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);

	// top-right <- r.y
	++gxy.x;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);

	// bottom-right <- r.w
	++gxy.y;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);

	// bottom-left <- r.z
	--gxy.x;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

View file

@ -1,914 +0,0 @@
// CuNNy 4x8C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D08N04
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// Samples texture t at mip level 0 with sampler SP at `pos + p * pt`.
// `pos` and `pt` are not macro parameters: they must be variables in scope wherever
// the macro is expanded (pt is presumably the UV size of one texel - confirm).
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short alias for the 4-component vector type MF4.
#define V4 MF4
// Short alias for the 4x4 matrix type MF4x4.
#define M4 MF4x4
// Intermediate textures t0..t3: all declared at INPUT size with the
// R8G8B8A8_SNORM format. Pass 1 writes t0/t1, pass 2 reads t0/t1 and writes
// t2/t3, pass 3 reads t2/t3 and writes t0/t1 again (see the //!IN and //!OUT
// directives of each pass).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1
// Pass-1 tap: point-samples INPUT at texel offset (dx, dy), reduces the RGB
// value to one scalar with a fixed weight vector, and adds a constant offset.
#define l0(dx, dy) (dot(MF3(2.329e-01, 4.438e-01, 9.598e-02), O(INPUT, float2(dx, dy)).rgb) + MF(-5.664e-01))
// Pass 1 ("in"), first output group: folds the nine scalar taps s0_0..s0_8
// into four channels. The accumulator starts from a constant vector and every
// tap contributes its own per-channel weights through mad(), in tap order.
// Pass1 stores the returned value to t0.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
    V4 acc = { 3.244e-02, 2.492e-04, 8.562e-04, 1.261e-04 };
    acc = mad(s0_0, V4(-1.368e-01, -5.123e-02, -2.270e-01, -9.888e-02), acc);
    acc = mad(s0_1, V4(3.682e-01, 4.625e-02, 1.372e-01, 3.834e-01), acc);
    acc = mad(s0_2, V4(-9.245e-02, 7.555e-03, 3.923e-02, 1.252e-02), acc);
    acc = mad(s0_3, V4(-2.312e-01, 2.012e-02, 1.660e-01, 4.386e-01), acc);
    acc = mad(s0_4, V4(-3.965e-02, -4.834e-01, 3.729e-01, -7.207e-01), acc);
    acc = mad(s0_5, V4(2.190e-01, -9.021e-02, -1.087e-01, -9.632e-03), acc);
    acc = mad(s0_6, V4(4.088e-02, 1.183e-01, 8.976e-02, -1.710e-03), acc);
    acc = mad(s0_7, V4(-5.188e-03, 5.274e-01, -8.856e-02, -6.446e-03), acc);
    acc = mad(s0_8, V4(-7.160e-02, -9.349e-02, -3.823e-01, 1.947e-03), acc);
    return acc;
}
// Pass 1 ("in"), second output group: same shape as f0 with a different
// starting vector and different per-tap weights. Pass1 stores the returned
// value to t1.
V4 f1(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
    V4 acc = { -1.971e-02, 8.202e-02, 4.706e-03, -6.665e-02 };
    acc = mad(s0_0, V4(2.403e-02, 8.569e-03, -8.618e-02, 2.022e-02), acc);
    acc = mad(s0_1, V4(4.893e-01, 2.383e-02, 2.423e-02, -3.486e-01), acc);
    acc = mad(s0_2, V4(-3.682e-02, 2.437e-03, 1.872e-01, 1.135e-01), acc);
    acc = mad(s0_3, V4(-2.361e-02, 2.588e-02, 7.348e-02, -8.229e-03), acc);
    acc = mad(s0_4, V4(-4.433e-01, -5.131e-01, -3.778e-01, 6.107e-02), acc);
    acc = mad(s0_5, V4(-4.423e-02, 2.098e-02, 9.260e-03, 4.444e-02), acc);
    acc = mad(s0_6, V4(-1.370e-02, 1.009e-02, 3.020e-01, 1.159e-02), acc);
    acc = mad(s0_7, V4(-3.030e-03, 8.145e-03, -2.789e-02, -7.085e-03), acc);
    acc = mad(s0_8, V4(2.648e-02, 4.731e-03, -1.067e-01, -4.477e-03), acc);
    return acc;
}
// Entry point of pass 1. Derives one texel coordinate from the block origin
// and the thread index, gathers the 3x3 neighbourhood of INPUT as nine scalar
// taps via l0(), and writes the two 4-channel results of f0/f1 to t0 and t1.
// NOTE(review): Rmp8x8 is defined outside this file; presumably it maps the
// linear thread index to a coordinate inside the 8x8 block — confirm.
void Pass1(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep exactly these names: the O() macro behind l0()
    // expands to them.
    const float2 pt = float2(GetInputPt());
    const uint2 gxy = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Coordinates outside the input produce no output.
    if (any(gxy >= extent)) {
        return;
    }
    const float2 pos = (gxy + 0.5) * pt;

    // Taps listed row by row, from offset (-1, -1) to (1, 1).
    const MF c0 = l0(-1.0, -1.0);
    const MF c1 = l0(0.0, -1.0);
    const MF c2 = l0(1.0, -1.0);
    const MF c3 = l0(-1.0, 0.0);
    const MF c4 = l0(0.0, 0.0);
    const MF c5 = l0(1.0, 0.0);
    const MF c6 = l0(-1.0, 1.0);
    const MF c7 = l0(0.0, 1.0);
    const MF c8 = l0(1.0, 1.0);

    t0[gxy] = f0(c0, c1, c2, c3, c4, c5, c6, c7, c8);
    t1[gxy] = f1(c0, c1, c2, c3, c4, c5, c6, c7, c8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// Pass-2 taps: point-sample t0 (l0) or t1 (l1) at texel offset (dx, dy).
#define l0(dx, dy) V4(O(t0, float2(dx, dy)))
#define l1(dx, dy) V4(O(t1, float2(dx, dy)))
// Pass 2 ("conv1"), first output group: computes four output channels from 36
// V4 taps. `r` starts from a constant vector and each tap is folded in through
// MulAdd with its own 4x4 weight matrix, in the fixed order s0_0 ... s3_8.
// As called from Pass2: s0_* = max(x, 0) and s1_* = -max(-x, 0) of the nine t0
// taps, s2_* / s3_* are the same split of the nine t1 taps, and the result is
// stored to t2.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in
// this file; presumably it returns mul(tap, matrix) + r — confirm.
// The constants look like generated network weights (see the CuNNy header);
// presumably they should not be edited by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = { 3.430e-02, -1.031e-02, -1.631e-02, -3.189e-02 };
r = MulAdd(s0_0, M4(1.205e-01, 8.504e-02, -7.328e-02, 1.539e-01, -9.103e-03, -2.708e-02, -1.401e-01, -2.159e-01, -2.552e-01, 7.462e-02, 5.919e-02, 8.905e-02, 1.169e-01, -4.383e-03, -1.997e-01, -1.379e-01), r);
r = MulAdd(s0_1, M4(2.844e-02, 2.238e-02, 2.143e-01, -1.624e-01, 1.885e-01, 1.316e-01, -1.276e-01, -1.713e-01, 2.553e-03, -1.343e-01, 4.700e-02, 4.762e-01, -2.676e-01, 1.784e-01, -4.065e-02, 1.015e-01), r);
r = MulAdd(s0_2, M4(-4.442e-03, 3.253e-01, 2.650e-02, -2.907e-01, 2.749e-01, -3.510e-01, 8.545e-02, -2.446e-01, -1.579e-01, 9.398e-02, -4.544e-02, -9.123e-02, -2.529e-01, -2.538e-01, -2.686e-01, 2.607e-01), r);
r = MulAdd(s0_3, M4(1.518e-01, -1.515e-01, -1.597e-01, 2.163e-01, -6.933e-02, 7.220e-02, 2.114e-01, -2.227e-01, -3.743e-01, 9.056e-02, 2.612e-02, 3.036e-01, -1.583e-02, -8.293e-02, -1.068e-01, 6.201e-02), r);
r = MulAdd(s0_4, M4(-2.305e-02, 9.029e-02, -1.003e-01, -2.375e-01, -1.891e-01, 3.623e-01, -2.999e-01, -4.511e-01, 1.460e-01, -3.825e-01, 1.231e-01, 6.391e-01, -6.041e-01, 5.588e-01, -3.508e-01, -3.131e-01), r);
r = MulAdd(s0_5, M4(8.812e-02, 2.197e-01, -8.630e-03, 2.287e-02, -1.918e-01, -6.428e-01, 1.496e-01, 2.272e-01, 3.445e-02, -7.188e-03, -8.518e-02, 1.948e-01, 1.606e-01, -8.707e-01, 2.092e-02, -4.993e-01), r);
r = MulAdd(s0_6, M4(9.718e-03, 8.373e-03, 7.436e-02, -1.552e-01, 8.410e-02, -1.728e-02, -1.971e-01, 2.255e-02, -8.645e-02, 1.863e-02, -9.399e-02, -8.424e-02, -1.533e-03, 1.223e-01, 2.715e-01, -1.268e-01), r);
r = MulAdd(s0_7, M4(-4.246e-01, -1.034e-01, 3.236e-01, 5.680e-01, -1.213e-02, 1.577e-01, -9.408e-02, -7.294e-02, -6.410e-02, 4.264e-02, -8.392e-03, 2.192e-01, 1.656e-01, 4.681e-02, 9.146e-01, -6.311e-02), r);
r = MulAdd(s0_8, M4(-1.847e-01, -9.105e-02, -3.260e-02, 2.506e-01, -6.470e-02, 4.430e-02, -1.242e-02, -1.097e-01, 5.488e-02, 9.106e-02, 3.144e-02, -3.367e-05, 2.468e-01, -2.535e-01, 1.409e-01, -5.311e-01), r);
r = MulAdd(s1_0, M4(1.294e-01, 1.098e-01, 7.497e-03, 1.016e-01, 1.377e-02, -1.480e-02, -2.694e-02, -3.417e-02, -1.083e-01, -2.575e-03, 1.137e-01, -2.616e-01, -1.260e-01, -2.567e-02, -1.958e-01, 6.103e-02), r);
r = MulAdd(s1_1, M4(-1.355e-01, 1.168e-01, 2.368e-01, -2.379e-01, 8.556e-01, 1.401e-01, 3.238e-01, 2.737e-01, 8.041e-02, -1.662e-01, 9.181e-02, -3.488e-01, -1.586e-01, 1.407e-01, -1.126e-01, 1.825e-01), r);
r = MulAdd(s1_2, M4(-1.881e-02, 4.604e-01, -1.712e-02, 3.453e-02, 3.171e-01, -1.126e-01, 6.510e-02, 2.908e-01, -9.125e-02, 7.793e-02, -5.580e-02, -3.603e-01, 9.996e-02, -2.647e-01, -2.114e-01, 2.330e-01), r);
r = MulAdd(s1_3, M4(2.957e-01, -1.252e-01, -2.840e-01, 1.815e-01, -2.900e-01, 1.027e-01, 1.404e-01, -1.123e-01, -1.767e-01, 1.535e-03, -3.568e-03, -2.824e-01, 2.015e-01, -7.712e-02, -6.140e-02, 6.517e-02), r);
r = MulAdd(s1_4, M4(-2.439e-01, 7.096e-02, -2.116e-01, -1.980e-01, -3.221e-01, 2.007e-01, -4.243e-01, -5.013e-01, 1.181e-01, -3.735e-01, 1.812e-01, -5.095e-01, 3.646e-01, 4.013e-01, -8.028e-02, 1.287e-01), r);
r = MulAdd(s1_5, M4(-8.389e-02, -1.091e-01, 6.962e-02, 2.605e-01, -3.435e-03, -5.146e-01, 4.125e-01, 5.487e-01, -1.481e-01, 6.810e-02, -1.450e-01, -9.583e-02, 3.305e-01, -1.238e+00, 2.036e-01, 1.879e-01), r);
r = MulAdd(s1_6, M4(-8.033e-02, 5.944e-03, 2.453e-01, -2.971e-01, -5.652e-02, -1.251e-02, -1.449e-01, -5.344e-02, -1.377e-01, 9.383e-03, -1.862e-01, -2.528e-01, -3.825e-02, 7.296e-02, 2.373e-01, -1.935e-01), r);
r = MulAdd(s1_7, M4(-1.795e-01, 1.597e-01, 2.709e-01, -3.738e-01, 2.604e-02, 1.678e-01, -8.718e-02, -9.483e-03, -3.844e-02, 6.235e-02, -1.344e-01, 1.837e-02, -3.074e-02, 2.568e-02, 1.030e+00, 1.831e-01), r);
r = MulAdd(s1_8, M4(4.299e-02, 6.530e-03, -2.571e-02, 3.382e-01, -1.327e-01, 2.975e-02, -2.861e-02, 1.963e-01, 8.130e-04, 9.743e-02, -1.177e-02, -1.273e-01, -1.265e-01, -3.003e-01, 2.635e-01, 5.426e-02), r);
r = MulAdd(s2_0, M4(-1.538e-01, 1.580e-01, 1.392e-01, -1.077e-01, -1.228e-01, 1.853e-01, -1.010e-01, 3.144e-02, 2.203e-01, -3.309e-02, 6.819e-02, 2.708e-01, 1.720e-01, 2.635e-01, -1.290e-01, -2.932e-01), r);
r = MulAdd(s2_1, M4(1.615e-01, -1.424e-01, -2.346e-01, -1.008e-01, 1.386e-01, -2.281e-01, -1.313e-01, -5.902e-02, -3.376e-02, 1.925e-01, -1.172e-01, 7.865e-02, 2.112e-01, -7.280e-02, -1.953e-01, -1.198e-02), r);
r = MulAdd(s2_2, M4(1.280e-01, -1.353e-01, 1.251e-01, 3.212e-02, -1.144e-01, -1.492e-01, -1.499e-01, 2.211e-01, 1.307e-01, 1.336e-01, 1.977e-01, -1.429e-02, -5.395e-02, -2.772e-02, -3.214e-01, -1.907e-01), r);
r = MulAdd(s2_3, M4(-2.703e-01, 3.122e-01, 1.951e-01, -2.005e-01, 1.463e-01, 3.000e-01, 1.058e-01, 8.352e-02, 1.567e-01, -1.256e-01, -1.854e-01, -2.018e-01, 3.248e-01, 8.780e-02, 1.586e-01, -9.757e-03), r);
r = MulAdd(s2_4, M4(3.941e-02, -1.430e-01, 1.023e-01, 2.878e-01, 8.414e-02, 1.385e-01, 8.032e-02, -6.330e-02, -1.020e-01, 2.731e-01, -6.877e-02, -3.492e-01, 3.758e-01, -7.526e-02, 4.955e-01, -5.595e-01), r);
r = MulAdd(s2_5, M4(2.684e-01, -1.924e-02, -2.975e-02, 7.205e-01, 6.611e-02, -1.645e-01, 1.267e-01, 6.066e-02, 1.695e-01, -4.367e-01, -1.450e-01, -4.074e-02, 4.469e-01, -7.176e-03, 4.177e-01, -4.565e-01), r);
r = MulAdd(s2_6, M4(-1.843e-01, 2.522e-01, 3.324e-01, -1.821e-01, -1.327e-01, 1.182e-01, 1.158e-01, -2.494e-01, -6.459e-03, -6.606e-03, 1.333e-01, 2.229e-01, 2.481e-01, -2.018e-01, 2.456e-01, 2.351e-01), r);
r = MulAdd(s2_7, M4(-6.894e-03, -2.822e-01, -1.863e-01, -2.252e-01, 6.755e-02, -1.766e-01, 8.884e-02, -2.720e-03, -4.431e-02, -2.119e-02, 2.876e-01, -5.268e-01, -3.635e-01, -1.001e-01, -8.433e-01, 5.160e-01), r);
r = MulAdd(s2_8, M4(-1.786e-01, 2.208e-01, 4.289e-01, 1.663e-01, -2.341e-01, 8.148e-03, -7.557e-02, 7.817e-02, -1.340e-01, -2.341e-01, 3.123e-02, 1.120e-01, -7.753e-01, 2.056e-01, -2.926e-01, -1.222e-01), r);
r = MulAdd(s3_0, M4(-4.903e-02, 1.377e-01, 6.984e-02, -1.053e-02, -5.115e-01, 2.891e-01, -4.612e-01, -6.693e-01, 4.752e-02, -5.287e-02, -2.183e-02, 4.134e-01, 1.073e-02, 2.383e-01, -2.142e-01, 1.384e-01), r);
r = MulAdd(s3_1, M4(1.680e-01, -1.307e-01, -1.038e-01, -2.130e-02, -1.231e+00, -2.602e-01, -5.456e-01, 3.295e-01, -5.588e-02, 1.505e-01, -4.784e-02, -1.493e-01, 1.202e-01, -2.349e-01, -1.452e-01, -5.111e-02), r);
r = MulAdd(s3_2, M4(-8.858e-02, -1.293e-01, 9.441e-02, -1.295e-01, -3.373e-01, -1.841e-01, -1.818e-01, 1.570e+00, -8.336e-02, 2.012e-01, 1.362e-01, 1.830e-01, -6.053e-02, -1.725e-03, -2.011e-01, -1.021e-01), r);
r = MulAdd(s3_3, M4(-2.017e-01, 3.505e-01, 3.541e-02, 2.044e-01, -3.839e-01, 5.124e-01, 1.104e-01, 1.311e-01, 1.022e-01, -1.111e-01, -2.883e-01, 1.086e-01, 9.932e-02, 1.308e-01, 2.954e-01, -1.416e-02), r);
r = MulAdd(s3_4, M4(6.088e-02, -4.532e-02, -1.302e-01, -1.067e-01, -4.196e+00, 7.383e-01, -2.786e-01, -2.053e+00, -3.758e-01, 2.955e-01, -1.898e-01, 1.875e-01, 1.263e-01, 9.931e-03, 1.016e-01, 5.201e-02), r);
r = MulAdd(s3_5, M4(9.722e-03, -5.478e-02, -1.823e-01, -3.983e-02, -2.434e+00, -4.700e-01, 4.168e-01, 3.938e-01, 1.251e-01, -2.933e-01, -2.054e-02, 8.827e-02, 2.048e-02, 6.212e-02, 1.448e-01, 1.042e-01), r);
r = MulAdd(s3_6, M4(-1.605e-02, 1.851e-01, 2.427e-01, 4.894e-02, -6.032e-01, -3.413e-02, 4.158e-01, 6.903e-01, -1.865e-02, -1.318e-02, 1.003e-01, 3.193e-01, 4.503e-02, 1.880e-01, -4.608e-02, -3.137e-01), r);
r = MulAdd(s3_7, M4(-4.125e-02, -1.494e-01, 8.853e-01, -1.540e-01, -2.445e-01, 2.292e-01, 1.684e+00, 1.098e+00, 5.576e-02, -8.241e-02, 2.507e-01, -1.086e-01, 1.392e-01, -2.115e-01, -2.600e-01, 9.268e-02), r);
r = MulAdd(s3_8, M4(5.677e-02, 9.206e-02, 5.863e-02, 5.663e-02, -2.019e+00, -1.006e-01, -1.769e-01, -3.617e-01, 1.293e-02, -2.766e-01, 2.843e-02, 3.331e-01, -2.316e-01, -1.762e-01, -6.013e-03, -2.482e-02), r);
return r;
}
// Pass 2 ("conv1"), second output group: same structure as the pass-2 f0 (36
// V4 taps folded into `r` through MulAdd, in the order s0_0 ... s3_8) with a
// different starting vector and different 4x4 weight matrices.
// As called from Pass2, it receives the same 36 arguments as f0 and its result
// is stored to t3.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in
// this file; presumably it returns mul(tap, matrix) + r — confirm.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = { 8.007e-03, 2.570e-02, 2.487e-03, -2.496e-02 };
r = MulAdd(s0_0, M4(3.260e-02, 1.675e-01, 8.130e-02, -2.153e-01, -1.987e-01, -9.443e-02, 3.512e-01, 2.289e-02, 9.481e-02, -1.921e-01, -3.818e-01, 1.373e-01, -9.032e-02, 7.892e-02, 1.392e-01, -6.033e-02), r);
r = MulAdd(s0_1, M4(-8.203e-02, -1.015e-01, -1.313e-02, -5.337e-02, -2.948e-01, -2.678e-01, -2.321e-01, -5.995e-01, 1.364e-01, 1.030e-01, 1.546e-01, -1.179e-02, 1.996e-01, 2.244e-01, -2.304e-01, -1.304e-02), r);
r = MulAdd(s0_2, M4(-2.319e-02, -2.236e-02, 3.976e-02, 1.804e-01, 6.474e-02, 1.315e-01, -1.456e-02, -1.538e-01, 3.061e-02, -1.998e-02, -1.918e-02, -8.662e-02, -1.980e-01, -1.596e-01, -4.624e-01, -3.728e-01), r);
r = MulAdd(s0_3, M4(-3.171e-03, -2.887e-02, 3.107e-01, -8.532e-02, 1.489e-02, -2.798e-01, -2.458e-02, 2.922e-01, 5.196e-02, 2.333e-02, -4.100e-01, 3.851e-01, 8.566e-02, 1.655e-01, 3.680e-01, -3.572e-01), r);
r = MulAdd(s0_4, M4(4.618e-02, -3.100e-02, -1.849e-01, 2.228e-02, -2.182e-01, -5.806e-01, -6.298e-02, 2.421e-01, 4.266e-01, 7.738e-02, 4.856e-03, -1.191e-01, 3.469e-01, -8.683e-02, -2.397e-01, 6.512e-02), r);
r = MulAdd(s0_5, M4(8.363e-02, -9.745e-02, 2.398e-01, -1.335e-01, -1.585e-01, -1.161e-02, 2.482e-02, 1.319e-03, -4.696e-02, -6.675e-02, -7.519e-02, 1.125e-01, -1.199e-01, -9.094e-03, -2.590e-01, -8.812e-01), r);
r = MulAdd(s0_6, M4(7.745e-02, 3.414e-02, 6.378e-02, -8.388e-02, 4.456e-02, 1.354e-02, -1.138e-02, 1.131e-01, 2.361e-01, 1.828e-01, -2.135e-01, -1.100e-02, 1.683e-01, 2.134e-01, 1.832e-01, 8.420e-02), r);
r = MulAdd(s0_7, M4(-3.223e-01, -4.870e-02, -1.457e-01, 1.996e-01, -1.632e-01, -1.811e-01, -1.625e-01, 4.046e-02, -8.959e-02, 1.432e-01, -2.360e-02, -9.415e-02, -1.547e-01, 1.379e-01, 5.098e-01, -4.069e-01), r);
r = MulAdd(s0_8, M4(1.568e-01, -2.510e-02, -9.894e-02, 1.124e-01, -1.372e-01, 5.952e-03, 4.501e-02, 9.591e-03, 1.430e-01, 6.422e-02, -1.412e-03, 1.042e-02, 4.601e-02, -5.133e-02, -7.936e-02, -1.621e-01), r);
r = MulAdd(s1_0, M4(1.380e-01, 1.774e-01, 2.958e-01, -2.044e-01, -2.085e-01, 7.192e-03, -7.903e-02, 6.119e-02, -3.542e-02, -1.060e-01, -1.832e-01, 3.603e-01, -3.854e-02, 5.092e-02, -1.092e-01, -2.074e-01), r);
r = MulAdd(s1_1, M4(-5.638e-02, -1.659e-01, -1.006e-02, 5.355e-02, -2.243e-01, 3.533e-01, -2.130e-01, 6.480e-02, 4.462e-02, 1.065e-01, 1.598e-01, 5.025e-03, -3.810e-02, 1.012e-01, 2.123e-02, 2.124e-01), r);
r = MulAdd(s1_2, M4(5.207e-02, -1.428e-01, 1.745e-01, 2.563e-01, 4.058e-01, 5.320e-02, 3.527e-03, -4.664e-02, -1.641e-03, -2.830e-02, 1.453e-02, 1.169e-01, -5.840e-01, -1.545e-01, 3.880e-01, 1.250e-01), r);
r = MulAdd(s1_3, M4(-2.089e-01, 3.070e-02, 3.770e-01, -2.868e-01, -1.965e-01, -2.499e-01, -2.145e-01, 5.348e-02, -1.201e-01, -3.454e-01, -5.723e-01, 4.313e-01, -7.068e-02, -6.358e-02, -2.426e-02, -2.841e-01), r);
r = MulAdd(s1_4, M4(1.315e-01, 2.464e-01, -2.505e-01, -1.589e-01, 4.124e-01, 4.860e-01, -2.493e-01, 1.201e-01, -1.304e-01, -1.620e-01, 2.228e-01, 4.485e-02, 6.945e-02, -2.261e-01, -8.190e-04, 5.678e-01), r);
r = MulAdd(s1_5, M4(3.529e-01, 1.800e-02, -9.794e-02, -1.160e-01, 7.052e-01, 4.176e-01, 5.822e-02, -5.300e-02, -1.144e-01, -1.890e-01, 1.337e-01, 1.163e-01, -5.024e-01, 9.977e-01, 1.831e-01, 2.166e-02), r);
r = MulAdd(s1_6, M4(-1.239e-01, 1.465e-01, 3.700e-01, -1.638e-01, -1.022e-01, -3.216e-02, -2.412e-02, -2.505e-02, 5.450e-02, -1.325e-02, -2.760e-01, 5.219e-02, -5.604e-02, 3.602e-02, -1.026e-01, 4.063e-02), r);
r = MulAdd(s1_7, M4(1.669e-01, 2.580e-01, -2.923e-01, -2.497e-01, 1.135e-01, -1.599e-01, -2.419e-01, -1.202e-01, -3.903e-01, -2.141e-01, 9.642e-02, -6.096e-02, -6.762e-01, 5.614e-01, 3.076e-01, -4.187e-01), r);
r = MulAdd(s1_8, M4(5.456e-02, -6.641e-02, -3.839e-01, 8.629e-02, 1.149e-01, 1.204e-02, -2.509e-02, -1.413e-03, -1.329e-02, -5.670e-02, -6.186e-02, 5.108e-02, 3.592e-02, 4.563e-01, -7.450e-02, -2.259e-01), r);
r = MulAdd(s2_0, M4(1.013e-01, -2.126e-02, -1.260e-01, 8.480e-03, -3.292e-02, 6.069e-04, 4.154e-02, 5.578e-02, 1.586e-02, 8.252e-02, 1.237e-01, -1.312e-01, 1.489e-01, 2.561e-01, -9.917e-02, -1.060e-01), r);
r = MulAdd(s2_1, M4(-1.285e-01, -8.314e-02, 1.521e-02, 1.037e-01, -1.021e-02, 7.112e-02, -2.319e-02, 7.051e-04, -1.101e-01, -1.896e-01, -2.458e-01, -7.399e-02, -4.133e-02, 1.606e-01, -1.511e-01, -2.425e-01), r);
r = MulAdd(s2_2, M4(7.543e-02, 9.235e-02, 2.139e-01, 2.879e-01, 9.583e-02, 4.372e-02, -8.231e-02, 2.498e-01, 1.241e-01, 1.377e-02, 2.380e-01, 2.586e-02, -1.926e-01, -1.406e-01, -3.627e-01, -8.414e-02), r);
r = MulAdd(s2_3, M4(9.655e-03, -9.581e-02, -6.071e-02, 2.231e-01, -1.148e-01, -3.513e-02, -2.013e-02, -1.094e-01, -1.606e-01, 9.180e-02, 3.498e-01, -2.726e-01, -7.696e-03, -4.007e-01, -8.497e-02, -6.989e-01), r);
r = MulAdd(s2_4, M4(4.965e-03, -1.346e-01, -4.517e-02, 2.043e-01, -1.348e-01, 1.451e-01, 8.113e-02, -8.530e-02, -1.414e-01, 7.261e-02, -2.368e-01, 1.601e-01, -2.438e-02, -2.554e-01, 4.057e-01, -2.224e-01), r);
r = MulAdd(s2_5, M4(-8.716e-02, 1.496e-01, -4.429e-02, 6.451e-01, -9.547e-03, -3.189e-02, -1.096e-01, -5.416e-02, -5.032e-01, 1.331e-01, 2.389e-02, 1.028e-01, -3.186e-01, -2.524e-01, 2.663e-02, -9.995e-03), r);
r = MulAdd(s2_6, M4(-2.465e-01, 1.585e-01, 3.196e-01, -9.098e-02, 2.765e-02, -1.793e-01, 1.519e-01, -9.565e-04, -1.160e-01, -3.035e-02, -1.082e-01, 3.172e-02, 5.502e-01, -6.251e-01, -4.487e-01, 1.932e-01), r);
r = MulAdd(s2_7, M4(-5.017e-01, -5.180e-01, -2.682e-01, -4.715e-01, 1.958e-02, -7.007e-02, -3.332e-02, -8.389e-02, -1.135e-01, -2.956e-02, 1.994e-01, 2.315e-02, -2.553e-01, -3.153e-03, 4.275e-01, 1.669e+00), r);
r = MulAdd(s2_8, M4(1.400e-01, 6.775e-01, 5.287e-02, 2.007e-02, 1.213e-01, -1.460e-03, -2.313e-02, 1.282e-01, -8.355e-02, 2.399e-01, -5.277e-02, -1.499e-01, 7.246e-02, -2.553e-02, 2.185e-01, 8.662e-01), r);
r = MulAdd(s3_0, M4(3.069e-02, -3.668e-02, -3.646e-02, 1.140e-01, -7.882e-02, 2.759e-01, 9.170e-01, 2.779e-01, 1.459e-01, 3.766e-02, -1.214e-01, 5.718e-03, -3.323e-02, 9.705e-02, -1.282e-02, -1.401e-01), r);
r = MulAdd(s3_1, M4(-1.405e-02, 2.809e-02, 1.466e-01, -1.286e-01, 4.754e-01, 8.076e-01, 5.775e-02, -5.403e-01, 1.919e-01, -2.015e-01, -1.976e-01, -8.544e-02, -8.431e-02, 9.302e-02, 6.560e-02, 2.011e-02), r);
r = MulAdd(s3_2, M4(2.107e-01, 2.334e-02, -2.591e-01, -1.023e-01, 6.461e-01, 1.138e+00, 3.917e-01, 2.270e-01, 4.023e-01, 6.135e-02, 4.125e-02, -5.551e-02, 1.871e-02, -1.344e-01, -1.534e-01, 1.216e-01), r);
r = MulAdd(s3_3, M4(8.077e-02, -1.149e-01, 6.733e-02, -9.044e-03, -6.431e-02, -1.755e-02, 2.617e+00, 5.203e-01, 8.910e-02, 9.642e-02, 3.720e-01, -2.326e-01, -1.142e-01, -4.017e-02, 2.351e-01, -1.062e-01), r);
r = MulAdd(s3_4, M4(-2.427e-01, -4.425e-03, 4.260e-01, -6.273e-02, 4.224e+00, -2.047e+00, -1.911e+00, 2.329e+00, 2.987e-01, -3.286e-01, -1.115e-01, 2.053e-01, -5.309e-02, -8.751e-02, -1.275e-02, -2.105e-01), r);
r = MulAdd(s3_5, M4(-1.413e-02, -4.404e-01, -1.525e-01, -1.703e-01, -9.999e-01, 5.276e-01, 4.779e-01, -5.145e-01, 4.772e-01, 2.730e-02, -7.651e-02, -2.235e-01, -1.122e-01, -1.686e-01, 9.595e-02, -1.169e-01), r);
r = MulAdd(s3_6, M4(-1.162e-01, 3.109e-01, -2.686e-01, -1.492e-01, 2.122e-01, 6.911e-01, 7.412e-01, 3.675e-02, 1.420e-01, -3.979e-02, -3.526e-02, -1.170e-01, 2.192e-01, 6.369e-02, 2.568e-01, 1.606e-02), r);
r = MulAdd(s3_7, M4(-2.482e-02, 6.355e-01, 4.230e-01, -4.331e-01, -1.462e+00, -9.944e-01, 1.154e+00, 8.760e-01, 3.625e-01, 2.127e-01, 3.382e-01, 6.009e-02, 1.431e-01, 9.892e-02, -2.409e-01, 4.223e-02), r);
r = MulAdd(s3_8, M4(-1.832e-02, 7.811e-02, -1.928e-02, 1.448e-01, -1.288e+00, 1.805e-01, 6.324e-01, -2.704e-02, 6.456e-02, -6.364e-02, 4.971e-02, -6.535e-03, 1.766e-01, 5.142e-02, -1.375e-01, 2.532e-01), r);
return r;
}
// Entry point of pass 2. Derives one texel coordinate from the block origin
// and the thread index, gathers the 3x3 neighbourhoods of t0 and t1, splits
// every tap into max(x, 0) and -max(-x, 0), and feeds the resulting 36 vectors
// to f0/f1, whose results are written to t2 and t3.
// NOTE(review): Rmp8x8 is defined outside this file; presumably it maps the
// linear thread index to a coordinate inside the 8x8 block — confirm.
void Pass2(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep exactly these names: the O() macro behind
    // l0()/l1() expands to them.
    const float2 pt = float2(GetInputPt());
    const uint2 gxy = blockStart + Rmp8x8(tid.x);
    const uint2 extent = GetInputSize();
    // Coordinates outside the input produce no output.
    if (any(gxy >= extent)) {
        return;
    }
    const float2 pos = (gxy + 0.5) * pt;

    // Raw taps of t0, row by row from offset (-1, -1) to (1, 1).
    const V4 a0 = l0(-1.0, -1.0);
    const V4 a1 = l0(0.0, -1.0);
    const V4 a2 = l0(1.0, -1.0);
    const V4 a3 = l0(-1.0, 0.0);
    const V4 a4 = l0(0.0, 0.0);
    const V4 a5 = l0(1.0, 0.0);
    const V4 a6 = l0(-1.0, 1.0);
    const V4 a7 = l0(0.0, 1.0);
    const V4 a8 = l0(1.0, 1.0);

    // Raw taps of t1, same ordering.
    const V4 b0 = l1(-1.0, -1.0);
    const V4 b1 = l1(0.0, -1.0);
    const V4 b2 = l1(1.0, -1.0);
    const V4 b3 = l1(-1.0, 0.0);
    const V4 b4 = l1(0.0, 0.0);
    const V4 b5 = l1(1.0, 0.0);
    const V4 b6 = l1(-1.0, 1.0);
    const V4 b7 = l1(0.0, 1.0);
    const V4 b8 = l1(1.0, 1.0);

    // t0 taps clamped from below at zero.
    const V4 aPos0 = max(a0, 0.0);
    const V4 aPos1 = max(a1, 0.0);
    const V4 aPos2 = max(a2, 0.0);
    const V4 aPos3 = max(a3, 0.0);
    const V4 aPos4 = max(a4, 0.0);
    const V4 aPos5 = max(a5, 0.0);
    const V4 aPos6 = max(a6, 0.0);
    const V4 aPos7 = max(a7, 0.0);
    const V4 aPos8 = max(a8, 0.0);

    // t0 taps with only their non-positive part kept.
    const V4 aNeg0 = -max(-a0, 0.0);
    const V4 aNeg1 = -max(-a1, 0.0);
    const V4 aNeg2 = -max(-a2, 0.0);
    const V4 aNeg3 = -max(-a3, 0.0);
    const V4 aNeg4 = -max(-a4, 0.0);
    const V4 aNeg5 = -max(-a5, 0.0);
    const V4 aNeg6 = -max(-a6, 0.0);
    const V4 aNeg7 = -max(-a7, 0.0);
    const V4 aNeg8 = -max(-a8, 0.0);

    // t1 taps clamped from below at zero.
    const V4 bPos0 = max(b0, 0.0);
    const V4 bPos1 = max(b1, 0.0);
    const V4 bPos2 = max(b2, 0.0);
    const V4 bPos3 = max(b3, 0.0);
    const V4 bPos4 = max(b4, 0.0);
    const V4 bPos5 = max(b5, 0.0);
    const V4 bPos6 = max(b6, 0.0);
    const V4 bPos7 = max(b7, 0.0);
    const V4 bPos8 = max(b8, 0.0);

    // t1 taps with only their non-positive part kept.
    const V4 bNeg0 = -max(-b0, 0.0);
    const V4 bNeg1 = -max(-b1, 0.0);
    const V4 bNeg2 = -max(-b2, 0.0);
    const V4 bNeg3 = -max(-b3, 0.0);
    const V4 bNeg4 = -max(-b4, 0.0);
    const V4 bNeg5 = -max(-b5, 0.0);
    const V4 bNeg6 = -max(-b6, 0.0);
    const V4 bNeg7 = -max(-b7, 0.0);
    const V4 bNeg8 = -max(-b8, 0.0);

    // Argument order is fixed by f0/f1: t0 positive, t0 negative, t1 positive, t1 negative.
    t2[gxy] = f0(
        aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
        aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
        bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
        bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
    t3[gxy] = f1(
        aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
        aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
        bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
        bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Pass-3 taps: point-sample t2 (l0) or t3 (l1) at texel offset (dx, dy).
#define l0(dx, dy) V4(O(t2, float2(dx, dy)))
#define l1(dx, dy) V4(O(t3, float2(dx, dy)))
// Pass 3 ("conv2"), first output group: computes four output channels from 36
// V4 taps. `r` starts from a constant vector and each tap is folded in through
// MulAdd with its own 4x4 weight matrix, in the fixed order s0_0 ... s3_8.
// NOTE(review): the Pass3 caller is not visible in this part of the file;
// presumably the arguments follow the Pass2 convention (positive/negative
// split of the t2 taps, then of the t3 taps) — confirm.
// NOTE(review): MulAdd is pulled in by `//!USE MulAdd` and is not defined in
// this file; presumably it returns mul(tap, matrix) + r — confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = { 3.009e-03, -1.445e-03, 8.191e-03, -7.852e-03 };
r = MulAdd(s0_0, M4(2.802e-01, -3.301e-02, -1.047e-01, 6.427e-02, 1.357e-02, -8.015e-02, 7.763e-02, -9.646e-02, 1.136e-01, -1.443e-01, -3.950e-02, 2.744e-01, 8.414e-03, -1.005e-01, -1.683e-01, -5.766e-02), r);
r = MulAdd(s0_1, M4(2.907e-01, 1.339e-01, -7.005e-02, 9.074e-02, -2.491e-03, 6.498e-02, 1.121e-01, -9.272e-02, 3.415e-01, 1.949e-01, -2.613e-01, -2.328e-01, 1.311e-01, 1.285e-01, 1.685e-02, -4.780e-02), r);
r = MulAdd(s0_2, M4(1.671e-01, -2.228e-02, -5.777e-02, -5.853e-02, 1.243e-02, -3.269e-02, 8.757e-03, -1.478e-01, -4.190e-02, 3.164e-02, 2.922e-01, -3.017e-01, -6.631e-02, 5.380e-02, -2.750e-02, -7.771e-02), r);
r = MulAdd(s0_3, M4(-2.454e-02, 2.148e-01, -1.116e-01, -1.125e-01, -1.792e-01, -7.021e-01, -2.183e-01, 2.920e-01, -1.698e-01, 1.827e-01, -6.779e-02, 9.333e-02, -2.153e-01, 2.441e-01, 9.794e-02, -2.729e-01), r);
r = MulAdd(s0_4, M4(-6.750e-02, 1.324e-01, -5.087e-02, 2.746e-01, 1.579e-01, -1.909e-01, -7.631e-01, -4.744e-01, -1.732e-01, -2.741e-01, 4.145e-02, -2.124e-01, 7.946e-02, -1.579e-01, 2.856e-01, 5.090e-02), r);
r = MulAdd(s0_5, M4(8.392e-02, -1.504e-01, 2.815e-01, -1.174e-01, 3.942e-02, 1.918e-02, 1.561e-01, -1.457e-01, -5.976e-02, 1.230e-01, -2.539e-01, -1.965e-01, 1.869e-01, -1.795e-01, -1.283e-01, -3.447e-02), r);
r = MulAdd(s0_6, M4(-3.547e-03, -6.576e-03, -5.087e-02, 3.466e-02, -3.130e-03, -3.176e-01, 8.737e-02, 4.018e-02, -6.489e-02, -1.580e-03, -8.784e-03, -4.500e-02, 2.343e-03, 5.945e-02, -5.201e-02, -3.127e-02), r);
r = MulAdd(s0_7, M4(-3.546e-02, 1.145e-01, -4.773e-02, 8.280e-02, 6.746e-03, -1.036e-01, -6.616e-02, -1.224e-01, 7.156e-02, -1.941e-01, 9.307e-02, -3.567e-02, -2.215e-01, 2.437e-01, -5.542e-04, 1.208e-01), r);
r = MulAdd(s0_8, M4(-1.115e-02, -4.687e-02, -3.210e-02, -1.470e-01, -4.609e-02, 4.657e-02, -6.476e-02, -1.372e-01, -4.956e-03, 1.024e-01, -2.349e-01, -8.472e-02, -2.757e-02, -1.707e-02, 2.065e-01, 1.863e-02), r);
r = MulAdd(s1_0, M4(3.728e-02, -7.100e-02, -4.937e-02, 6.239e-02, -7.377e-03, -3.033e-02, 1.675e-01, -1.863e-02, -2.631e-02, -9.633e-02, -1.130e-01, -1.201e-01, 1.414e-01, -1.737e-01, -8.031e-02, -6.951e-02), r);
r = MulAdd(s1_1, M4(-3.703e-02, 4.012e-02, -2.289e-02, 3.332e-02, 2.161e-02, 8.828e-02, 5.544e-02, 1.017e-01, 3.684e-01, 3.149e-01, 3.662e-01, 4.298e-02, 1.966e-01, -2.697e-02, 2.216e-02, 7.540e-02), r);
r = MulAdd(s1_2, M4(-4.974e-02, -3.826e-02, -2.810e-02, -8.318e-02, 3.356e-02, -7.605e-02, -1.087e-01, 1.987e-02, -1.153e-01, -1.039e-01, -5.868e-02, -3.313e-02, -1.750e-02, 3.884e-03, -9.170e-02, -1.011e-01), r);
r = MulAdd(s1_3, M4(2.119e-01, -1.340e-01, -3.650e-02, 2.219e-01, 3.634e-01, 3.474e-01, 2.302e-01, 7.494e-02, -2.253e-01, 1.239e-01, -6.032e-02, 1.293e-01, 9.583e-02, 4.424e-02, -3.920e-02, -1.870e-01), r);
r = MulAdd(s1_4, M4(-2.664e-01, 8.462e-02, -4.745e-01, 1.985e-01, 2.803e-01, 7.429e-02, 7.814e-01, 4.658e-01, 3.661e-01, -2.319e-02, 3.324e-01, 2.860e-01, 3.178e-01, 9.301e-02, 1.316e-01, 4.547e-02), r);
r = MulAdd(s1_5, M4(5.369e-02, 6.912e-02, 2.659e-01, -1.491e-01, 4.462e-02, -4.823e-02, 1.130e-01, 1.710e-02, -7.604e-02, -7.003e-02, 3.093e-01, 2.537e-01, 2.466e-01, -1.039e-01, 2.413e-02, -1.256e-01), r);
r = MulAdd(s1_6, M4(-1.188e-01, 1.026e-01, 4.215e-02, -9.677e-02, 2.443e-03, 1.957e-01, 2.961e-02, -5.553e-02, -3.488e-02, 2.515e-02, -4.840e-03, 1.814e-02, 9.644e-02, -8.802e-02, 3.516e-03, -2.940e-03), r);
r = MulAdd(s1_7, M4(-1.792e-01, 1.391e-01, 1.322e-02, -1.514e-02, -2.173e-01, 1.743e-01, 1.530e-01, 5.286e-02, -8.655e-02, 2.541e-01, 6.282e-02, 1.167e-01, 9.664e-02, 2.304e-01, -1.538e-01, -1.298e-01), r);
r = MulAdd(s1_8, M4(-1.720e-01, 4.693e-02, 2.790e-01, 2.187e-02, -4.386e-02, 7.714e-03, 9.800e-02, 6.484e-03, -5.497e-02, 1.216e-01, 3.924e-02, 5.162e-02, 1.403e-01, -5.364e-03, -6.795e-03, -6.163e-02), r);
r = MulAdd(s2_0, M4(2.905e-01, -3.799e-02, 1.332e-01, 2.496e-02, 7.202e-02, -3.659e-01, -2.940e-02, -1.028e-03, -1.221e-01, 1.147e-01, 3.613e-02, 9.125e-02, -8.760e-03, 1.489e-02, -9.652e-02, 4.452e-03), r);
r = MulAdd(s2_1, M4(4.027e-01, -2.178e-01, -8.478e-02, 2.903e-01, 2.463e-02, 9.527e-03, -2.835e-01, 2.066e-01, -6.698e-02, -2.653e-01, -6.667e-02, 4.320e-02, -2.610e-01, -1.351e-01, 7.826e-02, -5.429e-02), r);
r = MulAdd(s2_2, M4(-1.249e-01, 4.376e-02, -6.245e-02, 1.702e-01, -5.731e-02, 8.022e-02, -1.335e-01, 1.528e-01, -2.969e-02, 1.062e-01, -1.303e-01, 1.226e-01, 2.030e-02, 5.205e-02, -1.877e-01, 4.309e-02), r);
r = MulAdd(s2_3, M4(-6.329e-02, -1.286e-01, -7.222e-02, 5.592e-03, -3.023e-02, 9.502e-02, -4.077e-02, -2.299e-01, -1.038e-01, -5.742e-02, -5.106e-04, 5.143e-02, 3.098e-02, -1.235e-01, 1.987e-02, 1.477e-02), r);
r = MulAdd(s2_4, M4(1.113e-01, -1.761e-01, 5.038e-02, -1.304e-01, 3.668e-01, -3.430e-01, 2.169e-01, 3.877e-01, -3.750e-02, 2.473e-01, 3.416e-02, 2.184e-01, 5.168e-01, -7.132e-02, 3.818e-01, -1.508e-01), r);
r = MulAdd(s2_5, M4(1.479e-01, -8.656e-02, -1.700e-01, 3.874e-01, 2.286e-02, -8.854e-02, 3.305e-02, -4.668e-03, -1.481e-01, 5.115e-02, 2.686e-01, 4.113e-01, -3.740e-01, -2.013e-01, 9.838e-04, 3.008e-01), r);
r = MulAdd(s2_6, M4(3.428e-01, -3.200e-01, 7.593e-02, 1.911e-01, 1.219e-01, 1.211e-02, -5.694e-02, -5.767e-02, 3.119e-02, -7.609e-02, 6.471e-02, 1.215e-01, -2.793e-04, 1.650e-02, 7.190e-03, -4.468e-02), r);
r = MulAdd(s2_7, M4(3.970e-01, -3.192e-01, -5.639e-02, 8.182e-02, -2.831e-02, 4.036e-02, 7.004e-02, 1.095e-01, -3.655e-02, 2.443e-01, 5.606e-02, -4.974e-02, 9.825e-02, 1.158e-01, -5.104e-02, -2.986e-02), r);
r = MulAdd(s2_8, M4(1.440e-01, 5.504e-02, -2.020e-01, 2.618e-03, -1.098e-02, -3.678e-02, 7.661e-02, 5.652e-02, -7.426e-02, 5.461e-02, 4.239e-01, 2.093e-01, 9.316e-03, -3.679e-02, 6.108e-02, 2.036e-01), r);
r = MulAdd(s3_0, M4(1.806e-02, 2.233e-02, 5.056e-02, 1.758e-01, 3.566e-02, -1.383e-01, 5.349e-02, 1.066e-01, 3.314e-02, -1.258e-01, -2.885e-02, -6.648e-02, -6.860e-03, -2.283e-02, -1.052e-01, -1.623e-02), r);
r = MulAdd(s3_1, M4(7.369e-02, -3.141e-02, 3.877e-03, 8.113e-03, -1.773e-01, 5.122e-03, -3.198e-01, 9.005e-02, 7.291e-02, -1.519e-01, -1.501e-01, -8.202e-02, -4.729e-02, -2.877e-02, -4.056e-02, 7.599e-02), r);
r = MulAdd(s3_2, M4(1.282e-01, 2.477e-03, 6.185e-02, 3.967e-02, -1.343e-01, 8.884e-02, 5.299e-02, -7.324e-02, 1.842e-01, -3.053e-02, -1.335e-01, -6.790e-03, -8.128e-02, 6.665e-02, 1.583e-03, -5.358e-02), r);
r = MulAdd(s3_3, M4(1.135e-01, 9.360e-03, 1.646e-01, 1.844e-01, 1.104e-02, 7.072e-02, -9.632e-02, -1.169e-01, -1.458e-01, 2.540e-02, -5.132e-02, -1.627e-01, -1.066e-01, -4.819e-02, -4.340e-02, -5.074e-02), r);
r = MulAdd(s3_4, M4(-1.198e-01, -7.965e-02, -2.989e-01, -4.946e-01, -1.666e-02, -2.136e-01, -3.575e-02, 1.351e-01, -8.546e-02, 2.553e-02, -7.878e-02, -3.233e-01, -2.955e-01, -7.765e-02, 1.450e-01, -2.114e-01), r);
r = MulAdd(s3_5, M4(-7.593e-02, -1.849e-03, -1.688e-01, 3.626e-02, 4.408e-03, 4.014e-02, -1.401e-01, -2.239e-01, 9.538e-02, -2.310e-01, 2.831e-02, 5.065e-02, 1.135e-01, 2.542e-02, -4.365e-01, 4.393e-02), r);
r = MulAdd(s3_6, M4(-5.217e-02, -1.327e-02, -1.851e-02, 2.806e-02, 4.648e-02, -9.047e-04, 2.961e-02, -2.922e-02, 6.360e-02, -3.494e-02, 2.573e-02, 1.309e-02, -2.512e-03, -4.086e-02, -2.086e-03, -6.018e-02), r);
r = MulAdd(s3_7, M4(9.887e-02, -9.515e-03, 1.306e-01, 5.290e-02, 1.832e-01, -2.549e-01, -4.640e-02, -1.256e-01, 4.915e-02, -5.163e-02, 3.044e-02, -9.871e-02, 8.168e-03, -7.112e-02, -5.743e-02, 3.687e-02), r);
r = MulAdd(s3_8, M4(-6.440e-02, 2.530e-02, -2.166e-03, -4.680e-02, 8.009e-02, -6.634e-02, -1.390e-01, -2.524e-02, 6.524e-02, -1.120e-01, -4.252e-02, -8.413e-03, -2.017e-02, 1.444e-02, -4.483e-02, 4.690e-02), r);
return r;
}
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = { -6.039e-04, -3.875e-03, -3.020e-03, 2.282e-03 };
r = MulAdd(s0_0, M4(-1.005e-01, -6.367e-02, 4.428e-02, 1.687e-02, -9.639e-02, -1.209e-01, -1.374e-02, 4.932e-02, -9.949e-02, -2.569e-01, 1.199e-01, 1.077e-02, 5.110e-02, -1.129e-01, 6.104e-02, -4.656e-03), r);
r = MulAdd(s0_1, M4(-2.156e-01, 8.505e-02, 4.815e-04, -1.042e-01, -2.724e-01, -1.870e-01, 3.876e-02, 7.840e-02, -4.018e-01, -8.239e-01, 2.611e-01, -3.623e-01, -6.999e-03, 1.848e-02, 6.095e-02, -2.318e-02), r);
r = MulAdd(s0_2, M4(-2.195e-01, -6.727e-02, 7.111e-02, 5.119e-02, 7.396e-02, 1.116e-02, -1.261e-02, 9.531e-02, -3.892e-01, 1.430e-01, -9.840e-02, -2.423e-01, 2.669e-01, 3.009e-02, -2.478e-02, 1.168e-01), r);
r = MulAdd(s0_3, M4(-4.344e-01, 8.202e-02, 9.272e-03, -8.384e-02, -8.136e-02, -4.359e-01, 2.361e-01, -2.183e-01, 4.609e-02, -2.144e-02, 9.525e-03, -7.197e-02, -9.339e-02, 1.927e-01, -1.687e-02, 3.193e-02), r);
r = MulAdd(s0_4, M4(4.702e-01, 1.415e-04, 1.097e-01, 2.415e-01, 1.899e-01, -7.324e-01, -4.745e-03, -1.237e-01, -2.043e-01, 2.674e-02, 6.899e-01, 8.700e-02, 5.083e-02, 2.271e-01, 4.884e-02, 3.767e-01), r);
r = MulAdd(s0_5, M4(6.758e-02, -4.638e-02, 9.477e-02, -8.290e-02, -1.994e-01, 1.090e-01, -5.148e-02, -1.470e-01, 7.433e-02, 3.404e-01, 1.020e-01, -8.353e-02, 1.793e-01, -1.368e-01, 6.375e-02, 5.993e-02), r);
r = MulAdd(s0_6, M4(1.596e-02, 3.589e-02, 1.177e-02, 1.541e-01, -1.159e-01, -1.621e-02, 2.451e-01, 2.767e-01, -3.754e-04, 4.995e-02, -6.760e-02, -9.945e-02, 4.017e-01, 4.413e-02, 2.189e-02, 4.126e-02), r);
r = MulAdd(s0_7, M4(1.635e-01, -1.853e-01, -1.823e-01, -1.003e-01, -4.884e-02, 1.686e-01, 7.826e-02, 5.419e-01, -1.017e-01, 7.007e-02, 2.084e-01, 2.030e-01, 5.150e-01, -1.861e-01, -3.037e-01, -3.846e-01), r);
r = MulAdd(s0_8, M4(1.162e-01, 9.675e-02, -9.807e-02, 7.794e-02, 1.154e-01, 7.680e-02, 7.823e-02, 1.665e-01, 1.414e-01, 4.509e-02, -1.327e-02, 1.752e-01, -2.721e-01, -9.636e-04, 2.198e-02, -9.405e-02), r);
r = MulAdd(s1_0, M4(-3.554e-02, 7.673e-02, -1.735e-02, 3.910e-02, -9.934e-02, 1.798e-01, -4.244e-02, -2.008e-02, -1.586e-01, 7.918e-02, 6.812e-02, 1.784e-01, -2.173e-01, 8.736e-02, -3.130e-02, -1.487e-02), r);
r = MulAdd(s1_1, M4(1.142e-01, 2.330e-02, -7.096e-03, 5.291e-02, -3.702e-01, 2.102e-01, 7.156e-02, -1.416e-01, 1.017e-01, 3.888e-01, -5.335e-02, 9.686e-02, -1.093e-01, -1.631e-02, -2.884e-03, -4.091e-02), r);
r = MulAdd(s1_2, M4(4.795e-02, 4.423e-03, 1.494e-02, 2.666e-02, 1.261e-01, -7.251e-02, 2.103e-02, 1.095e-01, 2.166e-01, -1.249e-01, 8.981e-03, 1.792e-01, -3.697e-02, 6.864e-03, -1.141e-02, 2.430e-02), r);
r = MulAdd(s1_3, M4(-1.206e-01, 1.584e-03, -1.789e-02, -1.335e-02, 2.398e-01, 8.681e-01, -1.241e-01, -4.454e-02, -7.396e-02, 1.759e-02, -9.138e-02, 1.573e-01, -2.025e-01, 8.569e-02, 2.132e-02, 9.791e-02), r);
r = MulAdd(s1_4, M4(-4.834e-02, -7.974e-01, 2.858e-01, -2.441e-01, 4.163e-01, -1.650e-01, -1.897e-01, 1.309e-01, 4.031e-02, -8.242e-02, 3.338e-01, 3.567e-01, -1.532e-01, 2.807e-01, -7.324e-02, 5.093e-03), r);
r = MulAdd(s1_5, M4(-1.538e-01, 9.244e-02, -7.570e-02, -4.333e-02, -1.407e-01, -4.201e-02, -4.186e-02, -1.603e-01, -2.031e-01, 6.309e-02, -8.191e-02, 9.121e-02, -8.138e-02, -4.037e-02, 3.793e-02, 4.240e-02), r);
r = MulAdd(s1_6, M4(1.780e-01, 1.059e-01, -5.233e-03, 1.087e-01, 1.808e-01, -1.409e-01, 1.162e-02, -1.312e-01, 6.866e-02, 1.401e-02, 6.420e-02, 5.614e-02, -6.830e-02, 1.731e-02, 5.889e-02, 2.257e-02), r);
r = MulAdd(s1_7, M4(2.057e-01, -2.093e-02, -1.741e-01, 9.891e-02, -3.673e-02, 3.314e-02, -2.223e-01, -3.177e-01, 2.374e-01, -5.871e-02, -5.086e-02, -9.418e-02, -1.935e-02, -1.902e-02, -1.255e-01, -2.744e-01), r);
r = MulAdd(s1_8, M4(1.654e-01, 7.328e-02, 2.874e-02, 1.256e-01, -2.608e-01, 1.926e-03, 4.500e-02, -7.882e-02, -1.035e-02, -3.478e-02, -1.061e-01, -8.474e-02, -2.438e-01, -6.889e-02, -7.579e-02, -1.871e-01), r);
r = MulAdd(s2_0, M4(6.493e-02, 1.357e-01, -6.197e-02, -5.055e-02, 2.568e-01, -5.699e-02, -1.266e-01, -1.411e-02, 2.936e-02, -5.234e-02, -5.882e-03, -8.014e-02, -5.334e-02, -8.555e-02, 5.632e-02, 8.296e-03), r);
r = MulAdd(s2_1, M4(-3.582e-01, 2.351e-01, -1.636e-01, 2.172e-01, -1.840e-01, 9.838e-02, -7.565e-02, 1.535e-01, 8.151e-02, 3.002e-02, 1.149e-01, 1.180e-01, 1.323e-01, -7.682e-03, 5.013e-02, -2.190e-02), r);
r = MulAdd(s2_2, M4(-1.957e-01, -5.823e-02, -1.131e-01, -7.025e-02, 3.355e-01, 1.378e-01, -2.046e-01, 2.575e-01, 1.663e-01, 2.567e-02, -3.703e-02, -9.489e-02, -6.431e-02, -6.700e-02, 9.598e-02, 4.460e-03), r);
r = MulAdd(s2_3, M4(-1.522e-01, 1.335e-01, -2.140e-01, 3.368e-02, -5.076e-02, 2.412e-01, 6.141e-03, 2.456e-02, -9.105e-03, 1.014e-02, -1.056e-02, 1.368e-01, 8.030e-02, -2.874e-02, -7.499e-02, -2.675e-02), r);
r = MulAdd(s2_4, M4(2.115e-02, -6.849e-02, -8.528e-02, -3.270e-01, 2.112e-02, 7.309e-02, -3.852e-02, 2.604e-01, 1.772e-01, 4.115e-01, -2.443e-01, 3.100e-01, 3.139e-01, 3.829e-01, -2.701e-01, 1.463e-01), r);
r = MulAdd(s2_5, M4(2.664e-03, 4.352e-02, -2.378e-01, 5.316e-02, -1.369e-01, -1.293e-01, 1.587e-01, 2.153e-01, 3.820e-01, -1.515e-01, -4.429e-02, 2.391e-01, -3.720e-01, -1.154e-01, -1.196e-01, 3.172e-01), r);
r = MulAdd(s2_6, M4(-3.174e-01, -2.340e-01, 1.286e-01, -1.076e-01, 5.834e-02, 6.138e-02, -6.854e-03, 5.658e-02, 5.314e-02, -1.751e-02, 9.115e-03, 8.328e-03, 8.394e-03, 2.608e-02, 1.125e-01, 1.593e-01), r);
r = MulAdd(s2_7, M4(-6.600e-01, 1.899e-01, 1.094e-01, 1.665e-02, 1.089e-01, -1.034e-01, -1.811e-01, -3.040e-01, 4.782e-01, 3.160e-02, -4.648e-02, 1.286e-01, 1.070e-01, -1.022e-01, 5.693e-02, -5.195e-02), r);
r = MulAdd(s2_8, M4(3.748e-03, -4.142e-02, -7.021e-02, -2.596e-01, -2.444e-01, -6.341e-05, 4.125e-02, -7.382e-02, 4.456e-02, 3.144e-02, -5.055e-02, -1.724e-01, -1.835e-01, 4.462e-02, -1.398e-01, -2.631e-02), r);
r = MulAdd(s3_0, M4(-1.892e-01, -2.298e-01, 7.045e-02, -6.423e-02, 7.789e-02, -9.540e-02, -3.161e-02, -5.171e-02, -3.656e-02, -6.148e-02, -1.413e-02, -8.995e-02, 2.536e-02, 1.995e-03, 3.317e-02, 1.918e-02), r);
r = MulAdd(s3_1, M4(1.245e-02, -4.971e-03, 1.026e-02, -7.525e-02, -2.233e-01, -4.502e-01, -4.530e-03, -1.802e-01, -1.799e-01, 1.915e-02, 1.043e-02, 4.008e-02, 1.524e-01, 1.881e-03, -7.387e-02, 1.566e-02), r);
r = MulAdd(s3_2, M4(-1.750e-01, 3.216e-03, -1.033e-03, -7.055e-02, -1.263e-01, 1.586e-01, 2.603e-02, -1.282e-01, 5.606e-02, -1.498e-02, -3.338e-02, -8.978e-03, -2.218e-02, -5.852e-02, -3.208e-03, -1.352e-02), r);
r = MulAdd(s3_3, M4(-9.577e-02, -8.859e-02, 7.921e-02, -1.569e-02, -7.962e-02, 2.890e-02, 4.107e-02, -5.870e-02, 2.510e-02, 1.765e-02, 4.458e-02, 1.891e-02, 7.541e-02, 3.492e-02, 3.160e-02, 1.201e-02), r);
r = MulAdd(s3_4, M4(-6.228e-02, 9.576e-02, -1.743e-01, -1.935e-01, 2.054e-01, 1.479e-01, 8.056e-04, 3.321e-02, -1.362e-01, 5.003e-01, 9.071e-02, 8.153e-02, 2.283e-01, -3.484e-01, 4.509e-02, -4.658e-01), r);
r = MulAdd(s3_5, M4(2.528e-01, -9.286e-04, -2.468e-02, 1.338e-01, 4.431e-02, 3.503e-02, 1.304e-01, 1.652e-01, 4.628e-01, -2.670e-01, 1.880e-01, 1.516e-01, -1.538e-01, 1.379e-01, -3.334e-02, 2.977e-02), r);
r = MulAdd(s3_6, M4(1.385e-01, -6.592e-02, -1.225e-01, -1.381e-01, -4.498e-02, -6.343e-03, 4.811e-02, 9.639e-02, 1.635e-02, -3.467e-02, 3.640e-03, -3.186e-02, 6.265e-02, 2.282e-01, 9.661e-02, 1.295e-01), r);
r = MulAdd(s3_7, M4(-3.053e-03, 7.999e-02, 2.407e-01, 2.655e-01, -3.969e-01, -9.502e-03, 1.900e-02, 9.557e-02, -6.199e-02, -3.574e-02, 8.350e-02, -7.837e-02, -1.442e-02, -5.281e-03, 4.503e-01, 4.026e-01), r);
r = MulAdd(s3_8, M4(-1.313e-01, 4.424e-02, -1.155e-02, 6.769e-02, 2.192e-02, 6.721e-02, 5.694e-03, 7.376e-02, -2.155e-01, -7.512e-02, 6.252e-03, -3.428e-01, 3.324e-01, 2.784e-03, -5.606e-02, 2.108e-01), r);
return r;
}
// Pass 3 entry point.
// For the pixel handled by this thread, gathers a 3x3 neighbourhood from each of
// the two inputs reachable through l0/l1, splits every sample into a
// non-negative part max(s, 0) and a non-positive part -max(-s, 0), and feeds all
// 36 vectors to f0 and f1. The results are stored to t0[gxy] and t1[gxy].
//   blockStart - origin of the block processed by this thread group
//   tid        - thread id; only tid.x is used
void Pass3(uint2 blockStart, uint3 tid) {
    // NOTE(review): `pt` and `pos` are never named again in this body; presumably
    // the O() macro behind l0/l1 reads them, so they must keep these names.
    float2 pt = float2(GetInputPt());
    uint2 size = GetInputSize();
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    // Threads that land outside the input write nothing.
    if (any(gxy >= size)) {
        return;
    }
    float2 pos = (gxy + 0.5) * pt;

    // Taps from l0: s0_k keeps the non-negative part, s1_k the non-positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // Taps from l1: s2_k keeps the non-negative part, s3_k the non-positive part.
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);
    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);
    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);
    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);
    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);
    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);
    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);
    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);
    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Both output vectors are computed from the same 36 inputs.
    t0[gxy] = f0(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    t1[gxy] = f1(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// Sampling shorthands for this pass: l0 reads t0 and l1 reads t1 at the offset
// (x, y), converting the result to V4.
// NOTE(review): O() is defined outside this section; it presumably samples
// relative to the caller's `pos` using `pt` - confirm before renaming either.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Computes the V4 that Pass4 stores to t2[gxy].
// Starts from a constant V4 and folds in one MulAdd(input, M4 constant, r) per
// input, in the fixed order s0_0..s0_8, s1_0..s1_8, s2_0..s2_8, s3_0..s3_8.
// As passed by Pass4:
//   s0_k / s1_k - non-negative / non-positive parts of the l0 (t0) taps
//   s2_k / s3_k - non-negative / non-positive parts of the l1 (t1) taps
//   k = 0..8 walks the 3x3 offsets row by row from (-1,-1) to (1,1).
// NOTE(review): MulAdd and M4 are defined elsewhere in the file; presumably
// MulAdd(s, m, r) returns mul(s, m) + r - confirm. The literals and the
// accumulation order should be left exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Initial accumulator value (presumably the layer bias - confirm).
V4 r = { 8.385e-03, 1.035e-02, -6.465e-04, -6.502e-03 };
// s0_*: non-negative parts of the t0 taps.
r = MulAdd(s0_0, M4(8.947e-02, -1.234e-01, -3.169e-02, -9.158e-02, -1.406e-01, 6.941e-02, -1.367e-02, -1.406e-02, 9.073e-02, 5.642e-01, -2.007e-02, 9.725e-02, 7.122e-03, -1.956e-03, 6.532e-03, -5.457e-02), r);
r = MulAdd(s0_1, M4(-1.130e-01, -4.645e-02, 3.624e-02, 3.391e-02, 3.882e-01, 2.453e-01, -2.237e-01, -2.271e-01, 2.803e-01, 1.718e-01, 3.255e-02, -2.046e-01, 1.441e-01, -1.880e-03, 2.335e-02, -1.232e-01), r);
r = MulAdd(s0_2, M4(2.016e-01, 1.243e-01, -3.895e-02, -1.135e-01, -2.167e-02, 1.465e-02, -7.776e-02, -1.213e-01, -7.195e-03, 4.404e-03, 6.598e-02, -5.135e-02, -2.062e-01, -3.725e-02, -8.296e-03, 8.739e-03), r);
r = MulAdd(s0_3, M4(-2.068e-02, -3.876e-02, 5.737e-02, 9.886e-02, -9.663e-02, -2.569e-01, 6.761e-02, -1.454e-01, 4.660e-02, 7.810e-01, -2.254e-01, 1.899e-01, -9.628e-02, 8.080e-02, -1.093e-02, 1.451e-02), r);
r = MulAdd(s0_4, M4(3.133e-01, 2.759e-01, -9.917e-02, -3.134e-01, 1.137e-01, -5.446e-01, -2.044e-03, -5.215e-01, -6.867e-02, 5.254e-01, -1.466e-01, -3.048e-01, 3.408e-01, 5.791e-01, -2.594e-01, -4.879e-04), r);
r = MulAdd(s0_5, M4(6.871e-02, -1.221e-01, -5.702e-02, -2.731e-02, 6.025e-01, 1.350e-01, -3.119e-01, -4.130e-01, 2.091e-01, 1.003e-01, 4.509e-02, -1.541e-01, 1.151e-01, -1.558e-01, 6.309e-03, -2.192e-01), r);
r = MulAdd(s0_6, M4(2.139e-02, 1.540e-02, -9.451e-02, 8.898e-02, 1.983e-02, -1.259e-01, 2.162e-01, -9.477e-02, -2.253e-01, -1.456e-01, -2.432e-02, 9.649e-02, 2.147e-02, -9.523e-02, 2.042e-02, -7.790e-02), r);
r = MulAdd(s0_7, M4(-3.105e-03, 1.944e-01, -1.808e-01, -3.058e-02, 4.007e-01, 5.645e-01, -2.452e-01, -7.366e-02, 1.279e-02, 3.212e-02, -1.573e-01, -1.267e-01, 1.613e-02, -1.976e-01, -1.519e-01, -2.687e-02), r);
r = MulAdd(s0_8, M4(-1.906e-04, 8.306e-02, 2.480e-02, 1.696e-02, 1.275e-01, 1.372e-01, 1.205e-01, 1.120e-02, 1.424e-02, -1.526e-01, -6.629e-02, -9.104e-02, 2.042e-02, -1.167e-01, 1.050e-01, 1.560e-02), r);
// s1_*: non-positive parts of the t0 taps.
r = MulAdd(s1_0, M4(-2.398e-02, -1.009e-01, 2.671e-02, -8.841e-02, -7.277e-03, -4.411e-02, -1.240e-02, -5.367e-04, -1.223e-01, -7.251e-02, 4.941e-02, 7.545e-02, 6.688e-02, 1.727e-02, -1.144e-02, -7.713e-02), r);
r = MulAdd(s1_1, M4(-1.507e-01, -3.095e-01, 5.017e-02, -1.145e-01, 3.430e-02, -2.241e-01, -9.050e-02, -8.470e-02, -8.624e-02, -1.021e-02, -1.620e-02, 3.932e-03, 7.775e-02, -2.376e-02, 6.270e-02, -7.896e-02), r);
r = MulAdd(s1_2, M4(3.578e-02, -3.242e-02, 9.400e-03, -2.998e-02, -1.545e-02, -1.481e-01, -6.667e-02, 3.496e-02, 6.722e-02, 7.676e-04, -8.215e-04, 2.142e-03, 4.007e-02, 9.690e-02, -1.652e-03, 3.858e-02), r);
r = MulAdd(s1_3, M4(6.321e-02, -1.472e-01, 6.571e-02, -1.929e-01, -7.340e-02, -8.067e-02, 1.715e-02, 2.182e-02, -8.623e-02, -2.195e-01, -6.101e-02, 8.246e-02, -4.908e-02, -3.293e-02, -7.341e-02, -1.941e-01), r);
r = MulAdd(s1_4, M4(5.609e-01, 5.581e-01, -1.143e-01, -1.052e-01, 2.477e-01, 2.387e-01, 1.272e-01, 3.284e-03, -3.135e-01, 8.385e-02, -7.393e-02, -2.270e-01, 4.403e-01, -1.179e-01, -1.620e-01, 2.978e-01), r);
r = MulAdd(s1_5, M4(-3.015e-02, 1.055e-01, 1.072e-01, 1.177e-01, 3.838e-01, 3.206e-02, -4.556e-03, -5.072e-02, 4.250e-02, -1.665e-02, -1.759e-02, 2.822e-02, -2.408e-01, -2.204e-02, -3.440e-02, 6.520e-02), r);
r = MulAdd(s1_6, M4(9.180e-04, 3.395e-02, -1.211e-02, -5.605e-03, -7.356e-03, -2.439e-02, -2.498e-02, -6.361e-04, -5.167e-02, -1.009e-02, 7.202e-02, 3.652e-02, 3.036e-03, -7.672e-03, -2.822e-02, -9.942e-02), r);
r = MulAdd(s1_7, M4(-7.041e-02, -2.366e-01, -1.556e-01, 1.499e-01, -2.674e-02, 6.601e-03, -1.490e-01, 1.329e-02, -1.127e-01, 8.363e-03, -1.333e-01, 1.038e-02, -1.219e-02, -1.366e-01, 8.814e-02, 4.260e-03), r);
r = MulAdd(s1_8, M4(-1.397e-02, 2.863e-02, 5.459e-03, -1.166e-02, -1.201e-02, 1.346e-01, 5.461e-02, 1.584e-02, -8.155e-02, 8.451e-03, -3.444e-02, 3.920e-02, 2.082e-02, -4.174e-02, 6.205e-02, 5.646e-02), r);
// s2_*: non-negative parts of the t1 taps.
r = MulAdd(s2_0, M4(5.465e-02, 7.303e-02, 1.200e-01, 8.938e-03, -8.960e-02, -2.248e-01, -1.073e-02, 6.882e-02, 4.637e-02, -1.215e-01, -2.319e-02, -2.049e-01, -8.235e-02, -2.689e-02, 8.521e-02, 2.612e-02), r);
r = MulAdd(s2_1, M4(-1.284e-01, -8.509e-02, 6.859e-02, 2.538e-02, -7.401e-02, 2.860e-01, -2.240e-01, 1.754e-01, -2.073e-01, -9.333e-02, -9.310e-02, -3.311e-01, 2.251e-01, 1.948e-01, -1.091e-01, 2.448e-02), r);
r = MulAdd(s2_2, M4(4.550e-03, 2.884e-02, -1.023e-02, -1.793e-02, 1.472e-01, 1.728e-02, -5.533e-02, -4.606e-02, -1.128e-01, 1.845e-01, -9.297e-02, 7.245e-02, 2.303e-02, -1.293e-01, -2.277e-02, -1.523e-02), r);
r = MulAdd(s2_3, M4(5.703e-02, 4.629e-03, -7.495e-02, -7.220e-02, -1.245e-01, 1.142e-01, -1.688e-03, -9.906e-03, 9.714e-02, -2.851e-02, 7.069e-03, -3.250e-01, -5.029e-03, -1.421e-01, -4.162e-02, 1.032e-01), r);
r = MulAdd(s2_4, M4(5.200e-02, -3.414e-02, -3.809e-02, -9.742e-02, 8.686e-01, 1.140e+00, 2.062e-01, 8.598e-02, 4.073e-01, -3.313e-01, 2.673e-01, 1.050e-01, -9.355e-02, 1.764e-01, 8.423e-02, 1.156e-01), r);
r = MulAdd(s2_5, M4(-5.260e-03, 8.804e-02, 3.636e-02, 3.074e-03, 1.724e-01, 2.433e-01, -1.126e-02, -2.652e-01, -1.229e-01, 3.135e-02, 1.187e-02, -6.661e-02, -1.872e-02, -6.508e-02, -7.109e-02, 1.141e-01), r);
r = MulAdd(s2_6, M4(6.180e-03, 2.059e-03, -1.768e-02, 4.877e-03, -7.838e-02, 1.366e-01, -7.231e-02, -2.826e-02, 6.251e-02, 7.375e-02, 2.531e-02, 2.038e-02, -4.462e-03, -4.896e-02, -4.376e-02, -7.998e-03), r);
r = MulAdd(s2_7, M4(1.011e-01, 8.753e-02, -5.554e-02, 6.949e-04, 4.137e-02, 2.710e-01, -3.203e-01, 6.752e-02, 9.720e-02, 3.447e-02, -5.777e-02, -1.723e-02, -9.154e-03, 5.461e-02, 1.248e-01, -3.906e-04), r);
r = MulAdd(s2_8, M4(4.126e-02, 3.442e-02, 9.763e-03, -4.560e-02, -4.233e-04, -1.519e-01, 2.421e-02, -4.043e-02, -1.281e-02, 1.166e-02, 2.489e-04, -3.061e-02, -4.476e-02, 4.493e-03, -4.164e-02, 9.694e-03), r);
// s3_*: non-positive parts of the t1 taps.
r = MulAdd(s3_0, M4(-1.352e-01, -1.938e-01, 7.285e-02, -4.706e-02, 1.920e-02, 1.891e-02, 1.233e-02, 3.876e-02, 1.342e-02, 2.020e-01, 3.292e-02, 2.778e-02, -5.017e-02, 3.560e-02, 7.028e-02, 7.562e-03), r);
r = MulAdd(s3_1, M4(3.014e-01, 1.243e-01, -2.656e-02, -9.796e-02, 1.585e-01, 2.259e-01, -6.651e-02, 4.080e-02, 1.902e-01, 2.705e-01, -9.774e-02, -1.144e-02, -4.653e-01, -3.536e-01, 2.515e-02, 9.628e-02), r);
r = MulAdd(s3_2, M4(-7.724e-02, 1.181e-01, 2.182e-02, 1.999e-02, -7.114e-02, -4.414e-02, -5.748e-06, -8.931e-03, 4.985e-03, 6.360e-02, 4.422e-02, 6.005e-02, 1.335e-01, -8.144e-03, -3.979e-02, 6.952e-03), r);
r = MulAdd(s3_3, M4(-1.826e-03, 2.390e-02, 4.665e-03, -3.357e-02, 2.088e-02, 1.436e-01, -2.474e-02, 1.100e-02, 2.727e-02, -1.649e-02, -9.539e-02, -1.112e-01, -2.427e-02, 1.811e-01, -4.267e-02, 1.060e-01), r);
r = MulAdd(s3_4, M4(9.873e-02, -1.417e-01, -1.365e-01, -3.187e-01, -7.583e-03, 3.047e-01, -2.480e-02, 2.623e-01, -3.193e-01, -1.539e-01, -2.986e-01, 2.350e-01, 4.367e-01, 2.441e-01, -3.426e-01, -5.108e-02), r);
r = MulAdd(s3_5, M4(-8.384e-02, 1.343e-01, 1.653e-01, 7.978e-02, 6.329e-02, 7.040e-02, 2.203e-02, -2.280e-01, 2.531e-02, -9.408e-02, -5.137e-02, -1.717e-01, -1.577e-01, 4.030e-02, -2.802e-01, 1.155e-01), r);
r = MulAdd(s3_6, M4(1.895e-02, 1.436e-01, 3.568e-04, -6.075e-02, 3.213e-02, -3.462e-02, -2.821e-02, -8.374e-03, 3.451e-02, -2.349e-02, 9.517e-03, 2.092e-02, -8.229e-02, 6.530e-02, 6.116e-03, -2.414e-02), r);
r = MulAdd(s3_7, M4(1.530e-01, 2.073e-01, -7.258e-02, -6.975e-02, -6.610e-03, -3.885e-02, -5.636e-03, 1.227e-01, 8.913e-02, 4.336e-02, -1.931e-03, -3.869e-02, -2.019e-02, -1.340e-02, -1.506e-02, 1.591e-02), r);
r = MulAdd(s3_8, M4(2.536e-02, -3.220e-02, 6.413e-02, -1.835e-02, -9.124e-02, -8.098e-02, -5.479e-02, -1.361e-02, -3.146e-03, 1.204e-01, -4.020e-02, -6.924e-02, -1.030e-01, -1.301e-01, 1.634e-02, 1.029e-01), r);
return r;
}
// Computes the V4 that Pass4 stores to t3[gxy].
// Same structure as this pass's f0 but with its own constants: starts from a
// constant V4 and folds in one MulAdd(input, M4 constant, r) per input, in the
// fixed order s0_0..s0_8, s1_0..s1_8, s2_0..s2_8, s3_0..s3_8.
// As passed by Pass4:
//   s0_k / s1_k - non-negative / non-positive parts of the l0 (t0) taps
//   s2_k / s3_k - non-negative / non-positive parts of the l1 (t1) taps
//   k = 0..8 walks the 3x3 offsets row by row from (-1,-1) to (1,1).
// NOTE(review): MulAdd and M4 are defined elsewhere in the file; presumably
// MulAdd(s, m, r) returns mul(s, m) + r - confirm. The literals and the
// accumulation order should be left exactly as they are.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Initial accumulator value (presumably the layer bias - confirm).
V4 r = { -2.994e-04, -3.163e-05, 4.528e-03, -1.285e-02 };
// s0_*: non-negative parts of the t0 taps.
r = MulAdd(s0_0, M4(-5.222e-02, -3.069e-02, 2.456e-03, -1.117e-02, 4.933e-02, 5.166e-02, -6.284e-03, -9.151e-02, 1.439e-02, 1.755e-02, -8.848e-02, -9.796e-02, -2.835e-02, 3.699e-02, 2.912e-02, 4.373e-02), r);
r = MulAdd(s0_1, M4(6.333e-03, -2.767e-02, -7.247e-02, 8.441e-02, -3.433e-02, -4.699e-02, -1.193e-02, -1.729e-01, 2.481e-02, -4.121e-02, -2.861e-01, -1.202e-02, 2.687e-02, -1.313e-01, 1.747e-02, -9.108e-02), r);
r = MulAdd(s0_2, M4(1.309e-02, -1.968e-02, -1.246e-01, -3.915e-02, -1.159e-01, -6.491e-03, 3.316e-01, -6.851e-02, -2.940e-02, -1.787e-02, -5.850e-03, -6.207e-02, 5.272e-02, 9.800e-02, 4.709e-02, 7.491e-02), r);
r = MulAdd(s0_3, M4(-1.127e-01, -3.748e-02, -1.091e-01, 1.788e-01, -7.982e-02, -7.528e-02, 1.898e-01, -1.355e-01, -1.568e-01, 9.648e-02, 2.337e-01, -9.666e-02, -7.316e-02, 2.915e-02, 2.259e-02, -1.310e-02), r);
r = MulAdd(s0_4, M4(1.689e-02, -1.028e-01, 1.304e-01, -6.012e-02, -8.030e-02, -1.823e-01, 4.179e-01, -3.553e-01, 9.095e-04, 9.972e-02, 3.227e-01, -4.967e-02, -2.329e-01, 1.272e-01, 4.332e-01, -8.456e-01), r);
r = MulAdd(s0_5, M4(1.815e-02, -5.743e-02, 7.236e-02, -8.782e-02, 1.161e-01, 2.258e-01, 7.053e-01, -2.993e-01, 6.605e-02, -2.666e-03, -4.733e-02, -1.087e-01, -1.101e-01, 1.554e-01, 1.656e-01, 2.530e-01), r);
r = MulAdd(s0_6, M4(-7.750e-02, -6.619e-02, 2.202e-02, 4.186e-02, -1.519e-01, -8.918e-03, -1.919e-01, -7.085e-02, -1.356e-01, -1.363e-01, 1.782e-01, -1.499e-02, 9.670e-02, 1.450e-03, 5.675e-02, -3.337e-02), r);
r = MulAdd(s0_7, M4(-9.267e-02, 1.661e-01, 1.306e-01, -2.387e-01, -2.261e-02, 2.870e-01, -2.711e-01, 6.281e-02, 2.181e-02, 1.010e-01, 2.979e-01, -9.254e-02, 1.307e-01, -2.024e-02, 2.013e-01, -1.862e-02), r);
r = MulAdd(s0_8, M4(-7.233e-02, 8.276e-02, 1.279e-02, -3.778e-02, -3.737e-01, -2.422e-01, -1.352e-01, -1.631e-01, 6.518e-02, 2.511e-01, 1.588e-01, -3.599e-02, 8.821e-02, 3.757e-02, -1.340e-01, 1.006e-01), r);
// s1_*: non-positive parts of the t0 taps.
r = MulAdd(s1_0, M4(1.034e-02, 8.194e-02, 9.844e-02, -1.052e-01, 4.683e-03, 4.432e-03, 8.420e-03, 7.511e-03, 7.210e-02, -8.697e-03, -9.834e-02, 1.366e-01, 3.221e-04, 1.836e-02, 1.307e-02, -6.823e-02), r);
r = MulAdd(s1_1, M4(-7.232e-02, 1.103e-01, 2.975e-01, 4.747e-02, -1.075e-01, -6.863e-02, 2.378e-01, -2.994e-02, 6.426e-02, 2.459e-02, -1.361e-01, 4.394e-02, 4.558e-02, -5.684e-02, -3.386e-02, 8.075e-02), r);
r = MulAdd(s1_2, M4(-1.568e-02, 6.463e-02, 4.001e-02, 3.549e-02, -3.385e-02, -1.547e-02, 2.510e-01, 3.198e-02, 2.533e-02, -6.612e-02, -5.453e-02, 1.387e-03, 3.071e-02, -5.115e-03, -9.345e-02, 1.790e-02), r);
r = MulAdd(s1_3, M4(1.723e-01, 2.119e-02, -3.394e-01, -1.101e-01, 7.882e-03, -4.188e-02, -6.882e-02, 5.060e-02, 4.902e-02, 2.919e-02, 7.773e-02, 1.080e-01, 8.944e-02, -2.819e-02, -1.252e-02, -2.744e-01), r);
r = MulAdd(s1_4, M4(2.682e-01, 8.840e-03, -3.974e-01, 2.436e-01, 1.156e-02, 3.806e-04, -5.090e-01, -1.339e-02, 1.677e-02, -1.337e-01, -1.050e-01, 2.647e-01, -1.971e-01, -1.145e-02, 1.471e-01, -7.814e-02), r);
r = MulAdd(s1_5, M4(-5.376e-02, 2.321e-02, -1.908e-01, -1.538e-01, 5.032e-03, 2.979e-02, -3.934e-02, -1.754e-01, 3.674e-02, 8.713e-03, -7.429e-02, -2.768e-03, -1.878e-01, -1.382e-01, 1.114e-01, 4.843e-02), r);
r = MulAdd(s1_6, M4(4.390e-03, 1.082e-02, 6.300e-03, -2.220e-02, -1.578e-02, -3.883e-02, 6.290e-02, 5.752e-03, 9.478e-02, 5.108e-03, 6.174e-02, 8.270e-02, -5.128e-02, -3.664e-02, 3.095e-02, -1.575e-01), r);
r = MulAdd(s1_7, M4(2.131e-01, 8.669e-03, 8.288e-02, 1.767e-01, -8.764e-02, -6.440e-03, 1.179e-01, -9.407e-02, -1.114e-01, -1.384e-01, 7.349e-02, 2.379e-02, 6.264e-02, -6.347e-02, -1.973e-01, 3.150e-02), r);
r = MulAdd(s1_8, M4(6.920e-02, 2.737e-01, 5.444e-02, -1.065e-01, -8.435e-02, 1.268e-01, -7.219e-03, -4.022e-02, -3.687e-02, -3.873e-02, 5.773e-02, 1.171e-02, 5.552e-02, -2.870e-02, -4.903e-02, 2.162e-02), r);
// s2_*: non-negative parts of the t1 taps.
r = MulAdd(s2_0, M4(-6.811e-02, 3.915e-02, -1.970e-02, 5.496e-02, -3.225e-02, -5.284e-02, -3.737e-03, -1.864e-03, -1.361e-01, -7.308e-02, -4.948e-02, -1.634e-01, 5.283e-02, 1.746e-02, -8.374e-02, 7.123e-02), r);
r = MulAdd(s2_1, M4(4.868e-03, 7.851e-02, 1.067e-01, 5.576e-02, 1.276e-01, -7.837e-02, -2.875e-01, 3.754e-02, -1.315e-01, -9.095e-02, 8.041e-02, -1.156e-01, 1.309e-02, 1.086e-01, -1.335e-01, 9.059e-02), r);
r = MulAdd(s2_2, M4(-1.092e-02, 1.501e-01, -3.542e-02, 2.500e-02, 1.500e-02, -1.832e-01, -3.447e-01, -2.562e-02, -1.110e-01, 1.362e-01, 1.634e-01, -5.146e-02, -1.184e-02, -1.154e-01, 4.862e-02, 1.344e-03), r);
r = MulAdd(s2_3, M4(3.103e-02, -2.009e-02, 2.266e-02, 5.094e-02, 5.909e-01, 1.844e-01, -3.418e-02, -1.460e-01, 1.218e-02, -3.631e-02, -2.582e-01, -2.230e-01, 9.666e-02, -6.432e-02, 7.267e-02, 7.577e-02), r);
r = MulAdd(s2_4, M4(8.062e-02, -3.981e-02, -3.232e-02, -1.032e-01, -9.859e-02, 6.539e-01, 5.533e-01, -1.046e-02, -5.348e-01, 1.009e-02, -3.879e-01, 1.190e-01, -1.151e-01, 1.835e-01, -7.797e-02, 1.418e-01), r);
r = MulAdd(s2_5, M4(-1.404e-02, -1.730e-01, -4.516e-02, -2.158e-02, 2.544e-01, 4.463e-01, 1.404e-01, -6.854e-02, -9.712e-02, -4.920e-01, -2.485e-02, -6.416e-02, 3.612e-02, 2.451e-01, 2.327e-02, -1.251e-03), r);
r = MulAdd(s2_6, M4(6.507e-02, -2.267e-02, -7.660e-02, 3.043e-02, 3.541e-01, 2.804e-01, 2.783e-01, -2.580e-01, -1.185e-01, 8.028e-02, -1.395e-01, -4.988e-03, 4.702e-02, -5.327e-02, 4.580e-02, 3.130e-03), r);
r = MulAdd(s2_7, M4(9.806e-02, 6.990e-02, -4.317e-02, -2.415e-02, -2.263e-01, -1.723e-01, 2.669e-02, -3.393e-01, 9.368e-02, -6.775e-02, -1.883e-01, -8.601e-02, -2.278e-01, 1.612e-01, 1.625e-01, 8.821e-02), r);
r = MulAdd(s2_8, M4(-1.921e-02, 1.119e-01, 3.717e-02, -2.554e-02, 2.852e-02, 8.987e-02, 1.246e-01, 6.463e-03, 2.548e-02, -2.950e-02, 7.289e-02, 1.802e-02, 2.576e-02, 5.798e-02, 6.021e-02, -5.030e-03), r);
// s3_*: non-positive parts of the t1 taps.
r = MulAdd(s3_0, M4(-1.023e-01, -3.759e-02, -2.437e-02, 1.032e-01, -2.143e-02, -4.189e-02, -6.139e-02, 9.887e-02, -9.094e-03, 3.087e-02, -1.056e-01, 1.376e-01, 1.702e-02, 3.138e-02, -1.243e-01, -5.115e-02), r);
r = MulAdd(s3_1, M4(3.439e-02, -1.018e-01, -3.260e-01, 6.226e-02, 3.794e-02, -6.747e-02, -1.743e-01, -9.149e-02, 6.116e-02, -3.539e-02, -3.971e-01, -2.458e-02, -1.436e-01, 4.323e-02, 5.595e-01, 1.160e-01), r);
r = MulAdd(s3_2, M4(-7.596e-02, -9.502e-02, -1.112e-02, -7.256e-02, -1.625e-02, -1.013e-01, -7.450e-02, 2.969e-03, -1.481e-02, -1.199e-01, -8.230e-02, 2.952e-02, -3.199e-02, 8.852e-02, -1.541e-02, 1.722e-02), r);
r = MulAdd(s3_3, M4(2.768e-03, -9.600e-02, 1.333e-01, -1.174e-01, -7.190e-02, 1.265e-02, 8.135e-02, -6.909e-03, 9.249e-02, -2.800e-02, 2.029e-01, -1.212e-02, 9.955e-02, -2.791e-02, -1.172e-01, 2.079e-01), r);
r = MulAdd(s3_4, M4(-1.948e-01, -1.936e-01, 5.127e-01, -7.970e-02, -1.135e-01, 1.060e-01, 1.226e-01, -3.195e-01, -4.980e-01, -5.665e-03, 3.167e-01, -2.413e-01, 2.036e-01, 1.519e-01, 7.793e-04, -1.316e-01), r);
r = MulAdd(s3_5, M4(-8.284e-02, -1.590e-01, 5.041e-03, -2.936e-02, 1.485e-01, 8.341e-02, -3.804e-02, 3.576e-02, 1.499e-01, -8.989e-02, 7.085e-02, -4.898e-02, 1.070e-01, 5.825e-02, 1.863e-01, -9.850e-03), r);
r = MulAdd(s3_6, M4(-3.057e-01, 2.794e-02, -7.737e-02, -4.168e-02, 2.696e-02, 1.279e-02, 2.638e-02, 8.177e-02, 1.217e-01, 2.531e-02, -1.188e-01, 1.018e-01, -5.486e-02, -6.606e-03, 1.868e-01, -1.050e-01), r);
r = MulAdd(s3_7, M4(-3.018e-01, -1.795e-01, -1.578e-01, -1.809e-01, 1.241e-01, -4.960e-02, -1.067e-01, -1.004e-02, -8.835e-02, 6.620e-02, 1.309e-01, -1.399e-01, 4.651e-02, 4.837e-02, -9.106e-02, 1.670e-01), r);
r = MulAdd(s3_8, M4(1.081e-02, -9.947e-02, 1.643e-02, -2.769e-02, 9.803e-02, -8.389e-02, -2.782e-02, -2.689e-02, 3.693e-02, -3.436e-03, 1.229e-02, -2.929e-02, -1.751e-01, -5.859e-03, 1.543e-01, 8.225e-02), r);
return r;
}
// Pass 4 entry point.
// For the pixel handled by this thread, gathers a 3x3 neighbourhood from t0
// (via l0) and from t1 (via l1), splits every sample into a non-negative part
// max(s, 0) and a non-positive part -max(-s, 0), and feeds all 36 vectors to
// f0 and f1. The results are stored to t2[gxy] and t3[gxy].
//   blockStart - origin of the block processed by this thread group
//   tid        - thread id; only tid.x is used
void Pass4(uint2 blockStart, uint3 tid) {
    // NOTE(review): `pt` and `pos` are never named again in this body; presumably
    // the O() macro behind l0/l1 reads them, so they must keep these names.
    float2 pt = float2(GetInputPt());
    uint2 size = GetInputSize();
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    // Threads that land outside the input write nothing.
    if (any(gxy >= size)) {
        return;
    }
    float2 pos = (gxy + 0.5) * pt;

    // t0 taps: s0_k keeps the non-negative part, s1_k the non-positive part.
    V4 s0_0 = l0(-1.0, -1.0);
    V4 s1_0 = -max(-s0_0, 0.0);
    s0_0 = max(s0_0, 0.0);
    V4 s0_1 = l0(0.0, -1.0);
    V4 s1_1 = -max(-s0_1, 0.0);
    s0_1 = max(s0_1, 0.0);
    V4 s0_2 = l0(1.0, -1.0);
    V4 s1_2 = -max(-s0_2, 0.0);
    s0_2 = max(s0_2, 0.0);
    V4 s0_3 = l0(-1.0, 0.0);
    V4 s1_3 = -max(-s0_3, 0.0);
    s0_3 = max(s0_3, 0.0);
    V4 s0_4 = l0(0.0, 0.0);
    V4 s1_4 = -max(-s0_4, 0.0);
    s0_4 = max(s0_4, 0.0);
    V4 s0_5 = l0(1.0, 0.0);
    V4 s1_5 = -max(-s0_5, 0.0);
    s0_5 = max(s0_5, 0.0);
    V4 s0_6 = l0(-1.0, 1.0);
    V4 s1_6 = -max(-s0_6, 0.0);
    s0_6 = max(s0_6, 0.0);
    V4 s0_7 = l0(0.0, 1.0);
    V4 s1_7 = -max(-s0_7, 0.0);
    s0_7 = max(s0_7, 0.0);
    V4 s0_8 = l0(1.0, 1.0);
    V4 s1_8 = -max(-s0_8, 0.0);
    s0_8 = max(s0_8, 0.0);

    // t1 taps: s2_k keeps the non-negative part, s3_k the non-positive part.
    V4 s2_0 = l1(-1.0, -1.0);
    V4 s3_0 = -max(-s2_0, 0.0);
    s2_0 = max(s2_0, 0.0);
    V4 s2_1 = l1(0.0, -1.0);
    V4 s3_1 = -max(-s2_1, 0.0);
    s2_1 = max(s2_1, 0.0);
    V4 s2_2 = l1(1.0, -1.0);
    V4 s3_2 = -max(-s2_2, 0.0);
    s2_2 = max(s2_2, 0.0);
    V4 s2_3 = l1(-1.0, 0.0);
    V4 s3_3 = -max(-s2_3, 0.0);
    s2_3 = max(s2_3, 0.0);
    V4 s2_4 = l1(0.0, 0.0);
    V4 s3_4 = -max(-s2_4, 0.0);
    s2_4 = max(s2_4, 0.0);
    V4 s2_5 = l1(1.0, 0.0);
    V4 s3_5 = -max(-s2_5, 0.0);
    s2_5 = max(s2_5, 0.0);
    V4 s2_6 = l1(-1.0, 1.0);
    V4 s3_6 = -max(-s2_6, 0.0);
    s2_6 = max(s2_6, 0.0);
    V4 s2_7 = l1(0.0, 1.0);
    V4 s3_7 = -max(-s2_7, 0.0);
    s2_7 = max(s2_7, 0.0);
    V4 s2_8 = l1(1.0, 1.0);
    V4 s3_8 = -max(-s2_8, 0.0);
    s2_8 = max(s2_8, 0.0);

    // Both output vectors are computed from the same 36 inputs.
    t2[gxy] = f0(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
    t3[gxy] = f1(
        s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8,
        s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8,
        s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8,
        s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Sampling shorthands for this pass: l0 reads t2 and l1 reads t3 at the offset
// (x, y), converting the result to V4.
// NOTE(review): O() is defined outside this section; it presumably samples
// relative to the caller's `pos` using `pt` - confirm before renaming either.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// Pass 5 ("conv4"), first output: the four channels that Pass5 stores into t0.
// Starts from a constant 4-component vector and folds in one MulAdd per input, in a fixed order.
// Inputs, as supplied by Pass5:
//   s0_k / s1_k - max(v, 0) and -max(-v, 0) of the t2 tap k
//   s2_k / s3_k - the same two parts of the t3 tap k
//   k = 0..8 walks the 3x3 offsets row by row, from (-1, -1) to (1, 1).
// MulAdd is defined outside this excerpt; presumably MulAdd(v, M, r) = mul(v, M) + r - TODO confirm.
// NOTE(review): the literals look like exported network weights; treat them and the
// accumulation order as data and do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Initial value of the accumulator (acts as a per-channel constant term).
V4 r = { -3.261e-03, 1.350e-04, -6.605e-05, 1.307e-04 };
// s0_*: non-negative part of the t2 taps.
r = MulAdd(s0_0, M4(5.681e-02, -7.933e-02, -1.161e-02, -3.257e-02, -1.507e-02, 2.248e-02, -1.351e-02, 2.789e-02, -1.713e-01, 9.482e-02, 2.715e-02, 9.506e-02, 1.714e-01, -1.090e-01, -7.237e-02, -1.563e-01), r);
r = MulAdd(s0_1, M4(-9.622e-03, -7.774e-04, -4.095e-02, 1.106e-02, -3.592e-02, -4.358e-02, 1.983e-02, -1.134e-02, -1.313e-02, -1.086e-01, 1.102e-01, -3.091e-01, 1.982e-01, 1.438e-01, -6.038e-02, 9.579e-02), r);
r = MulAdd(s0_2, M4(-3.893e-02, 1.554e-02, -7.763e-05, 1.610e-02, 3.470e-03, 9.915e-03, -9.881e-03, 5.331e-02, -9.152e-02, 6.899e-02, -3.615e-02, 1.558e-01, -3.300e-02, 4.493e-02, 2.148e-02, -3.677e-02), r);
r = MulAdd(s0_3, M4(1.939e-01, -7.700e-02, -1.449e-01, -1.942e-02, 9.649e-02, -3.580e-03, -1.767e-02, 2.394e-02, -1.299e-01, 1.160e-01, 8.000e-02, 9.737e-02, 2.751e-01, -4.435e-01, 1.013e-01, -1.782e-01), r);
r = MulAdd(s0_4, M4(-2.745e-01, 2.922e-01, -2.008e-01, 1.636e-01, -4.843e-02, 4.172e-01, 3.097e-02, 3.326e-01, -1.798e-02, -3.860e-01, 3.246e-02, 4.225e-01, -1.057e-01, 2.302e-01, -7.879e-02, 4.832e-02), r);
r = MulAdd(s0_5, M4(-6.834e-04, -3.372e-02, -9.351e-02, 1.547e-02, 5.621e-02, -1.195e-02, -9.402e-03, 6.439e-02, 8.787e-02, 1.499e-02, 1.928e-01, 6.693e-02, 6.516e-02, -1.145e-01, -6.610e-02, 3.986e-02), r);
r = MulAdd(s0_6, M4(7.682e-02, -9.222e-02, 1.566e-01, -1.438e-02, 5.080e-02, -2.762e-02, -3.121e-02, -1.242e-02, 2.046e-02, -1.131e-02, 4.555e-02, -3.006e-02, 1.125e-01, -7.883e-02, 1.063e-01, 3.027e-03), r);
r = MulAdd(s0_7, M4(-1.395e-01, 4.847e-02, 1.605e-01, 1.363e-01, 6.243e-02, -1.464e-02, 3.336e-02, -8.862e-02, 3.286e-02, -2.398e-02, -2.326e-02, -8.408e-02, 1.274e-01, -4.997e-02, 1.548e-01, -8.650e-02), r);
r = MulAdd(s0_8, M4(4.236e-02, 3.116e-02, 7.690e-02, 3.084e-02, 6.290e-03, 1.016e-02, 7.155e-02, -9.786e-02, -1.453e-02, -4.564e-04, -3.654e-02, 7.179e-03, -2.110e-02, -2.766e-02, 1.022e-01, -6.664e-02), r);
// s1_*: non-positive part of the t2 taps.
r = MulAdd(s1_0, M4(-2.814e-02, 6.473e-02, 5.209e-02, 6.202e-02, -1.898e-02, 6.061e-02, -1.557e-02, 3.561e-02, 2.137e-01, -1.913e-01, 2.387e-03, -1.470e-01, 4.553e-02, -3.358e-02, 1.936e-03, -4.798e-02), r);
r = MulAdd(s1_1, M4(4.947e-03, -8.431e-02, -3.362e-03, -1.057e-01, -6.735e-02, 8.463e-03, -4.622e-02, -2.022e-02, -1.450e-01, -1.687e-03, -1.541e-02, -1.116e-02, 4.447e-02, 5.088e-02, -7.198e-03, 3.279e-02), r);
r = MulAdd(s1_2, M4(1.202e-03, -2.591e-02, -5.357e-03, -3.844e-02, -7.403e-03, 3.771e-02, -6.171e-02, 8.820e-02, 6.744e-03, -4.156e-02, -1.377e-02, 9.398e-02, -2.643e-02, 4.991e-02, -2.000e-02, 1.056e-02), r);
r = MulAdd(s1_3, M4(3.923e-01, 3.525e-02, -1.294e-01, 1.478e-02, 9.667e-02, 1.289e-01, 8.960e-02, 1.946e-02, 3.128e-01, -3.315e-01, -3.019e-01, 1.021e-01, 2.095e-01, -1.488e-01, -9.439e-02, -9.635e-02), r);
r = MulAdd(s1_4, M4(-3.641e-01, -9.985e-02, -3.482e-01, -2.646e-01, -5.257e-01, 9.475e-01, 1.714e-01, 5.842e-01, -2.199e-01, -6.131e-02, -4.597e-01, 5.556e-01, 7.933e-02, -2.150e-01, -3.469e-01, -1.978e-01), r);
r = MulAdd(s1_5, M4(7.883e-05, -2.207e-02, -1.735e-02, 2.167e-02, 4.628e-02, 8.814e-02, -4.837e-02, 6.515e-02, 1.617e-01, -4.460e-02, -1.002e-01, 7.496e-02, -1.180e-01, 5.540e-02, -5.708e-02, 5.715e-02), r);
r = MulAdd(s1_6, M4(1.680e-01, -5.262e-02, 6.143e-02, -4.758e-02, -5.343e-02, 4.332e-02, 1.191e-01, 8.545e-03, 1.171e-01, -8.169e-02, 1.535e-02, -2.281e-01, 8.009e-02, -9.744e-02, 6.114e-02, 8.379e-03), r);
r = MulAdd(s1_7, M4(-9.744e-02, 2.573e-02, 6.125e-02, 1.265e-01, 9.253e-02, -1.227e-01, 3.224e-01, -2.402e-01, 1.083e-01, 1.607e-02, 1.155e-01, -4.014e-01, -2.347e-02, -3.821e-02, 2.379e-01, 2.605e-02), r);
r = MulAdd(s1_8, M4(5.428e-02, -5.434e-02, -2.345e-02, -2.189e-03, 1.274e-02, 7.503e-02, 1.442e-01, -8.839e-02, -3.480e-02, 1.444e-02, -3.859e-02, -1.089e-01, -3.183e-02, 9.172e-02, 1.092e-01, 6.688e-02), r);
// s2_*: non-negative part of the t3 taps.
r = MulAdd(s2_0, M4(2.283e-01, 3.872e-02, -5.533e-02, -1.704e-02, -1.533e-02, 1.459e-02, 3.842e-02, 6.367e-02, -4.041e-02, -6.411e-03, -5.052e-03, -8.331e-03, 2.786e-03, -5.502e-02, 6.695e-03, -1.982e-02), r);
r = MulAdd(s2_1, M4(-4.716e-01, 4.092e-01, -1.581e-01, 4.209e-01, 1.255e-01, -7.138e-02, 7.300e-02, -1.357e-01, -6.908e-02, -1.986e-02, 1.801e-02, -4.505e-02, -1.611e-01, -1.216e-01, -6.522e-02, -9.093e-02), r);
r = MulAdd(s2_2, M4(1.019e-01, -3.650e-02, 1.353e-02, 2.487e-01, -1.344e-04, 4.653e-02, 1.721e-02, 4.005e-02, 7.572e-03, -4.357e-02, -3.720e-02, 2.091e-02, 6.051e-03, -6.957e-02, -9.009e-02, -1.788e-02), r);
r = MulAdd(s2_3, M4(2.159e-02, -3.325e-02, 3.084e-02, 1.091e-01, -9.662e-02, 1.040e-01, 1.078e-01, -2.572e-02, 2.237e-04, -2.571e-02, -2.335e-02, -1.554e-02, 1.275e-01, -4.579e-02, -1.772e-02, 3.282e-02), r);
r = MulAdd(s2_4, M4(4.984e-02, 2.302e-01, 6.568e-02, 1.279e-01, 6.857e-02, -1.499e-01, -4.461e-02, -1.977e-01, -1.903e-01, 1.430e-01, 3.271e-02, 1.978e-01, 2.410e-01, 5.980e-01, -1.394e-01, 2.261e-01), r);
r = MulAdd(s2_5, M4(2.188e-02, -8.976e-03, 2.475e-02, 1.340e-02, -4.458e-02, 5.360e-02, 2.628e-02, -1.405e-02, 6.166e-02, -4.895e-02, 1.348e-03, 5.680e-02, -1.123e-01, 7.224e-02, -6.458e-02, 1.314e-01), r);
r = MulAdd(s2_6, M4(3.252e-02, -2.389e-02, -2.067e-02, -6.871e-02, -8.327e-02, 7.793e-02, 7.681e-03, 5.095e-02, -1.693e-02, -3.622e-02, 3.065e-02, -1.582e-02, -6.963e-03, 2.835e-02, 6.805e-02, -1.475e-02), r);
r = MulAdd(s2_7, M4(4.783e-02, -2.945e-02, 4.732e-02, -9.789e-04, -1.619e-02, -2.603e-02, -1.368e-01, 2.956e-02, 9.844e-02, -1.214e-01, 1.776e-01, -1.461e-01, -5.165e-02, -1.055e-02, 1.793e-01, -4.355e-02), r);
r = MulAdd(s2_8, M4(2.619e-03, 4.801e-02, 6.393e-02, -2.399e-02, -1.280e-03, -2.210e-02, -4.649e-02, 1.561e-03, -1.789e-02, 5.576e-02, 1.200e-01, 3.338e-03, 4.475e-02, -2.957e-02, 9.300e-02, -7.837e-02), r);
// s3_*: non-positive part of the t3 taps.
r = MulAdd(s3_0, M4(-1.536e-01, -3.593e-03, -1.064e-02, 1.740e-02, 9.197e-02, 2.772e-01, 5.258e-01, 5.745e-01, 2.331e-02, 8.995e-02, 2.611e-02, 5.463e-02, 4.872e-02, -8.230e-03, -1.742e-02, 3.405e-03), r);
r = MulAdd(s3_1, M4(4.799e-02, 1.088e-01, -7.562e-02, 5.926e-02, 4.190e-01, -4.922e-01, -1.822e-01, -2.309e-01, 1.776e-01, 1.799e-01, 1.213e-01, 3.198e-01, -1.565e-01, 2.118e-02, -5.914e-02, 1.048e-01), r);
r = MulAdd(s3_2, M4(-6.867e-02, -2.488e-02, 2.563e-02, -3.161e-02, -4.038e-02, 5.042e-02, 2.474e-02, 3.962e-03, -4.263e-02, 4.382e-02, -6.197e-03, 5.435e-02, 8.477e-02, -7.694e-02, -2.473e-02, -2.000e-02), r);
r = MulAdd(s3_3, M4(-6.567e-02, 7.271e-02, -2.275e-02, -4.345e-03, -4.825e-02, -7.541e-01, 5.163e-01, 9.170e-01, -1.040e-01, -9.911e-03, 3.569e-02, 2.347e-01, 2.350e-02, 6.202e-02, 7.421e-03, 2.377e-02), r);
r = MulAdd(s3_4, M4(-3.371e-02, -2.738e-02, 1.670e-01, 2.607e-01, -5.009e-02, 5.743e-03, -6.991e-01, -2.858e-02, -6.907e-02, -4.016e-01, 3.462e-01, 9.128e-01, -1.622e-01, 1.392e-01, 2.250e-01, 1.183e-01), r);
r = MulAdd(s3_5, M4(-8.330e-03, 1.029e-01, 1.045e-01, 2.013e-01, 2.609e-02, 7.939e-02, -1.054e-01, 6.487e-02, 1.165e-01, -6.250e-02, 1.274e-01, 2.396e-01, 2.390e-01, -2.468e-01, 1.178e-02, 6.794e-02), r);
r = MulAdd(s3_6, M4(5.411e-02, -5.669e-02, 2.831e-02, -3.762e-02, 1.186e-01, 1.750e-01, -2.862e-01, -9.876e-02, 5.851e-02, 2.750e-02, 7.348e-03, -2.151e-01, -3.151e-02, 5.225e-02, 3.178e-02, 1.438e-02), r);
r = MulAdd(s3_7, M4(-2.053e-03, 2.875e-02, -4.633e-02, -7.843e-02, -5.216e-02, -1.497e-04, -2.534e-01, -5.098e-01, 3.092e-02, -4.215e-02, -1.330e-01, -9.137e-02, 5.062e-02, 5.514e-02, -1.958e-01, 6.162e-03), r);
r = MulAdd(s3_8, M4(3.627e-02, 1.482e-02, 2.228e-02, -7.151e-02, -1.770e-02, 6.009e-02, 2.013e-01, 2.403e-02, 1.912e-03, -9.001e-03, 1.673e-02, -3.465e-02, 5.222e-02, -3.027e-02, -4.458e-03, -6.391e-02), r);
return r;
}
// Pass 5 ("conv4"), second output: the four channels that Pass5 stores into t1.
// Takes exactly the same 36 inputs as this pass's f0 and accumulates them in the same order,
// but with its own initial vector and its own 4x4 matrices:
//   s0_k / s1_k - max(v, 0) and -max(-v, 0) of the t2 tap k
//   s2_k / s3_k - the same two parts of the t3 tap k
//   k = 0..8 walks the 3x3 offsets row by row, from (-1, -1) to (1, 1).
// MulAdd is defined outside this excerpt; presumably MulAdd(v, M, r) = mul(v, M) + r - TODO confirm.
// NOTE(review): the literals look like exported network weights; treat them and the
// accumulation order as data and do not hand-edit.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Initial value of the accumulator (acts as a per-channel constant term).
V4 r = { -1.782e-04, -1.204e-03, 6.004e-04, -1.736e-03 };
// s0_*: non-negative part of the t2 taps.
r = MulAdd(s0_0, M4(-5.528e-02, 2.435e-02, -2.728e-02, 5.042e-02, -2.357e-02, 1.752e-02, 6.730e-02, -1.869e-02, 5.562e-02, 2.108e-03, -2.535e-02, -7.791e-02, -6.984e-02, 8.842e-02, 7.203e-02, 3.709e-02), r);
r = MulAdd(s0_1, M4(-6.164e-02, -1.824e-02, 8.179e-02, -3.238e-02, 5.338e-02, -5.506e-02, -1.020e-01, 1.520e-02, 1.953e-01, -2.850e-02, 8.323e-02, -8.899e-02, 5.112e-02, 6.369e-02, -5.510e-02, 1.997e-02), r);
r = MulAdd(s0_2, M4(6.117e-02, -1.311e-02, -9.258e-03, -1.479e-02, -2.710e-02, 2.958e-02, 2.946e-02, -9.472e-03, 4.257e-02, -7.053e-02, -5.896e-02, 5.475e-02, 6.131e-02, -1.827e-02, -2.909e-02, -6.470e-02), r);
r = MulAdd(s0_3, M4(-1.411e-01, 1.597e-01, 2.142e-01, 6.972e-02, 1.704e-02, 4.423e-02, -8.405e-02, 4.993e-02, 1.176e-02, -8.471e-02, 4.062e-02, -1.001e-01, -3.805e-02, 3.820e-02, -6.258e-01, 2.568e-01), r);
r = MulAdd(s0_4, M4(3.384e-01, -2.619e-01, 1.799e-01, -3.175e-01, 3.472e-03, -1.186e-01, 7.886e-02, -1.126e-01, 1.378e-01, -3.772e-02, -1.396e-02, 6.889e-02, -1.383e-01, 1.958e-01, 7.297e-02, -1.066e+00), r);
r = MulAdd(s0_5, M4(-4.115e-04, 8.733e-03, 3.432e-02, 5.650e-02, 9.203e-02, 6.899e-02, -9.987e-03, 5.139e-02, 2.075e-01, -1.229e-02, 5.912e-02, -2.866e-02, -1.602e-01, 1.654e-01, 6.957e-02, 5.472e-02), r);
r = MulAdd(s0_6, M4(-1.000e-01, 9.401e-02, -3.864e-02, 1.160e-01, 1.108e-03, 8.814e-02, 6.570e-04, 2.167e-02, 6.762e-05, -1.080e-02, -1.670e-02, -4.178e-03, -9.704e-03, 2.164e-01, 3.748e-02, -1.258e-02), r);
r = MulAdd(s0_7, M4(7.557e-02, -2.360e-01, -2.727e-02, -7.688e-02, -3.110e-02, 1.671e-02, -4.238e-02, 5.553e-02, 6.518e-02, 3.357e-02, -2.725e-02, -2.524e-02, -1.352e-01, -1.005e-01, -4.108e-02, 2.664e-01), r);
r = MulAdd(s0_8, M4(9.624e-02, 5.754e-03, 8.412e-02, -2.955e-02, 2.850e-02, 8.830e-03, -4.162e-02, -1.337e-02, -4.374e-02, -2.352e-02, -1.566e-02, 1.822e-02, 7.979e-02, -9.058e-02, -1.071e-01, -3.379e-03), r);
// s1_*: non-positive part of the t2 taps.
r = MulAdd(s1_0, M4(1.395e-02, 1.801e-02, 1.899e-03, -3.313e-02, 2.251e-02, -3.697e-03, 5.577e-02, -3.001e-02, -6.090e-02, 1.645e-01, -1.047e-01, 1.483e-01, -6.634e-03, 3.917e-04, -1.999e-02, 2.114e-02), r);
r = MulAdd(s1_1, M4(2.859e-03, 5.455e-02, 4.336e-02, -2.717e-02, 9.302e-02, -9.807e-02, 7.046e-02, -3.707e-02, -1.275e-01, -3.463e-02, -1.160e-01, -4.227e-02, 3.162e-02, 3.583e-02, 4.579e-02, -1.196e-02), r);
r = MulAdd(s1_2, M4(-7.086e-03, 2.542e-03, 1.500e-03, -6.273e-03, 5.711e-02, -5.317e-02, -5.455e-03, 4.847e-02, 8.830e-02, 5.991e-02, 3.356e-02, 1.214e-03, -5.272e-03, -5.211e-02, -2.142e-02, -1.246e-02), r);
r = MulAdd(s1_3, M4(-4.807e-02, 4.530e-02, 2.719e-01, -1.035e-02, 4.911e-02, -5.824e-03, -6.478e-02, -1.051e-03, -1.348e-02, 6.405e-01, -4.257e-01, 3.690e-01, -9.665e-02, 2.101e-01, 6.571e-02, 9.738e-02), r);
r = MulAdd(s1_4, M4(2.423e-01, -2.074e-01, -4.394e-01, -2.830e-02, 5.415e-02, -2.337e-01, 6.080e-01, -1.843e-01, -5.128e-01, 1.559e-01, -2.033e-01, -6.040e-02, -6.726e-02, 2.589e-01, 1.901e-01, -9.598e-02), r);
r = MulAdd(s1_5, M4(-1.456e-01, 6.484e-02, 1.125e-01, -1.183e-02, 2.186e-01, 2.930e-02, -4.285e-02, 6.272e-02, 1.500e-01, 1.033e-01, 2.173e-01, -3.328e-02, -6.785e-02, -7.882e-02, -1.450e-01, 7.182e-02), r);
r = MulAdd(s1_6, M4(-4.062e-02, 9.988e-02, -5.106e-02, 1.546e-01, 5.122e-02, -7.398e-02, -5.320e-03, -5.669e-02, -4.188e-02, 2.035e-01, -5.253e-02, -7.554e-03, -6.233e-02, 1.285e-01, 1.152e-02, 7.495e-02), r);
r = MulAdd(s1_7, M4(1.168e-01, -1.061e-01, -8.798e-02, -2.456e-01, -1.274e-01, -9.338e-02, 6.064e-04, 1.255e-01, 2.944e-02, -9.599e-02, -1.606e-01, 1.477e-01, -5.541e-02, -9.992e-02, -5.652e-02, 1.402e-02), r);
r = MulAdd(s1_8, M4(-8.447e-02, -2.272e-02, 3.291e-02, 1.141e-01, 2.835e-01, 2.747e-02, 9.338e-03, -1.271e-01, 1.118e-03, -3.543e-02, -3.201e-02, 5.803e-02, 1.793e-01, -6.889e-02, -3.139e-02, -1.000e-01), r);
// s2_*: non-negative part of the t3 taps.
r = MulAdd(s2_0, M4(3.477e-02, 8.152e-03, -8.100e-03, 3.869e-02, 4.675e-02, 8.080e-02, -4.909e-02, 6.764e-03, -2.946e-03, -7.021e-02, -1.191e-02, -1.660e-02, -5.967e-02, -1.872e-02, -3.485e-02, 3.391e-02), r);
r = MulAdd(s2_1, M4(1.685e-01, -2.681e-01, -2.340e-01, -1.748e-01, -1.593e-01, 7.496e-02, 3.748e-02, 1.562e-02, 5.150e-02, -3.648e-02, 3.739e-02, -4.384e-02, -1.521e-02, -1.061e-01, -1.381e-01, 1.733e-02), r);
r = MulAdd(s2_2, M4(1.573e-01, 1.415e-01, 1.714e-01, -5.175e-02, -2.442e-02, 1.054e-02, 3.047e-03, -5.944e-03, -6.027e-03, 1.034e-02, -3.381e-02, 4.299e-02, -9.763e-02, 4.729e-02, 9.642e-02, -1.450e-02), r);
r = MulAdd(s2_3, M4(8.191e-03, 1.353e-01, -6.018e-02, 5.677e-02, -1.725e-02, -1.324e-01, 1.646e-01, -1.154e-01, -9.796e-03, 3.066e-02, -5.975e-02, 2.878e-02, -1.381e-01, 1.550e-01, 3.556e-02, 8.926e-02), r);
r = MulAdd(s2_4, M4(1.715e-01, -2.115e-02, 8.179e-02, -2.066e-01, 1.275e-01, 1.599e-01, 2.325e-02, -9.637e-03, 6.565e-02, -1.901e-01, 7.185e-02, -1.559e-01, 1.106e-01, -6.210e-02, -3.672e-01, 6.248e-02), r);
r = MulAdd(s2_5, M4(-3.453e-03, 5.284e-02, -1.031e-01, 5.091e-02, 1.538e-02, -9.971e-02, -5.610e-02, -2.585e-02, 6.441e-02, 1.113e-01, 3.085e-02, 6.860e-02, -6.167e-02, -6.774e-02, -6.898e-02, -4.397e-03), r);
r = MulAdd(s2_6, M4(-1.561e-02, 5.106e-02, 2.999e-03, -7.663e-03, 6.665e-02, -1.217e-01, -9.529e-03, -2.096e-02, -2.825e-02, 4.854e-02, -2.196e-02, -7.191e-03, 2.274e-03, 1.698e-02, -1.727e-02, 1.967e-03), r);
r = MulAdd(s2_7, M4(3.534e-02, -1.077e-02, 1.607e-02, 4.542e-02, -7.989e-02, 1.294e-01, 4.920e-02, -6.332e-02, -9.402e-02, 2.028e-02, -6.305e-03, 9.061e-02, 2.225e-03, 2.352e-02, -4.032e-03, -4.985e-02), r);
r = MulAdd(s2_8, M4(7.112e-02, -1.427e-02, -2.352e-02, -2.989e-02, -5.633e-02, -6.039e-03, 3.496e-03, 2.535e-02, 1.265e-01, -4.541e-02, -5.393e-02, -5.355e-02, 1.498e-03, 2.057e-02, 1.278e-02, 5.662e-02), r);
// s3_*: non-positive part of the t3 taps.
r = MulAdd(s3_0, M4(9.523e-02, -7.183e-02, -2.740e-01, -1.569e-02, 1.008e-01, 3.065e+00, -2.003e-01, 1.938e-01, 7.503e-02, -1.096e-01, -3.177e-02, -4.074e-02, 1.090e-03, -2.250e-02, -4.727e-02, 2.528e-02), r);
r = MulAdd(s3_1, M4(-7.789e-02, 7.186e-03, 3.838e-01, -1.314e-01, -4.119e-01, 1.344e-01, 5.252e-02, -4.478e-02, -2.421e-01, 8.221e-02, 1.588e-01, 5.943e-02, -6.960e-02, -7.055e-02, -5.857e-02, -2.367e-02), r);
r = MulAdd(s3_2, M4(1.578e-01, -5.477e-02, -1.343e-01, 7.698e-02, 9.761e-02, -2.725e-02, -6.329e-02, -5.552e-02, -6.854e-02, 1.143e-02, -8.043e-02, 1.416e-02, 5.387e-02, 1.371e-01, 1.146e-01, -5.881e-04), r);
r = MulAdd(s3_3, M4(7.307e-03, -8.177e-02, 5.634e-02, -1.149e-01, -4.060e-01, 1.613e+00, -3.145e-01, 2.057e-02, -9.555e-02, 2.548e-01, 5.932e-02, 7.789e-02, 7.174e-03, -6.399e-03, -2.315e-02, 8.381e-03), r);
r = MulAdd(s3_4, M4(1.200e-01, 1.356e-01, 8.711e-03, 7.537e-02, -1.751e-01, 3.458e-02, 2.391e-01, -1.111e-01, 1.506e-01, -3.165e-01, -4.619e-01, -9.386e-02, -4.377e-02, -1.492e-01, -5.002e-01, 9.821e-02), r);
r = MulAdd(s3_5, M4(1.539e-01, 7.309e-02, 4.257e-03, -1.539e-01, -4.757e-01, 1.070e-01, 1.702e-02, 9.709e-02, -1.140e-01, 1.938e-01, 1.982e-01, -3.215e-02, -3.822e-01, 3.408e-01, 1.647e-01, 1.597e-01), r);
r = MulAdd(s3_6, M4(-3.320e-02, 4.854e-02, -1.957e-02, 3.353e-02, 1.823e-01, 8.532e-02, 3.236e-02, -1.874e-01, -1.073e-02, -6.598e-03, -2.954e-02, -2.175e-02, 1.184e-02, -3.856e-02, 2.166e-02, -2.608e-02), r);
r = MulAdd(s3_7, M4(2.038e-02, -4.606e-02, -3.841e-02, -4.008e-02, -2.542e-01, -1.076e-01, -2.891e-02, 1.837e-01, 3.842e-02, 1.753e-01, 3.043e-02, -3.298e-02, 2.990e-02, 1.215e-01, 9.583e-02, -5.860e-02), r);
r = MulAdd(s3_8, M4(6.138e-02, 3.405e-02, 3.364e-04, 6.037e-03, 1.811e-01, 9.691e-04, 3.497e-02, -1.810e-02, -3.940e-02, -1.159e-01, -7.007e-02, 1.170e-01, 1.829e-02, -2.216e-02, -1.689e-02, 1.150e-01), r);
return r;
}
// Pass 5 ("conv4") entry point.
// Maps the thread to one pixel, gathers the 3x3 neighbourhood of both input
// textures (through the l0 / l1 macros), splits every tap into its non-negative
// and non-positive part, and writes f0(...) to t0 and f1(...) to t1.
//   blockStart - origin of the block this thread group processes
//   tid        - thread id; only tid.x is used, remapped to 2D by Rmp8x8
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE: the l0 / l1 macros appear to pick up locals named `pt` and `pos`
	// (O is defined outside this excerpt), so these two names must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Threads that fall outside the image have nothing to do.
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 taps of the first input, row by row from (-1, -1) to (1, 1).
	const V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Raw 3x3 taps of the second input, same ordering.
	const V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Split each tap: *p = max(v, 0), *n = -max(-v, 0).
	const V4 ap0 = max(a0, 0.0), an0 = -max(-a0, 0.0);
	const V4 ap1 = max(a1, 0.0), an1 = -max(-a1, 0.0);
	const V4 ap2 = max(a2, 0.0), an2 = -max(-a2, 0.0);
	const V4 ap3 = max(a3, 0.0), an3 = -max(-a3, 0.0);
	const V4 ap4 = max(a4, 0.0), an4 = -max(-a4, 0.0);
	const V4 ap5 = max(a5, 0.0), an5 = -max(-a5, 0.0);
	const V4 ap6 = max(a6, 0.0), an6 = -max(-a6, 0.0);
	const V4 ap7 = max(a7, 0.0), an7 = -max(-a7, 0.0);
	const V4 ap8 = max(a8, 0.0), an8 = -max(-a8, 0.0);

	const V4 bp0 = max(b0, 0.0), bn0 = -max(-b0, 0.0);
	const V4 bp1 = max(b1, 0.0), bn1 = -max(-b1, 0.0);
	const V4 bp2 = max(b2, 0.0), bn2 = -max(-b2, 0.0);
	const V4 bp3 = max(b3, 0.0), bn3 = -max(-b3, 0.0);
	const V4 bp4 = max(b4, 0.0), bn4 = -max(-b4, 0.0);
	const V4 bp5 = max(b5, 0.0), bn5 = -max(-b5, 0.0);
	const V4 bp6 = max(b6, 0.0), bn6 = -max(-b6, 0.0);
	const V4 bp7 = max(b7, 0.0), bn7 = -max(-b7, 0.0);
	const V4 bp8 = max(b8, 0.0), bn8 = -max(-b8, 0.0);

	// Both output textures are computed from the same 36 inputs.
	t0[gxy] = f0(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
	t1[gxy] = f1(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT
// Tap helpers for this pass: l0 reads t0 and l1 reads t1 at the offset (x, y), converted to V4.
// O is defined outside this excerpt; presumably it samples relative to the caller's
// `pos` / `pt` locals - TODO confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Pass 6 ("out-shuffle") output layer: returns tanh of the accumulated 4-component vector.
// Pass6 adds the four components to the first YUV component of a 2x2 block of output pixels
// (x: top-left, y: top-right, z: bottom-left, w: bottom-right).
// Inputs, as supplied by Pass6:
//   s0_k / s1_k - max(v, 0) and -max(-v, 0) of the t0 tap k
//   s2_k / s3_k - the same two parts of the t1 tap k
//   k = 0..8 walks the 3x3 offsets row by row, from (-1, -1) to (1, 1).
// MulAdd is defined outside this excerpt; presumably MulAdd(v, M, r) = mul(v, M) + r - TODO confirm.
// NOTE(review): the literals look like exported network weights; treat them and the
// accumulation order as data and do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
// Initial value of the accumulator (acts as a per-channel constant term).
V4 r = { -8.480e-04, -1.222e-04, -8.629e-04, -1.828e-04 };
// s0_*: non-negative part of the t0 taps.
r = MulAdd(s0_0, M4(-6.910e-02, 1.215e-03, -2.039e-03, -1.079e-04, 8.088e-02, -2.119e-02, -1.929e-02, 1.865e-02, -6.142e-02, 2.499e-02, -4.185e-03, 1.951e-03, -1.099e-02, 1.071e-02, 3.133e-03, -9.539e-03), r);
r = MulAdd(s0_1, M4(-2.129e-02, 6.812e-02, 2.738e-02, -2.965e-02, -1.569e-01, -7.369e-02, 6.714e-02, -2.416e-02, 6.421e-02, -3.329e-02, 4.397e-03, 1.902e-02, 1.426e-01, 7.469e-02, -3.306e-02, 1.260e-02), r);
r = MulAdd(s0_2, M4(-2.521e-02, -1.556e-02, -1.880e-02, 1.813e-02, -2.926e-03, -3.967e-02, -2.562e-02, 1.669e-02, 1.699e-03, 2.545e-02, 9.862e-03, 1.052e-02, -1.392e-02, 1.215e-02, 2.436e-02, 2.113e-04), r);
r = MulAdd(s0_3, M4(1.800e-02, -2.761e-02, 1.145e-02, -6.469e-02, 1.392e-01, 1.033e-02, 1.406e-01, -7.326e-03, -2.077e-02, 2.985e-03, -1.102e-01, 2.804e-02, -1.544e-02, 5.050e-02, 2.915e-02, 2.396e-02), r);
r = MulAdd(s0_4, M4(1.242e-01, -4.463e-01, -3.829e-01, 1.871e-01, -8.392e-02, 6.470e-02, -3.115e-01, -1.970e-01, -1.186e-01, -1.204e-01, -2.296e-02, -1.763e-01, -1.265e-01, -1.919e-01, 6.718e-02, 8.923e-02), r);
r = MulAdd(s0_5, M4(-2.493e-02, 3.014e-02, 2.446e-02, -1.488e-01, 1.299e-02, -5.759e-02, 2.138e-02, -9.211e-02, -8.051e-03, -4.216e-02, -1.327e-02, -9.724e-04, 3.675e-02, 7.968e-03, -3.353e-02, -4.044e-02), r);
r = MulAdd(s0_6, M4(2.027e-02, 3.813e-03, -2.557e-03, -2.670e-02, 2.068e-02, 1.886e-02, 6.014e-02, 3.191e-02, -1.917e-03, -2.659e-03, 1.273e-02, 3.109e-03, 9.881e-03, -4.410e-04, 7.569e-03, 1.276e-02), r);
r = MulAdd(s0_7, M4(-1.802e-03, 4.820e-02, 4.201e-02, 4.574e-02, 2.826e-02, 2.044e-02, 1.196e-01, 9.132e-02, 1.800e-02, 2.670e-02, -3.398e-03, 1.359e-02, 1.247e-02, 1.268e-02, 1.628e-03, -1.067e-02), r);
r = MulAdd(s0_8, M4(5.233e-03, 3.648e-02, 2.719e-02, 2.838e-02, 1.857e-03, -1.999e-03, 1.703e-02, 5.921e-02, 7.925e-03, -2.543e-03, 5.431e-03, -1.102e-02, -1.116e-02, -5.510e-03, -9.183e-03, -8.054e-03), r);
// s1_*: non-positive part of the t0 taps.
r = MulAdd(s1_0, M4(-6.423e-02, -5.758e-03, -8.948e-03, -2.227e-03, 5.802e-02, -2.252e-02, -8.134e-03, 1.448e-02, -3.642e-02, 4.476e-03, 7.865e-03, 3.269e-03, 1.053e-02, 1.269e-02, -1.530e-03, -9.628e-03), r);
r = MulAdd(s1_1, M4(-2.553e-02, 4.747e-02, 4.136e-02, -2.368e-02, -1.401e-01, -4.967e-02, 6.372e-02, -1.788e-04, 3.663e-01, 2.193e-01, -8.228e-02, -8.507e-02, 1.404e-01, 8.229e-02, -5.862e-02, -1.161e-02), r);
r = MulAdd(s1_2, M4(-2.216e-02, -7.521e-03, -2.522e-02, 2.337e-02, -2.651e-03, -3.786e-02, -9.854e-03, 2.033e-02, 9.696e-03, 1.237e-01, 6.173e-03, 2.898e-02, -1.335e-02, 2.948e-02, 9.778e-03, -1.243e-02), r);
r = MulAdd(s1_3, M4(-1.598e-02, -1.677e-02, -4.726e-02, -2.250e-02, 2.076e-01, -2.825e-02, 1.389e-01, -2.552e-02, 3.209e-02, -3.267e-03, -9.876e-02, 3.775e-02, -5.440e-02, 6.367e-02, 8.425e-02, 7.583e-03), r);
r = MulAdd(s1_4, M4(-2.339e-01, -8.617e-02, -3.313e-01, 1.470e-01, -1.249e-01, 3.994e-01, -7.191e-01, -2.121e-01, 2.521e-02, 4.601e-02, -3.584e-01, -4.014e-01, -4.299e-01, -4.828e-01, 4.034e-01, 3.633e-01), r);
r = MulAdd(s1_5, M4(3.413e-02, -4.685e-03, 4.308e-02, -1.211e-01, 3.722e-02, -1.000e-01, 5.938e-02, -1.900e-01, 3.286e-03, 6.076e-03, 2.628e-02, -1.190e-01, 3.968e-02, -3.583e-02, -4.724e-02, 5.713e-02), r);
r = MulAdd(s1_6, M4(3.008e-02, -2.083e-02, 7.970e-03, -2.011e-02, -8.809e-03, 9.741e-03, 7.228e-02, 1.875e-02, -8.374e-03, -2.245e-03, 1.642e-02, -9.996e-03, 2.093e-02, 6.393e-03, 6.227e-03, -6.775e-03), r);
r = MulAdd(s1_7, M4(1.113e-02, 5.783e-02, -1.430e-02, 2.826e-02, -1.250e-02, -3.106e-02, 1.754e-01, 2.001e-01, -1.431e-02, -1.368e-02, 4.329e-02, 4.832e-02, 4.089e-02, 3.702e-02, -5.774e-03, 8.701e-03), r);
r = MulAdd(s1_8, M4(1.395e-03, 3.747e-02, 2.706e-02, 4.675e-02, -1.191e-02, -2.163e-02, 3.137e-02, 7.056e-02, 4.929e-03, -6.465e-03, 1.083e-03, 1.816e-02, -3.896e-03, 1.081e-02, -1.507e-02, -1.412e-02), r);
// s2_*: non-negative part of the t1 taps.
r = MulAdd(s2_0, M4(5.551e-02, 3.061e-02, 2.172e-02, -4.435e-04, 7.341e-02, -4.254e-03, -3.710e-02, 2.005e-02, 3.528e-02, 1.764e-02, 4.547e-03, -6.460e-03, 1.949e-01, 2.466e-02, 7.886e-02, -2.722e-03), r);
r = MulAdd(s2_1, M4(-1.216e-03, 4.895e-02, -2.548e-02, 1.354e-02, 1.184e-01, -2.592e-01, 3.262e-02, 3.213e-02, -7.885e-02, -2.429e-02, -5.811e-02, 1.909e-02, 3.185e-02, -7.057e-02, -2.388e-02, 1.018e-01), r);
r = MulAdd(s2_2, M4(-4.325e-03, 8.278e-03, -7.126e-04, -3.013e-03, -2.277e-02, 6.470e-02, -3.258e-02, 6.558e-03, 2.954e-02, 9.175e-03, -1.066e-03, -1.931e-02, 3.523e-03, 1.347e-03, -1.837e-03, -3.765e-03), r);
r = MulAdd(s2_3, M4(-1.063e-01, 1.364e-02, -1.031e-01, 7.569e-02, -3.770e-02, 3.667e-02, 2.683e-02, 5.980e-02, -1.057e-01, -1.107e-02, -7.272e-02, 5.094e-02, 7.605e-02, 1.566e-02, 1.708e-01, 2.124e-01), r);
r = MulAdd(s2_4, M4(1.344e-02, -6.091e-02, 2.694e-02, -2.727e-02, 2.786e-01, 5.187e-02, 6.738e-01, -9.220e-01, 1.745e-01, -1.468e-02, 1.843e-01, -1.866e-01, -9.396e-02, -1.505e-01, 2.471e-01, -1.138e+00), r);
r = MulAdd(s2_5, M4(6.506e-03, 7.226e-03, 9.650e-03, 3.959e-03, -2.858e-02, -1.124e-01, -5.599e-02, 8.081e-02, -3.923e-02, 6.977e-02, 2.327e-03, 1.164e-01, 1.242e-02, -1.947e-02, -4.582e-02, 2.119e-02), r);
r = MulAdd(s2_6, M4(-1.730e-02, -2.202e-02, -2.408e-02, -6.448e-02, -3.767e-03, 2.506e-02, -4.165e-02, 4.527e-02, 1.431e-02, -2.421e-02, -1.170e-02, -6.665e-02, -1.236e-02, 5.709e-03, -6.345e-03, -3.440e-02), r);
r = MulAdd(s2_7, M4(-4.211e-02, -5.191e-02, -9.762e-02, -1.275e-01, 2.079e-02, -1.004e-01, 7.470e-02, 1.084e-02, -1.789e-02, 8.006e-02, 3.170e-02, 1.111e-01, -4.772e-02, -6.100e-02, 2.375e-02, 2.545e-03), r);
r = MulAdd(s2_8, M4(-7.109e-03, 1.968e-03, -9.159e-03, -1.523e-02, -1.024e-02, -5.787e-04, -4.581e-02, -1.496e-02, 2.302e-02, -1.568e-02, 2.850e-02, 9.731e-03, -1.219e-02, 1.316e-03, -1.859e-02, 8.662e-02), r);
// s3_*: non-positive part of the t1 taps.
r = MulAdd(s3_0, M4(2.241e-01, 1.599e-02, -3.007e-02, -8.278e-02, -2.343e-02, -1.323e-02, 6.153e-03, 8.030e-03, 1.988e-02, 1.870e-02, 7.620e-03, -1.035e-02, 2.443e-01, 4.061e-02, 3.123e-02, -4.152e-03), r);
r = MulAdd(s3_1, M4(-1.500e-02, -2.365e-02, -2.046e-02, 4.369e-02, 7.611e-03, -9.342e-03, 4.413e-03, -1.110e-03, -1.238e-01, -3.394e-02, -4.442e-02, 2.423e-02, -9.742e-02, -2.324e-02, -3.479e-02, 4.742e-02), r);
r = MulAdd(s3_2, M4(5.839e-03, 1.560e-02, -3.631e-03, 6.730e-03, -2.371e-03, -1.011e-02, -3.821e-03, 1.830e-03, 2.255e-02, 1.426e-02, -1.146e-02, -1.650e-02, 9.035e-03, 5.831e-03, 2.660e-03, -4.854e-03), r);
r = MulAdd(s3_3, M4(-1.694e-01, -2.771e-01, 6.449e-01, -2.979e-01, 9.108e-02, -2.277e-02, -5.309e-02, -3.552e-02, -1.626e-01, 2.544e-02, -7.033e-02, 7.145e-02, -1.334e-01, 1.008e-01, 1.121e-01, 1.733e-01), r);
r = MulAdd(s3_4, M4(-1.019e-01, 1.989e-01, -6.682e-02, -7.066e-02, -3.795e-02, 1.362e-01, 4.307e-02, -4.383e-02, 6.286e-01, -3.881e-01, 1.970e-01, -3.421e-01, -5.374e-03, -2.446e-01, -8.874e-02, -4.099e-01), r);
r = MulAdd(s3_5, M4(1.279e-02, -1.406e-02, 7.997e-03, 1.743e-02, 2.251e-02, -4.285e-02, -2.154e-03, -1.441e-02, -2.329e-02, 1.667e-02, 4.333e-02, 1.229e-01, -2.284e-03, -2.450e-02, -8.000e-03, -1.712e-02), r);
r = MulAdd(s3_6, M4(7.251e-02, 9.488e-03, -1.511e-01, -6.947e-02, -2.728e-02, 7.342e-03, 2.289e-02, 1.443e-02, 1.492e-02, -8.903e-03, -5.817e-02, -4.836e-02, -1.677e-03, 1.964e-02, -6.858e-03, -1.328e-02), r);
r = MulAdd(s3_7, M4(-8.618e-02, -5.596e-02, -1.276e-01, -1.230e-01, 4.851e-03, -5.676e-02, 2.939e-02, -4.192e-02, -2.508e-02, 4.430e-02, 1.352e-01, 2.072e-02, -8.584e-03, -3.983e-02, 1.177e-02, -4.721e-02), r);
r = MulAdd(s3_8, M4(6.050e-03, -3.781e-04, -3.124e-03, -1.667e-02, -1.291e-02, -1.315e-02, -2.106e-02, -5.240e-03, 1.412e-02, -2.504e-02, 3.138e-02, -2.989e-02, -6.363e-03, -1.480e-04, 1.157e-03, 1.933e-02), r);
// tanh bounds every component to (-1, 1) before it is returned.
return tanh(r);
}
// Pass 6 ("out-shuffle") entry point.
// Each thread evaluates f0 once for an input-resolution position and uses the four
// returned components to adjust a 2x2 block of OUTPUT pixels: INPUT is sampled with
// the SL sampler at each output pixel, converted with rgb2yuv, the first YUV
// component gets one component of f0's result added (then saturated), and the
// pixel is converted back with yuv2rgb and stored with alpha 1.
//   blockStart - origin of the block this thread group processes
//   tid        - thread id; only tid.x is used, remapped to 2D by Rmp8x8
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE: the l0 / l1 macros appear to pick up locals named `pt` and `pos`
	// (O is defined outside this excerpt), so these two names must not change.
	const float2 pt = float2(GetInputPt());
	// gxy is the top-left pixel of this thread's 2x2 output block.
	const uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}
	// gxy >> 1 is the position the taps are taken around.
	const float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw 3x3 taps of the first input, row by row from (-1, -1) to (1, 1).
	const V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Raw 3x3 taps of the second input, same ordering.
	const V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Split each tap: *p = max(v, 0), *n = -max(-v, 0).
	const V4 ap0 = max(a0, 0.0), an0 = -max(-a0, 0.0);
	const V4 ap1 = max(a1, 0.0), an1 = -max(-a1, 0.0);
	const V4 ap2 = max(a2, 0.0), an2 = -max(-a2, 0.0);
	const V4 ap3 = max(a3, 0.0), an3 = -max(-a3, 0.0);
	const V4 ap4 = max(a4, 0.0), an4 = -max(-a4, 0.0);
	const V4 ap5 = max(a5, 0.0), an5 = -max(-a5, 0.0);
	const V4 ap6 = max(a6, 0.0), an6 = -max(-a6, 0.0);
	const V4 ap7 = max(a7, 0.0), an7 = -max(-a7, 0.0);
	const V4 ap8 = max(a8, 0.0), an8 = -max(-a8, 0.0);

	const V4 bp0 = max(b0, 0.0), bn0 = -max(-b0, 0.0);
	const V4 bp1 = max(b1, 0.0), bn1 = -max(-b1, 0.0);
	const V4 bp2 = max(b2, 0.0), bn2 = -max(-b2, 0.0);
	const V4 bp3 = max(b3, 0.0), bn3 = -max(-b3, 0.0);
	const V4 bp4 = max(b4, 0.0), bn4 = -max(-b4, 0.0);
	const V4 bp5 = max(b5, 0.0), bn5 = -max(-b5, 0.0);
	const V4 bp6 = max(b6, 0.0), bn6 = -max(-b6, 0.0);
	const V4 bp7 = max(b7, 0.0), bn7 = -max(-b7, 0.0);
	const V4 bp8 = max(b8, 0.0), bn8 = -max(-b8, 0.0);

	// One residual per pixel of the 2x2 block (x: TL, y: TR, z: BL, w: BR).
	const V4 r = f0(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);

	static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
	static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };

	const float2 opt = float2(GetOutputPt());

	// Walk the block clockwise starting at the top-left pixel. The sample position
	// is stepped incrementally (+x, +y, -x) on purpose so the floating-point
	// values match the original sequence exactly.
	uint2 dst = gxy;
	float2 uv = pos - 0.5f * opt;

	// Top-left.
	MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);

	// Top-right.
	dst.x += 1;
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);

	// Bottom-right.
	dst.y += 1;
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);

	// Bottom-left.
	dst.x -= 1;
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,772 +0,0 @@
// CuNNy 8x4C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N08
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with the SP sampler at `pos + p * pt`, mip level 0.
// `pos` and `pt` are not parameters: they are taken from the scope where the macro is
// expanded, so every pass function has to declare locals with exactly these names.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the 4-component vector and 4x4 matrix types used by the passes.
// MF4 / MF4x4 are declared outside this file - presumably precision-dependent float types (TODO confirm).
#define V4 MF4
#define M4 MF4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// Pass 1 tap helper: samples INPUT at offset (x, y) through O and collapses its RGB
// value to one scalar with a fixed dot product plus a constant offset.
#define l0(x, y) (dot(MF3(-1.880e-01, -3.696e-01, -8.936e-02), O(INPUT, float2(x, y)).rgb) + MF(5.137e-01))
// Pass 1 ("in") layer: turns nine scalar taps into a 4-component vector.
// Each tap scales its own constant V4 and is added onto the running total with mad,
// starting from a constant initial vector. s0_0..s0_8 are the 3x3 taps in the order
// Pass1 supplies them, row by row from (-1, -1) to (1, 1).
// The accumulation order is kept fixed so the floating-point result is unchanged.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	V4 acc = V4(1.324e-02, -9.379e-05, 8.452e-03, 5.165e-02);
	// Top row.
	acc = mad(s0_0, V4(6.049e-03, -3.524e-01, -1.308e-01, -6.691e-02), acc);
	acc = mad(s0_1, V4(1.720e-02, -7.092e-02, -3.030e-01, 1.654e-01), acc);
	acc = mad(s0_2, V4(-6.706e-03, 2.289e-01, 1.982e-03, -5.756e-02), acc);
	// Middle row.
	acc = mad(s0_3, V4(-2.761e-02, 5.050e-01, -2.036e-01, 1.265e-01), acc);
	acc = mad(s0_4, V4(-8.654e-01, -6.035e-01, -2.119e-01, 5.055e-01), acc);
	acc = mad(s0_5, V4(-7.114e-03, 2.325e-02, 5.721e-02, 4.585e-02), acc);
	// Bottom row.
	acc = mad(s0_6, V4(2.796e-01, 1.680e-01, 1.353e-01, 1.286e-02), acc);
	acc = mad(s0_7, V4(5.684e-01, 3.022e-01, 6.426e-01, 8.931e-02), acc);
	acc = mad(s0_8, V4(3.723e-02, -2.036e-01, 2.732e-02, -4.101e-02), acc);
	return acc;
}
// Pass 1 ("in") entry point.
// Maps the thread to one input pixel, reads the 3x3 neighbourhood of INPUT through
// the l0 macro (one scalar per tap) and stores f0 of those nine values into t0.
//   blockStart - origin of the block this thread group processes
//   tid        - thread id; only tid.x is used, remapped to 2D by Rmp8x8
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are referenced by name inside the O macro that l0 expands to,
	// so these two locals must keep exactly these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = Rmp8x8(tid.x) + blockStart;
	const uint2 size = GetInputSize();
	// Threads that fall outside the image have nothing to do.
	if (any(gxy >= size)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Taps are passed row by row, from (-1, -1) to (1, 1).
	t0[gxy] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): the t0 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: one layer of the network. Pass2 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { 1.959e-02, -5.807e-03, 9.415e-02, 7.247e-03 };
r = MulAdd(s0_0, M4(2.216e-02, 1.062e-01, -3.433e-03, -1.923e-01, 6.300e-02, -4.594e-01, 2.025e-01, 8.655e-03, -5.497e-02, 1.694e-01, -1.806e-01, 2.115e-01, -6.176e-02, 1.167e-02, -5.987e-02, 1.167e-01), r);
r = MulAdd(s0_1, M4(-1.646e-01, -5.524e-01, -1.352e-01, 1.704e-01, 3.398e-02, -2.598e-01, 1.616e-01, -1.772e-01, -5.648e-02, 2.755e-01, 2.638e-02, -2.657e-02, 3.774e-02, -6.833e-02, -1.141e-01, -2.438e-01), r);
r = MulAdd(s0_2, M4(-1.459e-01, 9.939e-02, -6.457e-04, 2.352e-02, 5.006e-02, -7.759e-01, -4.862e-02, -3.366e-02, 9.508e-02, 1.537e-01, -6.771e-02, -1.260e-01, 1.067e-01, -5.893e-02, -9.811e-02, -1.060e-02), r);
r = MulAdd(s0_3, M4(-2.901e-01, 2.907e-01, 2.178e-01, -3.877e-01, 9.034e-03, 8.718e-03, -1.213e-01, 9.252e-02, 3.286e-01, -8.247e-02, -5.573e-02, -3.852e-01, -1.371e-01, 1.877e-01, 2.337e-01, 5.324e-01), r);
r = MulAdd(s0_4, M4(-9.182e-01, 1.013e-01, 2.969e-01, 7.117e-01, -2.367e-01, -7.128e-02, 1.828e-01, 5.993e-01, -2.965e-01, 1.323e-01, 3.117e-02, -3.215e-01, -1.410e-01, 5.359e-02, -1.137e-01, -2.603e-01), r);
r = MulAdd(s0_5, M4(-1.071e-01, -8.801e-02, 9.524e-03, -2.937e-02, 7.723e-02, 1.195e-01, -9.056e-02, 6.161e-02, 1.962e-01, -2.740e-01, -9.418e-02, 1.141e-01, 6.203e-02, -1.084e-01, 2.402e-01, -2.066e-01), r);
r = MulAdd(s0_6, M4(2.226e-01, -2.259e-01, -2.499e-02, -9.184e-02, -1.499e-01, -3.737e-02, 1.576e-01, 1.084e-01, -2.221e-01, -1.080e-02, 2.643e-02, -1.023e-01, 1.068e-01, 1.193e-01, -2.781e-01, 3.396e-01), r);
r = MulAdd(s0_7, M4(7.520e-01, -1.043e-01, -4.535e-02, 2.775e-01, 1.577e-01, -1.526e-01, 1.796e-01, 1.085e-01, -1.012e+00, 4.333e-02, 1.270e-02, -1.692e-01, 1.127e-01, -2.847e-01, -1.784e-01, -3.956e-01), r);
r = MulAdd(s0_8, M4(2.206e-01, 1.370e-01, -7.453e-02, 1.050e-01, 8.412e-02, -1.396e-01, 1.707e-02, -1.654e-02, -2.116e-01, -7.944e-02, 1.244e-01, -6.709e-02, -5.577e-02, 1.619e-01, -2.818e-01, 1.460e-01), r);
r = MulAdd(s1_0, M4(1.180e-01, -2.345e-01, 5.406e-02, -1.102e-01, 1.559e-02, -3.865e-01, -1.077e-01, 1.442e-02, -1.405e-01, 1.578e-01, -3.338e-02, 1.157e-01, -1.676e-01, 4.656e-02, -1.507e-01, 2.590e-02), r);
r = MulAdd(s1_1, M4(-3.112e-02, -5.537e-01, -3.626e-01, -2.915e-01, 7.495e-02, 4.473e-01, -1.847e-01, -8.743e-02, -3.290e-02, 3.660e-02, 1.252e-01, 1.058e-02, 1.193e-01, 6.421e-02, -1.456e-01, -1.693e-01), r);
r = MulAdd(s1_2, M4(-1.047e-01, -4.306e-01, 6.486e-03, 1.137e-01, 2.935e-02, -3.608e-01, 5.242e-02, -2.374e-02, 1.130e-01, -4.864e-02, -7.302e-02, -2.205e-02, 8.227e-02, -8.403e-02, -9.468e-02, 8.095e-02), r);
r = MulAdd(s1_3, M4(-3.759e-02, 2.709e-01, 1.269e-01, -4.994e-01, -1.577e-02, 1.871e-01, -2.532e-01, 8.960e-02, 2.298e-01, -2.462e-01, -1.634e-02, -3.955e-01, 2.750e-02, -4.812e-02, -2.441e-01, 9.926e-01), r);
r = MulAdd(s1_4, M4(-7.288e-01, 5.644e-01, 1.042e+00, 6.160e-01, -4.271e-01, 4.419e-01, 1.437e-01, 3.840e-01, -1.220e-01, -8.627e-01, 6.664e-02, -1.220e-02, 5.260e-02, 1.505e-01, -2.182e-01, -6.116e-01), r);
r = MulAdd(s1_5, M4(1.659e-01, 2.566e-01, -5.954e-02, -9.187e-02, -8.251e-02, 1.091e-01, -1.506e-01, 1.370e-01, 3.056e-01, -3.512e-01, -4.956e-03, 7.008e-02, 1.320e-01, -3.995e-01, -8.603e-03, -3.542e-01), r);
r = MulAdd(s1_6, M4(2.549e-01, -7.946e-02, -1.755e-01, -2.902e-02, -1.912e-01, 2.349e-01, 6.770e-02, 9.683e-02, -2.690e-01, -1.715e-01, 5.692e-02, -1.064e-01, 2.998e-01, 7.619e-02, 8.040e-03, 2.706e-01), r);
r = MulAdd(s1_7, M4(7.320e-01, 1.397e-01, -5.600e-02, 9.609e-02, -1.267e-01, 6.841e-02, 2.429e-01, 3.167e-02, -6.816e-01, -3.313e-03, 5.622e-02, -4.727e-02, -3.420e-01, 4.283e-02, -3.250e-01, -4.118e-01), r);
r = MulAdd(s1_8, M4(1.607e-01, 1.581e-01, -6.049e-02, 9.118e-02, -1.583e-02, 2.918e-01, 1.703e-02, -1.206e-01, -2.114e-01, -1.248e-01, 6.689e-02, -2.131e-02, -7.779e-02, 1.069e-01, -1.181e-01, 2.230e-01), r);
return r;
}
// Pass 2 entry point: reads the 3x3 neighbourhood of t0 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t1.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t1[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): the t1 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0: one layer of the network. Pass3 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { 3.240e-02, -1.989e-01, -2.700e-02, 6.578e-03 };
r = MulAdd(s0_0, M4(9.727e-02, 1.849e-01, 2.125e-02, 1.933e-01, 9.183e-02, 8.307e-03, -9.035e-02, 3.241e-02, 1.141e-01, 8.739e-02, -9.547e-02, 1.616e-01, 2.912e-02, -1.780e-02, 5.433e-02, 2.720e-02), r);
r = MulAdd(s0_1, M4(-1.524e-01, -9.138e-02, 8.798e-02, -1.691e-01, 8.519e-03, 3.597e-02, -1.784e-02, 3.049e-02, 3.078e-02, 1.823e-01, 1.051e-02, -5.317e-02, -1.977e-01, 1.013e-01, 1.215e-01, 4.261e-02), r);
r = MulAdd(s0_2, M4(-1.992e-02, -1.191e-01, -1.365e-03, 3.976e-02, 3.452e-03, 7.503e-03, 4.850e-03, 8.970e-03, -7.652e-03, 1.166e-01, 9.888e-02, 3.423e-03, -3.354e-01, -3.335e-01, -2.226e-02, -1.509e-01), r);
r = MulAdd(s0_3, M4(-7.994e-02, 1.374e-01, -1.701e-02, -2.530e-01, 2.153e-01, -6.957e-03, -1.405e-01, -6.175e-02, 7.274e-03, 1.734e-01, -9.107e-02, -1.303e-01, -1.265e-01, 1.669e-02, 3.494e-02, -8.377e-02), r);
r = MulAdd(s0_4, M4(-1.124e+00, 1.355e-02, -1.979e-01, -4.092e-01, -1.276e-01, -1.096e-01, 5.949e-02, 1.073e-01, -4.780e-02, 1.378e-01, 1.905e-01, -9.525e-02, -5.999e-01, 1.274e-01, 8.416e-01, 2.483e-01), r);
r = MulAdd(s0_5, M4(3.312e-01, 2.036e-01, -5.231e-02, 5.357e-02, 1.666e-03, -2.102e-03, -3.213e-03, 4.747e-02, 1.130e-01, 3.492e-01, -1.263e-01, 4.100e-01, -5.859e-01, 4.875e-02, 2.227e-01, 3.127e-01), r);
r = MulAdd(s0_6, M4(-3.699e-02, 6.066e-02, 3.448e-03, -4.158e-03, -4.048e-03, -3.619e-02, -8.830e-02, -8.917e-03, 2.990e-02, 6.919e-03, 9.803e-02, 2.188e-02, 5.674e-02, -3.122e-02, -6.793e-02, 8.573e-02), r);
r = MulAdd(s0_7, M4(-1.255e-01, 1.754e-01, -1.332e-01, -1.124e-01, -2.163e-01, 1.552e-02, -7.485e-04, 4.194e-02, -1.899e-01, 1.334e-01, -1.721e-01, -3.487e-01, 3.847e-01, -3.823e-02, 1.121e-02, -7.128e-02), r);
r = MulAdd(s0_8, M4(7.152e-02, -1.631e-02, 4.810e-02, 1.435e-01, 3.881e-02, -3.596e-02, -7.544e-03, -1.071e-01, -8.509e-02, 1.110e-01, 8.542e-02, 1.980e-02, -1.134e-01, -7.967e-02, -1.586e-01, 2.511e-01), r);
r = MulAdd(s1_0, M4(2.326e-01, 4.791e-02, -1.996e-01, 1.352e-02, -9.909e-03, 1.117e-01, 2.198e-02, -6.683e-02, 1.356e-01, 2.830e-01, -8.418e-02, 2.137e-01, -1.401e-02, -7.056e-02, 5.360e-02, 6.243e-02), r);
r = MulAdd(s1_1, M4(7.739e-01, -3.172e-01, -2.031e-01, 2.054e-01, -1.263e-01, -7.571e-03, 8.090e-02, -1.372e-01, 1.053e-01, 2.982e-01, -6.235e-02, 1.452e-02, 1.973e-01, 9.233e-02, -1.067e-01, 1.088e-01), r);
r = MulAdd(s1_2, M4(-1.136e-01, -1.332e-01, -7.369e-02, 2.046e-01, -9.302e-02, 2.722e-02, 9.461e-02, -1.895e-01, 1.216e-02, 2.595e-01, 1.028e-01, 8.413e-02, -1.339e-01, -2.259e-01, -1.047e-01, 5.994e-02), r);
r = MulAdd(s1_3, M4(1.224e-01, -3.713e-02, -2.383e-01, -1.743e-01, -1.876e-01, 1.155e-01, 2.212e-01, -1.375e-01, 1.618e-01, 2.628e-01, -1.161e-01, -1.826e-01, 8.003e-02, -1.961e-02, -6.278e-02, -5.710e-02), r);
r = MulAdd(s1_4, M4(-2.647e-01, -1.603e-01, -7.731e-01, 1.958e-01, -4.093e-01, -1.110e-01, 3.352e-01, -3.093e-02, -6.201e-01, 3.073e-01, 3.779e-01, -2.733e-01, 4.035e-01, 1.230e-01, -1.606e-01, 9.421e-02), r);
r = MulAdd(s1_5, M4(1.981e-01, -8.801e-03, -9.874e-03, -4.003e-02, 2.686e-03, -1.346e-01, -1.813e-02, -1.003e-01, 1.561e-01, 3.252e-01, -1.189e-01, 2.014e-01, 1.343e-01, 4.088e-02, -9.918e-02, 1.025e+00), r);
r = MulAdd(s1_6, M4(-2.323e-02, 3.284e-02, -5.099e-03, -3.025e-02, -1.458e-02, -1.640e-02, 1.268e-01, -3.787e-02, 5.078e-02, 4.529e-02, 1.050e-02, -8.079e-03, -1.530e-02, -6.509e-02, -1.620e-01, 6.662e-02), r);
r = MulAdd(s1_7, M4(3.972e-02, 8.570e-02, -8.723e-02, -3.746e-02, -1.902e-01, 5.121e-02, 1.161e-01, -4.624e-02, -6.268e-02, 1.852e-01, -1.535e-01, -2.023e-01, 2.476e-01, -2.211e-02, -1.590e-01, -3.109e-02), r);
r = MulAdd(s1_8, M4(-8.025e-03, -4.798e-02, 5.162e-02, 6.616e-02, -2.416e-02, -5.815e-02, -1.334e-02, -1.029e-01, 5.381e-02, 1.539e-01, 4.511e-02, 1.426e-01, -5.511e-02, -9.311e-02, -3.072e-02, 1.572e-01), r);
return r;
}
// Pass 3 entry point: reads the 3x3 neighbourhood of t1 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t0.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): the t0 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: one layer of the network. Pass4 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { 4.014e-03, -2.020e-02, 1.560e-02, -2.352e-02 };
r = MulAdd(s0_0, M4(9.384e-02, 1.183e-01, 5.136e-02, -4.583e-01, -1.060e-01, 6.124e-02, -1.479e-01, -2.457e-01, -5.881e-02, 4.756e-03, -2.540e-02, -5.047e-02, -1.897e-01, 4.062e-02, 1.226e-02, 1.465e-01), r);
r = MulAdd(s0_1, M4(-1.890e-01, -9.535e-02, 2.627e-01, 3.224e-01, 1.050e-01, -3.922e-02, -3.551e-01, -2.632e-01, -2.349e-01, -5.605e-02, -2.856e-01, 4.331e-01, -2.614e-02, -6.027e-02, -3.236e-02, 2.873e-01), r);
r = MulAdd(s0_2, M4(-1.702e-01, 7.462e-02, 2.168e-01, 4.212e-01, 8.150e-03, 6.671e-02, -2.781e-01, -1.322e-01, -3.933e-02, 2.698e-02, -3.420e-01, -1.116e-02, -1.788e-02, 8.701e-03, -1.044e-01, 1.264e-01), r);
r = MulAdd(s0_3, M4(3.573e-01, -4.592e-02, 4.539e-01, 2.854e-01, -6.463e-01, -1.763e-01, 6.236e-01, 7.125e-02, 4.126e-01, -1.621e-02, 1.685e-02, 2.328e-01, -5.456e-01, -2.113e-01, 1.424e-01, 1.414e-01), r);
r = MulAdd(s0_4, M4(3.838e-01, -1.008e+00, 4.023e-01, 1.302e+00, -1.503e-01, 4.245e-02, 1.496e+00, -3.479e-01, -3.763e-01, -7.877e-01, 4.081e-01, -2.192e-01, -2.853e-01, 2.123e-01, -3.407e-01, 2.423e-01), r);
r = MulAdd(s0_5, M4(5.073e-03, -2.123e-01, 1.851e-01, 1.482e-01, -2.814e-01, 1.262e-01, 6.890e-01, -2.317e-01, 6.427e-02, -5.801e-02, -3.684e-02, 7.526e-02, 1.309e-02, -2.125e-02, -7.760e-02, 4.795e-02), r);
r = MulAdd(s0_6, M4(1.409e-01, -1.062e-01, 1.665e-01, 5.277e-01, 6.676e-01, -1.872e-01, 1.251e+00, 1.165e-01, -2.287e-02, -5.235e-02, -2.028e-03, -3.305e-02, -1.968e-01, 1.898e-01, -9.538e-02, -1.418e-01), r);
r = MulAdd(s0_7, M4(7.353e-02, -3.073e-01, 1.789e-01, 2.137e-01, -6.435e-01, -6.052e-01, 2.259e+00, 2.884e-02, 7.105e-04, 1.247e-01, -7.393e-02, 2.539e-02, 1.194e-01, 1.870e-01, -1.126e-01, 2.444e-02), r);
r = MulAdd(s0_8, M4(3.853e-02, -2.242e-01, 1.470e-01, 1.701e-02, 4.586e-02, 2.027e-01, 7.448e-01, -4.414e-01, 9.096e-03, 1.277e-01, 4.010e-02, 1.064e-02, 2.401e-02, 1.901e-02, 1.956e-02, 8.744e-02), r);
r = MulAdd(s1_0, M4(-4.741e-02, 1.819e-03, -8.321e-02, -1.496e-01, -1.801e-02, 4.682e-02, -6.041e-02, -7.243e-02, -1.478e-01, 4.970e-02, 6.424e-02, -5.378e-02, -9.117e-02, 5.496e-02, -2.648e-02, -4.042e-02), r);
r = MulAdd(s1_1, M4(-8.815e-02, 5.938e-02, -2.433e-01, 1.737e-01, 1.095e-01, -5.108e-02, -5.729e-02, 8.334e-03, -2.763e-01, -6.431e-02, -2.454e-02, 4.055e-01, 2.113e-02, -1.298e-01, -3.908e-02, -1.780e-02), r);
r = MulAdd(s1_2, M4(-1.905e-02, 3.894e-02, -1.293e-01, 8.303e-03, -7.800e-03, -5.508e-03, 8.606e-02, -7.501e-02, 1.542e-02, 3.046e-02, -2.920e-01, -4.240e-02, -3.932e-02, -1.813e-02, -8.213e-02, 1.017e-01), r);
r = MulAdd(s1_3, M4(1.965e-01, 3.626e-02, 3.418e-02, 9.779e-02, -6.664e-02, -2.295e-02, -2.736e-02, 1.091e-01, 1.129e-01, -3.896e-02, 1.171e-02, -2.870e-02, -1.382e-01, -1.691e-01, 3.018e-01, -1.186e-01), r);
r = MulAdd(s1_4, M4(1.075e-01, -6.894e-01, 1.714e-01, 5.097e-01, 9.868e-03, 1.087e-01, 2.107e-01, -6.591e-02, -3.233e-01, -9.792e-01, -1.189e-01, -5.480e-01, -1.157e-01, 5.941e-02, -5.770e-01, -1.030e-01), r);
r = MulAdd(s1_5, M4(3.289e-02, 3.941e-02, 1.824e-01, 7.260e-04, -9.787e-03, 3.128e-02, -1.333e-01, 1.352e-01, 5.954e-03, -2.520e-01, -8.536e-02, -3.566e-01, 2.998e-02, -5.941e-02, -8.531e-02, -4.232e-02), r);
r = MulAdd(s1_6, M4(2.592e-02, -7.528e-02, -1.956e-02, 1.002e-01, 2.992e-02, -1.673e-01, 4.413e-02, 1.683e-01, 1.440e-02, -1.047e-02, 1.425e-02, -1.292e-01, -1.777e-01, 1.220e-01, -6.381e-02, 4.174e-02), r);
r = MulAdd(s1_7, M4(-3.107e-02, -8.612e-02, 1.248e-02, -8.544e-02, -1.161e-01, 7.718e-02, -1.150e-01, -1.699e-01, -1.392e-02, 7.590e-02, -5.195e-02, -3.599e-01, 4.872e-02, 1.381e-01, -1.143e-01, -1.473e-03), r);
r = MulAdd(s1_8, M4(1.277e-02, 3.020e-02, 4.658e-02, 8.071e-02, 6.867e-02, -2.693e-02, 7.897e-02, -1.264e-02, -1.035e-03, 1.509e-01, 4.169e-02, -1.716e-01, -4.694e-03, 1.627e-02, 7.171e-03, -4.496e-02), r);
return r;
}
// Pass 4 entry point: reads the 3x3 neighbourhood of t0 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t1.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t1[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): the t1 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0: one layer of the network. Pass5 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { -5.942e-03, -2.718e-02, -1.234e-02, 3.307e-02 };
r = MulAdd(s0_0, M4(3.174e-02, -2.020e-01, -6.843e-03, 1.049e-01, 1.680e-01, -6.387e-01, -1.541e-01, -1.952e-01, -4.586e-02, -1.580e-01, -5.507e-02, 1.065e-01, -5.257e-03, -9.464e-02, -9.788e-02, 1.221e-01), r);
r = MulAdd(s0_1, M4(1.365e-01, -4.220e-02, -4.186e-02, -1.569e-01, -5.527e-01, -1.180e-01, -2.274e-01, -2.007e-01, 2.207e-02, 1.190e-02, 3.746e-02, -1.565e-01, -2.808e-02, 1.657e-02, -5.376e-02, -1.093e-02), r);
r = MulAdd(s0_2, M4(-7.935e-02, -3.809e-02, -3.727e-02, -4.730e-02, -8.556e-02, 3.451e-04, -8.191e-02, 8.086e-02, 2.051e-02, 7.072e-03, 2.537e-02, 2.793e-02, 9.384e-04, -3.624e-02, -2.171e-02, 7.103e-02), r);
r = MulAdd(s0_3, M4(-1.261e-02, 2.716e-01, 2.739e-01, -7.349e-02, -2.130e-02, -4.131e-01, -1.851e-01, 1.065e-01, -7.827e-02, 2.868e-01, -1.500e-01, -1.442e-01, -1.842e-02, -2.983e-01, -4.232e-02, 1.395e-01), r);
r = MulAdd(s0_4, M4(2.733e-01, 4.015e-01, 4.102e-01, -2.027e-01, 4.229e-01, 2.213e-01, 3.628e-01, -1.011e-01, -4.893e-01, 1.333e-01, -4.245e-02, -8.133e-02, -1.086e-02, -1.089e-01, -8.720e-02, 1.513e-01), r);
r = MulAdd(s0_5, M4(8.521e-02, 1.460e-01, 1.589e-01, -2.075e-01, -5.391e-02, 7.449e-03, -6.763e-02, -2.352e-01, 4.055e-02, -1.812e-02, -1.413e-02, 9.240e-02, -3.070e-02, -4.975e-03, -8.972e-02, -2.225e-02), r);
r = MulAdd(s0_6, M4(1.880e-01, -1.481e-01, 1.001e-01, 6.339e-02, -6.208e-02, -2.814e-02, -5.944e-03, 1.002e-01, -7.822e-02, 1.010e-01, -2.161e-02, 9.175e-02, 1.495e-02, 1.645e-02, 8.901e-03, -3.865e-02), r);
r = MulAdd(s0_7, M4(4.449e-01, -1.089e-01, -1.249e-01, -8.911e-01, 3.096e-02, 1.724e-01, 5.605e-02, -7.605e-02, -9.644e-02, -1.191e-01, -1.332e-01, 2.544e-02, 5.659e-02, -2.706e-04, -9.886e-02, 9.218e-02), r);
r = MulAdd(s0_8, M4(7.394e-02, -2.112e-01, 1.505e-02, -1.236e-01, -1.848e-02, -2.716e-02, -6.663e-02, 2.764e-02, -1.120e-02, 3.440e-03, -1.443e-02, 1.745e-02, -3.847e-02, -4.228e-03, -8.888e-02, 2.134e-02), r);
r = MulAdd(s1_0, M4(6.588e-03, -6.764e-02, -2.660e-02, -3.967e-02, 6.459e-02, -6.345e-01, -5.784e-01, 9.294e-02, 2.426e-02, -9.858e-02, -9.036e-02, -9.545e-02, 2.094e-02, -1.001e-01, -1.145e-01, -6.470e-02), r);
r = MulAdd(s1_1, M4(-2.633e-03, 5.849e-02, 3.154e-02, -7.386e-02, -6.412e-01, -4.405e-01, -5.885e-01, 1.657e-01, -1.757e-01, -1.882e-02, -1.023e-01, -1.713e-01, -1.047e-01, -1.558e-01, -1.509e-01, -2.815e-01), r);
r = MulAdd(s1_2, M4(1.880e-01, -3.790e-02, 1.112e-01, 1.672e-02, -1.713e-01, 2.611e-02, -9.008e-02, 9.359e-02, -6.567e-02, 9.399e-02, 3.743e-02, 3.662e-02, 3.190e-02, -1.466e-01, -1.154e-01, 1.692e-02), r);
r = MulAdd(s1_3, M4(-1.733e-02, 1.381e-01, 8.342e-02, -5.893e-02, -1.467e-02, -4.365e-01, -3.057e-01, 1.506e-01, 7.300e-02, 6.777e-01, -5.484e-03, -3.499e-01, 1.978e-01, -6.846e-01, -2.921e-01, -1.173e-01), r);
r = MulAdd(s1_4, M4(-1.829e-01, -4.506e-01, -5.685e-02, 8.260e-01, 3.056e-01, 1.803e-01, 1.908e-01, -2.029e-01, -1.578e-01, 5.039e-01, 3.016e-01, -4.971e-01, -4.977e-01, 4.537e-01, -4.268e-01, 7.878e-01), r);
r = MulAdd(s1_5, M4(-3.251e-01, -1.229e-01, -1.447e-01, 3.290e-01, -2.134e-01, -6.542e-03, -7.109e-02, -1.004e-01, 3.887e-02, -1.008e-01, -7.490e-02, 6.126e-02, 2.757e-01, -1.980e-01, -1.792e-01, 2.722e-01), r);
r = MulAdd(s1_6, M4(4.765e-02, -5.401e-02, 4.164e-02, 1.847e-03, -3.178e-02, -4.201e-02, -2.504e-02, 1.350e-02, -1.436e-01, 1.654e-01, -1.099e-02, -3.733e-02, 1.118e-01, -2.529e-01, -1.353e-01, -9.309e-02), r);
r = MulAdd(s1_7, M4(1.684e-01, -1.978e-01, 2.645e-02, -9.582e-02, 2.618e-02, 9.350e-02, -2.281e-02, -1.901e-01, 1.176e-02, -1.571e-01, 1.491e-02, -2.105e-01, -1.685e-01, -2.459e-01, -2.166e-01, 1.082e-01), r);
r = MulAdd(s1_8, M4(2.225e-02, 7.813e-02, -4.112e-02, 6.166e-02, -4.143e-02, -2.160e-02, -7.478e-02, -2.251e-02, -1.306e-02, -6.002e-02, -7.496e-02, -2.538e-03, 7.824e-02, 9.597e-02, -3.546e-03, -1.794e-01), r);
return r;
}
// Pass 5 entry point: reads the 3x3 neighbourhood of t1 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t0.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): the t0 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: one layer of the network. Pass6 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { 1.102e-03, 4.481e-03, 3.096e-03, -9.818e-03 };
r = MulAdd(s0_0, M4(-1.069e-01, 1.009e-01, -5.972e-02, -1.732e-02, -9.217e-02, 9.177e-03, -3.127e-02, -5.872e-02, -1.364e-02, -9.990e-04, 1.518e-01, 5.861e-02, -9.835e-02, -1.155e-01, 6.714e-02, -5.142e-02), r);
r = MulAdd(s0_1, M4(1.404e-02, 1.372e-01, -2.759e-01, -4.361e-02, -1.407e-01, 1.570e-01, -1.216e-01, -7.289e-02, 3.088e-01, -1.285e-01, 1.107e-01, 1.651e-01, 1.596e-01, -1.569e-01, 1.437e-02, -1.455e-01), r);
r = MulAdd(s0_2, M4(-4.001e-02, 1.772e-01, -2.761e-01, 4.916e-02, -1.489e-01, 1.680e-01, -5.244e-02, 1.334e-01, 1.245e-01, -2.321e-01, 5.371e-01, -2.549e-01, -9.624e-02, -1.072e-01, 2.322e-01, -2.261e-01), r);
r = MulAdd(s0_3, M4(-2.291e-01, 7.774e-04, -1.015e-02, 6.036e-02, -1.133e-01, 7.554e-02, 1.081e-01, 1.704e-01, 2.123e-01, -2.065e-01, 4.928e-02, 2.352e-03, -2.488e-01, -1.765e-01, 2.044e-01, 1.302e-02), r);
r = MulAdd(s0_4, M4(3.195e-01, -5.410e-01, -4.771e-01, -1.713e-01, 2.778e-01, -1.028e-01, 8.603e-02, 2.162e-01, 1.466e-02, 2.633e-02, -3.299e-01, -5.183e-02, -3.598e-01, -4.015e-01, 5.674e-02, -1.429e-01), r);
r = MulAdd(s0_5, M4(-1.480e-01, 2.440e-01, -2.189e-01, 1.407e-01, -3.439e-01, 2.624e-01, 4.947e-01, 7.813e-01, 1.067e-01, -6.781e-02, -5.271e-02, -1.331e-02, -2.133e-01, -1.038e-01, 4.267e-01, -4.026e-01), r);
r = MulAdd(s0_6, M4(-1.086e-01, 2.607e-01, -1.897e-01, -1.710e-01, 6.096e-02, -1.121e-01, 8.797e-02, -8.204e-02, 4.825e-02, -9.364e-02, 8.472e-02, -1.923e-02, -1.755e-01, 1.086e-01, -3.987e-02, 1.737e-02), r);
r = MulAdd(s0_7, M4(5.606e-02, 4.516e-02, -7.352e-02, 7.654e-02, -6.706e-02, 2.674e-01, -2.388e-01, -1.997e-01, 9.871e-02, -9.055e-02, 1.274e-01, 1.854e-01, -1.765e-01, -1.779e-01, 1.114e-01, -1.882e-01), r);
r = MulAdd(s0_8, M4(-4.811e-02, 2.057e-01, -2.913e-01, 1.265e-01, 1.304e-01, 1.462e-01, -4.432e-03, 4.191e-01, 6.606e-02, -1.382e-01, 1.052e-01, -3.990e-01, 9.737e-02, -9.675e-02, 6.216e-02, -2.130e-01), r);
r = MulAdd(s1_0, M4(-1.183e-01, -5.696e-02, 9.372e-02, 3.074e-03, -2.694e-02, -2.272e-02, -3.489e-02, -2.667e-02, 1.635e-01, -5.761e-04, -1.677e-03, -1.076e-01, -5.411e-02, -1.100e-02, 1.742e-02, 6.403e-02), r);
r = MulAdd(s1_1, M4(-4.462e-03, 3.912e-02, -1.208e-01, -9.360e-02, -1.260e-01, 1.602e-02, -1.047e-01, -1.252e-01, 2.940e-01, 1.068e-01, -2.602e-01, 1.692e-01, 1.120e-01, -2.613e-02, -1.083e-02, 1.754e-02), r);
r = MulAdd(s1_2, M4(2.307e-02, 1.240e-01, -2.024e-01, 1.761e-01, -2.326e-01, 3.209e-02, 5.352e-02, 3.399e-02, 1.754e-01, -3.059e-01, 4.554e-01, -2.412e-01, 4.242e-03, 3.919e-02, 7.769e-02, -1.155e-01), r);
r = MulAdd(s1_3, M4(-1.946e-01, -9.445e-02, 1.698e-01, 1.165e-01, -1.571e-01, 1.700e-02, 5.682e-02, 4.628e-02, 4.425e-01, -1.872e-01, 3.713e-02, 8.537e-02, 4.211e-02, -6.178e-02, 1.398e-02, 5.929e-02), r);
r = MulAdd(s1_4, M4(5.957e-01, -6.855e-01, -3.668e-01, -2.565e-01, -4.383e-02, -8.094e-02, -2.101e-02, -2.446e-01, -7.781e-02, 5.879e-01, -5.272e-01, -1.786e-01, -2.396e-01, -4.148e-01, 5.226e-02, 9.011e-02), r);
r = MulAdd(s1_5, M4(-4.655e-02, 1.107e-01, -1.109e-01, 3.601e-01, -2.103e-01, 3.712e-01, 1.666e-01, 3.972e-01, -2.227e-02, -2.115e-02, -7.054e-02, -1.216e-01, 4.739e-03, 1.201e-01, 1.335e-01, -1.775e-01), r);
r = MulAdd(s1_6, M4(-7.542e-02, 9.157e-02, 1.143e-02, -7.961e-02, -3.812e-02, 1.722e-02, 1.396e-02, -3.920e-02, -6.220e-03, -6.723e-02, 9.364e-02, -4.804e-02, -8.885e-02, 1.313e-01, -7.872e-02, 2.733e-02), r);
r = MulAdd(s1_7, M4(-3.879e-01, -2.705e-01, 3.305e-01, -1.542e-01, -1.179e-01, 9.695e-02, -1.353e-01, -2.320e-01, 1.433e-02, -2.689e-01, 2.066e-01, 3.704e-01, -5.587e-02, -6.296e-02, 6.326e-02, 1.881e-02), r);
r = MulAdd(s1_8, M4(-4.722e-02, -5.909e-02, 4.089e-02, -8.851e-02, 2.017e-01, -2.652e-02, 9.432e-02, 3.252e-01, -2.219e-01, 2.142e-02, -4.496e-02, 5.456e-02, 2.364e-02, 1.081e-01, -9.898e-02, 9.928e-02), r);
return r;
}
// Pass 6 entry point: reads the 3x3 neighbourhood of t0 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t1.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t1[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): the t1 sample at offset (x, y), converted to V4. Expands the O macro,
// so `pos` and `pt` must be in scope at the expansion site.
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0: one layer of the network. Pass7 passes, for each of the nine 3x3 taps
// (row-major, index 0 = (-1,-1) ... 8 = (1,1)), the positive part max(x, 0) as
// s0_k and the negative part -max(-x, 0) as s1_k. Each input is combined with its
// own constant 4x4 matrix through MulAdd and accumulated onto a constant bias.
// NOTE(review): MulAdd is defined outside this section; presumably it returns
// (vector * matrix) + r - confirm. All constants are generated weights.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Bias term that the MulAdd chain accumulates into.
V4 r = { -6.918e-03, -1.945e-03, -7.751e-03, 1.645e-02 };
r = MulAdd(s0_0, M4(-3.488e-02, 3.507e-02, 3.848e-02, -5.906e-02, 9.669e-02, 3.121e-02, -2.182e-02, 1.691e-01, -1.132e-01, -7.602e-02, -5.000e-02, -6.017e-03, 3.962e-02, 1.086e-01, -3.343e-04, 9.002e-02), r);
r = MulAdd(s0_1, M4(9.453e-02, -1.793e-01, -6.074e-02, 5.317e-03, 1.056e-01, 3.460e-01, 5.291e-02, 7.825e-02, 5.510e-02, 4.818e-02, -1.119e-02, 3.913e-02, -8.177e-02, -1.060e-01, -9.989e-03, -9.245e-02), r);
r = MulAdd(s0_2, M4(-8.190e-02, 1.375e-01, -4.322e-02, -6.721e-02, 1.645e-02, -1.392e-01, 7.103e-02, -1.950e-02, 4.302e-03, -3.213e-02, -7.517e-03, -3.406e-03, -2.132e-02, 1.333e-01, -6.553e-02, 7.300e-02), r);
r = MulAdd(s0_3, M4(-1.102e-01, 3.005e-01, -8.521e-02, 3.002e-01, 1.866e-01, 1.089e-01, -2.968e-02, 1.271e-01, -3.566e-01, 1.224e-01, -7.462e-02, -2.765e-01, 5.175e-02, 1.567e-01, 1.450e-01, -1.948e-01), r);
r = MulAdd(s0_4, M4(1.558e-01, 3.780e-02, 9.697e-02, -2.485e-01, -3.560e-01, -3.667e-01, 1.396e-01, 1.020e+00, -2.319e-01, -2.878e-01, -2.849e-01, 5.648e-01, 2.094e-01, -5.684e-01, 1.482e-01, -6.172e-01), r);
r = MulAdd(s0_5, M4(-1.276e-01, -1.685e-01, 4.271e-01, -1.489e-01, 2.154e-01, 2.661e-01, -1.093e-01, -7.859e-02, 6.618e-02, 9.795e-02, 2.778e-02, -1.286e-01, -1.527e-01, -3.586e-01, 2.523e-01, 9.196e-02), r);
r = MulAdd(s0_6, M4(-1.354e-01, -6.680e-02, 5.541e-02, -5.314e-02, 1.639e-02, -1.639e-01, -1.856e-01, -1.863e-01, -1.519e-01, -5.459e-02, 1.027e-01, 6.492e-02, 3.482e-02, -9.074e-03, 1.861e-01, 1.393e-01), r);
r = MulAdd(s0_7, M4(1.907e-02, 1.189e-02, -5.038e-01, -8.478e-02, 3.643e-01, 1.086e-02, 3.067e-01, 1.071e-01, -6.552e-01, 1.505e-01, -7.394e-01, 1.155e-01, -1.815e-01, -1.739e-02, -2.723e-01, -1.607e-01), r);
r = MulAdd(s0_8, M4(-8.319e-02, -2.563e-02, -1.127e-01, -7.792e-02, 1.295e-01, 1.091e-01, 2.920e-02, -5.761e-02, -9.443e-02, 7.429e-03, -2.117e-01, -3.670e-02, -7.118e-02, -4.469e-02, -6.460e-02, -1.261e-02), r);
r = MulAdd(s1_0, M4(2.400e-02, -2.740e-02, -3.394e-02, 5.817e-02, -6.716e-02, -5.672e-02, -7.339e-02, -3.921e-02, -9.506e-02, -3.805e-02, -3.235e-02, -8.145e-02, 1.265e-02, 7.308e-02, -5.707e-02, 1.141e-01), r);
r = MulAdd(s1_1, M4(-1.565e-01, 1.052e-01, -8.934e-02, -6.945e-02, 3.804e-02, 2.091e-01, -1.102e-01, 2.394e-01, 6.041e-02, -9.942e-02, -6.054e-03, 4.857e-02, -7.265e-02, 1.596e-02, 9.135e-02, -8.397e-02), r);
r = MulAdd(s1_2, M4(-9.449e-02, 1.121e-01, -1.101e-01, -2.980e-02, 5.100e-02, -6.337e-02, 1.692e-01, -5.062e-02, -3.931e-02, 1.083e-01, 3.952e-03, 9.801e-04, -6.425e-02, 8.015e-02, -1.628e-01, 8.317e-02), r);
r = MulAdd(s1_3, M4(7.400e-02, 8.412e-02, 2.984e-02, 8.693e-02, -1.474e-01, -3.529e-02, -6.134e-02, -1.107e-01, -3.264e-01, 8.009e-02, -2.261e-01, -1.472e-01, -4.683e-02, -1.258e-01, 1.061e-01, -1.125e-01), r);
r = MulAdd(s1_4, M4(4.970e-01, -1.211e-01, 2.379e-01, 2.124e-01, -1.003e-01, -5.656e-01, 5.001e-02, 4.959e-01, 1.538e-01, -7.985e-01, -2.085e-01, 2.220e-01, 7.247e-02, 6.581e-02, -9.437e-02, -3.066e-01), r);
r = MulAdd(s1_5, M4(-8.611e-02, -9.199e-02, 2.518e-01, -7.482e-02, -1.208e-01, 1.015e-01, 3.428e-02, -1.354e-01, 1.038e-01, -4.497e-02, 2.744e-01, -4.281e-02, 4.090e-02, -2.726e-01, 1.839e-01, 1.138e-01), r);
r = MulAdd(s1_6, M4(-8.703e-02, -4.776e-02, -1.477e-01, -1.870e-02, -1.072e-01, 3.204e-02, -8.396e-03, 1.175e-01, 1.685e-01, -1.427e-01, 2.152e-01, -2.155e-01, 1.898e-03, 5.924e-02, -1.089e-02, 5.197e-02), r);
r = MulAdd(s1_7, M4(-9.679e-02, 1.961e-02, 1.636e-01, -6.049e-02, -7.071e-03, 1.519e-01, -6.303e-01, 4.739e-02, -3.331e-01, 8.291e-02, -5.944e-01, -7.677e-02, -1.164e-01, -4.580e-02, 1.419e-01, 6.839e-02), r);
r = MulAdd(s1_8, M4(6.174e-02, -4.004e-02, 1.256e-02, -4.981e-02, 4.659e-03, 8.371e-02, -1.664e-01, -2.897e-02, -1.253e-01, 2.381e-02, -1.147e-01, -8.724e-02, -3.736e-02, 1.140e-02, -1.550e-01, 6.350e-03), r);
return r;
}
// Pass 7 entry point: reads the 3x3 neighbourhood of t1 around one pixel, splits
// every tap into its positive and negative parts, feeds both sets to f0 and
// stores the result in t0.
// blockStart: origin of the block handled by this thread group.
// tid: thread id; only tid.x is used, remapped to a 2D offset by Rmp8x8.
void Pass7(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` are captured by name by the l0/O macros - keep these names.
	const float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();

	// Threads that fall outside the input have nothing to do.
	if (any(gxy >= inputSize)) {
		return;
	}

	const float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row from (-1, -1) to (1, 1).
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// Positive part of every tap (negative components clamped to zero).
	const V4 p0 = max(c0, 0.0);
	const V4 p1 = max(c1, 0.0);
	const V4 p2 = max(c2, 0.0);
	const V4 p3 = max(c3, 0.0);
	const V4 p4 = max(c4, 0.0);
	const V4 p5 = max(c5, 0.0);
	const V4 p6 = max(c6, 0.0);
	const V4 p7 = max(c7, 0.0);
	const V4 p8 = max(c8, 0.0);

	// Negative part of every tap (positive components clamped to zero).
	const V4 n0 = -max(-c0, 0.0);
	const V4 n1 = -max(-c1, 0.0);
	const V4 n2 = -max(-c2, 0.0);
	const V4 n3 = -max(-c3, 0.0);
	const V4 n4 = -max(-c4, 0.0);
	const V4 n5 = -max(-c5, 0.0);
	const V4 n6 = -max(-c6, 0.0);
	const V4 n7 = -max(-c7, 0.0);
	const V4 n8 = -max(-c8, 0.0);

	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// Tap fetch for conv7: reads t0 at offset (x, y) through O(). O() is defined outside
// this chunk and is expected to use the caller's `pos`/`pt` locals - confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv7 kernel. Starts from a 4-channel bias and accumulates 18 MulAdd terms, one 4x4
// weight matrix per input vector, then returns the sum (Pass8 writes it to t1).
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd is defined outside
// this chunk (presumably acc + mul(v, M) - confirm). Keep the term order: the running
// sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { 1.569e-02, 1.505e-02, 2.765e-02, 1.258e-02 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(-4.116e-02, -3.385e-02, -4.697e-02, 4.650e-02, -3.488e-02, 1.006e-01, -4.538e-03, 4.637e-02, 1.288e-01, 7.769e-03, 1.150e-01, -7.930e-03, 1.045e-02, 4.849e-02, 2.767e-02, 4.909e-02), r);
r = MulAdd(s0_1, M4(-5.332e-01, -5.254e-01, -3.541e-01, -3.525e-01, 1.117e-02, 2.929e-02, 6.817e-02, 9.115e-02, 1.055e+00, 6.141e-02, 3.976e-01, 4.649e-02, 2.561e-01, -1.191e-01, 1.230e-03, -1.047e-01), r);
r = MulAdd(s0_2, M4(-1.450e-01, -2.829e-01, -6.857e-01, -4.294e-01, 3.217e-02, 2.745e-02, 5.242e-02, 3.556e-02, 1.284e-01, 4.292e-01, 7.161e-01, 2.220e-01, -1.508e-02, 1.802e-01, 1.842e-01, 9.827e-02), r);
r = MulAdd(s0_3, M4(1.381e-01, 5.690e-02, 5.107e-02, 6.625e-02, -1.173e-01, -7.448e-02, -1.152e-01, -1.808e-01, -1.470e-01, -1.833e-01, -1.653e-01, -1.217e-01, 9.096e-02, 5.579e-02, 1.128e-02, 9.791e-02), r);
r = MulAdd(s0_4, M4(5.936e-01, -2.579e-01, 5.761e-01, -7.051e-01, -7.023e-01, 2.824e-01, 2.057e-01, 3.628e-01, 1.006e-02, 3.209e-01, 6.969e-02, -3.464e-01, 4.768e-01, -3.194e-01, -4.817e-02, 3.050e-02), r);
r = MulAdd(s0_5, M4(2.145e-01, -1.899e-01, 1.446e-01, 2.497e-02, -8.750e-02, -3.154e-01, -5.060e-01, -7.413e-02, -8.542e-02, -4.198e-02, -1.528e-01, -1.812e-01, -2.597e-01, 8.374e-02, -5.592e-01, -2.557e-01), r);
r = MulAdd(s0_6, M4(5.713e-02, -4.294e-03, 2.388e-02, -7.124e-02, -2.163e-02, -3.642e-03, 3.839e-02, -6.934e-02, -9.052e-02, -1.153e-02, 1.213e-02, 7.120e-02, -3.698e-02, 4.260e-02, -7.245e-02, 7.898e-02), r);
r = MulAdd(s0_7, M4(2.780e-02, 1.944e-02, 1.415e-01, 1.216e-01, 9.163e-02, -3.069e-02, -1.829e-02, -2.182e-01, 5.815e-02, -1.923e-02, -5.934e-02, -3.487e-02, -1.082e-01, 1.362e-01, 8.120e-02, 2.621e-01), r);
r = MulAdd(s0_8, M4(9.334e-03, -1.300e-02, 4.936e-02, 1.751e-01, -1.214e-01, 1.629e-02, -1.131e-01, 7.402e-02, 1.134e-02, 1.663e-03, -5.887e-03, -8.862e-02, 1.029e-01, -5.629e-02, 9.127e-02, -6.668e-02), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(1.165e-02, 4.389e-02, 6.299e-03, 7.939e-02, -2.769e-02, 9.353e-02, 6.239e-02, 1.341e-02, 4.713e-02, -2.731e-03, 5.256e-02, -3.515e-02, -8.911e-02, -1.425e-01, -7.889e-02, -1.627e-01), r);
r = MulAdd(s1_1, M4(-2.869e-01, 2.838e-02, -2.541e-02, 5.216e-02, 2.660e-01, -2.095e-01, 1.375e-01, -2.562e-02, 2.715e-01, 1.694e-01, 9.471e-02, -7.292e-03, 3.257e-01, -2.247e-01, 7.698e-03, -2.076e-01), r);
r = MulAdd(s1_2, M4(1.832e-02, -1.860e-01, -4.951e-02, -1.392e-03, 9.307e-02, 7.671e-02, 1.043e-01, -3.675e-02, 1.433e-03, 1.219e-01, 1.978e-01, 5.960e-02, 9.624e-02, 1.448e-01, 3.561e-01, 3.054e-02), r);
r = MulAdd(s1_3, M4(-4.647e-02, 4.225e-03, 3.830e-02, -3.233e-02, -1.532e-01, -6.289e-01, -3.037e-01, -4.131e-01, -1.794e-01, -4.090e-02, -9.644e-02, -4.828e-02, 7.978e-02, 6.792e-03, 5.043e-02, 4.905e-02), r);
r = MulAdd(s1_4, M4(2.967e-01, -5.750e-02, 1.168e-01, -2.681e-02, 1.232e-01, -2.481e-03, 8.164e-01, 2.468e-01, -3.721e-01, -5.041e-02, -4.796e-01, -2.778e-02, 3.623e-01, -8.387e-01, -5.229e-01, -4.492e-01), r);
r = MulAdd(s1_5, M4(-1.507e-02, 1.343e-01, 8.567e-02, 8.923e-02, -2.766e-03, -1.548e-01, -2.588e-01, -1.295e-01, 1.777e-02, -9.243e-02, -4.495e-02, -5.528e-02, -1.071e-01, -1.284e-01, -4.142e-01, -1.800e-01), r);
r = MulAdd(s1_6, M4(-4.695e-03, -9.431e-04, -1.256e-02, -4.959e-03, 1.607e-01, -8.763e-02, 2.039e-01, -1.243e-01, 3.725e-02, -5.612e-02, -3.615e-03, -2.475e-02, 4.955e-02, 4.065e-02, -1.879e-02, -1.195e-01), r);
r = MulAdd(s1_7, M4(-1.039e-02, 5.631e-02, 2.655e-02, 7.419e-02, 1.286e-01, -6.430e-02, 4.800e-02, -4.480e-02, 5.067e-03, -4.197e-02, -3.342e-02, -7.461e-02, -3.225e-02, 6.062e-03, 5.391e-02, -7.135e-02), r);
r = MulAdd(s1_8, M4(5.195e-02, 3.799e-02, 1.130e-01, -8.811e-03, -4.285e-02, 1.609e-02, -8.972e-03, 3.530e-02, -6.932e-02, -3.013e-03, -5.208e-02, 5.823e-02, -4.561e-02, -1.068e-01, -1.458e-01, -5.739e-02), r);
return r;
}
// Entry point of pass 8 (conv7): reads the 3x3 neighbourhood of t0 around one texel,
// splits every tap into its positive and negative part and stores f0(...) in t1.
// blockStart is the origin of the block this thread group handles and tid.x the flat
// thread index inside it. Rmp8x8 is defined outside this chunk (presumably it remaps
// the flat index to a 2D offset inside the 8x8 block - confirm).
void Pass8(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: l0() expands to O(), which is defined
	// outside this chunk and is expected to reference them - confirm before renaming.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Threads that fall outside the input have nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood of t0, row by row starting at offset (-1, -1).
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the positive parts of the nine taps first, then the negative parts.
	t1[texel] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// Tap fetch for conv8: reads t1 at offset (x, y) through O(). O() is defined outside
// this chunk and is expected to use the caller's `pos`/`pt` locals - confirm.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv8 kernel. Starts from a 4-channel bias and accumulates 18 MulAdd terms, one 4x4
// weight matrix per input vector, then returns the sum (Pass9 writes it to t0).
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd is defined outside
// this chunk (presumably acc + mul(v, M) - confirm). Keep the term order: the running
// sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { 8.112e-05, 3.290e-03, -6.342e-04, 1.340e-02 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(4.724e-03, 6.987e-03, -3.797e-03, 2.147e-02, -5.616e-03, 1.123e-02, -2.768e-02, 8.185e-03, -4.051e-03, 5.608e-05, -9.522e-02, 2.924e-02, -5.976e-03, 8.331e-03, 7.513e-02, -2.513e-02), r);
r = MulAdd(s0_1, M4(-2.224e-02, 1.093e-04, 5.901e-02, 2.350e-02, 1.167e-01, -7.837e-02, 1.939e-01, 1.987e-01, 5.530e-02, -4.759e-05, 1.221e-01, 4.764e-02, -8.813e-02, 7.695e-02, -4.577e-01, 1.671e-02), r);
r = MulAdd(s0_2, M4(-5.696e-02, 6.005e-03, -5.620e-02, -8.978e-02, 4.014e-02, -3.822e-02, 1.081e-01, -6.532e-03, 9.444e-03, 7.498e-03, -3.228e-02, 4.908e-02, -2.043e-02, 2.374e-02, 2.163e-02, -4.505e-02), r);
r = MulAdd(s0_3, M4(-8.098e-02, 1.943e-02, -5.744e-02, 3.824e-02, -2.071e-01, 1.036e-01, -6.926e-02, -2.348e-01, 2.378e-01, -1.069e-01, -5.307e-02, 1.161e-01, 1.881e-01, -5.785e-02, -6.570e-02, 2.227e-01), r);
r = MulAdd(s0_4, M4(7.577e-02, -4.125e-02, 1.714e-01, -6.934e-01, -2.448e-01, 1.146e-01, 2.354e-01, -4.935e-01, -2.321e-01, -8.273e-02, 5.890e-02, 5.704e-01, 4.833e-02, 2.875e-02, 1.163e-01, -1.802e-01), r);
r = MulAdd(s0_5, M4(2.287e-01, -3.461e-02, -2.542e-02, 2.882e-02, 7.142e-02, -1.556e-01, 4.055e-02, 1.534e-02, -1.647e-01, 3.087e-03, -6.811e-02, -3.896e-02, 1.334e-01, 1.188e-01, -1.847e-01, 4.293e-02), r);
r = MulAdd(s0_6, M4(-3.094e-02, 2.712e-03, 3.387e-03, 1.877e-02, 9.494e-02, -2.863e-02, -4.239e-02, -3.402e-02, 5.541e-03, -1.178e-02, 1.795e-02, -3.515e-02, -3.044e-02, -2.463e-02, -1.320e-02, 8.952e-02), r);
r = MulAdd(s0_7, M4(1.035e-01, -3.181e-02, 1.902e-02, 3.973e-03, 2.267e-01, -2.620e-01, 1.821e-01, 1.631e-01, 1.494e-02, 6.125e-02, -6.176e-02, -2.497e-02, -1.364e-02, 7.542e-02, -8.480e-02, -4.648e-02), r);
r = MulAdd(s0_8, M4(-1.466e-01, 3.028e-02, 2.798e-02, -7.887e-02, -4.370e-02, 1.408e-02, -6.161e-02, -3.034e-02, 6.567e-02, 2.071e-02, 3.126e-02, 6.993e-02, -5.556e-02, 1.507e-02, 2.991e-02, -4.924e-02), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(1.637e-02, -2.767e-02, 8.568e-02, -4.254e-02, 3.215e-02, 1.987e-04, -3.697e-02, 3.787e-02, 2.236e-02, -6.576e-02, 7.400e-02, 1.093e-01, 3.271e-03, 1.809e-03, 1.011e-02, 1.509e-01), r);
r = MulAdd(s1_1, M4(5.538e-02, -5.865e-02, 4.351e-01, 2.494e-01, 1.101e-01, -1.484e-02, 5.176e-01, 3.999e-02, -4.782e-03, 1.155e-01, -2.099e-01, 5.012e-03, -1.919e-01, 2.292e-01, -5.378e-01, -1.223e-01), r);
r = MulAdd(s1_2, M4(-5.691e-02, 7.653e-02, -2.572e-01, -1.332e-01, -5.652e-02, -5.008e-02, 7.840e-02, -3.729e-02, 6.942e-02, 6.483e-04, -2.243e-05, 8.430e-02, -6.848e-02, -2.096e-02, -3.908e-02, -9.062e-02), r);
r = MulAdd(s1_3, M4(2.725e-01, -1.841e-01, -6.710e-03, 3.965e-01, -1.298e-01, -4.014e-03, 2.007e-01, -3.700e-01, 5.329e-01, -4.014e-01, 2.619e-02, 1.606e-01, 2.179e-01, -1.403e-01, 4.227e-02, 8.568e-02), r);
r = MulAdd(s1_4, M4(4.188e-01, -7.320e-01, -5.609e-01, -6.087e-01, -7.521e-01, 7.363e-01, -6.253e-01, -2.011e-01, -1.017e+00, 3.331e-02, -2.135e-02, 2.084e-01, 6.074e-01, -9.824e-01, 5.154e-01, 1.748e-01), r);
r = MulAdd(s1_5, M4(1.733e-01, 5.176e-01, -7.335e-02, 1.899e-02, 1.028e-01, -6.330e-02, -1.632e-01, 9.241e-05, -1.357e-01, -1.131e-01, 9.644e-02, -1.424e-02, -1.835e-02, 7.296e-01, -3.204e-01, -2.966e-02), r);
r = MulAdd(s1_6, M4(4.798e-02, -1.047e-01, 3.646e-02, 6.703e-02, 5.371e-02, 4.759e-02, -2.975e-02, -6.945e-02, 7.985e-02, -1.101e-01, 3.034e-02, -1.472e-02, -3.827e-02, 9.839e-03, -4.922e-03, 4.307e-03), r);
r = MulAdd(s1_7, M4(-8.950e-02, -1.253e-02, -1.730e-05, 3.862e-02, 2.692e-01, -4.645e-01, 2.399e-01, 2.744e-01, -4.503e-02, 1.724e-01, -7.935e-02, -5.200e-02, -2.132e-03, -1.926e-02, 2.926e-02, -2.288e-02), r);
r = MulAdd(s1_8, M4(-1.072e-01, -1.145e-02, 6.605e-03, -1.090e-01, -7.524e-03, 8.598e-02, -7.698e-02, -6.976e-02, 5.869e-02, -5.499e-02, 3.529e-02, 7.813e-02, -1.794e-01, 4.212e-02, -4.479e-03, -7.253e-02), r);
return r;
}
// Entry point of pass 9 (conv8): reads the 3x3 neighbourhood of t1 around one texel,
// splits every tap into its positive and negative part and stores f0(...) in t0.
// blockStart is the origin of the block this thread group handles and tid.x the flat
// thread index inside it. Rmp8x8 is defined outside this chunk (presumably it remaps
// the flat index to a 2D offset inside the 8x8 block - confirm).
void Pass9(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: l0() expands to O(), which is defined
	// outside this chunk and is expected to reference them - confirm before renaming.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Threads that fall outside the input have nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood of t1, row by row starting at offset (-1, -1).
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the positive parts of the nine taps first, then the negative parts.
	t0[texel] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// Tap fetch for the output pass: reads t0 at offset (x, y) through O(). O() is defined
// outside this chunk and is expected to use the caller's `pos`/`pt` locals - confirm.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Output kernel. Starts from a 4-channel bias, accumulates 18 MulAdd terms (one 4x4
// weight matrix per input vector) and returns tanh of the sum, so every component
// lies in [-1, 1]. Pass10 uses the four components as offsets for a 2x2 output quad.
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd is defined outside
// this chunk (presumably acc + mul(v, M) - confirm). Keep the term order: the running
// sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { 3.107e-03, 3.655e-03, 5.416e-04, 5.397e-04 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(-1.348e-01, -9.107e-02, -4.849e-02, 4.484e-04, -3.384e-02, -6.768e-02, -9.628e-03, -1.766e-02, -9.939e-03, -2.182e-02, -1.288e-02, 8.518e-03, 2.218e-02, -1.184e-03, 1.240e-03, 1.065e-02), r);
r = MulAdd(s0_1, M4(7.301e-02, 1.014e-01, -1.363e-02, -4.850e-02, -2.842e-01, -3.060e-02, -4.154e-02, 4.057e-03, -3.458e-02, -6.335e-02, -2.660e-02, -1.335e-02, -7.944e-03, 8.560e-03, 4.588e-02, 7.580e-03), r);
r = MulAdd(s0_2, M4(-9.349e-05, -2.142e-02, -1.258e-04, -9.330e-03, -5.058e-03, 3.912e-02, -2.976e-02, 2.410e-02, -8.512e-03, 4.954e-02, -2.093e-02, -2.582e-03, 1.648e-02, 7.942e-03, 1.520e-02, 3.414e-02), r);
r = MulAdd(s0_3, M4(-1.021e-01, -1.140e-01, 2.412e-01, 1.289e-02, -1.192e-01, -1.140e-01, 2.395e-01, 6.930e-03, -2.027e-01, -4.824e-02, 1.243e-01, 3.820e-03, 9.280e-03, 2.866e-02, 2.106e-02, 3.644e-03), r);
r = MulAdd(s0_4, M4(8.727e-02, 8.162e-02, 1.478e-01, 5.348e-01, -3.115e-01, -2.605e-02, 1.510e-01, 7.249e-01, -8.110e-02, -6.698e-01, 1.080e-01, -8.090e-02, -3.492e-01, -1.891e-01, -1.877e-01, -1.319e-01), r);
r = MulAdd(s0_5, M4(-5.622e-03, 2.237e-02, -4.008e-03, -1.980e-02, -1.837e-02, -3.311e-02, 4.289e-02, 3.256e-02, -2.178e-02, 2.653e-02, -1.722e-03, 8.373e-02, -8.042e-02, -2.962e-01, -4.643e-03, -6.865e-02), r);
r = MulAdd(s0_6, M4(6.248e-03, -2.320e-02, 1.883e-03, -1.430e-02, 1.224e-02, 5.634e-03, -1.964e-02, -1.627e-02, 2.010e-02, 1.174e-02, -3.919e-02, 9.559e-04, 3.016e-02, -2.836e-03, 7.667e-02, 3.552e-02), r);
r = MulAdd(s0_7, M4(-6.141e-03, 1.380e-02, 1.024e-02, -1.210e-02, 4.548e-02, 3.626e-02, -9.142e-02, -7.666e-02, -3.241e-02, -2.296e-02, -3.244e-02, -2.870e-01, 4.427e-02, 8.899e-02, -1.327e-01, 4.920e-02), r);
r = MulAdd(s0_8, M4(-2.801e-03, -9.930e-04, -2.770e-03, 1.623e-02, 2.158e-03, -1.258e-02, -3.089e-02, 3.211e-02, -9.620e-03, 1.776e-02, -4.337e-03, 4.676e-02, 1.130e-02, -7.436e-03, -3.572e-02, -1.742e-01), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(-4.306e-02, -6.039e-02, -1.642e-02, -1.966e-02, -5.996e-02, -1.743e-01, -3.128e-02, 1.714e-02, -4.357e-03, -7.720e-03, -4.532e-03, 4.571e-03, 3.988e-02, 2.067e-02, 1.548e-02, -2.964e-04), r);
r = MulAdd(s1_1, M4(-6.070e-02, -9.324e-02, 7.472e-03, 2.173e-02, -7.996e-02, -5.139e-02, -5.545e-02, -1.891e-02, -1.767e-02, -1.527e-02, -2.906e-02, 1.310e-02, 2.594e-02, 7.495e-02, -7.681e-03, -4.678e-03), r);
r = MulAdd(s1_2, M4(4.883e-02, -3.167e-02, 2.862e-02, 3.357e-02, -8.454e-03, 9.265e-03, -1.657e-02, -8.086e-03, -1.170e-02, -3.549e-02, 7.437e-03, 1.425e-02, 1.441e-02, -1.961e-02, 1.560e-02, -1.122e-02), r);
r = MulAdd(s1_3, M4(-6.156e-02, -6.763e-02, 1.987e-01, 2.459e-02, -5.710e-02, -2.009e-01, 4.581e-01, -1.181e-02, -9.054e-02, -5.658e-02, 3.432e-02, 2.004e-02, 6.965e-03, -1.655e-02, 5.178e-03, -1.236e-02), r);
r = MulAdd(s1_4, M4(-1.301e-01, -5.093e-02, 7.676e-01, 6.003e-01, -9.216e-02, -2.228e-03, 7.034e-02, 1.851e-01, -5.318e-01, -1.852e-01, -2.980e-02, 1.919e-02, -8.147e-01, -1.773e-01, -2.675e-01, 2.314e-02), r);
r = MulAdd(s1_5, M4(-9.962e-03, -1.888e-01, -1.877e-02, 1.190e-01, -2.283e-02, -1.241e-02, 2.969e-04, 3.894e-02, -5.077e-02, 2.300e-01, -6.192e-02, 1.793e-01, 5.384e-03, -4.378e-01, 2.970e-02, -1.125e-01), r);
r = MulAdd(s1_6, M4(1.376e-02, -2.891e-03, 9.292e-03, -1.288e-03, 2.615e-02, 2.656e-02, -8.111e-02, -1.779e-02, -7.512e-03, 9.174e-03, -4.553e-02, -1.139e-02, 2.259e-02, 4.351e-03, 4.963e-02, 2.002e-02), r);
r = MulAdd(s1_7, M4(9.993e-03, 1.250e-02, -2.090e-02, 6.839e-03, 3.502e-03, 2.070e-03, -5.530e-02, -2.855e-03, 1.144e-02, -4.191e-02, -8.395e-02, -2.056e-01, 6.909e-02, 7.425e-02, -2.374e-01, 8.636e-02), r);
r = MulAdd(s1_8, M4(-1.762e-02, -3.804e-04, 2.643e-02, 4.383e-02, 1.748e-03, -1.201e-02, -8.452e-03, -1.216e-02, -1.203e-02, -3.454e-02, -1.957e-02, 2.212e-01, -1.375e-02, -1.094e-02, -1.245e-02, -2.124e-01), r);
return tanh(r);
}
// Entry point of pass 10 (out-shuffle). Each thread evaluates f0 once for one t0 texel
// and writes a 2x2 quad of OUTPUT pixels: every pixel is INPUT sampled at that pixel's
// position, with one component of the f0 result added to the first channel of the
// rgb2yuv-transformed colour (clamped to [0, 1]) before converting back with yuv2rgb.
// blockStart is the origin of the 16x16 output block and tid.x the flat thread index.
// Rmp8x8 is defined outside this chunk (presumably it remaps the flat index to a 2D
// offset inside an 8x8 grid - confirm).
void Pass10(uint2 blockStart, uint3 tid) {
// `pt` and `pos` must keep these names: l0() expands to O(), which is defined outside
// this chunk and is expected to reference them - confirm before renaming.
float2 pt = float2(GetInputPt());
// Thread offset doubled: gxy is the top-left pixel of this thread's 2x2 output quad.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
uint2 size = GetOutputSize();
// Only the quad's top-left pixel is tested against the output size.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// gxy >> 1 is the t0 texel that produces this quad; pos is scaled by pt from its centre.
float2 pos = ((gxy >> 1) + 0.5) * pt;
// Raw 3x3 neighbourhood of t0, row by row starting at offset (-1, -1).
V4 s0_0 = l0(-1.0, -1.0);
V4 s0_1 = l0(0.0, -1.0);
V4 s0_2 = l0(1.0, -1.0);
V4 s0_3 = l0(-1.0, 0.0);
V4 s0_4 = l0(0.0, 0.0);
V4 s0_5 = l0(1.0, 0.0);
V4 s0_6 = l0(-1.0, 1.0);
V4 s0_7 = l0(0.0, 1.0);
V4 s0_8 = l0(1.0, 1.0);
// Negative parts of the taps; they must be taken before s0_* is clamped below.
V4 s1_0 = -max(-s0_0, 0.0);
V4 s1_1 = -max(-s0_1, 0.0);
V4 s1_2 = -max(-s0_2, 0.0);
V4 s1_3 = -max(-s0_3, 0.0);
V4 s1_4 = -max(-s0_4, 0.0);
V4 s1_5 = -max(-s0_5, 0.0);
V4 s1_6 = -max(-s0_6, 0.0);
V4 s1_7 = -max(-s0_7, 0.0);
V4 s1_8 = -max(-s0_8, 0.0);
// Positive parts, overwriting the raw taps.
s0_0 = max(s0_0, 0.0);
s0_1 = max(s0_1, 0.0);
s0_2 = max(s0_2, 0.0);
s0_3 = max(s0_3, 0.0);
s0_4 = max(s0_4, 0.0);
s0_5 = max(s0_5, 0.0);
s0_6 = max(s0_6, 0.0);
s0_7 = max(s0_7, 0.0);
s0_8 = max(s0_8, 0.0);
// Four offsets, one per pixel of the quad (f0 applies tanh, so each is in [-1, 1]).
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
// Colour transforms used to edit only the first (luma-like) channel.
static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };
float2 opt = float2(GetOutputPt());
// Step back half an output texel on both axes - presumably onto the centre of the
// quad's top-left output pixel, assuming OUTPUT is twice the size of INPUT (confirm).
pos -= 0.5f * opt;
// The quad is walked clockwise and pos is updated incrementally; keep this exact
// sequence, because the sample coordinates are accumulated in floating point.
// Top-left pixel: offset r.x.
MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.x), yuv.yz)), 1);
// Top-right pixel: offset r.y.
++gxy.x;
pos.x += opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.y), yuv.yz)), 1);
// Bottom-right pixel: offset r.w.
++gxy.y;
pos.y += opt.y;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.w), yuv.yz)), 1);
// Bottom-left pixel: offset r.z.
--gxy.x;
pos.x -= opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = MF4(mul(yuv2rgb, MF3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

View file

@ -1,772 +0,0 @@
// CuNNy 8x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N08
//!USE MulAdd
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
// Effect resources: the INPUT and OUTPUT textures (OUTPUT is declared at twice
// INPUT's width and height), the point sampler SP and the linear sampler SL.
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): samples texture t with the point sampler SP at mip level 0, at pos + p * pt.
// It reads `pos` and `pt` from the scope where it expands, so every pass function must
// define locals with exactly those names before using it.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Shorthands for the 4-component vector and 4x4 matrix types (MF4 and MF4x4 are
// defined outside this chunk).
#define V4 MF4
#define M4 MF4x4
// Intermediate textures, both declared at INPUT size with format R8G8B8A8_SNORM.
// The passes alternate between them: each pass reads one and writes the other.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// Tap fetch for the input pass: a weighted sum of the RGB channels of INPUT at offset
// (x, y), plus a constant bias, giving one scalar per tap. O() reads the caller's
// `pos` and `pt` locals.
#define l0(x, y) (dot(MF3(2.666e-01, 5.050e-01, 1.135e-01), O(INPUT, float2(x, y)).rgb) + MF(-8.258e-01))
// Input kernel: turns the nine scalar taps of the 3x3 neighbourhood (s0_0..s0_8, row
// by row from (-1,-1) to (1,1)) into a 4-channel value. Each tap scales its own
// weight vector and the products are accumulated onto a bias with mad(), in tap order.
V4 f0(MF s0_0, MF s0_1, MF s0_2, MF s0_3, MF s0_4, MF s0_5, MF s0_6, MF s0_7, MF s0_8) {
	// Per-channel bias.
	V4 acc = V4(-8.495e-04, -1.121e-04, 1.842e-02, 5.844e-02);
	// Top row.
	acc = mad(s0_0, V4(-2.544e-02, -4.130e-01, -2.634e-01, 2.417e-02), acc);
	acc = mad(s0_1, V4(1.256e-02, -8.013e-02, 9.539e-02, -7.111e-02), acc);
	acc = mad(s0_2, V4(1.768e-02, -2.469e-01, -1.627e-01, 8.569e-02), acc);
	// Middle row.
	acc = mad(s0_3, V4(-1.554e-01, 3.441e-02, -1.508e-01, 2.491e-02), acc);
	acc = mad(s0_4, V4(1.628e-01, 8.679e-01, -1.960e-02, -5.810e-01), acc);
	acc = mad(s0_5, V4(-1.237e-02, -1.704e-01, 2.915e-01, -5.922e-01), acc);
	// Bottom row.
	acc = mad(s0_6, V4(7.925e-01, 5.570e-03, 7.074e-02, 4.442e-04), acc);
	acc = mad(s0_7, V4(-7.910e-01, -1.530e-02, -8.229e-02, 3.149e-03), acc);
	acc = mad(s0_8, V4(-3.973e-03, 2.262e-02, -1.213e-01, 3.843e-02), acc);
	return acc;
}
// Entry point of pass 1 (in): fetches the nine scalar taps of the 3x3 neighbourhood
// of INPUT around one texel and stores f0(...) in t0.
// blockStart is the origin of the block this thread group handles and tid.x the flat
// thread index inside it. Rmp8x8 is defined outside this chunk (presumably it remaps
// the flat index to a 2D offset inside the 8x8 block - confirm).
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Threads that fall outside the input have nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// 3x3 neighbourhood, row by row starting at offset (-1, -1).
	const MF in0 = l0(-1.0, -1.0);
	const MF in1 = l0(0.0, -1.0);
	const MF in2 = l0(1.0, -1.0);
	const MF in3 = l0(-1.0, 0.0);
	const MF in4 = l0(0.0, 0.0);
	const MF in5 = l0(1.0, 0.0);
	const MF in6 = l0(-1.0, 1.0);
	const MF in7 = l0(0.0, 1.0);
	const MF in8 = l0(1.0, 1.0);

	t0[texel] = f0(in0, in1, in2, in3, in4, in5, in6, in7, in8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// Tap fetch for conv1: reads t0 at offset (x, y) through O(), which uses the caller's
// `pos` and `pt` locals.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv1 kernel. Starts from a 4-channel bias and accumulates 18 MulAdd terms, one 4x4
// weight matrix per input vector, then returns the sum (Pass2 writes it to t1).
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd comes from the
// framework (//!USE MulAdd; presumably acc + mul(v, M) - confirm). Keep the term
// order: the running sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { 1.016e-03, 5.583e-03, -1.608e-02, -1.996e-04 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(4.254e-02, 1.997e-01, 4.636e-02, -4.800e-02, 2.043e-01, -4.096e-02, -7.212e-02, 1.408e-02, -3.916e-01, 2.630e-03, 7.016e-02, 9.613e-02, 1.773e-01, -2.723e-01, -9.458e-02, -1.890e-01), r);
r = MulAdd(s0_1, M4(2.350e-01, -8.474e-01, -4.044e-01, -9.188e-01, 9.560e-03, 5.061e-02, 1.092e-02, 1.781e-01, -2.144e-01, 3.203e-02, 6.349e-02, -8.272e-02, -3.105e-01, -3.917e-02, -1.320e-02, -1.541e-01), r);
r = MulAdd(s0_2, M4(-8.130e-01, -1.003e-01, 8.195e-02, -7.597e-01, 5.207e-02, 3.470e-02, -8.823e-03, -1.131e-01, -4.029e-02, 7.571e-02, -2.010e-01, 2.487e-01, 1.677e-01, -5.118e-02, -1.070e-01, 7.606e-02), r);
r = MulAdd(s0_3, M4(-1.158e-02, 4.898e-02, 1.202e-02, 5.012e-01, -5.343e-02, 4.756e-02, -2.438e-01, 6.399e-02, 2.822e-01, -2.863e-02, 1.996e-01, -7.099e-02, -1.323e-01, -3.797e-01, 5.385e-02, -1.014e-01), r);
r = MulAdd(s0_4, M4(2.812e-01, 7.903e-01, -1.733e-01, 6.668e-01, 4.775e-01, 5.452e-01, 7.089e-01, -1.851e-01, -2.382e-01, -5.180e-02, -3.623e-01, -3.040e-01, -4.313e-01, -1.167e-02, 1.235e-01, 1.436e-01), r);
r = MulAdd(s0_5, M4(-1.291e-01, -3.022e-02, -4.083e-01, -5.939e-02, -4.249e-01, -1.750e-01, 1.094e-01, -1.176e-01, 1.374e-02, 1.342e-01, 2.086e-01, 2.841e-01, 2.347e-01, 1.450e-01, 7.604e-02, 2.176e-01), r);
r = MulAdd(s0_6, M4(8.130e-02, -7.215e-02, -5.249e-02, 9.518e-03, -1.979e-01, -4.441e-02, -1.857e-01, -4.227e-01, 2.149e-01, -1.610e-01, 1.655e-01, -8.841e-02, 1.409e-01, -1.059e-01, 2.037e-01, -2.744e-03), r);
r = MulAdd(s0_7, M4(-7.266e-02, 1.638e-02, -1.639e-01, 1.957e-02, -2.857e-01, 1.936e-01, -1.243e-01, -1.490e-01, 1.525e-01, -8.934e-02, 7.415e-02, -1.779e-01, 1.648e-02, -6.456e-02, 7.053e-02, -9.530e-02), r);
r = MulAdd(s0_8, M4(-6.960e-02, -8.960e-02, -1.757e-02, -1.370e-01, -5.137e-01, -1.179e-01, -4.053e-01, -1.987e-01, 7.100e-02, 2.928e-02, -9.682e-02, 2.403e-01, 1.814e-01, 2.131e-02, 5.579e-02, 5.457e-02), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(-2.737e-02, 5.272e-02, -1.801e-02, -2.491e-01, 2.871e-01, -3.704e-02, -6.568e-02, 2.905e-02, 1.011e-01, -3.782e-01, -8.696e-02, 4.682e-01, 3.233e-01, -3.060e-01, -3.251e-02, 1.165e+00), r);
r = MulAdd(s1_1, M4(-4.994e-01, 3.049e-02, -8.802e-02, -6.179e-02, 7.133e-02, -1.957e-02, -4.465e-02, 1.130e-01, 7.255e-02, 6.956e-03, -1.204e-01, 3.699e-01, -8.844e-02, 4.624e-01, -9.881e-02, -2.512e-01), r);
r = MulAdd(s1_2, M4(-3.645e-01, 1.274e-01, 2.387e-01, -1.963e-01, -5.995e-02, -5.943e-02, 9.694e-02, -2.518e-01, -2.797e-01, 1.598e-01, -1.371e-02, 4.000e-01, 2.213e-01, 9.692e-02, -3.302e-01, 1.132e+00), r);
r = MulAdd(s1_3, M4(-8.539e-03, -6.535e-02, 5.575e-02, 1.928e-01, 1.156e-01, 5.227e-02, -3.039e-01, 4.794e-01, 1.441e-01, 1.929e-01, -4.689e-02, 2.023e-02, 1.330e-01, -1.358e+00, -5.393e-01, 7.907e-01), r);
r = MulAdd(s1_4, M4(1.701e-01, -3.479e-02, 5.404e-01, -2.491e-01, 4.564e-01, 6.659e-01, 7.009e-01, -2.288e-02, -7.696e-01, -4.959e-01, 2.881e-01, -4.322e-01, -9.013e-01, -4.765e-01, 5.556e-02, -1.805e-01), r);
r = MulAdd(s1_5, M4(-2.424e-01, 8.034e-03, -4.699e-02, -2.628e-01, -4.682e-01, 2.977e-02, 2.258e-01, -1.419e-01, 3.514e-01, 6.860e-03, 2.147e-01, 3.806e-01, 3.747e-01, 1.403e-01, 3.106e-01, 9.680e-01), r);
r = MulAdd(s1_6, M4(1.776e-01, -4.873e-02, -1.403e-01, -1.817e-02, -3.551e-01, 4.838e-04, -2.786e-01, -6.048e-01, 3.082e-01, -4.703e-01, 2.419e-01, -3.002e-01, -4.310e-01, -6.490e-01, 1.343e+00, -1.019e+00), r);
r = MulAdd(s1_7, M4(4.689e-02, -2.927e-02, -7.494e-02, -3.516e-02, -2.217e-01, -3.189e-01, 2.202e-01, -2.936e-01, 4.772e-02, -1.609e-01, 9.853e-02, -4.214e-01, 2.780e-01, -1.073e-01, 1.102e-01, -2.033e-01), r);
r = MulAdd(s1_8, M4(-9.468e-02, 4.428e-02, 1.269e-01, -1.086e-01, -1.106e-01, -1.367e-01, -3.356e-01, 4.656e-03, 4.648e-02, -1.743e-02, -2.074e-01, -3.745e-02, 1.281e-01, -3.233e-01, 6.533e-01, 3.705e-01), r);
return r;
}
// Entry point of pass 2 (conv1): reads the 3x3 neighbourhood of t0 around one texel,
// splits every tap into its positive and negative part and stores f0(...) in t1.
// blockStart is the origin of the block this thread group handles and tid.x the flat
// thread index inside it. Rmp8x8 is defined outside this chunk (presumably it remaps
// the flat index to a 2D offset inside the 8x8 block - confirm).
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Threads that fall outside the input have nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood of t0, row by row starting at offset (-1, -1).
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the positive parts of the nine taps first, then the negative parts.
	t1[texel] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// Tap fetch for conv2: reads t1 at offset (x, y) through O(), which uses the caller's
// `pos` and `pt` locals.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv2 kernel. Starts from a 4-channel bias and accumulates 18 MulAdd terms, one 4x4
// weight matrix per input vector, then returns the sum (Pass3 writes it to t0).
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd comes from the
// framework (//!USE MulAdd; presumably acc + mul(v, M) - confirm). Keep the term
// order: the running sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { -1.437e-02, -2.276e-02, 2.275e-02, 6.547e-04 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(-4.810e-02, 2.379e-02, -8.471e-02, 1.305e-01, -5.897e-02, 1.263e-01, -9.639e-02, 9.150e-02, 9.002e-03, -1.763e-01, 8.275e-02, -2.357e-01, 7.181e-02, -7.360e-02, 4.629e-02, -8.259e-02), r);
r = MulAdd(s0_1, M4(6.774e-02, 9.108e-02, -3.750e-01, 8.014e-02, 2.890e-01, 9.986e-02, -1.262e-02, -1.285e-01, -2.789e-01, -1.145e-01, -4.982e-02, -1.101e-01, -2.051e-02, -2.271e-01, 1.343e-01, -8.643e-02), r);
r = MulAdd(s0_2, M4(-5.433e-02, 6.899e-02, -3.350e-01, -7.837e-02, -1.076e-01, 1.912e-02, -9.061e-02, 1.919e-01, 9.387e-02, -4.206e-02, 1.861e-01, -4.416e-03, -1.560e-01, -4.364e-02, 4.364e-01, 8.765e-02), r);
r = MulAdd(s0_3, M4(2.382e-01, 3.032e-01, -1.313e-01, -1.154e-01, 1.008e-01, 3.058e-01, -8.513e-02, 2.713e-01, -9.875e-02, 3.017e-01, 3.203e-02, 5.762e-01, -2.056e-03, -7.698e-02, 8.681e-02, 4.245e-02), r);
r = MulAdd(s0_4, M4(2.643e-01, 1.750e-01, 4.850e-02, 3.131e-03, 2.785e-01, 1.598e-01, 5.772e-01, -4.118e-04, -4.270e-01, -2.447e-01, 4.486e-01, 9.155e-02, -3.428e-01, -2.583e-01, -3.721e-02, 6.278e-02), r);
r = MulAdd(s0_5, M4(-1.080e-01, -5.514e-02, -3.648e-01, -2.319e-02, -2.100e-01, -4.065e-02, 1.126e-01, 3.970e-02, 9.824e-02, 1.377e-02, 1.295e-01, -2.512e-02, 1.115e-01, 7.094e-02, 3.413e-01, -5.245e-02), r);
r = MulAdd(s0_6, M4(1.991e-01, 4.710e-02, -9.305e-02, -1.471e-01, -8.221e-02, 1.134e-01, -1.718e-01, -2.606e-01, -8.167e-02, -1.462e-02, -1.094e-01, -1.569e-01, 2.133e-02, 3.374e-02, 4.583e-02, 1.228e-01), r);
r = MulAdd(s0_7, M4(-2.135e-01, 6.874e-02, -4.993e-02, 1.156e-02, -4.261e-01, 1.366e-01, 4.250e-02, -5.707e-02, -1.966e-01, -6.106e-02, 1.265e-01, -3.076e-03, 2.043e-03, -3.072e-02, 1.043e-01, 3.422e-01), r);
r = MulAdd(s0_8, M4(7.235e-02, -3.542e-04, -1.435e-02, -3.815e-02, -8.855e-02, 8.327e-02, 1.954e-01, 1.462e-01, 1.615e-01, -4.957e-02, 1.596e-02, -8.625e-02, 6.574e-02, -9.799e-02, 5.401e-03, 7.595e-02), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(1.245e-01, -2.812e-03, 1.486e-02, 1.246e-01, -5.943e-02, 1.170e-01, -1.068e-01, 8.960e-02, 5.354e-03, -2.039e-01, 8.228e-02, -2.530e-01, -2.789e-03, -6.932e-02, -3.187e-02, -5.794e-02), r);
r = MulAdd(s1_1, M4(-2.539e-02, 4.598e-02, -1.205e-01, 1.597e-01, 2.391e-01, 1.269e-01, -1.116e-02, 1.498e-02, -2.388e-01, -1.548e-01, -7.389e-02, -1.083e-02, -1.181e-01, -7.069e-02, 9.383e-03, -2.018e-01), r);
r = MulAdd(s1_2, M4(-1.248e-02, 3.267e-02, -2.761e-01, -2.043e-02, -8.520e-02, 3.937e-02, -1.372e-01, 1.821e-02, 6.915e-02, -4.061e-02, 1.782e-01, -4.619e-02, 6.811e-02, -5.458e-04, 3.193e-01, 8.892e-03), r);
r = MulAdd(s1_3, M4(-1.580e-01, 7.536e-02, -6.680e-02, 1.891e-01, 1.196e-01, 3.476e-01, -6.321e-02, 1.972e-01, -9.851e-02, 4.483e-01, 9.326e-03, 5.272e-01, -1.478e-01, -4.009e-02, -3.561e-02, -2.549e-01), r);
r = MulAdd(s1_4, M4(-1.253e-01, 1.345e-01, 4.994e-01, 2.000e-01, 2.728e-01, 1.672e-01, 5.501e-01, -1.736e-02, -5.782e-01, -2.191e-01, 4.380e-01, 4.346e-02, -3.006e-01, -5.220e-02, -1.613e-01, 6.023e-02), r);
r = MulAdd(s1_5, M4(1.276e-01, -8.319e-02, -2.115e-01, 1.471e-01, -1.669e-01, -2.484e-02, 9.906e-02, 1.836e-02, 1.010e-01, 1.847e-02, 1.027e-01, -1.680e-02, -1.880e-01, 1.377e-01, 3.823e-02, -8.256e-02), r);
r = MulAdd(s1_6, M4(-3.200e-01, -7.023e-02, -1.243e-01, -2.003e-02, -7.863e-02, 6.650e-02, -1.264e-01, -1.862e-01, -9.119e-02, -4.374e-02, -1.195e-01, -6.902e-02, -1.360e-01, 3.356e-02, -3.667e-02, -1.815e-01), r);
r = MulAdd(s1_7, M4(1.462e-02, 1.001e-01, 2.453e-01, -1.298e-02, -4.372e-01, 1.509e-01, 8.011e-02, -1.323e-01, -1.980e-01, -4.785e-02, 1.733e-01, 1.100e-02, -2.153e-01, 6.711e-02, 2.595e-03, 1.213e-01), r);
r = MulAdd(s1_8, M4(-3.794e-03, 2.239e-02, -6.960e-02, 7.342e-02, -1.882e-01, 1.159e-01, 1.876e-01, 3.125e-02, 2.242e-01, -5.956e-02, 1.328e-02, -5.400e-02, 2.205e-02, -6.049e-02, -9.151e-02, -1.137e-01), r);
return r;
}
// Entry point of pass 3 (conv2): reads the 3x3 neighbourhood of t1 around one texel,
// splits every tap into its positive and negative part and stores f0(...) in t0.
// blockStart is the origin of the block this thread group handles and tid.x the flat
// thread index inside it. Rmp8x8 is defined outside this chunk (presumably it remaps
// the flat index to a 2D offset inside the 8x8 block - confirm).
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Threads that fall outside the input have nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 neighbourhood of t1, row by row starting at offset (-1, -1).
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the positive parts of the nine taps first, then the negative parts.
	t0[texel] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// Tap fetch for conv3: reads t0 at offset (x, y) through O(), which uses the caller's
// `pos` and `pt` locals.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv3 kernel. Starts from a 4-channel bias and accumulates 18 MulAdd terms, one 4x4
// weight matrix per input vector, then returns the sum (Pass4 writes it to t1).
//   s0_0..s0_8: max(tap, 0) of the 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
//   s1_0..s1_8: -max(-tap, 0) of the same taps.
// The constants are presumably generated network weights. MulAdd comes from the
// framework (//!USE MulAdd; presumably acc + mul(v, M) - confirm). Keep the term
// order: the running sum is floating point, so reordering can change the result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Per-channel bias.
V4 r = { 1.575e-02, -2.007e-01, -3.519e-03, -9.082e-03 };
// Contributions of the positive parts.
r = MulAdd(s0_0, M4(3.886e-03, -1.503e-01, -6.378e-01, 4.214e-02, -1.255e-01, 1.146e-01, -1.917e-01, -6.556e-02, -3.368e-02, 6.874e-02, 2.796e-01, -2.936e-02, -3.239e-02, 3.923e-02, -6.439e-02, 1.313e-02), r);
r = MulAdd(s0_1, M4(4.357e-01, -1.067e-01, 3.330e-01, -8.295e-02, -4.004e-01, 3.113e-01, -4.222e-02, 2.290e-01, -1.861e-01, 9.039e-02, -1.132e-01, 1.077e-01, -1.603e-02, 6.296e-02, 4.907e-01, 3.396e-02), r);
r = MulAdd(s0_2, M4(-3.290e-01, -1.073e-01, 1.064e-02, -2.792e-03, -4.366e-01, 3.239e-01, -1.383e-01, 1.918e-01, 3.058e-02, 1.006e-01, -6.898e-02, -1.451e-02, -1.882e-01, 2.248e-01, 1.744e-02, -3.155e-02), r);
r = MulAdd(s0_3, M4(2.403e-02, -1.353e-01, 1.895e-01, -2.285e-01, -1.211e-01, 1.771e-01, 2.135e-01, 1.900e-01, -4.204e-03, 3.719e-02, -4.772e-01, 2.006e-01, -2.532e-03, 5.872e-02, 2.901e-01, -9.450e-02), r);
r = MulAdd(s0_4, M4(8.054e-02, 1.389e-02, -2.060e-02, -3.042e-01, -2.476e-01, 9.905e-02, -9.248e-01, 3.372e-01, -5.254e-01, 4.455e-01, 5.707e-02, 1.057e-01, -3.525e-01, 3.349e-01, -3.414e-01, 7.090e-02), r);
r = MulAdd(s0_5, M4(-1.889e-01, -2.290e-01, -4.930e-02, -1.824e-01, -2.062e+00, 6.868e-02, 2.552e-01, 3.883e-01, 5.778e-02, 9.141e-02, 9.917e-02, -1.164e-01, 4.359e-02, 2.105e-01, -7.911e-02, -1.916e-01), r);
r = MulAdd(s0_6, M4(-2.267e-02, -6.231e-03, -9.718e-03, 3.770e-04, -6.982e-02, 4.184e-02, -2.296e-01, -9.542e-02, 5.236e-02, -5.412e-02, -1.757e-01, -1.054e-01, 1.414e-02, -7.772e-02, -1.338e-02, 3.928e-02), r);
r = MulAdd(s0_7, M4(5.776e-02, 4.703e-02, 3.914e-02, -1.617e-02, -3.606e-01, 3.037e-01, -3.096e-01, 3.562e-02, 3.108e-01, -3.684e-01, 3.725e-02, -2.050e-01, -1.494e-02, 8.741e-02, 5.992e-02, 2.655e-02), r);
r = MulAdd(s0_8, M4(3.614e-02, -1.212e-01, 2.507e-02, -5.858e-02, -1.121e-01, -3.433e-01, 6.613e-02, -6.943e-01, 2.233e-02, -5.467e-02, -6.900e-03, -2.566e-01, -1.106e-01, 2.016e-02, -3.700e-02, -2.886e-01), r);
// Contributions of the negative parts.
r = MulAdd(s1_0, M4(-5.136e-02, -2.190e-01, -1.035e+00, -5.722e-02, 2.876e-02, 5.070e-02, 3.532e-01, -6.778e-03, 2.930e-04, -6.219e-02, 2.314e-01, -5.210e-02, 1.508e-02, -4.390e-02, -7.749e-02, -9.658e-03), r);
r = MulAdd(s1_1, M4(3.663e-01, -9.746e-02, -6.582e-01, -3.676e-01, -1.694e-01, 7.883e-02, -1.613e-01, 2.328e-02, 2.595e-04, -3.763e-02, -9.946e-02, -6.137e-02, 1.429e-01, -1.964e-01, 2.439e-01, 4.898e-02), r);
r = MulAdd(s1_2, M4(7.884e-02, 1.842e-01, -1.309e-01, 4.895e-02, 4.820e-02, 8.364e-02, 1.189e-02, -1.438e-02, -7.934e-02, 4.775e-02, -6.137e-02, -1.335e-02, -4.416e-02, 3.584e-02, 1.751e-04, -1.178e-02), r);
r = MulAdd(s1_3, M4(-9.861e-03, -1.277e-01, 2.389e-03, -3.232e-01, -2.782e-03, 1.115e-01, -6.485e-02, 2.093e-01, 2.056e-01, 2.527e-02, -1.772e-01, 1.863e-02, 5.983e-02, -8.103e-02, 3.076e-01, -2.027e-01), r);
r = MulAdd(s1_4, M4(1.001e-01, 3.476e-01, -1.305e-01, -1.653e-01, 8.890e-02, -4.170e-01, -1.530e-01, 7.048e-02, -5.605e-01, 1.093e-01, 2.038e-01, -2.320e-01, -1.287e-01, -2.173e-01, -1.630e-01, -9.691e-02), r);
r = MulAdd(s1_5, M4(-2.778e-01, 1.393e-01, -2.802e-02, -5.375e-02, -4.550e-01, -1.661e-01, 2.293e-03, -5.984e-02, -5.070e-02, -8.852e-02, 7.806e-02, 2.187e-02, 1.901e-01, -3.219e-01, -1.937e-01, -2.336e-01), r);
r = MulAdd(s1_6, M4(-8.489e-02, 1.968e-01, -7.760e-02, 1.388e-01, 4.713e-03, 1.527e-01, 8.535e-02, 1.643e-02, 1.429e-01, -1.558e-01, 2.339e-01, 2.762e-01, 1.694e-02, -4.245e-02, -2.793e-02, -3.332e-02), r);
r = MulAdd(s1_7, M4(-4.377e-02, 3.486e-01, -1.766e-01, -1.065e-01, -1.645e-01, -8.722e-04, -1.147e-01, 1.663e-01, 6.801e-02, -3.539e-01, 1.560e-02, -1.819e-01, 1.440e-02, -1.221e-02, 3.693e-02, 5.886e-03), r);
r = MulAdd(s1_8, M4(5.940e-02, 1.624e-01, 1.526e-02, 6.692e-02, 1.812e-01, -8.647e-02, 3.210e-02, -3.751e-04, 2.884e-02, -4.717e-02, 4.121e-03, 5.144e-02, -1.995e-02, -2.827e-01, 6.148e-03, 7.209e-02), r);
return r;
}
// Pass4: evaluates this pass's f0 for one pixel and stores the result in t1.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): l0 for this pass is defined above this excerpt; presumably it
// samples the pass input around `pos` using `pt` - confirm before renaming
// either local.
void Pass4(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t1[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 (pass 5, "conv4" per the //!DESC directive): builds one V4 result from
// eighteen V4 inputs.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass5) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { 2.513e-04, -2.994e-02, -5.133e-02, -8.977e-03 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(-6.479e-02, -9.976e-02, -1.507e-01, -9.934e-02, -1.046e-02, -1.471e-01, -4.218e-02, -8.348e-04, -5.963e-02, 1.519e-03, 5.897e-03, 5.284e-02, -4.467e-01, 4.779e-01, -1.953e-02, 1.951e-01), r);
r = MulAdd(s0_1, M4(-5.276e-02, -1.201e-01, -1.160e-01, 6.076e-02, -4.798e-02, -3.491e-01, -3.055e-01, -1.607e-01, -8.989e-02, 1.221e-01, -1.561e-01, 6.227e-02, -1.598e-01, -6.666e-01, 6.029e-01, -5.466e-01), r);
r = MulAdd(s0_2, M4(-1.331e-01, -4.988e-02, -2.217e-02, 3.405e-02, 2.261e-02, 1.352e-01, 1.124e-02, 8.259e-02, -3.548e-02, 2.454e-01, 4.417e-02, 2.297e-01, 1.780e-01, -2.203e-01, 5.913e-02, -2.201e-01), r);
r = MulAdd(s0_3, M4(1.348e-01, 5.544e-01, -4.335e-01, -3.619e-01, 1.011e-01, 2.665e-01, -2.627e-01, -1.800e-01, -1.158e-01, -8.543e-02, -7.868e-03, 2.056e-01, 1.988e-01, 1.174e+00, -1.291e-01, 1.131e-01), r);
r = MulAdd(s0_4, M4(4.504e-01, 1.025e-01, -1.449e-01, -3.442e-02, -4.525e-01, -1.513e-01, -8.135e-02, -9.669e-02, -3.287e-01, 5.251e-01, -6.540e-01, 7.386e-02, 2.603e-01, -8.246e-01, -1.378e-01, 2.363e+00), r);
r = MulAdd(s0_5, M4(-7.102e-02, -5.554e-02, -3.489e-02, -6.688e-02, 2.877e-01, -6.258e-02, 8.515e-02, -2.109e-01, -2.723e-01, 1.543e-01, 1.285e-01, 9.366e-02, 3.135e-02, -3.700e-01, -4.111e-01, 1.822e+00), r);
r = MulAdd(s0_6, M4(-4.018e-02, -3.412e-01, 5.388e-02, 4.947e-01, -3.234e-02, -6.778e-02, 3.825e-02, 1.313e-01, -6.083e-02, 3.439e-02, -1.081e-01, 6.456e-02, 2.287e-02, -2.470e-01, 2.026e-02, -1.886e-02), r);
r = MulAdd(s0_7, M4(2.410e-01, 1.529e-01, -1.370e-01, -1.389e-01, 1.549e-01, 8.308e-03, 3.064e-02, 3.925e-02, -9.013e-02, 1.131e-01, -9.240e-02, 3.740e-01, -1.009e-01, -6.576e-02, -1.491e-01, -3.452e-02), r);
r = MulAdd(s0_8, M4(-1.628e-01, -2.480e-02, -6.569e-02, 3.873e-02, 1.604e-02, 1.651e-02, -4.681e-02, -1.647e-02, -1.648e-02, 1.541e-01, 2.284e-02, 6.545e-01, 1.799e-03, 1.193e-03, -1.215e-01, 5.919e-02), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(-1.115e-02, -5.014e-02, -1.499e-01, -7.414e-04, -6.944e-02, -4.168e-02, -1.254e-01, -6.576e-02, 2.946e-04, -2.669e-02, 4.109e-02, 1.949e-02, 1.242e-01, 1.753e-01, 9.717e-02, 1.446e-01), r);
r = MulAdd(s1_1, M4(-1.327e-02, -1.462e-01, -8.510e-02, -1.228e-02, 1.772e-01, 1.009e-01, -4.342e-02, -8.827e-02, -6.663e-02, -1.245e-01, -4.625e-02, -4.285e-02, 7.586e-02, -1.208e-01, 2.705e-01, -1.558e-01), r);
r = MulAdd(s1_2, M4(-7.024e-02, -3.045e-02, -1.916e-02, 4.979e-02, -9.145e-02, 2.285e-01, 4.612e-02, 2.217e-01, 7.690e-02, -4.332e-02, 6.032e-03, -2.370e-02, 3.802e-01, -8.124e-02, 1.982e-02, -8.310e-02), r);
r = MulAdd(s1_3, M4(1.238e-01, 5.787e-01, -5.332e-01, -2.806e-01, 1.208e-01, 6.549e-02, -2.040e-01, -2.578e-02, -5.878e-02, -1.496e-01, 1.213e-01, 1.489e-02, 9.569e-02, 1.964e-01, 6.477e-02, -2.939e-01), r);
r = MulAdd(s1_4, M4(5.825e-01, 2.257e-01, -1.943e-01, 1.101e-01, -3.240e-01, -2.967e-01, -4.203e-02, -3.636e-01, -1.062e-01, -3.799e-02, -4.444e-01, -7.607e-02, -3.056e-01, -2.926e-01, -4.582e-02, 2.795e-01), r);
r = MulAdd(s1_5, M4(-9.076e-02, -5.130e-02, -3.718e-02, -6.163e-02, 1.831e-01, -1.199e-01, 9.176e-02, -2.456e-01, 2.362e-01, -1.854e-01, -1.394e-01, 3.560e-03, 2.070e-02, -6.903e-02, -5.061e-02, 3.068e-02), r);
r = MulAdd(s1_6, M4(-4.988e-02, -3.880e-01, 3.001e-02, 3.892e-01, -2.827e-02, -2.880e-02, 4.071e-02, 2.861e-01, -4.016e-02, -1.085e-01, 9.207e-03, -7.367e-02, 9.072e-03, 8.960e-02, 5.334e-03, -6.480e-02), r);
r = MulAdd(s1_7, M4(2.900e-01, 1.450e-01, -1.401e-01, -2.809e-01, 1.218e-01, -3.153e-03, -2.544e-02, 1.898e-01, -7.197e-02, -3.721e-01, 4.042e-02, 9.918e-02, -1.132e-01, 3.578e-02, 4.000e-02, 6.991e-02), r);
r = MulAdd(s1_8, M4(-1.493e-01, -2.310e-02, -6.133e-02, 5.322e-02, -4.879e-02, -5.139e-02, -8.058e-02, 4.140e-02, 2.511e-01, 3.669e-02, -1.003e-01, -1.457e-01, 1.528e-01, 1.177e-01, 6.665e-02, -3.084e-02), r);
return r;
}
// Pass5 ("conv4" per this pass's //!DESC): evaluates f0 for one pixel,
// reading a 3x3 neighbourhood through l0 (which wraps O(t1, ...)) and storing
// the result in t0.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass5(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t0[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 (pass 6, "conv5" per the //!DESC directive): builds one V4 result from
// eighteen V4 inputs.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass6) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { 4.102e-03, 1.192e-03, -2.598e-03, -2.812e-03 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(4.575e-01, 2.412e-01, 1.926e-01, 5.873e-02, 2.954e-02, -1.424e-01, 7.881e-03, 2.358e-04, -5.872e-02, -1.007e-01, -3.632e-02, 5.718e-02, 1.389e-01, -4.163e-02, -1.379e-01, 2.160e-03), r);
r = MulAdd(s0_1, M4(1.347e-01, -8.074e-01, -1.155e-01, 2.242e-01, -2.673e-01, 4.053e-01, 8.867e-02, -2.840e-02, 9.443e-02, 2.632e-01, 9.207e-02, -1.793e-02, 1.519e-01, 3.302e-03, 2.027e-01, 2.643e-02), r);
r = MulAdd(s0_2, M4(1.462e-02, -7.543e-02, -6.080e-02, 7.431e-02, -3.673e-02, -1.665e-01, -2.745e-01, -4.416e-02, -3.270e-01, 7.677e-01, 7.241e-01, -1.157e-01, -8.204e-03, 2.172e-02, 3.183e-01, 3.931e-02), r);
r = MulAdd(s0_3, M4(1.168e+00, -8.427e-01, -3.237e-03, 5.416e-02, 1.694e-02, -1.042e-01, -2.173e-01, -1.089e-01, -9.881e-02, -1.109e-01, -1.003e-01, -5.080e-02, -9.279e-02, -1.111e-01, -2.699e-02, -2.297e-02), r);
r = MulAdd(s0_4, M4(-4.884e-01, -4.472e-01, -9.701e-02, 8.789e-01, 1.962e-02, 5.041e-01, 3.221e-01, -4.622e-02, 9.039e-02, -2.531e-01, 6.228e-01, 1.590e-02, 1.804e-02, 7.795e-02, -8.005e-02, -6.310e-03), r);
r = MulAdd(s0_5, M4(-6.567e-02, -5.161e-02, 5.550e-02, 5.285e-02, -6.147e-02, -1.840e-01, 2.028e-01, 4.014e-01, 4.070e-01, -1.022e-01, 1.414e+00, -3.126e-01, 7.508e-03, 1.013e-01, -7.300e-02, -4.282e-01), r);
r = MulAdd(s0_6, M4(1.721e+00, 1.776e-01, -8.690e-02, -1.102e-01, -8.467e-02, -2.165e-02, 6.238e-02, 2.052e-02, 2.763e-01, -3.472e-02, -1.179e-01, 2.993e-02, -6.860e-02, 1.887e-02, 3.140e-02, -6.853e-02), r);
r = MulAdd(s0_7, M4(1.937e-01, 1.975e-01, -2.456e-01, -1.360e+00, 1.792e-01, -5.969e-02, -7.670e-02, 2.606e-01, 1.355e-01, -9.109e-03, 2.756e-01, 6.674e-02, 1.312e-02, -1.542e-02, 2.236e-02, 1.997e-01), r);
r = MulAdd(s0_8, M4(4.255e-02, -1.452e-02, -8.732e-02, -1.084e-01, 1.495e-02, 1.302e-02, -9.151e-02, -2.814e-01, 5.197e-02, 2.866e-02, 5.490e-01, 4.310e-01, 3.666e-02, -3.380e-03, -2.830e-02, -8.223e-02), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(2.549e-02, 7.469e-02, -5.290e-02, -4.972e-02, -2.340e-01, -1.875e-01, 1.656e-01, 5.697e-02, -8.570e-02, -1.520e-01, -2.622e-02, 1.043e-02, -2.377e-01, -3.927e-02, 1.539e-01, 4.528e-02), r);
r = MulAdd(s1_1, M4(-1.188e-02, -9.781e-02, 1.606e-01, 5.138e-02, -4.165e-01, 8.262e-01, 1.709e-01, -1.063e-01, 8.393e-03, 7.300e-02, -9.347e-02, -6.226e-02, -3.633e-01, -4.453e-01, 2.190e-01, 2.415e-01), r);
r = MulAdd(s1_2, M4(-4.011e-02, 3.404e-02, 1.013e-01, 3.551e-02, 9.692e-02, -2.109e-01, 1.897e-01, -2.192e-01, -1.703e-01, 5.317e-01, 1.354e-01, -2.027e-01, -3.658e-01, -1.845e-01, -5.465e-01, 1.436e-01), r);
r = MulAdd(s1_3, M4(7.674e-01, 1.677e-01, -7.875e-02, 7.537e-03, -4.911e-01, -1.083e-01, 7.183e-03, -1.107e-01, -2.514e-02, -1.257e-01, -5.070e-02, -3.886e-02, 1.368e-01, -1.991e-02, -1.698e-01, -7.850e-03), r);
r = MulAdd(s1_4, M4(-5.096e-02, 7.912e-02, -2.105e-01, 1.149e-01, 9.798e-02, 2.243e-01, -3.434e-01, 3.492e-01, -1.265e-01, -1.839e-01, -1.337e-01, -6.909e-02, -8.552e-01, 1.334e-01, 8.652e-01, -3.408e-01), r);
r = MulAdd(s1_5, M4(-2.933e-02, 1.424e-01, 6.542e-02, -1.710e-01, -1.459e-01, -3.069e-02, -1.275e-01, -9.443e-02, 2.657e-01, -4.784e-04, -6.729e-03, -1.910e-01, -4.628e-01, 3.808e-02, -1.470e-01, 1.480e-01), r);
r = MulAdd(s1_6, M4(1.512e-01, -1.755e-02, -5.440e-02, 1.317e-02, -7.181e-02, -6.842e-03, -7.375e-02, -8.356e-02, 7.332e-02, -9.437e-02, -1.008e-01, -4.731e-02, -9.102e-02, -8.192e-03, 7.862e-04, 6.417e-02), r);
r = MulAdd(s1_7, M4(2.457e-01, -1.058e-01, -2.777e-02, -1.532e-03, 7.609e-02, 3.452e-02, 1.774e-01, 3.296e-01, 6.779e-02, -6.683e-02, 1.485e-01, 7.321e-02, -3.082e-02, -4.348e-02, 3.558e-03, 9.111e-03), r);
r = MulAdd(s1_8, M4(1.104e-01, 5.040e-03, 9.642e-03, -8.991e-02, -2.134e-01, 3.758e-02, -1.244e-01, -1.987e-01, -7.007e-02, 6.792e-03, 1.369e-01, 5.332e-01, -5.354e-02, -2.024e-02, -1.038e-01, -4.812e-02), r);
return r;
}
// Pass6 ("conv5" per this pass's //!DESC): evaluates f0 for one pixel,
// reading a 3x3 neighbourhood through l0 (which wraps O(t0, ...)) and storing
// the result in t1.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass6(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t1[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 (pass 7, "conv6" per the //!DESC directive): builds one V4 result from
// eighteen V4 inputs.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass7) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { 1.448e-03, -2.432e-03, -8.004e-04, 5.896e-05 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(6.200e-02, 5.385e-02, -5.478e-02, 3.955e-02, -1.722e-02, -1.194e-01, 8.331e-02, -9.296e-02, -2.161e-02, 8.716e-02, -5.918e-02, 1.032e-01, 4.954e-02, -3.822e-02, 8.472e-02, -2.191e-01), r);
r = MulAdd(s0_1, M4(2.503e-01, 5.635e-02, 7.355e-03, -2.025e-01, 7.104e-02, -1.324e-01, -3.051e-02, 2.246e-02, -4.480e-02, 6.693e-03, 4.467e-02, 3.388e-02, 4.262e-01, 1.488e-01, -8.809e-01, 5.350e-01), r);
r = MulAdd(s0_2, M4(-7.511e-03, 1.921e-01, -3.653e-01, 2.096e-02, 2.413e-02, 4.846e-02, -1.538e-01, 3.359e-02, 5.958e-03, -1.033e-02, 2.389e-02, 1.283e-02, -5.270e-02, 2.842e-01, 5.681e-02, -3.578e-02), r);
r = MulAdd(s0_3, M4(-2.198e-02, -1.674e-02, 3.330e-02, 3.249e-02, -4.430e-02, 9.217e-02, -3.348e-02, -3.546e-01, 1.228e-01, 3.875e-02, 7.220e-03, 6.719e-02, -8.768e-01, -1.165e-02, -3.862e-02, -2.045e-02), r);
r = MulAdd(s0_4, M4(-6.935e-01, -4.898e-01, 2.252e-01, -1.647e-01, -6.408e-02, 4.562e-01, -6.617e-01, 1.220e-01, 1.053e-02, -9.937e-02, -1.118e-02, 3.272e-01, -9.081e-02, 2.353e-02, 4.776e-01, -1.238e-01), r);
r = MulAdd(s0_5, M4(2.481e-01, -3.296e-01, -3.372e-02, -2.008e-02, 5.924e-03, 1.762e-02, 3.642e-01, -1.182e-01, -2.219e-02, -4.332e-02, -9.762e-02, 3.537e-02, 2.114e-02, -5.440e-02, 3.124e-01, 5.069e-02), r);
r = MulAdd(s0_6, M4(-5.465e-02, -5.352e-03, -3.419e-03, -6.733e-02, -8.079e-02, -6.569e-02, -1.494e-02, -3.462e-01, -8.125e-03, 2.572e-03, -3.894e-02, -3.246e-02, -1.566e-02, -3.004e-02, 1.145e-01, 6.794e-02), r);
r = MulAdd(s0_7, M4(4.788e-02, 7.675e-03, -7.030e-02, -2.384e-02, -3.070e-01, -7.080e-01, -2.017e-01, 9.579e-02, 1.259e-01, -1.004e-02, -1.287e-01, 3.334e-02, -9.642e-02, -8.073e-02, 2.546e-02, 5.204e-02), r);
r = MulAdd(s0_8, M4(-6.015e-02, 1.650e-01, -5.471e-02, -1.454e-01, -2.785e-02, -1.831e-01, 1.123e-01, 3.453e-02, -1.179e-02, 1.722e-02, -1.068e-02, -2.608e-02, 1.514e-04, -1.287e-02, -7.741e-03, -9.765e-03), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(-4.922e-02, -5.675e-03, -2.161e-02, 3.164e-02, -2.003e-02, -3.890e-02, 5.198e-02, -1.811e-03, -3.385e-02, -1.510e-02, -2.289e-02, 1.009e-01, 4.427e-02, -1.763e-01, 1.255e-01, -5.073e-02), r);
r = MulAdd(s1_1, M4(1.057e-01, -8.124e-02, 1.131e-01, -1.361e-01, 4.740e-02, -6.425e-02, 8.930e-03, 5.318e-02, 5.266e-02, -6.003e-02, 1.320e-01, 4.163e-02, 1.277e-01, -2.404e-01, -1.696e-01, 2.204e-01), r);
r = MulAdd(s1_2, M4(2.723e-02, 1.918e-01, -2.822e-01, -1.877e-02, -4.599e-03, 7.591e-02, -1.128e-01, -6.519e-03, 2.311e-02, -1.684e-01, 2.293e-01, -1.042e-01, -1.882e-02, 4.970e-02, -1.309e-01, -8.894e-03), r);
r = MulAdd(s1_3, M4(4.883e-02, 2.819e-02, 4.318e-02, 3.186e-02, 7.782e-02, 1.741e-01, -8.927e-02, 4.005e-02, 5.888e-02, -1.057e-01, 9.692e-02, 8.032e-02, -1.086e-01, 6.323e-02, -8.520e-02, -1.273e-02), r);
r = MulAdd(s1_4, M4(-1.746e-01, -2.834e-02, -3.694e-02, 3.226e-01, -2.541e-01, 6.860e-01, -1.436e-01, 1.705e-01, 2.614e-01, -6.751e-02, 5.646e-02, 3.666e-01, -2.621e-02, 4.951e-01, -1.090e-01, -3.168e-01), r);
r = MulAdd(s1_5, M4(1.513e-01, 5.210e-02, 2.625e-01, -6.303e-02, -2.252e-02, -9.485e-02, 4.776e-01, -1.789e-01, -1.291e-01, -9.714e-02, -1.427e-01, -1.165e-01, 2.415e-02, 9.790e-02, 6.024e-02, -9.622e-02), r);
r = MulAdd(s1_6, M4(3.751e-02, -2.907e-02, -1.762e-02, -9.545e-02, 2.866e-01, -7.329e-02, -9.787e-03, 4.513e-03, -9.486e-02, -2.446e-02, -2.357e-02, -5.002e-02, 4.973e-02, 6.256e-02, -2.532e-02, -1.817e-02), r);
r = MulAdd(s1_7, M4(-6.855e-02, -6.762e-02, -6.269e-02, -6.947e-02, -1.389e-01, -1.915e-01, -4.806e-02, 1.870e-01, 1.298e-01, 6.268e-03, -5.985e-02, -5.396e-02, -3.048e-02, -5.396e-03, -9.720e-02, 3.289e-03), r);
r = MulAdd(s1_8, M4(-2.052e-02, -8.106e-02, -1.721e-02, 9.911e-03, -8.521e-02, 4.832e-02, -1.708e-01, -6.445e-02, -9.788e-02, 8.836e-02, -1.204e-01, -1.123e-01, 1.514e-02, 1.628e-02, -5.003e-02, -6.128e-03), r);
return r;
}
// Pass7 ("conv6" per this pass's //!DESC): evaluates f0 for one pixel,
// reading a 3x3 neighbourhood through l0 (which wraps O(t1, ...)) and storing
// the result in t0.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass7(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t0[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 (pass 8, "conv7" per the //!DESC directive): builds one V4 result from
// eighteen V4 inputs.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass8) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { 2.671e-03, -5.536e-03, -4.013e-03, 4.378e-03 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(5.901e-02, -1.033e-01, -1.441e-01, 4.291e-02, 2.355e-02, -1.199e-01, -1.741e-01, -5.263e-03, -6.030e-03, -4.043e-02, 1.910e-01, 8.326e-03, 2.913e-02, 1.969e-02, -1.380e-01, 9.492e-02), r);
r = MulAdd(s0_1, M4(-1.616e-01, 1.649e-01, -1.133e-02, -1.037e-01, -1.060e-02, 2.299e-01, -5.302e-02, -2.329e-01, -8.540e-02, 2.232e-01, 2.647e-01, 3.922e-01, 5.387e-02, 5.841e-01, -1.264e-01, -1.440e-01), r);
r = MulAdd(s0_2, M4(-1.944e-02, -7.262e-02, 9.583e-02, 3.448e-02, 4.402e-02, 5.319e-02, -2.384e-02, 4.652e-02, 6.280e-02, -4.195e-02, 1.573e-02, 7.059e-02, 1.029e-01, -1.784e-02, -3.735e-02, -4.952e-02), r);
r = MulAdd(s0_3, M4(7.393e-02, -1.825e-01, -2.983e-01, -5.798e-02, -2.475e-01, -4.958e-02, 6.660e-01, -2.202e-01, -9.158e-02, 4.280e-04, 2.472e-01, -2.979e-01, -9.887e-02, 6.188e-02, 2.163e-01, -9.358e-03), r);
r = MulAdd(s0_4, M4(-8.664e-01, 2.357e-01, 3.390e-01, -5.275e-01, -2.213e-01, -4.992e-01, 5.479e-01, 4.245e-01, -7.542e-02, 4.854e-01, -3.525e-01, 3.950e-01, 3.619e-01, -3.968e-01, -3.447e-01, 5.089e-01), r);
r = MulAdd(s0_5, M4(-9.239e-02, -6.370e-01, -7.252e-02, -3.435e-01, -1.057e-01, 1.616e-01, -4.413e-02, 1.824e-01, 2.001e-02, -1.343e-01, -5.730e-02, 7.302e-02, -2.361e-02, -9.044e-02, -1.041e-01, 2.971e-01), r);
r = MulAdd(s0_6, M4(-2.803e-02, -8.707e-02, -1.407e-01, -2.685e-02, 1.099e-01, 1.721e-01, 1.612e-01, 6.962e-02, -1.659e-02, 7.845e-02, 2.165e-01, -7.067e-02, 1.666e-02, 7.051e-02, 6.373e-02, 4.391e-02), r);
r = MulAdd(s0_7, M4(-1.560e-01, -2.698e-02, -5.684e-01, -1.184e-01, 7.742e-01, -1.023e-03, -8.177e-02, 2.857e-01, 2.253e-02, -1.400e-02, -6.523e-02, 7.644e-02, 1.789e-01, -8.433e-03, 1.041e-01, 7.009e-02), r);
r = MulAdd(s0_8, M4(-1.491e-01, -2.037e-01, -2.499e-01, -7.730e-02, 1.051e-01, -1.718e-02, -1.762e-01, 4.808e-02, -3.068e-03, 1.737e-02, -3.772e-04, 4.732e-02, 7.205e-02, 7.901e-02, -1.759e-02, 8.476e-02), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(4.810e-02, -1.822e-02, -1.150e-01, -1.679e-02, -5.481e-02, -7.544e-02, 2.213e-01, 2.615e-02, -2.628e-03, -1.482e-01, -5.570e-02, 5.137e-02, -1.381e-02, -1.878e-03, -3.132e-02, -3.309e-02), r);
r = MulAdd(s1_1, M4(1.101e-01, 1.003e-01, -4.307e-01, -2.520e-02, 1.138e-02, -1.966e-01, 6.664e-02, 1.114e-01, -1.431e-01, 3.634e-01, 4.274e-02, -8.279e-02, -5.291e-02, 3.540e-01, 8.995e-02, -1.401e-01), r);
r = MulAdd(s1_2, M4(7.230e-02, 4.684e-01, -6.542e-02, -2.792e-01, 2.936e-02, 3.476e-03, -1.024e-02, 1.880e-01, 1.898e-02, 2.529e-02, 8.537e-03, -6.073e-03, 1.025e-01, -2.320e-01, -1.804e-02, 5.471e-02), r);
r = MulAdd(s1_3, M4(-9.258e-03, -7.731e-03, 4.285e-02, -4.725e-02, -3.878e-02, -1.749e-02, -1.681e-02, -1.020e-01, -3.975e-02, 1.609e-02, 8.299e-02, -1.824e-01, -2.500e-02, 3.516e-02, 8.591e-02, 1.714e-02), r);
r = MulAdd(s1_4, M4(-2.210e-01, 1.534e-01, 3.410e-01, -2.552e-01, -5.090e-02, 1.582e-02, 1.802e-01, -1.333e-01, -5.371e-01, 3.751e-01, -1.323e-01, 3.018e-01, 1.756e-01, -9.756e-02, -4.873e-01, 4.985e-01), r);
r = MulAdd(s1_5, M4(-1.073e-02, 2.919e-01, -2.025e-01, 3.240e-01, 4.318e-02, -1.972e-02, -1.612e-01, 3.528e-01, -6.472e-02, -6.212e-02, 3.146e-02, 6.391e-02, 4.950e-02, -6.270e-01, -1.985e-02, 4.680e-02), r);
r = MulAdd(s1_6, M4(-2.215e-02, 1.836e-02, 5.021e-02, -3.016e-02, -7.854e-03, 1.135e-02, 3.407e-02, -2.923e-02, -5.384e-03, 6.570e-02, 2.437e-01, -8.712e-02, 2.275e-02, -2.291e-03, -7.378e-02, 5.231e-02), r);
r = MulAdd(s1_7, M4(-4.186e-02, 6.944e-02, 8.353e-02, -1.927e-02, 3.937e-02, 2.105e-02, 7.152e-02, 5.635e-03, 1.114e-01, -3.772e-02, -1.853e-01, 6.636e-02, 4.654e-02, -1.008e-01, -1.625e-01, 7.888e-02), r);
r = MulAdd(s1_8, M4(5.288e-02, -5.516e-02, -4.014e-02, 8.854e-02, 2.434e-02, 9.192e-02, -1.203e-02, 6.813e-02, 4.626e-02, -4.892e-02, 4.700e-03, 7.578e-02, -5.040e-02, 3.497e-02, 3.176e-02, -9.741e-02), r);
return r;
}
// Pass8 ("conv7" per this pass's //!DESC): evaluates f0 for one pixel,
// reading a 3x3 neighbourhood through l0 (which wraps O(t0, ...)) and storing
// the result in t1.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass8(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t1[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0 (pass 9, "conv8" per the //!DESC directive): builds one V4 result from
// eighteen V4 inputs.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass9) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { -5.006e-05, -2.252e-04, -1.752e-03, 4.586e-04 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(8.283e-02, 5.262e-02, 1.580e-02, 4.991e-02, 6.836e-02, -3.234e-02, 5.630e-02, 1.275e-01, 5.398e-03, 9.866e-04, -1.054e-02, 1.601e-02, 1.546e-02, -7.786e-02, -2.630e-02, -3.023e-02), r);
r = MulAdd(s0_1, M4(9.285e-02, 3.403e-01, -4.572e-02, 1.431e-01, 2.876e-01, -3.271e-01, -8.133e-04, 5.998e-01, 4.515e-02, 9.836e-02, 2.315e-02, 1.724e-01, -8.080e-02, -1.978e-01, -5.366e-02, -4.535e-02), r);
r = MulAdd(s0_2, M4(1.708e-02, -8.374e-02, -1.831e-02, 1.744e-02, 4.902e-02, -1.037e-02, -3.508e-02, 3.501e-02, 1.160e-01, 2.529e-01, 4.235e-02, 4.233e-02, -5.953e-03, -1.398e-01, -8.815e-03, 1.053e-02), r);
r = MulAdd(s0_3, M4(-2.836e-03, -2.496e-01, 2.703e-02, 9.490e-02, 3.985e-01, -9.458e-02, 1.355e-01, 5.917e-01, 5.597e-03, -8.963e-02, 5.238e-02, 4.360e-02, -1.070e-01, 7.593e-02, 6.376e-02, -1.498e-01), r);
r = MulAdd(s0_4, M4(3.214e-01, -8.045e-01, 6.621e-01, -1.261e-01, -1.487e+00, 1.086e+00, 3.779e-01, -1.762e+00, 2.721e-01, -3.815e-02, -1.450e-01, 4.063e-01, 2.804e-01, 3.876e-01, 2.607e-01, 2.174e-01), r);
r = MulAdd(s0_5, M4(-3.896e-01, 3.340e-01, -2.529e-01, -6.519e-02, -1.815e-01, 5.542e-02, -1.669e-01, 1.732e-02, 2.995e-01, 4.942e-02, 6.557e-02, -1.386e-01, -1.392e-01, 2.822e-01, 2.016e-02, -1.313e-01), r);
r = MulAdd(s0_6, M4(-2.130e-02, 4.137e-02, 7.324e-02, 4.834e-03, 9.333e-02, -2.998e-01, 4.229e-01, 9.535e-02, -2.595e-02, 2.955e-02, 7.491e-02, -3.028e-02, -2.850e-02, 1.582e-02, -1.076e-01, -3.159e-02), r);
r = MulAdd(s0_7, M4(-3.601e-02, 5.993e-02, -1.190e-02, -6.800e-02, 6.894e-03, -2.095e-01, -9.548e-02, -2.539e-02, -2.390e-02, 2.947e-02, 1.581e-01, -5.305e-03, 1.029e-01, -1.456e-01, -3.526e-02, 9.251e-02), r);
r = MulAdd(s0_8, M4(-7.206e-02, 9.690e-02, -4.464e-02, -6.999e-03, 3.140e-02, -4.201e-02, -6.364e-03, 5.280e-03, -1.412e-01, 1.696e-01, -1.274e-01, -9.546e-02, 5.285e-02, -1.072e-01, 5.994e-02, 1.293e-02), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(-1.808e-02, 1.243e-01, -6.814e-02, -4.219e-03, 1.273e-02, 2.752e-02, 3.764e-02, 3.650e-02, 7.663e-04, 6.843e-03, 1.380e-02, -3.235e-02, 5.400e-02, -5.352e-02, 1.190e-02, -1.028e-01), r);
r = MulAdd(s1_1, M4(2.568e-01, 2.764e-01, 7.740e-02, 1.273e-01, 7.059e-02, 6.668e-02, 4.211e-02, 6.293e-02, -4.164e-02, 2.210e-01, -1.293e-02, 8.369e-02, 2.046e-01, 1.238e-01, 9.491e-02, 4.614e-02), r);
r = MulAdd(s1_2, M4(-2.387e-02, 3.174e-01, 8.165e-02, -6.680e-02, -1.516e-02, 1.482e-02, -1.342e-02, 1.692e-02, -2.288e-02, -6.891e-02, -5.559e-02, 4.771e-02, 3.290e-02, 1.234e-01, 4.334e-02, -5.106e-02), r);
r = MulAdd(s1_3, M4(6.216e-02, -2.114e-01, -1.616e-01, 1.664e-01, 3.796e-02, 6.036e-02, -1.106e-01, 1.398e-01, -3.139e-02, -6.274e-02, 4.988e-02, -6.274e-02, 2.296e-02, -5.131e-02, 5.052e-02, -8.866e-02), r);
r = MulAdd(s1_4, M4(2.647e-01, -7.858e-01, 1.597e-01, -8.262e-01, -3.213e-01, 2.427e-01, 1.686e-01, -4.251e-01, 1.505e-01, 3.244e-02, 1.023e-01, 1.962e-01, -1.116e-01, 3.525e-01, 8.848e-01, -1.945e-01), r);
r = MulAdd(s1_5, M4(-2.549e-01, -1.429e-01, -3.696e-02, 3.042e-01, -1.256e-01, 2.760e-02, -3.650e-02, 7.985e-02, -1.958e-01, 3.076e-01, -9.253e-02, -8.512e-02, -1.708e-01, -3.422e-04, -8.181e-02, 2.319e-01), r);
r = MulAdd(s1_6, M4(-3.382e-02, 6.627e-02, 1.158e-01, -3.044e-02, -7.983e-03, -7.855e-02, 1.729e-02, 3.219e-04, -1.764e-02, 4.065e-02, -1.400e-02, -2.387e-02, 2.673e-03, 5.460e-03, -4.992e-02, -1.573e-02), r);
r = MulAdd(s1_7, M4(-2.505e-02, 1.763e-01, -4.433e-01, -1.024e-01, 1.391e-01, -2.435e-01, -5.358e-02, 5.203e-02, 3.157e-02, 2.012e-02, 7.424e-03, 3.723e-02, -2.388e-02, 7.204e-02, -4.522e-01, -1.187e-02), r);
r = MulAdd(s1_8, M4(9.737e-02, 7.067e-02, 4.072e-02, 4.303e-02, 2.890e-02, -1.810e-02, 5.156e-03, -1.953e-02, -3.503e-02, 7.492e-02, 1.402e-02, -9.796e-03, 2.320e-01, -2.135e-01, 1.462e-01, 1.194e-01), r);
return r;
}
// Pass9 ("conv8" per this pass's //!DESC): evaluates f0 for one pixel,
// reading a 3x3 neighbourhood through l0 (which wraps O(t1, ...)) and storing
// the result in t0.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt).
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass9(uint2 blockStart, uint3 tid) {
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();
    // Nothing to do for threads outside the size reported by GetInputSize().
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    float2 pos = (gxy + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    t0[gxy] = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0 (pass 10, "out-shuffle" per the //!DESC directive): builds one V4 result
// from eighteen V4 inputs and squashes it with tanh.
//   s0_0..s0_8 / s1_0..s1_8 - the caller (Pass10) passes max(v, 0) and
//                             -max(-v, 0) of the nine 3x3 neighbours here.
// Starts from a constant V4 and folds in every input with its own 16-value M4
// through MulAdd, s0_* first and then s1_*; the order is kept as generated.
// Pass10 adds each of the four returned components to the first YUV channel
// of one pixel in a 2x2 output group.
// NOTE(review): MulAdd, V4 and M4 are defined outside this excerpt; presumably
// MulAdd(v, m, acc) is a vector-matrix multiply-accumulate - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
// Constant starting value of the accumulator.
V4 r = { -1.731e-03, -2.098e-03, -1.131e-03, -1.644e-03 };
// Contributions of the s0_* inputs.
r = MulAdd(s0_0, M4(2.670e-02, -1.964e-03, 2.191e-02, 3.109e-02, 1.911e-02, -2.017e-02, -2.948e-02, -2.237e-02, -3.845e-02, -7.954e-03, -3.472e-02, -2.253e-02, -1.571e-02, -6.613e-03, -1.489e-02, -2.647e-02), r);
r = MulAdd(s0_1, M4(-6.714e-02, -2.106e-02, 7.577e-03, 1.788e-02, 8.081e-02, 8.813e-02, -5.510e-02, -2.724e-02, 1.150e-01, 5.284e-02, -8.964e-02, -3.024e-02, 5.215e-02, 5.334e-02, -1.180e-02, 6.927e-03), r);
r = MulAdd(s0_2, M4(1.036e-02, 1.826e-02, -8.095e-03, -9.967e-03, 1.368e-03, 3.479e-02, -1.887e-03, -2.161e-02, -3.464e-02, -1.124e-01, -4.623e-03, -5.295e-03, -7.199e-03, -4.285e-02, 8.862e-03, -1.610e-02), r);
r = MulAdd(s0_3, M4(2.388e-01, -1.001e-03, 1.699e-01, -4.519e-02, -3.274e-01, 1.550e-01, 3.748e-02, 3.435e-02, -1.655e-01, 1.227e-02, -1.372e-01, 4.700e-02, -1.636e-01, 1.222e-02, -1.323e-01, 3.239e-02), r);
r = MulAdd(s0_4, M4(1.698e-01, 4.561e-01, -1.355e-01, 1.831e-01, -3.815e-01, -7.832e-01, 1.738e-01, 4.516e-02, 2.803e-01, -4.239e-01, 8.945e-01, -1.339e-02, -3.701e-01, -3.731e-01, 1.765e-01, -1.343e-01), r);
r = MulAdd(s0_5, M4(-4.653e-02, -8.470e-02, -1.076e-03, -7.153e-02, 1.022e-02, -2.560e-02, -1.154e-02, 2.252e-02, -1.053e-01, 4.014e-01, -1.479e-01, 3.667e-01, 9.425e-02, -8.079e-02, 5.594e-03, 4.870e-02), r);
r = MulAdd(s0_6, M4(-6.274e-02, -3.430e-02, -5.955e-02, 1.220e-02, -6.075e-02, 1.284e-02, -8.384e-02, 2.143e-01, -2.050e-02, -8.887e-03, -1.445e-02, 1.797e-02, 1.436e-01, -8.067e-04, 1.013e-01, 3.847e-03), r);
r = MulAdd(s0_7, M4(6.862e-02, -7.230e-02, -2.461e-01, -3.760e-01, 4.038e-02, -2.634e-02, -2.725e-01, -4.389e-01, 9.088e-03, -1.873e-02, -9.497e-02, -1.860e-01, -1.038e-01, 2.502e-01, -6.194e-01, 4.470e-02), r);
r = MulAdd(s0_8, M4(-1.984e-02, 4.173e-02, 5.328e-02, 5.554e-02, 1.241e-03, -2.290e-03, 5.972e-02, 4.381e-02, -3.320e-03, -1.434e-04, -5.754e-02, -6.072e-02, -6.854e-03, 6.781e-02, 1.208e-01, -5.469e-02), r);
// Contributions of the s1_* inputs.
r = MulAdd(s1_0, M4(7.050e-02, -3.676e-02, 7.009e-03, 1.431e-02, -1.258e-02, -6.854e-03, -9.803e-04, 5.955e-03, -3.077e-03, -2.372e-02, 8.060e-03, -5.992e-02, -7.957e-02, 2.905e-02, 3.914e-04, -1.408e-02), r);
r = MulAdd(s1_1, M4(-1.068e-01, 4.589e-02, -1.399e-02, -8.157e-03, 1.811e-02, 7.241e-03, 9.447e-03, 3.242e-03, 5.152e-02, 8.667e-02, -2.512e-02, -2.978e-02, 1.382e-01, 5.481e-02, -2.199e-02, -2.739e-02), r);
r = MulAdd(s1_2, M4(3.676e-02, 1.705e-02, -4.520e-03, -6.449e-03, 1.006e-02, 9.807e-03, -6.046e-03, -1.299e-03, -5.035e-02, -4.415e-02, 9.619e-03, -1.059e-02, -6.952e-03, -1.803e-02, -4.042e-03, -1.751e-02), r);
r = MulAdd(s1_3, M4(5.123e-02, 4.500e-02, 2.099e-01, -7.254e-03, -7.977e-02, 2.822e-02, -1.546e-01, -3.748e-02, -2.378e-01, -1.836e-02, -3.508e-02, -2.147e-03, 3.371e-02, -4.720e-02, -5.574e-02, -1.592e-02), r);
r = MulAdd(s1_4, M4(-5.764e-01, 5.998e-01, -2.288e-01, 7.223e-01, -1.855e-01, -3.467e-01, 5.173e-02, -8.967e-02, 3.308e-01, -8.987e-02, 2.397e-01, 3.701e-01, -7.970e-02, -9.046e-01, 2.397e-01, -1.626e-01), r);
r = MulAdd(s1_5, M4(1.177e-02, -1.538e-01, 4.138e-02, -5.198e-02, 3.165e-03, 3.827e-02, -5.913e-03, 8.727e-03, 7.885e-02, 2.979e-01, -6.160e-02, 1.198e-01, 1.186e-02, 9.421e-02, -4.101e-02, 4.185e-03), r);
r = MulAdd(s1_6, M4(-7.690e-02, -4.820e-03, -1.106e-01, 4.040e-02, -6.883e-02, -3.284e-02, 1.259e-02, 1.509e-01, 6.378e-03, -5.293e-04, -3.690e-02, 6.274e-02, 1.401e-01, -3.801e-03, 1.489e-01, -1.044e-02), r);
r = MulAdd(s1_7, M4(1.140e-01, -1.333e-01, -1.739e-01, -1.739e-01, 4.736e-02, -1.306e-02, -3.673e-01, -6.127e-01, -3.477e-02, -6.090e-02, 2.430e-02, -2.666e-01, -6.599e-02, 2.794e-01, -1.724e-01, -2.744e-01), r);
r = MulAdd(s1_8, M4(1.045e-02, 6.106e-02, 3.463e-02, 6.708e-02, -1.028e-02, -2.277e-02, 6.536e-02, 8.227e-02, -5.566e-02, -3.941e-02, -6.862e-03, -1.219e-02, -1.438e-02, -4.651e-02, 5.359e-02, 4.650e-02), r);
// Squash every component with tanh before returning.
return tanh(r);
}
// Pass10 ("out-shuffle" per this pass's //!DESC): produces a 2x2 group of
// OUTPUT pixels per thread.
//   blockStart - origin of the block handled by this thread group.
//   tid        - thread id; only tid.x is used, turned into a 2D offset by
//                Rmp8x8 (defined outside this excerpt) and doubled.
// The 3x3 neighbourhood is read through l0 (which wraps O(t0, ...)) and fed to
// f0; each of the four returned components is added to the first YUV channel
// of one resampled INPUT pixel before converting back to RGB.
// NOTE(review): the O macro behind l0 is defined elsewhere and presumably
// relies on the locals `pos` and `pt` - confirm before renaming either.
void Pass10(uint2 blockStart, uint3 tid) {
    // gxy is the top-left pixel of this thread's 2x2 output group.
    uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
    uint2 size = GetOutputSize();
    // Nothing to do when the group starts outside the output.
    if (any(gxy >= size)) {
        return;
    }

    float2 pt = float2(GetInputPt());
    // gxy >> 1 maps the output group back to the pixel index used for sampling.
    float2 pos = ((gxy >> 1) + 0.5) * pt;

    // 3x3 neighbourhood, row by row; offsets are relative to the current pixel.
    const V4 tap0 = l0(-1.0, -1.0);
    const V4 tap1 = l0(0.0, -1.0);
    const V4 tap2 = l0(1.0, -1.0);
    const V4 tap3 = l0(-1.0, 0.0);
    const V4 tap4 = l0(0.0, 0.0);
    const V4 tap5 = l0(1.0, 0.0);
    const V4 tap6 = l0(-1.0, 1.0);
    const V4 tap7 = l0(0.0, 1.0);
    const V4 tap8 = l0(1.0, 1.0);

    // f0 receives every tap twice: first clamped from below at zero
    // (s0_* arguments), then as -max(-tap, 0) (s1_* arguments).
    const V4 res = f0(
        max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
        max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
        max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
        -max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
        -max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
        -max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

    // Colour-space matrices used to touch only the first (luma-like) channel.
    static const MF3x3 rgb2yuv = { 0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081 };
    static const MF3x3 yuv2rgb = { 1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099 };

    float2 opt = float2(GetOutputPt());
    // Step back half an output texel; the walk below then visits the 2x2 group
    // in the order (0,0) -> (1,0) -> (1,1) -> (0,1), pairing the pixels with
    // res.x, res.y, res.w and res.z respectively.
    pos -= 0.5f * opt;

    // (0, 0)
    MF3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.r = saturate(yuv.r + res.x);
    OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

    // (1, 0)
    gxy.x += 1;
    pos.x += opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.r = saturate(yuv.r + res.y);
    OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

    // (1, 1)
    gxy.y += 1;
    pos.y += opt.y;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.r = saturate(yuv.r + res.w);
    OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);

    // (0, 1)
    gxy.x -= 1;
    pos.x -= opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.r = saturate(yuv.r + res.z);
    OUTPUT[gxy] = MF4(mul(yuv2rgb, yuv), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -16,10 +16,11 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME CuNNy-03x12
//!USE MulAdd
//!CAPABILITY FP16
//!SCALE_FACTOR 2
#include "../StubDefs.hlsli"
@ -27,8 +28,6 @@
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER

View file

@ -16,10 +16,11 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME CuNNy-08x32
//!USE MulAdd
//!CAPABILITY FP16
//!SCALE_FACTOR 2
#include "../StubDefs.hlsli"
@ -27,8 +28,6 @@
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
@ -147,7 +146,7 @@ Texture2D T15;
//!IN INPUT
//!OUT T0, T1, T2, T3, T4, T5, T6, T7
#define L0(x, y) V3(O(INPUT, x, y).rgb)
#define L0(x, y) V3(EncodeSrgb(O(INPUT, x, y).rgb))
#define V3 MF3
#define M3x4 MF3x4
@ -5671,8 +5670,8 @@ void Pass10(uint2 blockStart, uint3 tid) {
r1 = MulAdd(s1_2_2, M4(3.648e-03, -7.492e-03, 7.566e-03, -6.626e-02, -1.922e-03, -1.418e-03, 8.532e-05, -1.628e-03, -1.875e-03, -7.480e-03, -5.740e-03, -3.978e-02, -8.104e-04, 2.341e-03, 5.188e-04, -7.545e-03), r1);
r2 = MulAdd(s1_2_2, M4(2.797e-03, -3.287e-03, 8.760e-03, -5.046e-02, -1.458e-03, -2.502e-03, 6.034e-04, -3.008e-03, -1.281e-03, 1.262e-03, 3.077e-03, 6.751e-02, -1.200e-04, 1.705e-03, -1.655e-05, -5.620e-03), r2);
float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
OUTPUT[gxy + int2(0, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb + MF3(r0.x, r1.x, r2.x)), 1.0);
OUTPUT[gxy + int2(1, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb + MF3(r0.y, r1.y, r2.y)), 1.0);
OUTPUT[gxy + int2(0, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb + MF3(r0.z, r1.z, r2.z)), 1.0);
OUTPUT[gxy + int2(1, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb + MF3(r0.w, r1.w, r2.w)), 1.0);
OUTPUT[gxy + int2(0, 0)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb) + MF3(r0.x, r1.x, r2.x))), 1.0);
OUTPUT[gxy + int2(1, 0)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb) + MF3(r0.y, r1.y, r2.y))), 1.0);
OUTPUT[gxy + int2(0, 1)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb) + MF3(r0.z, r1.z, r2.z))), 1.0);
OUTPUT[gxy + int2(1, 1)] = MF4(saturate(DecodeSrgb(EncodeSrgb(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb) + MF3(r0.w, r1.w, r2.w))), 1.0);
}

View file

@ -2,7 +2,8 @@
// Port from https://github.com/haasn/gentoo-conf/blob/xor/home/nand/.mpv/shaders/deband.glsl
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SCALE_FACTOR 1
//!PARAMETER
//!LABEL Threshold
@ -53,8 +54,6 @@ float grain;
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER

View file

@ -1,14 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="..\Common.Pre.props" />
<PropertyGroup Label="Globals">
<Keyword>Win32Proj</Keyword>
<ProjectGuid>{62503530-b84b-4cc2-80b6-3f89618172b7}</ProjectGuid>
<WindowsTargetPlatformVersion>10.0.26100.0</WindowsTargetPlatformVersion>
<IntDir>$(SolutionDir)\obj\$(Platform)\$(Configuration)\$(MSBuildProjectName)\</IntDir>
<OutDir>$(SolutionDir)\bin\$(Platform)\$(Configuration)\</OutDir>
<OutDir>$(OutBaseDir)\app\effects\</OutDir>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="..\Common.Pre.props" />
<PropertyGroup Label="Configuration">
<ConfigurationType>Utility</ConfigurationType>
</PropertyGroup>
@ -18,7 +17,7 @@
</ImportGroup>
<ItemDefinitionGroup>
<CopyFileToFolders>
<DestinationFolders>$(OutDir)\effects</DestinationFolders>
<DestinationFolders>$(OutDir)\shaders\</DestinationFolders>
<DestinationFileName>%(RelativeDir)%(Filename)%(Extension)</DestinationFileName>
</CopyFileToFolders>
</ItemDefinitionGroup>
@ -367,66 +366,6 @@
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
<FileType>Document</FileType>
</CopyFileToFolders>

View file

@ -360,66 +360,6 @@
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
<Filter>Anime4K</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="Bicubic.hlsl" />
<CopyFileToFolders Include="NIS\NIS_Scaler.hlsli">
<Filter>NIS</Filter>
@ -494,9 +434,6 @@
<Filter Include="Pixel Art">
<UniqueIdentifier>{0b58f073-84cb-4c38-919d-80176ae408bc}</UniqueIdentifier>
</Filter>
<Filter Include="CuNNy">
<UniqueIdentifier>{9157745b-aa96-42ce-bdc6-1230dffa326b}</UniqueIdentifier>
</Filter>
<Filter Include="CuNNy2">
<UniqueIdentifier>{52055d56-41dc-409a-a878-3c1278082f6d}</UniqueIdentifier>
</Filter>

View file

@ -2,7 +2,7 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY FP16
#include "../StubDefs.hlsli"
@ -17,7 +17,6 @@ Texture2D OUTPUT;
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
@ -461,17 +460,11 @@ void Pass1(uint2 blockStart, uint3 threadId) {
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
gxy.x += 8u;
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
}
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
gxy.y += 8u;
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
}
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
gxy.x -= 8u;
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
}
OUTPUT[gxy] = MF4(FsrEasu(gxy, con0, con1, con2, con3), 1);
}

View file

@ -2,8 +2,9 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/a21ffb8f6c13233ba336352bdff293894c706575/ffx-fsr/ffx_fsr1.h
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY FP16
//!SCALE_FACTOR 1
#include "../StubDefs.hlsli"
@ -19,15 +20,12 @@ float sharpness;
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT

View file

@ -1,8 +1,8 @@
// 移植自 https://github.com/libretro/slang-shaders/blob/3f67e1870dbd5be74ae2f09eaed0eeadce6abd15/misc/image-adjustment.slang
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SCALE_FACTOR 1
//!PARAMETER
//!LABEL Target Gamma

View file

@ -10,12 +10,11 @@
// B = 0.825 to get rid of dithering. Increase B to get a fine sharpness, though dithering returns.
//!MAGPIE EFFECT
//!VERSION 4
//!USE MulAdd
//!VERSION 5
//!CAPABILITY AdvancedColor
#include "StubDefs.hlsli"
//!PARAMETER
//!LABEL Window Sinc Param
//!DEFAULT 0.5
@ -50,7 +49,6 @@ Texture2D OUTPUT;
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
@ -62,7 +60,6 @@ SamplerState sam;
#define min4(a, b, c, d) min(min(a, b), min(c, d))
#define max4(a, b, c, d) max(max(a, b), max(c, d))
float d(float2 pt1, float2 pt2) {
float2 v = pt2 - pt1;
return sqrt(dot(v, v));
@ -108,9 +105,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
[unroll]
for (uint j = 0; j <= 2; j += 2) {
float2 tpos = (tc + uint2(i, j)) * inputPt;
const float4 sr = INPUT.GatherRed(sam, tpos);
const float4 sg = INPUT.GatherGreen(sam, tpos);
const float4 sb = INPUT.GatherBlue(sam, tpos);
float4 sr = INPUT.GatherRed(sam, tpos);
float4 sg = INPUT.GatherGreen(sam, tpos);
float4 sb = INPUT.GatherBlue(sam, tpos);
// w z
// x y
@ -128,11 +125,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
color *= rcp(dot(mul(weights, float4(1, 1, 1, 1)), 1));
// 抗振铃
// Get min/max samples
float3 min_sample = min4(src[1][1], src[2][1], src[1][2], src[2][2]);
float3 max_sample = max4(src[1][1], src[2][1], src[1][2], src[2][2]);
color = lerp(color, clamp(color, min_sample, max_sample), ARStrength);
// final sum and weight normalization
OUTPUT[gxy] = float4(color, 1);
}

View file

@ -2,8 +2,8 @@
// 移植自 https://github.com/libretro/common-shaders/blob/master/windowed/shaders/lanczos6.cg
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY AdvancedColor
//!PARAMETER
//!LABEL Anti-ringing Strength
@ -23,7 +23,6 @@ Texture2D OUTPUT;
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT

View file

@ -1,5 +1,5 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY FP16
#include "../StubDefs.hlsli"

View file

@ -163,17 +163,10 @@
#define NVU2 uint2
#define NVB bool
#if NIS_USE_HALF_PRECISION
#if NIS_HLSL_6_2
#define NVH float16_t
#define NVH2 float16_t2
#define NVH3 float16_t3
#define NVH4 float16_t4
#else
#define NVH min16float
#define NVH2 min16float2
#define NVH3 min16float3
#define NVH4 min16float4
#endif // NIS_HLSL_6_2
#define NVH MF
#define NVH2 MF2
#define NVH3 MF3
#define NVH4 MF4
#else // FP32 types
#define NVH NVF
#define NVH2 NVF2

View file

@ -1,5 +1,6 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!CAPABILITY AdvancedColor
//!TEXTURE
Texture2D INPUT;

View file

@ -2,7 +2,7 @@
// 移植自 https://github.com/SnapdragonStudios/snapdragon-gsr/blob/main/sgsr/v1/include/hlsl/sgsr1_shader_mobile.hlsl
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!PARAMETER
//!LABEL Edge Sharpness
@ -154,8 +154,8 @@ float3 SgsrYuvH(float2 uv, float4 con1)
float deltaY = finalY - pix_G;
pix = saturate(pix+deltaY);
}
pix += deltaY;
}
return pix;
}

View file

@ -1,14 +1,12 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME SMAA_2
//!SCALE_FACTOR 1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
@ -41,10 +39,9 @@ SamplerState PointSampler;
//!FILTER LINEAR
SamplerState LinearSampler;
//!COMMON
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
#define SMAA_LINEAR_SAMPLER LinearSampler
#define SMAA_POINT_SAMPLER PointSampler
#define SMAA_PRESET_HIGH

View file

@ -1,14 +1,12 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME SMAA_0
//!SCALE_FACTOR 1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
@ -41,10 +39,9 @@ SamplerState PointSampler;
//!FILTER LINEAR
SamplerState LinearSampler;
//!COMMON
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
#define SMAA_LINEAR_SAMPLER LinearSampler
#define SMAA_POINT_SAMPLER PointSampler
#define SMAA_PRESET_LOW

View file

@ -1,14 +1,12 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME SMAA_1
//!SCALE_FACTOR 1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
@ -41,10 +39,9 @@ SamplerState PointSampler;
//!FILTER LINEAR
SamplerState LinearSampler;
//!COMMON
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
#define SMAA_LINEAR_SAMPLER LinearSampler
#define SMAA_POINT_SAMPLER PointSampler
#define SMAA_PRESET_MEDIUM

View file

@ -1,14 +1,12 @@
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SORT_NAME SMAA_3
//!SCALE_FACTOR 1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
@ -41,10 +39,9 @@ SamplerState PointSampler;
//!FILTER LINEAR
SamplerState LinearSampler;
//!COMMON
#define SMAA_RT_METRICS float4(GetInputPt(), GetInputSize())
static float4 SMAA_RT_METRICS = { GetInputPt(), GetInputSize() };
#define SMAA_LINEAR_SAMPLER LinearSampler
#define SMAA_POINT_SAMPLER PointSampler
#define SMAA_PRESET_ULTRA

View file

@ -2,9 +2,8 @@
// 移植自 https://gist.github.com/igv/36508af3ffc84410fe39761d6969be10
// 原始文件使用了大量 mpv 的“特性”,因此可能存在移植错误。如果你熟悉 mpv hook,请帮助我们改进。
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!PARAMETER
//!LABEL Oversharp
@ -41,7 +40,7 @@ Texture2D MR;
//!TEXTURE
//!WIDTH OUTPUT_WIDTH
//!HEIGHT OUTPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
//!FORMAT COLOR_SPACE_ADAPTIVE
Texture2D POSTKERNEL;
//!SAMPLER
@ -52,7 +51,6 @@ SamplerState sam;
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC CatumllRom
//!STYLE PS
@ -60,58 +58,60 @@ SamplerState sam1;
//!OUT POSTKERNEL
// 模拟 mpv 的内置缩放CatmullRom
// Samples a texture with Catmull-Rom filtering, using 9 texture fetches instead of 16.
// See http://vec3.ca/bicubic-filtering-in-fewer-taps/ for more details
// Returns the four cubic filter taps for a fractional offset x, ordered from the
// leftmost/topmost tap (.x) to the rightmost/bottommost tap (.w).
// This is the sharper cubic variant (B=0, C=0.75), which may look better in some cases.
float4 weight4(float x) {
	// Each polynomial is evaluated in Horner form, one tap per line.
	const float tap0 = ((-0.75 * x + 1.5) * x - 0.75) * x;
	const float tap1 = (1.25 * x - 2.25) * x * x + 1.0;
	const float tap2 = ((-1.25 * x + 1.5) * x + 0.75) * x;
	const float tap3 = (0.75 * x - 0.75) * x * x;
	return float4(tap0, tap1, tap2, tap3);
}
float4 Pass1(float2 pos) {
float2 inputSize = GetInputSize();
float2 inputPt = GetInputPt();
const float2 inputPt = GetInputPt();
const float2 inputSize = GetInputSize();
// We're going to sample a a 4x4 grid of texels surrounding the target UV coordinate. We'll do this by rounding
// down the sample location to get the exact center of our "starting" texel. The starting texel will be at
// location [1, 1] in the grid, where [0, 0] is the top left corner.
float2 samplePos = pos * inputSize;
float2 texPos1 = floor(samplePos - 0.5f) + 0.5f;
pos *= inputSize;
float2 pos1 = floor(pos - 0.5) + 0.5;
float2 f = pos - pos1;
// Compute the fractional offset from our starting texel to our original sample location, which we'll
// feed into the Catmull-Rom spline function to get our filter weights.
float2 f = samplePos - texPos1;
float4 rowtaps = weight4(f.x);
float4 coltaps = weight4(f.y);
// Compute the Catmull-Rom weights using the fractional offset that we calculated earlier.
// These equations are pre-expanded based on our knowledge of where the texels will be located,
// which lets us avoid having to evaluate a piece-wise function.
float2 w0 = f * (-0.5f + f * (1.0f - 0.5f * f));
float2 w1 = 1.0f + f * f * (-2.5f + 1.5f * f);
float2 w2 = f * (0.5f + f * (2.0f - 1.5f * f));
float2 w3 = f * f * (-0.5f + 0.5f * f);
float2 uv1 = pos1 * inputPt;
float2 uv0 = uv1 - inputPt;
float2 uv2 = uv1 + inputPt;
float2 uv3 = uv2 + inputPt;
// Work out weighting factors and sampling offsets that will let us use bilinear filtering to
// simultaneously evaluate the middle 2 samples from the 4x4 grid.
float2 w12 = w1 + w2;
float2 offset12 = w2 / (w1 + w2);
float u_weight_sum = rowtaps.y + rowtaps.z;
float u_middle_offset = rowtaps.z * inputPt.x / u_weight_sum;
float u_middle = uv1.x + u_middle_offset;
// Compute the final UV coordinates we'll use for sampling the texture
float2 texPos0 = texPos1 - 1;
float2 texPos3 = texPos1 + 2;
float2 texPos12 = texPos1 + offset12;
float v_weight_sum = coltaps.y + coltaps.z;
float v_middle_offset = coltaps.z * inputPt.y / v_weight_sum;
float v_middle = uv1.y + v_middle_offset;
texPos0 *= inputPt;
texPos3 *= inputPt;
texPos12 *= inputPt;
int2 coord_top_left = int2(max(uv0 * inputSize, 0.5));
int2 coord_bottom_right = int2(min(uv3 * inputSize, inputSize - 0.5));
float4 result = 0.0f;
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos0.y), 0) * w0.x * w0.y;
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos0.y), 0) * w12.x * w0.y;
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos0.y), 0) * w3.x * w0.y;
float3 top = INPUT.Load(int3(coord_top_left, 0)).rgb * rowtaps.x;
top += INPUT.SampleLevel(sam1, float2(u_middle, uv0.y), 0).rgb * u_weight_sum;
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)).rgb * rowtaps.w;
float3 total = top * coltaps.x;
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos12.y), 0) * w0.x * w12.y;
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos12.y), 0) * w12.x * w12.y;
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos12.y), 0) * w3.x * w12.y;
float3 middle = INPUT.SampleLevel(sam1, float2(uv0.x, v_middle), 0).rgb * rowtaps.x;
middle += INPUT.SampleLevel(sam1, float2(u_middle, v_middle), 0).rgb * u_weight_sum;
middle += INPUT.SampleLevel(sam1, float2(uv3.x, v_middle), 0).rgb * rowtaps.w;
total += middle * v_weight_sum;
result += INPUT.SampleLevel(sam1, float2(texPos0.x, texPos3.y), 0) * w0.x * w3.y;
result += INPUT.SampleLevel(sam1, float2(texPos12.x, texPos3.y), 0) * w12.x * w3.y;
result += INPUT.SampleLevel(sam1, float2(texPos3.x, texPos3.y), 0) * w3.x * w3.y;
float3 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)).rgb * rowtaps.x;
bottom += INPUT.SampleLevel(sam1, float2(u_middle, uv3.y), 0).rgb * u_weight_sum;
bottom += INPUT.Load(int3(coord_bottom_right, 0)).rgb * rowtaps.w;
total += bottom * coltaps.w;
return result;
return float4(total, 1);
}
//!PASS 2
@ -124,7 +124,6 @@ float4 Pass1(float2 pos) {
#define Kernel(x) MN(0.0f, 0.5f, abs(x))
#define taps 2.0f
float4 Pass2(float2 pos) {
const float inputPtY = GetInputPt().y;
const uint inputHeight = GetInputSize().y;
@ -152,7 +151,6 @@ float4 Pass2(float2 pos) {
return float4(avg, 1);
}
//!PASS 3
//!DESC L2 pass 2
//!STYLE PS
@ -163,7 +161,6 @@ float4 Pass2(float2 pos) {
#define Kernel(x) MN(0.0, 0.5, abs(x))
#define taps 2.0
float4 Pass3(float2 pos) {
const float inputPtX = GetInputPt().x;
const uint inputWidth = GetInputSize().x;
@ -190,7 +187,6 @@ float4 Pass3(float2 pos) {
return float4(avg, 1);
}
//!PASS 4
//!DESC mean & R
//!IN L2_2, POSTKERNEL
@ -207,7 +203,6 @@ float4 Pass3(float2 pos) {
#define Luma(rgb) ( dot(rgb, float3(0.2126, 0.7152, 0.0722)) )
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
uint2 outputSize = GetOutputSize();
@ -224,7 +219,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
for (i = 0; i < taps; i += 2) {
[unroll]
for (j = 0; j < taps; j += 2) {
const float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
float4 sr = POSTKERNEL.GatherRed(sam, tpos);
float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
@ -258,13 +253,7 @@ void Pass4(uint2 blockStart, uint3 threadId) {
[unroll]
for (j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
continue;
}
}
float W = 0.0;
float3x3 avg = 0;
@ -293,7 +282,6 @@ void Pass4(uint2 blockStart, uint3 threadId) {
}
}
//!PASS 5
//!DESC final pass
//!IN MR, POSTKERNEL
@ -307,7 +295,6 @@ void Pass4(uint2 blockStart, uint3 threadId) {
// taps 需为奇数
#define taps 3
void Pass5(uint2 blockStart, uint3 threadId) {
const uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
@ -324,11 +311,11 @@ void Pass5(uint2 blockStart, uint3 threadId) {
for (i = 0; i < taps; i += 2) {
[unroll]
for (j = 0; j < taps; j += 2) {
const float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
const float4 sr = MR.GatherRed(sam, tpos);
const float4 sg = MR.GatherGreen(sam, tpos);
const float4 sb = MR.GatherBlue(sam, tpos);
const float4 sa = MR.GatherAlpha(sam, tpos);
float2 tpos = (int2(gxy + uint2(i, j)) - taps / 2 + 1) * outputPt;
float4 sr = MR.GatherRed(sam, tpos);
float4 sg = MR.GatherGreen(sam, tpos);
float4 sb = MR.GatherBlue(sam, tpos);
float4 sa = MR.GatherAlpha(sam, tpos);
// w z
// x y
@ -340,10 +327,10 @@ void Pass5(uint2 blockStart, uint3 threadId) {
}
float3 src2[2][2];
const float2 tpos = (gxy + 1) * outputPt;
const float4 sr = POSTKERNEL.GatherRed(sam, tpos);
const float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
const float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
float2 tpos = (gxy + 1) * outputPt;
float4 sr = POSTKERNEL.GatherRed(sam, tpos);
float4 sg = POSTKERNEL.GatherGreen(sam, tpos);
float4 sb = POSTKERNEL.GatherBlue(sam, tpos);
// w z
// x y

View file

@ -4,10 +4,9 @@
// Adaptive sharpen - version 2015-05-15 - (requires ps >= 3.0)
// Tuned for use post resize, EXPECTS FULL RANGE GAMMA LIGHT
//!MAGPIE EFFECT
//!VERSION 4
//!VERSION 5
//!SCALE_FACTOR 1
//!PARAMETER
//!LABEL Sharpness
@ -24,21 +23,25 @@ float curveHeight;
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
// DXC 编译时展开某些循环会大幅降低性能
#ifdef MP_SM_6_0
#define CONDITIONAL_UNROLL
#else
#define CONDITIONAL_UNROLL [unroll]
#endif
// Defined values under this row are "optimal" DO NOT CHANGE IF YOU DO NOT KNOW WHAT YOU ARE DOING!
#define curveslope (curveHeight*1.5f) // Sharpening curve slope, edge region
@ -48,9 +51,9 @@ SamplerState sam;
#define L_comp_ratio 0.167f // Max compression ratio, light overshoot (1/0.167=6x)
#define max_scale_lim 10.0f // Abs change before max compression (1/10=±10%)
// 效果工作在线性 RGB 空间,应使用 GetLuminance 计算亮度
// Colour to greyscale, fast approx gamma
float CtG(float3 RGB) { return sqrt((1.0f / 3.0f) * ((RGB * RGB).r + (RGB * RGB).g + (RGB * RGB).b)); }
// float CtG(float3 RGB) { return sqrt((1.0f / 3.0f) * ((RGB * RGB).r + (RGB * RGB).g + (RGB * RGB).b)); }
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
@ -68,11 +71,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
for (i = 0; i <= 6; i += 2) {
[unroll]
for (j = 0; j <= 6; j += 2) {
// 四角共 16 个纹素无需采样
if ((i == 0 && j == 0) || (i == 6 && j == 0) || (i == 0 && j == 6) || (i == 6 && j == 6)) {
continue;
}
float2 tpos = ((int2)gxy + int2(i, j) - 2) * inputPt;
const float4 sr = INPUT.GatherRed(sam, tpos);
const float4 sg = INPUT.GatherGreen(sam, tpos);
@ -81,19 +79,19 @@ void Pass1(uint2 blockStart, uint3 threadId) {
// w z
// x y
src[i][j].rgb = float3(sr.w, sg.w, sb.w);
src[i][j].w = CtG(src[i][j].rgb);
src[i][j].w = GetLuminance(src[i][j].rgb);
src[i][j + 1].rgb = float3(sr.x, sg.x, sb.x);
src[i][j + 1].w = CtG(src[i][j + 1].rgb);
src[i][j + 1].w = GetLuminance(src[i][j + 1].rgb);
src[i + 1][j].rgb = float3(sr.z, sg.z, sb.z);
src[i + 1][j].w = CtG(src[i + 1][j].rgb);
src[i + 1][j].w = GetLuminance(src[i + 1][j].rgb);
src[i + 1][j + 1].rgb = float3(sr.y, sg.y, sb.y);
src[i + 1][j + 1].w = CtG(src[i + 1][j + 1].rgb);
src[i + 1][j + 1].w = GetLuminance(src[i + 1][j + 1].rgb);
}
}
[unroll]
CONDITIONAL_UNROLL
for (i = 0; i <= 1; ++i) {
[unroll]
CONDITIONAL_UNROLL
for (j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);

View file

@ -26,19 +26,21 @@
#define MF4x3 float4x3
#define MF4x4 float4x4
uint2 Rmp8x8(uint a) { return uint2(a / 8, a % 8); }
uint2 GetInputSize() { return uint2(0, 0); }
float2 GetInputPt() { return float2(0, 0); }
uint2 GetOutputSize() { return float2(0, 0); }
float2 GetOutputPt() { return float2(0, 0); }
float2 GetScale() { return float2(0, 0); }
MF2 MulAdd(MF2 x, MF2x2 y, MF2 a) { return mul(x, y) + a; }
MF3 MulAdd(MF2 x, MF2x3 y, MF3 a) { return mul(x, y) + a; }
MF4 MulAdd(MF2 x, MF2x4 y, MF4 a) { return mul(x, y) + a; }
MF2 MulAdd(MF3 x, MF3x2 y, MF2 a) { return mul(x, y) + a; }
MF3 MulAdd(MF3 x, MF3x3 y, MF3 a) { return mul(x, y) + a; }
MF4 MulAdd(MF3 x, MF3x4 y, MF4 a) { return mul(x, y) + a; }
MF2 MulAdd(MF4 x, MF4x2 y, MF2 a) { return mul(x, y) + a; }
MF3 MulAdd(MF4 x, MF4x3 y, MF3 a) { return mul(x, y) + a; }
MF4 MulAdd(MF4 x, MF4x4 y, MF4 a) { return mul(x, y) + a; }
uint GetFrameCount() { return 0; }
uint2 Rmp8x8(uint a) { return uint2(0); }
uint2 GetInputSize() { return uint2(0); }
float2 GetInputPt() { return float2(0); }
// Placeholder definition that returns zero. Built from uint2 so the returned expression matches
// the declared return type instead of relying on an implicit float2 -> uint2 conversion.
uint2 GetOutputSize() { return uint2(0); }
float2 GetOutputPt() { return float2(0); }
float2 GetScale() { return float2(0); }
MF3 EncodeSrgb(MF3 c) { return MF3(0); }
MF3 DecodeSrgb(MF3 c) { return MF3(0); }
MF GetLuminance(MF3 c) { return 0; }
MF2 MulAdd(MF2 x, MF2x2 y, MF2 a) { return MF2(0); }
MF3 MulAdd(MF2 x, MF2x3 y, MF3 a) { return MF3(0); }
MF4 MulAdd(MF2 x, MF2x4 y, MF4 a) { return MF4(0); }
MF2 MulAdd(MF3 x, MF3x2 y, MF2 a) { return MF2(0); }
MF3 MulAdd(MF3 x, MF3x3 y, MF3 a) { return MF3(0); }
MF4 MulAdd(MF3 x, MF3x4 y, MF4 a) { return MF4(0); }
MF2 MulAdd(MF4 x, MF4x2 y, MF2 a) { return MF2(0); }
MF3 MulAdd(MF4 x, MF4x3 y, MF3 a) { return MF3(0); }
MF4 MulAdd(MF4 x, MF4x4 y, MF4 a) { return MF4(0); }

View file

@ -1,338 +0,0 @@
#include "pch.h"
#include "AdaptivePresenter.h"
#include "DeviceResources.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "Win32Helper.h"
namespace Magpie {
// Sets up presentation for the window hwndAttach.
//
// When DirectFlip is disabled in the scaling options, only the DirectComposition path is
// prepared (via _ResizeDCompVisual) and no swap chain is created. Otherwise a flip-model swap
// chain with a waitable object is created for the window, and its first back buffer and a
// render target view for it are cached.
//
// Returns false if any required step fails. Failures of MakeWindowAssociation and
// SetMaximumFrameLatency are logged but are not fatal.
bool AdaptivePresenter::_Initialize(HWND hwndAttach) noexcept {
	if (ScalingWindow::Get().Options().IsDirectFlipDisabled()) {
		// With DirectFlip disabled, always present through DirectComposition
		if (!_ResizeDCompVisual(hwndAttach)) {
			Logger::Get().Error("_ResizeDCompVisual 失败");
			return false;
		}

		_isDCompPresenting = true;
		return true;
	}

	const uint32_t bufferCount = _CalcBufferCount();
	const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());
	DXGI_SWAP_CHAIN_DESC1 sd{
		.Width = (UINT)rendererSize.cx,
		.Height = (UINT)rendererSize.cy,
		.Format = DXGI_FORMAT_R8G8B8A8_UNORM,
		.SampleDesc = {
			.Count = 1
		},
		.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT,
		.BufferCount = bufferCount,
#ifdef _DEBUG
		// The two presentation paths must switch seamlessly; DXGI_SCALING_NONE makes
		// mistakes easier to spot
		.Scaling = DXGI_SCALING_NONE,
#else
		// If the two presentation paths fail to switch seamlessly, DXGI_SCALING_STRETCH
		// keeps the visual change as small as possible
		.Scaling = DXGI_SCALING_STRETCH,
#endif
		// The back buffer is cleared before every frame is rendered, so
		// DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL is not needed
		.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD,
		.AlphaMode = DXGI_ALPHA_MODE_IGNORE,
		// Always enable DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING when the GPU supports it, to
		// support variable refresh rate
		.Flags = UINT((_deviceResources->IsTearingSupported() ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0)
		| DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT)
	};

	ID3D11Device5* d3dDevice = _deviceResources->GetD3DDevice();

	winrt::com_ptr<IDXGISwapChain1> dxgiSwapChain;
	HRESULT hr = _deviceResources->GetDXGIFactory()->CreateSwapChainForHwnd(
		d3dDevice,
		hwndAttach,
		&sd,
		nullptr,
		nullptr,
		dxgiSwapChain.put()
	);
	if (FAILED(hr)) {
		Logger::Get().ComError("创建交换链失败", hr);
		return false;
	}

	_dxgiSwapChain = dxgiSwapChain.try_as<IDXGISwapChain4>();
	if (!_dxgiSwapChain) {
		// The interface queried above is IDXGISwapChain4, so the message names that interface
		Logger::Get().Error("获取 IDXGISwapChain4 失败");
		return false;
	}

	// To reduce latency, allow bufferCount - 1 frames to be rendered between two vsyncs.
	// A failure here only affects latency, so it is logged and initialization continues.
	hr = _dxgiSwapChain->SetMaximumFrameLatency(bufferCount - 1);
	if (FAILED(hr)) {
		Logger::Get().ComError("SetMaximumFrameLatency 失败", hr);
	}

	_frameLatencyWaitableObject.reset(_dxgiSwapChain->GetFrameLatencyWaitableObject());
	if (!_frameLatencyWaitableObject) {
		Logger::Get().Error("GetFrameLatencyWaitableObject 失败");
		return false;
	}

	// Best effort: a failure is logged but does not abort initialization
	hr = _deviceResources->GetDXGIFactory()->MakeWindowAssociation(
		hwndAttach, DXGI_MWA_NO_ALT_ENTER);
	if (FAILED(hr)) {
		Logger::Get().ComError("MakeWindowAssociation 失败", hr);
	}

	hr = _dxgiSwapChain->GetBuffer(0, IID_PPV_ARGS(_backBuffer.put()));
	if (FAILED(hr)) {
		Logger::Get().ComError("获取后缓冲区失败", hr);
		return false;
	}

	hr = d3dDevice->CreateRenderTargetView(_backBuffer.get(), nullptr, _backBufferRtv.put());
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateRenderTargetView 失败", hr);
		return false;
	}

	return true;
}
// Hands the caller the texture and render-target view the next frame must be drawn
// into, together with the offset at which drawing has to start.
//
// frameTex, frameRtv and drawOffset are all outputs. Returns false (after logging)
// when the DirectComposition surface cannot be prepared; the swap-chain path always
// returns true.
bool AdaptivePresenter::BeginFrame(
	winrt::com_ptr<ID3D11Texture2D>& frameTex,
	winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
	POINT& drawOffset
) noexcept {
	if (!_isDCompPresenting) {
		// Swap-chain path: the back buffer is drawn from its origin.
		drawOffset = {};

		// Wait on the frame-latency object at most once per presented frame
		// (EndFrame clears the flag after Present).
		if (!_isframeLatencyWaited) {
			_frameLatencyWaitableObject.wait(1000);
			_isframeLatencyWaited = true;
		}

		frameTex = _backBuffer;
		frameRtv = _backBufferRtv;
		return true;
	}

	// DirectComposition path: BeginDraw yields the texture to draw into and the
	// offset inside it.
	const HRESULT beginDrawResult =
		_dcompSurface->BeginDraw(nullptr, IID_PPV_ARGS(&frameTex), &drawOffset);
	if (FAILED(beginDrawResult)) {
		Logger::Get().ComError("BeginDraw 失败", beginDrawResult);
		return false;
	}

	// A render-target view is created for the returned texture on every frame.
	const HRESULT createRtvResult = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
		frameTex.get(), nullptr, frameRtv.put());
	if (FAILED(createRtvResult)) {
		Logger::Get().ComError("CreateRenderTargetView 失败", createRtvResult);
		return false;
	}

	return true;
}
// Finishes the frame started by BeginFrame and submits it: ends drawing on the
// DirectComposition surface and commits the composition device, or presents the
// swap chain.
//
// When waitForGpu is true, or a resize has been flagged (_isResized), submission is
// delayed until _WaitForGpu() and Win32Helper::WaitForDwmComposition() have returned.
void AdaptivePresenter::EndFrame(bool waitForGpu) noexcept {
if (_isDCompPresenting) {
_dcompSurface->EndDraw();
}
if (waitForGpu || _isResized) {
_isResized = false;
// The two calls below reduce edge flicker while the window is being resized.
//
// Ideally DWM would composite the new frame exactly when it draws the new window
// frame, but that is outside our control, especially on hybrid-graphics systems
// where frame data has to be transferred between GPUs and we cannot predict how long
// after Present/Commit DWM receives it. All we can do is leave DWM as much time as
// possible to composite the new frame, which takes two steps:
//
// 1. First wait for rendering to finish so the new frame is available to DWM at any time.
// 2. Then submit at the start of a new composition cycle, giving DWM more time to
//    composite the new frame.
//
// For now it seems impossible to get rid of the flicker completely without DWM's
// cooperation, which is what UWP gets.
//
// https://github.com/Blinue/Magpie/pull/1071#issuecomment-2718314731 discusses how UWP
// resizes; testing shows it eliminates the flicker entirely. It relies on very unstable
// private interfaces, however, so it has no practical value.
// Wait for rendering to finish.
_WaitForGpu();
// Wait for DWM to start compositing a new frame.
Win32Helper::WaitForDwmComposition();
}
if (_isDCompPresenting) {
_dcompDevice->Commit();
} else {
// Several frames may be rendered between two vertical syncs. With SyncInterval = 0
// only the newest frame is presented and older frames are dropped.
_dxgiSwapChain->Present(0, 0);
// Allow BeginFrame/_ResizeSwapChain to wait on the frame-latency object again.
_isframeLatencyWaited = false;
// Discard the contents of the render target.
_deviceResources->GetD3DDC()->DiscardView(_backBufferRtv.get());
if (_isSwitchingToSwapChain) {
_isSwitchingToSwapChain = false;
// Wait until the swap chain has presented the new frame.
_WaitForGpu();
Win32Helper::WaitForDwmComposition();
// Clear the DirectComposition content.
_dcompVisual->SetContent(nullptr);
_dcompDevice->Commit();
}
}
}
// Reacts to a change of the renderer size.
//
// While the scaling window is being resized or moved, or when there is no swap chain
// at all, presentation goes through DirectComposition. If that fails, the swap chain
// is used as a fallback when one exists. Returns false when neither presentation path
// could be resized.
bool AdaptivePresenter::OnResize() noexcept {
	_isResized = true;

	const bool wantsDComp =
		ScalingWindow::Get().IsResizingOrMoving() || !_dxgiSwapChain;
	if (wantsDComp) {
		// Switch to DirectComposition presentation; fall back to the swap chain on failure.
		const bool dcompResized = _ResizeDCompVisual();
		_isDCompPresenting = dcompResized;
		if (dcompResized) {
			return true;
		}

		Logger::Get().Error("_ResizeDCompVisual 失败");

		// No swap chain exists when DirectFlip is disabled, so there is nothing to fall back to.
		if (!_dxgiSwapChain) {
			return false;
		}
	}

	const bool swapChainResized = _ResizeSwapChain();
	if (!swapChainResized) {
		Logger::Get().Error("_ResizeSwapChain 失败");
	}
	return swapChainResized;
}
// Called when an interactive resize ends. If frames are currently presented through
// DirectComposition and a swap chain exists, switches back to the swap chain and asks
// the caller (via shouldRedraw) to render a new frame; otherwise does nothing.
//
// NOTE(review): the result of _ResizeSwapChain() is not checked here, so a failed
// resize still flips presentation back to the swap chain.
void AdaptivePresenter::OnEndResize(bool& shouldRedraw) noexcept {
	const bool canSwitchBack = _isDCompPresenting && _dxgiSwapChain;
	shouldRedraw = canSwitchBack;
	if (!canSwitchBack) {
		return;
	}

	_ResizeSwapChain();
	_isDCompPresenting = false;

	// The DirectComposition content is cleared only after the swap chain has presented
	// a new frame (see EndFrame), so that the switch is seamless.
	_isSwitchingToSwapChain = true;
}
bool AdaptivePresenter::_ResizeSwapChain() noexcept {
assert(_dxgiSwapChain);
if (!_isframeLatencyWaited) {
_frameLatencyWaitableObject.wait(1000);
_isframeLatencyWaited = true;
}
_backBuffer = nullptr;
_backBufferRtv = nullptr;
const RECT& swapChainRect = ScalingWindow::Get().RendererRect();
const SIZE swapChainSize = Win32Helper::GetSizeOfRect(swapChainRect);
HRESULT hr = _dxgiSwapChain->ResizeBuffers(
0,
(UINT)swapChainSize.cx,
(UINT)swapChainSize.cy,
DXGI_FORMAT_UNKNOWN,
UINT((_deviceResources->IsTearingSupported() ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0)
| DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT)
);
if (FAILED(hr)) {
Logger::Get().ComError("ResizeBuffers 失败", hr);
return false;
}
hr = _dxgiSwapChain->GetBuffer(0, IID_PPV_ARGS(_backBuffer.put()));
if (FAILED(hr)) {
Logger::Get().ComError("获取后缓冲区失败", hr);
return false;
}
hr = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
_backBuffer.get(), nullptr, _backBufferRtv.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateRenderTargetView 失败", hr);
return false;
}
return true;
}
// Makes the DirectComposition surface match the current renderer size and attaches it
// to the visual.
//
// On first use this creates the whole DirectComposition chain (device, target for the
// window, root visual, virtual surface). Afterwards it only resizes the existing
// virtual surface.
//
// hwndAttach: window to attach the composition target to. When null, the window is
//             taken from the swap chain.
// Returns false (after logging) if any step fails.
bool AdaptivePresenter::_ResizeDCompVisual(HWND hwndAttach) noexcept {
	const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());

	if (_dcompSurface) {
		// IDCompositionVirtualSurface is used instead of IDCompositionSurface because
		// IDCompositionDevice2::CreateSurface is sometimes quite slow, taking tens of
		// milliseconds in the worst case.
		HRESULT hr = _dcompSurface->Resize((UINT)rendererSize.cx, (UINT)rendererSize.cy);
		if (FAILED(hr)) {
			Logger::Get().ComError("Resize 失败", hr);
			return false;
		}
	} else {
		// Initialize DirectComposition.
		//
		// Everything is built into locals and published to the members only once the
		// whole chain exists. Writing into the members directly would leave them
		// partially filled after a mid-way failure; because _dcompSurface would still
		// be null, the next call would re-run this branch and call put() on non-empty
		// com_ptrs (an assertion in debug builds, a leaked reference otherwise).
		winrt::com_ptr<IDCompositionDesktopDevice> dcompDevice;
		HRESULT hr = DCompositionCreateDevice3(
			_deviceResources->GetD3DDevice(), IID_PPV_ARGS(&dcompDevice));
		if (FAILED(hr)) {
			Logger::Get().ComError("DCompositionCreateDevice3 失败", hr);
			return false;
		}

		if (!hwndAttach) {
			// Initialization during a resize only happens when DirectFlip is not
			// disabled, so a swap chain is guaranteed to exist here.
			hr = _dxgiSwapChain->GetHwnd(&hwndAttach);
			if (FAILED(hr)) {
				Logger::Get().ComError("GetHwnd 失败", hr);
				return false;
			}
		}

		winrt::com_ptr<IDCompositionTarget> dcompTarget;
		hr = dcompDevice->CreateTargetForHwnd(hwndAttach, TRUE, dcompTarget.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateTargetForHwnd 失败", hr);
			return false;
		}

		winrt::com_ptr<IDCompositionVisual2> dcompVisual;
		hr = dcompDevice->CreateVisual(dcompVisual.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateVisual 失败", hr);
			return false;
		}

		hr = dcompTarget->SetRoot(dcompVisual.get());
		if (FAILED(hr)) {
			Logger::Get().ComError("SetRoot 失败", hr);
			return false;
		}

		winrt::com_ptr<IDCompositionVirtualSurface> dcompSurface;
		hr = dcompDevice->CreateVirtualSurface(
			(UINT)rendererSize.cx,
			(UINT)rendererSize.cy,
			DXGI_FORMAT_R8G8B8A8_UNORM,
			DXGI_ALPHA_MODE_IGNORE,
			dcompSurface.put()
		);
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateVirtualSurface 失败", hr);
			return false;
		}

		// All objects were created: publish them together.
		_dcompDevice = dcompDevice;
		_dcompTarget = dcompTarget;
		_dcompVisual = dcompVisual;
		_dcompSurface = dcompSurface;
	}

	// (Re)attach the surface; EndFrame may have detached it with SetContent(nullptr).
	HRESULT hr = _dcompVisual->SetContent(_dcompSurface.get());
	if (FAILED(hr)) {
		Logger::Get().ComError("SetContent 失败", hr);
		return false;
	}

	return true;
}
}

View file

@ -1,49 +0,0 @@
#pragma once
#include "PresenterBase.h"
#include <dcomp.h>
namespace Magpie {
// Switches between swap-chain and DirectComposition presentation as needed. A swap
// chain can trigger DirectFlip/IndependentFlip to minimize latency, while
// DirectComposition flickers less during resizing; this presenter aims to combine
// the strengths of both.
class AdaptivePresenter final : public PresenterBase {
protected:
// Presenter initialization hook; hwndAttach is the window frames are presented to.
bool _Initialize(HWND hwndAttach) noexcept override;
public:
// Provides the texture/RTV to draw the next frame into and the draw offset.
bool BeginFrame(
winrt::com_ptr<ID3D11Texture2D>& frameTex,
winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
POINT& drawOffset
) noexcept override;
// Submits the frame, optionally waiting for the GPU first.
void EndFrame(bool waitForGpu = false) noexcept override;
// Called when the renderer size changes.
bool OnResize() noexcept override;
// Called when an interactive resize ends; shouldRedraw is an output.
void OnEndResize(bool& shouldRedraw) noexcept override;
private:
bool _ResizeSwapChain() noexcept;
// With the default (null) window, the window is taken from the swap chain.
bool _ResizeDCompVisual(HWND hwndAttach = NULL) noexcept;
// Swap-chain presentation state.
winrt::com_ptr<IDXGISwapChain4> _dxgiSwapChain;
wil::unique_event_nothrow _frameLatencyWaitableObject;
winrt::com_ptr<ID3D11Texture2D> _backBuffer;
winrt::com_ptr<ID3D11RenderTargetView> _backBufferRtv;
// Used while resizing or when DirectFlip is disabled.
winrt::com_ptr<IDCompositionDesktopDevice> _dcompDevice;
winrt::com_ptr<IDCompositionTarget> _dcompTarget;
winrt::com_ptr<IDCompositionVisual2> _dcompVisual;
winrt::com_ptr<IDCompositionVirtualSurface> _dcompSurface;
// True while frames are presented through DirectComposition.
bool _isDCompPresenting = false;
// Set by OnResize; consumed by the next EndFrame.
bool _isResized = false;
// True once the frame-latency object has been waited on for the current frame.
bool _isframeLatencyWaited = false;
// Set by OnEndResize; the next swap-chain EndFrame clears the DirectComposition content.
bool _isSwitchingToSwapChain = false;
};
}

View file

@ -0,0 +1,90 @@
#include "pch.h"
#include "AppFolderManager.h"
#include "CommonSharedConstants.h"
#include "Win32Helper.h"
#include <ShlObj.h>
#define APP_DIR L"app"
#define DATA_DIR L"data"
namespace Magpie {
bool AppFolderManager::Initialize() noexcept {
_exeDir = Win32Helper::GetExePath().parent_path();
if (_exeDir.empty()) {
return false;
}
// dll 搜索路径中添加 app 文件夹以及排除当前目录
if (!SetDefaultDllDirectories(LOAD_LIBRARY_SEARCH_DEFAULT_DIRS)) {
return false;
}
if (!AddDllDirectory((_exeDir / APP_DIR).c_str())) {
return false;
}
// 若程序所在目录存在配置文件则为便携模式
_isPortableMode = Win32Helper::FileExists(StrHelper::Concat(
_exeDir.native(), L"\\" DATA_DIR L"\\config\\", CommonSharedConstants::CONFIG_FILENAME).c_str());
// 旧版本便携模式配置文件位置
_isPortableMode = _isPortableMode || Win32Helper::FileExists(StrHelper::Concat(
_exeDir.native(), L"\\config\\", CommonSharedConstants::CONFIG_FILENAME).c_str());
if (_isPortableMode) {
_workingDir = _exeDir / DATA_DIR;
} else {
wil::unique_cotaskmem_string localAppDataDir;
HRESULT hr = SHGetKnownFolderPath(
FOLDERID_LocalAppData, KF_FLAG_DEFAULT, NULL, localAppDataDir.put());
if (FAILED(hr)) {
return false;
}
_workingDir = StrHelper::Concat(localAppDataDir.get(), L"\\Magpie\\" DATA_DIR);
}
Win32Helper::CreateDir(_workingDir.c_str());
if (!SetCurrentDirectory(_workingDir.c_str())) {
return false;
}
return true;
}
// Returns the "app" folder inside the executable's directory (the folder that
// Initialize adds to the DLL search path).
std::filesystem::path AppFolderManager::GetAppDir() const noexcept {
return _exeDir / APP_DIR;
}
// Returns the relative folder name used for log files. Being relative, it resolves
// against the current directory, which Initialize sets to the working directory.
const wchar_t* AppFolderManager::GetLogsDir() const noexcept {
return L"logs";
}
// Returns the relative folder name "sources". Being relative, it resolves against
// the current directory, which Initialize sets to the working directory.
const wchar_t* AppFolderManager::GetSourcesDir() const noexcept {
return L"sources";
}
// Returns the relative folder name "cache". Being relative, it resolves against
// the current directory, which Initialize sets to the working directory.
const wchar_t* AppFolderManager::GetCacheDir() const noexcept {
return L"cache";
}
// Returns the relative folder name "config". Being relative, it resolves against
// the current directory, which Initialize sets to the working directory.
const wchar_t* AppFolderManager::GetConfigDir() const noexcept {
return L"config";
}
// Returns <exe dir>\app\effects\shaders. APP_DIR and the following literal are
// adjacent wide string literals, so they form a single path component string.
std::filesystem::path AppFolderManager::GetBuiltInShaderEffectsDir() const noexcept {
return _exeDir / APP_DIR L"\\effects\\shaders";
}
// Returns <exe dir>\app\D3D12.
std::filesystem::path AppFolderManager::GetD3D12Dir() const noexcept {
return _exeDir / APP_DIR L"\\D3D12";
}
// Returns the update folder, a direct child of the executable's directory.
std::filesystem::path AppFolderManager::GetUpdateDir() const noexcept {
// Located in the root directory; only used when a non-packaged app is updated.
return _exeDir / CommonSharedConstants::UPDATE_DIR;
}
}

View file

@ -1,75 +0,0 @@
#include "pch.h"
#include "BackendDescriptorStore.h"
#include "Logger.h"
namespace Magpie {
// Returns a shader resource view for the texture, creating and caching it on first
// request. The view is created with a null description. The returned pointer is
// owned by the cache. Returns nullptr (after logging) if creation fails; failures
// are not cached.
ID3D11ShaderResourceView* BackendDescriptorStore::GetShaderResourceView(ID3D11Texture2D* texture) noexcept {
	const auto cached = _srvMap.find(texture);
	if (cached != _srvMap.end()) {
		return cached->second.get();
	}

	winrt::com_ptr<ID3D11ShaderResourceView> newView;
	const HRESULT createResult =
		_d3dDevice->CreateShaderResourceView(texture, nullptr, newView.put());
	if (FAILED(createResult)) {
		Logger::Get().ComError("CreateShaderResourceView 失败", createResult);
		return nullptr;
	}

	const auto inserted = _srvMap.emplace(texture, std::move(newView)).first;
	return inserted->second.get();
}
// Returns an unordered access view for a 2D texture, creating and caching it on first
// request. The description only sets the view dimension; every other field is
// zero-initialized. Returns nullptr (after logging) if creation fails.
ID3D11UnorderedAccessView* BackendDescriptorStore::GetUnorderedAccessView(ID3D11Texture2D* texture) noexcept {
	const auto cached = _uavMap.find(texture);
	if (cached != _uavMap.end()) {
		return cached->second.get();
	}

	D3D11_UNORDERED_ACCESS_VIEW_DESC viewDesc{
		.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D
	};

	winrt::com_ptr<ID3D11UnorderedAccessView> newView;
	const HRESULT createResult =
		_d3dDevice->CreateUnorderedAccessView(texture, &viewDesc, newView.put());
	if (FAILED(createResult)) {
		Logger::Get().ComError("CreateUnorderedAccessView 失败", createResult);
		return nullptr;
	}

	const auto inserted = _uavMap.emplace(texture, std::move(newView)).first;
	return inserted->second.get();
}
// Returns an unordered access view for a buffer, creating and caching it on first
// request.
//
// numElements and format are only used when the view is created. The cache is keyed
// by the buffer pointer alone, so a later call with different values returns the view
// that was created first. Returns nullptr (after logging) if creation fails.
ID3D11UnorderedAccessView* BackendDescriptorStore::GetUnorderedAccessView(
	ID3D11Buffer* buffer,
	uint32_t numElements,
	DXGI_FORMAT format
) noexcept {
	const auto cached = _uavMap.find(buffer);
	if (cached != _uavMap.end()) {
		return cached->second.get();
	}

	D3D11_UNORDERED_ACCESS_VIEW_DESC viewDesc{
		.Format = format,
		.ViewDimension = D3D11_UAV_DIMENSION_BUFFER,
		.Buffer{
			.NumElements = numElements
		}
	};

	winrt::com_ptr<ID3D11UnorderedAccessView> newView;
	const HRESULT createResult =
		_d3dDevice->CreateUnorderedAccessView(buffer, &viewDesc, newView.put());
	if (FAILED(createResult)) {
		Logger::Get().ComError("CreateUnorderedAccessView 失败", createResult);
		return nullptr;
	}

	const auto inserted = _uavMap.emplace(buffer, std::move(newView)).first;
	return inserted->second.get();
}
// Drops the cached shader resource view and unordered access view (if any) that were
// created for the given texture.
void BackendDescriptorStore::RemoveCache(ID3D11Texture2D* texture) noexcept {
_srvMap.erase(texture);
_uavMap.erase(texture);
}
}

View file

@ -1,35 +0,0 @@
#pragma once
#include <parallel_hashmap/phmap.h>
namespace Magpie {
// Cache of Direct3D 11 views (SRVs and UAVs) keyed by the resource they were created
// for. Views are created on first request and reused afterwards.
class BackendDescriptorStore {
public:
BackendDescriptorStore() = default;
// Not copyable, but movable.
BackendDescriptorStore(const BackendDescriptorStore&) = delete;
BackendDescriptorStore(BackendDescriptorStore&&) = default;
// Stores the device used to create views. The pointer is not owned (raw pointer).
void Initialize(ID3D11Device5* d3dDevice) noexcept {
_d3dDevice = d3dDevice;
}
// Returns the cached SRV for the texture, creating it if necessary.
ID3D11ShaderResourceView* GetShaderResourceView(ID3D11Texture2D* texture) noexcept;
// Returns the cached UAV for the texture, creating it if necessary.
ID3D11UnorderedAccessView* GetUnorderedAccessView(ID3D11Texture2D* texture) noexcept;
// Returns the cached UAV for the buffer, creating it if necessary.
ID3D11UnorderedAccessView* GetUnorderedAccessView(
ID3D11Buffer* buffer,
uint32_t numElements,
DXGI_FORMAT format = DXGI_FORMAT_UNKNOWN
) noexcept;
// Removes the cached views of the texture.
void RemoveCache(ID3D11Texture2D* texture) noexcept;
private:
ID3D11Device5* _d3dDevice = nullptr;
phmap::flat_hash_map<ID3D11Texture2D*, winrt::com_ptr<ID3D11ShaderResourceView>> _srvMap;
// Keyed by void* because both textures and buffers are stored in this map.
phmap::flat_hash_map<void*, winrt::com_ptr<ID3D11UnorderedAccessView>> _uavMap;
};
}

View file

@ -0,0 +1,277 @@
#include "pch.h"
#include "CatmullRomDrawer.h"
#include "CommandContext.h"
#include "D3D12Context.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "shaders/CatmullRomCS.h"
#include "shaders/CatmullRomCS_SM5.h"
#include "shaders/CatmullRomCS_sRGB.h"
#include "shaders/CatmullRomCS_sRGB_SM5.h"
#include "shaders/CopyCS.h"
#include "shaders/CopyCS_SM5.h"
#include "shaders/CopyCS_sRGB.h"
#include "shaders/CopyCS_sRGB_SM5.h"
namespace Magpie {
// Remembers the D3D12 context. Root signatures and pipeline states are not created
// here; Draw creates them on first use.
void CatmullRomDrawer::Initialize(D3D12Context& d3d12Context) noexcept {
_d3d12Context = &d3d12Context;
}
// Records a compute dispatch that scales the input texture to the output texture with
// the Catmull-Rom shader, or copies it with the copy shader when both sizes are equal.
//
// computeContext:  context the commands are recorded into.
// inputSize:       size of the input texture.
// outputSize:      size of the output texture; also determines the dispatch size.
// inputSrvOffset:  descriptor offset of the input SRV.
// outputUavOffset: descriptor offset of the output UAV.
// outputSrgb:      selects the "_sRGB" shader variants.
//
// Root signatures and pipeline states are created lazily the first time they are
// needed. Returns the failing HRESULT (after logging) if creation fails, S_OK otherwise.
HRESULT CatmullRomDrawer::Draw(
	ComputeContext& computeContext,
	SizeU inputSize,
	SizeU outputSize,
	uint32_t inputSrvOffset,
	uint32_t outputUavOffset,
	bool outputSrgb
) noexcept {
	// Creates the compute pipeline state held by `pso` if it does not exist yet.
	// `selectShader` receives whether shader model 6.0 or later is available and
	// returns the bytecode to use; it is only invoked when the PSO has to be created.
	const auto ensurePipelineState = [this](
		winrt::com_ptr<ID3D12PipelineState>& pso,
		ID3D12RootSignature* rootSignature,
		auto&& selectShader
	) noexcept -> HRESULT {
		if (pso) {
			return S_OK;
		}

		D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {
			.pRootSignature = rootSignature,
			.CS = selectShader(_d3d12Context->GetShaderModel() >= D3D_SHADER_MODEL_6_0)
		};
		const HRESULT hr = _d3d12Context->GetDevice()->CreateComputePipelineState(
			&psoDesc, IID_PPV_ARGS(&pso));
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateComputePipelineState 失败", hr);
		}
		return hr;
	};

	// As a performance optimization, the input is copied verbatim when the input and
	// output sizes are equal.
	if (inputSize == outputSize) {
		if (!_copyRootSignature) {
			HRESULT hr = _InitializeCopyRootSignature();
			if (FAILED(hr)) {
				Logger::Get().ComError("_InitializeCopyRootSignature 失败", hr);
				return hr;
			}
		}

		computeContext.SetRootSignature(_copyRootSignature.get());

		winrt::com_ptr<ID3D12PipelineState>& pso = outputSrgb ? _copySrgbPSO : _copyPSO;
		const HRESULT hr = outputSrgb
			? ensurePipelineState(pso, _copyRootSignature.get(), [](bool isSM6Supported) {
				return DirectXHelper::SelectShader(isSM6Supported, CopyCS_sRGB, CopyCS_sRGB_SM5);
			})
			: ensurePipelineState(pso, _copyRootSignature.get(), [](bool isSM6Supported) {
				return DirectXHelper::SelectShader(isSM6Supported, CopyCS, CopyCS_SM5);
			});
		if (FAILED(hr)) {
			return hr;
		}

		computeContext.SetPipelineState(pso.get());

		// Copy root signature layout: 0 = input SRV table, 1 = output UAV table.
		computeContext.SetRootDescriptorTable(0, inputSrvOffset);
		computeContext.SetRootDescriptorTable(1, outputUavOffset);
	} else {
		if (!_catmullRomRootSignature) {
			HRESULT hr = _InitializeCatmullRomRootSignature();
			if (FAILED(hr)) {
				Logger::Get().ComError("_InitializeCatmullRomRootSignature 失败", hr);
				return hr;
			}
		}

		computeContext.SetRootSignature(_catmullRomRootSignature.get());

		winrt::com_ptr<ID3D12PipelineState>& pso = outputSrgb ? _catmullRomSrgbPSO : _catmullRomPSO;
		const HRESULT hr = outputSrgb
			? ensurePipelineState(pso, _catmullRomRootSignature.get(), [](bool isSM6Supported) {
				return DirectXHelper::SelectShader(isSM6Supported, CatmullRomCS_sRGB, CatmullRomCS_sRGB_SM5);
			})
			: ensurePipelineState(pso, _catmullRomRootSignature.get(), [](bool isSM6Supported) {
				return DirectXHelper::SelectShader(isSM6Supported, CatmullRomCS, CatmullRomCS_SM5);
			});
		if (FAILED(hr)) {
			return hr;
		}

		computeContext.SetPipelineState(pso.get());

		// Six 32-bit root constants: input size in texels, then the reciprocals of the
		// input and output sizes. The count must match the root signature.
		DirectXHelper::Constant32 constants[] = {
			{.uintVal = inputSize.width},
			{.uintVal = inputSize.height},
			{.floatVal = 1.0f / inputSize.width},
			{.floatVal = 1.0f / inputSize.height},
			{.floatVal = 1.0f / outputSize.width},
			{.floatVal = 1.0f / outputSize.height}
		};
		computeContext.SetRoot32BitConstants(0, (UINT)std::size(constants), constants);

		// Catmull-Rom root signature layout: 0 = constants, 1 = input SRV table,
		// 2 = output UAV table.
		computeContext.SetRootDescriptorTable(1, inputSrvOffset);
		computeContext.SetRootDescriptorTable(2, outputUavOffset);
	}

	// One thread group per BLOCK_SIZE x BLOCK_SIZE output tile, rounded up.
	constexpr uint32_t BLOCK_SIZE = 16;
	computeContext.Dispatch(
		(outputSize.width + BLOCK_SIZE - 1) / BLOCK_SIZE,
		(outputSize.height + BLOCK_SIZE - 1) / BLOCK_SIZE
	);

	return S_OK;
}
// Creates the root signature used by the Catmull-Rom shaders and stores it in
// _catmullRomRootSignature.
//
// Layout: parameter 0 = six 32-bit root constants, parameter 1 = descriptor table with
// one SRV, parameter 2 = descriptor table with one UAV, plus one static sampler
// (linear filtering, clamp addressing) at shader register 0.
//
// Returns the failing HRESULT (after logging), S_OK otherwise.
HRESULT CatmullRomDrawer::_InitializeCatmullRomRootSignature() noexcept {
	CD3DX12_DESCRIPTOR_RANGE1 inputRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_STATIC_WHILE_SET_AT_EXECUTE);
	CD3DX12_DESCRIPTOR_RANGE1 outputRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_VOLATILE);

	D3D12_ROOT_PARAMETER1 parameters[] = {
		{
			.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS,
			.Constants = {
				.Num32BitValues = 6
			}
		},
		{
			.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
			.DescriptorTable = {
				.NumDescriptorRanges = 1,
				.pDescriptorRanges = &inputRange
			}
		},
		{
			.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
			.DescriptorTable = {
				.NumDescriptorRanges = 1,
				.pDescriptorRanges = &outputRange
			}
		}
	};

	D3D12_STATIC_SAMPLER_DESC linearClampSampler = {
		.Filter = D3D12_FILTER_MIN_MAG_MIP_LINEAR,
		.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP,
		.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP,
		.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP,
		.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER,
		.ShaderRegister = 0
	};

	CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC signatureDesc(
		(UINT)std::size(parameters), parameters, 1, &linearClampSampler);

	// Serialize for the root signature version the context reports.
	winrt::com_ptr<ID3DBlob> serialized;
	HRESULT result = D3DX12SerializeVersionedRootSignature(
		&signatureDesc,
		_d3d12Context->GetRootSignatureVersion(),
		serialized.put(),
		nullptr
	);
	if (FAILED(result)) {
		Logger::Get().ComError("D3DX12SerializeVersionedRootSignature 失败", result);
		return result;
	}

	result = _d3d12Context->GetDevice()->CreateRootSignature(
		0,
		serialized->GetBufferPointer(),
		serialized->GetBufferSize(),
		IID_PPV_ARGS(&_catmullRomRootSignature)
	);
	if (FAILED(result)) {
		Logger::Get().ComError("CreateRootSignature 失败", result);
		return result;
	}

	return S_OK;
}
// Creates the root signature used by the copy shaders and stores it in
// _copyRootSignature.
//
// Layout: parameter 0 = descriptor table with one SRV, parameter 1 = descriptor table
// with one UAV. No root constants and no static samplers.
//
// Returns the failing HRESULT (after logging), S_OK otherwise.
HRESULT CatmullRomDrawer::_InitializeCopyRootSignature() noexcept {
	CD3DX12_DESCRIPTOR_RANGE1 inputRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_STATIC_WHILE_SET_AT_EXECUTE);
	CD3DX12_DESCRIPTOR_RANGE1 outputRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0,
		D3D12_DESCRIPTOR_RANGE_FLAG_DATA_VOLATILE);

	D3D12_ROOT_PARAMETER1 parameters[] = {
		{
			.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
			.DescriptorTable = {
				.NumDescriptorRanges = 1,
				.pDescriptorRanges = &inputRange
			}
		},
		{
			.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE,
			.DescriptorTable = {
				.NumDescriptorRanges = 1,
				.pDescriptorRanges = &outputRange
			}
		}
	};

	CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC signatureDesc(
		(UINT)std::size(parameters), parameters, 0, nullptr);

	// Serialize for the root signature version the context reports.
	winrt::com_ptr<ID3DBlob> serialized;
	HRESULT result = D3DX12SerializeVersionedRootSignature(
		&signatureDesc,
		_d3d12Context->GetRootSignatureVersion(),
		serialized.put(),
		nullptr
	);
	if (FAILED(result)) {
		Logger::Get().ComError("D3DX12SerializeVersionedRootSignature 失败", result);
		return result;
	}

	result = _d3d12Context->GetDevice()->CreateRootSignature(
		0,
		serialized->GetBufferPointer(),
		serialized->GetBufferSize(),
		IID_PPV_ARGS(&_copyRootSignature)
	);
	if (FAILED(result)) {
		Logger::Get().ComError("CreateRootSignature 失败", result);
		return result;
	}

	return S_OK;
}
}

View file

@ -0,0 +1,36 @@
#pragma once
namespace Magpie {
class D3D12Context;
class ComputeContext;
// Records compute work that scales a texture with a Catmull-Rom shader, or copies it
// when no scaling is needed. Root signatures and pipeline states are created lazily.
class CatmullRomDrawer {
public:
// Stores the D3D12 context; must be called before Draw.
void Initialize(D3D12Context& d3d12Context) noexcept;
// Records the dispatch into computeContext. The offsets are descriptor offsets of
// the input SRV and the output UAV; outputSrgb selects the sRGB shader variants.
HRESULT Draw(
ComputeContext& computeContext,
SizeU inputSize,
SizeU outputSize,
uint32_t inputSrvOffset,
uint32_t outputUavOffset,
bool outputSrgb
) noexcept;
private:
// Not owned.
D3D12Context* _d3d12Context = nullptr;
HRESULT _InitializeCatmullRomRootSignature() noexcept;
HRESULT _InitializeCopyRootSignature() noexcept;
// Used when the input and output sizes differ.
winrt::com_ptr<ID3D12RootSignature> _catmullRomRootSignature;
winrt::com_ptr<ID3D12PipelineState> _catmullRomPSO;
winrt::com_ptr<ID3D12PipelineState> _catmullRomSrgbPSO;
// Used when the input and output sizes are equal.
winrt::com_ptr<ID3D12RootSignature> _copyRootSignature;
winrt::com_ptr<ID3D12PipelineState> _copyPSO;
winrt::com_ptr<ID3D12PipelineState> _copySrgbPSO;
};
}

View file

@ -0,0 +1,24 @@
#pragma once
namespace Magpie {
struct ColorHelper {
	// Converts an 8-bit sRGB-encoded channel value to linear light using the piecewise
	// sRGB transfer function (linear segment up to 0.04045, 2.4 power curve above).
	//
	// NOTE: the result stays on the 0-255 scale (0 maps to 0.0f, 255 to 255.0f); it is
	// not normalized to [0, 1].
	static float SrgbToLinear(uint8_t c) noexcept {
		// All 256 possible results are computed once, on the first call. The table is
		// const because it is never modified afterwards.
		static const std::array<float, 256> lut = [] {
			std::array<float, 256> result{};
			for (uint32_t i = 0; i < 256; ++i) {
				// Normalized sRGB value in [0, 1].
				const float encoded = i / 255.0f;
				if (encoded <= 0.04045f) {
					result[i] = encoded / 12.92f * 255.0f;
				} else {
					result[i] = std::pow((encoded + 0.055f) / 1.055f, 2.4f) * 255.0f;
				}
			}
			return result;
		}();

		return lut[c];
	}
};
}

View file

@ -0,0 +1,129 @@
#include "pch.h"
#include "CommandContext.h"
#include "DescriptorHeap.h"
namespace Magpie {
// Binds the compute root signature on the command list.
void ComputeContext::SetRootSignature(ID3D12RootSignature* rootSignature) noexcept {
_commandList->SetComputeRootSignature(rootSignature);
}
// Sets constantCount 32-bit compute root constants from pData, starting at
// destination offset 0 of the given root parameter.
void ComputeContext::SetRoot32BitConstants(
uint32_t rootParameterIndex,
uint32_t constantCount,
const void* pData
) noexcept {
_commandList->SetComputeRoot32BitConstants(rootParameterIndex, constantCount, pData, 0);
}
// Binds a constant buffer view (by GPU virtual address) to a compute root parameter.
// Pending resource barriers are submitted first when root signature version 1.1 or
// later is in use.
void ComputeContext::SetComputeRootConstantBufferView(
	uint32_t rootParameterIndex,
	D3D12_GPU_VIRTUAL_ADDRESS bufferLocation
) noexcept {
	// When DATA_STATIC flags are present, SetComputeRootConstantBufferView checks the
	// resource state.
	const bool isRootSignature11OrLater =
		_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1;
	if (isRootSignature11OrLater) {
		_FlushBarriers();
	}

	_commandList->SetComputeRootConstantBufferView(rootParameterIndex, bufferLocation);
}
// Binds a descriptor table to a compute root parameter. baseDescriptorOffset is an
// offset into the context's descriptor heap and must not be the uint32_t maximum,
// which is asserted against as an invalid value.
void ComputeContext::SetRootDescriptorTable(
	uint32_t rootParameterIndex,
	uint32_t baseDescriptorOffset
) noexcept {
	assert(baseDescriptorOffset != std::numeric_limits<uint32_t>::max());

	// When DATA_STATIC flags are present, SetComputeRootDescriptorTable checks the
	// resource state.
	const bool isRootSignature11OrLater =
		_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1;
	if (isRootSignature11OrLater) {
		_FlushBarriers();
	}

	const auto tableStart =
		_d3d12Context->GetDescriptorHeap().GetGpuHandle(baseDescriptorOffset);
	_commandList->SetComputeRootDescriptorTable(rootParameterIndex, tableStart);
}
// Submits any pending resource barriers, then records a compute dispatch with the
// given thread group counts.
void ComputeContext::Dispatch(
uint32_t threadGroupCountX,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ
) noexcept {
_FlushBarriers();
_commandList->Dispatch(threadGroupCountX, threadGroupCountY, threadGroupCountZ);
}
// Resets the cached command-list state. ComputeContext has no cached state of its
// own, so this only forwards to the base implementation.
void ComputeContext::ClearStateCache() noexcept {
_ClearStateCache();
}
// Binds the graphics root signature on the command list.
void GraphicsContext::SetRootSignature(ID3D12RootSignature* rootSignature) noexcept {
_commandList->SetGraphicsRootSignature(rootSignature);
}
// Sets constantCount 32-bit graphics root constants from pData, starting at
// destination offset 0 of the given root parameter.
void GraphicsContext::SetRoot32BitConstants(
uint32_t rootParameterIndex,
uint32_t constantCount,
const void* pData
) noexcept {
_commandList->SetGraphicsRoot32BitConstants(rootParameterIndex, constantCount, pData, 0);
}
// Binds a descriptor table to a graphics root parameter. baseDescriptorOffset is an
// offset into the context's descriptor heap and must not be the uint32_t maximum,
// which is asserted against as an invalid value.
void GraphicsContext::SetRootDescriptorTable(
	uint32_t rootParameterIndex,
	uint32_t baseDescriptorOffset
) noexcept {
	assert(baseDescriptorOffset != std::numeric_limits<uint32_t>::max());

	// When DATA_STATIC flags are present, SetGraphicsRootDescriptorTable checks the
	// resource state.
	const bool isRootSignature11OrLater =
		_d3d12Context->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1;
	if (isRootSignature11OrLater) {
		_FlushBarriers();
	}

	const auto tableStart =
		_d3d12Context->GetDescriptorHeap().GetGpuHandle(baseDescriptorOffset);
	_commandList->SetGraphicsRootDescriptorTable(rootParameterIndex, tableStart);
}
// Sets the primitive topology, skipping the command when the requested topology is
// the one that was set last on this context.
void GraphicsContext::IASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY trimitiveTopology) noexcept {
	if (trimitiveTopology == _curTrimitiveTopology) {
		return;
	}

	_curTrimitiveTopology = trimitiveTopology;
	_commandList->IASetPrimitiveTopology(trimitiveTopology);
}
void GraphicsContext::RSSetViewportAndScissorRect(const D3D12_RECT& rect) noexcept {
CD3DX12_VIEWPORT viewport((float)rect.left, (float)rect.top,
float(rect.right - rect.left), float(rect.bottom - rect.top));
_commandList->RSSetViewports(1, &viewport);
_commandList->RSSetScissorRects(1, &rect);
}
// Binds a single render target (no depth-stencil), skipping the command when the same
// descriptor offset was bound last. rtvDescriptorOffset must not be the uint32_t
// maximum, which this class uses to mean "nothing bound".
void GraphicsContext::OMSetRenderTarget(uint32_t rtvDescriptorOffset) noexcept {
	assert(rtvDescriptorOffset != std::numeric_limits<uint32_t>::max());

	if (rtvDescriptorOffset == _curRtvDescriptorOffset) {
		return;
	}

	_curRtvDescriptorOffset = rtvDescriptorOffset;

	// NOTE(review): the `true` argument presumably selects the RTV descriptor heap -
	// confirm against D3D12Context::GetDescriptorHeap.
	D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle =
		_d3d12Context->GetDescriptorHeap(true).GetCpuHandle(rtvDescriptorOffset);
	_commandList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr);
}
// Submits any pending resource barriers, then records a non-indexed draw of
// vertexCount vertices (one instance, starting at vertex 0 and instance 0).
void GraphicsContext::Draw(uint32_t vertexCount) noexcept {
_FlushBarriers();
_commandList->DrawInstanced(vertexCount, 1, 0, 0);
}
// Resets the cached command-list state: the base state plus the cached primitive
// topology and render target, so the next Set* call is recorded unconditionally.
void GraphicsContext::ClearStateCache() noexcept {
_ClearStateCache();
_curTrimitiveTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
_curRtvDescriptorOffset = std::numeric_limits<uint32_t>::max();
}
}

View file

@ -0,0 +1,212 @@
#pragma once
#include "D3D12Context.h"
#include "SmallVector.h"
#include "Logger.h"
namespace Magpie {
class DescriptorHeap;
// CRTP base shared by ComputeContext and GraphicsContext. It wraps the command list of
// a D3D12Context, batches transition barriers until they are needed, and caches the
// bound descriptor heap to skip redundant SetDescriptorHeaps calls.
//
// T is the derived class; it must provide ClearStateCache().
template <typename T>
class CommandContext {
public:
	CommandContext() noexcept = default;
	CommandContext(const CommandContext&) = delete;
	CommandContext(CommandContext&&) = delete;

	// Stores the context and the command list it provides. Neither is owned.
	void Initialize(D3D12Context& d3d12Context) noexcept {
		_commandList = d3d12Context.GetCommandList();
		_d3d12Context = &d3d12Context;
	}

	ID3D12GraphicsCommandList* GetCommandList() const noexcept {
		return _commandList;
	}

	// Submits pending barriers, closes the command list, executes it on commandQueue
	// and resets the derived class's state cache. Returns the HRESULT of Close() on
	// failure (after logging), S_OK otherwise.
	HRESULT Execute(ID3D12CommandQueue* commandQueue) noexcept {
		_FlushBarriers();

		const HRESULT closeResult = _commandList->Close();
		if (FAILED(closeResult)) {
			Logger::Get().ComError("ID3D12GraphicsCommandList::Close 失败", closeResult);
			return closeResult;
		}

		commandQueue->ExecuteCommandLists(1, CommandListCast(&_commandList));

		static_cast<T*>(this)->ClearStateCache();
		return S_OK;
	}

	void SetPipelineState(ID3D12PipelineState* pipelineState) noexcept {
		_commandList->SetPipelineState(pipelineState);
	}

	// Binds the descriptor heap unless it is the one that was bound last.
	void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap) noexcept {
		if (descriptorHeap == _curDescriptorHeap) {
			return;
		}

		_curDescriptorHeap = descriptorHeap;
		_commandList->SetDescriptorHeaps(1, &descriptorHeap);
	}

	ID3D12DescriptorHeap* GetCurDescriptorHeap() const noexcept {
		return _curDescriptorHeap;
	}

	// Queues a transition barrier (subresource index 0 is passed to the barrier
	// helper). The barrier is recorded by the next operation that flushes barriers.
	void InsertTransitionBarrier(
		ID3D12Resource* resource,
		D3D12_RESOURCE_STATES stateBefore,
		D3D12_RESOURCE_STATES stateAfter
	) noexcept {
#ifdef _DEBUG
		// Check for redundant state transitions: a resource must not already have a
		// pending barrier.
		const bool hasPendingBarrier = std::any_of(
			_pendingBarriers.begin(),
			_pendingBarriers.end(),
			[resource](const D3D12_RESOURCE_BARRIER& pending) {
				return pending.Transition.pResource == resource;
			}
		);
		assert(!hasPendingBarrier);
#endif

		_pendingBarriers.push_back(
			CD3DX12_RESOURCE_BARRIER::Transition(resource, stateBefore, stateAfter, 0));
	}

	// Records a buffer-to-buffer copy. Pending barriers are only submitted first when
	// shouldFlushBarriers is true.
	void CopyBufferRegion(
		ID3D12Resource* destBuffer,
		uint32_t destOffset,
		ID3D12Resource* srcBuffer,
		uint32_t srcOffset,
		uint32_t numBytes,
		bool shouldFlushBarriers
	) noexcept {
		if (shouldFlushBarriers) {
			_FlushBarriers();
		}

		_commandList->CopyBufferRegion(destBuffer, destOffset, srcBuffer, srcOffset, numBytes);
	}

	// Convenience overload: wraps both resources in copy locations and forwards to
	// the overload below.
	void CopyTextureRegion(
		ID3D12Resource* destResource,
		uint32_t dstX,
		uint32_t dstY,
		ID3D12Resource* srcResource,
		const D3D12_BOX* pSrcBox = nullptr
	) noexcept {
		CopyTextureRegion(
			CD3DX12_TEXTURE_COPY_LOCATION(destResource),
			dstX,
			dstY,
			CD3DX12_TEXTURE_COPY_LOCATION(srcResource),
			pSrcBox
		);
	}

	// Submits pending barriers, then records a texture copy to (dstX, dstY, 0).
	void CopyTextureRegion(
		const CD3DX12_TEXTURE_COPY_LOCATION& dest,
		uint32_t dstX,
		uint32_t dstY,
		const CD3DX12_TEXTURE_COPY_LOCATION& src,
		const D3D12_BOX* pSrcBox = nullptr
	) noexcept {
		_FlushBarriers();
		_commandList->CopyTextureRegion(&dest, dstX, dstY, 0, &src, pSrcBox);
	}

	// Records a discard of the whole resource (null region). Pending barriers are not
	// flushed here.
	void DiscardResource(ID3D12Resource* pResource) noexcept {
		_commandList->DiscardResource(pResource, nullptr);
	}

protected:
	// Submits pending barriers and forgets the cached descriptor heap.
	void _ClearStateCache() noexcept {
		_FlushBarriers();
		_curDescriptorHeap = nullptr;
	}

	// Records all queued barriers in a single ResourceBarrier call, if there are any.
	void _FlushBarriers() noexcept {
		if (_pendingBarriers.empty()) {
			return;
		}

		_commandList->ResourceBarrier(
			(UINT)_pendingBarriers.size(), _pendingBarriers.data());
		_pendingBarriers.clear();
	}

	D3D12Context* _d3d12Context = nullptr;
	ID3D12GraphicsCommandList* _commandList = nullptr;
	ID3D12DescriptorHeap* _curDescriptorHeap = nullptr;
	SmallVector<D3D12_RESOURCE_BARRIER, 0> _pendingBarriers;
};
// Command context for compute work: binds compute root arguments and records
// dispatches on the shared command list.
class ComputeContext : public CommandContext<ComputeContext> {
public:
// Binds the compute root signature.
void SetRootSignature(ID3D12RootSignature* rootSignature) noexcept;
// Sets 32-bit root constants for the given root parameter.
void SetRoot32BitConstants(
uint32_t rootParameterIndex,
uint32_t constantCount,
const void* pData
) noexcept;
// Binds a constant buffer view by GPU virtual address.
void SetComputeRootConstantBufferView(
uint32_t rootParameterIndex,
D3D12_GPU_VIRTUAL_ADDRESS bufferLocation
) noexcept;
// Binds a descriptor table given its offset in the descriptor heap.
void SetRootDescriptorTable(
uint32_t rootParameterIndex,
uint32_t baseDescriptorOffset
) noexcept;
// Records a dispatch; Y and Z group counts default to 1.
void Dispatch(
uint32_t threadGroupCountX,
uint32_t threadGroupCountY = 1,
uint32_t threadGroupCountZ = 1
) noexcept;
// Called by CommandContext::Execute after the command list was submitted.
void ClearStateCache() noexcept;
private:
};
// Command context for graphics work: binds graphics root arguments and fixed-function
// state, and records draws on the shared command list. The primitive topology and the
// bound render target are cached to skip redundant commands.
class GraphicsContext : public CommandContext<GraphicsContext> {
public:
// Binds the graphics root signature.
void SetRootSignature(ID3D12RootSignature* rootSignature) noexcept;
// Sets 32-bit root constants for the given root parameter.
void SetRoot32BitConstants(
uint32_t rootParameterIndex,
uint32_t constantCount,
const void* pData
) noexcept;
// Binds a descriptor table given its offset in the descriptor heap.
void SetRootDescriptorTable(
uint32_t rootParameterIndex,
uint32_t baseDescriptorOffset
) noexcept;
// Sets the primitive topology if it differs from the cached one.
void IASetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY trimitiveTopology) noexcept;
// Sets one viewport and one scissor rectangle covering rect.
void RSSetViewportAndScissorRect(const D3D12_RECT& rect) noexcept;
// Binds one render target by descriptor offset if it differs from the cached one.
void OMSetRenderTarget(uint32_t rtvDescriptorOffset) noexcept;
// Returns the cached render target offset (uint32_t maximum when none is bound).
uint32_t OMGetRenderTarget() const noexcept {
return _curRtvDescriptorOffset;
}
// Records a non-indexed, single-instance draw.
void Draw(uint32_t vertexCount) noexcept;
// Called by CommandContext::Execute after the command list was submitted.
void ClearStateCache() noexcept;
private:
D3D12_PRIMITIVE_TOPOLOGY _curTrimitiveTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
uint32_t _curRtvDescriptorOffset = std::numeric_limits<uint32_t>::max();
};
}

View file

@ -1,263 +0,0 @@
#include "pch.h"
#include "CompSwapchainPresenter.h"
#include "DeviceResources.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "Win32Helper.h"
namespace Magpie {
// Creates an IPresentationFactory for the device through the CreatePresentationFactory
// export of dcomp.dll, which is resolved at run time (once, on the first call).
// Returns an empty com_ptr if the export is unavailable or the call fails; only the
// latter is logged here.
static winrt::com_ptr<IPresentationFactory> CreatePresentationFactory(ID3D11Device* d3dDevice) noexcept {
	winrt::com_ptr<IPresentationFactory> factory;

	static const auto createPresentationFactory =
		Win32Helper::LoadSystemFunction<decltype(::CreatePresentationFactory)>(
			L"dcomp.dll", "CreatePresentationFactory");

	if (createPresentationFactory) {
		const HRESULT createResult = createPresentationFactory(d3dDevice, IID_PPV_ARGS(&factory));
		if (FAILED(createResult)) {
			Logger::Get().ComError("CreatePresentationFactory 失败", createResult);
		}
	}

	return factory;
}
// Sets up composition-swapchain presentation for hwndAttach:
//  1. builds a DirectComposition device, a target for the window and a root visual,
//  2. creates a presentation manager and a presentation surface bound to a composition
//     surface handle, and makes that surface the content of the visual,
//  3. obtains the present-retiring fence and sizes the per-buffer containers.
// The presentation buffers themselves are not created here; BeginFrame creates them
// when it finds an uninitialized slot.
//
// Returns false (after logging) as soon as any step fails.
bool CompSwapchainPresenter::_Initialize(HWND hwndAttach) noexcept {
// Bail out when the OS version helper reports Windows 10.
if (Win32Helper::GetOSVersion().IsWin10()) {
Logger::Get().Error("OS 不支持 composition swapchain");
return false;
}
// The composition device is created without a rendering device (nullptr).
HRESULT hr = DCompositionCreateDevice3(nullptr, IID_PPV_ARGS(&_dcompDevice));
if (FAILED(hr)) {
Logger::Get().ComError("DCompositionCreateDevice3 失败", hr);
return false;
}
hr = _dcompDevice->CreateTargetForHwnd(hwndAttach, TRUE, _dcompTarget.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateTargetForHwnd 失败", hr);
return false;
}
hr = _dcompDevice->CreateVisual(_dcompVisual.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateVisual 失败", hr);
return false;
}
hr = _dcompTarget->SetRoot(_dcompVisual.get());
if (FAILED(hr)) {
Logger::Get().ComError("SetRoot 失败", hr);
return false;
}
// The presentation factory is tied to the D3D device used for rendering.
winrt::com_ptr<IPresentationFactory> presentationFactory =
CreatePresentationFactory(_deviceResources->GetD3DDevice());
if (!presentationFactory) {
Logger::Get().Error("CreatePresentationFactory 失败");
return false;
}
if (!presentationFactory->IsPresentationSupported()) {
Logger::Get().Error("此 D3D 设备不支持 composition swapchain");
return false;
}
// Missing independent-flip support is only logged; initialization continues.
if (!presentationFactory->IsPresentationSupportedWithIndependentFlip()) {
Logger::Get().Info("此 D3D 设备不支持 independent flip");
}
hr = presentationFactory->CreatePresentationManager(_presentationManager.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreatePresentationManager 失败", hr);
return false;
}
// The same composition surface handle is given to the presentation manager and to
// DirectComposition, which ties the presentation surface to the visual's content.
wil::unique_handle hCompSurface;
hr = DCompositionCreateSurfaceHandle(
COMPOSITIONOBJECT_ALL_ACCESS,
nullptr,
hCompSurface.put()
);
if (FAILED(hr)) {
Logger::Get().ComError("DCompositionCreateSurfaceHandle 失败", hr);
return false;
}
hr = _presentationManager->CreatePresentationSurface(
hCompSurface.get(), _presentationSurface.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreatePresentationSurface 失败", hr);
return false;
}
winrt::com_ptr<IUnknown> compSurface;
hr = _dcompDevice->CreateSurfaceFromHandle(hCompSurface.get(), compSurface.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateSurfaceFromHandle 失败", hr);
return false;
}
hr = _dcompVisual->SetContent(compSurface.get());
if (FAILED(hr)) {
Logger::Get().ComError("SetContent 失败", hr);
return false;
}
hr = _dcompDevice->Commit();
if (FAILED(hr)) {
Logger::Get().ComError("Commit 失败", hr);
return false;
}
hr = _presentationManager->GetPresentRetiringFence(IID_PPV_ARGS(&_presentationFence));
if (FAILED(hr)) {
Logger::Get().ComError("GetPresentRetiringFence 失败", hr);
return false;
}
// Size the per-buffer containers; their elements stay empty for now.
const uint32_t bufferCount = _CalcBufferCount();
_presentationBuffers.resize(bufferCount);
_presentationBufferAvailableEvents.resize(bufferCount);
_bufferTextures.resize(bufferCount);
_bufferRtvs.resize(bufferCount);
return true;
}
// Selects the presentation buffer the next frame is rendered into and hands its
// texture and render target view back to the caller.
//
// Buffers are created lazily: the first slot that has no presentation buffer yet
// is initialized and used. When every slot is already populated, the call blocks
// until one of the buffers' "available" events is signaled.
//
// frameTex/frameRtv receive the texture and RTV of the selected buffer;
// drawOffset is always reset to (0, 0). Returns false if any API call fails.
bool CompSwapchainPresenter::BeginFrame(
	winrt::com_ptr<ID3D11Texture2D>& frameTex,
	winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
	POINT& drawOffset
) noexcept {
	constexpr uint32_t NO_BUFFER = std::numeric_limits<uint32_t>::max();

	const uint32_t slotCount = (uint32_t)_presentationBuffers.size();
	uint32_t bufferIdx = NO_BUFFER;

	// Pass 1: look for a slot that has not been initialized yet
	for (uint32_t slot = 0; slot < slotCount && bufferIdx == NO_BUFFER; ++slot) {
		if (_presentationBuffers[slot]) {
			continue;
		}

		const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());

		D3D11_TEXTURE2D_DESC texDesc{};
		texDesc.Width = (UINT)rendererSize.cx;
		texDesc.Height = (UINT)rendererSize.cy;
		texDesc.MipLevels = 1;
		texDesc.ArraySize = 1;
		texDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
		texDesc.SampleDesc.Count = 1;
		texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET;
		texDesc.MiscFlags =
			D3D11_RESOURCE_MISC_SHARED |
			D3D11_RESOURCE_MISC_SHARED_NTHANDLE |
			D3D11_RESOURCE_MISC_SHARED_DISPLAYABLE;

		HRESULT hr = _deviceResources->GetD3DDevice()->CreateTexture2D(
			&texDesc, nullptr, _bufferTextures[slot].put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateTexture2D 失败", hr);
			return false;
		}

		hr = _presentationManager->AddBufferFromResource(
			_bufferTextures[slot].get(), _presentationBuffers[slot].put());
		if (FAILED(hr)) {
			Logger::Get().ComError("AddBufferFromResource 失败", hr);
			return false;
		}

		hr = _presentationBuffers[slot]->GetAvailableEvent(
			_presentationBufferAvailableEvents[slot].put());
		if (FAILED(hr)) {
			Logger::Get().ComError("GetAvailableEvent 失败", hr);
			return false;
		}

		// The whole buffer is the source rectangle
		RECT sourceRect{ 0,0,rendererSize.cx,rendererSize.cy };
		hr = _presentationSurface->SetSourceRect(&sourceRect);
		if (FAILED(hr)) {
			Logger::Get().ComError("SetSourceRect 失败", hr);
			return false;
		}

		bufferIdx = slot;
	}

	// Pass 2: every slot is populated, wait until one of them becomes free
	if (bufferIdx == NO_BUFFER) {
		const DWORD waitResult = WaitForMultipleObjects(
			slotCount, (HANDLE*)_presentationBufferAvailableEvents.data(), FALSE, INFINITE);

		const bool isBufferSignaled =
			waitResult >= WAIT_OBJECT_0 && waitResult <= WAIT_OBJECT_0 + slotCount - 1;
		if (!isBufferSignaled) {
			Logger::Get().Error("WaitForMultipleObjects 失败");
			return false;
		}

		bufferIdx = waitResult - WAIT_OBJECT_0;
	}

	HRESULT hr = _presentationSurface->SetBuffer(_presentationBuffers[bufferIdx].get());
	if (FAILED(hr)) {
		Logger::Get().ComError("SetBuffer 失败", hr);
		return false;
	}

	// The render target view is created on first use of the buffer
	winrt::com_ptr<ID3D11RenderTargetView>& rtv = _bufferRtvs[bufferIdx];
	if (!rtv) {
		hr = _deviceResources->GetD3DDevice()->CreateRenderTargetView(
			_bufferTextures[bufferIdx].get(), nullptr, rtv.put());
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateRenderTargetView 失败", hr);
			return false;
		}
	}

	frameTex = _bufferTextures[bufferIdx];
	frameRtv = rtv;
	drawOffset = {};
	return true;
}
// Presents the frame rendered since BeginFrame.
//
// When waitForGpu is set, or the presenter was resized since the last present,
// the call first waits for the GPU and for DWM to start composing a new frame.
void CompSwapchainPresenter::EndFrame(bool waitForGpu) noexcept {
	const bool syncBeforePresent = waitForGpu || _isResized;
	if (syncBeforePresent) {
		// These two waits reduce edge flicker while the window is being resized,
		// see AdaptivePresenter::EndFrame.
		// Wait for rendering to finish
		_WaitForGpu();
		// Wait for DWM to start composing a new frame
		Win32Helper::WaitForDwmComposition();
	}

	_presentationManager->Present();

	if (!_isResized) {
		// Make sure the previous frame has finished rendering before the next one
		// starts; this lowers both GPU load and latency.
		_WaitForGpu();
		return;
	}

	_isResized = false;
}
bool CompSwapchainPresenter::OnResize() noexcept {
_isResized = true;
// 缓冲区在 BeginFrame 中按需创建
std::fill(_presentationBuffers.begin(), _presentationBuffers.end(), nullptr);
std::fill(_presentationBufferAvailableEvents.begin(),
_presentationBufferAvailableEvents.end(), nullptr);
std::fill(_bufferTextures.begin(), _bufferTextures.end(), nullptr);
std::fill(_bufferRtvs.begin(), _bufferRtvs.end(), nullptr);
return true;
}
}

View file

@ -1,41 +0,0 @@
#pragma once
#include "PresenterBase.h"
#include <dcomp.h>
#include <Presentation.h>
namespace Magpie {
// Presenter built on the composition swapchain API (IPresentationManager) with
// DirectComposition used to attach the presentation surface to a window.
class CompSwapchainPresenter final : public PresenterBase {
protected:
// Creates the DirectComposition target/visual for hwndAttach, the presentation
// manager and the presentation surface. Returns false on failure.
bool _Initialize(HWND hwndAttach) noexcept override;
public:
// Picks (or lazily creates) a free presentation buffer and returns its texture
// and render target view. drawOffset is always set to (0, 0).
bool BeginFrame(
winrt::com_ptr<ID3D11Texture2D>& frameTex,
winrt::com_ptr<ID3D11RenderTargetView>& frameRtv,
POINT& drawOffset
) noexcept override;
// Presents the current frame. waitForGpu forces a GPU/DWM sync before presenting.
void EndFrame(bool waitForGpu = false) noexcept override;
// Releases all buffers; they are recreated on demand in BeginFrame.
bool OnResize() noexcept override;
private:
winrt::com_ptr<IDCompositionDesktopDevice> _dcompDevice;
winrt::com_ptr<IDCompositionTarget> _dcompTarget;
winrt::com_ptr<IDCompositionVisual2> _dcompVisual;
// NOTE(review): not referenced by the visible part of the implementation -- confirm it is still needed
winrt::com_ptr<IDCompositionSurface> _dcompSurface;
winrt::com_ptr<IPresentationManager> _presentationManager;
winrt::com_ptr<IPresentationSurface> _presentationSurface;
// Obtained from IPresentationManager::GetPresentRetiringFence
winrt::com_ptr<ID3D11Fence> _presentationFence;
// The four vectors below are parallel arrays indexed by buffer slot
std::vector<winrt::com_ptr<IPresentationBuffer>> _presentationBuffers;
std::vector<wil::unique_event_nothrow> _presentationBufferAvailableEvents;
std::vector<winrt::com_ptr<ID3D11Texture2D>> _bufferTextures;
std::vector<winrt::com_ptr<ID3D11RenderTargetView>> _bufferRtvs;
// Set by OnResize and cleared by the next EndFrame
bool _isResized = false;
};
}

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,14 @@
#pragma once
#include "ByteBuffer.h"
#include "SmallVector.h"
#include <parallel_hashmap/phmap.h>
#include <wil/registry.h>
namespace Magpie {
class DeviceResources;
class D3D12Context;
class GraphicsContext;
class DescriptorHeap;
class CursorDrawer {
public:
@ -11,75 +16,225 @@ public:
CursorDrawer(const CursorDrawer&) = delete;
CursorDrawer(CursorDrawer&&) = delete;
bool Initialize(DeviceResources& deviceResources) noexcept;
~CursorDrawer() noexcept;
void Draw(ID3D11Texture2D* backBuffer, POINT drawOffset) noexcept;
bool Initialize(
D3D12Context& d3d12Context,
const RECT& srcRect,
const RECT& rendererRect,
const RECT& destRect,
const ColorInfo& colorInfo
) noexcept;
void IsCursorVisible(bool value) noexcept {
_isCursorVisible = value;
void PrepareForDraw(HCURSOR hCursor, POINT cursorPos, bool& needRedraw) noexcept;
// backBuffer 不为空表示掩码光标在叠加层上
HRESULT Draw(
GraphicsContext& graphicsContext,
uint64_t frameFenceValue,
uint64_t completedFenceValue,
uint32_t curFrameSrvOffset,
ID3D12Resource* backBuffer = nullptr
) noexcept;
void OnCursorVirtualizationChanged(bool value) noexcept {
_isCursorVirtualized = value;
}
bool IsCursorVisible() const noexcept {
return _isCursorVisible;
void OnMovingChanged(bool value) noexcept {
_isMoving = value;
}
bool NeedRedraw() const noexcept;
void OnMoved(const RECT& rendererRect, const RECT& destRect) noexcept;
void OnResized(const RECT& rendererRect, const RECT& destRect) noexcept;
void OnSrcMovingChanged(bool value) noexcept {
_isSrcMoving = value;
}
void OnColorInfoChanged(const ColorInfo& colorInfo) noexcept;
private:
std::pair<HCURSOR, POINT> _GetCursorState(bool& isActive) const noexcept;
// SDR 色域下使用 sRGB 空间,否则使用线性 RGB 空间。截至 Win11 25H2Windows 在 WCG
// 和 HDR 下光标的色域和透明度经常变化,没有统一标准。
enum class _CursorType {
// 彩色光标,此时纹理中 RGB 通道已预乘 A 通道premultiplied alphaA 通道已预先取反
// 这是为了减少着色器的计算量以及确保(可能进行的)双线性差值的准确性
// 计算公式: FinalColor = ScreenColor * CursorColor.a + CursorColor
// 纹理格式: DXGI_FORMAT_R8G8B8A8_UNORM
// 彩色光标
// 纹理格式: DXGI_FORMAT_R16G16B16A16_FLOAT
// 计算公式: FinalColor = CursorColor.rgb + ScreenColor * CursorColor.a
// 纹理中 RGB 通道已预乘 A 通道 (premultiplied alpha)A 通道已预先取反,这是为了
// 减少着色器的计算量以及确保 (可能进行的) 双线性插值的准确性。
Color = 0,
// 彩色掩码光标,此时 A 通道可能为 0 或 255
// 为 0 时表示 RGB 通道取代屏幕颜色,为 255 时表示 RGB 通道和屏幕颜色进行异或操作
// 单色光标
// 纹理格式: DXGI_FORMAT_R8_UINT
// 高四位为 AND 掩码,低四位为 XOR 掩码,值只能是 0 或 0xf。
Monochrome,
// 彩色掩码光标
// 纹理格式: DXGI_FORMAT_R8G8B8A8_UNORM
MaskedColor,
// 单色光标,此时 R 通道为 AND 掩码G 通道为 XOR 掩码,其他通道不使用
// RG 通道的值只能是 0 或 255
// 纹理格式: DXGI_FORMAT_R8G8_UNORM
Monochrome
// A 通道只能是 0 或 255。为 0 时用 RGB 通道取代屏幕颜色,为 255 时将 RGB 通道和
// 屏幕颜色进行异或操作。
MaskedColor
};
// Key of the cursor cache: a cursor handle together with the DPI it was resolved for.
struct _CursorInfoKey {
HCURSOR hCursor;
// A DPI of 0 means this cursor does not scale with DPI
uint32_t dpi;
bool operator==(const _CursorInfoKey&) const = default;
// Used by phmap: combines the hash of the handle with the DPI
friend size_t hash_value(const _CursorInfoKey& key) noexcept {
return phmap::HashState().combine(phmap::Hash<HCURSOR>()(key.hCursor), key.dpi);
}
};
// Data of a single cursor frame (a static cursor has exactly one frame).
struct _CursorFrame {
_CursorType type;
PointU hotspot;
winrt::com_ptr<ID3D12Resource> texture;
// NOTE(review): resSize/resTextureData presumably describe the pixel data as
// extracted from the cursor resource, before it is uploaded -- confirm
SizeU resSize;
ByteBuffer resTextureData;
// These two resources are released in _ClearRetiredResources once they are no
// longer in use
winrt::com_ptr<ID3D12Resource> uploadBuffer;
winrt::com_ptr<ID3D12Resource> resTexture;
// NOTE(review): presumably the fence value after which the two temporary
// resources above may be released -- confirm
uint64_t tempResourcesFenceValue = 0;
// Descriptor offsets; uint32_t max is the default "unset" value
uint32_t textureSrvOffset = std::numeric_limits<uint32_t>::max();
uint32_t textureRtvOffset = std::numeric_limits<uint32_t>::max();
uint32_t resTextureSrvOffset = std::numeric_limits<uint32_t>::max();
};
struct _CursorInfo {
POINT hotSpot{};
SIZE size{};
winrt::com_ptr<ID3D11ShaderResourceView> textureSrv = nullptr;
_CursorType type = _CursorType::Color;
SizeU size;
SmallVector<_CursorFrame, 1> frames;
// 序列表 (帧索引值数组),使多帧可以复用同一个 _CursorFrame。为空表示顺序播放
SmallVector<std::pair<uint32_t, std::chrono::nanoseconds>, 0> frameSequence;
uint64_t lastUseFenceValue = 0;
bool IsAnimated() const noexcept {
return !frameSequence.empty();
}
uint32_t GetFrameIdx(uint32_t seqIdx) const noexcept {
return IsAnimated() ? frameSequence[seqIdx].first : 0;
}
void FreeDescriptors(
DescriptorHeap& csuDescriptorHeap,
DescriptorHeap& rtvDescriptorHeap
) const noexcept;
};
const _CursorInfo* _ResolveCursor(HCURSOR hCursor) noexcept;
std::pair<const _CursorInfoKey, _CursorInfo>* _ResolveCursor(
HCURSOR hCursor,
POINT cursorPos
) noexcept;
bool _SetPremultipliedAlphaBlend() noexcept;
SizeU _CalcCursorSize(
SizeU cursorBmpSize,
uint32_t cursorDpi,
uint32_t monitorDpi,
bool isCursorDpiAware
) const noexcept;
DeviceResources* _deviceResources = nullptr;
void _TryResolveCursorFramesFromSource(
HCURSOR hCursor,
const ICONINFOEX& iconInfoEx,
uint32_t preferedWidth,
SmallVectorImpl<wil::unique_hcursor>& frames,
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) const noexcept;
phmap::flat_hash_map<HCURSOR, _CursorInfo> _cursorInfos;
bool _ResolveCursorFramePixels(
_CursorFrame& cursorFrame,
HBITMAP hColorBmp,
HBITMAP hMaskBmp
) const noexcept;
winrt::com_ptr<ID3D11VertexShader> _simpleVS;
winrt::com_ptr<ID3D11InputLayout> _simpleIL;
winrt::com_ptr<ID3D11Buffer> _vtxBuffer;
winrt::com_ptr<ID3D11PixelShader> _simplePS;
winrt::com_ptr<ID3D11BlendState> premultipliedAlphaBlendBlendState;
winrt::com_ptr<ID3D11PixelShader> _maskedCursorPS;
winrt::com_ptr<ID3D11PixelShader> _monochromeCursorPS;
HRESULT _InitializeCursorTexture(
GraphicsContext& graphicsContext,
_CursorInfo& cursorInfo,
uint32_t cursorFrameIdx,
uint64_t completedFenceValue
) noexcept;
// 用于渲染彩色掩码光标和单色光标的临时纹理
winrt::com_ptr<ID3D11Texture2D> _tempCursorTexture;
winrt::com_ptr<ID3D11ShaderResourceView> _tempCursorTextureRtv;
SIZE _tempCursorTextureSize{};
// 只能在同步 GPU 后调用
void _ClearCursorInfos() noexcept;
HRESULT _CreateColorPSO(bool isSrgb, winrt::com_ptr<ID3D12PipelineState>& result) noexcept;
HRESULT _CreateMaskPSO(
bool isMonochrome,
bool isSrgb,
winrt::com_ptr<ID3D12PipelineState>& result
) noexcept;
HRESULT _CreateCursorResizerPSO() noexcept;
void _ClearRetiredResources(uint64_t completedFenceValue) noexcept;
void _OnCursorsRegChanged(wil::RegistryChangeKind) noexcept;
D3D12Context* _d3d12Context = nullptr;
SizeU _srcSize{};
RECT _rendererRect{};
RECT _destRect{};
ColorInfo _colorInfo;
// 监控“指针大小”选项变化
wil::unique_registry_watcher_nothrow _regWatcher;
DWORD _cursorBaseSize = 32;
phmap::flat_hash_map<_CursorInfoKey, _CursorInfo> _cursorInfos;
// 保存临时资源未被释放的 _CursorInfo。保存键而不是指针以防 _cursorInfos 扩容后失效
SmallVector<_CursorInfoKey, 1> _cursorInfosWithTempResources;
// 保存 _cursorBaseSize 改变后失效的 _CursorInfo
SmallVector<_CursorInfo, 0> _retiredCursorInfos;
// 保存解析失败的光标以避免重复尝试
phmap::flat_hash_set<HCURSOR> _unresolvableCursors;
// 这两个成员用于检查自动隐藏光标
HCURSOR _lastRawCursorHandle = NULL;
std::chrono::steady_clock::time_point _lastCursorActiveTime;
// 上次绘制的光标形状和位置
HCURSOR _lastCursorHandle = NULL;
POINT _lastCursorPos{ std::numeric_limits<LONG>::max(), std::numeric_limits<LONG>::max() };
std::pair<const _CursorInfoKey, _CursorInfo>* _curCursorInfoKeyValue = nullptr;
POINT _curCursorPos{ std::numeric_limits<LONG>::max(), std::numeric_limits<LONG>::max() };
// 这两个成员用于保存动态光标状态
uint32_t _curFrameSeqIdx = 0;
std::chrono::steady_clock::time_point _curFrameSeqEndTime;
// 用于从渲染目标复制光标下区域
winrt::com_ptr<ID3D12Resource> _tempOriginTexture;
SizeU _tempOriginTextureSize{};
uint32_t _tempOriginTextureSrvOffset = std::numeric_limits<uint32_t>::max();
// A replaced temporary origin texture kept until it can be released.
// NOTE(review): fenceValue is presumably the fence value that must complete
// before the texture and its SRV slot can be freed -- confirm
struct _RetiredTempOriginTexture {
winrt::com_ptr<ID3D12Resource> texture;
uint64_t fenceValue;
uint32_t srvOffset;
};
SmallVector<_RetiredTempOriginTexture, 1> _retiredTempOriginTextures;
winrt::com_ptr<ID3D12RootSignature> _colorRootSignature;
winrt::com_ptr<ID3D12PipelineState> _colorPSO;
winrt::com_ptr<ID3D12PipelineState> _colorSrgbPSO;
winrt::com_ptr<ID3D12RootSignature> _maskRootSignature;
winrt::com_ptr<ID3D12PipelineState> _monochromePSO;
winrt::com_ptr<ID3D12PipelineState> _monochromeSrgbPSO;
winrt::com_ptr<ID3D12PipelineState> _maskedColorPSO;
winrt::com_ptr<ID3D12PipelineState> _maskedColorSrgbPSO;
winrt::com_ptr<ID3D12RootSignature> _cursorResizerRootSignature;
winrt::com_ptr<ID3D12PipelineState> _cursorResizerPSO;
bool _isCursorVisible = true;
bool _isMoving = false;
bool _isCursorVirtualized = false;
bool _isSrcMoving = false;
};
}

View file

@ -0,0 +1,666 @@
#include "pch.h"
#include "CursorHelper.h"
#include "ByteBuffer.h"
#include "Logger.h"
#include "SmallVector.h"
#include "Win32Helper.h"
#include <mmsystem.h> // FOURCC
namespace Magpie {
// RIFF chunk header: a FOURCC identifier followed by the size of the chunk data.
// The size excludes this header; the parser rounds it up to an even number when
// stepping to the next chunk.
struct RTAG {
DWORD ckID;
DWORD ckSize;
};
// Converts a width/height stored in an icon/cursor directory entry to the real
// size in pixels: a stored value of 0 stands for 256.
static WORD GetRealIconSize(WORD size) noexcept {
	if (size != 0) {
		return size;
	}

	return (WORD)256;
}
// Extracts a cursor from an RT_GROUP_CURSOR resource of a module.
//
// hModule:        module that contains the resource
// resName:        name (or MAKEINTRESOURCE id) of the group cursor resource
// preferredWidth: desired cursor width in pixels. The smallest entry that is at
//                 least this wide is chosen; if every entry is narrower, the widest
//                 one is used. Among entries of equal width the one with the
//                 highest bit count wins.
//
// Returns the created cursor, or an empty handle on failure (the error is logged).
wil::unique_hcursor CursorHelper::ExtractCursorFromModule(
	HMODULE hModule,
	LPCWSTR resName,
	uint32_t preferredWidth
) noexcept {
	HRSRC hRes = FindResource(hModule, resName, RT_GROUP_CURSOR);
	if (!hRes) {
		Logger::Get().Win32Error("FindResource 失败");
		return nullptr;
	}

	HGLOBAL hResLoad = LoadResource(hModule, hRes);
	if (!hResLoad) {
		Logger::Get().Win32Error("LoadResource 失败");
		return nullptr;
	}

	// Parse the group cursor resource
#pragma pack(push, 2)
	// From https://learn.microsoft.com/en-us/windows/win32/menurc/resdir
	struct RESDIR {
		WORD Width;
		WORD Height;
		WORD Planes;
		WORD BitCount;
		DWORD BytesInRes;
		WORD IconCursorId;
	};

	// From https://learn.microsoft.com/en-us/windows/win32/menurc/newheader
	struct NEWHEADER {
		WORD Reserved;
		WORD ResType;
		WORD ResCount;
		RESDIR entries[1];
	};
#pragma pack(pop)

	// Size of the fixed part of NEWHEADER, i.e. everything before the entries
	constexpr size_t HEADER_SIZE = sizeof(NEWHEADER) - sizeof(RESDIR);

	const NEWHEADER* pHeader = (const NEWHEADER*)LockResource(hResLoad);
	const size_t groupResSize = SizeofResource(hModule, hRes);
	// Never read the header of a resource that is missing or too small to hold it
	if (!pHeader || groupResSize < HEADER_SIZE) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	const NEWHEADER& header = *pHeader;
	if (header.Reserved != 0 || header.ResType != 2) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	const uint32_t resCount = header.ResCount;
	if (resCount == 0 || resCount > 256) {
		Logger::Get().Error("无可用光标资源");
		return nullptr;
	}

	// The declared entries must fit into the resource, otherwise the loop below
	// would read past its end
	if (groupResSize - HEADER_SIZE < sizeof(RESDIR) * resCount) {
		Logger::Get().Error("不是光标资源");
		return nullptr;
	}

	struct IconInfo {
		WORD width;
		WORD bitCount;
		WORD id;
	};
	SmallVector<IconInfo, 0> iconInfos(resCount);

	for (uint32_t i = 0; i < resCount; ++i) {
		const RESDIR& entry = header.entries[i];
		// A width of 0 is equivalent to 256
		iconInfos[i] = IconInfo{
			GetRealIconSize(entry.Width),
			entry.BitCount,
			entry.IconCursorId
		};
	}

	// Sort by width ascending; for equal widths sort by bit count descending so
	// that the entry with the highest color depth is found first
	std::sort(iconInfos.begin(), iconInfos.end(), [](const IconInfo& l, const IconInfo& r) {
		return l.width < r.width || (l.width == r.width && l.bitCount > r.bitCount);
	});

	// Look for a perfect match or the next larger entry
	WORD targetResId;
	{
		auto it = std::lower_bound(
			iconInfos.begin(),
			iconInfos.end(),
			preferredWidth,
			[](const IconInfo& iconInfo, uint32_t target) {
				return iconInfo.width < target;
			}
		);
		if (it == iconInfos.end()) {
			targetResId = iconInfos.back().id;
		} else {
			targetResId = it->id;
		}
	}

	hRes = FindResource(hModule, MAKEINTRESOURCE(targetResId), RT_CURSOR);
	if (!hRes) {
		Logger::Get().Win32Error("FindResource 失败");
		return nullptr;
	}

	hResLoad = LoadResource(hModule, hRes);
	if (!hResLoad) {
		Logger::Get().Win32Error("LoadResource 失败");
		return nullptr;
	}

	LPVOID cursorRes = LockResource(hResLoad);
	if (!cursorRes) {
		Logger::Get().Error("LockResource 失败");
		return nullptr;
	}

	HICON hIcon = CreateIconFromResourceEx((PBYTE)cursorRes,
		SizeofResource(hModule, hRes), FALSE, 0x30000, 0, 0, LR_DEFAULTCOLOR);
	if (!hIcon) {
		Logger::Get().Win32Error("CreateIconFromResourceEx 失败");
		return nullptr;
	}

	return wil::unique_hcursor(hIcon);
}
// Layout of a CUR file, see https://en.wikipedia.org/wiki/ICO_(file_format)#File_structure
// [ICONDIR]
// [ICONDIRENTRY 1]
// [ICONDIRENTRY 2]
// ...
// [bitmap 1]
// [bitmap 2]
// ...
//
// Creates a cursor from CUR data located in [fileData, fileEnd).
//
// preferredWidth: desired cursor width in pixels. The smallest image that is at
//                 least this wide is chosen; if every image is narrower, the widest
//                 one is used.
//
// Every offset and size read from the file is validated against the size of the
// buffer before it is used. Returns an empty handle on failure (the error is logged).
static wil::unique_hcursor LoadIcoFromFileMap(
	const uint8_t* fileData,
	const uint8_t* fileEnd,
	uint32_t preferredWidth
) noexcept {
#pragma pack(push, 2)
	struct ICONDIR {
		WORD idReserved;
		WORD idType;
		WORD idCount;
	};

	struct ICONDIRENTRY {
		BYTE bWidth;
		BYTE bHeight;
		BYTE bColorCount;
		BYTE bReserved;
		WORD xHotSpot;
		WORD yHotSpot;
		DWORD dwBytesInRes;
		DWORD dwImageOffset;
	};

	struct LOCALHEADER {
		WORD xHotSpot;
		WORD yHotSpot;
	};
#pragma pack(pop)

	// All bounds checks compare sizes instead of forming pointers that may lie
	// outside the buffer (which would be undefined behavior and can wrap around)
	if (fileEnd < fileData) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}
	const size_t dataSize = (size_t)(fileEnd - fileData);

	if (dataSize < sizeof(ICONDIR)) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}

	uint32_t entryCount;
	{
		const ICONDIR& header = *(ICONDIR*)fileData;
		if (header.idReserved != 0 || header.idType != 2) {
			Logger::Get().Error("不是光标资源");
			return nullptr;
		}

		if (header.idCount == 0 || header.idCount > 256) {
			Logger::Get().Error("无可用光标资源");
			return nullptr;
		}

		entryCount = header.idCount;
	}

	const ICONDIRENTRY* targetEntry;
	{
		const ICONDIRENTRY* pEntries = (const ICONDIRENTRY*)(fileData + sizeof(ICONDIR));
		if (dataSize - sizeof(ICONDIR) < sizeof(ICONDIRENTRY) * entryCount) {
			Logger::Get().Error("文件无效");
			return nullptr;
		}

		// Look for a perfect match or the next larger image
		std::vector<const ICONDIRENTRY*> entries(entryCount);
		for (uint32_t i = 0; i < entryCount; ++i) {
			entries[i] = &pEntries[i];
		}

		// Sort by width ascending. Unlike module resources, cur files do not
		// distinguish color depths.
		std::sort(entries.begin(), entries.end(), [](const ICONDIRENTRY* l, const ICONDIRENTRY* r) {
			return GetRealIconSize(l->bWidth) < GetRealIconSize(r->bWidth);
		});

		auto it = std::lower_bound(
			entries.begin(),
			entries.end(),
			preferredWidth,
			[](const ICONDIRENTRY* entry, uint32_t target) {
				return GetRealIconSize(entry->bWidth) < target;
			}
		);
		if (it == entries.end()) {
			targetEntry = entries.back();
		} else {
			targetEntry = *it;
		}
	}

	const size_t imageOffset = targetEntry->dwImageOffset;
	const size_t imageSize = targetEntry->dwBytesInRes;
	if (imageOffset > dataSize || imageSize > dataSize - imageOffset) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}

	// The size handed to CreateIconFromResourceEx is a DWORD, make sure adding the
	// header cannot overflow it
	if (imageSize > std::numeric_limits<DWORD>::max() - sizeof(LOCALHEADER)) {
		Logger::Get().Error("文件无效");
		return nullptr;
	}

	const uint8_t* pCursorData = fileData + imageOffset;
	const DWORD cursorDataSize = (DWORD)(sizeof(LOCALHEADER) + imageSize);

	// An RT_CURSOR resource is a LOCALHEADER followed by the bitmap data
	// https://learn.microsoft.com/en-us/windows/win32/menurc/resource-file-formats#cursor-and-icon-resources
	ByteBuffer cursorData(cursorDataSize);
	// Set the hotspot
	*(LOCALHEADER*)cursorData.Data() = { targetEntry->xHotSpot, targetEntry->yHotSpot };
	// Copy the bitmap data
	std::memcpy(cursorData.Data() + sizeof(LOCALHEADER), pCursorData, imageSize);

	wil::unique_hcursor hCursor(CreateIconFromResourceEx(cursorData.Data(),
		cursorDataSize, FALSE, 0x30000, 0, 0, LR_DEFAULTCOLOR));
	if (!hCursor) {
		Logger::Get().Win32Error("CreateIconFromResourceEx 失败");
		return nullptr;
	}

	return hCursor;
}
// Converts a display rate given in jiffies (1/60 of a second) to a duration.
// The result is truncated to whole nanoseconds.
static std::chrono::nanoseconds JifRateToDuration(uint32_t jifRate) noexcept {
	constexpr int JIFFIES_PER_SECOND = 60;

	// Treat the jiffy count as seconds first, then scale down by 60
	const std::chrono::seconds asSeconds{ jifRate };
	return std::chrono::duration_cast<std::chrono::nanoseconds>(asSeconds) / JIFFIES_PER_SECOND;
}
// RIFF format: see https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// Layout of an ANI file, from https://en.wikipedia.org/wiki/ANI_(file_format)
// RIFF('ACON'
//     [LIST('INFO'
//         [INAM(<ZSTR>)]       // title (optional)
//         [IART(<ZSTR>)]       // author (optional)
//     )]
//     'anih'(<ANIHEADER>)      // ANI header
//     ['rate'(<DWORD...>)]     // rate table (array of jiffies). Has ANIHEADER.cSteps
//                              // entries if AF_SEQUENCE is set, ANIHEADER.cFrames otherwise.
//     ['seq '(<DWORD...>)]     // sequence table (array of frame indices). Should exist
//                              // when AF_SEQUENCE is set; has ANIHEADER.cSteps entries.
//     LIST('fram'              // frame list, ANIHEADER.cFrames entries
//         'icon'(<icon_data_1>)    // frame 1
//         'icon'(<icon_data_2>)    // frame 2
//         ...
//     )
// )
//
// Parses ANI data located in [fileData, fileEnd). The caller has already verified
// that the data starts with a RIFF chunk header.
//
// preferredWidth: forwarded to LoadIcoFromFileMap for every frame
// frames:         receives the cursor of every frame that is referenced
// frameSequence:  receives (frame index, duration) pairs. Left empty when the
//                 cursor ends up with a single frame, i.e. is not animated.
//
// Returns false if the file is malformed; frames and frameSequence may then hold
// partial results.
static bool LoadAniFromFileMap(
	const uint8_t* fileData,
	const uint8_t* fileEnd,
	uint32_t preferredWidth,
	SmallVectorImpl<wil::unique_hcursor>& frames,
	SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) {
#pragma pack(push, 2)
	struct ANIHEADER {
		DWORD cbSizeof;
		DWORD cFrames;				// Number of entries in the frame list
		DWORD cSteps;				// Number of entries in the sequence table
		DWORD cx, cy;				// Unused
		DWORD cBitCount, cPlanes;	// Unused
		DWORD jifRate;				// Default display rate in jiffies (1/60s)
		DWORD fl;					// AF_ICON is mandatory, AF_SEQUENCE is optional
	};
#pragma pack(pop)

	constexpr DWORD FOURCC_ACON = mmioFOURCC('A', 'C', 'O', 'N');
	constexpr DWORD FOURCC_anih = mmioFOURCC('a', 'n', 'i', 'h');
	constexpr DWORD FOURCC_rate = mmioFOURCC('r', 'a', 't', 'e');
	constexpr DWORD FOURCC_seq = mmioFOURCC('s', 'e', 'q', ' ');
	constexpr DWORD FOURCC_fram = mmioFOURCC('f', 'r', 'a', 'm');
	constexpr DWORD FOURCC_icon = mmioFOURCC('i', 'c', 'o', 'n');

	constexpr DWORD AF_ICON = 0x1;
	constexpr DWORD AF_SEQUENCE = 0x2;

	// The RIFF header has already been checked
	fileData += sizeof(RTAG);

	if (fileData + sizeof(uint32_t) > fileEnd) {
		Logger::Get().Error("文件无效");
		return false;
	}

	if (*(uint32_t*)fileData != FOURCC_ACON) {
		Logger::Get().Error("文件无效");
		return false;
	}
	fileData += sizeof(uint32_t);

	// cbSizeof stays 0 until a valid anih chunk has been parsed
	ANIHEADER aniHeader{};
	uint32_t curFrameIdx = 0;

	while (fileData + sizeof(RTAG) < fileEnd) {
		RTAG tag = *(RTAG*)fileData;
		fileData += sizeof(RTAG);

		// Chunks are padded to an even size
		const uint8_t* chunkEnd = fileData + ((tag.ckSize + 1) & ~1);
		if (chunkEnd > fileEnd) {
			Logger::Get().Error("文件无效");
			return false;
		}

		// Not sure whether it is mandatory, but in the Windows implementation the
		// anih chunk has to precede the fram, rate and seq chunks. We behave like
		// the system, which also simplifies the code.
		switch (tag.ckID) {
		case FOURCC_anih:
		{
			// Only the first anih chunk is honored. Re-parsing the header would
			// resize frames/frameSequence while curFrameIdx may already point
			// past the new size, leading to out-of-bounds writes below.
			if (aniHeader.cbSizeof != 0) {
				break;
			}

			if (fileData + sizeof(ANIHEADER) > chunkEnd) {
				Logger::Get().Error("文件无效");
				return false;
			}

			aniHeader = *(ANIHEADER*)fileData;

			if (aniHeader.cbSizeof != sizeof(ANIHEADER) ||
				aniHeader.cFrames == 0 ||
				((aniHeader.fl & AF_SEQUENCE) && aniHeader.cSteps == 0) ||
				!(aniHeader.fl & AF_ICON))
			{
				Logger::Get().Error("文件无效");
				return false;
			}

			// Reject counts the rest of the file cannot possibly satisfy before
			// allocating for them: every frame needs at least an 'icon' chunk
			// header and every sequence step a DWORD in the 'seq ' chunk, all of
			// which must follow this chunk. Such files would fail later anyway.
			const size_t bytesAfterHeader = (size_t)(fileEnd - chunkEnd);
			if (aniHeader.cFrames > bytesAfterHeader / sizeof(RTAG) ||
				((aniHeader.fl & AF_SEQUENCE) && aniHeader.cSteps > bytesAfterHeader / sizeof(uint32_t)))
			{
				Logger::Get().Error("文件无效");
				return false;
			}

			frames.resize(aniHeader.cFrames);

			// A single frame is not an animated cursor
			if (aniHeader.cFrames > 1) {
				if (aniHeader.fl & AF_SEQUENCE) {
					frameSequence.resize(aniHeader.cSteps);
					// Sentinel used to detect a missing seq chunk
					frameSequence[0].first = std::numeric_limits<uint32_t>::max();
				} else {
					frameSequence.resize(aniHeader.cFrames);
					// Play the frames one after another
					for (uint32_t i = 0; i < frameSequence.size(); ++i) {
						frameSequence[i].first = i;
					}
				}

				// Default rate, may be overridden by a rate chunk
				for (auto& pair : frameSequence) {
					pair.second = JifRateToDuration(aniHeader.jifRate);
				}
			}

			break;
		}
		case FOURCC_LIST:
		{
			if (fileData + sizeof(uint32_t) > chunkEnd) {
				Logger::Get().Error("文件无效");
				return false;
			}

			// Skip LIST chunks other than fram
			if (*(uint32_t*)fileData != FOURCC_fram) {
				break;
			}
			fileData += sizeof(uint32_t);

			// Make sure the anih chunk has been parsed
			if (aniHeader.cbSizeof == 0) {
				Logger::Get().Error("文件无效");
				return false;
			}

			// All frames have been extracted already, ignore the rest
			if (curFrameIdx == aniHeader.cFrames) {
				break;
			}

			while (fileData + sizeof(RTAG) < chunkEnd) {
				tag = *(RTAG*)fileData;
				fileData += sizeof(RTAG);

				const uint8_t* subChunkEnd = fileData + ((tag.ckSize + 1) & ~1);
				if (subChunkEnd > chunkEnd) {
					Logger::Get().Error("文件无效");
					return false;
				}

				if (tag.ckID == FOURCC_icon) {
					wil::unique_hcursor hCursor = LoadIcoFromFileMap(fileData, subChunkEnd, preferredWidth);
					if (hCursor) {
						frames[curFrameIdx++] = std::move(hCursor);
					} else {
						Logger::Get().Error("LoadIcoFromFileMap 失败");
						return false;
					}

					if (curFrameIdx == aniHeader.cFrames) {
						break;
					}
				}

				fileData = subChunkEnd;
			}

			break;
		}
		case FOURCC_rate:
		{
			// Make sure the anih chunk has been parsed
			if (aniHeader.cbSizeof == 0) {
				Logger::Get().Error("文件无效");
				return false;
			}

			// The rate chunk is ignored when there is only one frame
			if (frameSequence.empty()) {
				break;
			}

			if (fileData + sizeof(uint32_t) * frameSequence.size() > chunkEnd) {
				Logger::Get().Error("文件无效");
				return false;
			}

			for (auto& pair : frameSequence) {
				pair.second = JifRateToDuration(*(uint32_t*)fileData);
				fileData += sizeof(uint32_t);
			}

			break;
		}
		case FOURCC_seq:
		{
			// Make sure the anih chunk has been parsed
			if (aniHeader.cbSizeof == 0) {
				Logger::Get().Error("文件无效");
				return false;
			}

			// The seq chunk is ignored without AF_SEQUENCE or with a single frame
			if (!(aniHeader.fl & AF_SEQUENCE) || frameSequence.empty()) {
				break;
			}

			if (fileData + sizeof(uint32_t) * aniHeader.cSteps > chunkEnd) {
				Logger::Get().Error("文件无效");
				return false;
			}

			for (auto& pair : frameSequence) {
				pair.first = *(uint32_t*)fileData;
				fileData += sizeof(uint32_t);
			}

			break;
		}
		}

		fileData = chunkEnd;
	}

	// Make sure every frame has been extracted
	if (frames.empty() || curFrameIdx != aniHeader.cFrames) {
		Logger::Get().Error("文件无效");
		return false;
	}

	// frameSequence is empty when there is only one frame
	if (frameSequence.empty()) {
		return true;
	}

	for (const auto& pair : frameSequence) {
		// Durations must not be 0
		if (pair.second.count() == 0) {
			Logger::Get().Error("文件无效");
			return false;
		}
	}

	if (aniHeader.fl & AF_SEQUENCE) {
		std::vector<bool> frameInUse(aniHeader.cFrames);

		for (const auto& pair : frameSequence) {
			// Validate the sequence. This also catches a missing seq chunk
			// because of the sentinel stored in the first element.
			if (pair.first >= aniHeader.cFrames) {
				Logger::Get().Error("文件无效");
				return false;
			}

			frameInUse[pair.first] = true;
		}

		// Remove the frames that are never referenced. cFrames is bounded by the
		// file size above, so it fits into an int.
		for (int i = (int)aniHeader.cFrames - 1; i >= 0; --i) {
			if (frameInUse[i]) {
				continue;
			}

			frames.erase(frames.begin() + i);

			// Fix up the indices after a frame has been removed
			for (auto& pair : frameSequence) {
				if (pair.first > (uint32_t)i) {
					--pair.first;
				}
			}
		}

		// With a single frame left this is not an animated cursor
		if (frames.size() == 1) {
			frameSequence.clear();
		}
	}

	return true;
}
bool CursorHelper::ExtractCursorFramesFromFile(
const wchar_t* fileName,
uint32_t preferredWidth,
SmallVectorImpl<wil::unique_hcursor>& frames,
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) noexcept {
assert(frames.empty() && frameSequence.empty());
CREATEFILE2_EXTENDED_PARAMETERS extendedParams{
.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS),
.dwFileAttributes = FILE_ATTRIBUTE_NORMAL,
.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN
};
wil::unique_hfile hFile(CreateFile2(
fileName, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams));
if (!hFile) {
Logger::Get().Win32Error("CreateFile2 失败");
return false;
}
const DWORD fileSize = GetFileSize(hFile.get(), nullptr);
// 这个检查确保可以访问 RIFF 头,也确保不会把空文件传给 CreateFileMapping
if (fileSize < sizeof(RTAG)) {
Logger::Get().Error("文件无效");
return false;
}
wil::unique_handle hFileMap(CreateFileMapping(
hFile.get(), nullptr, PAGE_READONLY, 0, 0, nullptr));
if (!hFileMap) {
Logger::Get().Win32Error("CreateFileMapping 失败");
return false;
}
wil::unique_mapview_ptr<const uint8_t> fileData((const uint8_t*)MapViewOfFile(
hFileMap.get(), FILE_MAP_READ, 0, 0, 0));
if (!fileData) {
Logger::Get().Win32Error("MapViewOfFile 失败");
return false;
}
const uint8_t* fileEnd = fileData.get() + fileSize;
// 存在 RIFF 头则 ani否则为 ico
if (((RTAG*)fileData.get())->ckID == FOURCC_RIFF) {
if (!LoadAniFromFileMap(fileData.get(), fileEnd, preferredWidth, frames, frameSequence)) {
Logger::Get().Error("LoadAniFromFileMap 失败");
return false;
}
} else {
wil::unique_hcursor hCursor = LoadIcoFromFileMap(fileData.get(), fileEnd, preferredWidth);
if (hCursor) {
frames.push_back(std::move(hCursor));
} else {
Logger::Get().Error("LoadIcoFromFileMap 失败");
return false;
}
}
return true;
}
// Tries to split an animated cursor into its frames using the GetCursorFrameInfo
// export of user32.dll, which is resolved at runtime.
//
// hCursor:        cursor to inspect, must not be null
// frames:         must be empty; receives the distinct frame handles. The handles
//                 are internal handles returned by GetCursorFrameInfo and do not
//                 need to be destroyed.
// frameSequence:  must be empty; receives one (index into frames, duration) pair
//                 per animation step
//
// Both containers are left empty if the function is unavailable, a call fails
// or the cursor is not animated.
void CursorHelper::TryResolveAnimatedCursor(
	HCURSOR hCursor,
	SmallVectorImpl<HCURSOR>& frames,
	SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) noexcept {
	assert(hCursor && frames.empty() && frameSequence.empty());

	using FnGetCursorFrameInfo = HCURSOR WINAPI(
		HCURSOR hcur,
		LPWSTR lpName,
		int iFrame,
		LPDWORD pjifRate,
		LPINT pccur
	);
	static FnGetCursorFrameInfo* getCursorFrameInfo = [] {
		return Win32Helper::LoadFunction<FnGetCursorFrameInfo>(L"user32.dll", "GetCursorFrameInfo");
	}();
	if (!getCursorFrameInfo) {
		return;
	}

	// GetCursorFrameInfo returns internal handles, they don't have to be destroyed
	DWORD jifRate = 0;
	int stepCount = 0;
	HCURSOR hCursorFrame = getCursorFrameInfo(hCursor, nullptr, 0, &jifRate, &stepCount);
	if (!hCursorFrame || stepCount <= 1) {
		// Failed or not an animated cursor
		return;
	}

	frames.reserve(stepCount);
	frameSequence.resize(stepCount);

	frames.push_back(hCursorFrame);
	frameSequence[0] = { 0, JifRateToDuration(jifRate) };

	for (int i = 1; i < stepCount; ++i) {
		// The step count reported by later calls goes into a scratch variable:
		// stepCount is both the loop bound and the size of frameSequence, so
		// letting the callee overwrite it could index frameSequence out of bounds.
		int ignoredStepCount = 0;
		hCursorFrame = getCursorFrameInfo(hCursor, nullptr, i, &jifRate, &ignoredStepCount);
		if (!hCursorFrame) {
			// Make sure the results are empty on failure
			frames.clear();
			frameSequence.clear();
			return;
		}

		// Duplicate frames are stored once and referenced through the sequence
		const uint32_t frameCount = (uint32_t)frames.size();
		uint32_t j = 0;
		for (; j < frameCount; ++j) {
			if (frames[j] == hCursorFrame) {
				break;
			}
		}

		if (j == frameCount) {
			frames.push_back(hCursorFrame);
		}

		frameSequence[i] = { j, JifRateToDuration(jifRate) };
	}
}
}

View file

@ -0,0 +1,30 @@
#pragma once
#include "SmallVector.h"
namespace Magpie {
// Stateless helpers for extracting cursors from modules, cursor files and
// animated cursor handles.
struct CursorHelper {
// Extracts a cursor from an RT_GROUP_CURSOR resource. If there is no entry whose
// width matches preferredWidth exactly, a larger one is preferred. Returns an
// empty handle on failure.
static wil::unique_hcursor ExtractCursorFromModule(
HMODULE hModule,
LPCWSTR resName,
uint32_t preferredWidth
) noexcept;
// Loads all frames of a cursor file. Supports the ICO-container cursor format
// (the implementation requires the cursor type, i.e. .cur) and .ani.
// frames and frameSequence must be empty on entry. frameSequence holds
// (frame index, duration) pairs and stays empty for non-animated cursors.
static bool ExtractCursorFramesFromFile(
const wchar_t* fileName,
uint32_t preferredWidth,
SmallVectorImpl<wil::unique_hcursor>& frames,
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) noexcept;
// Splits an animated cursor into its frames. The handles stored in frames do not
// need to be destroyed. Both containers stay empty on failure or when the cursor
// is not animated.
static void TryResolveAnimatedCursor(
HCURSOR hCursor,
SmallVectorImpl<HCURSOR>& frames,
SmallVectorImpl<std::pair<uint32_t, std::chrono::nanoseconds>>& frameSequence
) noexcept;
};
}

File diff suppressed because it is too large Load diff

View file

@ -8,55 +8,54 @@ public:
CursorManager(const CursorManager&) = delete;
CursorManager(CursorManager&&) = delete;
void Initialize(
const RECT& srcRect,
const RECT& rendererRect,
const RECT& destRect,
bool isSrcMoving,
bool isSrcFocused
) noexcept;
~CursorManager() noexcept;
void Update() noexcept;
std::pair<HCURSOR, POINT> Update() noexcept;
void OnScalingPosChanged() noexcept;
void OnResizingChanged(bool value) noexcept;
void OnSrcStartMove() noexcept;
void OnResized(const RECT& rendererRect, const RECT& destRect) noexcept;
void OnSrcEndMove() noexcept;
void OnMovingChanged(bool value) noexcept;
void OnStartMove() noexcept;
void OnMoved(const RECT& rendererRect, const RECT& destRect) noexcept;
void OnEndResizeMove() noexcept;
void OnSrcMovingChanged(bool value) noexcept;
void OnSrcRectChanged() noexcept;
void OnSrcMoved(const RECT& srcRect) noexcept;
// 光标不在缩放窗口上或隐藏时为 NULL
HCURSOR CursorHandle() const noexcept {
return _hCursor;
}
void OnSrcFocusChanged(bool focused) noexcept;
// 屏幕坐标
POINT CursorPos() const noexcept {
return _cursorPos;
}
bool IsCursorCaptured() const noexcept {
return _isUnderCapture;
}
bool IsCursorCapturedOnForeground() const noexcept {
return _isCapturedOnForeground;
}
bool IsCursorOnOverlay() const noexcept {
return _isOnOverlay;
}
void IsCursorOnOverlay(bool value) noexcept;
void OnCursorOnOverlayChanged(bool value) noexcept;
bool IsCursorCapturedOnOverlay() const noexcept {
return _isCapturedOnOverlay;
}
void IsCursorCapturedOnOverlay(bool value) noexcept;
int16_t SrcHitTest() const noexcept {
int16_t GetSrcHitTest() const noexcept {
return _lastCompletedHitTestResult;
}
private:
POINT _SrcToScaling(POINT pt, bool skipBorder) const noexcept;
// Rounding mode selectable by callers of _ScalingToSrc (defaults to Round there)
enum class _RoundMethod {
Round,
Floor,
Ceil
};
POINT _ScalingToSrc(POINT pt, _RoundMethod roundType = _RoundMethod::Round) const noexcept;
void _ShowSystemCursor(bool show, bool onDestory = false);
void _AdjustCursorSpeed() noexcept;
@ -77,14 +76,18 @@ private:
void _UpdateCursorPos() noexcept;
void _StartCapture(POINT& cursorPos) noexcept;
void _StartVirtualization(POINT& cursorPos) noexcept;
bool _StopCapture(POINT& cursorPos, bool onDestroy = false) noexcept;
bool _StopVirtualization(POINT& cursorPos, bool onDestroy = false) noexcept;
void _SetClipCursor(const RECT& clipRect, bool is3DGameMode = false) noexcept;
void _RestoreClipCursor() noexcept;
RECT _srcRect{};
RECT _rendererRect{};
RECT _destRect{};
HCURSOR _hCursor = NULL;
POINT _cursorPos{ std::numeric_limits<LONG>::max() };
@ -104,8 +107,13 @@ private:
POINT _lastCompletedHitTestPos{ std::numeric_limits<LONG>::max() };
int16_t _lastCompletedHitTestResult = HTNOWHERE;
bool _isUnderCapture = false;
// 当缩放后的光标位置在交换链窗口上且没有被其他窗口挡住时应绘制光标
bool _isMoving = false;
bool _isResizing = false;
bool _isSrcMoving = false;
bool _isSrcFocused = false;
bool _isVirtualized = false;
// 当缩放后的光标位置在渲染矩形内且没有被其他窗口挡住时应绘制光标
bool _shouldDrawCursor = false;
bool _isCapturedOnForeground = false;

View file

@ -0,0 +1,634 @@
#include "pch.h"
#include "D3D12Context.h"
#include "ScalingWindow.h"
#include "Logger.h"
#include "AppFolderManager.h"
#include "DirectXHelper.h"
#include "StrHelper.h"
#include "DescriptorHeap.h"
#include "Win32Helper.h"
namespace Magpie {
// Performs the full initialization of the context: creates the DXGI factory,
// selects an adapter and creates the D3D12 device, queries the device
// capabilities cached by this class, logs them, and finally creates the
// per-context device resources (see _InitializeDeviceResources).
//
// The two descriptor heaps are only stored by address.
//
// Returns false if the factory, the device or the device resources cannot be
// created. A failed capability query is only logged as a warning and leaves the
// corresponding member untouched.
bool D3D12Context::Initialize(
const GraphicsCardId& graphicsCardId,
uint32_t maxInFlightFrameCount,
D3D12_COMMAND_QUEUE_PRIORITY priority,
D3D12_COMMAND_LIST_TYPE commandListType,
DescriptorHeap& csuDescriptorHeap,
DescriptorHeap& rtvDescriptorHeap,
bool disableFrameFenceTracking
) noexcept {
_csuDescriptorHeap = &csuDescriptorHeap;
_rtvDescriptorHeap = &rtvDescriptorHeap;
HRESULT hr = _CreateDXGIFactory();
if (FAILED(hr)) {
Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
return false;
}
if (!_CreateAdapterAndDevice(graphicsCardId)) {
Logger::Get().Error("_CreateAdapterAndDevice 失败");
return false;
}
#ifdef _DEBUG
// Break into the debugger when the debug layer reports corruption, an error or a warning
if (winrt::com_ptr<ID3D12InfoQueue> infoQueue = _device.try_as<ID3D12InfoQueue>()) {
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, TRUE);
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, TRUE);
infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_WARNING, TRUE);
}
#endif
// Must run before the FP16 checks below, which read _shaderModel
_QueryHighestShaderModel();
// Check the root signature version
{
D3D12_FEATURE_DATA_ROOT_SIGNATURE featureData = { .HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_1 };
hr = _device->CheckFeatureSupport(D3D12_FEATURE_ROOT_SIGNATURE, &featureData, sizeof(featureData));
if (SUCCEEDED(hr)) {
_rootSignatureVersion = featureData.HighestVersion;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
}
// Check whether this is an integrated GPU (reported through the UMA flag)
{
D3D12_FEATURE_DATA_ARCHITECTURE1 data{};
hr = _device->CheckFeatureSupport(D3D12_FEATURE_ARCHITECTURE1, &data, sizeof(data));
if (SUCCEEDED(hr)) {
_isUMA = data.UMA;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
}
// Check D3D12_HEAP_FLAG_CREATE_NOT_ZEROED support. Whether this feature is
// available depends only on the D3D12 version; although the Agility SDK is
// deployed with the program, old versions of Win10 cannot load it.
// https://devblogs.microsoft.com/directx/coming-to-directx-12-more-control-over-memory-allocation/
_isHeapFlagCreateNotZeroedSupported = (bool)_device.try_as<ID3D12Device8>();
// Check Resizable BAR support
{
D3D12_FEATURE_DATA_D3D12_OPTIONS16 data{};
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS16, &data, sizeof(data));
if (SUCCEEDED(hr)) {
_isGPUUploadHeapSupported = data.GPUUploadHeapSupported;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
}
// Check FP16 support (skipped entirely when the user disabled FP16)
if (!ScalingWindow::Get().Options().IsFP16Disabled()) {
{
D3D12_FEATURE_DATA_D3D12_OPTIONS data{};
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &data, sizeof(data));
if (SUCCEEDED(hr)) {
_isMinFloat16Supported = data.MinPrecisionSupport & D3D12_SHADER_MIN_PRECISION_SUPPORT_16_BIT;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
}
// Native 16-bit scalars are supported starting with SM 6.2
if (_shaderModel >= D3D_SHADER_MODEL_6_2) {
D3D12_FEATURE_DATA_D3D12_OPTIONS4 data{};
hr = _device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS4, &data, sizeof(data));
if (SUCCEEDED(hr)) {
_isNative16BitSupported = data.Native16BitShaderOpsSupported;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
}
}
_LogDeviceInfo();
if (!_InitializeDeviceResources(
maxInFlightFrameCount, priority, commandListType, disableFrameFenceTracking)) {
Logger::Get().Error("_InitializeDeviceResources 失败");
return false;
}
return true;
}
// Takes over the device and the cached device capabilities of another context
// so that they don't have to be created/queried again. The factory, adapter,
// queue, command list, allocators and fence are not copied; call
// InitializeAfterCopyDevice() afterwards to create them for this context.
void D3D12Context::CopyDevice(const D3D12Context& other) {
	// The device itself (the com_ptr copy adds a reference)
	_device = other._device;

	// Descriptor heaps are referenced by address only
	_rtvDescriptorHeap = other._rtvDescriptorHeap;
	_csuDescriptorHeap = other._csuDescriptorHeap;

	// Capabilities that were queried from the device
	_rootSignatureVersion = other._rootSignatureVersion;
	_shaderModel = other._shaderModel;
	_isNative16BitSupported = other._isNative16BitSupported;
	_isMinFloat16Supported = other._isMinFloat16Supported;
	_isGPUUploadHeapSupported = other._isGPUUploadHeapSupported;
	_isHeapFlagCreateNotZeroedSupported = other._isHeapFlagCreateNotZeroedSupported;
	_isUMA = other._isUMA;
}
// Completes the initialization of a context whose device was obtained through
// CopyDevice(): creates a DXGI factory, looks up the adapter the device belongs
// to and creates the per-context device resources.
//
// Parameters have the same meaning as in Initialize().
// Returns false (after logging the failing step) if any step fails.
bool D3D12Context::InitializeAfterCopyDevice(
	uint32_t maxInFlightFrameCount,
	D3D12_COMMAND_QUEUE_PRIORITY priority,
	D3D12_COMMAND_LIST_TYPE commandListType,
	bool disableFrameFenceTracking
) noexcept {
	const HRESULT hr = _CreateDXGIFactory();
	if (FAILED(hr)) {
		Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
		return false;
	}

	if (!_CreateAdapterFromDevice()) {
		// _CreateAdapterFromDevice only reports success/failure, so there is no
		// HRESULT to attach; hr still holds the success code of the step above.
		Logger::Get().Error("_CreateAdapterFromDevice 失败");
		return false;
	}

	if (!_InitializeDeviceResources(maxInFlightFrameCount, priority, commandListType, disableFrameFenceTracking)) {
		Logger::Get().Error("_InitializeDeviceResources 失败");
		return false;
	}

	return true;
}
// Returns a DXGI factory suitable for enumerating adapters. The cached factory
// is re-created when IsCurrent() reports that it is stale.
//
// Returns nullptr if the factory has to be (re-)created and that fails.
IDXGIFactory7* D3D12Context::GetDXGIFactoryForEnumingAdapters() noexcept {
	// Also (re-)create when there is no factory at all, e.g. because an earlier
	// re-creation attempt failed; dereferencing a null factory here would crash
	// instead of giving the caller another chance.
	if (!_dxgiFactory || !_dxgiFactory->IsCurrent()) {
		const HRESULT hr = _CreateDXGIFactory();
		if (FAILED(hr)) {
			Logger::Get().ComError("_CreateDXGIFactory 失败", hr);
			return nullptr;
		}
	}

	return _dxgiFactory.get();
}
// Advances the context's fence counter, hands the new value back through
// fenceValue and enqueues a signal of the fence to that value on the command
// queue. fenceValue is written even if the Signal call fails.
HRESULT D3D12Context::Signal(uint64_t& fenceValue) noexcept {
	const uint64_t signaledValue = ++_curFenceValue;
	fenceValue = signaledValue;
	return _commandQueue->Signal(_fence.get(), signaledValue);
}
// Waits until the fence has reached fenceValue. Returns S_OK immediately when
// the value has already been reached, otherwise the result of
// SetEventOnCompletion.
HRESULT D3D12Context::WaitForFenceValue(uint64_t fenceValue) noexcept {
	const bool alreadyReached = _fence->GetCompletedValue() >= fenceValue;
	if (alreadyReached) {
		return S_OK;
	}

	// With a null event handle SetEventOnCompletion does not return until the
	// fence reaches the requested value.
	return _fence->SetEventOnCompletion(fenceValue, nullptr);
}
// Signals the fence with a fresh value on this context's command queue and
// waits for that value, i.e. waits for everything submitted to the queue so far.
HRESULT D3D12Context::WaitForGpu() noexcept {
	const uint64_t targetValue = ++_curFenceValue;

	const HRESULT signalHr = _commandQueue->Signal(_fence.get(), targetValue);
	if (FAILED(signalHr)) {
		Logger::Get().ComError("ID3D12CommandQueue::Signal 失败", signalHr);
		return signalHr;
	}

	return WaitForFenceValue(targetValue);
}
// Makes this context's command queue wait (on the GPU timeline) for the work
// currently submitted to another queue: the other queue signals this context's
// fence with a fresh value and this context's queue waits for that value.
HRESULT D3D12Context::WaitForCommandQueue(ID3D12CommandQueue* commandQueue) noexcept {
	const uint64_t syncValue = ++_curFenceValue;

	if (const HRESULT hr = commandQueue->Signal(_fence.get(), syncValue); FAILED(hr)) {
		Logger::Get().ComError("ID3D12CommandQueue::Signal 失败", hr);
		return hr;
	}

	if (const HRESULT hr = _commandQueue->Wait(_fence.get(), syncValue); FAILED(hr)) {
		Logger::Get().ComError("ID3D12CommandQueue::Wait 失败", hr);
		return hr;
	}

	return S_OK;
}
// Prepares the command list for recording a new frame.
//
// When per-frame fence tracking is enabled, first waits for the fence value
// recorded for the current frame slot. Then resets that slot's command
// allocator and resets the command list onto it with initialState.
// On success the index of the current frame slot is written to curFrameIndex.
HRESULT D3D12Context::BeginFrame(uint32_t& curFrameIndex, ID3D12PipelineState* initialState) noexcept {
	const bool trackFrameFences = !_frameFenceValues.empty();
	if (trackFrameFences) {
		const HRESULT waitHr = WaitForFenceValue(_frameFenceValues[_curFrameIndex]);
		if (FAILED(waitHr)) {
			Logger::Get().ComError("WaitForFenceValue 失败", waitHr);
			return waitHr;
		}
	}

	ID3D12CommandAllocator* const frameAllocator = _commandAllocators[_curFrameIndex].get();

	const HRESULT allocatorHr = frameAllocator->Reset();
	if (FAILED(allocatorHr)) {
		Logger::Get().ComError("ID3D12CommandAllocator::Reset 失败", allocatorHr);
		return allocatorHr;
	}

	const HRESULT listHr = _commandList->Reset(frameAllocator, initialState);
	if (FAILED(listHr)) {
		Logger::Get().ComError("ID3D12GraphicsCommandList::Reset 失败", listHr);
		return listHr;
	}

	curFrameIndex = _curFrameIndex;
	return S_OK;
}
// Finishes the current frame slot. When per-frame fence tracking is enabled, a
// fence signal is enqueued and its value stored for the slot so that
// BeginFrame can later wait on it. Afterwards advances to the next slot,
// wrapping around at the number of command allocators.
HRESULT D3D12Context::EndFrame() noexcept {
	if (!_frameFenceValues.empty()) {
		if (const HRESULT hr = Signal(_frameFenceValues[_curFrameIndex]); FAILED(hr)) {
			Logger::Get().ComError("Signal 失败", hr);
			return hr;
		}
	}

	const uint32_t frameSlotCount = (uint32_t)_commandAllocators.size();
	_curFrameIndex = (_curFrameIndex + 1) % frameSlotCount;
	return S_OK;
}
// Creates the DXGI factory into _dxgiFactory (with the debug flag in DEBUG
// builds). Logs and returns the HRESULT of CreateDXGIFactory2.
HRESULT D3D12Context::_CreateDXGIFactory() noexcept {
#ifdef _DEBUG
	const UINT factoryFlags = DXGI_CREATE_FACTORY_DEBUG;
#else
	const UINT factoryFlags = 0;
#endif

	const HRESULT hr = CreateDXGIFactory2(factoryFlags, IID_PPV_ARGS(&_dxgiFactory));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateDXGIFactory2 失败", hr);
	}

	return hr;
}
// Creates the objects that belong to this context rather than to the device:
// the command queue, the command list, one command allocator per in-flight
// frame and the fence.
//
// maxInFlightFrameCount      number of command allocators (frame slots)
// priority / commandListType forwarded to the queue, list and allocators
// disableFrameFenceTracking  when true no per-frame fence values are kept, so
//                            BeginFrame/EndFrame neither wait nor signal
//
// Returns false (after logging) as soon as one creation fails.
bool D3D12Context::_InitializeDeviceResources(
	uint32_t maxInFlightFrameCount,
	D3D12_COMMAND_QUEUE_PRIORITY priority,
	D3D12_COMMAND_LIST_TYPE commandListType,
	bool disableFrameFenceTracking
) noexcept {
	const D3D12_COMMAND_QUEUE_DESC queueDesc = {
		.Type = commandListType,
		.Priority = priority,
		.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE
	};
	HRESULT hr = _device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&_commandQueue));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateCommandQueue 失败", hr);
		return false;
	}

	// No allocator is passed here; BeginFrame supplies one when it resets the list
	hr = _device->CreateCommandList1(
		0, commandListType, D3D12_COMMAND_LIST_FLAG_NONE, IID_PPV_ARGS(&_commandList));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateCommandList1 失败", hr);
		return false;
	}

	_commandAllocators.resize(maxInFlightFrameCount);
	for (auto& allocator : _commandAllocators) {
		hr = _device->CreateCommandAllocator(commandListType, IID_PPV_ARGS(&allocator));
		if (FAILED(hr)) {
			Logger::Get().ComError("CreateCommandAllocator 失败", hr);
			return false;
		}
	}

	hr = _device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&_fence));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateFence 失败", hr);
		return false;
	}

	// Per-frame fence values are unnecessary when synchronization is done externally
	if (disableFrameFenceTracking) {
		return true;
	}

	_frameFenceValues.resize(maxInFlightFrameCount);
	return true;
}
// Unlike D3D12SDKLayers.dll, the OS does not honor D3D12SDKPath when it loads
// d3d10warp.dll. This function makes sure the matching d3d10warp.dll is loaded:
// if the D3D12Core.dll in use is the one deployed with the program, the
// d3d10warp.dll deployed with the program is loaded explicitly.
// Must be called before d3d10warp.dll has been loaded (asserted).
static void FixD3D10WarpDll(IDXGIAdapter1* warpAdapter) noexcept {
	assert(!GetModuleHandle(L"d3d10warp.dll"));

	HMODULE hCoreModule = GetModuleHandle(L"D3D12Core.dll");
	if (!hCoreModule) {
		// D3D12Core.dll isn't loaded yet; this call gets it loaded
		D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0, winrt::guid_of<ID3D12Device>(), nullptr);
		hCoreModule = GetModuleHandle(L"D3D12Core.dll");
	}

	if (!hCoreModule) {
		// The OS possibly doesn't support the Agility SDK
		return;
	}

	// Check whether the loaded D3D12Core.dll is the one deployed with the program
	std::wstring coreModulePath;
	wil::GetModuleFileNameW(hCoreModule, coreModulePath);
	const bool isDeployedCore =
		coreModulePath.starts_with(AppFolderManager::Get().GetExeDir().native());
	if (!isDeployedCore) {
		return;
	}

	// Load the d3d10warp.dll deployed with the program
	const std::filesystem::path deployedWarpPath =
		AppFolderManager::Get().GetD3D12Dir() / L"d3d10warp.dll";
	LoadLibrary(deployedWarpPath.c_str());
}
// Selects a graphics adapter and creates the D3D12 device on it.
//
// Unless the options force WARP, adapters are tried in this order:
//   1. the adapter at graphicsCardId.idx, if its vendor/device IDs still match;
//   2. otherwise the first adapter whose vendor/device IDs match;
//   3. every non-WARP adapter in enumeration order.
// If none of them yields a device (or WARP is forced) the WARP adapter is used.
//
// Returns true once _TryCreateD3DDevice succeeds for some adapter.
bool D3D12Context::_CreateAdapterAndDevice(const GraphicsCardId& graphicsCardId) noexcept {
winrt::com_ptr<IDXGIAdapter1> adapter;
if (!ScalingWindow::Get().Options().UseWarp()) {
// Remember the index of the adapter that doesn't support D3D12 so it isn't tried again
int failedIdx = -1;
if (graphicsCardId.idx >= 0) {
assert(graphicsCardId.vendorId != 0 && graphicsCardId.deviceId != 0);
// Try the stored index first
HRESULT hr = _dxgiFactory->EnumAdapters1(graphicsCardId.idx, adapter.put());
if (SUCCEEDED(hr)) {
DXGI_ADAPTER_DESC1 desc;
hr = adapter->GetDesc1(&desc);
if (SUCCEEDED(hr)) {
if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
if (_TryCreateD3DDevice(adapter, desc)) {
return true;
}
failedIdx = graphicsCardId.idx;
Logger::Get().Warn("用户指定的显示卡不支持 D3D12");
} else {
Logger::Get().Warn("显卡配置已变化");
}
}
}
// If that adapter is already known not to support D3D12, don't try it again
if (failedIdx == -1) {
// Enumerate to find an adapter whose vendorId and deviceId match
for (UINT adapterIdx = 0;
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
++adapterIdx
) {
if ((int)adapterIdx == graphicsCardId.idx) {
// graphicsCardId.idx has already been checked
continue;
}
DXGI_ADAPTER_DESC1 desc;
hr = adapter->GetDesc1(&desc);
if (FAILED(hr)) {
continue;
}
if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
if (_TryCreateD3DDevice(adapter, desc)) {
return true;
}
failedIdx = (int)adapterIdx;
Logger::Get().Warn("用户指定的显示卡不支持 D3D12");
break;
}
}
}
}
// Enumerate to find the first adapter that supports D3D12
for (UINT adapterIdx = 0;
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
++adapterIdx
) {
if ((int)adapterIdx == failedIdx) {
// No need to try it again
continue;
}
DXGI_ADAPTER_DESC1 desc;
HRESULT hr = adapter->GetDesc1(&desc);
// WARP is skipped here; it is handled separately below
if (FAILED(hr) || DirectXHelper::IsWARP(desc)) {
continue;
}
if (_TryCreateD3DDevice(adapter, desc)) {
return true;
}
}
}
// As a last resort, fall back to CPU rendering (WARP)
// https://docs.microsoft.com/en-us/windows/win32/direct3darticles/directx-warp
HRESULT hr = _dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
if (FAILED(hr)) {
Logger::Get().ComError("EnumWarpAdapter 失败", hr);
return false;
}
// Function-local static: its initializer, and therefore FixD3D10WarpDll, only
// runs the first time execution reaches this point.
[[maybe_unused]] static Ignore _ = [](IDXGIAdapter1* warpAdapter) {
FixD3D10WarpDll(warpAdapter);
return Ignore();
}(adapter.get());
DXGI_ADAPTER_DESC1 desc;
hr = adapter->GetDesc1(&desc);
if (FAILED(hr) || !_TryCreateD3DDevice(adapter, desc)) {
Logger::Get().Error("创建 WARP 设备失败");
return false;
}
return true;
}
// Tries to create the D3D12 device (feature level 11_0) on the given adapter.
// On success stores the device and the adapter (as IDXGIAdapter4) in this
// context, logs the adapter's identification and returns true.
// adapterDesc is only used for that log entry.
bool D3D12Context::_TryCreateD3DDevice(
	const winrt::com_ptr<IDXGIAdapter1>& adapter,
	const DXGI_ADAPTER_DESC1& adapterDesc
) noexcept {
	const HRESULT createHr =
		D3D12CreateDevice(adapter.get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&_device));
	if (FAILED(createHr)) {
		Logger::Get().ComError("D3D12CreateDevice 失败", createHr);
		return false;
	}

	_dxgiAdapter = adapter.try_as<IDXGIAdapter4>();
	if (!_dxgiAdapter) {
		Logger::Get().Error("获取 IDXGIAdapter4 失败");
		return false;
	}

	Logger::Get().Info(fmt::format(
		"图形适配器\n\tVendorId: {:#x}\n\tDeviceId: {:#x}\n\tDescription: {}",
		adapterDesc.VendorId,
		adapterDesc.DeviceId,
		StrHelper::UTF16ToUTF8(adapterDesc.Description)
	));
	return true;
}
bool D3D12Context::_CreateAdapterFromDevice() noexcept {
const LUID adapterLuid = _device->GetAdapterLuid();
winrt::com_ptr<IDXGIAdapter1> adapter;
for (UINT adapterIdx = 0;
SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
++adapterIdx
) {
DXGI_ADAPTER_DESC1 desc;
HRESULT hr = adapter->GetDesc1(&desc);
if (FAILED(hr)) {
continue;
}
if (desc.AdapterLuid != adapterLuid) {
continue;
}
_dxgiAdapter = adapter.try_as<IDXGIAdapter4>();
if (_dxgiAdapter) {
return true;
} else {
Logger::Get().Error("获取 IDXGIAdapter4 失败");
return false;
}
}
return false;
}
void D3D12Context::_QueryHighestShaderModel() noexcept {
// 如果运行时不知道 HighestShaderModelCheckFeatureSupport 将返回 E_INVALIDARG
// (这只会发生在不支持 Agility SDK 的旧版本 Win10 上)。官方推荐从新到旧依次检查每
// 个版本。
constexpr std::array allModelVersions = {
D3D_SHADER_MODEL_6_9,
D3D_SHADER_MODEL_6_8,
D3D_SHADER_MODEL_6_7,
D3D_SHADER_MODEL_6_6,
D3D_SHADER_MODEL_6_5,
D3D_SHADER_MODEL_6_4,
D3D_SHADER_MODEL_6_3,
D3D_SHADER_MODEL_6_2,
D3D_SHADER_MODEL_6_1,
D3D_SHADER_MODEL_6_0,
D3D_SHADER_MODEL_5_1
};
constexpr uint32_t versionCount = (uint32_t)std::size(allModelVersions);
HighestShaderModel versionLimit = ScalingWindow::Get().Options().highestShaderModel;
uint32_t startIdx = versionLimit == HighestShaderModel::NotLimited ? 0 : (uint32_t)versionLimit - 1;
for (uint32_t i = startIdx; i < versionCount; ++i) {
D3D12_FEATURE_DATA_SHADER_MODEL data = { .HighestShaderModel = allModelVersions[i]};
HRESULT hr = _device->CheckFeatureSupport(D3D12_FEATURE_SHADER_MODEL, &data, sizeof(data));
if (hr == E_INVALIDARG) {
continue;
}
if (SUCCEEDED(hr)) {
_shaderModel = data.HighestShaderModel;
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
}
return;
}
}
// Writes one info log entry that summarizes the created device: the maximum
// supported feature level (queried here), followed by the capabilities cached
// in this object (shader model, root signature version, UMA, and the heap /
// upload heap / 16-bit flags).
void D3D12Context::_LogDeviceInfo() noexcept {
std::string_view featureLevel;
{
// Feature levels to ask about
D3D_FEATURE_LEVEL featureLevels[] = {
D3D_FEATURE_LEVEL_12_2,
D3D_FEATURE_LEVEL_12_1,
D3D_FEATURE_LEVEL_12_0,
D3D_FEATURE_LEVEL_11_1,
D3D_FEATURE_LEVEL_11_0
};
D3D12_FEATURE_DATA_FEATURE_LEVELS featureData = {
.NumFeatureLevels = (UINT)std::size(featureLevels),
.pFeatureLevelsRequested = featureLevels
};
HRESULT hr = _device->CheckFeatureSupport(
D3D12_FEATURE_FEATURE_LEVELS, &featureData, sizeof(featureData));
if (SUCCEEDED(hr)) {
// Map the reported level to its display string
switch (featureData.MaxSupportedFeatureLevel) {
case D3D_FEATURE_LEVEL_12_2:
featureLevel = "12.2";
break;
case D3D_FEATURE_LEVEL_12_1:
featureLevel = "12.1";
break;
case D3D_FEATURE_LEVEL_12_0:
featureLevel = "12.0";
break;
case D3D_FEATURE_LEVEL_11_1:
featureLevel = "11.1";
break;
case D3D_FEATURE_LEVEL_11_0:
featureLevel = "11.0";
break;
default:
featureLevel = "未知";
break;
}
} else {
Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
featureLevel = "未知";
}
}
// Display string for the cached shader model; anything not listed is shown as 5.1
std::string_view shaderModel;
switch (_shaderModel) {
case D3D_SHADER_MODEL_6_9:
shaderModel = "6.9";
break;
case D3D_SHADER_MODEL_6_8:
shaderModel = "6.8";
break;
case D3D_SHADER_MODEL_6_7:
shaderModel = "6.7";
break;
case D3D_SHADER_MODEL_6_6:
shaderModel = "6.6";
break;
case D3D_SHADER_MODEL_6_5:
shaderModel = "6.5";
break;
case D3D_SHADER_MODEL_6_4:
shaderModel = "6.4";
break;
case D3D_SHADER_MODEL_6_3:
shaderModel = "6.3";
break;
case D3D_SHADER_MODEL_6_2:
shaderModel = "6.2";
break;
case D3D_SHADER_MODEL_6_1:
shaderModel = "6.1";
break;
case D3D_SHADER_MODEL_6_0:
shaderModel = "6.0";
break;
default:
shaderModel = "5.1";
break;
}
// Indexed with a bool below: [false], [true]
constexpr const char* boolStrs[] = { "","" };
Logger::Get().Info(fmt::format(R"(已创建 D3D12 设备
: {}
shader model : {}
: {}
: {}
D3D12_HEAP_FLAG_CREATE_NOT_ZEROED : {}
Resizable BAR : {}
min16float : {}
16 : {})",
featureLevel,
shaderModel,
_rootSignatureVersion == D3D_ROOT_SIGNATURE_VERSION_1_1 ? "1.1" : "1.0",
boolStrs[_isUMA],
boolStrs[_isHeapFlagCreateNotZeroedSupported],
boolStrs[_isGPUUploadHeapSupported],
boolStrs[_isMinFloat16Supported],
boolStrs[_isNative16BitSupported]
));
}
}

View file

@ -0,0 +1,156 @@
#pragma once
#include "ScalingOptions.h"
namespace Magpie {
class DescriptorHeap;
class D3D12Context {
public:
D3D12Context() = default;
D3D12Context(const D3D12Context&) = delete;
D3D12Context(D3D12Context&&) = delete;
bool Initialize(
const GraphicsCardId& graphicsCardId,
uint32_t maxInFlightFrameCount,
D3D12_COMMAND_QUEUE_PRIORITY priority,
D3D12_COMMAND_LIST_TYPE commandListType,
DescriptorHeap& csuDescriptorHeap,
DescriptorHeap& rtvDescriptorHeap,
bool disableFrameFenceTracking = false
) noexcept;
void CopyDevice(const D3D12Context& other);
bool InitializeAfterCopyDevice(
uint32_t maxInFlightFrameCount,
D3D12_COMMAND_QUEUE_PRIORITY priority,
D3D12_COMMAND_LIST_TYPE commandListType,
bool disableFrameTracking = false
) noexcept;
DescriptorHeap& GetDescriptorHeap(bool rtv = false) const noexcept {
return rtv ? *_rtvDescriptorHeap : *_csuDescriptorHeap;
}
IDXGIFactory7* GetDXGIFactory() const noexcept {
return _dxgiFactory.get();
}
IDXGIFactory7* GetDXGIFactoryForEnumingAdapters() noexcept;
IDXGIAdapter4* GetDXGIAdapter() const noexcept {
return _dxgiAdapter.get();
}
ID3D12Device5* GetDevice() const noexcept {
return _device.get();
}
ID3D12CommandQueue* GetCommandQueue() const noexcept {
return _commandQueue.get();
}
ID3D12GraphicsCommandList* GetCommandList() const noexcept {
return _commandList.get();
}
D3D_SHADER_MODEL GetShaderModel() const noexcept {
return _shaderModel;
}
D3D_ROOT_SIGNATURE_VERSION GetRootSignatureVersion() const noexcept {
return _rootSignatureVersion;
}
bool IsUMA() const noexcept {
return _isUMA;
}
bool IsHeapFlagCreateNotZeroedSupported() const noexcept {
return _isHeapFlagCreateNotZeroedSupported;
}
bool IsGPUUploadHeapSupported() const noexcept {
return _isGPUUploadHeapSupported;
}
bool IsMinFloat16Supported() const noexcept {
return _isMinFloat16Supported;
}
bool IsNative16BitSupported() const noexcept {
return _isNative16BitSupported;
}
uint32_t GetMaxInFlightFrameCount() const noexcept {
return (uint32_t)_commandAllocators.size();
}
HRESULT Signal(uint64_t& fenceValue) noexcept;
HRESULT WaitForFenceValue(uint64_t fenceValue) noexcept;
HRESULT WaitForGpu() noexcept;
HRESULT WaitForCommandQueue(ID3D12CommandQueue* commandQueue) noexcept;
HRESULT BeginFrame(
uint32_t& curFrameIndex,
ID3D12PipelineState* initialState = nullptr
) noexcept;
HRESULT EndFrame() noexcept;
private:
HRESULT _CreateDXGIFactory() noexcept;
bool _InitializeDeviceResources(
uint32_t maxInFlightFrameCount,
D3D12_COMMAND_QUEUE_PRIORITY priority,
D3D12_COMMAND_LIST_TYPE commandListType,
bool disableFrameFenceTracking
) noexcept;
bool _CreateAdapterAndDevice(const GraphicsCardId& graphicsCardId) noexcept;
bool _TryCreateD3DDevice(
const winrt::com_ptr<IDXGIAdapter1>& adapter,
const DXGI_ADAPTER_DESC1& adapterDesc
) noexcept;
bool _CreateAdapterFromDevice() noexcept;
void _QueryHighestShaderModel() noexcept;
void _LogDeviceInfo() noexcept;
DescriptorHeap* _csuDescriptorHeap = nullptr;
DescriptorHeap* _rtvDescriptorHeap = nullptr;
winrt::com_ptr<IDXGIFactory7> _dxgiFactory;
winrt::com_ptr<IDXGIAdapter4> _dxgiAdapter;
winrt::com_ptr<ID3D12Device5> _device;
winrt::com_ptr<ID3D12CommandQueue> _commandQueue;
std::vector<winrt::com_ptr<ID3D12CommandAllocator>> _commandAllocators;
winrt::com_ptr<ID3D12GraphicsCommandList> _commandList;
winrt::com_ptr<ID3D12Fence1> _fence;
uint64_t _curFenceValue = 0;
std::vector<uint64_t> _frameFenceValues;
uint32_t _curFrameIndex = 0;
D3D_SHADER_MODEL _shaderModel = D3D_SHADER_MODEL_5_1;
D3D_ROOT_SIGNATURE_VERSION _rootSignatureVersion = D3D_ROOT_SIGNATURE_VERSION_1_0;
bool _isUMA = false;
bool _isHeapFlagCreateNotZeroedSupported = false;
bool _isGPUUploadHeapSupported = false;
bool _isMinFloat16Supported = false;
bool _isNative16BitSupported = false;
};
}

View file

@ -1,4 +1,4 @@
// 复制自 https://github.com/microsoft/DirectXTex/blob/652cc82b35ff9e14097d12eff73f53348361ff15/DirectXTex/DDS.h
// 复制自 https://github.com/microsoft/DirectXTex/blob/55b96d1d0ab5d9efe2112cd0318470976a2380b5/DirectXTex/DDS.h
//--------------------------------------------------------------------------------------
// DDS.h
@ -13,12 +13,13 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
//
// http://go.microsoft.com/fwlink/?LinkId=248926
// https://go.microsoft.com/fwlink/?LinkId=248926
// http://go.microsoft.com/fwlink/?LinkId=248929
// http://go.microsoft.com/fwlink/?LinkID=615561
//--------------------------------------------------------------------------------------
#pragma once
#include <cstdint>
namespace Magpie {
@ -53,10 +54,10 @@ struct DDS_PIXELFORMAT {
#ifndef MAKEFOURCC
#define MAKEFOURCC(ch0, ch1, ch2, ch3) \
(static_cast<uint32_t>(static_cast<uint8_t>(ch0)) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch1)) << 8) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch2)) << 16) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch3)) << 24))
(static_cast<uint32_t>(static_cast<uint8_t>(ch0)) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch1)) << 8) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch2)) << 16) \
| (static_cast<uint32_t>(static_cast<uint8_t>(ch3)) << 24))
#endif /* MAKEFOURCC */
#ifndef DDSGLOBALCONST
@ -220,8 +221,8 @@ DDSGLOBALCONST DDS_PIXELFORMAT DDSPF_DX10 =
#define DDS_CUBEMAP_NEGATIVEZ 0x00008200 // DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_NEGATIVEZ
#define DDS_CUBEMAP_ALLFACES ( DDS_CUBEMAP_POSITIVEX | DDS_CUBEMAP_NEGATIVEX |\
DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\
DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ )
DDS_CUBEMAP_POSITIVEY | DDS_CUBEMAP_NEGATIVEY |\
DDS_CUBEMAP_POSITIVEZ | DDS_CUBEMAP_NEGATIVEZ )
#define DDS_CUBEMAP 0x00000200 // DDSCAPS2_CUBEMAP
@ -289,4 +290,4 @@ constexpr size_t DDS_MIN_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER);
constexpr size_t DDS_DX10_HEADER_SIZE = sizeof(uint32_t) + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10);
static_assert(DDS_DX10_HEADER_SIZE > DDS_MIN_HEADER_SIZE, "DDS DX10 Header should be larger than standard header");
}
} // namespace

File diff suppressed because it is too large Load diff

View file

@ -1,19 +0,0 @@
#pragma once
namespace Magpie {
// Static helpers for reading and writing DDS texture files.
struct DDSHelper {
// Loads the DDS file at fileName as a 2D texture created on d3dDevice.
// NOTE(review): presumably returns a null com_ptr on failure; confirm against the implementation.
static winrt::com_ptr<ID3D11Texture2D> Load(
const wchar_t* fileName, ID3D11Device* d3dDevice) noexcept;
// Writes pixel data of the given dimensions and format to fileName.
// rowPitch presumably is the byte stride between rows of pixelData (TODO confirm).
// Unlike Load, this function is not declared noexcept.
static bool Save(
const wchar_t* fileName,
uint32_t width,
uint32_t height,
DXGI_FORMAT format,
std::span<uint8_t> pixelData,
uint32_t rowPitch
);
};
}

View file

@ -0,0 +1,121 @@
#include "pch.h"
#include "DescriptorHeap.h"
#include "Logger.h"
namespace Magpie {
DescriptorHeap::~DescriptorHeap() noexcept {
	// In DEBUG builds, verify before exiting that every slot has been freed:
	// an initialized heap must consist of exactly one free block that spans
	// the whole capacity.
	assert(_capacity == 0 || (
		_freeBlocks.size() == 1 &&
		_freeBlocks.begin()->first == _capacity &&
		_freeBlocks.begin()->second == _capacity
	));
}
// Creates a descriptor heap of the given type with `capacity` descriptors and
// registers the whole range as one free block. CBV/SRV/UAV heaps are created
// shader-visible and additionally cache their GPU start handle.
// Returns false if CreateDescriptorHeap fails.
bool DescriptorHeap::Initialize(
	ID3D12Device5* device,
	D3D12_DESCRIPTOR_HEAP_TYPE type,
	uint32_t capacity
) noexcept {
#ifdef _DEBUG
	_capacity = capacity;
#endif

	// Initially everything is free: one block ending at `capacity` of size `capacity`
	_freeBlocks.emplace(capacity, capacity);
	_descriptorSize = device->GetDescriptorHandleIncrementSize(type);

	const bool isCsuHeap = type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;

	D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {
		.Type = type,
		.NumDescriptors = capacity,
		.Flags = isCsuHeap ?
			D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE : D3D12_DESCRIPTOR_HEAP_FLAG_NONE
	};
	const HRESULT hr = device->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&_heap));
	if (FAILED(hr)) {
		Logger::Get().ComError("CreateDescriptorHeap 失败", hr);
		return false;
	}

	_cpuHandle = _heap->GetCPUDescriptorHandleForHeapStart();
	// Only the shader-visible heap gets a GPU handle
	if (isCsuHeap) {
		_gpuHandle = _heap->GetGPUDescriptorHandleForHeapStart();
	}

	return true;
}
// Allocates `count` consecutive descriptors and returns the index of the first
// one through `offset`. Takes the free-block lock exclusively.
// Returns E_OUTOFMEMORY (after logging) if no free block is large enough.
HRESULT DescriptorHeap::Alloc(uint32_t count, uint32_t& offset) noexcept {
	assert(count != 0);

	auto lock = _freeBlocksLock.lock_exclusive();

	// Use the first free block that is large enough
	for (auto blockIt = _freeBlocks.begin(); blockIt != _freeBlocks.end(); ++blockIt) {
		if (blockIt->second < count) {
			continue;
		}

		// Key is the block's end, so end - size is its start
		offset = blockIt->first - blockIt->second;

		if (blockIt->second == count) {
			// The block is used up entirely
			_freeBlocks.erase(blockIt);
		} else {
			// Carve the allocation off the front; the end (the key) stays the same
			blockIt->second -= count;
		}

		return S_OK;
	}

	Logger::Get().Error("描述符用尽");
	return E_OUTOFMEMORY;
}
// Returns the start offset of a free block, which is stored as (end -> size).
static uint32_t GetBlockOffset(const std::pair<const uint32_t, uint32_t>& freeBlock) noexcept {
	const auto& [blockEnd, blockSize] = freeBlock;
	return blockEnd - blockSize;
}
// Returns the range [offset, offset + count) to the free list, merging it with
// the free block directly before and/or after it when they are adjacent.
// Takes the free-block lock exclusively. The asserts check that the range lies
// inside the heap and doesn't overlap the neighbouring free blocks.
void DescriptorHeap::Free(uint32_t offset, uint32_t count) noexcept {
assert(count != 0 && offset != std::numeric_limits<uint32_t>::max() && offset + count <= _capacity);
auto lk = _freeBlocksLock.lock_exclusive();
const auto freeBlocksEnd = _freeBlocks.end();
// Find the first free block after offset (keys are block ends, so this is the
// first block whose end is greater than offset)
auto upperBoundIt = _freeBlocks.upper_bound(offset);
// The free block before it, or end() if there is none
auto prevIt = upperBoundIt == _freeBlocks.begin() ? freeBlocksEnd : std::prev(upperBoundIt);
assert(upperBoundIt == freeBlocksEnd || offset + count <= GetBlockOffset(*upperBoundIt));
assert(prevIt == freeBlocksEnd || offset >= prevIt->first);
// Adjacent when the previous block ends exactly where the freed range starts...
const bool canMergePrev = prevIt != freeBlocksEnd && offset == prevIt->first;
// ...or the next block starts exactly where the freed range ends
const bool canMergeNext = upperBoundIt != freeBlocksEnd &&
offset + count == GetBlockOffset(*upperBoundIt);
if (canMergeNext) {
// Growing the next block downwards keeps its end, so its key stays valid
upperBoundIt->second += count;
if (canMergePrev) {
// upperBoundIt is updated before prevIt is erased
upperBoundIt->second += prevIt->second;
_freeBlocks.erase(prevIt);
}
} else {
uint32_t newBlockSize = count;
if (canMergePrev) {
// The merged block gets a new end, so the previous entry is replaced
newBlockSize += prevIt->second;
_freeBlocks.erase(prevIt);
}
_freeBlocks.emplace(offset + count, newBlockSize);
}
}
// Returns the CPU handle of the descriptor at index `offset` in the heap.
D3D12_CPU_DESCRIPTOR_HANDLE DescriptorHeap::GetCpuHandle(uint32_t offset) const noexcept {
	const CD3DX12_CPU_DESCRIPTOR_HANDLE handle(_cpuHandle, offset, _descriptorSize);
	return handle;
}
// Returns the GPU handle of the descriptor at index `offset` in the heap.
// Asserts that a GPU start handle is available.
D3D12_GPU_DESCRIPTOR_HANDLE DescriptorHeap::GetGpuHandle(uint32_t offset) const noexcept {
	assert(_gpuHandle.ptr);

	const CD3DX12_GPU_DESCRIPTOR_HANDLE handle(_gpuHandle, offset, _descriptorSize);
	return handle;
}
}

View file

@ -0,0 +1,58 @@
#pragma once
#ifndef _DEBUG
#include <parallel_hashmap/btree.h>
#endif
namespace Magpie {
// Wraps an ID3D12DescriptorHeap together with a free-list allocator that hands
// out ranges of consecutive descriptors. The free list is guarded by an SRW lock.
class DescriptorHeap {
public:
DescriptorHeap() = default;
DescriptorHeap(const DescriptorHeap&) = delete;
DescriptorHeap(DescriptorHeap&&) = delete;
~DescriptorHeap() noexcept;
// Creates the heap with `capacity` descriptors of the given type.
bool Initialize(
ID3D12Device5* device,
D3D12_DESCRIPTOR_HEAP_TYPE type,
uint32_t capacity
) noexcept;
// Allocates `count` consecutive descriptors; the index of the first one is
// returned through `offset`.
HRESULT Alloc(uint32_t count, uint32_t& offset) noexcept;
// Releases a range previously obtained from Alloc.
void Free(uint32_t offset, uint32_t count) noexcept;
ID3D12DescriptorHeap* GetHeap() const noexcept {
return _heap.get();
}
// Handle increment size for this heap's descriptor type
uint32_t GetDescriptorSize() const noexcept {
return _descriptorSize;
}
// Handles of the descriptor at index `offset`
D3D12_CPU_DESCRIPTOR_HANDLE GetCpuHandle(uint32_t offset) const noexcept;
D3D12_GPU_DESCRIPTOR_HANDLE GetGpuHandle(uint32_t offset) const noexcept;
private:
winrt::com_ptr<ID3D12DescriptorHeap> _heap;
// Handles of the heap start
D3D12_CPU_DESCRIPTOR_HANDLE _cpuHandle{};
D3D12_GPU_DESCRIPTOR_HANDLE _gpuHandle{};
uint32_t _descriptorSize = 0;
wil::srwlock _freeBlocksLock;
// end(offset+size) -> size
// Using offset+size as the key greatly reduces how often keys have to be
// erased and inserted
#ifdef _DEBUG
// phmap::btree_map has no natvis, which makes debugging inconvenient
std::map<uint32_t, uint32_t> _freeBlocks;
// Used for assertions
uint32_t _capacity = 0;
#else
phmap::btree_map<uint32_t, uint32_t> _freeBlocks;
#endif
};
}

View file

@ -1,214 +0,0 @@
#include "pch.h"
#include "DesktopDuplicationFrameSource.h"
#include "DeviceResources.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "SmallVector.h"
#include "Win32Helper.h"
namespace Magpie {
// Searches the outputs of `adapter` for the one attached to `hMonitor` and
// returns it as IDXGIOutput1. Returns nullptr if no output matches or the
// matching output doesn't provide IDXGIOutput1.
static winrt::com_ptr<IDXGIOutput1> FindMonitor(IDXGIAdapter1* adapter, HMONITOR hMonitor) noexcept {
	winrt::com_ptr<IDXGIOutput> curOutput;
	for (UINT outputIdx = 0;
		SUCCEEDED(adapter->EnumOutputs(outputIdx, curOutput.put()));
		++outputIdx
	) {
		DXGI_OUTPUT_DESC outputDesc;
		const HRESULT hr = curOutput->GetDesc(&outputDesc);
		if (FAILED(hr)) {
			Logger::Get().ComError("GetDesc 失败", hr);
			continue;
		}

		if (outputDesc.Monitor != hMonitor) {
			continue;
		}

		// Found the monitor; the search ends here whether or not the cast succeeds
		winrt::com_ptr<IDXGIOutput1> result = curOutput.try_as<IDXGIOutput1>();
		if (!result) {
			Logger::Get().Error("从 IDXGIOutput 获取 IDXGIOutput1 失败");
		}
		return result;
	}

	return nullptr;
}
// Prepares Desktop Duplication capture of the source window: computes the
// source rectangle relative to its monitor, creates the output texture, finds
// the DXGI output of that monitor and excludes the scaling window from capture.
// Returns false if the OS is too old or any of these steps fails.
bool DesktopDuplicationFrameSource::_Initialize() noexcept {
// WDA_EXCLUDEFROMCAPTURE is only available on Win10 20H1 and later
if (!Win32Helper::GetOSVersion().Is20H1OrNewer()) {
Logger::Get().Error("当前操作系统无法使用 Desktop Duplication");
return false;
}
const HWND hwndSrc = ScalingWindow::Get().SrcTracker().Handle();
const RECT& srcRect = ScalingWindow::Get().SrcTracker().SrcRect();
HMONITOR hMonitor = MonitorFromWindow(hwndSrc, MONITOR_DEFAULTTONULL);
assert(hMonitor);
{
MONITORINFO mi{ .cbSize = sizeof(mi) };
if (!GetMonitorInfo(hMonitor, &mi)) {
Logger::Get().Win32Error("GetMonitorInfo 失败");
return false;
}
// ScalingWindow::_InitialMoveSrcWindowInFullscreen has already adjusted the window position
assert(srcRect.left >= mi.rcMonitor.left && srcRect.top >= mi.rcMonitor.top &&
srcRect.right <= mi.rcMonitor.right && srcRect.bottom <= mi.rcMonitor.bottom);
// Compute the position of the source window's client area on this monitor;
// it is used to decide whether a new frame contains changes
_srcClientInMonitor = {
srcRect.left - mi.rcMonitor.left,
srcRect.top - mi.rcMonitor.top,
srcRect.right - mi.rcMonitor.left,
srcRect.bottom - mi.rcMonitor.top
};
}
// The same rectangle as a D3D11_BOX (depth range 0..1) for CopySubresourceRegion
_frameInMonitor = {
(UINT)_srcClientInMonitor.left,
(UINT)_srcClientInMonitor.top,
0,
(UINT)_srcClientInMonitor.right,
(UINT)_srcClientInMonitor.bottom,
1
};
// Output texture with the size of the source rectangle
_output = DirectXHelper::CreateTexture2D(
_deviceResources->GetD3DDevice(),
DXGI_FORMAT_B8G8R8A8_UNORM,
srcRect.right - srcRect.left,
srcRect.bottom - srcRect.top,
D3D11_BIND_SHADER_RESOURCE
);
if (!_output) {
Logger::Get().Error("CreateTexture2D 失败");
return false;
}
_dxgiOutput = FindMonitor(
_deviceResources->GetGraphicsAdapter(), hMonitor);
if (!_dxgiOutput) {
Logger::Get().Error("无法找到 IDXGIOutput");
return false;
}
// Make the fullscreen window impossible to capture
if (!SetWindowDisplayAffinity(ScalingWindow::Get().Handle(), WDA_EXCLUDEFROMCAPTURE)) {
Logger::Get().Win32Error("SetWindowDisplayAffinity 失败");
return false;
}
Logger::Get().Info("DesktopDuplicationFrameSource 初始化完成");
return true;
}
bool DesktopDuplicationFrameSource::Start() noexcept {
_DisableRoundCornerInWin11();
HRESULT hr = _dxgiOutput->DuplicateOutput(_deviceResources->GetD3DDevice(), _outputDup.put());
if (FAILED(hr)) {
Logger::Get().ComError("DuplicateOutput 失败", hr);
return false;
}
return true;
}
// Acquires the next duplicated desktop frame and, if it changed anything
// inside the source window's client area, copies that area to _output.
//
// Returns NewFrame when _output was updated, Waiting when no frame arrived
// within 1 ms or the frame didn't touch the source area, and Error when a
// Desktop Duplication call fails.
FrameSourceState DesktopDuplicationFrameSource::_Update() noexcept {
ID3D11DeviceContext4* d3dDC = _deviceResources->GetD3DDC();
if (_isFrameAcquired) {
// According to the documentation, acquiring the next frame right after
// releasing the previous one improves performance
_outputDup->ReleaseFrame();
_isFrameAcquired = false;
}
DXGI_OUTDUPL_FRAME_INFO info;
winrt::com_ptr<IDXGIResource> dxgiRes;
// Wait for 1 ms
HRESULT hr = _outputDup->AcquireNextFrame(1, &info, dxgiRes.put());
if (hr == DXGI_ERROR_WAIT_TIMEOUT) {
return FrameSourceState::Waiting;
}
if (FAILED(hr)) {
Logger::Get().ComError("AcquireNextFrame 失败", hr);
return FrameSourceState::Error;
}
_isFrameAcquired = true;
bool noUpdate = true;
// Retrieve the move rects and dirty rects.
// If any of them overlaps the window's client area the picture has changed.
if (info.TotalMetadataBufferSize) {
// The metadata buffer only ever grows
if (info.TotalMetadataBufferSize > _dupMetaData.size()) {
_dupMetaData.resize(info.TotalMetadataBufferSize);
}
uint32_t bufSize = info.TotalMetadataBufferSize;
// Move rects
hr = _outputDup->GetFrameMoveRects(
bufSize, (DXGI_OUTDUPL_MOVE_RECT*)_dupMetaData.data(), &bufSize);
if (FAILED(hr)) {
Logger::Get().ComError("GetFrameMoveRects 失败", hr);
return FrameSourceState::Error;
}
// bufSize now holds the number of bytes written
uint32_t nRect = bufSize / sizeof(DXGI_OUTDUPL_MOVE_RECT);
for (uint32_t i = 0; i < nRect; ++i) {
const DXGI_OUTDUPL_MOVE_RECT& rect =
((DXGI_OUTDUPL_MOVE_RECT*)_dupMetaData.data())[i];
if (Win32Helper::IsRectOverlap(_srcClientInMonitor, rect.DestinationRect)) {
noUpdate = false;
break;
}
}
// Dirty rects only need to be inspected if no move rect hit the source area
if (noUpdate) {
bufSize = info.TotalMetadataBufferSize;
// Dirty rects (the same buffer is reused)
hr = _outputDup->GetFrameDirtyRects(
bufSize, (RECT*)_dupMetaData.data(), &bufSize);
if (FAILED(hr)) {
Logger::Get().ComError("GetFrameDirtyRects 失败", hr);
return FrameSourceState::Error;
}
nRect = bufSize / sizeof(RECT);
for (uint32_t i = 0; i < nRect; ++i) {
const RECT& rect = ((RECT*)_dupMetaData.data())[i];
if (Win32Helper::IsRectOverlap(_srcClientInMonitor, rect)) {
noUpdate = false;
break;
}
}
}
}
if (noUpdate) {
return FrameSourceState::Waiting;
}
winrt::com_ptr<ID3D11Texture2D> frameTexture = dxgiRes.try_as<ID3D11Texture2D>();
if (!frameTexture) {
Logger::Get().Error("从 IDXGIResource 检索 ID3D11Resource 失败");
return FrameSourceState::Error;
}
// Copy only the source window's area of the desktop frame
d3dDC->CopySubresourceRegion(
_output.get(), 0, 0, 0, 0, frameTexture.get(), 0, &_frameInMonitor);
return FrameSourceState::NewFrame;
}
}

View file

@ -1,36 +0,0 @@
#pragma once
#include "FrameSourceBase.h"
#include "SmallVector.h"
namespace Magpie {
// Frame source that captures the source window through the DXGI Desktop
// Duplication API.
class DesktopDuplicationFrameSource final : public FrameSourceBase {
public:
bool Start() noexcept override;
FrameSourceWaitType WaitType() const noexcept override {
return FrameSourceWaitType::WaitForFrame;
}
const char* Name() const noexcept override {
return "Desktop Duplication";
}
protected:
bool _Initialize() noexcept override;
FrameSourceState _Update() noexcept override;
private:
// Output (monitor) the source window is on
winrt::com_ptr<IDXGIOutput1> _dxgiOutput;
winrt::com_ptr<IDXGIOutputDuplication> _outputDup;
// Scratch buffer for the move/dirty rect metadata of a frame
SmallVector<uint8_t, 0> _dupMetaData;
// Source window's client area in monitor coordinates, as RECT and as D3D11_BOX
RECT _srcClientInMonitor{};
D3D11_BOX _frameInMonitor{};
// Whether a frame acquired from _outputDup still has to be released
bool _isFrameAcquired = false;
};
}

View file

@ -1,253 +0,0 @@
#include "pch.h"
#include "DeviceResources.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "ScalingOptions.h"
#include "ScalingWindow.h"
#include "StrHelper.h"
namespace Magpie {
// Creates the DXGI factory, queries tearing (variable refresh rate) support and
// then selects a graphics adapter and creates the D3D11 device on it.
//
// isForeground is forwarded to device creation, where it influences the device
// creation flags.
// Returns false if the factory cannot be created or no usable adapter is found.
bool DeviceResources::Initialize(bool isForeground) noexcept {
#ifdef _DEBUG
    UINT flag = DXGI_CREATE_FACTORY_DEBUG;
#else
    UINT flag = 0;
#endif
    HRESULT hr = CreateDXGIFactory2(flag, IID_PPV_ARGS(_dxgiFactory.put()));
    if (FAILED(hr)) {
        Logger::Get().ComError("CreateDXGIFactory2 失败", hr);
        return false;
    }

    // Check variable refresh rate (tearing) support. A failure here is not
    // fatal: supportTearing simply stays FALSE.
    BOOL supportTearing = FALSE;
    hr = _dxgiFactory->CheckFeatureSupport(DXGI_FEATURE_PRESENT_ALLOW_TEARING, &supportTearing, sizeof(supportTearing));
    if (FAILED(hr)) {
        Logger::Get().ComWarn("CheckFeatureSupport 失败", hr);
    }
    _isTearingSupported = supportTearing;
    // Both branches used to be empty strings, so the log never showed the result.
    Logger::Get().Info(fmt::format("可变刷新率支持: {}", supportTearing ? "是" : "否"));

    if (!_ObtainAdapterAndDevice(ScalingWindow::Get().Options().graphicsCardId, isForeground)) {
        Logger::Get().Error("找不到可用的图形适配器");
        return false;
    }
    return true;
}
// Returns a sampler state for the given filter / address mode combination.
// Samplers are created on first request and cached in _samMap; nullptr is
// returned when creation fails.
ID3D11SamplerState* DeviceResources::GetSampler(D3D11_FILTER filterMode, D3D11_TEXTURE_ADDRESS_MODE addressMode) noexcept {
    const auto cacheKey = std::make_pair(filterMode, addressMode);
    if (const auto cached = _samMap.find(cacheKey); cached != _samMap.end()) {
        return cached->second.get();
    }

    // The same address mode is applied on all three axes.
    D3D11_SAMPLER_DESC samplerDesc{};
    samplerDesc.Filter = filterMode;
    samplerDesc.AddressU = addressMode;
    samplerDesc.AddressV = addressMode;
    samplerDesc.AddressW = addressMode;
    samplerDesc.ComparisonFunc = D3D11_COMPARISON_NEVER;

    winrt::com_ptr<ID3D11SamplerState> sampler;
    const HRESULT hr = _d3dDevice->CreateSamplerState(&samplerDesc, sampler.put());
    if (FAILED(hr)) {
        Logger::Get().ComError("创建 ID3D11SamplerState 出错", hr);
        return nullptr;
    }

    const auto inserted = _samMap.emplace(cacheKey, std::move(sampler));
    return inserted.first->second.get();
}
// Picks a graphics adapter and creates the D3D device on it.
//
// Selection order:
//   1. The adapter at graphicsCardId.idx, if its vendor/device ids still match.
//   2. Any other adapter whose vendor/device ids match graphicsCardId.
//   3. The first non-WARP adapter on which device creation succeeds.
//   4. The WARP (software) adapter as a last resort.
// Returns true as soon as _TryCreateD3DDevice succeeds, false if every option
// fails.
bool DeviceResources::_ObtainAdapterAndDevice(GraphicsCardId graphicsCardId, bool isForeground) noexcept {
    winrt::com_ptr<IDXGIAdapter1> adapter;
    // Index of an adapter on which device creation already failed, so that it
    // is not tried a second time.
    int failedIdx = -1;

    if (graphicsCardId.idx >= 0) {
        assert(graphicsCardId.vendorId != 0 && graphicsCardId.deviceId != 0);

        // Try the stored index first
        HRESULT hr = _dxgiFactory->EnumAdapters1(graphicsCardId.idx, adapter.put());
        if (SUCCEEDED(hr)) {
            DXGI_ADAPTER_DESC1 desc;
            hr = adapter->GetDesc1(&desc);
            if (SUCCEEDED(hr)) {
                if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
                    if (_TryCreateD3DDevice(adapter, isForeground)) {
                        return true;
                    }

                    failedIdx = graphicsCardId.idx;
                    Logger::Get().Warn("用户指定的显示卡不支持 FL 11");
                } else {
                    Logger::Get().Warn("显卡配置已变化");
                }
            }
        }

        // If the requested card is already known not to support FL11, do not
        // search for it again.
        if (failedIdx == -1) {
            // Enumerate adapters looking for a matching vendorId / deviceId
            for (UINT adapterIdx = 0;
                SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
                ++adapterIdx
            ) {
                if ((int)adapterIdx == graphicsCardId.idx) {
                    // graphicsCardId.idx has already been checked
                    continue;
                }

                DXGI_ADAPTER_DESC1 desc;
                hr = adapter->GetDesc1(&desc);
                if (FAILED(hr)) {
                    continue;
                }

                if (desc.VendorId == graphicsCardId.vendorId && desc.DeviceId == graphicsCardId.deviceId) {
                    if (_TryCreateD3DDevice(adapter, isForeground)) {
                        return true;
                    }

                    failedIdx = (int)adapterIdx;
                    Logger::Get().Warn("用户指定的显示卡不支持 FL11");
                    break;
                }
            }
        }
    }

    // Enumerate adapters and take the first one that supports FL11
    for (UINT adapterIdx = 0;
        SUCCEEDED(_dxgiFactory->EnumAdapters1(adapterIdx, adapter.put()));
        ++adapterIdx
    ) {
        if ((int)adapterIdx == failedIdx) {
            // No need to try this one again
            continue;
        }

        DXGI_ADAPTER_DESC1 desc;
        HRESULT hr = adapter->GetDesc1(&desc);
        if (FAILED(hr) || DirectXHelper::IsWARP(desc)) {
            continue;
        }

        if (_TryCreateD3DDevice(adapter, isForeground)) {
            return true;
        }
    }

    // As a last resort, fall back to CPU rendering (WARP)
    // https://docs.microsoft.com/en-us/windows/win32/direct3darticles/directx-warp
    HRESULT hr = _dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
    if (FAILED(hr)) {
        Logger::Get().ComError("EnumWarpAdapter 失败", hr);
        return false;
    }

    if (!_TryCreateD3DDevice(adapter, isForeground)) {
        // hr still holds the successful result of EnumWarpAdapter at this point,
        // so logging it as a COM error would be misleading. _TryCreateD3DDevice
        // has already logged the real failure.
        Logger::Get().Error("创建 WARP 设备失败");
        return false;
    }

    return true;
}
// Tries to create a feature level 11.x D3D11 device on the given adapter.
//
// On success stores the device, immediate context and adapter (as
// ID3D11Device5 / ID3D11DeviceContext4 / IDXGIAdapter4) in the members, records
// FP16 (min-precision) support and returns true. Returns false if device
// creation or any of the interface queries fails.
bool DeviceResources::_TryCreateD3DDevice(const winrt::com_ptr<IDXGIAdapter1>& adapter, bool isForeground) noexcept {
    D3D_FEATURE_LEVEL featureLevels[] = {
        D3D_FEATURE_LEVEL_11_1,
        D3D_FEATURE_LEVEL_11_0
    };
    const UINT nFeatureLevels = ARRAYSIZE(featureLevels);

    UINT createDeviceFlags = D3D11_CREATE_DEVICE_BGRA_SUPPORT;
    // Enable the debug layer when it is available (DEBUG configuration only)
    if (DirectXHelper::IsDebugLayersAvailable()) {
        createDeviceFlags |= D3D11_CREATE_DEVICE_DEBUG;
    }

    // WGC is incompatible with D3D11_CREATE_DEVICE_SINGLETHREADED
    if (isForeground || ScalingWindow::Get().Options().captureMethod != CaptureMethod::GraphicsCapture) {
        createDeviceFlags |= D3D11_CREATE_DEVICE_SINGLETHREADED;
    }

#ifdef MP_USE_COMPSWAPCHAIN
    if (isForeground) {
        // The documentation says composition swapchains are incompatible with
        // driver-internal threads and that creating IPresentationFactory fails
        // without this flag. In testing on Win11 24H2 it worked without the
        // flag, so the documentation may be outdated; the flag is set anyway to
        // be safe.
        createDeviceFlags |= D3D11_CREATE_DEVICE_PREVENT_INTERNAL_THREADING_OPTIMIZATIONS;
    }
#endif

    winrt::com_ptr<ID3D11Device> d3dDevice;
    winrt::com_ptr<ID3D11DeviceContext> d3dDC;
    D3D_FEATURE_LEVEL featureLevel;
    HRESULT hr = D3D11CreateDevice(
        adapter.get(),
        D3D_DRIVER_TYPE_UNKNOWN,
        nullptr,
        createDeviceFlags,
        featureLevels,
        nFeatureLevels,
        D3D11_SDK_VERSION,
        d3dDevice.put(),
        &featureLevel,
        d3dDC.put()
    );
    if (FAILED(hr)) {
        Logger::Get().ComError("D3D11CreateDevice 失败", hr);
        return false;
    }

    std::string_view fl;
    switch (featureLevel) {
    case D3D_FEATURE_LEVEL_11_1:
        fl = "11.1";
        break;
    case D3D_FEATURE_LEVEL_11_0:
        fl = "11.0";
        break;
    default:
        fl = "未知";
        break;
    }
    Logger::Get().Info(fmt::format("已创建 D3D 设备\n\t功能级别: {}", fl));

    _d3dDevice = d3dDevice.try_as<ID3D11Device5>();
    if (!_d3dDevice) {
        // The message used to name ID3D11Device1 although ID3D11Device5 is queried.
        Logger::Get().Error("获取 ID3D11Device5 失败");
        return false;
    }

    _d3dDC = d3dDC.try_as<ID3D11DeviceContext4>();
    if (!_d3dDC) {
        Logger::Get().Error("获取 ID3D11DeviceContext4 失败");
        return false;
    }

    _graphicsAdapter = adapter.try_as<IDXGIAdapter4>();
    if (!_graphicsAdapter) {
        Logger::Get().Error("获取 IDXGIAdapter4 失败");
        return false;
    }

    // Check half-precision float support. A failed query is logged but is not
    // treated as a device creation failure.
    D3D11_FEATURE_DATA_SHADER_MIN_PRECISION_SUPPORT value;
    hr = d3dDevice->CheckFeatureSupport(D3D11_FEATURE_SHADER_MIN_PRECISION_SUPPORT, &value, sizeof(value));
    if (SUCCEEDED(hr)) {
        _isFP16Supported = value.AllOtherShaderStagesMinPrecision & D3D11_SHADER_MIN_PRECISION_16_BIT;
        // Both branches used to be empty strings, so the result was never logged.
        Logger::Get().Info(StrHelper::Concat("FP16 支持: ", _isFP16Supported ? "是" : "否"));
    } else {
        Logger::Get().ComError("CheckFeatureSupport 失败", hr);
    }

    return true;
}
}

View file

@ -1,43 +0,0 @@
#pragma once
#include "ScalingOptions.h"
#include <parallel_hashmap/phmap.h>
namespace Magpie {
// Owns the DXGI factory, the selected graphics adapter, the D3D11 device and
// immediate context, plus a cache of sampler states. Copying is disabled;
// move construction is defaulted.
class DeviceResources {
public:
DeviceResources() = default;
DeviceResources(const DeviceResources&) = delete;
DeviceResources(DeviceResources&&) = default;
// Creates the factory and device. Returns false on failure.
bool Initialize(bool isForeground) noexcept;
// Non-owning accessors; the returned pointers are owned by this object.
IDXGIFactory7* GetDXGIFactory() const noexcept { return _dxgiFactory.get(); }
ID3D11Device5* GetD3DDevice() const noexcept { return _d3dDevice.get(); }
ID3D11DeviceContext4* GetD3DDC() const noexcept { return _d3dDC.get(); }
IDXGIAdapter4* GetGraphicsAdapter() const noexcept { return _graphicsAdapter.get(); }
bool IsTearingSupported() const noexcept { return _isTearingSupported; }
bool IsFP16Supported() const noexcept { return _isFP16Supported; }
// Returns a cached (or newly created) sampler for the given filter and address mode.
ID3D11SamplerState* GetSampler(D3D11_FILTER filterMode, D3D11_TEXTURE_ADDRESS_MODE addressMode) noexcept;
private:
bool _ObtainAdapterAndDevice(GraphicsCardId graphicsCardId, bool isForeground) noexcept;
bool _TryCreateD3DDevice(const winrt::com_ptr<IDXGIAdapter1>& adapter, bool isForeground) noexcept;
winrt::com_ptr<IDXGIFactory7> _dxgiFactory;
winrt::com_ptr<IDXGIAdapter4> _graphicsAdapter;
winrt::com_ptr<ID3D11Device5> _d3dDevice;
winrt::com_ptr<ID3D11DeviceContext4> _d3dDC;
// Sampler cache keyed by (filter, address mode).
phmap::flat_hash_map<
std::pair<D3D11_FILTER, D3D11_TEXTURE_ADDRESS_MODE>,
winrt::com_ptr<ID3D11SamplerState>
> _samMap;
bool _isTearingSupported = false;
bool _isFP16Supported = false;
};
}

View file

@ -1,110 +0,0 @@
#include "pch.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "StrHelper.h"
#include <d3dcompiler.h>
namespace Magpie {
bool DirectXHelper::CompileComputeShader(
std::string_view hlsl,
const char* entryPoint,
ID3DBlob** blob,
const char* sourceName,
ID3DInclude* include,
const std::vector<std::pair<std::string, std::string>>& macros,
bool warningsAreErrors
) {
winrt::com_ptr<ID3DBlob> errorMsgs = nullptr;
UINT flags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_ALL_RESOURCES_BOUND;
if (warningsAreErrors) {
flags |= D3DCOMPILE_WARNINGS_ARE_ERRORS;
}
#ifdef _DEBUG
flags |= D3DCOMPILE_SKIP_OPTIMIZATION | D3DCOMPILE_DEBUG;
#else
flags |= D3DCOMPILE_OPTIMIZATION_LEVEL3;
#endif
std::unique_ptr<D3D_SHADER_MACRO[]> mc(new D3D_SHADER_MACRO[macros.size() + 1]);
for (UINT i = 0; i < macros.size(); ++i) {
mc[i] = { macros[i].first.c_str(), macros[i].second.c_str() };
}
mc[macros.size()] = { nullptr,nullptr };
HRESULT hr = D3DCompile(hlsl.data(), hlsl.size(), sourceName, mc.get(), include,
entryPoint, "cs_5_0", flags, 0, blob, errorMsgs.put());
if (FAILED(hr)) {
if (errorMsgs) {
Logger::Get().ComError(StrHelper::Concat("编译计算着色器失败: ", (const char*)errorMsgs->GetBufferPointer()), hr);
}
return false;
}
// 警告消息
if (errorMsgs) {
Logger::Get().Warn(StrHelper::Concat("编译计算着色器时产生警告: ", (const char*)errorMsgs->GetBufferPointer()));
}
return true;
}
// Reports whether the D3D11 debug layer can be used.
// In _DEBUG builds this probes once (the result is kept in a function-local
// static) by creating a NULL-driver device with D3D11_CREATE_DEVICE_DEBUG.
// In other builds it always returns false.
bool DirectXHelper::IsDebugLayersAvailable() noexcept {
#ifdef _DEBUG
static bool result = SUCCEEDED(D3D11CreateDevice(
nullptr,
D3D_DRIVER_TYPE_NULL, // There is no need to create a real hardware device.
nullptr,
D3D11_CREATE_DEVICE_DEBUG, // Check for the SDK layers.
nullptr, // Any feature level will do.
0,
D3D11_SDK_VERSION,
nullptr, // No need to keep the D3D device reference.
nullptr, // No need to know the feature level.
nullptr // No need to keep the D3D device context reference.
));
return result;
#else
// The Release configuration does not use the debug layer
return false;
#endif
}
// Creates a single-mip, non-array, non-multisampled 2D texture.
// Returns a null com_ptr (after logging) when creation fails.
winrt::com_ptr<ID3D11Texture2D> DirectXHelper::CreateTexture2D(
    ID3D11Device* d3dDevice,
    DXGI_FORMAT format,
    UINT width,
    UINT height,
    UINT bindFlags,
    D3D11_USAGE usage,
    UINT miscFlags,
    const D3D11_SUBRESOURCE_DATA* pInitialData
) noexcept {
    // Fields that are not set explicitly (e.g. CPUAccessFlags) stay zero.
    D3D11_TEXTURE2D_DESC textureDesc{};
    textureDesc.Width = width;
    textureDesc.Height = height;
    textureDesc.MipLevels = 1;
    textureDesc.ArraySize = 1;
    textureDesc.Format = format;
    textureDesc.SampleDesc.Count = 1;
    textureDesc.SampleDesc.Quality = 0;
    textureDesc.Usage = usage;
    textureDesc.BindFlags = bindFlags;
    textureDesc.MiscFlags = miscFlags;

    winrt::com_ptr<ID3D11Texture2D> texture;
    const HRESULT hr = d3dDevice->CreateTexture2D(&textureDesc, pInitialData, texture.put());
    if (SUCCEEDED(hr)) {
        return texture;
    }

    Logger::Get().ComError("CreateTexture2D 失败", hr);
    return nullptr;
}
}

View file

@ -0,0 +1,362 @@
#include "pch.h"
#include "DirtyRectsOptimizer.h"
#include "DebugInfo.h"
#include "RectHelper.h"
namespace Magpie {
// Tests whether point p lies inside rectangle r, with all four edges counted
// as inside (inclusive comparison on both ends).
static bool IsCornerInRect(PointU p, const RectU& r) noexcept {
    const bool withinColumns = r.left <= p.x && p.x <= r.right;
    const bool withinRows = r.top <= p.y && p.y <= r.bottom;
    return withinColumns && withinRows;
}
// Tries to simplify a pair of dirty rects in place by merging one into the
// other, clipping the overlapping part away, or dropping a rect that is fully
// covered. A rect that has been consumed is made zero-width
// (right = left) so that RectHelper::IsEmpty reports it and the caller can
// remove it.
//
// The decision is based on how many corners of rect2 lie inside rect1
// (edges inclusive):
//   0 or 1 - retry once with the arguments swapped; if there are still no
//            corners inside, merge rects that share two edges and are
//            separated by a small gap
//   2      - merge when the shared edge has equal extent, otherwise clip rect2
//   4      - rect2 is covered by rect1 and is emptied
// `reversed` marks the swapped retry and prevents infinite recursion.
//
// Returns true if either rect was modified, false otherwise (including when
// either input is already empty).
static bool OptimizeDirtyRectPair(RectU& rect1, RectU& rect2, bool reversed = false) noexcept {
if (RectHelper::IsEmpty(rect1) || RectHelper::IsEmpty(rect2)) {
return false;
}
// Count how many corners of rect2 lie inside rect1
bool lt = IsCornerInRect(PointU{ rect2.left, rect2.top }, rect1);
bool rt = IsCornerInRect(PointU{ rect2.right, rect2.top }, rect1);
bool rb = IsCornerInRect(PointU{ rect2.right, rect2.bottom }, rect1);
bool lb = IsCornerInRect(PointU{ rect2.left, rect2.bottom }, rect1);
uint32_t count = (uint32_t)lt + (uint32_t)rt + (uint32_t)rb + (uint32_t)lb;
if (count <= 1) {
if (!reversed) {
// Try the opposite direction
return OptimizeDirtyRectPair(rect2, rect1, true);
}
if (count == 0) {
// Merge even across a small gap: duplicate frame checking works on 16x16
// blocks, and copying the extra pixels is cheap thanks to the texture cache
constexpr uint32_t MERGE_THRESHOLD = DUP_FRAME_DISPATCH_BLOCK_SIZE / 2;
// Same vertical extent: the rects are side by side
if (rect1.top == rect2.top && rect1.bottom == rect2.bottom) {
if (rect1.right < rect2.left) {
if (rect1.right + MERGE_THRESHOLD >= rect2.left) {
// Merge rect2 into rect1
rect1.right = rect2.right;
rect2.right = rect2.left;
return true;
}
} else {
assert(rect1.left > rect2.right);
if (rect2.right + MERGE_THRESHOLD >= rect1.left) {
rect1.left = rect2.left;
rect2.right = rect2.left;
return true;
}
}
// Same horizontal extent: the rects are stacked vertically
} else if (rect1.left == rect2.left && rect1.right == rect2.right) {
if (rect1.bottom < rect2.top) {
if (rect1.bottom + MERGE_THRESHOLD >= rect2.top) {
rect1.bottom = rect2.bottom;
rect2.right = rect2.left;
return true;
}
} else {
assert(rect1.top > rect2.bottom);
if (rect2.bottom + MERGE_THRESHOLD >= rect1.top) {
rect1.top = rect2.top;
rect2.right = rect2.left;
return true;
}
}
}
}
} else if (count == 2) {
// With two corners of rect2 inside rect1 the pair can be merged or clipped
if (lt) {
if (rt) {
// Top edge of rect2 is inside rect1: rect2 sticks out below
if (rect2.left == rect1.left && rect2.right == rect1.right) {
// Merge rect2 into rect1
rect1.bottom = rect2.bottom;
rect2.right = rect2.left;
return true;
} else if (rect2.top != rect1.bottom) {
// Clip rect2
rect2.top = rect1.bottom;
assert(rect2.bottom >= rect2.top);
return true;
}
} else {
// Left edge of rect2 is inside rect1: rect2 sticks out to the right
assert(lb);
if (rect2.top == rect1.top && rect2.bottom == rect1.bottom) {
rect1.right = rect2.right;
rect2.right = rect2.left;
return true;
} else if (rect2.left != rect1.right) {
rect2.left = rect1.right;
assert(rect2.right >= rect2.left);
return true;
}
}
} else {
assert(rb);
if (rt) {
// Right edge of rect2 is inside rect1: rect2 sticks out to the left
if (rect2.top == rect1.top && rect2.bottom == rect1.bottom) {
rect1.left = rect2.left;
rect2.right = rect2.left;
return true;
} else if (rect2.right != rect1.left) {
rect2.right = rect1.left;
assert(rect2.right >= rect2.left);
return true;
}
} else {
// Bottom edge of rect2 is inside rect1: rect2 sticks out above
if (rect2.left == rect1.left && rect2.right == rect1.right) {
rect1.top = rect2.top;
rect2.right = rect2.left;
return true;
} else if (rect2.bottom != rect1.top) {
rect2.bottom = rect1.top;
assert(rect2.bottom >= rect2.top);
return true;
}
}
}
} else if (count == 4) {
// rect2 lies inside rect1
rect2.right = rect2.left;
return true;
}
return false;
}
// Repeatedly applies OptimizeDirtyRectPair to every pair of rects and drops the
// rects that became empty, until a full pass changes nothing.
static void BasicOptimize(SmallVectorImpl<RectU>& dirtyRects) noexcept {
    for (;;) {
        const uint32_t rectCount = (uint32_t)dirtyRects.size();
        assert(rectCount > 0);

        // One pass over all unordered pairs; every pair is always visited.
        bool anyChanged = false;
        for (uint32_t first = 0; first < rectCount; ++first) {
            for (uint32_t second = first + 1; second < rectCount; ++second) {
                if (OptimizeDirtyRectPair(dirtyRects[first], dirtyRects[second])) {
                    anyChanged = true;
                }
            }
        }

        if (!anyChanged) {
            return;
        }

        // Remove the rects emptied by this pass, keeping the order of the rest
        const auto firstEmpty = std::remove_if(
            dirtyRects.begin(),
            dirtyRects.end(),
            [](const RectU& rect) { return RectHelper::IsEmpty(rect); }
        );
        dirtyRects.erase(firstEmpty, dirtyRects.end());
    }
}
static uint32_t CalcTotalPixels(const SmallVectorImpl<RectU>& rects) noexcept {
uint32_t result = 0;
for (const RectU& rect : rects) {
result += RectHelper::CalcArea(rect);
}
return result;
}
#ifdef MP_DEBUG_INFO
// Verifies the correctness of the optimization algorithm: every pixel of every
// original rect must still be covered by at least one optimized rect. On the
// first uncovered pixel found, the original rects are written to the debugger
// output and the check stops. An empty originRects disables the check.
static void ValidateOptimize(
const SmallVectorImpl<RectU>& originRects,
const SmallVectorImpl<RectU>& newRects
) noexcept {
if (originRects.empty()) {
return;
}
// Per-pixel coverage map of the original rect currently being checked
std::vector<bool> pixels;
for (const RectU& originRect : originRects) {
// As a shortcut, first check whether a single optimized rect contains it
bool contained = false;
for (const RectU& newRect : newRects) {
if (RectHelper::Contains(newRect, originRect)) {
contained = true;
break;
}
}
if (contained) {
continue;
}
// It may be covered by several rects together, which needs a per-pixel check
const uint32_t originWidth = originRect.right - originRect.left;
pixels.assign(size_t((originRect.bottom - originRect.top) * originWidth), false);
// newRect is taken by value because Intersect overwrites it with the intersection
for (RectU newRect : newRects) {
if (!RectHelper::Intersect(newRect, newRect, originRect)) {
continue;
}
// Mark the intersection row by row, in coordinates relative to originRect
for (uint32_t i = newRect.top; i < newRect.bottom; ++i) {
uint32_t start = (i - originRect.top) * originWidth;
std::fill(pixels.begin() + size_t(start + newRect.left - originRect.left),
pixels.begin() + size_t(start + newRect.right - originRect.left), true);
}
}
if (std::find(pixels.begin(), pixels.end(), false) != pixels.end()) {
OutputDebugString(L"优化脏矩形算法错误!\n");
// Print the original dirty rects for debugging
for (const RectU& rect : originRects) {
OutputDebugString(fmt::format(L"{},{},{},{}\n",
rect.left, rect.top, rect.right, rect.bottom).c_str());
}
return;
}
}
}
#endif
// Tries to reduce both the number of dirty rects and the total pixel count,
// modifying dirtyRects in place.
//
// Steps:
//   1. If the input is not too large, run BasicOptimize (pairwise merge/clip).
//   2. If more than DEEP_OPTIMIZE_LIMIT rects remain, fold all rects beyond the
//      limit into the last kept rect and run BasicOptimize again.
//   3. Greedy loop: evaluate every candidate pair, pick the union that yields
//      the smallest total pixel count, apply it, and repeat. The loop stops
//      when one rect is left, or when the best merge would increase the pixel
//      count while the rect count is already within
//      MAX_CAPTURE_DIRTY_RECT_COUNT.
// With MP_DEBUG_INFO and the validation option enabled, the result is checked
// against a copy of the input when the function exits.
void DirtyRectsOptimizer::Execute(SmallVectorImpl<RectU>& dirtyRects) noexcept {
uint32_t rectCount = (uint32_t)dirtyRects.size();
if (rectCount <= 1) {
return;
}
#ifdef MP_DEBUG_INFO
// Snapshot the input (or an empty list when validation is off) and validate on scope exit
auto se = wil::scope_exit(std::bind_front(ValidateOptimize, DEBUG_INFO.validateDirtyRectsOptimizer ?
SmallVector<RectU>(dirtyRects.begin(), dirtyRects.end()) : SmallVector<RectU>(), std::ref(dirtyRects)));
#endif
if (rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT * 4) {
BasicOptimize(dirtyRects);
rectCount = (uint32_t)dirtyRects.size();
}
// Deep optimization has O(n^4) complexity, so the input must be cut down when
// there are too many rects. Spending too much time on optimizing dirty rects
// is not worth it
constexpr uint32_t DEEP_OPTIMIZE_LIMIT = MAX_CAPTURE_DIRTY_RECT_COUNT * 2;
if (rectCount > DEEP_OPTIMIZE_LIMIT) {
// Fold every rect past the limit into the last rect that is kept
RectU& lastRect = dirtyRects[DEEP_OPTIMIZE_LIMIT - 1];
for (auto it = dirtyRects.begin() + DEEP_OPTIMIZE_LIMIT; it != dirtyRects.end(); ++it) {
lastRect = RectHelper::Union(lastRect, *it);
}
dirtyRects.erase(dirtyRects.begin() + DEEP_OPTIMIZE_LIMIT, dirtyRects.end());
BasicOptimize(dirtyRects);
rectCount = (uint32_t)dirtyRects.size();
}
if (rectCount == 1) {
return;
}
uint32_t totalPixels = CalcTotalPixels(dirtyRects);
while (true) {
// Best candidate found in this round
uint32_t minTotalPixels = std::numeric_limits<uint32_t>::max();
uint32_t targetRectCount = 0;
bool targetCanOptimize = false;
uint32_t targetIdx1 = 0;
uint32_t targetIdx2 = 0;
// Go through all pairwise merges and find the one with the fewest total pixels
for (uint32_t i = 0; i < rectCount; ++i) {
for (uint32_t j = i + 1; j < rectCount; ++j) {
const RectU& rect1 = dirtyRects[i];
const RectU& rect2 = dirtyRects[j];
// Two rects must overlap for a merge to possibly help, but when there are
// too many dirty rects a merge is forced
if (!RectHelper::IsOverlap(rect1, rect2) && rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT) {
continue;
}
RectU unionedRect = RectHelper::Union(rect1, rect2);
uint32_t newTotalPixels = 0;
uint32_t newRectCount = 0;
bool optimized = false;
// Only a single optimization pass is simulated here instead of running the
// full iterative optimization; this lowers complexity and avoids heap allocation
for (uint32_t k = 0; k < rectCount; ++k) {
if (k == i || k == j) {
continue;
}
// Work on a copy so the simulation does not touch dirtyRects
RectU curRect = dirtyRects[k];
if (OptimizeDirtyRectPair(curRect, unionedRect)) {
optimized = true;
}
if (!RectHelper::IsEmpty(curRect)) {
newTotalPixels += RectHelper::CalcArea(curRect);
++newRectCount;
}
}
if (!RectHelper::IsEmpty(unionedRect)) {
newTotalPixels += RectHelper::CalcArea(unionedRect);
++newRectCount;
}
// Prefer fewer pixels; on a tie prefer fewer rects
if (newTotalPixels < minTotalPixels ||
(newTotalPixels == minTotalPixels && newRectCount < targetRectCount)) {
minTotalPixels = newTotalPixels;
targetRectCount = newRectCount;
targetCanOptimize = optimized;
targetIdx1 = i;
targetIdx2 = j;
}
}
}
// A merge that keeps the total pixel count equal is still taken, because it
// reduces the number of dirty rects
if (minTotalPixels > totalPixels && rectCount <= MAX_CAPTURE_DIRTY_RECT_COUNT) {
return;
}
assert(targetIdx1 < targetIdx2);
// Apply the chosen merge for real
dirtyRects[targetIdx1] = RectHelper::Union(dirtyRects[targetIdx1], dirtyRects[targetIdx2]);
dirtyRects.erase(dirtyRects.begin() + targetIdx2);
if (targetCanOptimize) {
// The simulation found follow-up simplifications, so run the full pass and recount
BasicOptimize(dirtyRects);
totalPixels = CalcTotalPixels(dirtyRects);
} else {
totalPixels = minTotalPixels;
}
rectCount = (uint32_t)dirtyRects.size();
if (rectCount == 1) {
return;
}
}
}
#ifdef _DEBUG
// Debug-only self test of BasicOptimize, run during static initialization of
// this translation unit.
static Ignore _ = [] {
// Orders rects lexicographically by (left, top, right, bottom) so results can
// be compared independently of their output order
auto rectComp = [](const RectU& l, const RectU& r) {
return std::tuple(l.left, l.top, l.right, l.bottom) <
std::tuple(r.left, r.top, r.right, r.bottom);
};
SmallVector<RectU, 0> dirtyRects;
dirtyRects.reserve(16);
// Case 1: five overlapping rects are expected to collapse into two
dirtyRects.emplace_back(0, 0, 2, 2);
dirtyRects.emplace_back(1, 1, 3, 4);
dirtyRects.emplace_back(2, 1, 4, 3);
dirtyRects.emplace_back(0, 1, 3, 2);
dirtyRects.emplace_back(3, 3, 4, 4);
BasicOptimize(dirtyRects);
std::sort(dirtyRects.begin(), dirtyRects.end(), rectComp);
assert(dirtyRects.size() == 2);
assert((dirtyRects[0] == RectU{ 0, 0, 2, 2 }));
assert((dirtyRects[1] == RectU{ 1, 1, 4, 4 }));
// Case 2: a rect fully contained in another one is removed
dirtyRects.clear();
dirtyRects.emplace_back(0, 0, 1, 1);
dirtyRects.emplace_back(0, 0, 2, 2);
BasicOptimize(dirtyRects);
assert(dirtyRects.size() == 1);
assert((dirtyRects[0] == RectU{ 0, 0, 2, 2 }));
return Ignore();
}();
#endif
}

View file

@ -0,0 +1,11 @@
#pragma once
#include <SmallVector.h>
namespace Magpie {
// Stateless helper for simplifying a list of dirty rects.
struct DirtyRectsOptimizer {
// Tries to reduce the number of dirty rects and the total pixel count.
// dirtyRects is modified in place.
static void Execute(SmallVectorImpl<RectU>& dirtyRects) noexcept;
};
}

View file

@ -0,0 +1,339 @@
#include "pch.h"
#include "DuplicateFrameChecker.h"
#include "DebugInfo.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "shaders/DuplicateFrameCS.h"
#include "shaders/DuplicateFrameCS_NoBoundsChecking.h"
namespace Magpie {
// Tuning constants for dynamic duplicate frame detection (used by CheckFrame).
// Number of consecutive frames checked at start and after a duplicate was found.
static constexpr uint16_t INITIAL_CHECK_COUNT = 16;
// Initial number of consecutive frames for which the check is skipped.
static constexpr uint16_t INITIAL_SKIP_COUNT = 1;
// Upper bound to which the skip count is allowed to grow.
static constexpr uint16_t MAX_SKIP_COUNT = 16;
DuplicateFrameChecker::DuplicateFrameChecker() noexcept :
_nextSkipCount(INITIAL_SKIP_COUNT), _framesLeft(INITIAL_CHECK_COUNT) {}
// Duplicate frames are checked with D3D11 rather than D3D12, for two reasons:
// 1. D3D11 supports IDXGIDevice::SetGPUThreadPriority, which can raise the GPU
//    priority, while D3D12 has no equivalent interface.
// 2. For small workloads D3D11 starts rendering faster than D3D12; the
//    difference can exceed 50us.
//
// For capture methods that do not support dirty rects and whose captured frame
// has no extra pixels on the right and bottom edges, bounds checking can be
// disabled for better performance.
//
// Creates the compute shader, the constant / result / read-back buffers, the
// result UAV and a point sampler, then binds shader, UAV and sampler to the
// compute stage of the given context. The device and context pointers are
// stored without taking a reference. Returns false if any creation call fails.
bool DuplicateFrameChecker::Initialize(
ID3D11Device5* d3d11Device,
ID3D11DeviceContext4* d3d11DC,
const ColorInfo& colorInfo,
SizeU frameSize,
uint32_t captureFrameCount,
bool disableBoundsChecking
) noexcept {
assert(ScalingWindow::Get().Options().duplicateFrameDetectionMode !=
DuplicateFrameDetectionMode::Never);
_device = d3d11Device;
_deviceContext = d3d11DC;
_isScRGB = colorInfo.kind != winrt::AdvancedColorKind::StandardDynamicRange;
_frameSize = frameSize;
#ifdef _DEBUG
_isBoundsCheckingDisabled = disableBoundsChecking;
#endif
// One (lazily created) SRV slot per capture frame
_frameSrvs.resize(captureFrameCount);
// Pick the shader variant with or without bounds checking
HRESULT hr = d3d11Device->CreateComputeShader(
disableBoundsChecking ? DuplicateFrameCS_NoBoundsChecking : DuplicateFrameCS,
disableBoundsChecking ? sizeof(DuplicateFrameCS_NoBoundsChecking) : sizeof(DuplicateFrameCS),
nullptr,
_dupFrameCS.put()
);
if (FAILED(hr)) {
Logger::Get().ComError("CreateComputeShader 失败", hr);
return false;
}
{
// The same desc object is reused and modified for all three buffers below,
// so the order of these statements matters.
D3D11_BUFFER_DESC desc = {
// CSSetConstantBuffers1 requires offsets to be aligned to 256 bytes.
// Each dirty rect gets its own 256-byte slot; the last slot only needs
// room for the 8 constants that are actually written.
.ByteWidth = (MAX_CAPTURE_DIRTY_RECT_COUNT - 1) * 256 + 8 * sizeof(uint32_t),
.Usage = D3D11_USAGE_DYNAMIC,
.BindFlags = D3D11_BIND_CONSTANT_BUFFER,
.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
.StructureByteStride = desc.ByteWidth
};
hr = d3d11Device->CreateBuffer(&desc, nullptr, _constantBuffer.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateBuffer 失败", hr);
return false;
}
// Result buffer: one uint32 per dirty rect, written through a UAV.
// NOTE(review): CPUAccessFlags is still D3D11_CPU_ACCESS_WRITE here, carried
// over from the constant buffer desc - confirm this is intended.
desc.ByteWidth = MAX_CAPTURE_DIRTY_RECT_COUNT * sizeof(uint32_t);
desc.Usage = D3D11_USAGE_DEFAULT;
desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
desc.StructureByteStride = desc.ByteWidth;
hr = d3d11Device->CreateBuffer(&desc, nullptr, _resultBuffer.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateBuffer 失败", hr);
return false;
}
// Staging copy of the result buffer that the CPU can map for reading
desc.Usage = D3D11_USAGE_STAGING;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
desc.BindFlags = 0;
hr = d3d11Device->CreateBuffer(&desc, nullptr, _readBackBuffer.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateBuffer 失败", hr);
return false;
}
}
{
D3D11_UNORDERED_ACCESS_VIEW_DESC desc = {
.Format = DXGI_FORMAT_R32_UINT,
.ViewDimension = D3D11_UAV_DIMENSION_BUFFER,
.Buffer = {
.NumElements = MAX_CAPTURE_DIRTY_RECT_COUNT
}
};
hr = d3d11Device->CreateUnorderedAccessView(_resultBuffer.get(), &desc, _resultBufferUav.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateUnorderedAccessView 失败", hr);
return false;
}
}
{
D3D11_SAMPLER_DESC desc{
.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT,
.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP,
.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP,
.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP,
.ComparisonFunc = D3D11_COMPARISON_NEVER
};
hr = d3d11Device->CreateSamplerState(&desc, _sampler.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateSamplerState 失败", hr);
return false;
}
}
// Bind the pipeline state once; later code in this file only rebinds SRVs and
// the constant buffer.
_deviceContext->CSSetShader(_dupFrameCS.get(), nullptr, 0);
{
ID3D11UnorderedAccessView* uav = _resultBufferUav.get();
_deviceContext->CSSetUnorderedAccessViews(0, 1, &uav, nullptr);
}
{
ID3D11SamplerState* t = _sampler.get();
_deviceContext->CSSetSamplers(0, 1, &t);
}
return true;
}
// Checks a newly captured frame against the last adopted frame and removes
// from dirtyRects every rect whose content did not change. An empty dirtyRects
// on return therefore means the frame is a duplicate.
//
// frameResource - texture of the new frame (its SRV is created on first use)
// frameIdx      - index of the frame in the capture frame pool
// dirtyRects    - in: candidate dirty rects (non-empty, at most
//                 MAX_CAPTURE_DIRTY_RECT_COUNT); out: rects that really changed
//
// In DuplicateFrameDetectionMode::Always every frame is checked. Otherwise the
// check alternates between a "checking" phase and a "skipping" phase whose
// lengths adapt to how often duplicates are found (see #787).
// Returns S_OK or the failing HRESULT.
HRESULT DuplicateFrameChecker::CheckFrame(
ID3D11Texture2D* frameResource,
uint32_t frameIdx,
SmallVectorImpl<RectU>& dirtyRects
) noexcept {
assert(!dirtyRects.empty() && dirtyRects.size() <= MAX_CAPTURE_DIRTY_RECT_COUNT);
#ifdef _DEBUG
{
D3D11_TEXTURE2D_DESC desc;
frameResource->GetDesc(&desc);
assert(desc.Width == _frameSize.width && desc.Height == _frameSize.height);
if (_isBoundsCheckingDisabled) {
// Make sure the captured frame has no extra pixels on the right and bottom edges
for (const RectU& rect : dirtyRects) {
assert(rect.right == desc.Width && rect.bottom == desc.Height);
}
}
}
#endif
// Create the SRV for this frame slot on first use
if (!_frameSrvs[frameIdx]) {
HRESULT hr = _device->CreateShaderResourceView(frameResource, nullptr, _frameSrvs[frameIdx].put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateShaderResourceView 失败", hr);
return hr;
}
}
// The first frame does not need a duplicate check
if (_oldFrameIdx == std::numeric_limits<uint32_t>::max()) {
return S_OK;
}
if (ScalingWindow::Get().Options().duplicateFrameDetectionMode == DuplicateFrameDetectionMode::Always) {
HRESULT hr = _CheckDirtyRects(frameIdx, dirtyRects);
if (FAILED(hr)) {
Logger::Get().ComError("_CheckDirtyRects 失败", hr);
return hr;
}
return S_OK;
}
// Dynamic duplicate frame detection, see #787
if (_isCheckingForDuplicateFrame) {
// Checking phase: when the budget runs out, schedule a skipping phase
if (--_framesLeft == 0) {
_isCheckingForDuplicateFrame = false;
_framesLeft = _nextSkipCount;
if (_nextSkipCount < MAX_SKIP_COUNT) {
// Increase the number of frames skipped consecutively next time
++_nextSkipCount;
}
}
HRESULT hr = _CheckDirtyRects(frameIdx, dirtyRects);
if (FAILED(hr)) {
Logger::Get().ComError("_CheckDirtyRects 失败", hr);
return hr;
}
if (dirtyRects.empty()) {
// A duplicate frame was found: restart the checking phase from scratch
_isCheckingForDuplicateFrame = true;
_framesLeft = INITIAL_CHECK_COUNT;
_nextSkipCount = INITIAL_SKIP_COUNT;
}
} else {
// Skipping phase: the frame is assumed not to be a duplicate
if (--_framesLeft == 0) {
_isCheckingForDuplicateFrame = true;
// The 2nd round checks 10 consecutive frames, then the number gradually
// decreases; from the 16th round on only 2 consecutive frames are checked
_framesLeft = uint32_t((-4 * (int)_nextSkipCount + 78) / 7);
}
#ifdef MP_DEBUG_INFO
if (DEBUG_INFO.enableStatisticsForDynamicDuplicateFrameDetection) {
// This frame was predicted not to be a duplicate; verify the prediction
// on a copy so the caller's dirtyRects stays untouched
SmallVector<RectU> tempRects(dirtyRects.begin(), dirtyRects.end());
HRESULT hr = _CheckDirtyRects(frameIdx, tempRects);
if (FAILED(hr)) {
Logger::Get().ComError("_CheckDirtyRects 失败", hr);
return hr;
}
auto lk = DEBUG_INFO.lock.lock_exclusive();
++DEBUG_INFO.ddfdSkippedFrameCount;
if (tempRects.empty()) {
++DEBUG_INFO.ddfdWrongPredictionCount;
}
}
#endif
}
return S_OK;
}
// Records frameIdx as the reference ("old") frame that subsequent frames are
// compared against.
void DuplicateFrameChecker::OnFrameAdopted(uint32_t frameIdx) noexcept {
_oldFrameIdx = frameIdx;
}
void DuplicateFrameChecker::OnCaptureStopped() noexcept {
_oldFrameIdx = std::numeric_limits<uint32_t>::max();
std::fill(_frameSrvs.begin(), _frameSrvs.end(), nullptr);
}
// Runs the duplicate frame compute shader once per dirty rect, comparing the
// frame at _oldFrameIdx with the frame at newFrameIdx, then reads the results
// back and erases from dirtyRects every rect whose result slot does not hold
// the current target value (i.e. whose content did not change).
// Returns S_OK, or the HRESULT of a failed Map call.
HRESULT DuplicateFrameChecker::_CheckDirtyRects(
uint32_t newFrameIdx,
SmallVectorImpl<RectU>& dirtyRects
) noexcept {
assert(dirtyRects.size() <= MAX_CAPTURE_DIRTY_RECT_COUNT);
{
// Bind the old and the new frame as shader inputs
assert(_frameSrvs[_oldFrameIdx] && _frameSrvs[newFrameIdx]);
ID3D11ShaderResourceView* srvs[]{ _frameSrvs[_oldFrameIdx].get(), _frameSrvs[newFrameIdx].get()};
_deviceContext->CSSetShaderResources(0, 2, srvs);
}
const uint32_t dirtyRectCount = (uint32_t)dirtyRects.size();
D3D11_MAPPED_SUBRESOURCE ms;
HRESULT hr = _deviceContext->Map(_constantBuffer.get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &ms);
if (FAILED(hr)) {
Logger::Get().ComError("ID3D11DeviceContext::Map 失败", hr);
return hr;
}
// A fresh marker value for every call, so results left over from earlier
// calls in the result buffer are not mistaken for current ones.
// Presumably the shader writes this value into result slot i when it finds a
// difference inside rect i - confirm against DuplicateFrameCS.
++_curTargetValue;
// Write one set of constants per dirty rect, each in its own 256-byte slot
for (uint32_t i = 0; i < dirtyRectCount; ++i) {
const RectU& dirtyRect = dirtyRects[i];
alignas(32) DirectXHelper::Constant32 constants[] = {
{.uintVal = dirtyRect.left},
{.uintVal = dirtyRect.top},
{.uintVal = dirtyRect.right},
{.uintVal = dirtyRect.bottom},
{.floatVal = 1.0f / _frameSize.width},
{.floatVal = 1.0f / _frameSize.height},
{.uintVal = _curTargetValue},
{.uintVal = i}
};
// CSSetConstantBuffers1 requires offsets to be aligned to 256 bytes
std::memcpy((uint8_t*)ms.pData + i * 256, constants, sizeof(constants));
}
_deviceContext->Unmap(_constantBuffer.get(), 0);
for (uint32_t i = 0; i < dirtyRectCount; ++i) {
{
// Select the i-th 256-byte slot: offsets and sizes are expressed in
// 16-byte shader constants, so 16 constants correspond to 256 bytes
ID3D11Buffer* buffer = _constantBuffer.get();
UINT firstConstant = i * 16;
UINT numConstants = 16;
_deviceContext->CSSetConstantBuffers1(0, 1, &buffer, &firstConstant, &numConstants);
}
const RectU& dirtyRect = dirtyRects[i];
// One thread group per DUP_FRAME_DISPATCH_BLOCK_SIZE-sized block, rounded up
_deviceContext->Dispatch(
(dirtyRect.right - dirtyRect.left + DUP_FRAME_DISPATCH_BLOCK_SIZE - 1) / DUP_FRAME_DISPATCH_BLOCK_SIZE,
(dirtyRect.bottom - dirtyRect.top + DUP_FRAME_DISPATCH_BLOCK_SIZE - 1) / DUP_FRAME_DISPATCH_BLOCK_SIZE,
1
);
}
{
// Copy only the result slots that were used (4 bytes per dirty rect)
D3D11_BOX box = {
.right = dirtyRectCount * 4,
.bottom = 1,
.back = 1
};
_deviceContext->CopySubresourceRegion(_readBackBuffer.get(), 0, 0, 0, 0, _resultBuffer.get(), 0, &box);
}
// Read back the results
SmallVector<uint32_t, 4> removeList;
hr = _deviceContext->Map(_readBackBuffer.get(), 0, D3D11_MAP_READ, 0, &ms);
if (FAILED(hr)) {
Logger::Get().ComError("ID3D11DeviceContext::Map 失败", hr);
return hr;
}
for (uint32_t i = 0; i < dirtyRectCount; ++i) {
if (((uint32_t*)ms.pData)[i] != _curTargetValue) {
// The picture did not change inside this rect
removeList.push_back(i);
}
}
_deviceContext->Unmap(_readBackBuffer.get(), 0);
if (!removeList.empty()) {
// Erase from back to front so the remaining indices stay valid
std::sort(removeList.begin(), removeList.end(), std::greater<uint32_t>());
for (uint32_t idx : removeList) {
dirtyRects.erase(dirtyRects.begin() + idx);
}
}
return S_OK;
}
}

View file

@ -0,0 +1,64 @@
#pragma once
#include "SmallVector.h"
#include <d3d11_4.h>
namespace Magpie {
// Detects duplicate frames by comparing a newly captured frame with the last
// adopted one on the GPU (D3D11 compute shader), rect by rect.
// Neither copyable nor movable.
class DuplicateFrameChecker {
public:
DuplicateFrameChecker() noexcept;
DuplicateFrameChecker(const DuplicateFrameChecker&) = delete;
DuplicateFrameChecker(DuplicateFrameChecker&&) = delete;
~DuplicateFrameChecker() = default;
// Creates the GPU resources. Must succeed before CheckFrame is used.
// The device and context pointers are stored without taking a reference.
bool Initialize(
ID3D11Device5* d3d11Device,
ID3D11DeviceContext4* d3d11DC,
const ColorInfo& colorInfo,
SizeU frameSize,
uint32_t captureFrameCount,
bool disableBoundsChecking
) noexcept;
// Removes unchanged rects from dirtyRects; an empty list on return means the
// frame is a duplicate.
HRESULT CheckFrame(
ID3D11Texture2D* frameResource,
uint32_t frameIdx,
SmallVectorImpl<RectU>& dirtyRects
) noexcept;
// Makes frameIdx the reference frame for subsequent checks.
void OnFrameAdopted(uint32_t frameIdx) noexcept;
// Drops the reference frame and all cached frame SRVs.
void OnCaptureStopped() noexcept;
private:
HRESULT _CheckDirtyRects(uint32_t newFrameIdx, SmallVectorImpl<RectU>& dirtyRects) noexcept;
// Non-owning; set by Initialize.
ID3D11Device5* _device = nullptr;
ID3D11DeviceContext4* _deviceContext = nullptr;
SizeU _frameSize{};
winrt::com_ptr<ID3D11ComputeShader> _dupFrameCS;
winrt::com_ptr<ID3D11Buffer> _constantBuffer;
winrt::com_ptr<ID3D11Buffer> _resultBuffer;
winrt::com_ptr<ID3D11UnorderedAccessView> _resultBufferUav;
winrt::com_ptr<ID3D11Buffer> _readBackBuffer;
winrt::com_ptr<ID3D11SamplerState> _sampler;
// One lazily created SRV per capture frame slot.
std::vector<winrt::com_ptr<ID3D11ShaderResourceView>> _frameSrvs;
// Index of the reference frame; uint32 max means "no frame adopted yet".
uint32_t _oldFrameIdx = std::numeric_limits<uint32_t>::max();
// Marker value incremented on every check and compared with the read-back results.
uint32_t _curTargetValue = 0;
// State of the dynamic duplicate frame detection
uint16_t _nextSkipCount;
uint16_t _framesLeft;
bool _isScRGB = false;
#ifdef _DEBUG
bool _isBoundsCheckingDisabled = false;
#endif
bool _isCheckingForDuplicateFrame = true;
};
}

View file

@ -1,111 +0,0 @@
#include "pch.h"
#include "DwmSharedSurfaceFrameSource.h"
#include "DeviceResources.h"
#include "DirectXHelper.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "Win32Helper.h"
namespace Magpie {
// Signature of DwmGetDxSharedSurface, which is resolved at run time from
// user32.dll (see _Initialize) instead of being linked statically.
using DwmGetDxSharedSurfaceFunc = BOOL(
HWND hWnd,
HANDLE* phSurface,
LUID* pAdapterLuid,
ULONG* pFmtWindow,
ULONG* pPresentFlags,
ULONGLONG* pWin32KUpdateId
);
// Resolved function pointer; stays nullptr if the export cannot be found.
static DwmGetDxSharedSurfaceFunc* DwmGetDxSharedSurface = nullptr;
bool DwmSharedSurfaceFrameSource::_Initialize() noexcept {
[[maybe_unused]] static Ignore _ = [] {
DwmGetDxSharedSurface = Win32Helper::LoadSystemFunction<DwmGetDxSharedSurfaceFunc>(
L"user32.dll", "DwmGetDxSharedSurface");
return Ignore();
}();
if (!DwmGetDxSharedSurface) {
Logger::Get().Win32Error("获取函数 DwmGetDxSharedSurface 地址失败");
return false;
}
const SrcTracker& srcTracker = ScalingWindow::Get().SrcTracker();
RECT frameRect;
double a, bx, by;
if (!_GetMapToOriginDPI(srcTracker.Handle(), a, bx, by)) {
// 很可能是因为窗口没有重定向表面,这种情况下 DwmSharedSurface 捕获肯定失败
Logger::Get().Error("_GetMapToOriginDPI 失败");
return false;
}
Logger::Get().Info(fmt::format("源窗口 DPI 缩放为 {}", 1 / a));
const RECT& srcRect = srcTracker.SrcRect();
frameRect = RECT{
std::lround(srcRect.left * a + bx),
std::lround(srcRect.top * a + by),
std::lround(srcRect.right * a + bx),
std::lround(srcRect.bottom * a + by)
};
if (frameRect.left < 0 || frameRect.top < 0 || frameRect.right < 0
|| frameRect.bottom < 0 || frameRect.right - frameRect.left <= 0
|| frameRect.bottom - frameRect.top <= 0
) {
Logger::Get().Error("裁剪失败");
return false;
}
_frameInWnd = {
(UINT)frameRect.left,
(UINT)frameRect.top,
0,
(UINT)frameRect.right,
(UINT)frameRect.bottom,
1
};
_output = DirectXHelper::CreateTexture2D(
_deviceResources->GetD3DDevice(),
DXGI_FORMAT_B8G8R8A8_UNORM,
frameRect.right - frameRect.left,
frameRect.bottom - frameRect.top,
D3D11_BIND_SHADER_RESOURCE
);
if (!_output) {
Logger::Get().Error("CreateTexture2D 失败");
return false;
}
Logger::Get().Info("DwmSharedSurfaceFrameSource 初始化完成");
return true;
}
// Fetches the source window's shared surface, opens it on our device and
// copies the _frameInWnd region into the output texture.
// Returns NewFrame on success and Error if the surface cannot be obtained or
// opened.
FrameSourceState DwmSharedSurfaceFrameSource::_Update() noexcept {
    HANDLE surfaceHandle = NULL;
    const BOOL gotSurface = DwmGetDxSharedSurface(
        ScalingWindow::Get().SrcTracker().Handle(),
        &surfaceHandle,
        nullptr,
        nullptr,
        nullptr,
        nullptr
    );
    if (!gotSurface || !surfaceHandle) {
        Logger::Get().Win32Error("DwmGetDxSharedSurface 失败");
        return FrameSourceState::Error;
    }

    winrt::com_ptr<ID3D11Texture2D> surfaceTexture;
    const HRESULT hr = _deviceResources->GetD3DDevice()->OpenSharedResource(
        surfaceHandle, IID_PPV_ARGS(&surfaceTexture));
    if (FAILED(hr)) {
        Logger::Get().ComError("OpenSharedResource 失败", hr);
        return FrameSourceState::Error;
    }

    _deviceResources->GetD3DDC()->CopySubresourceRegion(
        _output.get(), 0, 0, 0, 0, surfaceTexture.get(), 0, &_frameInWnd);

    return FrameSourceState::NewFrame;
}
}

View file

@ -1,27 +0,0 @@
#pragma once
#include "FrameSourceBase.h"
namespace Magpie {
// Frame source that captures the source window through its DWM shared surface.
class DwmSharedSurfaceFrameSource final : public FrameSourceBase {
public:
virtual ~DwmSharedSurfaceFrameSource() {}
// This source always reports FrameSourceWaitType::NoWait.
FrameSourceWaitType WaitType() const noexcept override {
return FrameSourceWaitType::NoWait;
}
// Name of this capture method.
const char* Name() const noexcept override {
return "DwmSharedSurface";
}
protected:
// Computes the capture box and creates the output texture. Returns false on failure.
bool _Initialize() noexcept override;
// Copies the current content of the shared surface into the output texture.
FrameSourceState _Update() noexcept override;
private:
// Region of the shared surface to copy; filled in by _Initialize and passed as the
// source box to CopySubresourceRegion in _Update.
D3D11_BOX _frameInWnd{};
};
}

View file

@ -1,312 +0,0 @@
#include "pch.h"
#include "EffectCacheManager.h"
#include "CommonSharedConstants.h"
#include "Logger.h"
#include "StrHelper.h"
#include "Win32Helper.h"
#include "YasHelper.h"
#include <d3dcompiler.h>
#include <rapidhash.h>
namespace yas::detail {
// yas serializer specialization for winrt::com_ptr<ID3DBlob>.
// Stored layout: a 32-bit byte count followed by the raw bytes of the blob.
template <std::size_t F>
struct serializer<
    type_prop::not_a_fundamental,
    ser_case::use_internal_serializer,
    F,
    winrt::com_ptr<ID3DBlob>
> {
    // Writes the blob size (truncated to 32 bits) and then the blob contents.
    // The blob must not be null.
    template <typename Archive>
    static Archive& save(Archive& ar, const winrt::com_ptr<ID3DBlob>& blob) {
        uint32_t size = (uint32_t)blob->GetBufferSize();
        ar& size;
        ar.write(blob->GetBufferPointer(), size);
        return ar;
    }

    // Reads the size, allocates a blob of that size and fills it from the archive.
    // Throws std::exception when the blob cannot be allocated.
    template <typename Archive>
    static Archive& load(Archive& ar, winrt::com_ptr<ID3DBlob>& blob) {
        uint32_t size = 0;
        ar& size;

        HRESULT hr = D3DCreateBlob(size, blob.put());
        if (FAILED(hr)) {
            Logger::Get().ComError("D3DCreateBlob 失败", hr);
            // Throw by value: throwing a pointer obtained from `new` leaks the object
            // and is not caught by handlers written as `catch (const std::exception&)`.
            throw std::exception();
        }

        ar.read(blob->GetBufferPointer(), size);
        return ar;
    }
};
}
namespace Magpie {
// yas serialization hooks for the effect description types.
// The field order below defines the cache file layout; any change to it must be
// accompanied by a change of EFFECT_CACHE_VERSION.

template <typename Archive>
void serialize(Archive& ar, EffectParameterDesc& o) {
    ar & o.name;
    ar & o.label;
    ar & o.constant;
}

template <typename Archive>
void serialize(Archive& ar, EffectIntermediateTextureDesc& o) {
    ar & o.format;
    ar & o.name;
    ar & o.source;
    ar & o.sizeExpr;
}

template <typename Archive>
void serialize(Archive& ar, EffectSamplerDesc& o) {
    ar & o.filterType;
    ar & o.addressType;
    ar & o.name;
}

template <typename Archive>
void serialize(Archive& ar, EffectPassDesc& o) {
    ar & o.cso;
    ar & o.inputs;
    ar & o.outputs;
    ar & o.numThreads[0];
    ar & o.numThreads[1];
    ar & o.numThreads[2];
    ar & o.blockSize;
    ar & o.desc;
    ar & o.flags;
}

template <typename Archive>
void serialize(Archive& ar, EffectDesc& o) {
    ar & o.name;
    ar & o.params;
    ar & o.textures;
    ar & o.samplers;
    ar & o.passes;
    ar & o.flags;
}
// Maximum number of entries kept in the in-memory cache; once exceeded, the older
// half of the entries is evicted.
static constexpr uint32_t MAX_CACHE_COUNT = 127;
// Cache version.
// Bump it whenever the cache file layout changes so that old cache files are rejected.
static constexpr uint32_t EFFECT_CACHE_VERSION = 15;
// Flattens an effect name so it can be used as a single file name component:
// every backslash is replaced with '#'. All other characters are kept as is.
static std::wstring GetLinearEffectName(std::wstring_view effectName) {
    std::wstring flattened(effectName);
    std::replace(flattened.begin(), flattened.end(), L'\\', L'#');
    return flattened;
}
// Builds the path of a cache file inside CommonSharedConstants::CACHE_DIR.
// File naming scheme: {effect name}_{flags (4 hex digits)}_{hash (16 hex digits)}
static std::wstring GetCacheFileName(std::wstring_view linearEffectName, uint32_t flags, uint64_t hash) {
    // The flags must fit into the four hex digits reserved for them.
    assert(flags <= 0xFFFF);

    std::wstring path = fmt::format(
        L"{}\\{}_{:04x}_{:016x}", CommonSharedConstants::CACHE_DIR, linearEffectName, flags, hash);
    return path;
}
void EffectCacheManager::_AddToMemCache(const std::wstring& cacheFileName, std::string& key, const EffectDesc& desc) {
auto lock = _lock.lock_exclusive();
_memCache[cacheFileName] = _MemCacheItem{
.key = std::move(key),
.effectDesc = desc,
.lastAccess = ++_lastAccess
};
if (_memCache.size() > MAX_CACHE_COUNT) {
assert(_memCache.size() == MAX_CACHE_COUNT + 1);
// 清理一半较旧的内存缓存
std::array<uint32_t, MAX_CACHE_COUNT + 1> access{};
std::transform(_memCache.begin(), _memCache.end(), access.begin(),
[](const auto& pair) {return pair.second.lastAccess; });
auto midIt = access.begin() + access.size() / 2;
std::nth_element(access.begin(), midIt, access.end());
const uint32_t mid = *midIt;
for (auto it = _memCache.begin(); it != _memCache.end();) {
if (it->second.lastAccess < mid) {
it = _memCache.erase(it);
} else {
++it;
}
}
Logger::Get().Info("已清理内存缓存");
}
}
bool EffectCacheManager::_LoadFromMemCache(const std::wstring& cacheFileName, std::string_view key, EffectDesc& desc) {
auto lock = _lock.lock_exclusive();
auto it = _memCache.find(cacheFileName);
if (it != _memCache.end()) {
_MemCacheItem& cacheItem = it->second;
// 防止哈希碰撞
if (cacheItem.key != key) {
return false;
}
desc = cacheItem.effectDesc;
cacheItem.lastAccess = ++_lastAccess;
Logger::Get().Info(StrHelper::Concat("已读取缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
return true;
}
return false;
}
bool EffectCacheManager::Load(
std::wstring_view effectName,
uint32_t flags,
uint64_t hash,
std::string_view key,
EffectDesc& desc
) {
assert(!effectName.empty() && !key.empty());
std::wstring cacheFileName = GetCacheFileName(GetLinearEffectName(effectName), flags, hash);
if (_LoadFromMemCache(cacheFileName, key, desc)) {
return true;
}
if (!Win32Helper::FileExists(cacheFileName.c_str())) {
return false;
}
std::vector<BYTE> buf;
if (!Win32Helper::ReadFile(cacheFileName.c_str(), buf) || buf.empty()) {
return false;
}
std::string cachedKey;
try {
yas::mem_istream mi(buf.data(), buf.size());
yas::binary_iarchive<yas::mem_istream, yas::binary> ia(mi);
uint32_t cacheVersion;
ia.read(cacheVersion);
if (cacheVersion != EFFECT_CACHE_VERSION) {
Logger::Get().Info("缓存版本不匹配");
return false;
}
ia& cachedKey;
if (cachedKey != key) {
Logger::Get().Info("缓存键不匹配");
return false;
}
ia& desc;
} catch (...) {
Logger::Get().Error("反序列化失败");
desc = {};
return false;
}
_AddToMemCache(cacheFileName, cachedKey, desc);
Logger::Get().Info(StrHelper::Concat("已读取缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
return true;
}
void EffectCacheManager::Save(
std::wstring_view effectName,
uint32_t flags,
uint64_t hash,
std::string key,
const EffectDesc& desc
) {
const std::wstring linearEffectName = GetLinearEffectName(effectName);
std::vector<BYTE> buffer;
buffer.reserve(4096);
try {
yas::vector_ostream os(buffer);
yas::binary_oarchive<yas::vector_ostream<BYTE>, yas::binary> oa(os);
oa.write(EFFECT_CACHE_VERSION);
oa& key& desc;
} catch (...) {
Logger::Get().Error("序列化 EffectDesc 失败");
return;
}
if (!CreateDirectory(CommonSharedConstants::CACHE_DIR, nullptr)) {
if (GetLastError() != ERROR_ALREADY_EXISTS) {
Logger::Get().Win32Error("创建 cache 文件夹失败");
return;
}
// 清理缓存
WIN32_FIND_DATA findData{};
wil::unique_hfind hFind(FindFirstFileEx(
StrHelper::Concat(CommonSharedConstants::CACHE_DIR, L"\\*").c_str(),
FindExInfoBasic, &findData, FindExSearchNameMatch, nullptr, FIND_FIRST_EX_LARGE_FETCH));
if (hFind) {
do {
std::wstring_view fileName(findData.cFileName);
if (!fileName.starts_with(linearEffectName)) {
continue;
}
const size_t effectNameLen = linearEffectName.size();
if (fileName.size() == effectNameLen + 22) {
// 保留标志不同的缓存
if (!fileName.substr(effectNameLen).starts_with(fmt::format(L"_{:04x}_", flags))) {
continue;
}
int i = 6;
for (; i < 22; ++i) {
const wchar_t c = fileName[effectNameLen + i];
if (!((c >= L'0' && c <= L'9') || (c >= L'a' && c <= L'f'))) {
break;
}
}
if (i != 22) {
continue;
}
} else if (fileName.size() == effectNameLen + 18) {
// 删除旧版缓存
if (fileName[effectNameLen] != L'_') {
continue;
}
int i = 1;
for (; i < 18; ++i) {
const wchar_t c = fileName[effectNameLen + i];
if (!((c >= L'0' && c <= L'9') || (c >= L'a' && c <= L'f'))) {
break;
}
}
if (i != 18) {
continue;
}
} else {
continue;
}
if (!DeleteFile(StrHelper::Concat(
CommonSharedConstants::CACHE_DIR, L"\\", findData.cFileName).c_str()))
{
Logger::Get().Win32Error(StrHelper::Concat("删除缓存文件 ",
StrHelper::UTF16ToUTF8(findData.cFileName), " 失败"));
}
} while (FindNextFile(hFind.get(), &findData));
} else {
Logger::Get().Win32Error("查找缓存文件失败");
}
}
std::wstring cacheFileName = GetCacheFileName(linearEffectName, flags, hash);
if (!Win32Helper::WriteFile(cacheFileName.c_str(), buffer)) {
Logger::Get().Error("保存缓存失败");
}
_AddToMemCache(cacheFileName, key, desc);
Logger::Get().Info(StrHelper::Concat("已保存缓存 ", StrHelper::UTF16ToUTF8(cacheFileName)));
}
// Returns the rapidhash value of the bytes of `key`.
uint64_t EffectCacheManager::GetHash(std::string_view key) {
    const char* const bytes = key.data();
    const size_t length = key.size();
    return rapidhash(bytes, length);
}
}

View file

@ -1,42 +0,0 @@
#pragma once
#include "EffectDesc.h"
#include <parallel_hashmap/phmap.h>
namespace Magpie {
// Singleton cache of effect descriptions, backed by an in-memory map and by files
// in the cache directory.
class EffectCacheManager {
public:
// Returns the single instance.
static EffectCacheManager& Get() noexcept {
static EffectCacheManager instance;
return instance;
}
EffectCacheManager(const EffectCacheManager&) = delete;
EffectCacheManager(EffectCacheManager&&) = delete;
// Looks up a cached description (memory first, then disk). `key` must match the key
// stored with the entry. Returns true and fills `desc` on success.
bool Load(std::wstring_view effectName, uint32_t flags, uint64_t hash, std::string_view key, EffectDesc& desc);
// Writes `desc` to the cache file selected by effectName/flags/hash and adds it to
// the in-memory cache. Failures are only logged.
void Save(std::wstring_view effectName, uint32_t flags, uint64_t hash, std::string key, const EffectDesc& desc);
// Hash of a cache key, used as part of the cache file name.
static uint64_t GetHash(std::string_view key);
private:
EffectCacheManager() = default;
// Adds an entry to _memCache, evicting older entries when it grows too large. `key` is moved from.
void _AddToMemCache(const std::wstring& cacheFileName, std::string& key, const EffectDesc& desc);
// Copies the entry for cacheFileName into `desc` if present and its key matches.
bool _LoadFromMemCache(const std::wstring& cacheFileName, std::string_view key, EffectDesc& desc);
// Synchronizes access to _memCache
wil::srwlock _lock;
struct _MemCacheItem {
// Full cache key, compared on lookup to rule out hash collisions
std::string key;
EffectDesc effectDesc;
// Value of _lastAccess at the time of the most recent insert or hit
uint32_t lastAccess = 0;
};
// Maps cache file name to cached item
phmap::flat_hash_map<std::wstring, _MemCacheItem> _memCache;
// Counter incremented on every insert and every hit; used to order entries by recency
UINT _lastAccess = 0;
};
}

File diff suppressed because it is too large Load diff

View file

@ -1,632 +0,0 @@
#include "pch.h"
#include "EffectDrawer.h"
#include "BackendDescriptorStore.h"
#include "DeviceResources.h"
#include "DirectXHelper.h"
#include "EffectHelper.h"
#include "EffectsProfiler.h"
#include "Logger.h"
#include "ScalingOptions.h"
#include "ScalingWindow.h"
#include "StrHelper.h"
#include "TextureHelper.h"
#include "Win32Helper.h"
namespace Magpie {
// Removes the descriptor-store cache entries of every texture except the first.
EffectDrawer::~EffectDrawer() {
    // Texture [0] is the input and is managed by the preceding EffectDrawer,
    // so the loop starts at index 1.
    for (size_t idx = 1; idx < _textures.size(); ++idx) {
        _descriptorStore->RemoveCache(_textures[idx].get());
    }
}
// Creates everything needed to run one effect: samplers, input/output/intermediate
// textures, compute shaders, per-pass resource views and the constant buffer.
//
// desc:            description of the effect (textures, samplers, passes, parameters).
// option:          user options for this effect (scaling type/scale and parameter values).
// deviceResources: provides the D3D11 device, device context and samplers.
// descriptorStore: provides SRVs/UAVs for textures; a pointer to it is kept.
// inOutTexture:    on entry *inOutTexture is the input texture of this effect; it is then
//                  overwritten with this effect's output texture. Note that it is set to
//                  nullptr when creating the output texture fails.
//
// Returns false on any failure; the reason is logged.
bool EffectDrawer::Initialize(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
BackendDescriptorStore& descriptorStore,
ID3D11Texture2D** inOutTexture
) noexcept {
_d3dDC = deviceResources.GetD3DDC();
_descriptorStore = &descriptorStore;
SIZE inputSize{};
{
D3D11_TEXTURE2D_DESC inputDesc;
(*inOutTexture)->GetDesc(&inputDesc);
inputSize = { (LONG)inputDesc.Width, (LONG)inputDesc.Height };
}
const SIZE outputSize = _CalcOutputSize(desc, option, inputSize);
if (outputSize.cx <= 0 || outputSize.cy <= 0) {
Logger::Get().Error("非法的输出尺寸");
return false;
}
_samplers.resize(desc.samplers.size());
for (UINT i = 0; i < _samplers.size(); ++i) {
const EffectSamplerDesc& samDesc = desc.samplers[i];
_samplers[i] = deviceResources.GetSampler(
samDesc.filterType == EffectSamplerFilterType::Linear ? D3D11_FILTER_MIN_MAG_MIP_LINEAR : D3D11_FILTER_MIN_MAG_MIP_POINT,
samDesc.addressType == EffectSamplerAddressType::Clamp ? D3D11_TEXTURE_ADDRESS_CLAMP : D3D11_TEXTURE_ADDRESS_WRAP
);
if (!_samplers[i]) {
Logger::Get().Error(fmt::format("创建采样器 {} 失败", samDesc.name));
return false;
}
}
// Create the textures.
// The first one is INPUT, the second one is OUTPUT; desc.textures is indexed at [1]
// below, so it must have at least two entries.
_textures.resize(desc.textures.size());
_textures[0].copy_from(*inOutTexture);
// Create the output texture. The original note states that its format is always
// DXGI_FORMAT_R8G8B8A8_UNORM; the code takes the format from desc.textures[1].
_textures[1] = DirectXHelper::CreateTexture2D(
deviceResources.GetD3DDevice(),
EffectHelper::FORMAT_DESCS[(uint32_t)desc.textures[1].format].dxgiFormat,
outputSize.cx,
outputSize.cy,
D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS
);
*inOutTexture = _textures[1].get();
if (!*inOutTexture) {
Logger::Get().Error("创建输出纹理失败");
return false;
}
for (size_t i = 2; i < desc.textures.size(); ++i) {
const EffectIntermediateTextureDesc& texDesc = desc.textures[i];
if (!texDesc.source.empty()) {
// Load the texture from a file. The path is relative to the "effects" folder
// and to the directory part of the effect name, if it has one.
size_t delimPos = desc.name.find_last_of('\\');
std::string texPath = delimPos == std::string::npos
? StrHelper::Concat("effects\\", texDesc.source)
: StrHelper::Concat("effects\\", std::string_view(desc.name.c_str(), delimPos + 1), texDesc.source);
_textures[i] = TextureHelper::LoadTexture(
StrHelper::UTF8ToUTF16(texPath).c_str(), deviceResources.GetD3DDevice());
if (!_textures[i]) {
Logger::Get().Error(fmt::format("加载纹理 {} 失败", texDesc.source));
return false;
}
if (texDesc.format != EffectIntermediateTextureFormat::UNKNOWN) {
// Check that the format of the loaded texture matches the declared one
D3D11_TEXTURE2D_DESC srcDesc{};
_textures[i]->GetDesc(&srcDesc);
if (srcDesc.Format != EffectHelper::FORMAT_DESCS[(uint32_t)texDesc.format].dxgiFormat) {
Logger::Get().Error("SOURCE 纹理格式不匹配");
return false;
}
}
} else {
// Intermediate texture: its size is given by two expressions that are evaluated
// with the shared expression parser (constants were defined by _CalcOutputSize).
SIZE texSize{};
try {
_exprParser.SetExpr(texDesc.sizeExpr.first);
texSize.cx = std::lround(_exprParser.Eval());
_exprParser.SetExpr(texDesc.sizeExpr.second);
texSize.cy = std::lround(_exprParser.Eval());
} catch (const mu::ParserError& e) {
Logger::Get().Error(fmt::format("计算中间纹理尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
return false;
}
if (texSize.cx <= 0 || texSize.cy <= 0) {
Logger::Get().Error("非法的中间纹理尺寸");
return false;
}
_textures[i] = DirectXHelper::CreateTexture2D(
deviceResources.GetD3DDevice(),
EffectHelper::FORMAT_DESCS[(UINT)texDesc.format].dxgiFormat,
texSize.cx,
texSize.cy,
D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS
);
if (!_textures[i]) {
Logger::Get().Error("创建纹理失败");
return false;
}
}
}
uint32_t passCount = (uint32_t)desc.passes.size();
_shaders.resize(passCount);
_srvs.resize(passCount);
_uavs.resize(passCount);
_dispatches.resize(passCount);
for (uint32_t i = 0; i < passCount; ++i) {
const EffectPassDesc& passDesc = desc.passes[i];
HRESULT hr = deviceResources.GetD3DDevice()->CreateComputeShader(
passDesc.cso->GetBufferPointer(), passDesc.cso->GetBufferSize(), nullptr, _shaders[i].put());
if (FAILED(hr)) {
Logger::Get().ComError("创建计算着色器失败", hr);
return false;
}
_srvs[i].resize(passDesc.inputs.size());
// Twice the number of outputs: the second half stays null and is used to unbind
// the UAVs after the pass has been dispatched.
_uavs[i].resize(passDesc.outputs.size() * 2);
}
if (!_UpdatePassResources(desc)) {
Logger::Get().Error("_UpdatePassResources 失败");
return false;
}
if (!_UpdateConstants(desc, option, deviceResources, inputSize, outputSize)) {
Logger::Get().Error("_UpdateConstants 失败");
return false;
}
return true;
}
// Runs every pass of the effect in order and notifies the profiler after each pass.
void EffectDrawer::Draw(EffectsProfiler& profiler) const noexcept {
    _PrepareForDraw();

    const uint32_t passCount = (uint32_t)_dispatches.size();
    for (uint32_t passIdx = 0; passIdx != passCount; ++passIdx) {
        _DrawPass(passIdx);
        profiler.OnEndPass(_d3dDC);
    }
}
void EffectDrawer::DrawForExport(const EffectDesc& desc, uint32_t passIdx) const noexcept {
_PrepareForDraw();
for (uint32_t i : _CalcPassesToDrawForExport(desc, passIdx)) {
_DrawPass(i);
}
}
// Adapts the effect to a new input texture and/or new sizes.
//
// The output texture and the intermediate textures are recreated only when their
// required size differs from the current one. Pass resources and constants are updated
// only if the input texture changed or at least one texture was recreated.
//
// inOutTexture: on entry *inOutTexture is the (possibly new) input texture; on success
//               it is set to this effect's output texture.
//
// Returns false on failure; the reason is logged.
bool EffectDrawer::ResizeTextures(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
ID3D11Texture2D** inOutTexture
) noexcept {
bool anyChange = false;
if (*inOutTexture != _textures[0].get()) {
_textures[0].copy_from(*inOutTexture);
anyChange = true;
}
SIZE inputSize{};
{
D3D11_TEXTURE2D_DESC inputDesc;
_textures[0]->GetDesc(&inputDesc);
inputSize = { (LONG)inputDesc.Width, (LONG)inputDesc.Height };
}
const SIZE outputSize = _CalcOutputSize(desc, option, inputSize);
if (outputSize.cx <= 0 || outputSize.cy <= 0) {
Logger::Get().Error("非法的输出尺寸");
return false;
}
// Recreate the output texture with the same format and bind flags if its size changed
D3D11_TEXTURE2D_DESC texDesc;
_textures[1]->GetDesc(&texDesc);
if ((LONG)texDesc.Width != outputSize.cx || (LONG)texDesc.Height != outputSize.cy) {
// Drop the cache entries of the old texture before it is replaced
_descriptorStore->RemoveCache(_textures[1].get());
_textures[1] = DirectXHelper::CreateTexture2D(
deviceResources.GetD3DDevice(),
texDesc.Format,
outputSize.cx,
outputSize.cy,
texDesc.BindFlags
);
if (!_textures[1]) {
Logger::Get().Error("创建输出纹理失败");
return false;
}
anyChange = true;
}
*inOutTexture = _textures[1].get();
for (size_t i = 2; i < _textures.size(); ++i) {
const std::pair<std::string, std::string>& sizeExpr = desc.textures[i].sizeExpr;
if (sizeExpr.first.empty()) {
// Textures loaded from a file do not need to be resized
continue;
}
SIZE texSize{};
try {
_exprParser.SetExpr(sizeExpr.first);
texSize.cx = std::lround(_exprParser.Eval());
_exprParser.SetExpr(sizeExpr.second);
texSize.cy = std::lround(_exprParser.Eval());
} catch (const mu::ParserError& e) {
Logger::Get().Error(fmt::format("计算中间纹理尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
return false;
}
if (texSize.cx <= 0 || texSize.cy <= 0) {
Logger::Get().Error("非法的中间纹理尺寸");
return false;
}
_textures[i]->GetDesc(&texDesc);
if ((LONG)texDesc.Width != texSize.cx || (LONG)texDesc.Height != texSize.cy) {
_descriptorStore->RemoveCache(_textures[i].get());
_textures[i] = DirectXHelper::CreateTexture2D(
deviceResources.GetD3DDevice(),
texDesc.Format,
texSize.cx,
texSize.cy,
texDesc.BindFlags
);
if (!_textures[i]) {
Logger::Get().Error("创建纹理失败");
return false;
}
anyChange = true;
}
}
if (!anyChange) {
// Nothing changed, so the existing views and constants are still valid
return true;
}
if (!_UpdatePassResources(desc)) {
Logger::Get().Error("_UpdatePassResources 失败");
return false;
}
if (!_UpdateConstants(desc, option, deviceResources, inputSize, outputSize)) {
Logger::Get().Error("_UpdateConstants 失败");
return false;
}
return true;
}
// Computes the output size of the effect for the given input size.
//
// If the effect declares output size expressions they are evaluated; otherwise the
// size follows from option.scalingType and option.scale.
//
// Side effect: defines the constants INPUT_WIDTH/INPUT_HEIGHT and, on success,
// OUTPUT_WIDTH/OUTPUT_HEIGHT on the shared expression parser, so that later size
// expressions can refer to them.
//
// Returns an empty SIZE ({0, 0}) when an expression cannot be evaluated or the
// scaling type is unknown.
SIZE EffectDrawer::_CalcOutputSize(
const EffectDesc& desc,
const EffectOption& option,
SIZE inputSize
) const noexcept {
_exprParser.DefineConst("INPUT_WIDTH", inputSize.cx);
_exprParser.DefineConst("INPUT_HEIGHT", inputSize.cy);
SIZE outputSize{};
const std::pair<std::string, std::string>& outputSizeExpr = desc.GetOutputSizeExpr();
if (outputSizeExpr.first.empty()) {
// No output size expression: the size depends on the scaling options
const SIZE rendererSize = Win32Helper::GetSizeOfRect(ScalingWindow::Get().RendererRect());
switch (option.scalingType) {
case ScalingType::Normal:
{
// Scale factors relative to the input size
outputSize.cx = std::lroundf(inputSize.cx * option.scale.first);
outputSize.cy = std::lroundf(inputSize.cy * option.scale.second);
break;
}
case ScalingType::Absolute:
{
// The scale values are the output size in pixels
outputSize.cx = std::lroundf(option.scale.first);
outputSize.cy = std::lroundf(option.scale.second);
break;
}
case ScalingType::Fit:
{
// In windowed-mode scaling, a Fit with a scale factor of 1 is treated as Fill.
// The scaling is guaranteed to be proportional in that case, but rounding can
// introduce an error of one pixel. Consider a window 100 wide and 50 high: when
// the width is resized to 101 the height rounds to 51, and when the width is then
// resized to 102 the height is still 51. Computing the size the Fit way would
// leave black bars in one of these two resizes, and it would also affect the
// later decision whether to append Bicubic.
const bool treatFitAsFill = ScalingWindow::Get().Options().IsWindowedMode() &&
IsApprox(option.scale.first, 1.0f) && IsApprox(option.scale.second, 1.0f);
if (!treatFitAsFill) {
// Largest proportional scale that fits into the renderer rect
const float fillScale = std::min(
float(rendererSize.cx) / inputSize.cx,
float(rendererSize.cy) / inputSize.cy
);
outputSize.cx = std::lroundf(inputSize.cx * fillScale * option.scale.first);
outputSize.cy = std::lroundf(inputSize.cy * fillScale * option.scale.second);
break;
}
[[fallthrough]];
}
case ScalingType::Fill:
{
outputSize = rendererSize;
break;
}
default:
assert(false);
return {};
}
} else {
assert(!outputSizeExpr.second.empty());
try {
_exprParser.SetExpr(outputSizeExpr.first);
outputSize.cx = std::lround(_exprParser.Eval());
_exprParser.SetExpr(outputSizeExpr.second);
outputSize.cy = std::lround(_exprParser.Eval());
} catch (const mu::ParserError& e) {
Logger::Get().Error(fmt::format("计算输出尺寸 {} 失败: {}", e.GetExpr(), e.GetMsg()));
return {};
}
}
_exprParser.DefineConst("OUTPUT_WIDTH", outputSize.cx);
_exprParser.DefineConst("OUTPUT_HEIGHT", outputSize.cy);
return outputSize;
}
bool EffectDrawer::_UpdatePassResources(const EffectDesc& desc) noexcept {
const uint32_t passCount = (uint32_t)desc.passes.size();
for (uint32_t i = 0; i < passCount; ++i) {
const SmallVector<uint32_t>& inputs = desc.passes[i].inputs;
const SmallVector<uint32_t>& outputs = desc.passes[i].outputs;
const std::pair<uint32_t, uint32_t>& blockSize = desc.passes[i].blockSize;
for (uint32_t j = 0; j < inputs.size(); ++j) {
auto srv = _srvs[i][j] = _descriptorStore->GetShaderResourceView(_textures[inputs[j]].get());
if (!srv) {
Logger::Get().Error("GetShaderResourceView 失败");
return false;
}
}
for (uint32_t j = 0; j < outputs.size(); ++j) {
auto uav = _uavs[i][j] = _descriptorStore->GetUnorderedAccessView(_textures[outputs[j]].get());
if (!uav) {
Logger::Get().Error("GetUnorderedAccessView 失败");
return false;
}
}
D3D11_TEXTURE2D_DESC outputDesc;
_textures[outputs[0]]->GetDesc(&outputDesc);
_dispatches[i] = {
(outputDesc.Width + blockSize.first - 1) / blockSize.first,
(outputDesc.Height + blockSize.second - 1) / blockSize.second
};
}
return true;
}
// Fills the constant data of the effect and creates the constant buffer, or updates
// it if it already exists.
//
// Layout (32-bit values): 10 built-in constants, then 4 values for every PS-style pass
// except the last pass, then one value per effect parameter unless the effect uses
// inline parameters. The total count is rounded up to a multiple of 4.
//
// Returns false if a parameter value in `option` is outside the range declared by the
// effect, or if the buffer cannot be created.
bool EffectDrawer::_UpdateConstants(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
SIZE inputSize,
SIZE outputSize
) noexcept {
const bool isInlineParams = desc.flags & EffectFlags::InlineParams;
SmallVector<EffectHelper::Constant32, 32> constants;
// The size must be a multiple of 4
const size_t builtinConstantCount = 10;
size_t psStylePassParams = 0;
// The last pass is deliberately left out of this loop.
// NOTE(review): presumably because the last pass writes to OUTPUT, whose size is
// already part of the built-in constants - confirm. Assumes desc.passes is not empty.
for (UINT i = 0, end = (UINT)desc.passes.size() - 1; i < end; ++i) {
if (desc.passes[i].flags & EffectPassFlags::PSStyle) {
psStylePassParams += 4;
}
}
constants.resize((builtinConstantCount + psStylePassParams + (isInlineParams ? 0 : desc.params.size()) + 3) / 4 * 4);
// cbuffer __CB1 : register(b0) {
// uint2 __inputSize;
// uint2 __outputSize;
// float2 __inputPt;
// float2 __outputPt;
// float2 __scale;
// [PARAMETERS...]
// };
constants[0].uintVal = inputSize.cx;
constants[1].uintVal = inputSize.cy;
constants[2].uintVal = outputSize.cx;
constants[3].uintVal = outputSize.cy;
constants[4].floatVal = 1.0f / inputSize.cx;
constants[5].floatVal = 1.0f / inputSize.cy;
constants[6].floatVal = 1.0f / outputSize.cx;
constants[7].floatVal = 1.0f / outputSize.cy;
constants[8].floatVal = outputSize.cx / (FLOAT)inputSize.cx;
constants[9].floatVal = outputSize.cy / (FLOAT)inputSize.cy;
// Values needed by PS-style passes: size and texel size of the pass's first output
EffectHelper::Constant32* pCurParam = constants.data() + builtinConstantCount;
if (psStylePassParams > 0) {
for (UINT i = 0, end = (UINT)desc.passes.size() - 1; i < end; ++i) {
if (desc.passes[i].flags & EffectPassFlags::PSStyle) {
D3D11_TEXTURE2D_DESC outputDesc;
_textures[desc.passes[i].outputs[0]]->GetDesc(&outputDesc);
pCurParam->uintVal = outputDesc.Width;
++pCurParam;
pCurParam->uintVal = outputDesc.Height;
++pCurParam;
pCurParam->floatVal = 1.0f / outputDesc.Width;
++pCurParam;
pCurParam->floatVal = 1.0f / outputDesc.Height;
++pCurParam;
}
}
}
if (!isInlineParams) {
for (UINT i = 0; i < desc.params.size(); ++i) {
const auto& paramDesc = desc.params[i];
// A value supplied by the user overrides the default, but must be within range
auto it = option.parameters.find(paramDesc.name);
if (paramDesc.constant.index() == 0) {
// Alternative 0 of the variant: float parameter
const EffectConstant<float>& constant = std::get<0>(paramDesc.constant);
float value = constant.defaultValue;
if (it != option.parameters.end()) {
value = it->second;
if (value < constant.minValue || value > constant.maxValue) {
Logger::Get().Error(fmt::format("参数 {} 的值非法", paramDesc.name));
return false;
}
}
pCurParam->floatVal = value;
} else {
// Alternative 1 of the variant: int parameter; the user value is rounded
const EffectConstant<int>& constant = std::get<1>(paramDesc.constant);
int value = constant.defaultValue;
if (it != option.parameters.end()) {
value = (int)std::lroundf(it->second);
if ((value < constant.minValue) || (value > constant.maxValue)) {
Logger::Get().Error(StrHelper::Concat("参数 ", paramDesc.name, " 的值非法"));
return false;
}
}
pCurParam->intVal = value;
}
++pCurParam;
}
}
if (_constantBuffer) {
// Update the existing buffer
deviceResources.GetD3DDC()->UpdateSubresource1(
_constantBuffer.get(), 0, nullptr, constants.data(), 0, 0, D3D11_COPY_DISCARD);
} else {
// Create the buffer
D3D11_BUFFER_DESC bd{
.ByteWidth = 4 * (UINT)constants.size(),
.Usage = D3D11_USAGE_DEFAULT,
.BindFlags = D3D11_BIND_CONSTANT_BUFFER
};
D3D11_SUBRESOURCE_DATA initData{ .pSysMem = constants.data() };
HRESULT hr = deviceResources.GetD3DDevice()->CreateBuffer(&bd, &initData, _constantBuffer.put());
if (FAILED(hr)) {
Logger::Get().ComError("CreateBuffer 失败", hr);
return false;
}
}
return true;
}
void EffectDrawer::_DrawPass(uint32_t i) const noexcept {
_d3dDC->CSSetShader(_shaders[i].get(), nullptr, 0);
_d3dDC->CSSetShaderResources(0, (UINT)_srvs[i].size(), _srvs[i].data());
UINT uavCount = (UINT)_uavs[i].size() / 2;
_d3dDC->CSSetUnorderedAccessViews(0, uavCount, _uavs[i].data(), nullptr);
_d3dDC->Dispatch(_dispatches[i].first, _dispatches[i].second, 1);
_d3dDC->CSSetUnorderedAccessViews(0, uavCount, _uavs[i].data() + uavCount, nullptr);
}
// A texture is read-only if it is the effect input (index 0) or if it has a source,
// i.e. it was loaded from a file.
static bool IsReadonlyTexture(const EffectDesc& desc, uint32_t texture) noexcept {
    if (texture == 0) {
        return true;
    }

    return !desc.textures[texture].source.empty();
}
// Computes which passes have to be rendered again in order to export the output of
// pass `passIdx`.
//
// After a full draw, a texture read by pass `passIdx` may have been overwritten by a
// later pass. In that case the pass that produced it before `passIdx` has to run
// again, which may in turn bring in further dependencies. Read-only textures (the
// input and textures loaded from a file) are never a dependency.
//
// Returns the pass indices in ascending order; `passIdx` itself is always included.
SmallVector<uint32_t> EffectDrawer::_CalcPassesToDrawForExport(
const EffectDesc& desc,
uint32_t passIdx
) const noexcept {
SmallVector<uint32_t> passesToDraw;
passesToDraw.push_back(passIdx);
if (passIdx == 0) {
return passesToDraw;
}
const std::vector<EffectPassDesc>& passes = desc.passes;
const uint32_t end = (uint32_t)passes.size() - 1;
// Records the input textures this pass depends on, as (passIdx, texture) pairs
SmallVector<std::pair<uint32_t, uint32_t>, 0> depTextures;
for (uint32_t input : passes[passIdx].inputs) {
if (!IsReadonlyTexture(desc, input)) {
depTextures.emplace_back(passIdx, input);
}
}
while (!depTextures.empty()) {
const auto [curPass, curTexture] = depTextures.pop_back_val();
// Check whether curTexture is modified by a later pass.
// Note that the last pass (index `end`) is not examined by this loop.
{
bool isOverwritten = false;
for (uint32_t i = curPass + 1; i < end; ++i) {
const SmallVector<uint32_t>& curOutputs = passes[i].outputs;
if (std::find(curOutputs.begin(), curOutputs.end(), curTexture) != curOutputs.end()) {
isOverwritten = true;
break;
}
}
if (!isOverwritten) {
continue;
}
}
// The closest earlier pass that outputs curTexture has to be rendered again,
// which brings in new dependencies
for (int i = (int)curPass - 1; i >= 0; --i) {
const SmallVector<uint32_t>& curOutputs = passes[i].outputs;
if (std::find(curOutputs.begin(), curOutputs.end(), curTexture) != curOutputs.end()) {
const uint32_t ui = (uint32_t)i;
if (std::find(passesToDraw.begin(), passesToDraw.end(), ui) == passesToDraw.end()) {
passesToDraw.push_back(ui);
// As an optimization, return early once all passes up to passIdx have to be
// rendered again; the list is rewritten as 0..passIdx, which is already sorted
if ((uint32_t)passesToDraw.size() == passIdx + 1) {
for (uint32_t j = 0; j <= passIdx; ++j) {
passesToDraw[j] = j;
}
return passesToDraw;
}
for (uint32_t input : passes[ui].inputs) {
if (!IsReadonlyTexture(desc, input)) {
depTextures.emplace_back(ui, input);
}
}
}
break;
}
}
}
std::sort(passesToDraw.begin(), passesToDraw.end());
return passesToDraw;
}
void EffectDrawer::_PrepareForDraw() const noexcept {
{
ID3D11Buffer* t = _constantBuffer.get();
_d3dDC->CSSetConstantBuffers(0, 1, &t);
}
_d3dDC->CSSetSamplers(0, (UINT)_samplers.size(), _samplers.data());
}
}

View file

@ -1,99 +0,0 @@
#pragma once
#include "EffectDesc.h"
#include "SmallVector.h"
// The muparser package from Conan is built without UNICODE support, so _UNICODE is
// undefined while its header is included and restored afterwards.
#pragma push_macro("_UNICODE")
#undef _UNICODE
// C4310 (cast truncates constant value) is triggered inside the header; silence it
// for the include only. The push below must be matched by a pop, otherwise the
// warning stays disabled for every file that includes this header.
#pragma warning(push)
#pragma warning(disable: 4310)
#include <muParser.h>
#pragma warning(pop)
#pragma pop_macro("_UNICODE")
namespace Magpie {
struct EffectOption;
class DeviceResources;
class BackendDescriptorStore;
class EffectsProfiler;
// Owns the D3D11 resources of one effect and runs its compute passes.
class EffectDrawer {
public:
EffectDrawer() = default;
EffectDrawer(const EffectDrawer&) = delete;
EffectDrawer(EffectDrawer&&) = default;
~EffectDrawer();
// Creates all resources of the effect. *inOutTexture is the input texture on entry
// and is replaced by the output texture of this effect. Returns false on failure.
bool Initialize(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
BackendDescriptorStore& descriptorStore,
ID3D11Texture2D** inOutTexture
) noexcept;
// Runs all passes and notifies the profiler after each one.
void Draw(EffectsProfiler& profiler) const noexcept;
// Runs only the passes required to reproduce the output of pass `passIdx`.
void DrawForExport(const EffectDesc& desc, uint32_t passIdx) const noexcept;
// Adapts the textures to a new input texture/size. *inOutTexture is the input
// texture on entry and is replaced by the output texture. Returns false on failure.
bool ResizeTextures(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
ID3D11Texture2D** inOutTexture
) noexcept;
// Texture [1] is the output of the effect. Only valid after Initialize.
ID3D11Texture2D* GetOutputTexture() const noexcept {
return _textures[1].get();
}
// Returns the texture at `idx` ([0] is the input, [1] the output). No bounds check.
ID3D11Texture2D* GetTexture(uint32_t idx) const noexcept {
return _textures[idx].get();
}
private:
// Computes the output size and defines the size constants on _exprParser.
SIZE _CalcOutputSize(
const EffectDesc& desc,
const EffectOption& option,
SIZE inputSize
) const noexcept;
// Refreshes the SRVs, UAVs and dispatch sizes of all passes.
bool _UpdatePassResources(const EffectDesc& desc) noexcept;
// Creates or updates the constant buffer.
bool _UpdateConstants(
const EffectDesc& desc,
const EffectOption& option,
DeviceResources& deviceResources,
SIZE inputSize,
SIZE outputSize
) noexcept;
// Binds the constant buffer and the samplers.
void _PrepareForDraw() const noexcept;
// Binds the resources of pass `i` and dispatches it.
void _DrawPass(uint32_t i) const noexcept;
// Determines the passes that must run again to export the output of `passIdx`.
SmallVector<uint32_t> _CalcPassesToDrawForExport(
const EffectDesc& desc,
uint32_t passIdx
) const noexcept;
// Not owned; set in Initialize
ID3D11DeviceContext* _d3dDC = nullptr;
// Not owned; set in Initialize
BackendDescriptorStore* _descriptorStore = nullptr;
SmallVector<ID3D11SamplerState*> _samplers;
// [0] is the input, [1] is the output, the rest are the other textures of the effect
SmallVector<winrt::com_ptr<ID3D11Texture2D>> _textures;
// SRVs of the inputs, per pass
std::vector<SmallVector<ID3D11ShaderResourceView*>> _srvs;
// UAVs of the outputs, per pass. The second half of each list is null and is used for unbinding
std::vector<SmallVector<ID3D11UnorderedAccessView*>> _uavs;
winrt::com_ptr<ID3D11Buffer> _constantBuffer;
SmallVector<winrt::com_ptr<ID3D11ComputeShader>> _shaders;
// Number of thread groups in x and y, per pass
SmallVector<std::pair<uint32_t, uint32_t>> _dispatches;
// Expression parser shared by all EffectDrawer instances (static). It is modified
// even from const member functions.
static inline mu::Parser _exprParser;
};
}

View file

@ -0,0 +1,40 @@
#pragma once
namespace Magpie {
class D3D12Context;
struct EffectOption;
struct EffectInfo;
class ComputeContext;
// State reported by EffectDrawerBase::Update.
// NOTE(review): presumably NotReady means that initialization is still in progress;
// confirm against the implementations.
enum class EffectDrawerState {
NotReady,
Ready,
Error
};
// Abstract interface of a drawer that renders one effect.
// Instances can be neither copied nor moved.
class EffectDrawerBase {
public:
EffectDrawerBase() = default;
EffectDrawerBase(const EffectDrawerBase&) = delete;
EffectDrawerBase(EffectDrawerBase&&) = delete;
virtual ~EffectDrawerBase() noexcept = default;
// Starts initializing the drawer for the given effect option and returns a pointer
// to information about the effect. EffectsDrawer::Initialize treats a null return
// value as failure.
virtual const EffectInfo* Initialize(
D3D12Context& d3d12Context,
const EffectOption& effectOption
) noexcept = 0;
// Tells the drawer the input size, output size and color information to work with.
virtual void Bind(SizeU inputSize, SizeU outputSize, const ColorInfo& colorInfo) noexcept = 0;
// Reports the current state through `state` and a message through `message`.
// NOTE(review): when `message` is filled in is not visible here; confirm.
virtual HRESULT Update(EffectDrawerState& state, std::string& message) noexcept = 0;
// Records the effect into the compute context.
// NOTE(review): the offsets presumably refer to the descriptor heap of the
// D3D12Context; confirm.
virtual HRESULT Draw(
ComputeContext& computeContext,
uint32_t inputSrvOffset,
uint32_t outputUavOffset
) noexcept = 0;
};
}

View file

@ -1,47 +0,0 @@
#pragma once
#include <cstdint>
#include <dxgi.h>
namespace Magpie {

// Static tables and small helper types shared by the effect code.
struct EffectHelper {
    // Properties of one intermediate texture format.
    struct EffectIntermediateTextureFormatDesc {
        // Name of the format
        const char* name;
        // Corresponding DXGI format
        DXGI_FORMAT dxgiFormat;
        // Number of channels
        uint32_t nChannel;
        // Texel type names for reading (SRV) and writing (UAV).
        // NOTE(review): presumably used when generating shader source - confirm.
        const char* srvTexelType;
        const char* uavTexelType;
    };

    // Indexed with the numeric value of EffectIntermediateTextureFormat, so the order
    // of the entries must not be changed.
    static constexpr EffectIntermediateTextureFormatDesc FORMAT_DESCS[] = {
        { "R32G32B32A32_FLOAT", DXGI_FORMAT_R32G32B32A32_FLOAT, 4, "float4", "float4" },
        { "R16G16B16A16_FLOAT", DXGI_FORMAT_R16G16B16A16_FLOAT, 4, "MF4", "MF4" },
        { "R16G16B16A16_UNORM", DXGI_FORMAT_R16G16B16A16_UNORM, 4, "MF4", "unorm MF4" },
        { "R16G16B16A16_SNORM", DXGI_FORMAT_R16G16B16A16_SNORM, 4, "MF4", "snorm MF4" },
        { "R32G32_FLOAT", DXGI_FORMAT_R32G32_FLOAT, 2, "float2", "float2" },
        { "R10G10B10A2_UNORM", DXGI_FORMAT_R10G10B10A2_UNORM, 4, "MF4", "unorm MF4" },
        { "R11G11B10_FLOAT", DXGI_FORMAT_R11G11B10_FLOAT, 3, "MF3", "MF3" },
        { "R8G8B8A8_UNORM", DXGI_FORMAT_R8G8B8A8_UNORM, 4, "MF4", "unorm MF4" },
        { "R8G8B8A8_SNORM", DXGI_FORMAT_R8G8B8A8_SNORM, 4, "MF4", "snorm MF4" },
        { "R16G16_FLOAT", DXGI_FORMAT_R16G16_FLOAT, 2, "MF2", "MF2" },
        { "R16G16_UNORM", DXGI_FORMAT_R16G16_UNORM, 2, "MF2", "unorm MF2" },
        { "R16G16_SNORM", DXGI_FORMAT_R16G16_SNORM, 2, "MF2", "snorm MF2" },
        { "R32_FLOAT", DXGI_FORMAT_R32_FLOAT, 1, "float", "float" },
        { "R8G8_UNORM", DXGI_FORMAT_R8G8_UNORM, 2, "MF2", "unorm MF2" },
        { "R8G8_SNORM", DXGI_FORMAT_R8G8_SNORM, 2, "MF2", "snorm MF2" },
        { "R16_FLOAT", DXGI_FORMAT_R16_FLOAT, 1, "MF", "MF" },
        { "R16_UNORM", DXGI_FORMAT_R16_UNORM, 1, "MF", "unorm MF" },
        { "R16_SNORM", DXGI_FORMAT_R16_SNORM, 1, "MF", "snorm MF" },
        { "R8_UNORM", DXGI_FORMAT_R8_UNORM, 1, "MF", "unorm MF" },
        { "R8_SNORM", DXGI_FORMAT_R8_SNORM, 1, "MF", "snorm MF" },
        { "UNKNOWN", DXGI_FORMAT_UNKNOWN, 4, "float4", "float4" }
    };

    // One 32-bit slot of a constant buffer, viewed as float, unsigned or signed integer.
    union Constant32 {
        float floatVal;
        uint32_t uintVal;
        int intVal;
    };
};

}

View file

@ -0,0 +1,437 @@
#include "pch.h"
#include "EffectsDrawer.h"
#include "CatmullRomDrawer.h"
#include "CommandContext.h"
#include "D3D12Context.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include "ShaderEffectDrawer.h"
#include "EffectInfo.h"
#include "DescriptorHeap.h"
namespace Magpie {
// In debug builds, returns the descriptors allocated by this object to the descriptor
// heap. In other builds the destructor does nothing.
EffectsDrawer::~EffectsDrawer() noexcept {
#ifdef _DEBUG
    // The maximum uint32_t value marks "no descriptors allocated".
    const bool hasDescriptors = _descriptorBaseOffset != std::numeric_limits<uint32_t>::max();
    if (hasDescriptors) {
        _d3d12Context->GetDescriptorHeap().Free(_descriptorBaseOffset, _CalcDescriptorCount());
    }
#endif
}
// Computes the output size of an effect.
//
// scaleFactor:  when non-zero, the output is the input size multiplied by this factor
//               and the scaling options are ignored.
// inputSize:    size of the effect input.
// rendererSize: size used for Fill and as the bounding size for Fit.
// effectOption: supplies the scaling type and the scale values.
static SizeU CalcOutputSize(
    uint32_t scaleFactor,
    SizeU inputSize,
    SizeU rendererSize,
    const EffectOption& effectOption
) noexcept {
    if (scaleFactor != 0) {
        return SizeU{ inputSize.width * scaleFactor, inputSize.height * scaleFactor };
    }

    // From here on the effect supports free scaling.
    const auto scalingType = effectOption.scalingType;
    const auto& scale = effectOption.scale;

    if (scalingType == ScalingType::Normal) {
        // Scale factors relative to the input size
        return SizeU{
            (uint32_t)std::lround(inputSize.width * scale.first),
            (uint32_t)std::lround(inputSize.height * scale.second)
        };
    }

    if (scalingType == ScalingType::Absolute) {
        // The scale values are the output size in pixels
        return SizeU{
            (uint32_t)std::lround(scale.first),
            (uint32_t)std::lround(scale.second)
        };
    }

    if (scalingType == ScalingType::Fit) {
        // In windowed-mode scaling, a Fit with a scale factor of 1 is treated as Fill.
        // The scaling is guaranteed to be proportional in that case, but rounding can
        // introduce an error of one pixel. Consider a window 100 wide and 50 high: when
        // the width is resized to 101 the height rounds to 51, and when the width is
        // then resized to 102 the height is still 51. Computing the size the Fit way
        // would leave black bars in one of these two resizes.
        const bool treatFitAsFill = ScalingWindow::Get().Options().IsWindowedMode() &&
            IsApprox(scale.first, 1.0f) &&
            IsApprox(scale.second, 1.0f);

        if (!treatFitAsFill) {
            // Largest proportional scale that fits into the renderer
            const float fitScale = std::min(
                float(rendererSize.width) / inputSize.width,
                float(rendererSize.height) / inputSize.height
            );
            return SizeU{
                (uint32_t)std::lround(inputSize.width * fitScale * scale.first),
                (uint32_t)std::lround(inputSize.height * fitScale * scale.second)
            };
        }
    }

    // Fill, or Fit treated as Fill.
    assert(scalingType == ScalingType::Fit ||
        scalingType == ScalingType::Fill);
    return rendererSize;
}
// Initializes the effect chain.
//  d3d12Context - D3D12 context; its address is stored for later use
//  colorInfo    - color information forwarded to the effects when binding
//  inputSize    - size of the source image fed to the first effect
//  rendererSize - size of the render area
//  outputSize   - [out] final output size computed by _UpdateEffectBindings
// Returns false (after logging) as soon as any step fails; outputSize is only written
// once every effect drawer has been initialized.
bool EffectsDrawer::Initialize(
D3D12Context& d3d12Context,
const ColorInfo& colorInfo,
SizeU inputSize,
SizeU rendererSize,
SizeU& outputSize
) noexcept {
_d3d12Context = &d3d12Context;
_colorInfo = colorInfo;
_inputSize = inputSize;
_rendererSize = rendererSize;
ID3D12Device5* device = d3d12Context.GetDevice();
const ScalingOptions& options = ScalingWindow::Get().Options();
uint32_t effectCount = (uint32_t)options.effects.size();
_effectDatas.resize(effectCount);
// Effect initialization may be asynchronous, so start it as early as possible
for (uint32_t i = 0; i < effectCount; ++i) {
_EffectData& effectData = _effectDatas[i];
effectData.drawer = std::make_unique<ShaderEffectDrawer>();
effectData.effectInfo = effectData.drawer->Initialize(d3d12Context, options.effects[i]);
if (!effectData.effectInfo) {
Logger::Get().Error("ShaderEffectDrawer::Initialize 失败");
return false;
}
}
// Computes every effect's output size and the final _outputSize
_UpdateEffectBindings();
outputSize = _outputSize;
// Create the effects' input/output textures
if (uint32_t descriptorCount = _CalcDescriptorCount()) {
auto& descriptorHeap = _d3d12Context->GetDescriptorHeap();
HRESULT hr = descriptorHeap.Alloc(descriptorCount, _descriptorBaseOffset);
if (FAILED(hr)) {
Logger::Get().ComError("DescriptorHeap::Alloc 失败", hr);
return false;
}
CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(descriptorHeap.GetCpuHandle(_descriptorBaseOffset));
const uint32_t descriptorSize = descriptorHeap.GetDescriptorSize();
CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_DEFAULT);
D3D12_HEAP_FLAGS heapFlags = _d3d12Context->IsHeapFlagCreateNotZeroedSupported() ?
D3D12_HEAP_FLAG_CREATE_NOT_ZEROED : D3D12_HEAP_FLAG_NONE;
// Standard dynamic range uses a 10-bit UNORM format, everything else FP16
bool isSrgb = colorInfo.kind == winrt::AdvancedColorKind::StandardDynamicRange;
// Width/height are placeholders here; they are filled in per effect below
CD3DX12_RESOURCE_DESC texDesc = CD3DX12_RESOURCE_DESC::Tex2D(
isSrgb ? DXGI_FORMAT_R10G10B10A2_UNORM : DXGI_FORMAT_R16G16B16A16_FLOAT,
0, 0, 1, 1, 1, 0,
D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
);
CD3DX12_SHADER_RESOURCE_VIEW_DESC srvDesc =
CD3DX12_SHADER_RESOURCE_VIEW_DESC::Tex2D(texDesc.Format, 1);
CD3DX12_UNORDERED_ACCESS_VIEW_DESC uavDesc =
CD3DX12_UNORDERED_ACCESS_VIEW_DESC::Tex2D(texDesc.Format);
for (uint32_t i = 0; i < effectCount; ++i) {
auto& effectData = _effectDatas[i];
// If no downscale is needed, the last effect writes straight into the ring buffer
// and does not need an output texture
if (i == effectCount - 1 && effectData.outputSize == _outputSize) {
break;
}
texDesc.Width = effectData.outputSize.width;
texDesc.Height = effectData.outputSize.height;
// Textures start in the shader-resource state; Draw transitions them as needed
hr = device->CreateCommittedResource(
&heapProps, heapFlags, &texDesc, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
nullptr, IID_PPV_ARGS(&effectData.outputTexture));
if (FAILED(hr)) {
Logger::Get().ComError("CreateCommittedResource 失败", hr);
return false;
}
// Two consecutive descriptors per effect: the SRV first, then the UAV
device->CreateShaderResourceView(effectData.outputTexture.get(), &srvDesc, cpuHandle);
cpuHandle.Offset(descriptorSize);
device->CreateUnorderedAccessView(
effectData.outputTexture.get(), nullptr, &uavDesc, cpuHandle);
cpuHandle.Offset(descriptorSize);
}
}
// CatmullRomDrawer creates its PSO on demand at render time, so initializing it is free
_catmullRomDrawer.Initialize(d3d12Context);
{
// Two timestamps per frame
const uint32_t timestampCount = 2 * ScalingWindow::Get().Options().maxProducerInFlightFrames;
D3D12_QUERY_HEAP_DESC queryHeapDesc = {
.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP,
.Count = timestampCount
};
HRESULT hr = device->CreateQueryHeap(&queryHeapDesc, IID_PPV_ARGS(&_queryHeap));
if (FAILED(hr)) {
Logger::Get().ComError("CreateQueryHeap 失败", hr);
return false;
}
// Readback buffer with room for one UINT64 per timestamp
CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_READBACK);
CD3DX12_RESOURCE_DESC bufferDesc =
CD3DX12_RESOURCE_DESC::Buffer(timestampCount * sizeof(UINT64));
hr = device->CreateCommittedResource(
&heapProps,
D3D12_HEAP_FLAG_NONE,
&bufferDesc,
D3D12_RESOURCE_STATE_COPY_DEST,
nullptr,
IID_PPV_ARGS(&_queryResultBuffer)
);
if (FAILED(hr)) {
Logger::Get().ComError("CreateCommittedResource 失败", hr);
return false;
}
hr = d3d12Context.GetCommandQueue()->GetTimestampFrequency(&_timestampFrequency);
if (FAILED(hr)) {
Logger::Get().ComError("ID3D12CommandQueue::GetTimestampFrequency 失败", hr);
return false;
}
}
return true;
}
// Records the whole effect chain into computeContext.
//  inputSrvOffset  - descriptor offset of the SRV for the source image
//  outputUavOffset - descriptor offset of the UAV for the final output
// Effects whose drawer is not in the Ready state are skipped; each run of consecutive
// skipped effects is replaced by a single CatmullRom pass. Intermediate textures use the
// SRV|UAV descriptor pairs starting at _descriptorBaseOffset: effect i reads its output
// through offset 2*i and writes it through offset 2*i+1.
// Returns the failing HRESULT from a drawer's Update/Draw, otherwise S_OK.
// The frameIndex/inputResource/outputResource parameters are currently unused.
HRESULT EffectsDrawer::Draw(
ComputeContext& computeContext,
uint32_t /*frameIndex*/,
ID3D12Resource* /*inputResource*/,
ID3D12Resource* /*outputResource*/,
uint32_t inputSrvOffset,
uint32_t outputUavOffset
) noexcept {
// Fetch the render time (timestamp readback is currently disabled)
// const uint32_t queryHeapIndex = 2 * frameIndex;
// {
// CD3DX12_RANGE range(queryHeapIndex * sizeof(UINT64), (queryHeapIndex + 2) * sizeof(UINT64));
// void* pData;
// HRESULT hr = _queryResultBuffer->Map(0, nullptr, &pData);
// if (FAILED(hr)) {
// Logger::Get().ComError("ID3D12Resource::Map 失败", hr);
// return hr;
// }
// UINT64* timestampes = (UINT64*)pData + queryHeapIndex;
// range = {};
// _queryResultBuffer->Unmap(0, &range);
// }
//commandList->EndQuery(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex);
const uint32_t effectCount = (uint32_t)_effectDatas.size();
// If several consecutive effects cannot be rendered, merge them into one CatmullRom pass.
// max() means "no pending run"; otherwise this is the index of the run's first effect.
uint32_t catmullRomStartIdx = std::numeric_limits<uint32_t>::max();
for (uint32_t effectIdx = 0; effectIdx < effectCount; ++effectIdx) {
EffectDrawerState state;
std::string msg;
HRESULT hr = _effectDatas[effectIdx].drawer->Update(state, msg);
if (FAILED(hr)) {
Logger::Get().ComError("ShaderEffectDrawer::Update 失败", hr);
return hr;
}
if (state != EffectDrawerState::Ready) {
// Not drawable: open a run if none is pending, then move on
if (catmullRomStartIdx == std::numeric_limits<uint32_t>::max()) {
catmullRomStartIdx = effectIdx;
}
continue;
}
// This effect is drawable. Flush the pending run first: one CatmullRom pass stands in
// for effects [catmullRomStartIdx, effectIdx - 1] and writes into the output texture
// of effect effectIdx - 1.
if (catmullRomStartIdx != std::numeric_limits<uint32_t>::max()) {
SizeU inputSize;
uint32_t inputSrv;
if (catmullRomStartIdx == 0) {
// The run starts the chain, so it reads the source image
inputSize = _inputSize;
inputSrv = inputSrvOffset;
} else {
// The run reads the output of the last effect that was actually drawn
uint32_t prevIdx = catmullRomStartIdx - 1;
inputSize = _effectDatas[prevIdx].outputSize;
inputSrv = _descriptorBaseOffset + prevIdx * 2;
computeContext.InsertTransitionBarrier(
_effectDatas[prevIdx].outputTexture.get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE
);
}
computeContext.InsertTransitionBarrier(
_effectDatas[size_t(effectIdx - 1)].outputTexture.get(),
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS
);
// effectIdx * 2 - 1 == (effectIdx - 1) * 2 + 1: the UAV of effect effectIdx - 1
_catmullRomDrawer.Draw(
computeContext,
inputSize,
_effectDatas[size_t(effectIdx - 1)].outputSize,
inputSrv,
_descriptorBaseOffset + effectIdx * 2 - 1,
false
);
catmullRomStartIdx = std::numeric_limits<uint32_t>::max();
}
// The last effect may write straight to the final output when no resize is needed
bool writeToRingBuffer = effectIdx == effectCount - 1 &&
_effectDatas[effectIdx].outputSize == _outputSize;
if (effectIdx != 0) {
// The previous output was just written (by an effect or the CatmullRom pass above);
// make it readable
computeContext.InsertTransitionBarrier(
_effectDatas[size_t(effectIdx - 1)].outputTexture.get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE
);
}
if (!writeToRingBuffer) {
// Make this effect's own output texture writable
computeContext.InsertTransitionBarrier(
_effectDatas[effectIdx].outputTexture.get(),
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS
);
}
hr = _effectDatas[effectIdx].drawer->Draw(
computeContext,
effectIdx == 0 ? inputSrvOffset : _descriptorBaseOffset + (effectIdx - 1) * 2,
writeToRingBuffer ? outputUavOffset : _descriptorBaseOffset + effectIdx * 2 + 1
);
if (FAILED(hr)) {
Logger::Get().ComError("EffectDrawerBase::Draw 失败", hr);
return hr;
}
}
if (catmullRomStartIdx != std::numeric_limits<uint32_t>::max()) {
// The chain ended inside a run of non-drawable effects: one CatmullRom pass
// scales directly to the final output
SizeU inputSize;
uint32_t inputSrv;
if (catmullRomStartIdx == 0) {
inputSize = _inputSize;
inputSrv = inputSrvOffset;
} else {
uint32_t prevIdx = catmullRomStartIdx - 1;
inputSize = _effectDatas[prevIdx].outputSize;
inputSrv = _descriptorBaseOffset + prevIdx * 2;
computeContext.InsertTransitionBarrier(
_effectDatas[prevIdx].outputTexture.get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE
);
}
_catmullRomDrawer.Draw(
computeContext, inputSize, _outputSize, inputSrv, outputUavOffset, false);
} else if (_effectDatas.back().outputSize != _outputSize) {
// The last effect drew into its own texture because its size differs from the final
// output; a CatmullRom pass resizes it into the final output
computeContext.InsertTransitionBarrier(
_effectDatas.back().outputTexture.get(),
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE
);
_catmullRomDrawer.Draw(
computeContext,
_effectDatas.back().outputSize,
_outputSize,
_descriptorBaseOffset + (effectCount - 1) * 2,
outputUavOffset,
false
);
}
// commandList->EndQuery(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex + 1);
// commandList->ResolveQueryData(_queryHeap.get(), D3D12_QUERY_TYPE_TIMESTAMP, queryHeapIndex, 2,
// _queryResultBuffer.get(), queryHeapIndex * sizeof(UINT64));
return S_OK;
}
// Stores the new render-area size, recomputes every effect's sizes and bindings, and
// reports the resulting final output size through outputSize.
// NOTE(review): the intermediate textures created in Initialize are not recreated here —
// confirm whether that is handled elsewhere.
void EffectsDrawer::OnResized(SizeU rendererSize, SizeU& outputSize) noexcept {
_rendererSize = rendererSize;
_UpdateEffectBindings();
outputSize = _outputSize;
}
// Stores the new color information and rebinds every effect so it receives it.
void EffectsDrawer::OnColorInfoChanged(const ColorInfo& colorInfo) noexcept {
_colorInfo = colorInfo;
_UpdateEffectBindings();
}
uint32_t EffectsDrawer::_CalcDescriptorCount() const noexcept {
// 如果最后一个效果的缩放类型是 Fit 或 Fill 且缩放比例不大于 1那么始终可以直接写入环形缓冲区
// 需要的描述符数量可以减少两个。
// 还有更复杂的情况,如倒数第二个效果是 Fit(0.5,0.5),最后一个效果放大一倍,也可以认为输出尺寸
// 永远不会大于 rendererSize不过这较为复杂还有舍入的问题安全起见不进行优化。
uint32_t count = (uint32_t)_effectDatas.size() * 2;
if (_effectDatas.back().effectInfo->scaleFactor != 0) {
return count;
}
const EffectOption& effectOption = ScalingWindow::Get().Options().effects.back();
if ((effectOption.scalingType == ScalingType::Fit || effectOption.scalingType == ScalingType::Fill) &&
effectOption.scale.first < 1 + FLOAT_EPSILON<float> &&
effectOption.scale.second < 1 + FLOAT_EPSILON<float>)
{
return count - 2;
} else {
return count;
}
}
void EffectsDrawer::_UpdateEffectBindings() noexcept {
const ScalingOptions& options = ScalingWindow::Get().Options();
_outputSize = _inputSize;
for (uint32_t i = 0; i < _effectDatas.size(); ++i) {
_EffectData& effectData = _effectDatas[i];
const EffectOption& effectOption = options.effects[i];
// outputSize 是前一个效果的输出尺寸,即当前效果的输入尺寸
effectData.outputSize = CalcOutputSize(
effectData.effectInfo->scaleFactor, _outputSize, _rendererSize, effectOption);
effectData.drawer->Bind(_outputSize, effectData.outputSize, _colorInfo);
_outputSize = effectData.outputSize;
}
// 如果输出尺寸比渲染区域更大则使用 CatmullRom 等比缩小,窗口模式缩放下可能要放大
if (_outputSize != _rendererSize) {
if (options.IsWindowedMode()) {
// 窗口模式缩放已确保等比例,这里直接赋值以避免舍入误差
_outputSize = _rendererSize;
} else if (_outputSize.width > _rendererSize.width ||
_outputSize.height > _rendererSize.height)
{
float scaleX = float(_rendererSize.width) / _outputSize.width;
float scaleY = float(_rendererSize.height) / _outputSize.height;
if (scaleX <= scaleY) {
_outputSize.width = _rendererSize.width;
_outputSize.height = std::lround(_outputSize.height * scaleX);
} else {
_outputSize.width = std::lround(_outputSize.width * scaleY);
_outputSize.height = _rendererSize.height;
}
}
}
}
}

View file

@ -0,0 +1,74 @@
#pragma once
#include "CatmullRomDrawer.h"
#include "SmallVector.h"
namespace Magpie {
class ComputeContext;
class EffectDrawerBase;
struct EffectInfo;
// Owns the chain of effect drawers together with the intermediate textures between them,
// and records the whole chain into a compute context each frame.
class EffectsDrawer {
public:
EffectsDrawer() noexcept = default;
// Neither copyable nor movable
EffectsDrawer(const EffectsDrawer&) = delete;
EffectsDrawer(EffectsDrawer&&) = delete;
~EffectsDrawer() noexcept;
// Creates the effect drawers and GPU resources; outputSize receives the final output
// size. Returns false on failure.
bool Initialize(
D3D12Context& d3d12Context,
const ColorInfo& colorInfo,
SizeU inputSize,
SizeU rendererSize,
SizeU& outputSize
) noexcept;
// Records all effects. inputSrvOffset/outputUavOffset are descriptor offsets of the
// source SRV and the final destination UAV.
HRESULT Draw(
ComputeContext& computeContext,
uint32_t frameIndex,
ID3D12Resource* inputResource,
ID3D12Resource* outputResource,
uint32_t inputSrvOffset,
uint32_t outputUavOffset
) noexcept;
// Final output size of the chain as last computed by _UpdateEffectBindings
SizeU GetOutputSize() const noexcept {
return _outputSize;
}
// Recomputes sizes for a new render area; outputSize receives the new final size
void OnResized(SizeU rendererSize, SizeU& outputSize) noexcept;
// Rebinds all effects with the new color information
void OnColorInfoChanged(const ColorInfo& colorInfo) noexcept;
private:
// Number of descriptors needed for the intermediate textures
uint32_t _CalcDescriptorCount() const noexcept;
// Recomputes per-effect output sizes, rebinds the drawers and updates _outputSize
void _UpdateEffectBindings() noexcept;
D3D12Context* _d3d12Context = nullptr;
SizeU _inputSize{};
SizeU _outputSize{};
SizeU _rendererSize{};
ColorInfo _colorInfo;
// Per-effect state
struct _EffectData {
std::unique_ptr<EffectDrawerBase> drawer;
// Set from the drawer's Initialize; null means initialization failed
const EffectInfo* effectInfo = nullptr;
SizeU outputSize{};
// Left empty for a last effect that writes directly to the final output
winrt::com_ptr<ID3D12Resource> outputTexture;
};
SmallVector<_EffectData> _effectDatas;
CatmullRomDrawer _catmullRomDrawer;
// Descriptor layout is SRV|UAV|SRV|UAV|...
// max() means no descriptors have been allocated
uint32_t _descriptorBaseOffset = std::numeric_limits<uint32_t>::max();
// Timestamp query objects created in Initialize
winrt::com_ptr<ID3D12QueryHeap> _queryHeap;
winrt::com_ptr<ID3D12Resource> _queryResultBuffer;
UINT64 _timestampFrequency = 0;
};
}

View file

@ -1,124 +0,0 @@
#include "pch.h"
#include "EffectsProfiler.h"
namespace Magpie {
// Starts profiling: creates the disjoint query, the start timestamp query and one
// timestamp query per pass. Must not be called while already profiling.
void EffectsProfiler::Start(ID3D11Device* d3dDevice, uint32_t passCount) noexcept {
	assert(!IsProfiling() && passCount > 0);

	_passQueries.resize(passCount);

	const D3D11_QUERY_DESC disjointDesc{ .Query = D3D11_QUERY_TIMESTAMP_DISJOINT };
	d3dDevice->CreateQuery(&disjointDesc, _disjointQuery.put());

	const D3D11_QUERY_DESC timestampDesc{ .Query = D3D11_QUERY_TIMESTAMP };
	d3dDevice->CreateQuery(&timestampDesc, _startQuery.put());
	for (size_t idx = 0; idx < _passQueries.size(); ++idx) {
		d3dDevice->CreateQuery(&timestampDesc, _passQueries[idx].put());
	}
}
// Stops profiling by releasing every query object. IsProfiling() reports false
// afterwards because it tests _disjointQuery.
void EffectsProfiler::Stop() noexcept {
_disjointQuery = nullptr;
_startQuery = nullptr;
_passQueries.clear();
}
// Profiling is active exactly while the disjoint query object exists.
bool EffectsProfiler::IsProfiling() const noexcept {
	return static_cast<bool>(_disjointQuery);
}
// Adjusts the number of per-pass timestamp queries while profiling. Shrinking drops the
// surplus queries; growing creates the missing ones. Does nothing when not profiling.
void EffectsProfiler::SetPassCount(ID3D11Device* d3dDevice, uint32_t passCount) noexcept {
	if (!IsProfiling()) {
		return;
	}

	assert(passCount > 0);

	const uint32_t previousCount = (uint32_t)_passQueries.size();
	if (previousCount == passCount) {
		return;
	}

	_passQueries.resize(passCount);

	// Only newly added slots need a query; when shrinking the loop body never runs
	const D3D11_QUERY_DESC timestampDesc{ .Query = D3D11_QUERY_TIMESTAMP };
	for (uint32_t idx = previousCount; idx < passCount; ++idx) {
		d3dDevice->CreateQuery(&timestampDesc, _passQueries[idx].put());
	}
}
// Marks the start of a frame's effects: opens the disjoint query, records the start
// timestamp and resets the pass cursor. No-op when not profiling.
void EffectsProfiler::OnBeginEffects(ID3D11DeviceContext* d3dDC) noexcept {
	if (IsProfiling()) {
		d3dDC->Begin(_disjointQuery.get());
		d3dDC->End(_startQuery.get());
		_curPass = 0;
	}
}
// Records the timestamp for the pass that just finished and advances the pass cursor.
// No-op when not profiling.
void EffectsProfiler::OnEndPass(ID3D11DeviceContext* d3dDC) noexcept {
	if (IsProfiling()) {
		ID3D11Query* passQuery = _passQueries[_curPass].get();
		++_curPass;
		d3dDC->End(passQuery);
	}
}
// Marks the end of a frame's effects by closing the disjoint query. No-op when not profiling.
void EffectsProfiler::OnEndEffects(ID3D11DeviceContext* d3dDC) noexcept {
	if (IsProfiling()) {
		d3dDC->End(_disjointQuery.get());
	}
}
// Spins until the query's result is available and returns it, yielding the rest of the
// time slice (Sleep(0)) between attempts.
template <typename T>
static T GetQueryData(ID3D11DeviceContext* d3dDC, ID3D11Query* query) noexcept {
	T result{};
	for (;;) {
		if (d3dDC->GetData(query, &result, sizeof(result), 0) == S_OK) {
			return result;
		}
		Sleep(0);
	}
}
// Reads back the timestamps of the last frame and stores each pass's duration in
// milliseconds in _timings. Skips the frame when the disjoint query reports the
// timestamps as unreliable. No-op when not profiling.
void EffectsProfiler::QueryTimings(ID3D11DeviceContext* d3dDC) noexcept {
	if (!IsProfiling()) {
		return;
	}

	const D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjointData =
		GetQueryData<D3D11_QUERY_DATA_TIMESTAMP_DISJOINT>(d3dDC, _disjointQuery.get());
	if (disjointData.Disjoint) {
		return;
	}

	// Ticks -> milliseconds
	const float toMS = 1000.0f / disjointData.Frequency;
	uint64_t previousTimestamp = GetQueryData<uint64_t>(d3dDC, _startQuery.get());

	auto lock = _timingsLock.lock_exclusive();
	_timings.resize(_passQueries.size());
	size_t passIdx = 0;
	for (const winrt::com_ptr<ID3D11Query>& passQuery : _passQueries) {
		const uint64_t passTimestamp = GetQueryData<uint64_t>(d3dDC, passQuery.get());
		// Each pass is timed relative to the end of the previous one
		_timings[passIdx++] = (passTimestamp - previousTimestamp) * toMS;
		previousTimestamp = passTimestamp;
	}
}
// Hands the most recent timings to the caller and leaves the stored list empty.
SmallVector<float> EffectsProfiler::GetTimings() noexcept {
	auto lock = _timingsLock.lock_exclusive();

	// _timings is empty when no new frame has been rendered
	SmallVector<float> snapshot = std::move(_timings);
	// Make the moved-from container's state explicit
	_timings.clear();
	return snapshot;
}
}

View file

@ -1,45 +0,0 @@
#pragma once
#include "SmallVector.h"
namespace Magpie {
class DeviceResources;
// Measures per-pass GPU time of the effect chain with D3D11 timestamp queries.
class EffectsProfiler {
public:
EffectsProfiler() = default;
// Neither copyable nor movable
EffectsProfiler(const EffectsProfiler&) = delete;
EffectsProfiler(EffectsProfiler&&) = delete;
// Creates the query objects and begins profiling
void Start(ID3D11Device* d3dDevice, uint32_t passCount) noexcept;
// Releases the query objects and ends profiling
void Stop() noexcept;
bool IsProfiling() const noexcept;
// Resizes the per-pass query list while profiling
void SetPassCount(ID3D11Device* d3dDevice, uint32_t passCount) noexcept;
// Per-frame hooks: begin of the effects, end of each pass, end of the effects
void OnBeginEffects(ID3D11DeviceContext* d3dDC) noexcept;
void OnEndPass(ID3D11DeviceContext* d3dDC) noexcept;
void OnEndEffects(ID3D11DeviceContext* d3dDC) noexcept;
// Reads the query results back and updates the stored timings
void QueryTimings(ID3D11DeviceContext* d3dDC) noexcept;
// Called from the front-end thread
SmallVector<float> GetTimings() noexcept;
private:
// Per-pass durations in milliseconds; guarded by _timingsLock
SmallVector<float> _timings;
wil::srwlock _timingsLock;
// Non-null exactly while profiling
winrt::com_ptr<ID3D11Query> _disjointQuery;
winrt::com_ptr<ID3D11Query> _startQuery;
std::vector<winrt::com_ptr<ID3D11Query>> _passQueries;
// Index of the next pass query to record within the current frame
uint32_t _curPass = 0;
};
}

View file

@ -0,0 +1,796 @@
#include "pch.h"
#include "AppFolderManager.h"
#include "EffectsService.h"
#include "Logger.h"
#include "ShaderEffectParser.h"
#include "StrHelper.h"
#include "Win32Helper.h"
#include "YasHelper.h"
#include <d3dcompiler.h>
#include <dxcapi.h>
#include <rapidhash.h>
namespace Magpie {
static constexpr uint32_t MAX_MEM_CACHE_COUNT = 63;
// 缓存版本。当缓存文件结构有更改时更新它,使旧缓存失效
static constexpr uint32_t EFFECT_CACHE_VERSION = 16;
static void ListEffects(std::vector<std::wstring>& result, std::wstring_view prefix = {}) {
result.reserve(80);
std::filesystem::path effectsDir = AppFolderManager::Get().GetBuiltInShaderEffectsDir();
WIN32_FIND_DATA findData{};
wil::unique_hfind hFind(FindFirstFileEx(
StrHelper::Concat(effectsDir.native(), L"\\", prefix, L"*").c_str(),
FindExInfoBasic, &findData, FindExSearchNameMatch, nullptr, FIND_FIRST_EX_LARGE_FETCH));
if (!hFind) {
Logger::Get().Win32Error("FindFirstFileEx 失败");
return;
}
do {
std::wstring_view fileName(findData.cFileName);
if (fileName == L"." || fileName == L"..") {
continue;
}
std::wstring filePath = StrHelper::Concat(effectsDir.native(), L"\\", prefix, fileName);
if (Win32Helper::DirExists(filePath.c_str())) {
ListEffects(result, StrHelper::Concat(prefix, fileName, L"\\"));
continue;
}
if (!fileName.ends_with(L".hlsl")) {
continue;
}
result.emplace_back(StrHelper::Concat(prefix, fileName.substr(0, fileName.size() - 5)));
} while (FindNextFile(hFind.get(), &findData));
}
// Discovers and parses all built-in effects on a background thread, filling _effects and
// _effectsMap, then publishes completion through _initialized. Effects whose source cannot
// be read or parsed are skipped.
winrt::fire_and_forget EffectsService::Initialize() {
	co_await winrt::resume_background();

	std::vector<std::wstring> effectNames;
	ListEffects(effectNames);

	const uint32_t nEffect = (uint32_t)effectNames.size();
	// Reserve up front so that _effects does not reallocate while entries are added below
	_effectsMap.reserve(nEffect);
	_effects.reserve(nEffect);

	std::filesystem::path effectsDir = AppFolderManager::Get().GetBuiltInShaderEffectsDir();

	{
		// Synchronizes the initialization of _effectsMap and _effects
		wil::srwlock srwLock;

		// Parse the effects in parallel
		Win32Helper::RunParallel([&](uint32_t id) {
			std::wstring fileName = StrHelper::Concat(
				effectsDir.native(), L"\\", effectNames[id], L".hlsl");

			std::string source;
			if (!Win32Helper::ReadTextFile(fileName.c_str(), source)) {
				// Unreadable source: skip it instead of handing the parser an empty string
				return;
			}

			EffectInfo effectInfo;
			std::string errorMsg = ShaderEffectParser::ParseForInfo(
				StrHelper::UTF16ToUTF8(effectNames[id]), std::move(source), effectInfo);
			if (!errorMsg.empty()) {
				return;
			}

			auto lock = srwLock.lock_exclusive();
			uint32_t effectIdx = (uint32_t)_effects.size();
			EffectInfo& movedEffectInfo = _effects.emplace_back(std::move(effectInfo));
			_effectsMap.emplace(movedEffectInfo.name, effectIdx);
		}, nEffect);
	}

	_initialized.store(true, std::memory_order_release);
	// Several threads may be blocked in _WaitForInitialize at the same time; notify_one
	// would wake only one of them and could leave the rest waiting forever.
	_initialized.notify_all();
}
void EffectsService::Uninitialize() {
// 等待解析完成,防止退出时崩溃
_WaitForInitialize();
auto lock = _stopSource->lock.lock_exclusive();
_stopSource->isUninitialized = true;
}
// Returns all parsed built-in effects, blocking until initialization has finished.
const std::vector<EffectInfo>& EffectsService::GetEffects() noexcept {
_WaitForInitialize();
return _effects;
}
// Looks an effect up by name, blocking until initialization has finished.
// Returns nullptr when no effect has that name.
const EffectInfo* EffectsService::GetEffect(std::string_view name) noexcept {
	_WaitForInitialize();

	const auto found = _effectsMap.find(name);
	if (found == _effectsMap.end()) {
		return nullptr;
	}
	return &_effects[found->second];
}
// Builds the cache key, which is also the cache file name, for one effect configuration.
// Backslashes in the effect name are replaced with '#' so the result is a flat file name.
static std::string GetCacheFileName(
	std::string_view effectName,
	D3D_SHADER_MODEL shaderModel,
	ShaderEffectParserFlags flags,
	uint64_t hash
) {
	std::string flatName(effectName);
	std::replace(flatName.begin(), flatName.end(), '\\', '#');

	// Cache file naming: {effect name}_{shader model|2}{flags|4}{hash|16}
	return fmt::format("{}_{:02x}{:04x}{:016x}",
		flatName, (uint8_t)shaderModel, (uint16_t)flags, hash);
}
// Registers (or reuses) an in-memory cache entry for the requested effect configuration
// and, when no reusable entry exists, starts an asynchronous compilation.
//  effectName   - name of a built-in effect
//  inlineParams - when non-null, parameter values to inline; they take part in the hash
//  shaderModel / is*Supported - device capabilities that select the parser flags
//  saveSources, warningsAreErrors, disableCache - forwarded to the compile task
// Returns the cache key identifying the task, or an empty string when the effect is
// unknown or its source cannot be read. Every successful call takes one reference on the
// entry; ReleaseTask drops it.
std::string EffectsService::SubmitCompileShaderEffectTask(
	std::string_view effectName,
	const phmap::flat_hash_map<std::string, float>* inlineParams,
	D3D_SHADER_MODEL shaderModel,
	bool isMinFloat16Supported,
	bool isNative16BitSupported,
	bool isAdvancedColorSupported,
	bool saveSources,
	bool warningsAreErrors,
	bool disableCache
) noexcept {
	_WaitForInitialize();

	std::string cacheKey;

	auto it = _effectsMap.find(effectName);
	if (it == _effectsMap.end()) {
		return cacheKey;
	}
	const EffectInfo& effectInfo = _effects[it->second];

	std::string source;
	{
		std::wstring fileName = StrHelper::Concat(
			AppFolderManager::Get().GetBuiltInShaderEffectsDir().native(),
			L"\\", StrHelper::UTF8ToUTF16(effectName), L".hlsl");
		if (!Win32Helper::ReadTextFile(fileName.c_str(), source)) {
			return cacheKey;
		}
	}

	// Enable only the features that both the effect and the device support.
	// Native 16-bit types take precedence over min-precision floats.
	ShaderEffectParserFlags parserFlags = ShaderEffectParserFlags::None;
	if (bool(effectInfo.flags & EffectFlags::SupportFP16)) {
		if (isNative16BitSupported) {
			parserFlags |= ShaderEffectParserFlags::EnableNative16Bit;
		} else if (isMinFloat16Supported) {
			parserFlags |= ShaderEffectParserFlags::EnableMinFloat16;
		}
	}
	if (bool(effectInfo.flags & EffectFlags::SupportAdvancedColor) && isAdvancedColorSupported) {
		parserFlags |= ShaderEffectParserFlags::EnableAdvancedColor;
	}

	// shaderModel and flags do not take part in the hash; they are encoded separately in
	// the cache key (which is also the cache file name)
	uint64_t hash = rapidhash(source.data(), source.size());
	if (inlineParams) {
		// Parameters missing from inlineParams are hashed too (with their default value).
		// Otherwise "inlining disabled" could not be told apart from "inlining enabled
		// with an empty inlineParams".
		for (const EffectParameterDesc& param : effectInfo.params) {
			float value;
			auto it1 = inlineParams->find(param.name);
			if (it1 != inlineParams->end()) {
				value = it1->second;
			} else {
				value = param.defaultValue;
			}

			// Normalize the value, then keep 4 digits of precision
			long normValue = std::lround((value - param.minValue) /
				(param.maxValue - param.minValue) * 10000);
			hash = phmap::HashState().combine(hash, normValue);
		}
	}

	cacheKey = GetCacheFileName(effectName, shaderModel, parserFlags, hash);

	{
		auto lk = _shaderEffectCacheLock.lock_exclusive();

		auto it1 = _shaderEffectCache.find(cacheKey);
		if (it1 != _shaderEffectCache.end()) {
			_ShaderEffectMemCacheItem& cacheItem = it1->second;
			// With the cache disabled always recompile, unless the same effect is
			// currently in use elsewhere
			if (disableCache && cacheItem.refCount == 0) {
				_shaderEffectCache.erase(it1);
			} else {
				cacheItem.lastAccess = _nextLastAccess++;
				++cacheItem.refCount;
				return cacheKey;
			}
		}

		_shaderEffectCache.emplace(cacheKey, _ShaderEffectMemCacheItem{
			.lastAccess = _nextLastAccess++,
			.refCount = 1
		});

		// Over the limit: evict the older half of the in-memory cache
		if (_shaderEffectCache.size() > MAX_MEM_CACHE_COUNT) {
			// Entries that are still referenced are never evicted, so the cache can grow
			// beyond MAX_MEM_CACHE_COUNT + 1. Size the scratch buffer from the actual
			// entry count; a fixed-size array would be overrun in that case.
			std::vector<uint32_t> allLastAccess;
			allLastAccess.reserve(_shaderEffectCache.size());
			for (const auto& pair : _shaderEffectCache) {
				allLastAccess.push_back(pair.second.lastAccess);
			}

			// The median access stamp separates the older half from the newer half
			auto midIt = allLastAccess.begin() + allLastAccess.size() / 2;
			std::nth_element(allLastAccess.begin(), midIt, allLastAccess.end());
			const uint32_t midLastAccess = *midIt;

			for (it1 = _shaderEffectCache.begin(); it1 != _shaderEffectCache.end();) {
				// Only entries that are not in use may be removed
				if (it1->second.lastAccess < midLastAccess && it1->second.refCount == 0) {
					it1 = _shaderEffectCache.erase(it1);
				} else {
					++it1;
				}
			}
		}
	}

	_CompileShaderEffectAsync(
		std::string(effectName),
		std::move(source),
		inlineParams,
		shaderModel,
		cacheKey,
		(uint32_t)parserFlags,
		saveSources,
		warningsAreErrors,
		disableCache
	);

	return cacheKey;
}
bool EffectsService::GetTaskResult(
const std::string& taskKey,
const ShaderEffectDrawInfo** drawInfo
) noexcept {
auto lk = _shaderEffectCacheLock.lock_shared();
auto it = _shaderEffectCache.find(taskKey);
if (it == _shaderEffectCache.end()) {
// 编译失败
return false;
}
if (it->second.drawInfo.passes.empty()) {
// 尚未编译完成
*drawInfo = nullptr;
return true;
}
*drawInfo = &it->second.drawInfo;
return true;
}
// Drops one reference on the task's cache entry. Does nothing when the entry no longer exists.
void EffectsService::ReleaseTask(const std::string& taskKey) noexcept {
	auto lk = _shaderEffectCacheLock.lock_exclusive();

	if (auto found = _shaderEffectCache.find(taskKey); found != _shaderEffectCache.end()) {
		assert(found->second.refCount >= 1);
		--found->second.refCount;
	}
}
// Blocks until Initialize has published completion. _initializedCache remembers a
// completed wait so later calls return without touching the atomic.
void EffectsService::_WaitForInitialize() noexcept {
	if (!_initializedCache) {
		_initialized.wait(false, std::memory_order_acquire);
		_initializedCache = true;
	}
}
// Include handler for the FXC compiler that resolves every #include relative to one
// fixed directory.
class FXCInclude : public ID3DInclude {
public:
	FXCInclude(const std::filesystem::path& localDir) : _localDir(localDir) {}

	FXCInclude(const FXCInclude&) = default;
	FXCInclude(FXCInclude&&) = default;

	// Loads the requested file into a heap buffer whose ownership passes to the caller
	// until Close is invoked with the same pointer. Returns E_FAIL when the file cannot
	// be read.
	HRESULT CALLBACK Open(
		D3D_INCLUDE_TYPE /*IncludeType*/,
		LPCSTR pFileName,
		LPCVOID /*pParentData*/,
		LPCVOID* ppData,
		UINT* pBytes
	) noexcept override {
		const std::filesystem::path includePath = _localDir / StrHelper::UTF8ToUTF16(pFileName);

		std::string content;
		if (!Win32Helper::ReadTextFile(includePath.c_str(), content)) {
			return E_FAIL;
		}

		auto buffer = std::make_unique<char[]>(content.size());
		std::memcpy(buffer.get(), content.data(), content.size());

		*ppData = buffer.release();
		*pBytes = (UINT)content.size();
		return S_OK;
	}

	// Frees a buffer previously handed out by Open.
	HRESULT CALLBACK Close(LPCVOID pData) noexcept override {
		// Re-adopt the array so it is released with delete[]
		std::unique_ptr<char[]> reclaimed(static_cast<char*>(const_cast<void*>(pData)));
		reclaimed.reset();
		return S_OK;
	}

private:
	// Directory that include paths are resolved against
	std::filesystem::path _localDir;
};
// Serialization hooks for the cached effect description. The same functions serve both
// reading and writing, so the field order below defines the cache file layout.

// Texture description
template <typename Archive>
void serialize(Archive& ar, ShaderEffectTextureDesc& o) {
	ar & o.name;
	ar & o.format;
	ar & o.widthExpr;
	ar & o.heightExpr;
	ar & o.source;
}

// Sampler description
template <typename Archive>
void serialize(Archive& ar, ShaderEffectSamplerDesc& o) {
	ar & o.name;
	ar & o.filterType;
	ar & o.addressType;
}

// Pass description, including the compiled byte code
template <typename Archive>
void serialize(Archive& ar, ShaderEffectPassDesc& o) {
	ar & o.desc;
	ar & o.byteCode;
	ar & o.inputs;
	ar & o.outputs;
	ar & o.numThreads;
	ar & o.blockSize;
	ar & o.flags;
}

// Whole effect: textures, samplers, passes
template <typename Archive>
void serialize(Archive& ar, ShaderEffectDrawInfo& o) {
	ar & o.textures;
	ar & o.samplers;
	ar & o.passes;
}
static bool ReadFileCache(const std::string& key, ShaderEffectDrawInfo& drawInfo) noexcept {
const wchar_t* cacheDir = AppFolderManager::Get().GetCacheDir();
std::wstring cacheFilePath = StrHelper::Concat(cacheDir, L"\\", StrHelper::UTF8ToUTF16(key));
if (!Win32Helper::FileExists(cacheFilePath.c_str())) {
return false;
}
std::vector<uint8_t> buffer;
if (!Win32Helper::ReadFile(cacheFilePath.c_str(), buffer) || buffer.empty()) {
return false;
}
try {
yas::mem_istream mi(buffer.data(), buffer.size());
yas::binary_iarchive<yas::mem_istream, yas::binary> ia(mi);
uint32_t version;
ia.read(version);
if (version != EFFECT_CACHE_VERSION) {
Logger::Get().Info("缓存版本不匹配");
return false;
}
ia& drawInfo;
return true;
} catch (...) {
Logger::Get().Error("反序列化失败");
return false;
}
}
// Writes a compiled effect to the on-disk cache and removes stale cache files of the
// same effect configuration.
//  key      - cache key, which is also the file name inside the cache directory
//  drawInfo - data to serialize
// Returns false when serialization, creating the cache directory or writing the file
// fails. Failures while cleaning up stale files are only logged.
static bool WriteFileCache(const std::string& key, const ShaderEffectDrawInfo& drawInfo) noexcept {
	std::vector<uint8_t> buffer;
	buffer.reserve(4096);

	// Serialize: version tag first, then the draw info
	try {
		yas::vector_ostream os(buffer);
		yas::binary_oarchive<yas::vector_ostream<uint8_t>, yas::binary> oa(os);

		oa.write(EFFECT_CACHE_VERSION);
		oa& drawInfo;
	} catch (...) {
		Logger::Get().Error("序列化 ShaderEffectDrawInfo 失败");
		return false;
	}

	const wchar_t* cacheDir = AppFolderManager::Get().GetCacheDir();
	if (!Win32Helper::CreateDir(cacheDir, true)) {
		Logger::Get().Error("创建缓存文件夹失败");
		return false;
	}

	// File names on disk are UTF-16, so all name comparisons below use the UTF-16 form of
	// the key. Comparing against the UTF-8 length would never match for effect names that
	// contain non-ASCII characters, and their stale caches would pile up.
	const std::wstring keyW = StrHelper::UTF8ToUTF16(key);

	// Clean up the cache
	WIN32_FIND_DATA findData{};
	wil::unique_hfind hFind(FindFirstFileEx(
		StrHelper::Concat(cacheDir, L"\\*").c_str(),
		FindExInfoBasic, &findData, FindExSearchNameMatch, nullptr, FIND_FIRST_EX_LARGE_FETCH));
	if (hFind) {
		// Cache file naming: {effect name}_{shader model|2}{flags|4}{hash|16}
		assert(key.size() >= 24);
		assert(keyW.size() >= 16);

		// Files that differ from the key only in the hash are deleted, the rest is kept.
		// That is:
		// 1. old caches are removed after the effect's source code changed
		// 2. with inlined effect parameters, caches for other parameter values are removed
		// The 16-character hash suffix is plain ASCII, so it occupies 16 UTF-16 units too.
		const std::wstring_view prefix(keyW.data(), keyW.size() - 16);

		do {
			std::wstring_view fileName(findData.cFileName);
			if (fileName.size() == keyW.size() && fileName.starts_with(prefix)) {
				if (!DeleteFile(StrHelper::Concat(cacheDir, L"\\", findData.cFileName).c_str())) {
					Logger::Get().Win32Error(StrHelper::Concat("删除缓存文件 ",
						StrHelper::UTF16ToUTF8(findData.cFileName), " 失败"));
				}
			}
		} while (FindNextFile(hFind.get(), &findData));
	} else {
		Logger::Get().Win32Error("查找缓存文件失败");
	}

	std::wstring cacheFilePath = StrHelper::Concat(cacheDir, L"\\", keyW);
	if (!Win32Helper::WriteFile(cacheFilePath.c_str(), buffer)) {
		Logger::Get().Error("保存缓存失败");
		return false;
	}

	return true;
}
winrt::fire_and_forget EffectsService::_CompileShaderEffectAsync(
std::string effectName,
std::string source,
const phmap::flat_hash_map<std::string, float>* inlineParams,
D3D_SHADER_MODEL shaderModel,
std::string cacheKey,
uint32_t parserFlags,
bool saveSources,
bool warningsAreErrors,
bool disableCache
) noexcept {
// 允许在编译中途退出,因此访问成员甚至全局变量时必须小心,确保加锁后再访问!
std::shared_ptr<_StopSource> stopSource(_stopSource);
co_await winrt::resume_background();
ShaderEffectParserOptions options;
ShaderEffectDrawInfo effectDrawInfo;
SmallVector<ShaderEffectSource, 0> effectSources;
std::wstring sourcesPath;
{
auto stopSourceLock = stopSource->lock.lock_shared();
if (stopSource->isUninitialized) {
co_return;
}
if (!disableCache) {
// 尝试读取文件缓存
if (ReadFileCache(cacheKey, effectDrawInfo)) {
auto lk = _shaderEffectCacheLock.lock_exclusive();
auto it = _shaderEffectCache.find(cacheKey);
if (it == _shaderEffectCache.end()) {
co_return;
}
it->second.drawInfo = std::move(effectDrawInfo);
co_return;
}
}
// 如果以后 _effectsMap 会变,这里应加锁
auto it = _effectsMap.find(effectName);
if (it == _effectsMap.end()) {
auto lk = _shaderEffectCacheLock.lock_exclusive();
_shaderEffectCache.erase(cacheKey);
co_return;
}
const EffectInfo& effectInfo = _effects[it->second];
options = ShaderEffectParserOptions{
.inlineParams = inlineParams,
.shaderModel = shaderModel,
.flags = (ShaderEffectParserFlags)parserFlags
};
std::string errorMsg = ShaderEffectParser::ParseForDesc(
effectInfo,
std::move(source),
options,
effectDrawInfo,
effectSources
);
if (!errorMsg.empty()) {
// 解析失败
auto lk = _shaderEffectCacheLock.lock_exclusive();
_shaderEffectCache.erase(cacheKey);
co_return;
}
if (saveSources) {
sourcesPath = StrHelper::Concat(AppFolderManager::Get().GetSourcesDir(),
L"\\", StrHelper::UTF8ToUTF16(effectInfo.name));
std::wstring sourcesDir = sourcesPath.substr(0, sourcesPath.find_last_of(L'\\'));
if (!Win32Helper::CreateDir(sourcesDir, true)) {
Logger::Get().Error("Win32Helper::CreateDir 失败");
}
}
}
// 由于允许在编译中途退出,访问 Logger 要加锁!
auto logComError = [&](std::string_view msg, HRESULT hr,
const SourceLocation& location = SourceLocation::Current())
{
auto stopSourceLock = stopSource->lock.lock_shared();
if (!stopSource->isUninitialized) {
Logger::Get().ComError(msg, hr, location);
}
};
auto logWarn = [&](std::string_view msg,
const SourceLocation& location = SourceLocation::Current()) {
auto stopSourceLock = stopSource->lock.lock_shared();
if (!stopSource->isUninitialized) {
Logger::Get().Warn(msg, location);
}
};
auto logError = [&](std::string_view msg,
const SourceLocation& location = SourceLocation::Current()) {
auto stopSourceLock = stopSource->lock.lock_shared();
if (!stopSource->isUninitialized) {
Logger::Get().Error(msg, location);
}
};
std::filesystem::path includeDir = AppFolderManager::Get().GetBuiltInShaderEffectsDir();
size_t delimPos = effectName.find_last_of('\\');
if (delimPos != std::string::npos) {
includeDir /= StrHelper::UTF8ToUTF16(std::string_view(effectName.c_str(), delimPos));
}
Win32Helper::RunParallel([&](uint32_t id) {
const auto& [source, macros] = effectSources[id];
if (saveSources) {
std::wstring fileName = effectDrawInfo.passes.size() == 1
? StrHelper::Concat(sourcesPath, L".hlsl")
: fmt::format(L"{}_Pass{}.hlsl", sourcesPath, id + 1);
if (!Win32Helper::WriteTextFile(fileName.c_str(), source)) {
logError(fmt::format("保存 Pass{} 源码失败", id + 1));
}
}
// SM 6.0 及以上使用 DXC 编译SM 5.1 使用 FXC 编译
if (shaderModel >= D3D_SHADER_MODEL_6_0) {
winrt::com_ptr<IDxcUtils> dxcUtils;
winrt::com_ptr<IDxcCompiler3> dxcCompiler;
HRESULT hr = DxcCreateInstance(CLSID_DxcUtils, IID_PPV_ARGS(&dxcUtils));
if (FAILED(hr)) {
logComError("DxcCreateInstance 失败", hr);
return;
}
hr = DxcCreateInstance(CLSID_DxcCompiler, IID_PPV_ARGS(&dxcCompiler));
if (FAILED(hr)) {
logComError("DxcCreateInstance 失败", hr);
return;
}
DxcBuffer sourceBuffer = {
.Ptr = source.data(),
.Size = source.size(),
.Encoding = DXC_CP_UTF8
};
std::vector<const wchar_t*> arguments;
arguments.push_back(L"-E");
arguments.push_back(L"__M");
arguments.push_back(L"-all-resources-bound");
arguments.push_back(L"-ffinite-math-only");
arguments.push_back(L"-T");
const wchar_t* profile;
switch (shaderModel) {
case D3D_SHADER_MODEL_6_9:
profile = L"cs_6_9";
break;
case D3D_SHADER_MODEL_6_8:
profile = L"cs_6_8";
break;
case D3D_SHADER_MODEL_6_7:
profile = L"cs_6_7";
break;
case D3D_SHADER_MODEL_6_6:
profile = L"cs_6_6";
break;
case D3D_SHADER_MODEL_6_5:
profile = L"cs_6_5";
break;
case D3D_SHADER_MODEL_6_4:
profile = L"cs_6_4";
break;
case D3D_SHADER_MODEL_6_3:
profile = L"cs_6_3";
break;
case D3D_SHADER_MODEL_6_2:
profile = L"cs_6_2";
break;
case D3D_SHADER_MODEL_6_1:
profile = L"cs_6_1";
break;
default:
profile = L"cs_6_0";
break;
}
arguments.push_back(profile);
if (bool(options.flags & ShaderEffectParserFlags::EnableNative16Bit)) {
arguments.push_back(L"-enable-16bit-types");
}
std::vector<std::wstring> macroStrs;
for (const std::pair<std::string, std::string>& macro : macros) {
arguments.push_back(L"-D");
if (macro.second.empty()) {
arguments.push_back(macroStrs.emplace_back(StrHelper::UTF8ToUTF16(macro.first)).c_str());
} else {
arguments.push_back(macroStrs.emplace_back(StrHelper::Concat(
StrHelper::UTF8ToUTF16(macro.first), L"=", StrHelper::UTF8ToUTF16(macro.second))).c_str());
}
}
#ifdef _DEBUG
arguments.push_back(L"-Od");
arguments.push_back(L"-Zi");
arguments.push_back(L"-Qembed_debug");
#else
arguments.push_back(L"-O3");
// 剥离反射信息以减小体积
arguments.push_back(L"-Qstrip_reflect");
#endif
arguments.push_back(L"-I");
arguments.push_back(includeDir.c_str());
winrt::com_ptr<IDxcIncludeHandler> includeHandler;
hr = dxcUtils->CreateDefaultIncludeHandler(includeHandler.put());
if (FAILED(hr)) {
logComError("IDxcUtils::CreateDefaultIncludeHandler 失败", hr);
return;
}
winrt::com_ptr<IDxcResult> dxcResult;
hr = dxcCompiler->Compile(
&sourceBuffer,
arguments.data(),
(uint32_t)arguments.size(),
includeHandler.get(),
IID_PPV_ARGS(&dxcResult)
);
if (FAILED(hr)) {
logComError("IDxcCompiler3::Compile 失败", hr);
return;
}
winrt::com_ptr<IDxcBlobUtf8> messages;
dxcResult->GetOutput(DXC_OUT_ERRORS, IID_PPV_ARGS(&messages), nullptr);
if (messages && messages->GetStringLength() > 0) {
logWarn(StrHelper::Concat("编译着色器输出: ", messages->GetStringPointer()));
return;
}
HRESULT compileStatus;
dxcResult->GetStatus(&compileStatus);
if (FAILED(compileStatus)) {
logComError("编译着色器失败", compileStatus);
return;
}
hr = dxcResult->GetOutput(DXC_OUT_OBJECT, IID_PPV_ARGS(&effectDrawInfo.passes[id].byteCode), nullptr);
if (FAILED(hr)) {
logComError("IDxcResult::GetOutput 失败", hr);
}
} else {
winrt::com_ptr<ID3DBlob> errorMsg;
UINT flags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_ALL_RESOURCES_BOUND;
if (warningsAreErrors) {
flags |= D3DCOMPILE_WARNINGS_ARE_ERRORS;
}
#ifdef _DEBUG
flags |= D3DCOMPILE_SKIP_OPTIMIZATION | D3DCOMPILE_DEBUG;
#else
flags |= D3DCOMPILE_OPTIMIZATION_LEVEL3;
#endif
auto shaderMacros = std::make_unique<D3D_SHADER_MACRO[]>(macros.size() + 1);
for (size_t i = 0; i < macros.size(); ++i) {
shaderMacros[i] = { macros[i].first.c_str(), macros[i].second.c_str() };
}
FXCInclude fxcInclude(includeDir);
HRESULT hr = D3DCompile(
source.data(),
source.size(),
fmt::format("{}_Pass{}.hlsl", effectName, id + 1).c_str(),
shaderMacros.get(),
&fxcInclude,
"__M",
"cs_5_1",
flags,
0,
effectDrawInfo.passes[id].byteCode.put(),
errorMsg.put()
);
if (FAILED(hr)) {
if (errorMsg) {
logComError(StrHelper::Concat("编译着色器失败: ", (const char*)errorMsg->GetBufferPointer()), hr);
}
return;
}
// 警告消息
if (errorMsg) {
logWarn(StrHelper::Concat("编译着色器时产生警告: ", (const char*)errorMsg->GetBufferPointer()));
}
}
}, (uint32_t)effectDrawInfo.passes.size());
{
auto stopSourceLock = stopSource->lock.lock_shared();
if (stopSource->isUninitialized) {
co_return;
}
for (const ShaderEffectPassDesc& passDesc : effectDrawInfo.passes) {
if (!passDesc.byteCode) {
// 编译失败
auto lk = _shaderEffectCacheLock.lock_exclusive();
_shaderEffectCache.erase(cacheKey);
co_return;
}
}
{
auto lk = _shaderEffectCacheLock.lock_exclusive();
auto it = _shaderEffectCache.find(cacheKey);
if (it == _shaderEffectCache.end()) {
co_return;
}
// 需要写入文件缓存时应复制而不是移动以避免加锁
if (disableCache) {
it->second.drawInfo = std::move(effectDrawInfo);
co_return;
} else {
it->second.drawInfo = effectDrawInfo;
}
}
// 创建文件缓存
if (!WriteFileCache(cacheKey, effectDrawInfo)) {
Logger::Get().Error("WriteFileCache 失败");
}
}
}
}

View file

@ -0,0 +1,597 @@
#include "pch.h"
#include "FrameProducer.h"
#include "CommonSharedConstants.h"
#include "DuplicateFrameChecker.h"
#include "DescriptorHeap.h"
#include "GraphicsCaptureFrameSource.h"
#include "Logger.h"
#include "ScalingWindow.h"
#include <dispatcherqueue.h>
namespace Magpie {
// Shuts the producer thread down (if it was ever started) and, in debug
// builds, gives the descriptor range allocated by _Initialize back to the heap.
FrameProducer::~FrameProducer() noexcept {
    if (_producerThread.joinable()) {
        const HANDLE producerHandle = _producerThread.native_handle();
        const bool alreadyExited = wil::handle_wait(producerHandle, 0);
        if (!alreadyExited) {
            const DWORD producerThreadId = GetThreadId(_producerThread.native_handle());
            // Keep posting WM_QUIT (re-checking every millisecond) until the
            // thread exits: the producer thread may not have created its
            // message queue yet when the first posts are made.
            do {
                PostThreadMessage(producerThreadId, WM_QUIT, 0, 0);
            } while (!wil::handle_wait(producerHandle, 1));
        }

        _producerThread.join();
    }

#ifdef _DEBUG
    // max() is the "never allocated" sentinel of _inputSrvBaseOffset.
    const bool descriptorsAllocated = _inputSrvBaseOffset != std::numeric_limits<uint32_t>::max();
    if (descriptorsAllocated) {
        auto& heap = _d3d12Context.GetDescriptorHeap();
        const uint32_t inFlightFrames = ScalingWindow::Get().Options().maxProducerInFlightFrames;
        // Same count that _Initialize allocated: 3 * inFlightFrames + 2.
        heap.Free(_inputSrvBaseOffset, 3 * inFlightFrames + 2);
    }
#endif
}
// Copies the device from d3d12Context and starts the producer thread, which
// performs the real initialization in _ProducerThreadProc.
//
// colorInfo, hMonSrc, srcRect and rendererSize are handed to the thread by
// value (std::thread copies its arguments). outputSize and task are passed by
// reference, so the caller must keep both alive until task has its result.
void FrameProducer::InitializeAsync(
    const D3D12Context& d3d12Context,
    const ColorInfo& colorInfo,
    HMONITOR hMonSrc,
    const RECT& srcRect,
    SizeU rendererSize,
    SizeU& outputSize,
    SimpleTask<bool>& task
) noexcept {
    _d3d12Context.CopyDevice(d3d12Context);

    std::thread producer(
        &FrameProducer::_ProducerThreadProc,
        this,
        colorInfo,
        hMonSrc,
        srcRect,
        rendererSize,
        std::ref(outputSize),
        std::ref(task)
    );
    _producerThread = std::move(producer);
}
// Returns the current component state (relaxed atomic read of _state).
ComponentState FrameProducer::GetState() const noexcept {
    const ComponentState current = _state.load(std::memory_order_relaxed);
    return current;
}
// Returns the latest frame number as reported by the frame ring buffer.
uint64_t FrameProducer::GetLatestFrameNumber() const noexcept {
    const uint64_t latestFrameNumber = _frameRingBuffer.GetLatestFrameNumber();
    return latestFrameNumber;
}
// Returns the frame rate as reported by the step timer.
uint32_t FrameProducer::GetFPS() const noexcept {
    const uint32_t fps = _stepTimer.GetFPS();
    return fps;
}
// Forwards to FrameRingBuffer::ConsumerBeginFrame to obtain a frame for the
// consumer.
//
// On success returns true and additionally sets frameSrvOffset to the
// descriptor offset of the SRV created for the returned ring-buffer slot.
// On failure returns false and leaves frameSrvOffset untouched.
bool FrameProducer::ConsumerBeginFrame(
    ID3D12Resource*& frame,
    uint32_t& frameSrvOffset,
    uint64_t& completedFenceValue,
    uint64_t& fenceValueToSignal
) noexcept {
    uint32_t ringBufferSlot;
    if (_frameRingBuffer.ConsumerBeginFrame(
        ringBufferSlot, frame, completedFenceValue, fenceValueToSignal)) {
        // Output SRVs are laid out contiguously starting at _outputSrvBaseOffset,
        // one per ring-buffer slot (see _CreateOutputDescriptors).
        frameSrvOffset = _outputSrvBaseOffset + ringBufferSlot;
        return true;
    }

    return false;
}
// Forwards to FrameRingBuffer::ConsumerEndFrame and returns its HRESULT
// unchanged.
HRESULT FrameProducer::ConsumerEndFrame(
    ID3D12CommandQueue* commandQueue,
    uint64_t fenceValueToSignal
) const noexcept {
    const HRESULT hr = _frameRingBuffer.ConsumerEndFrame(commandQueue, fenceValueToSignal);
    return hr;
}
// Asks the producer thread to adapt to a new renderer size.
//
// The work is queued on the producer thread's DispatcherQueue. There it waits
// for the GPU, resizes the effects and the frame ring buffer, recreates the
// output descriptors, renders one frame and waits for it to finish. The
// resulting HRESULT is delivered through task.
//
// outputSize and task are captured by reference and must stay alive until task
// has received its result. If the callback cannot be queued at all, task is
// completed immediately with a failure code so that a waiter is never left
// blocked.
void FrameProducer::OnResizedAsync(
    SizeU rendererSize,
    SizeU& outputSize,
    SimpleTask<HRESULT>& task
) noexcept {
    const bool enqueued = _dispatcher.TryEnqueue([this, rendererSize, &outputSize, &task] {
        HRESULT hr = S_OK;
        // Publish the final hr on every exit path. Release ordering is used to
        // synchronize outputSize with the waiter.
        auto se = wil::scope_exit([&] {
            task.SetResult(hr, std::memory_order_release);
        });

        const ComponentState state = _state.load(std::memory_order_relaxed);
        if (state != ComponentState::NoError) {
            hr = state == ComponentState::DeviceLost ? DXGI_ERROR_DEVICE_REMOVED : E_FAIL;
            return;
        }

        hr = _d3d12Context.WaitForGpu();
        if (!_CheckResult(hr, "D3D12Context::WaitForGpu 失败")) {
            return;
        }

        _effectsDrawer.OnResized(rendererSize, outputSize);

        hr = _frameRingBuffer.OnResized(outputSize);
        if (!_CheckResult(hr, "FrameRingBuffer::OnResized 失败")) {
            return;
        }

        _CreateOutputDescriptors();

        hr = _Render();
        if (!_CheckResult(hr, "_Render 失败")) {
            return;
        }

        // Wait for the render to complete.
        hr = _d3d12Context.WaitForGpu();
        if (!_CheckResult(hr, "D3D12Context::WaitForGpu 失败")) {
            return;
        }
    });

    if (!enqueued) {
        // The callback will never run, so nothing else would complete the task.
        const ComponentState state = _state.load(std::memory_order_relaxed);
        task.SetResult(state == ComponentState::DeviceLost ? DXGI_ERROR_DEVICE_REMOVED : E_FAIL);
    }
}
void FrameProducer::OnColorInfoChangedAsync(
const ColorInfo& colorInfo,
SimpleTask<HRESULT>& task
) noexcept {
_dispatcher.TryEnqueue([&] {
HRESULT hr = S_OK;
auto se = wil::scope_exit([&] {
task.SetResult(hr);
});
ComponentState state = _state.load(std::memory_order_relaxed);
if (state != ComponentState::NoError) {
hr = state == ComponentState::DeviceLost ? DXGI_ERROR_DEVICE_REMOVED : E_FAIL;
return;
}
_isScRGB = colorInfo.kind != winrt::AdvancedColorKind::StandardDynamicRange;
hr = _d3d12Context.WaitForGpu();
if (!_CheckResult(hr, "D3D12Context::WaitForGpu 失败")) {
return;
}
hr = _frameSource->OnColorInfoChanged(colorInfo);
if (!_CheckResult(hr, "GraphicsCaptureFrameSource::OnColorInfoChanged 失败")) {
return;
}
_effectsDrawer.OnColorInfoChanged(colorInfo);
hr = _frameRingBuffer.OnColorInfoChanged(colorInfo);
if (!_CheckResult(hr, "FrameRingBuffer::OnColorInfoChanged 失败")) {
return;
}
_CreateInputDescriptors();
_CreateOutputDescriptors();
// 等待新帧
while (true) {
bool isNewFrameAvailable;
hr = _frameSource->CheckForNewFrame(isNewFrameAvailable);
if (!_CheckResult(hr, "GraphicsCaptureFrameSource::CheckForNewFrame 失败")) {
return;
}
if (isNewFrameAvailable) {
break;
} else {
WaitMessage();
}
}
hr = _Render();
if (!_CheckResult(hr, "_Render 失败")) {
return;
}
// 等待渲染完成
hr = _d3d12Context.WaitForGpu();
if (!_CheckResult(hr, "D3D12Context::WaitForGpu 失败")) {
return;
}
});
}
void FrameProducer::OnCursorVisibilityChanged(bool isVisible, bool onDestory) noexcept {
_dispatcher.TryEnqueue([this, isVisible, onDestory] {
if (_state.load(std::memory_order_relaxed) != ComponentState::NoError) {
return;
}
_CheckResult(_frameSource->OnCursorVisibilityChanged(isVisible, onDestory),
"GraphicsCaptureFrameSource::OnCursorVisibilityChanged 失败");
});
}
void FrameProducer::_ProducerThreadProc(
const ColorInfo& colorInfo,
HMONITOR hMonSrc,
RECT srcRect,
SizeU rendererSize,
SizeU& outputSize,
SimpleTask<bool>& initializeTask
) noexcept {
#ifdef _DEBUG
SetThreadDescription(GetCurrentThread(), L"Magpie-缩放生产者线程");
#endif
if (_Initialize(colorInfo, hMonSrc, srcRect, rendererSize, outputSize)) {
// 同步 outputSize
initializeTask.SetResult(true, std::memory_order_release);
} else {
Logger::Get().Error("_Initialize 失败");
initializeTask.SetResult(false);
return;
}
StepTimerStatus stepTimerStatus = StepTimerStatus::WaitingForNewFrame;
const bool waitMsgForNewFrame = _frameSource->ShouldWaitMessageForNewFrame();
bool isWaitingForFirstFrame = true;
MSG msg;
while (true) {
bool fpsUpdated = false;
// WaitingForFPSLimiter 状态下新帧消息可能已被处理,不要等待消息,直到状态变化
stepTimerStatus = _stepTimer.WaitForNextFrame(
waitMsgForNewFrame && stepTimerStatus != StepTimerStatus::WaitingForFPSLimiter,
fpsUpdated
);
if (fpsUpdated) {
// FPS 变化时要求前端重新渲染以更新叠加层
PostMessage(ScalingWindow::Get().Handle(),
CommonSharedConstants::WM_FRONTEND_RENDER, 0, 0);
}
while (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE)) {
if (msg.message == WM_QUIT) {
break;
}
DispatchMessage(&msg);
}
// 异步检查回调是否出错
if (msg.message == WM_QUIT ||
_state.load(std::memory_order_relaxed) != ComponentState::NoError) {
break;
}
if (stepTimerStatus == StepTimerStatus::WaitingForFPSLimiter) {
continue;
}
bool isNewFrameAvailable;
if (!_CheckResult(_frameSource->CheckForNewFrame(isNewFrameAvailable),
"GraphicsCaptureFrameSource::CheckForNewFrame 失败")) {
break;
}
// 强制等待第一帧
if (!isNewFrameAvailable && (isWaitingForFirstFrame || stepTimerStatus != StepTimerStatus::ForceNewFrame)) {
continue;
}
isWaitingForFirstFrame = false;
if (!_CheckResult(_Render(), "_Render 失败")) {
break;
}
}
_d3d12Context.WaitForGpu();
// 必须在创建线程释放
_frameSource.reset();
if (_monitorThread.joinable()) {
const HANDLE hThread = _monitorThread.native_handle();
if (!wil::handle_wait(hThread, 0)) {
const DWORD threadId = GetThreadId(_monitorThread.native_handle());
while (true) {
// 持续尝试直到创建了消息队列
PostThreadMessage(threadId, WM_QUIT, 0, 0);
if (wil::handle_wait(hThread, 1)) {
break;
}
}
}
_monitorThread.join();
}
}
bool FrameProducer::_Initialize(
const ColorInfo& colorInfo,
HMONITOR hMonSrc,
const RECT& srcRect,
SizeU rendererSize,
SizeU& outputSize
) noexcept {
winrt::init_apartment(winrt::apartment_type::single_threaded);
// 创建 DispatcherQueue
{
winrt::Windows::System::DispatcherQueueController dqc{ nullptr };
HRESULT hr = CreateDispatcherQueueController(
DispatcherQueueOptions{
.dwSize = sizeof(DispatcherQueueOptions),
.threadType = DQTYPE_THREAD_CURRENT
},
(PDISPATCHERQUEUECONTROLLER*)winrt::put_abi(dqc)
);
if (FAILED(hr)) {
Logger::Get().ComError("CreateDispatcherQueueController 失败", hr);
return false;
}
_dispatcher = dqc.DispatcherQueue();
}
const ScalingOptions& options = ScalingWindow::Get().Options();
const uint32_t maxInFlightFrameCount = options.maxProducerInFlightFrames;
if (!_d3d12Context.InitializeAfterCopyDevice(
maxInFlightFrameCount,
D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,
D3D12_COMMAND_LIST_TYPE_COMPUTE,
true
)) {
Logger::Get().Error("初始化 D3D12Context 失败");
return false;
}
_computeContext.Initialize(_d3d12Context);
_isScRGB = colorInfo.kind != winrt::AdvancedColorKind::StandardDynamicRange;
_frameSource = std::make_unique<GraphicsCaptureFrameSource>();
if (!_frameSource->Initialize(_d3d12Context, srcRect, hMonSrc, colorInfo)) {
Logger::Get().Error("初始化 GraphicsCaptureFrameSource 失败");
return false;
}
{
const SizeU inputSize = {
uint32_t(srcRect.right - srcRect.left),
uint32_t(srcRect.bottom - srcRect.top)
};
if (!_effectsDrawer.Initialize(_d3d12Context, colorInfo, inputSize, rendererSize, outputSize)) {
Logger::Get().Error("EffectsDrawer::Initialize 失败");
return false;
}
assert(outputSize.width <= rendererSize.width && outputSize.height <= rendererSize.height);
if (!_frameRingBuffer.Initialize(_d3d12Context, outputSize, colorInfo)) {
Logger::Get().Error("初始化 FrameRingBuffer 失败");
return false;
}
}
{
auto& descriptorHeap = _d3d12Context.GetDescriptorHeap();
// maxInFlightFrameCount + (maxInFlightFrameCount + 1) + (maxInFlightFrameCount + 1)
HRESULT hr = descriptorHeap.Alloc(maxInFlightFrameCount * 3 + 2, _inputSrvBaseOffset);
if (FAILED(hr)) {
Logger::Get().ComError("DescriptorHeap::Alloc 失败", hr);
return false;
}
_outputUavBaseOffset = _inputSrvBaseOffset + maxInFlightFrameCount;
_outputSrvBaseOffset = _outputUavBaseOffset + maxInFlightFrameCount + 1;
_CreateInputDescriptors();
_CreateOutputDescriptors();
}
_monitorThread = std::thread(&FrameProducer::_MonitorThreadProc, this);
if (options.IsBenchmarkMode()) {
// 不要使用无限大,/fp:fast 下无限大值不可靠
_stepTimer.Initialize(std::numeric_limits<float>::max(), std::nullopt);
} else {
_stepTimer.Initialize(options.minFrameRate, options.maxFrameRate);
}
// 最后启动捕获以尽可能推迟显示黄色边框 (Win10) 或禁用圆角 (Win11)
if (!_frameSource->Start()) {
Logger::Get().Error("GraphicsCaptureFrameSource::Start 失败");
return false;
}
return true;
}
void FrameProducer::_CreateInputDescriptors() noexcept {
uint32_t bufferCount = ScalingWindow::Get().Options().maxProducerInFlightFrames;
ID3D12Device5* device = _d3d12Context.GetDevice();
auto& descriptorHeap = _d3d12Context.GetDescriptorHeap();
uint32_t descriptorSize = descriptorHeap.GetDescriptorSize();
CD3DX12_CPU_DESCRIPTOR_HANDLE descriptorCpuHandle(descriptorHeap.GetCpuHandle(_inputSrvBaseOffset));
CD3DX12_SHADER_RESOURCE_VIEW_DESC srvDesc = CD3DX12_SHADER_RESOURCE_VIEW_DESC::Tex2D(
_isScRGB ? DXGI_FORMAT_R16G16B16A16_FLOAT : DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, 1);
for (uint32_t i = 0; i < bufferCount; ++i) {
device->CreateShaderResourceView(_frameSource->GetOutput(i), &srvDesc, descriptorCpuHandle);
descriptorCpuHandle.Offset(descriptorSize);
}
}
void FrameProducer::_CreateOutputDescriptors() noexcept {
uint32_t bufferCount = ScalingWindow::Get().Options().maxProducerInFlightFrames + 1;
ID3D12Device5* device = _d3d12Context.GetDevice();
auto& descriptorHeap = _d3d12Context.GetDescriptorHeap();
uint32_t descriptorSize = descriptorHeap.GetDescriptorSize();
CD3DX12_CPU_DESCRIPTOR_HANDLE uavCpuHandle(descriptorHeap.GetCpuHandle(_outputUavBaseOffset));
CD3DX12_CPU_DESCRIPTOR_HANDLE srvCpuHandle(descriptorHeap.GetCpuHandle(_outputSrvBaseOffset));
DXGI_FORMAT format = _isScRGB ? DXGI_FORMAT_R16G16B16A16_FLOAT : DXGI_FORMAT_R8G8B8A8_UNORM;
CD3DX12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CD3DX12_UNORDERED_ACCESS_VIEW_DESC::Tex2D(format);
CD3DX12_SHADER_RESOURCE_VIEW_DESC srvDesc = CD3DX12_SHADER_RESOURCE_VIEW_DESC::Tex2D(format, 1);
for (uint32_t i = 0; i < bufferCount; ++i) {
ID3D12Resource* resource = _frameRingBuffer.GetBuffer(i);
device->CreateUnorderedAccessView(resource, nullptr, &uavDesc, uavCpuHandle);
device->CreateShaderResourceView(resource, &srvDesc, srvCpuHandle);
uavCpuHandle.Offset(descriptorSize);
srvCpuHandle.Offset(descriptorSize);
}
}
// Records and submits one frame: takes a ring-buffer slot, updates the frame
// source, runs the effects from the captured input into the slot and hands the
// slot back to the ring buffer.
//
// Returns S_OK on success; otherwise logs and returns the HRESULT of the first
// failing step.
HRESULT FrameProducer::_Render() noexcept {
    _stepTimer.PrepareForRender();

    uint32_t frameIndex;
    if (const HRESULT hr = _d3d12Context.BeginFrame(frameIndex, nullptr); FAILED(hr)) {
        Logger::Get().ComError("D3D12Context::BeginFrame 失败", hr);
        return hr;
    }

    ID3D12CommandQueue* const queue = _d3d12Context.GetCommandQueue();

    uint32_t outputSlot;
    if (const HRESULT hr = _frameRingBuffer.ProducerBeginFrame(queue, outputSlot); FAILED(hr)) {
        Logger::Get().ComError("FrameRingBuffer::ProducerBeginFrame 失败", hr);
        return hr;
    }

    uint32_t inputSlot;
    if (const HRESULT hr = _frameSource->Update(inputSlot); FAILED(hr)) {
        Logger::Get().ComError("GraphicsCaptureFrameSource::Update 失败", hr);
        return hr;
    }

    _computeContext.SetDescriptorHeap(_d3d12Context.GetDescriptorHeap().GetHeap());

    // Both the input and the output texture are in the COMMON state and must be
    // back in that state once we are done with them. The input texture relies
    // on implicit state transitions; the output texture is transitioned
    // explicitly around the draw.
    ID3D12Resource* const inputTexture = _frameSource->GetOutput(inputSlot);
    ID3D12Resource* const outputTexture = _frameRingBuffer.GetBuffer(outputSlot);

    _computeContext.InsertTransitionBarrier(
        outputTexture,
        D3D12_RESOURCE_STATE_COMMON,
        D3D12_RESOURCE_STATE_UNORDERED_ACCESS
    );

    const HRESULT drawResult = _effectsDrawer.Draw(
        _computeContext,
        frameIndex,
        inputTexture,
        outputTexture,
        _inputSrvBaseOffset + inputSlot,
        _outputUavBaseOffset + outputSlot
    );
    if (FAILED(drawResult)) {
        Logger::Get().ComError("EffectsDrawer::Draw 失败", drawResult);
        return drawResult;
    }

    _computeContext.InsertTransitionBarrier(
        outputTexture,
        D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
        D3D12_RESOURCE_STATE_COMMON
    );

    _computeContext.Execute(queue);

    if (const HRESULT hr = _frameRingBuffer.ProducerEndFrame(queue); FAILED(hr)) {
        Logger::Get().ComError("FrameRingBuffer::ProducerEndFrame 失败", hr);
        return hr;
    }

    if (const HRESULT hr = _d3d12Context.EndFrame(); FAILED(hr)) {
        Logger::Get().ComError("D3D12Context::EndFrame 失败", hr);
        return hr;
    }

    return S_OK;
}
void FrameProducer::_MonitorThreadProc() noexcept {
wil::unique_event_nothrow event;
if (!event.try_create(wil::EventOptions::None, nullptr)) {
Logger::Get().Win32Error("创建事件失败");
return;
}
uint64_t frameNumber = 1;
// 绑定新帧渲染完成时触发的事件
HRESULT hr = _frameRingBuffer.SetEventOnNewFrame(frameNumber, event.get());
if (FAILED(hr)) {
Logger::Get().ComError("FrameRingBuffer::SetEventOnNewFrame 失败", hr);
return;
}
MSG msg;
while (true) {
while (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE)) {
if (msg.message == WM_QUIT) {
return;
}
DispatchMessage(&msg);
}
// 有新帧可用时返回 WAIT_OBJECT_0有新消息时返回 WAIT_OBJECT_0 + 1
HANDLE hEvent = event.get();
if (MsgWaitForMultipleObjectsEx(1, &hEvent, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE) == WAIT_OBJECT_0) {
// 通知消费者渲染
PostMessage(ScalingWindow::Get().Handle(), CommonSharedConstants::WM_FRONTEND_RENDER, 0, 0);
hr = _frameRingBuffer.SetEventOnNewFrame(frameNumber, event.get());
if (FAILED(hr)) {
Logger::Get().ComError("FrameRingBuffer::SetEventOnNewFrame 失败", hr);
return;
}
}
}
}
// Returns true when hr is a success code. Otherwise records the failure in
// _state (DeviceLost for DXGI_ERROR_DEVICE_REMOVED / DXGI_ERROR_DEVICE_RESET,
// Error for anything else), logs errorMsg together with hr and returns false.
//
// Must only be called while _state is still NoError (asserted).
bool FrameProducer::_CheckResult(HRESULT hr, std::string_view errorMsg) noexcept {
    assert(_state.load(std::memory_order_relaxed) == ComponentState::NoError);

    if (FAILED(hr)) {
        const bool isDeviceLost =
            hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET;
        _state.store(
            isDeviceLost ? ComponentState::DeviceLost : ComponentState::Error,
            std::memory_order_relaxed
        );

        Logger::Get().ComError(errorMsg, hr);
        return false;
    }

    return true;
}
}

View file

@ -0,0 +1,104 @@
#pragma once
#include "CommandContext.h"
#include "D3D12Context.h"
#include "EffectsDrawer.h"
#include "FrameRingBuffer.h"
#include "StepTimer.h"
#include "SimpleTask.h"
namespace Magpie {
class GraphicsCaptureFrameSource;
// Produces scaled frames on a dedicated producer thread: it captures frames
// from a GraphicsCaptureFrameSource, runs them through an EffectsDrawer and
// publishes the results through a FrameRingBuffer from which a consumer
// retrieves them.
//
// Neither copyable nor movable. The destructor stops and joins the producer
// thread.
class FrameProducer {
public:
FrameProducer() = default;
FrameProducer(const FrameProducer&) = delete;
FrameProducer(FrameProducer&&) = delete;
~FrameProducer() noexcept;
// Copies the device from d3d12Context and starts the producer thread, which
// performs the actual initialization. The outcome is reported through task.
// outputSize and task are passed to the thread by reference and must outlive
// the initialization.
void InitializeAsync(
const D3D12Context& d3d12Context,
const ColorInfo& colorInfo,
HMONITOR hMonSrc,
const RECT& srcRect,
SizeU rendererSize,
SizeU& outputSize,
SimpleTask<bool>& task
) noexcept;
// Current state: NoError until a checked operation fails, then DeviceLost or
// Error.
ComponentState GetState() const noexcept;
// Forwarded from the frame ring buffer.
uint64_t GetLatestFrameNumber() const noexcept;
// Forwarded from the step timer.
uint32_t GetFPS() const noexcept;
// Obtains a frame for the consumer from the ring buffer. Returns false if the
// ring buffer call fails; on success frameSrvOffset receives the descriptor
// offset of the frame's SRV.
bool ConsumerBeginFrame(
ID3D12Resource*& frame,
uint32_t& frameSrvOffset,
uint64_t& completedFenceValue,
uint64_t& fenceValueToSignal
) noexcept;
// Forwards to FrameRingBuffer::ConsumerEndFrame.
HRESULT ConsumerEndFrame(
ID3D12CommandQueue* commandQueue,
uint64_t fenceValueToSignal
) const noexcept;
// The following requests are queued on the producer thread's DispatcherQueue.
// The two *Async variants report their HRESULT through task; reference
// arguments must stay alive until task has its result.
void OnResizedAsync(SizeU rendererSize, SizeU& outputSize, SimpleTask<HRESULT>& task) noexcept;
void OnColorInfoChangedAsync(const ColorInfo& colorInfo, SimpleTask<HRESULT>& task) noexcept;
// Fire-and-forget: no result is reported to the caller.
void OnCursorVisibilityChanged(bool isVisible, bool onDestory) noexcept;
private:
// Entry point of the producer thread: initializes, then runs the render loop.
void _ProducerThreadProc(
const ColorInfo& colorInfo,
HMONITOR hMonSrc,
RECT srcRect,
SizeU rendererSize,
SizeU& outputSize,
SimpleTask<bool>& initializeTask
) noexcept;
// Runs on the producer thread; returns false as soon as any step fails.
bool _Initialize(
const ColorInfo& colorInfo,
HMONITOR hMonSrc,
const RECT& srcRect,
SizeU rendererSize,
SizeU& outputSize
) noexcept;
// Create the SRVs for the frame-source outputs.
void _CreateInputDescriptors() noexcept;
// Create the UAVs and SRVs for the ring-buffer textures.
void _CreateOutputDescriptors() noexcept;
// Records and submits one frame.
HRESULT _Render() noexcept;
// Entry point of the monitor thread: posts WM_FRONTEND_RENDER to the scaling
// window whenever the ring buffer signals a new frame.
void _MonitorThreadProc() noexcept;
// Returns true on success; on failure updates _state, logs and returns false.
bool _CheckResult(HRESULT hr, std::string_view errorMsg) noexcept;
// NOTE(review): members are destroyed in reverse declaration order; check the
// dependencies between them before reordering.
std::atomic<ComponentState> _state = ComponentState::NoError;
std::thread _producerThread;
// DispatcherQueue of the producer thread; assigned in _Initialize.
winrt::DispatcherQueue _dispatcher{ nullptr };
// Started in _Initialize, stopped and joined by the producer thread.
std::thread _monitorThread;
D3D12Context _d3d12Context;
ComputeContext _computeContext;
FrameRingBuffer _frameRingBuffer;
StepTimer _stepTimer;
// Created by _Initialize and reset by the producer thread before it exits.
std::unique_ptr<GraphicsCaptureFrameSource> _frameSource;
EffectsDrawer _effectsDrawer;
// Base offsets into the descriptor heap; max() means "not allocated yet".
uint32_t _inputSrvBaseOffset = std::numeric_limits<uint32_t>::max();
uint32_t _outputUavBaseOffset = std::numeric_limits<uint32_t>::max();
uint32_t _outputSrvBaseOffset = std::numeric_limits<uint32_t>::max();
// True when the color kind is anything other than StandardDynamicRange;
// selects the texture view formats.
bool _isScRGB = false;
};
}

Some files were not shown because too many files have changed in this diff Show more