Merge pull request #643 from Blinue/render-system

新渲染系统
This commit is contained in:
Xu 2024-04-03 19:51:22 +08:00 committed by GitHub
commit 65e5bd8331
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
367 changed files with 104653 additions and 12644 deletions

View file

@ -9,6 +9,9 @@ on:
jobs:
build:
runs-on: windows-latest
strategy:
matrix:
platform: ["x64", "ARM64"]
steps:
- uses: actions/checkout@v4
@ -24,10 +27,10 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.conan2/p
key: ${{ runner.os }}-conan-${{ hashFiles('src/**/conanfile.txt') }}
key: Conan-${{ hashFiles('src/**/conanfile.txt') }}-${{ matrix.platform }}
- name: Build
run: python publish.py
run: python publish.py ${{ matrix.platform }}
- name: Save hash
id: hash
@ -36,5 +39,5 @@ jobs:
- name: Store build
uses: actions/upload-artifact@v4
with:
name: Magpie-dev-${{ steps.hash.outputs.sha_short }}
path: ./publish
name: Magpie-dev-${{ steps.hash.outputs.sha_short }}-${{ matrix.platform }}
path: ./publish/${{ matrix.platform }}

View file

@ -24,9 +24,13 @@ on:
required: true
type: boolean
jobs:
release:
build:
runs-on: windows-latest
outputs:
tag: ${{ steps.tag.outputs.tag }}
strategy:
matrix:
platform: ["x64", "ARM64"]
steps:
- uses: actions/checkout@v4
@ -42,13 +46,50 @@ jobs:
with:
path: ~/.conan2/p
key: ${{ runner.os }}-conan-${{ hashFiles('src/**/conanfile.txt') }}
- name: Generate tag
id: tag
run: |
$tag = "${{ inputs.tag }}" -eq "" ? "v${{ inputs.major }}.${{ inputs.minor }}.${{ inputs.patch }}" : "${{ inputs.tag }}"
echo "tag=$tag" >> $env:GITHUB_OUTPUT
- name: Publish release
run: python publish.py
- name: Build
run: python publish.py ${{ matrix.platform }}
env:
MAJOR: ${{ inputs.major }}
MINOR: ${{ inputs.minor }}
PATCH: ${{ inputs.patch }}
TAG: ${{ inputs.tag }}
PRERELEASE: ${{ inputs.prerelease }}
ACCESS_TOKEN: ${{ secrets.CONTENTS_ACCESS_TOKEN }}
TAG: ${{ steps.tag.outputs.tag }}
- name: Store artifacts
uses: actions/upload-artifact@v4
with:
name: Magpie-${{ steps.tag.outputs.tag }}-${{ matrix.platform }}
path: publish/${{ matrix.platform }}
release:
runs-on: windows-latest
needs: build
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Setup Requests
run: pip install requests
- name: Restore artifacts
uses: actions/download-artifact@v4
with:
path: publish
- name: Publish release
run: python ci/release.py
env:
MAJOR: ${{ inputs.major }}
MINOR: ${{ inputs.minor }}
PATCH: ${{ inputs.patch }}
TAG: ${{ needs.build.outputs.tag }}
PRERELEASE: ${{ inputs.prerelease }}
ACCESS_TOKEN: ${{ secrets.CONTENTS_ACCESS_TOKEN }}

View file

@ -24,7 +24,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
src\Common.Pre.props = src\Common.Pre.props
Directory.Build.props = Directory.Build.props
src\extract_winui_runtime.py = src\extract_winui_runtime.py
src\fix_resfiles.py = src\fix_resfiles.py
src\HybridCRT.props = src\HybridCRT.props
src\WinUI.props = src\WinUI.props
EndProjectSection

158
ci/release.py Normal file
View file

@ -0,0 +1,158 @@
import sys
import os
import subprocess
import shutil
import requests
import hashlib
import json
# GitHub Actions sets GITHUB_ACTIONS=true on its runners, see
# https://docs.github.com/en/actions/learn-github-actions/variables
if os.environ.get("GITHUB_ACTIONS", "").lower() == "true":
    # For unknown reasons the default stream encoding is ANSI when running
    # in GitHub Actions, so switch both streams to UTF-8. (print() calls in
    # this script additionally pass flush=True so output shows up in order.)
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.reconfigure(encoding="utf-8")
        except (AttributeError, ValueError, OSError):
            # Best effort only: a stream that cannot be reconfigured
            # (e.g. replaced by a non-TextIOWrapper object) is left as is.
            pass
# Release parameters supplied by the workflow through environment variables.
# A missing variable raises KeyError and aborts the release immediately.
majorVersion = os.environ["MAJOR"]
minorVersion = os.environ["MINOR"]
patchVersion = os.environ["PATCH"]
tag = os.environ["TAG"]
isPrerelease = os.environ["PRERELEASE"].lower() == "true"
githubAccessToken = os.environ["ACCESS_TOKEN"]
repo = os.environ["GITHUB_REPOSITORY"]
actor = os.environ["GITHUB_ACTOR"]

# Commands are passed as argument lists rather than concatenated strings so
# that a tag or actor name containing spaces or quotes cannot be split into
# extra git arguments.
subprocess.run(["git", "config", "user.name", actor])
subprocess.run(["git", "config", "user.email", f"{actor}@users.noreply.github.com"])
# NOTE(review): the access token is embedded in the remote URL; it ends up in
# the local git config of the runner's checkout.
subprocess.run(
    [
        "git",
        "remote",
        "set-url",
        "origin",
        f"https://{githubAccessToken}@github.com/{repo}.git",
    ]
)

# Create the annotated tag and push it
if subprocess.run(["git", "tag", "-a", tag, "-m", tag]).returncode != 0:
    raise Exception("打标签失败")

if subprocess.run(["git", "push", "origin", tag]).returncode != 0:
    raise Exception("推送标签失败")

print("已创建标签 " + tag, flush=True)
# Common headers for every GitHub REST API call made by this script.
headers = {
    "Accept": "application/vnd.github+json",
    "Authorization": "Bearer " + githubAccessToken,
    "X-GitHub-Api-Version": "2022-11-28",
}

# Look up the previous release so default release notes can be generated.
# prevReleaseTag stays None when no previous release can be determined.
prevReleaseTag = None
try:
    if isPrerelease:
        # A pre-release is compared with the most recent release of any kind
        # (stable or pre-release). per_page is a query parameter, so it must
        # go through `params`, not a JSON request body.
        response = requests.get(
            f"https://api.github.com/repos/{repo}/releases",
            params={"per_page": 1},
            headers=headers,
        )
        if response.ok:
            prevReleaseTag = response.json()[0]["tag_name"]
    else:
        # A stable release is compared with the latest stable release.
        # Because the "latest" release can be chosen manually, this endpoint
        # may not return the chronologically newest one; not a big problem.
        response = requests.get(
            f"https://api.github.com/repos/{repo}/releases/latest", headers=headers
        )
        if response.ok:
            prevReleaseTag = response.json()["tag_name"]
except (requests.RequestException, ValueError, LookupError, TypeError):
    # Best effort: network errors, malformed JSON or an empty release list
    # only mean the release is published without default notes.
    pass
# Publish the release.
if prevReleaseTag is None:
    body = ""
else:
    # The default release notes are a link comparing the two tags
    body = f"https://github.com/{repo}/compare/{prevReleaseTag}...{tag}"

response = requests.post(
    f"https://api.github.com/repos/{repo}/releases",
    json={
        "tag_name": tag,
        "name": tag,
        "prerelease": isPrerelease,
        "body": body,
        "discussion_category_name": "Announcements",
    },
    headers=headers,
)
if not response.ok:
    raise Exception("发布失败")

# upload_url is a URI template (it carries a "{...}" suffix). Keep only the
# part before the template expression and append the query prefix; the asset
# file name is appended by the upload step. split() leaves the URL intact if
# no "{" is present, whereas slicing with find() == -1 would drop a character.
uploadUrl = response.json()["upload_url"]
uploadUrl = uploadUrl.split("{", 1)[0] + "?name="
# Work from the "publish" directory next to this script's parent directory;
# the per-platform build outputs are expected there, one folder per platform.
os.chdir(os.path.dirname(__file__) + "\\..\\publish")

# platform -> (zip file name, MD5 hex digest of the zip)
pkgInfos = {}
for platform in ["x64", "ARM64"]:
    # Zip the folder "Magpie-<tag>-<platform>" into a file of the same name
    artifactDir = "Magpie-" + tag + "-" + platform
    shutil.make_archive(artifactDir, "zip", artifactDir)
    zipName = artifactDir + ".zip"

    with open(zipName, "rb") as zipFile:
        # Upload the asset. Passing the file object streams the upload:
        # https://requests.readthedocs.io/en/latest/user/advanced/#streaming-uploads
        uploadHeaders = {**headers, "Content-Type": "application/zip"}
        response = requests.post(
            uploadUrl + zipName,
            data=zipFile,
            headers=uploadHeaders,
        )
        if not response.ok:
            raise Exception("上传失败")

        # The upload consumed the file; rewind before hashing it
        zipFile.seek(0, os.SEEK_SET)
        digest = hashlib.file_digest(zipFile, hashlib.md5).hexdigest()

    pkgInfos[platform] = (zipName, digest)

print("已发布 " + tag, flush=True)
# Update version.json.
# This must happen after the release is published, because the application
# uses version.json to check for updates.
os.chdir("..")

# One entry per uploaded package, derived from pkgInfos so this stays in sync
# with the platforms handled by the upload step.
binaries = {
    platform: {
        "url": f"https://github.com/{repo}/releases/download/{tag}/{pkgName}",
        "hash": md5,
    }
    for platform, (pkgName, md5) in pkgInfos.items()
}

with open("version.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "version": f"{majorVersion}.{minorVersion}.{patchVersion}",
            "tag": tag,
            "binary": binaries,
        },
        f,
        indent=4,
    )

# Commit and push the change to version.json
if subprocess.run(["git", "add", "version.json"]).returncode != 0:
    raise Exception("git add 失败")

if subprocess.run(["git", "commit", "-m", "Update version.json"]).returncode != 0:
    raise Exception("git commit 失败")

if subprocess.run(["git", "push"]).returncode != 0:
    raise Exception("git push 失败")

View file

@ -18,7 +18,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
* Parameter:
* Strength: Denoise magnitude
* Anime4K_Restore_S, Anime4K_Restore_M, Anime4K_Restore_L, Anime4K_Restore_VL, Anime4K_Restore_UL, Anime4K_Restore_Soft_S, Anime4K_Restore_Soft_M, Anime4K_Restore_Soft_L, Anime4K_Restore_Soft_VL, Anime4K_Restore_Soft_UL: Algorithms to restore the lines in animations. In increasing order of demand for computing power. The Soft variants are more conservative in sharpening.
* Anime4K_Restore family: Algorithms to restore the lines in animations. In increasing order of demand for computing power. The Soft variants are more conservative in sharpening.
* Output size: the same as the input
* Anime4K_Thin_HQ: Algorithm to clarify lines in animations provided by Anime4K.
@ -27,7 +27,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
* Strength: The strength in each iteration.
* Iterations: The number of iterations. Decreasing strength and increasing iterations improves the quality of the images, but will lower the processing speed.
* Anime4K_Upscale_S, Anime4K_Upscale_L, Anime4K_Upscale_Denoise_S, Anime4K_Upscale_Denoise_L, and Anime4K_Upscale_GAN_x2_S: Anime-style scaling algorithms provided by Anime4K. The denoise variant includes denoise functionality. The GAN variant, which keeps more details, is still under experiment.
* Anime4K_Upscale family: Anime-style scaling algorithms provided by Anime4K. The denoise variant includes denoise functionality. The GAN variant, which keeps more details, is still under experiment.
* Output size: twice that of the input
* Bicubic: Interpolation algorithms. The lite variant is fast, but at the cost of quality degradation. Suitable for users with weak graphics cards.
@ -124,6 +124,9 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
* Bloom Amount
* Filter Kernel Shape
* CuNNy family: Suitable for visual novel-style images. The DS variants offer a subtle denoise effect. Provided by [CuNNy](https://github.com/cunnyplapper/CuNNy)
* Output size: twice that of the input
* Deband
* Output size: the same as the input
* Parameters
@ -221,7 +224,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
* Sharpness
* Note: Only supports upscaling.
* NNEDI3_nns16_win8x4 and NNEDI3_nns64_win8x6These shaders originally designed for deinterlacing and are also high-quality interpolation algorithms. NNEDI3_nns64_win8x6 produces higher quality results, but slower.
* NNEDI3 family: These shaders were originally designed for deinterlacing and are also high-quality interpolation algorithms. NNEDI3_nns64_win8x6 produces higher quality results, but is slower.
* Output size: twice that of the input
* NVSharpen: Port of NVSharpen that was published along with NIS.
@ -232,10 +235,10 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
* Pixellate: Scale with the Pixellate algorithm. Suitable for upscaling pixel arts.
* Output size: determined by scale configuration
* RAVU_Lite_R3: Port of ravu-lite-r3
* RAVU family: Ported from https://github.com/bjin/mpv-prescalers
* Output size: twice that of the input
* RAVU_Zoom_R3: Port of ravu-zoom-r3
* RAVU_Zoom family: Ported from https://github.com/bjin/mpv-prescalers
* Output size: determined by scale configuration
* Note: Only supports upscaling.

View file

@ -1,4 +1,4 @@
Magpie provides several capture methods. They have their pros and cons in different scenarios.
Magpie provides several capture methods. They have their pros and cons in different scenarios. For general purposes, it's recommended to use Graphics Capture, as it provides the best compatibility and smoothness.
| | Graphics Capture | Desktop Duplication | GDI | DwmSharedSurface |
| :---: | :---: | :---: | :---: |:---: |
@ -6,11 +6,9 @@ Magpie provides several capture methods. They have their pros and cons in differ
| Supports recording/streaming | No under extreme conditions<sup>[1]</sup> | No | Yes | Yes |
| Support the source window to span multiple screens | No under extreme conditions<sup>[1]</sup> | No | Yes | Yes |
| Ignores DPI virtualization<sup>[2]</sup> | No | No | Yes| Yes |
| Notes | The most recommended capture method | Requires Win10 v2004, suitable for games with more static frames<sup>[3]</sup>, could capture pop-ups | | Low VRAM usage |
| Notes | The most recommended capture method | Requires Win10 v2004 | | Low VRAM usage |
[1]: (1) The source window does not support regular window capture. (2) The operating system is Windows 11.
[2]: The system performs bicubic interpolation upscaling on windows that do not support DPI scaling. Capture methods that support this option capture the image before such scaling.
[3]: The Desktop Duplication mode effectively reduces the power consumption if there are many static frames.

View file

@ -2,23 +2,12 @@ MagpieFX is based on DirectX 11 compute shader
``` hlsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
// Specify "USE_DYNAMIC" to use GetFrameCount or GetCursorPos.
//!VERSION 4
// Specify "USE_DYNAMIC" to use GetFrameCount.
//!USE_DYNAMIC
// Specifying "GENERIC_DOWNSCALER" indicates that this effect can be used as the "default downscaling effect".
//!GENERIC_DOWNSCALER
// Use "SORT_NAME" to specify the name used for sorting, otherwise the files will be sorted by their file names.
//!SORT_NAME test1
// Not specifying "OUTPUT_WIDTH" and "OUTPUT_HEIGHT" indicates that this effect supports outputting to any size.
// You can use some pre-defined constants when calculating texture size.
// INPUT_WIDTH
// INPUT_HEIGHT
// OUTPUT_WIDTH
// OUTPUT_HEIGHT
// Definition of parameters
//!PARAMETER
@ -33,13 +22,25 @@ float sharpness;
// Definition of textures
// "INPUT" is a special keyword.
// "INPUT" cannot be used as the output of a pass.
// Defining INPUT is optional, but it is recommended to define it explicitly for the sake of semantic completeness.
// "INPUT" and "OUTPUT" are special keywords.
// "INPUT" cannot be used as the output of a pass; "OUTPUT" cannot be used as the input of a pass.
// Defining INPUT/OUTPUT is optional, but it is recommended to define them explicitly for the sake of semantic completeness.
// The size of the OUTPUT represents the output size of this effect. Not specifying it indicates support for output of any size.
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// You can use some pre-defined constants to calculate texture size.
// INPUT_WIDTH
// INPUT_HEIGHT
// OUTPUT_WIDTH
// OUTPUT_HEIGHT
// Supported texture formats:
// R32G32B32A32_FLOAT
// R16G16B16A16_FLOAT
@ -110,11 +111,10 @@ float4 Pass1(float2 pos) {
return float4(1, 1, 1, 1);
}
// The last pass does not support "OUT".
// If you are using the CS style, you must use "WriteToOutput" to output the result.
//!PASS 2
//!IN INPUT, tex1
// The output of the last pass must be "OUTPUT".
//!OUT OUTPUT
// "BLOCK_SIZE" specifies how large an area is processed in one dispatch.
// "BLOCK_SIZE" may be given as a single value, which is then used for both width and height.
//!BLOCK_SIZE 16, 16
@ -123,18 +123,13 @@ float4 Pass1(float2 pos) {
//!NUM_THREADS 64, 1, 1
void Pass2(uint2 blockStart, uint3 threadId) {
// Render the cursor and then output.
// Available only in the last pass.
WriteToOutput(blockStart, float3(1,1,1));
// Write to OUTPUT
OUTPUT[blockStart] = float4(1,1,1,1);
}
```
### Predefined functions
**void WriteToOutput(uint2 pos, float3 color)**: Only available in the last pass and is used to write results to the output texture.
**bool CheckViewport(uint2 pos)**: Only available in the last pass and is used to check whether the output coordinates are inside the viewport.
**uint2 GetInputSize()**: Retrieves the size of the input texture.
**float2 GetInputPt()**: Retrieves the size of pixel in the input texture.
@ -147,8 +142,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
**uint GetFrameCount()**: Retrieves the total number of frames rendered so far. When using this function, you must specify USE_DYNAMIC.
**uint2 GetCursorPos()**: Retrieves the current cursor position. When using this function, you must specify USE_DYNAMIC.
**uint2 Rmp8x8(uint id)**: Maps the values of 0 to 63 to coordinates in an 8x8 square in swizzle order, which can improve texture cache hit rate.
@ -164,10 +157,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
**MP_DEBUG**: Whether the shader is being compiled in debug mode (when compiling shaders in debug mode, they are not optimized and contain debug information).
**MP_LAST_PASS**: Whether the current pass is the last pass of the effect.
**MP_LAST_EFFECT**: Whether the effect is the last effect for the current scaling mode (the last effect needs to handle viewport and cursor rendering).
**MP_FP16**: Whether to use half-precision floating-point numbers (specified by user).
**MF、MF1、MF2、...、MF4x4**: Floating-point data types that conform to MP_FP16. When half-precision is not specified, they are aliases for float..., otherwise they are aliases for min16float...

View file

@ -2,23 +2,12 @@ MagpieFX 基于 DirectX 11 计算着色器
``` hlsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
// 若要使用 GetFrameCount 或 GetCursorPos 需指定 USE_DYNAMIC
//!VERSION 4
// 若要使用 GetFrameCount 需指定 USE_DYNAMIC
//!USE_DYNAMIC
// GENERIC_DOWNSCALER 表示此效果可以作为“默认降采样效果”
//!GENERIC_DOWNSCALER
// 使用 SORT_NAME 指定排序时使用的名字,否则按照文件名排序
//!SORT_NAME test1
// 不指定 OUTPUT_WIDTH 和 OUTPUT_HEIGHT 表示此效果支持输出任意尺寸
// 计算纹理尺寸时可以使用一些预定义常量
// INPUT_WIDTH
// INPUT_HEIGHT
// OUTPUT_WIDTH
// OUTPUT_HEIGHT
// 参数定义
//!PARAMETER
@ -33,13 +22,25 @@ float sharpness;
// 纹理定义
// INPUT 是特殊关键字
// INPUT 不能作为通道的输出
// 定义 INPUT 是可选的,但为了保持语义的完整性,建议显式定义
// INPUT、OUTPUT 是特殊关键字
// INPUT 不能作为通道的输出；OUTPUT 不能作为通道的输入
// 定义 INPUT 和 OUTPUT 是可选的,但为了保持语义的完整性,建议显式定义
// OUTPUT 的尺寸即为此效果的输出尺寸,不指定则表示支持任意尺寸的输出
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// 计算纹理尺寸时可以使用一些预定义常量
// INPUT_WIDTH
// INPUT_HEIGHT
// OUTPUT_WIDTH
// OUTPUT_HEIGHT
// 支持的纹理格式:
// R32G32B32A32_FLOAT
// R16G16B16A16_FLOAT
@ -109,11 +110,10 @@ float4 Pass1(float2 pos) {
return float4(1, 1, 1, 1);
}
// 最后一个通道不能指定 OUT
// 如果是 CS 风格必须使用 WriteToOutput 输出结果
//!PASS 2
//!IN INPUT, tex1
// 最后一个通道的输出只能是 OUTPUT
//!OUT OUTPUT
// BLOCK_SIZE 指定一次 dispatch 处理多大的区域
// 可以只有一维,即同时指定长和高
//!BLOCK_SIZE 16, 16
@ -122,18 +122,13 @@ float4 Pass1(float2 pos) {
//!NUM_THREADS 64, 1, 1
void Pass2(uint2 blockStart, uint3 threadId) {
// 渲染光标并写入 OUPUT
// 只在最后一个通道中可用
WriteToOutput(blockStart, float3(1,1,1));
// 写入 OUTPUT
OUTPUT[blockStart] = float4(1,1,1,1);
}
```
### 预定义函数
**void WriteToOutput(uint2 pos, float3 color)**只在最后一个通道Pass中可用用于将结果写入到输出纹理。
**bool CheckViewport(uint2 pos)**:只在最后一个通道中可用,检查输出坐标是否位于视口内。
**uint2 GetInputSize()**:获取输入纹理尺寸。
**float2 GetInputPt()**:获取输入纹理每个像素的尺寸。
@ -146,8 +141,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
**uint GetFrameCount()**:获取当前总计帧数。使用此函数时必须指定 "USE_DYNAMIC"。
**uint2 GetCursorPos()**:获取当前光标位置。使用此函数时必须指定 "USE_DYNAMIC"。
**uint2 Rmp8x8(uint id)**:将 0~63 的值以 swizzle 顺序映射到 8x8 的正方形内的坐标,用以提高纹理缓存的命中率。
@ -163,10 +156,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
**MP_DEBUG**:当前是否为调试模式(调试模式下编译的着色器不进行优化且含有调试信息)
**MP_LAST_PASS**:当前通道是否是当前效果的最后一个通道
**MP_LAST_EFFECT**:当前效果是否是当前缩放模式的最后一个效果(最后一个效果要处理视口和光标渲染)
**MP_FP16**:当前是否使用半精度浮点数(由用户指定)
**MF、MF1、MF2、...、MF4x4**:遵守 fp16 参数的浮点数类型。当未指定 fp16它们为 float... 的别名,否则为 min16float... 的别名

View file

@ -8,8 +8,6 @@ If you cannot run some effects with high computing power requirements (e.g. Anim
1. Change to the variants with lower requirements. For example, Anime4K_Upscale_S is much faster than Anime4K_Upscale_L. CAS is much faster than AdaptiveSharpen. They can effectively improve the smoothness of the effects at the cost of some quality degradation.
2. Change the capture mode. We recommend you to try each of them.
3. Set the frame rate to "unlimited." This will turn off Vsync. It usually increases the frame rate substantially, but may cause the screen to tear.
4. Turn on "allow additional latency to improve performance" when Vsync is on. This will not lead to screen tearing and it also raises the frame rate. However, it will cause an extra 1-frame latency.
## Intermittent lagging
@ -25,6 +23,5 @@ If your graphics card is powerful enough, but you are still experiencing lagging
When you need to save electricity or reduce the heat generated, try the following:
1. Change the capture more. The Desktop Duplication capture mode effectively reduces the power consumption if there are a lot of static frames in the game.
2. Change the effects to their variants with lower requirements.
3. Limit the frame rate, which may cause screen tearing.
1. Limit the frame rate.
2. Opt for effects that require lower performance.

View file

@ -18,7 +18,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
* 参数
* Strength降噪强度
* Anime4K_Restore_S、Anime4K_Restore_M、Anime4K_Restore_L、Anime4K_Restore_VL、Anime4K_Restore_UL、Anime4K_Restore_Soft_S、Anime4K_Restore_Soft_M、Anime4K_Restore_Soft_L、Anime4K_Restore_Soft_VL 和 Anime4K_Restore_Soft_ULAnime4K 提供的用于还原动漫画面线条的算法S->M->L->VL->UL 对性能的需求依次提高Soft 变体效果稍弱
* Anime4K_Restore：Anime4K 提供的用于还原动漫画面线条的算法，S->M->L->VL->UL 对性能的需求依次提高，Soft 变体效果稍弱
* 输出尺寸:和输入相同
* Anime4K_Thin_HQAnime4K 提供的用于细化动漫画面线条的算法
@ -27,7 +27,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
* Strength每次迭代的强度
* Iterations迭代次数。降低 Strength 并提高 Iterations 可以提高画面质量,但会降低速度。
* Anime4K_Upscale_S、Anime4K_Upscale_L、Anime4K_Upscale_VL、Anime4K_Upscale_UL、Anime4K_Upscale_Denoise_S、Anime4K_Upscale_Denoise_L、Anime4K_Upscale_Denoise_VL、Anime4K_Upscale_Denoise_UL 和 Anime4K_Upscale_GAN_x2_SAnime4K 提供的动画风格图像缩放算法。Denoise 变体包含降噪效果GAN 变体处于实验阶段可以保留更多细节。S、L、VL、UL 对性能的要求依次提高
* Anime4K_Upscale：Anime4K 提供的动画风格图像缩放算法。Denoise 变体包含降噪效果，GAN 变体处于实验阶段可以保留更多细节。S、L、VL、UL 对性能的要求依次提高
* 输出尺寸:输入的两倍
* Bicubic双立方双三次插值算法
@ -124,6 +124,9 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
* Bloom Amount
* Filter Kernel Shape
* CuNNy 族:适合视觉小说风格图像的缩放,由 [CuNNy](https://github.com/cunnyplapper/CuNNy) 提供。DS 变体有轻微降噪效果
* 输出尺寸:输入的两倍
* Deband去除色带
* 输出尺寸:和输入相同
* 参数
@ -221,7 +224,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
* Sharpness锐化强度
* 备注:只支持放大
* NNEDI3_nns16_win8x4 和 NNEDI3_nns64_win8x6原本用于去隔行也是高质量的插值算法。NNEDI3_nns64_win8x6 质量更高,速度更慢
* NNEDI3 族:原本用于去隔行,也是高质量的插值算法。移植自 https://github.com/bjin/mpv-prescalers
* 输出尺寸:输入的两倍
* NVSharpen随 NIS 发布的 NVSharpen 的移植
@ -232,10 +235,10 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
* Pixellate使用 Pixellate 算法缩放输入。适合放大像素画
* 输出尺寸:取决于缩放选项
* RAVU_Lite_R3ravu-lite-r3的移植
* RAVU 族:移植自 https://github.com/bjin/mpv-prescalers
* 输出尺寸:输入的两倍
* RAVU_Zoom_R3ravu-zoom-r3的移植
* RAVU-Zoom 族:移植自 https://github.com/bjin/mpv-prescalers
* 输出尺寸:取决于缩放选项
* 备注:只支持放大

View file

@ -8,8 +8,6 @@
1. 更换为性能需求更低的效果。如 Anime4K_Upscale_S 比 Anime4K_Upscale_L 快的多CAS 比 AdaptiveSharpen 快的多,它们可以有效提高流畅度,代价是一定程度的画面质量损失。
2. 尝试更换捕获模式。建议你每种模式都尝试一下。
3. 关闭垂直同步。这通常可以大幅提高帧率,但可能造成画面撕裂。
4. 开启“垂直同步”并“允许额外的延迟以提高性能”。这个配置不会造成画面撕裂,同时也可以有效提高帧率。缺点是会引入一帧的延迟。
## 间歇性卡顿
@ -25,5 +23,5 @@
在需要节省电量或降低发热时,请尝试下面的操作:
1. 更换捕获模式。如果游戏的静止画面较多Desktop Duplication 捕获模式可以有效降低功耗
1. 限制帧率
2. 更换为性能需求更低的效果。

View file

@ -1,4 +1,4 @@
Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。
Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。无特殊需求应使用 Graphics Capture它提供最好的兼容性和流畅度。
| | Graphics Capture | Desktop Duplication | GDI | DwmSharedSurface |
| :---: | :---: | :---: | :---: |:---: |
@ -6,11 +6,9 @@ Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。
| 支持录制/串流 | 特殊情况下不支持<sup>[1]</sup> | 否 | 是 | 是 |
| 支持源窗口跨越多个屏幕 | 特殊情况下不支持<sup>[1]</sup> | 否 | 是 | 是 |
| 无视 DPI 虚拟化<sup>[2]</sup> | 否 | 否 | 是| 是 |
| 备注 | 首选捕获方式 | 要求 Win10 v2004;适合静止帧较多的游戏<sup>[3]</sup>;可以捕获到弹窗 | | 占用的显存较少 |
| 备注 | 首选捕获方式 | 要求 Win10 v2004 | | 占用的显存较少 |
[1]: (1) 源窗口不支持常规的窗口捕获 (2) 操作系统为 Windows 11
[2]: 系统会对不支持 DPI 缩放的窗口进行双三次插值放大,支持此项的捕获方式可以捕获到放大前的图像
[3]: 如果窗口的静止帧较多,使用 Desktop Duplication 可以有效降低功耗

View file

@ -18,31 +18,18 @@ try:
except:
pass
platform = "x64"
if len(sys.argv) == 2:
platform = sys.argv[1]
if not platform in ["x64", "ARM64"]:
raise Exception("非法参数")
if majorVersion != None:
import re
import hashlib
import json
# 使用第三方库 requests 发送 HTTP 请求,它是 Conan 的依赖项,无需单独安装
import requests
minorVersion = os.environ["MINOR"]
patchVersion = os.environ["PATCH"]
tag = ""
try:
tag = os.environ["TAG"]
except:
pass
if tag == "":
tag = f"v{majorVersion}.{minorVersion}.{patchVersion}"
isPrerelease = os.environ["PRERELEASE"].lower() == "true"
githubAccessToken = os.environ["ACCESS_TOKEN"]
repo = os.environ["GITHUB_REPOSITORY"]
actor = os.environ["GITHUB_ACTOR"]
tag = os.environ["TAG"]
#####################################################################
#
@ -111,7 +98,7 @@ else:
version_props = ""
p = subprocess.run(
f'"{msbuildPath}" -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform=x64;OutDir={os.getcwd()}\\publish\\;CommitId={commit_id}{version_props} Magpie.sln'
f'"{msbuildPath}" -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={platform};OutDir={os.getcwd()}\\publish\\{platform}\\;CommitId={commit_id}{version_props} Magpie.sln'
)
if p.returncode != 0:
raise Exception("编译失败")
@ -122,7 +109,7 @@ if p.returncode != 0:
#
#####################################################################
os.chdir("publish")
os.chdir("publish\\" + platform)
# 删除文件,忽略错误
@ -133,10 +120,9 @@ def remove_file(file):
pass
for folder in ["Microsoft.UI.Xaml", "Magpie.App"]:
shutil.rmtree(folder, ignore_errors=True)
shutil.rmtree("Microsoft.UI.Xaml", ignore_errors=True)
for pattern in ["*.pdb", "*.lib", "*.exp", "*.winmd", "*.xml", "*.xbf", "dummy.*"]:
for pattern in ["*.pdb", "*.lib", "*.exp", "*.winmd", "*.xml", "*.xbf"]:
for file in glob.glob(pattern):
remove_file(file)
@ -225,138 +211,3 @@ os.remove("resources.pri.xml")
os.remove("priconfig.xml")
print("已修剪 resources.pri", flush=True)
#####################################################################
#
# 发布
#
#####################################################################
if majorVersion != None:
os.chdir("..")
subprocess.run("git config user.name " + actor)
subprocess.run(f"git config user.email {actor}@users.noreply.github.com")
subprocess.run(
f"git remote set-url origin https://{githubAccessToken}@github.com/{repo}.git"
)
# 打标签
if subprocess.run(f"git tag -a {tag} -m {tag}").returncode != 0:
raise Exception("打标签失败")
if subprocess.run("git push origin " + tag).returncode != 0:
raise Exception("推送标签失败")
print("已创建标签 " + tag, flush=True)
# 打包成 zip
pkgName = "Magpie-" + tag + "-x64"
shutil.make_archive(pkgName, "zip", "publish")
pkgName += ".zip"
headers = {
"Accept": "application/vnd.github+json",
"Authorization": "Bearer " + githubAccessToken,
"X-GitHub-Api-Version": "2022-11-28",
}
# 获取前一个发布版本来生成默认发行说明
prevReleaseTag = None
try:
if isPrerelease:
# 发布预发行版与最新的版本(无论是正式版还是预发行版)对比
response = requests.get(
f"https://api.github.com/repos/{repo}/releases",
json={
"per_page": 1
},
headers=headers
)
if response.ok:
prevReleaseTag = response.json()[0]["tag_name"]
else:
# 发布正式版则与最新的正式版对比
# 由于可以自己选择最新版本,此接口可能不会返回时间上最新发布的版本,不是大问题
response = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", headers=headers)
if response.ok:
prevReleaseTag = response.json()["tag_name"]
except:
# 忽略错误
pass
# 发布 release
if prevReleaseTag == None:
body = ""
else:
# 默认发行说明为比较两个 tag
body = f"https://github.com/{repo}/compare/{prevReleaseTag}...{tag}"
response = requests.post(
f"https://api.github.com/repos/{repo}/releases",
json={
"tag_name": tag,
"name": tag,
"prerelease": isPrerelease,
"body": body,
"discussion_category_name": "Announcements",
},
headers=headers,
)
if not response.ok:
raise Exception("发布失败")
upload_url = response.json()["upload_url"]
upload_url = upload_url[: upload_url.find("{")] + "?name=" + pkgName
# 上传资产
with open(pkgName, "rb") as f:
# 流式上传
# https://requests.readthedocs.io/en/latest/user/advanced/#streaming-uploads
response = requests.post(
upload_url,
data=f,
headers={**headers, "Content-Type": "application/zip"},
)
if not response.ok:
raise Exception("上传失败")
# 计算哈希
f.seek(0, os.SEEK_SET)
md5 = hashlib.file_digest(f, hashlib.md5).hexdigest()
print("已发布 " + tag, flush=True)
# 丢弃当前修改并更新到最新,防止编译时有新的提交
subprocess.run("git checkout -f")
subprocess.run("git pull")
# 更新 version.json
# 此步应在发布版本之后,因为程序使用 version.json 检查更新
with open("version.json", "w", encoding="utf-8") as f:
json.dump(
{
"version": f"{majorVersion}.{minorVersion}.{patchVersion}",
"tag": tag,
"binary": {
"x64": {
"url": f"https://github.com/{repo}/releases/download/{tag}/{pkgName}",
"hash": md5,
}
},
},
f,
indent=4,
)
# 提交对 version.json 的更改
if subprocess.run("git add version.json").returncode != 0:
raise Exception("git add 失败")
if subprocess.run('git commit -m "Update version.json"').returncode != 0:
raise Exception("git commit 失败")
if subprocess.run("git push").returncode != 0:
raise Exception("git push 失败")

View file

@ -2,6 +2,7 @@
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<DefaultLanguage>en-US</DefaultLanguage>
<CppWinRTFastAbi>true</CppWinRTFastAbi>
<CppWinRTOptimized>true</CppWinRTOptimized>
<CppWinRTRootNamespaceAutoMerge>true</CppWinRTRootNamespaceAutoMerge>
<CppWinRTVerbosity>low</CppWinRTVerbosity>

View file

@ -3,14 +3,17 @@
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -3741,6 +3744,7 @@ void Pass8(uint2 blockStart, uint3 threadId) {
//!PASS 9
//!DESC L9, L10
//!IN INPUT, tex3, tex4
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -3978,8 +3982,9 @@ const static float3x3 yuv2rgb = {
void Pass9(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -4277,12 +4282,6 @@ void Pass9(uint2 blockStart, uint3 threadId) {
for (uint j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(destPos)) {
continue;
}
}
uint index = j * 2 + i;
float luma = clamp(
target1.x * kernelsL10[0 + index] +
@ -4295,7 +4294,7 @@ void Pass9(uint2 blockStart, uint3 threadId) {
target2.w * kernelsL10[28 + index], 0.0f, 1.0f);
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
WriteToOutput(destPos, mul(yuv2rgb, float3(luma, originUV)));
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(luma, originUV)), 1);
}
}
}

View file

@ -2,15 +2,18 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_3DGraphics_AA_Upscale_x2_US.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_3D_Upscale_1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -175,13 +178,15 @@ void Pass2(uint2 blockStart, uint3 threadId) {
//!PASS 3
//!DESC Conv-4x3x3x4, Depth-to-Space
//!IN INPUT, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass3(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -221,24 +226,19 @@ void Pass3(uint2 blockStart, uint3 threadId) {
result += float4(-3.1127936e-05, 3.3726166e-05, 4.8580805e-05, -9.541029e-06);
pos -= 0.5f * outputPt;
WriteToOutput(gxy, result.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(result.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,15 +2,18 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_3DGraphics_Upscale_x2_US.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_3D_Upscale_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -176,13 +179,15 @@ void Pass2(uint2 blockStart, uint3 threadId) {
//!PASS 3
//!DESC Conv-4x3x3x4, Depth-to-Space
//!IN INPUT, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass3(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -222,23 +227,18 @@ void Pass3(uint2 blockStart, uint3 threadId) {
result += float4(-0.00016697648, -0.00015957489, 0.00017437353, -0.00019393339);
pos -= 0.5f * outputPt;
WriteToOutput(gxy, result.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(result.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, result.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(result.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -3,9 +3,7 @@
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
@ -19,6 +17,11 @@ float intensitySigma;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -26,6 +29,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -52,7 +56,9 @@ float gaussian(float x, float rcpS, float m) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -93,12 +99,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
for (j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(gxy)) {
continue;
}
}
float3 sum = 0;
float3 n = 0;
@ -118,7 +118,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
}
}
WriteToOutput(destPos, sum / n);
OUTPUT[destPos] = float4(sum / n, 1);
}
}
}

View file

@ -2,9 +2,7 @@
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
@ -18,14 +16,19 @@ float intensitySigma;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
@ -77,7 +80,9 @@ float3 getMedian(float3 v[KERNELLEN], float w[KERNELLEN], float n) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -126,9 +131,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
n += histogram_wn[i];
}
WriteToOutput(gxy, getMedian(histogram_v, histogram_wn, n));
OUTPUT[gxy] = float4(getMedian(histogram_v, histogram_wn, n), 1);
return;
}
WriteToOutput(gxy, getMedian(histogram_v, histogram_w, n));
OUTPUT[gxy] = float4(getMedian(histogram_v, histogram_w, n), 1);
}

View file

@ -3,14 +3,9 @@
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!PARAMETER
//!LABEL Strength
//!DEFAULT 0.1
@ -19,6 +14,14 @@ Texture2D INPUT;
//!STEP 0.01
float intensitySigma;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -26,6 +29,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -52,7 +56,9 @@ float gaussian(float x, float s, float m) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -84,12 +90,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
for (j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(gxy)) {
continue;
}
}
float3 histogram_v[KERNELLEN];
float histogram_l[KERNELLEN];
float histogram_w[KERNELLEN];
@ -132,7 +132,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
}
}
WriteToOutput(destPos, maxv);
OUTPUT[destPos] = float4(maxv, 1);
}
}
}

View file

@ -1,18 +1,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_L.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_2
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -38,6 +37,10 @@ Texture2D tex3;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -602,13 +605,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
//!PASS 5
//!DESC Conv-3x3x3x16
//!IN INPUT, tex3, tex4
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass5(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -638,45 +643,45 @@ void Pass5(uint2 blockStart, uint3 threadId) {
float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);
float4 result = mul(max(a1, 0), float4x4(0.012102164, 0.01385959, 0.018815203, 0.0, -0.017435113, -0.04530735, -0.051318135, 0.0, 0.01267727, 0.01400136, 0.017735276, 0.0, 0.012681183, 0.035241637, 0.03990959, 0.0));
result += mul(max(b1, 0), float4x4(0.16069227, 0.098007366, 0.076831706, 0.0, 0.081593364, 0.017831434, 0.010174303, 0.0, 0.014732323, 0.02229113, 0.029828338, 0.0, 0.0048171813, 0.051809076, 0.055740006, 0.0));
result += mul(max(c1, 0), float4x4(0.0347963, -0.014327445, -0.024176419, 0.0, 0.003463003, -0.050532356, -0.06565927, 0.0, 0.082851514, 0.10950989, 0.12022889, 0.0, -0.038950548, -0.015094648, -0.0119305095, 0.0));
result += mul(max(d1, 0), float4x4(-0.11845135, -0.08067485, -0.06981454, 0.0, 0.00058037776, 0.01160575, 0.014900963, 0.0, -0.0374349, -0.052966926, -0.044557698, 0.0, 0.017439643, 0.005496974, -0.0024181441, 0.0));
result += mul(max(e1, 0), float4x4(-0.1084345, -0.18271221, -0.18795776, 0.0, 0.110637866, 0.08913364, 0.09161146, 0.0, -0.19889367, -0.17172937, -0.1600661, 0.0, -0.03789556, -0.028977778, -0.029903485, 0.0));
result += mul(max(f1, 0), float4x4(0.017774954, -0.048732057, -0.061161697, 0.0, 0.022389695, -0.013317256, -0.019972157, 0.0, 0.051979035, 0.08774837, 0.09633588, 0.0, -0.047462203, -0.033091765, -0.028352588, 0.0));
result += mul(max(g1, 0), float4x4(0.022178177, 0.05031684, 0.05802219, 0.0, -0.027539665, -0.020904189, -0.01800042, 0.0, 0.0019531948, 0.00019749763, -0.0013961957, 0.0, 0.024253767, -0.00058503833, 0.0006474611, 0.0));
result += mul(max(h1, 0), float4x4(0.06707921, 0.0817431, 0.07561426, 0.0, -0.04157211, -0.006174012, -0.003754037, 0.0, 0.0031168605, 0.02320992, 0.026471246, 0.0, 0.0029530525, -0.004939263, -0.0070194793, 0.0));
result += mul(max(i1, 0), float4x4(0.03383418, 0.042321067, 0.04266926, 0.0, -0.043634403, -0.0182769, -0.011314871, 0.0, -0.050008457, -0.003527757, 0.0035165092, 0.0, -0.00016610099, 0.019936454, 0.022199173, 0.0));
result += mul(max(a2, 0), float4x4(-0.055203374, -0.03910439, -0.03778927, 0.0, 0.027640847, 0.019469904, 0.0277834, 0.0, -0.026225597, 0.04481541, 0.047454204, 0.0, 0.031545334, 0.019874612, 0.011878432, 0.0));
result += mul(max(b2, 0), float4x4(0.016088601, -0.045959134, -0.048793618, 0.0, -0.009834776, 0.0077799167, 0.00873151, 0.0, 0.031265914, 0.09698676, 0.10005417, 0.0, 0.039120086, 0.0005542848, -0.0049420255, 0.0));
result += mul(max(c2, 0), float4x4(0.028432969, -0.014792921, -0.026881924, 0.0, -0.00586326, 0.013427183, 0.018215714, 0.0, -0.013559131, 0.017704675, 0.024854776, 0.0, -0.09087544, -0.104627624, -0.0921747, 0.0));
result += mul(max(d2, 0), float4x4(-0.022899037, 0.026374351, 0.03145993, 0.0, -0.008008749, -0.0013132087, -0.003957525, 0.0, -0.02490554, 0.0020362549, 0.006453752, 0.0, 0.031494617, 0.049864545, 0.04702567, 0.0));
result += mul(max(e2, 0), float4x4(-0.12318068, -0.121377476, -0.11615006, 0.0, -0.1321696, -0.078085914, -0.07868927, 0.0, -0.072339885, 0.0012095685, 0.010923645, 0.0, 0.10844834, 0.10038668, 0.09919817, 0.0));
result += mul(max(f2, 0), float4x4(0.058991943, 0.018824834, 0.01659209, 0.0, -0.041878223, 0.013176531, 0.023566704, 0.0, -0.010507848, 0.02042605, 0.028884022, 0.0, -0.1193022, -0.10676289, -0.096668206, 0.0));
result += mul(max(g2, 0), float4x4(0.023510003, 0.06057355, 0.052194174, 0.0, 0.02304783, 0.031745855, 0.025863871, 0.0, -0.01060811, -0.043136407, -0.03569961, 0.0, -0.022243036, 0.014206766, 0.0032128936, 0.0));
result += mul(max(h2, 0), float4x4(0.025120225, 0.07386707, 0.07916389, 0.0, -0.020202598, 0.010854587, 0.009825397, 0.0, -0.043466344, -0.049230598, -0.038344223, 0.0, 0.006438127, 0.041072655, 0.036958262, 0.0));
result += mul(max(i2, 0), float4x4(0.027640026, 0.04239058, 0.055017423, 0.0, -0.002110394, 0.040088017, 0.045239322, 0.0, -0.020238828, -0.01711292, -0.014726791, 0.0, -0.029621653, -0.007380026, -0.002073584, 0.0));
result += mul(max(-a1, 0), float4x4(0.008071638, 0.0034274645, -0.0016181463, 0.0, 0.044838928, 0.06936641, 0.072150804, 0.0, 0.0006324625, -0.02223834, -0.021122342, 0.0, 0.043963037, 0.047561962, 0.026419055, 0.0));
result += mul(max(-b1, 0), float4x4(-0.06605246, -0.011649812, -0.0022502556, 0.0, -0.09256232, -0.06281528, -0.055003755, 0.0, 0.032296494, -0.011113339, -0.015790787, 0.0, 0.05214882, 0.022887057, 0.013746634, 0.0));
result += mul(max(-c1, 0), float4x4(-0.03587372, 0.018986767, 0.03229596, 0.0, 0.008917248, 0.050303612, 0.06147115, 0.0, 0.01872278, -0.011048741, -0.017369485, 0.0, 0.030770298, 0.0063107815, 0.003187433, 0.0));
result += mul(max(-d1, 0), float4x4(0.087662674, 0.048391398, 0.042332277, 0.0, 0.0043635606, 0.02438183, 0.020213395, 0.0, -0.023863237, -0.0051179314, -0.0060627074, 0.0, 0.06292237, 0.05821987, 0.051667042, 0.0));
result += mul(max(-e1, 0), float4x4(-0.048478693, 0.008368922, 0.016874269, 0.0, -0.19261299, -0.1848583, -0.18258469, 0.0, 0.112302095, 0.061518673, 0.058282077, 0.0, 0.024626324, 0.0058449907, 0.006936535, 0.0));
result += mul(max(-f1, 0), float4x4(-0.04468695, 0.0099176075, 0.025094027, 0.0, 0.05447911, 0.08220857, 0.08161316, 0.0, -0.0007933787, -0.03090106, -0.040217776, 0.0, -0.028044306, -0.050590593, -0.05027328, 0.0));
result += mul(max(-g1, 0), float4x4(0.029733973, -0.0129855955, -0.019776886, 0.0, 0.01860655, 0.017793713, 0.020113358, 0.0, -0.023667783, -0.0013290358, -0.004159268, 0.0, -0.01960303, -0.012806444, -0.016549494, 0.0));
result += mul(max(-h1, 0), float4x4(-0.00952229, -0.007181503, -0.0061082463, 0.0, 0.04292393, 0.01510459, 0.0062862537, 0.0, -0.016540393, -0.023619318, -0.02633423, 0.0, -0.06652295, -0.06933143, -0.063913494, 0.0));
result += mul(max(-i1, 0), float4x4(-0.015281855, -0.012470513, -0.008184894, 0.0, 0.045862548, 0.023707546, 0.014719574, 0.0, 0.032412887, -0.0038218168, -0.0065955487, 0.0, -0.027728679, -0.04009727, -0.018856067, 0.0));
result += mul(max(-a2, 0), float4x4(0.042844415, 0.00673587, 0.0038338478, 0.0, -0.031152235, -0.06649269, -0.065986395, 0.0, 0.005666899, -0.015819343, -0.012795757, 0.0, -0.0007617308, 0.021531299, 0.026071105, 0.0));
result += mul(max(-b2, 0), float4x4(-0.118266046, -0.07211513, -0.058381762, 0.0, 0.02361942, 0.012819485, 0.010511434, 0.0, 0.077196896, 0.003424893, 0.001927401, 0.0, -0.03160996, -0.0034473129, -0.00444674, 0.0));
result += mul(max(-c2, 0), float4x4(-0.06548674, -0.018152835, 0.0034779215, 0.0, -0.006173449, 0.008357867, -0.0033986098, 0.0, 0.021622533, -0.03722321, -0.045832597, 0.0, -0.011835129, 0.0109178, 0.010480887, 0.0));
result += mul(max(-d2, 0), float4x4(0.041682176, -0.008985459, -0.018538723, 0.0, -0.054624356, -0.09495616, -0.090484254, 0.0, -0.0060466817, -0.017551763, -0.014151624, 0.0, -0.015683241, -0.012590141, -0.014278323, 0.0));
result += mul(max(-e2, 0), float4x4(0.073194094, 0.055347454, 0.060976587, 0.0, 0.18175459, 0.13776664, 0.13139476, 0.0, 0.14047755, 0.061971992, 0.056503728, 0.0, 0.0068531767, -0.011873265, -0.016871026, 0.0));
result += mul(max(-f2, 0), float4x4(-0.041848205, -0.009582, -0.0076929387, 0.0, 0.044274334, 0.04011985, 0.03085897, 0.0, 0.009403278, -0.03346772, -0.04463548, 0.0, 0.04548978, 0.014613167, 0.0055232802, 0.0));
result += mul(max(-g2, 0), float4x4(0.019901669, -0.0011372451, -0.007423424, 0.0, -0.053240675, -0.07105105, -0.07122227, 0.0, -0.01892976, -0.019795185, -0.019204788, 0.0, 0.01228504, -0.005040437, -0.0010069044, 0.0));
result += mul(max(-h2, 0), float4x4(0.032843515, 0.014947385, 0.007550199, 0.0, -0.0006476342, -0.020907652, -0.030297596, 0.0, -0.015617971, -0.029182931, -0.038677275, 0.0, 0.037908908, -0.018132487, -0.020226713, 0.0));
result += mul(max(-i2, 0), float4x4(0.03232915, 0.02915194, 0.014929652, 0.0, 0.016676396, 0.004807404, -0.0008906752, 0.0, 0.0076904814, 0.00541351, -0.0048240838, 0.0, 0.03459369, -0.012969539, -0.024712864, 0.0));
result += float4(-0.0096404655, 0.0022038757, 0.0035988842, 0.0);
float3 result = mul(max(a1, 0), float4x3(0.012102164, 0.01385959, 0.018815203, -0.017435113, -0.04530735, -0.051318135, 0.01267727, 0.01400136, 0.017735276, 0.012681183, 0.035241637, 0.03990959));
result += mul(max(b1, 0), float4x3(0.16069227, 0.098007366, 0.076831706, 0.081593364, 0.017831434, 0.010174303, 0.014732323, 0.02229113, 0.029828338, 0.0048171813, 0.051809076, 0.055740006));
result += mul(max(c1, 0), float4x3(0.0347963, -0.014327445, -0.024176419, 0.003463003, -0.050532356, -0.06565927, 0.082851514, 0.10950989, 0.12022889, -0.038950548, -0.015094648, -0.0119305095));
result += mul(max(d1, 0), float4x3(-0.11845135, -0.08067485, -0.06981454, 0.00058037776, 0.01160575, 0.014900963, -0.0374349, -0.052966926, -0.044557698, 0.017439643, 0.005496974, -0.0024181441));
result += mul(max(e1, 0), float4x3(-0.1084345, -0.18271221, -0.18795776, 0.110637866, 0.08913364, 0.09161146, -0.19889367, -0.17172937, -0.1600661, -0.03789556, -0.028977778, -0.029903485));
result += mul(max(f1, 0), float4x3(0.017774954, -0.048732057, -0.061161697, 0.022389695, -0.013317256, -0.019972157, 0.051979035, 0.08774837, 0.09633588, -0.047462203, -0.033091765, -0.028352588));
result += mul(max(g1, 0), float4x3(0.022178177, 0.05031684, 0.05802219, -0.027539665, -0.020904189, -0.01800042, 0.0019531948, 0.00019749763, -0.0013961957, 0.024253767, -0.00058503833, 0.0006474611));
result += mul(max(h1, 0), float4x3(0.06707921, 0.0817431, 0.07561426, -0.04157211, -0.006174012, -0.003754037, 0.0031168605, 0.02320992, 0.026471246, 0.0029530525, -0.004939263, -0.0070194793));
result += mul(max(i1, 0), float4x3(0.03383418, 0.042321067, 0.04266926, -0.043634403, -0.0182769, -0.011314871, -0.050008457, -0.003527757, 0.0035165092, -0.00016610099, 0.019936454, 0.022199173));
result += mul(max(a2, 0), float4x3(-0.055203374, -0.03910439, -0.03778927, 0.027640847, 0.019469904, 0.0277834, -0.026225597, 0.04481541, 0.047454204, 0.031545334, 0.019874612, 0.011878432));
result += mul(max(b2, 0), float4x3(0.016088601, -0.045959134, -0.048793618, -0.009834776, 0.0077799167, 0.00873151, 0.031265914, 0.09698676, 0.10005417, 0.039120086, 0.0005542848, -0.0049420255));
result += mul(max(c2, 0), float4x3(0.028432969, -0.014792921, -0.026881924, -0.00586326, 0.013427183, 0.018215714, -0.013559131, 0.017704675, 0.024854776, -0.09087544, -0.104627624, -0.0921747));
result += mul(max(d2, 0), float4x3(-0.022899037, 0.026374351, 0.03145993, -0.008008749, -0.0013132087, -0.003957525, -0.02490554, 0.0020362549, 0.006453752, 0.031494617, 0.049864545, 0.04702567));
result += mul(max(e2, 0), float4x3(-0.12318068, -0.121377476, -0.11615006, -0.1321696, -0.078085914, -0.07868927, -0.072339885, 0.0012095685, 0.010923645, 0.10844834, 0.10038668, 0.09919817));
result += mul(max(f2, 0), float4x3(0.058991943, 0.018824834, 0.01659209, -0.041878223, 0.013176531, 0.023566704, -0.010507848, 0.02042605, 0.028884022, -0.1193022, -0.10676289, -0.096668206));
result += mul(max(g2, 0), float4x3(0.023510003, 0.06057355, 0.052194174, 0.02304783, 0.031745855, 0.025863871, -0.01060811, -0.043136407, -0.03569961, -0.022243036, 0.014206766, 0.0032128936));
result += mul(max(h2, 0), float4x3(0.025120225, 0.07386707, 0.07916389, -0.020202598, 0.010854587, 0.009825397, -0.043466344, -0.049230598, -0.038344223, 0.006438127, 0.041072655, 0.036958262));
result += mul(max(i2, 0), float4x3(0.027640026, 0.04239058, 0.055017423, -0.002110394, 0.040088017, 0.045239322, -0.020238828, -0.01711292, -0.014726791, -0.029621653, -0.007380026, -0.002073584));
result += mul(max(-a1, 0), float4x3(0.008071638, 0.0034274645, -0.0016181463, 0.044838928, 0.06936641, 0.072150804, 0.0006324625, -0.02223834, -0.021122342, 0.043963037, 0.047561962, 0.026419055));
result += mul(max(-b1, 0), float4x3(-0.06605246, -0.011649812, -0.0022502556, -0.09256232, -0.06281528, -0.055003755, 0.032296494, -0.011113339, -0.015790787, 0.05214882, 0.022887057, 0.013746634));
result += mul(max(-c1, 0), float4x3(-0.03587372, 0.018986767, 0.03229596, 0.008917248, 0.050303612, 0.06147115, 0.01872278, -0.011048741, -0.017369485, 0.030770298, 0.0063107815, 0.003187433));
result += mul(max(-d1, 0), float4x3(0.087662674, 0.048391398, 0.042332277, 0.0043635606, 0.02438183, 0.020213395, -0.023863237, -0.0051179314, -0.0060627074, 0.06292237, 0.05821987, 0.051667042));
result += mul(max(-e1, 0), float4x3(-0.048478693, 0.008368922, 0.016874269, -0.19261299, -0.1848583, -0.18258469, 0.112302095, 0.061518673, 0.058282077, 0.024626324, 0.0058449907, 0.006936535));
result += mul(max(-f1, 0), float4x3(-0.04468695, 0.0099176075, 0.025094027, 0.05447911, 0.08220857, 0.08161316, -0.0007933787, -0.03090106, -0.040217776, -0.028044306, -0.050590593, -0.05027328));
result += mul(max(-g1, 0), float4x3(0.029733973, -0.0129855955, -0.019776886, 0.01860655, 0.017793713, 0.020113358, -0.023667783, -0.0013290358, -0.004159268, -0.01960303, -0.012806444, -0.016549494));
result += mul(max(-h1, 0), float4x3(-0.00952229, -0.007181503, -0.0061082463, 0.04292393, 0.01510459, 0.0062862537, -0.016540393, -0.023619318, -0.02633423, -0.06652295, -0.06933143, -0.063913494));
result += mul(max(-i1, 0), float4x3(-0.015281855, -0.012470513, -0.008184894, 0.045862548, 0.023707546, 0.014719574, 0.032412887, -0.0038218168, -0.0065955487, -0.027728679, -0.04009727, -0.018856067));
result += mul(max(-a2, 0), float4x3(0.042844415, 0.00673587, 0.0038338478, -0.031152235, -0.06649269, -0.065986395, 0.005666899, -0.015819343, -0.012795757, -0.0007617308, 0.021531299, 0.026071105));
result += mul(max(-b2, 0), float4x3(-0.118266046, -0.07211513, -0.058381762, 0.02361942, 0.012819485, 0.010511434, 0.077196896, 0.003424893, 0.001927401, -0.03160996, -0.0034473129, -0.00444674));
result += mul(max(-c2, 0), float4x3(-0.06548674, -0.018152835, 0.0034779215, -0.006173449, 0.008357867, -0.0033986098, 0.021622533, -0.03722321, -0.045832597, -0.011835129, 0.0109178, 0.010480887));
result += mul(max(-d2, 0), float4x3(0.041682176, -0.008985459, -0.018538723, -0.054624356, -0.09495616, -0.090484254, -0.0060466817, -0.017551763, -0.014151624, -0.015683241, -0.012590141, -0.014278323));
result += mul(max(-e2, 0), float4x3(0.073194094, 0.055347454, 0.060976587, 0.18175459, 0.13776664, 0.13139476, 0.14047755, 0.061971992, 0.056503728, 0.0068531767, -0.011873265, -0.016871026));
result += mul(max(-f2, 0), float4x3(-0.041848205, -0.009582, -0.0076929387, 0.044274334, 0.04011985, 0.03085897, 0.009403278, -0.03346772, -0.04463548, 0.04548978, 0.014613167, 0.0055232802));
result += mul(max(-g2, 0), float4x3(0.019901669, -0.0011372451, -0.007423424, -0.053240675, -0.07105105, -0.07122227, -0.01892976, -0.019795185, -0.019204788, 0.01228504, -0.005040437, -0.0010069044));
result += mul(max(-h2, 0), float4x3(0.032843515, 0.014947385, 0.007550199, -0.0006476342, -0.020907652, -0.030297596, -0.015617971, -0.029182931, -0.038677275, 0.037908908, -0.018132487, -0.020226713));
result += mul(max(-i2, 0), float4x3(0.03232915, 0.02915194, 0.014929652, 0.016676396, 0.004807404, -0.0008906752, 0.0076904814, 0.00541351, -0.0048240838, 0.03459369, -0.012969539, -0.024712864));
result += float3(-0.0096404655, 0.0022038757, 0.0035988842);
result += INPUT.SampleLevel(sam, pos, 0);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result.rgb);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_M.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_1
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -51,6 +50,10 @@ Texture2D tex5;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex6;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -495,15 +498,18 @@ void Pass6(uint2 blockStart, uint3 threadId) {
//!PASS 7
//!DESC Conv-4x3x3x8, Conv-3x1x1x56
//!IN INPUT, tex1, tex2, tex3, tex4, tex5, tex6
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass7(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
float2 inputPt = GetInputPt();
float2 pos = (gxy + 0.5f) * inputPt;
@ -564,5 +570,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
result += mul(max(-src7, 0), float4x3(0.10676299, 0.118409514, 0.10618478, -0.05880252, -0.06488367, -0.06432695, 0.019221924, 0.017602798, 0.017413978, -0.07512528, -0.080483615, -0.066218294));
result += float3(-0.010478934, -0.008364784, -0.010246552);
WriteToOutput(gxy, result + origin);
OUTPUT[gxy] = float4(result + origin, 1);
}

View file

@ -2,15 +2,18 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_S.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -246,13 +249,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-3x3x3x8
//!IN INPUT, tex1
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -294,5 +299,5 @@ void Pass4(uint2 blockStart, uint3 threadId) {
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -1,18 +1,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_L.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_Soft_2
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -38,6 +37,10 @@ Texture2D tex3;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -602,13 +605,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
//!PASS 5
//!DESC Conv-3x3x3x16
//!IN INPUT, tex3, tex4
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass5(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -638,45 +643,45 @@ void Pass5(uint2 blockStart, uint3 threadId) {
float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);
float4 result = mul(max(a1, 0), float4x4(-0.01858372, 0.017144108, 0.02794388, 0.0, 0.0129101565, -0.0073674284, -0.011766938, 0.0, 0.01970984, 0.01209068, 0.009530311, 0.0, -0.009190449, -0.006996753, -0.0038750458, 0.0));
result += mul(max(b1, 0), float4x4(0.15856947, 0.10162126, 0.08489005, 0.0, 0.038381726, -0.017771017, -0.03226132, 0.0, -0.011787879, -0.0152445, -0.007564454, 0.0, 0.055921376, 0.08389841, 0.08452836, 0.0));
result += mul(max(c1, 0), float4x4(0.026705442, -0.0070655374, -0.018199183, 0.0, 0.016254421, -0.025398912, -0.03461042, 0.0, 0.03950644, 0.06586101, 0.0707467, 0.0, -0.03793455, -0.04957139, -0.04777402, 0.0));
result += mul(max(d1, 0), float4x4(-0.115341224, -0.04463122, -0.016549354, 0.0, -0.059433736, -0.04303295, -0.042805545, 0.0, 0.010830498, -0.011057443, -0.0141014, 0.0, 0.067396216, 0.06553637, 0.06705378, 0.0));
result += mul(max(e1, 0), float4x4(-0.12767975, -0.19935511, -0.20109995, 0.0, 0.11554901, 0.11426503, 0.11161185, 0.0, -0.22092125, -0.22041021, -0.2142712, 0.0, -0.06326996, -0.061314825, -0.059039716, 0.0));
result += mul(max(f1, 0), float4x4(0.007717391, -0.046238754, -0.056983955, 0.0, 0.021419598, 0.0036924274, -0.00033630748, 0.0, 0.053556852, 0.0824714, 0.08295022, 0.0, -0.09881205, -0.043157153, -0.040801782, 0.0));
result += mul(max(g1, 0), float4x4(0.0052828738, 0.049702674, 0.056108, 0.0, 0.009478552, 0.010345037, 0.0094180945, 0.0, -0.010412882, 0.0006965096, 0.0021917222, 0.0, -0.010701383, -0.023212843, -0.024252625, 0.0));
result += mul(max(h1, 0), float4x4(0.07542127, 0.0739301, 0.06642962, 0.0, -0.08054489, -0.037553925, -0.026762033, 0.0, 0.09727509, 0.102272816, 0.097533874, 0.0, 0.01325714, -0.004582272, -0.006647532, 0.0));
result += mul(max(i1, 0), float4x4(0.03005975, 0.017012767, 0.007840201, 0.0, -0.028650383, -0.0019064787, 0.01083078, 0.0, -0.071352504, -0.019919744, -0.008299795, 0.0, 0.023253804, 0.042413715, 0.04681489, 0.0));
result += mul(max(a2, 0), float4x4(-0.052201163, -0.021727808, -0.020888992, 0.0, 0.008365179, -0.016546093, -0.0111018475, 0.0, -0.06236095, -0.019278256, -0.021443967, 0.0, 0.0029381379, -0.0033039588, -0.006425339, 0.0));
result += mul(max(b2, 0), float4x4(0.02397296, -0.041659098, -0.050882675, 0.0, -0.013487, 0.0067506596, 0.005435185, 0.0, 0.066447854, 0.13331215, 0.13754861, 0.0, 0.028300207, -0.0048033795, -0.010058485, 0.0));
result += mul(max(c2, 0), float4x4(0.08140248, 0.018564016, 0.0036607496, 0.0, -0.0112075955, 0.0022339798, 0.0045722146, 0.0, -0.045716517, -0.0076076477, -0.0016939791, 0.0, -0.030486025, -0.07539711, -0.07185734, 0.0));
result += mul(max(d2, 0), float4x4(-0.0155724995, 0.048904862, 0.059412133, 0.0, -0.013894624, -0.0061430936, -0.011662488, 0.0, -0.0052947477, -0.0176474, -0.018611705, 0.0, 0.022075793, 0.031703226, 0.026735537, 0.0));
result += mul(max(e2, 0), float4x4(-0.18287502, -0.18703277, -0.18331653, 0.0, -0.08616293, -0.011741755, -0.009296464, 0.0, -0.054274965, 0.016794622, 0.022522328, 0.0, 0.06965258, 0.08260611, 0.08285337, 0.0));
result += mul(max(f2, 0), float4x4(0.08107809, 0.0336241, 0.025449684, 0.0, -0.031931, 0.01179566, 0.019694995, 0.0, 0.025930194, 0.042288166, 0.04673656, 0.0, -0.14357394, -0.11003491, -0.094090074, 0.0));
result += mul(max(g2, 0), float4x4(0.007188181, 0.050626095, 0.050705966, 0.0, -0.008030409, -0.018670242, -0.019766346, 0.0, 0.014874803, -0.03657919, -0.034044486, 0.0, -0.011178416, -0.004358302, -0.013611815, 0.0));
result += mul(max(h2, 0), float4x4(0.07987872, 0.11399873, 0.12089382, 0.0, -0.01514355, 0.0068139364, 0.010206274, 0.0, -0.0005701044, -0.011158322, 0.006484812, 0.0, 0.002018227, 0.043359682, 0.042987905, 0.0));
result += mul(max(i2, 0), float4x4(0.0017806455, -0.0015697709, -0.0018252691, 0.0, 0.0058658062, 0.021681193, 0.028615465, 0.0, -0.054827355, -0.04541651, -0.027485048, 0.0, -0.017649114, 0.017717479, 0.027309911, 0.0));
result += mul(max(-a1, 0), float4x4(0.02555098, -0.0028983613, -0.005134733, 0.0, -0.0029332284, 0.015552135, 0.022189403, 0.0, -0.019786593, -0.0031676649, -0.0014604586, 0.0, 0.06648065, 0.0672302, 0.04586375, 0.0));
result += mul(max(-b1, 0), float4x4(-0.06674696, 0.002328631, 0.014039355, 0.0, -0.03636718, 0.014560653, 0.028076636, 0.0, 0.042305287, 0.015249338, 0.0136925895, 0.0, 0.033586804, 0.00701501, -0.011588751, 0.0));
result += mul(max(-c1, 0), float4x4(-0.039022632, 0.015240631, 0.02699061, 0.0, -0.02614261, 0.0051843156, 0.012590042, 0.0, 0.015304643, -0.022641543, -0.030434309, 0.0, 0.016862666, 0.020819275, 0.022333218, 0.0));
result += mul(max(-d1, 0), float4x4(0.08056982, 0.026592938, 0.009744146, 0.0, 0.08762212, 0.10150359, 0.09662005, 0.0, -0.044551965, -0.016349116, -0.014629014, 0.0, -0.014341297, -0.030914815, -0.038747486, 0.0));
result += mul(max(-e1, 0), float4x4(-0.048734166, 0.019775594, 0.03124684, 0.0, -0.2345022, -0.23639877, -0.22958128, 0.0, 0.12412277, 0.10245112, 0.10389806, 0.0, -0.0030797734, -0.01989389, -0.02020691, 0.0));
result += mul(max(-f1, 0), float4x4(-0.0133485105, 0.029644802, 0.041630358, 0.0, 0.041081797, 0.059993293, 0.060033485, 0.0, -0.02155099, -0.035306025, -0.03838472, 0.0, 0.017466968, -0.01866363, -0.004764589, 0.0));
result += mul(max(-g1, 0), float4x4(0.0030783121, -0.04064586, -0.04504904, 0.0, -0.023528632, -0.029308239, -0.022441925, 0.0, 0.020095564, 0.018979732, 0.015117934, 0.0, 0.008429918, 0.021180628, 0.020137152, 0.0));
result += mul(max(-h1, 0), float4x4(0.0012200709, 0.013313984, 0.014122978, 0.0, 0.08750284, 0.038747437, 0.027102578, 0.0, -0.09627132, -0.09706183, -0.09405641, 0.0, -0.05180081, -0.03555434, -0.021694236, 0.0));
result += mul(max(-i1, 0), float4x4(-0.022396728, -0.018316073, -0.01250564, 0.0, 0.045423746, 0.025315331, 0.010639915, 0.0, 0.05618814, 0.022210265, 0.014195103, 0.0, -0.014828652, -0.010245087, 0.0020570823, 0.0));
result += mul(max(-a2, 0), float4x4(0.046651457, 0.001333767, -0.003572458, 0.0, -0.0077845114, -0.012861641, -0.015116351, 0.0, 0.01338984, 0.029198132, 0.026183384, 0.0, 0.0014878022, 0.020025207, 0.024829973, 0.0));
result += mul(max(-b2, 0), float4x4(-0.09506711, -0.06541528, -0.051106647, 0.0, 0.02552611, 0.01181497, 0.0020236392, 0.0, 0.03234602, -0.03153924, -0.035502207, 0.0, -0.034516744, 0.00018784113, 0.0085376045, 0.0));
result += mul(max(-c2, 0), float4x4(-0.05945615, -0.0046793907, 0.011128929, 0.0, -0.0061961384, -0.0040663416, -0.010319631, 0.0, 0.044197917, -0.033448357, -0.04109943, 0.0, -0.04109929, 0.006773195, 0.016976412, 0.0));
result += mul(max(-d2, 0), float4x4(0.02855516, -0.033051047, -0.04864978, 0.0, -0.06393814, -0.082921155, -0.0730681, 0.0, -0.058905125, -0.038639963, -0.027698845, 0.0, -0.013616608, -0.007876684, -0.006182652, 0.0));
result += mul(max(-e2, 0), float4x4(0.15423118, 0.14667909, 0.14534634, 0.0, 0.1485341, 0.096721016, 0.0820024, 0.0, 0.1263968, 0.088775866, 0.083860956, 0.0, 0.04213644, 0.020989005, 0.010447147, 0.0));
result += mul(max(-f2, 0), float4x4(-0.068275765, -0.018390667, -0.011452603, 0.0, 0.03738383, 0.019398715, 0.005998161, 0.0, -0.0011161854, -0.039955888, -0.04444185, 0.0, 0.052985556, 0.017621813, 0.009551621, 0.0));
result += mul(max(-g2, 0), float4x4(0.01387326, -0.0033411914, -0.009420935, 0.0, -0.034494568, -0.019219222, -0.009562797, 0.0, 0.0074023325, 0.022065453, 0.027121471, 0.0, 0.00019609048, -0.0042242454, 2.0403608e-05, 0.0));
result += mul(max(-h2, 0), float4x4(-0.015793918, -0.024342488, -0.037188973, 0.0, 0.004534637, -0.025236975, -0.028567247, 0.0, -0.055682972, -0.054670315, -0.06584981, 0.0, 0.043045517, -0.0075941198, -0.014196169, 0.0));
result += mul(max(-i2, 0), float4x4(0.0132598495, 0.01775289, 0.017206183, 0.0, 0.010604703, -0.007352816, -0.017301153, 0.0, 0.030967329, 0.027615465, 0.0145311365, 0.0, 0.008636854, -0.033379406, -0.042725433, 0.0));
result += float4(-0.0056639817, -0.0017339308, -0.0011913306, 0.0);
float3 result = mul(max(a1, 0), float4x3(-0.01858372, 0.017144108, 0.02794388, 0.0129101565, -0.0073674284, -0.011766938, 0.01970984, 0.01209068, 0.009530311, -0.009190449, -0.006996753, -0.0038750458));
result += mul(max(b1, 0), float4x3(0.15856947, 0.10162126, 0.08489005, 0.038381726, -0.017771017, -0.03226132, -0.011787879, -0.0152445, -0.007564454, 0.055921376, 0.08389841, 0.08452836));
result += mul(max(c1, 0), float4x3(0.026705442, -0.0070655374, -0.018199183, 0.016254421, -0.025398912, -0.03461042, 0.03950644, 0.06586101, 0.0707467, -0.03793455, -0.04957139, -0.04777402));
result += mul(max(d1, 0), float4x3(-0.115341224, -0.04463122, -0.016549354, -0.059433736, -0.04303295, -0.042805545, 0.010830498, -0.011057443, -0.0141014, 0.067396216, 0.06553637, 0.06705378));
result += mul(max(e1, 0), float4x3(-0.12767975, -0.19935511, -0.20109995, 0.11554901, 0.11426503, 0.11161185, -0.22092125, -0.22041021, -0.2142712, -0.06326996, -0.061314825, -0.059039716));
result += mul(max(f1, 0), float4x3(0.007717391, -0.046238754, -0.056983955, 0.021419598, 0.0036924274, -0.00033630748, 0.053556852, 0.0824714, 0.08295022, -0.09881205, -0.043157153, -0.040801782));
result += mul(max(g1, 0), float4x3(0.0052828738, 0.049702674, 0.056108, 0.009478552, 0.010345037, 0.0094180945, -0.010412882, 0.0006965096, 0.0021917222, -0.010701383, -0.023212843, -0.024252625));
result += mul(max(h1, 0), float4x3(0.07542127, 0.0739301, 0.06642962, -0.08054489, -0.037553925, -0.026762033, 0.09727509, 0.102272816, 0.097533874, 0.01325714, -0.004582272, -0.006647532));
result += mul(max(i1, 0), float4x3(0.03005975, 0.017012767, 0.007840201, -0.028650383, -0.0019064787, 0.01083078, -0.071352504, -0.019919744, -0.008299795, 0.023253804, 0.042413715, 0.04681489));
result += mul(max(a2, 0), float4x3(-0.052201163, -0.021727808, -0.020888992, 0.008365179, -0.016546093, -0.0111018475, -0.06236095, -0.019278256, -0.021443967, 0.0029381379, -0.0033039588, -0.006425339));
result += mul(max(b2, 0), float4x3(0.02397296, -0.041659098, -0.050882675, -0.013487, 0.0067506596, 0.005435185, 0.066447854, 0.13331215, 0.13754861, 0.028300207, -0.0048033795, -0.010058485));
result += mul(max(c2, 0), float4x3(0.08140248, 0.018564016, 0.0036607496, -0.0112075955, 0.0022339798, 0.0045722146, -0.045716517, -0.0076076477, -0.0016939791, -0.030486025, -0.07539711, -0.07185734));
result += mul(max(d2, 0), float4x3(-0.0155724995, 0.048904862, 0.059412133, -0.013894624, -0.0061430936, -0.011662488, -0.0052947477, -0.0176474, -0.018611705, 0.022075793, 0.031703226, 0.026735537));
result += mul(max(e2, 0), float4x3(-0.18287502, -0.18703277, -0.18331653, -0.08616293, -0.011741755, -0.009296464, -0.054274965, 0.016794622, 0.022522328, 0.06965258, 0.08260611, 0.08285337));
result += mul(max(f2, 0), float4x3(0.08107809, 0.0336241, 0.025449684, -0.031931, 0.01179566, 0.019694995, 0.025930194, 0.042288166, 0.04673656, -0.14357394, -0.11003491, -0.094090074));
result += mul(max(g2, 0), float4x3(0.007188181, 0.050626095, 0.050705966, -0.008030409, -0.018670242, -0.019766346, 0.014874803, -0.03657919, -0.034044486, -0.011178416, -0.004358302, -0.013611815));
result += mul(max(h2, 0), float4x3(0.07987872, 0.11399873, 0.12089382, -0.01514355, 0.0068139364, 0.010206274, -0.0005701044, -0.011158322, 0.006484812, 0.002018227, 0.043359682, 0.042987905));
result += mul(max(i2, 0), float4x3(0.0017806455, -0.0015697709, -0.0018252691, 0.0058658062, 0.021681193, 0.028615465, -0.054827355, -0.04541651, -0.027485048, -0.017649114, 0.017717479, 0.027309911));
result += mul(max(-a1, 0), float4x3(0.02555098, -0.0028983613, -0.005134733, -0.0029332284, 0.015552135, 0.022189403, -0.019786593, -0.0031676649, -0.0014604586, 0.06648065, 0.0672302, 0.04586375));
result += mul(max(-b1, 0), float4x3(-0.06674696, 0.002328631, 0.014039355, -0.03636718, 0.014560653, 0.028076636, 0.042305287, 0.015249338, 0.0136925895, 0.033586804, 0.00701501, -0.011588751));
result += mul(max(-c1, 0), float4x3(-0.039022632, 0.015240631, 0.02699061, -0.02614261, 0.0051843156, 0.012590042, 0.015304643, -0.022641543, -0.030434309, 0.016862666, 0.020819275, 0.022333218));
result += mul(max(-d1, 0), float4x3(0.08056982, 0.026592938, 0.009744146, 0.08762212, 0.10150359, 0.09662005, -0.044551965, -0.016349116, -0.014629014, -0.014341297, -0.030914815, -0.038747486));
result += mul(max(-e1, 0), float4x3(-0.048734166, 0.019775594, 0.03124684, -0.2345022, -0.23639877, -0.22958128, 0.12412277, 0.10245112, 0.10389806, -0.0030797734, -0.01989389, -0.02020691));
result += mul(max(-f1, 0), float4x3(-0.0133485105, 0.029644802, 0.041630358, 0.041081797, 0.059993293, 0.060033485, -0.02155099, -0.035306025, -0.03838472, 0.017466968, -0.01866363, -0.004764589));
result += mul(max(-g1, 0), float4x3(0.0030783121, -0.04064586, -0.04504904, -0.023528632, -0.029308239, -0.022441925, 0.020095564, 0.018979732, 0.015117934, 0.008429918, 0.021180628, 0.020137152));
result += mul(max(-h1, 0), float4x3(0.0012200709, 0.013313984, 0.014122978, 0.08750284, 0.038747437, 0.027102578, -0.09627132, -0.09706183, -0.09405641, -0.05180081, -0.03555434, -0.021694236));
result += mul(max(-i1, 0), float4x3(-0.022396728, -0.018316073, -0.01250564, 0.045423746, 0.025315331, 0.010639915, 0.05618814, 0.022210265, 0.014195103, -0.014828652, -0.010245087, 0.0020570823));
result += mul(max(-a2, 0), float4x3(0.046651457, 0.001333767, -0.003572458, -0.0077845114, -0.012861641, -0.015116351, 0.01338984, 0.029198132, 0.026183384, 0.0014878022, 0.020025207, 0.024829973));
result += mul(max(-b2, 0), float4x3(-0.09506711, -0.06541528, -0.051106647, 0.02552611, 0.01181497, 0.0020236392, 0.03234602, -0.03153924, -0.035502207, -0.034516744, 0.00018784113, 0.0085376045));
result += mul(max(-c2, 0), float4x3(-0.05945615, -0.0046793907, 0.011128929, -0.0061961384, -0.0040663416, -0.010319631, 0.044197917, -0.033448357, -0.04109943, -0.04109929, 0.006773195, 0.016976412));
result += mul(max(-d2, 0), float4x3(0.02855516, -0.033051047, -0.04864978, -0.06393814, -0.082921155, -0.0730681, -0.058905125, -0.038639963, -0.027698845, -0.013616608, -0.007876684, -0.006182652));
result += mul(max(-e2, 0), float4x3(0.15423118, 0.14667909, 0.14534634, 0.1485341, 0.096721016, 0.0820024, 0.1263968, 0.088775866, 0.083860956, 0.04213644, 0.020989005, 0.010447147));
result += mul(max(-f2, 0), float4x3(-0.068275765, -0.018390667, -0.011452603, 0.03738383, 0.019398715, 0.005998161, -0.0011161854, -0.039955888, -0.04444185, 0.052985556, 0.017621813, 0.009551621));
result += mul(max(-g2, 0), float4x3(0.01387326, -0.0033411914, -0.009420935, -0.034494568, -0.019219222, -0.009562797, 0.0074023325, 0.022065453, 0.027121471, 0.00019609048, -0.0042242454, 2.0403608e-05));
result += mul(max(-h2, 0), float4x3(-0.015793918, -0.024342488, -0.037188973, 0.004534637, -0.025236975, -0.028567247, -0.055682972, -0.054670315, -0.06584981, 0.043045517, -0.0075941198, -0.014196169));
result += mul(max(-i2, 0), float4x3(0.0132598495, 0.01775289, 0.017206183, 0.010604703, -0.007352816, -0.017301153, 0.030967329, 0.027615465, 0.0145311365, 0.008636854, -0.033379406, -0.042725433));
result += float3(-0.0056639817, -0.0017339308, -0.0011913306);
result += INPUT.SampleLevel(sam, pos, 0);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result.rgb);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_M.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_Soft_1
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -51,6 +50,10 @@ Texture2D tex5;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex6;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -495,15 +498,18 @@ void Pass6(uint2 blockStart, uint3 threadId) {
//!PASS 7
//!DESC Conv-4x3x3x8, Conv-3x1x1x56
//!IN INPUT, tex1, tex2, tex3, tex4, tex5, tex6
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass7(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
float2 inputPt = GetInputPt();
float2 pos = (gxy + 0.5f) * inputPt;
@ -564,5 +570,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
result += mul(max(-src7, 0), float4x3(0.09681486, 0.113604136, 0.10416855, -0.08199983, -0.09013433, -0.08562243, 0.041304465, 0.048315883, 0.042945288, -0.09863276, -0.117853515, -0.09870226));
result += float3(-0.0039074384, -0.0085585555, -0.0132283475);
WriteToOutput(gxy, result + origin);
OUTPUT[gxy] = float4(result + origin, 1);
}

View file

@ -2,15 +2,18 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_S.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_Soft_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -246,13 +249,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-3x3x3x8
//!IN INPUT, tex1
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -294,5 +299,5 @@ void Pass4(uint2 blockStart, uint3 threadId) {
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_Soft_UL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_Soft_4
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -63,6 +62,11 @@ Texture2D tex7;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex8;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1879,13 +1883,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x3x3x24, Conv-3x1x1x120
//!IN INPUT, tex1, tex2, tex3, tex7
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -2169,5 +2175,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
result += float3(-0.0036656514, 0.006677459, 0.007698717);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result.rgb);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_Soft_VL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_Soft_3
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -51,6 +50,10 @@ Texture2D tex5;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex6;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1125,13 +1128,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x3x3x16, Conv-3x1x1x112
//!IN INPUT, tex1, tex2, tex5
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -1289,5 +1294,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
result += float3(0.018580848, -0.022256816, -0.0266178);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_UL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_4
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -63,6 +62,11 @@ Texture2D tex7;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex8;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1879,13 +1883,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x3x3x24, Conv-3x1x1x120
//!IN INPUT, tex1, tex2, tex3, tex7
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -2169,5 +2175,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
result += float3(-0.0071146404, 0.005606682, 0.010180816);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result.rgb);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,18 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_VL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME Anime4K_Restore_3
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -51,6 +50,10 @@ Texture2D tex5;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex6;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -1132,13 +1135,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x3x3x16, Conv-3x1x1x112
//!IN INPUT, tex1, tex2, tex5
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 inputSize = GetInputSize();
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -1296,5 +1301,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
result += float3(0.047567394, -0.02504617, -0.028163986);
result += INPUT.SampleLevel(sam, pos, 0).rgb;
WriteToOutput(gxy, result);
OUTPUT[gxy] = float4(result, 1);
}

View file

@ -2,9 +2,7 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Experimental-Effects/Anime4K_Thin_HQ.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
@ -30,6 +28,11 @@ int iterations;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -280,13 +283,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
//!PASS 5
//!DESC Warp
//!IN tex1, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass5(uint2 blockStart, uint3 threadId) {
const uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
const uint2 inputSize = GetInputSize();
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -299,12 +304,6 @@ void Pass5(uint2 blockStart, uint3 threadId) {
for (uint j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(destPos)) {
continue;
}
}
float2 pos = (destPos + 0.5f) * inputPt;
for (int i = 0; i < iterations; ++i) {
@ -313,7 +312,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
pos -= dd;
}
WriteToOutput(destPos, INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[destPos] = INPUT.SampleLevel(sam1, pos, 0);
}
}
}

View file

@ -2,22 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_L.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_Denoise_1
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -43,6 +38,14 @@ Texture2D tex3;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -446,12 +449,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-4x3x3x16, Depth-to-Space
//!IN INPUT, tex1, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -638,23 +644,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,15 +2,18 @@
// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_S.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_Denoise_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -238,6 +241,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-4x3x3x8, Depth-to-Space
//!IN INPUT, tex1
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -282,7 +286,8 @@ float4 A4KS4(float2 pos) {
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -291,25 +296,19 @@ void Pass4(uint2 blockStart, uint3 threadId) {
float2 pos = ((gxy >> 1) + 0.5f) * inputPt;
float4 c = A4KS4(pos);
pos -= 0.5f * outputPt;
WriteToOutput(gxy, c.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(c.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(c.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(c.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(c.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,22 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_UL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_Denoise_3
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -145,6 +140,14 @@ Texture2D conv2d_6_tf1;
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf2;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1929,12 +1932,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x1x1x120, Depth-to-Space
//!IN INPUT, conv2d_2_tf, conv2d_2_tf1, conv2d_2_tf2, conv2d_3_tf, conv2d_3_tf1, conv2d_3_tf2, conv2d_4_tf, conv2d_4_tf1, conv2d_4_tf2, conv2d_5_tf, conv2d_5_tf1, conv2d_5_tf2, conv2d_6_tf, conv2d_6_tf1, conv2d_6_tf2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -2086,25 +2092,19 @@ void Pass8(uint2 blockStart, uint3 threadId) {
target3 += float4(0.00428531, -0.011541925, 0.00898425, -0.01374321);
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,22 +2,17 @@
// Ported from https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_VL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_Denoise_2
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -103,6 +98,15 @@ Texture2D conv2d_6_tf;
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf1;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1143,12 +1147,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x1x1x112, Depth-to-Space
//!IN INPUT, conv2d_tf, conv2d_tf1, conv2d_1_tf, conv2d_1_tf1, conv2d_2_tf, conv2d_2_tf1, conv2d_3_tf, conv2d_3_tf1, conv2d_4_tf, conv2d_4_tf1, conv2d_5_tf, conv2d_5_tf1, conv2d_6_tf, conv2d_6_tf1
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -1293,23 +1300,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

File diff suppressed because it is too large Load diff

View file

@ -1,15 +1,18 @@
// Anime4K_Upscale_GAN_x2_S
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_GAN_x2_S.glsl
// 移植自 https://github.com/bloc97/Anime4K/blob/8e39551ce96ed172605c89b7dd8be855b5502cc9/glsl/Upscale/Anime4K_Upscale_GAN_x2_S.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_GAN_x2_1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -696,12 +699,14 @@ void Pass6(uint2 blockStart, uint3 threadId) {
//!PASS 7
//!DESC Conv-3x3x3x16
//!IN tex6, tex8, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
void Pass7(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
uint2 outputSize = GetOutputSize();
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -810,5 +815,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
result += mul(ni2, float4x3(0.068098865, 0.07742245, 0.04117883, -0.07239023, -0.0048315763, -0.0029638975, -0.053049978, 0.121163346, 0.048760712, -0.033619802, -0.010043663, -0.012648383));
result += float3(0.00016753975, -0.00019302216, -0.0001663917);
WriteToOutput(gxy, result + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(result + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

File diff suppressed because it is too large Load diff

View file

@ -2,22 +2,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_L.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_1
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -43,6 +38,14 @@ Texture2D tex3;
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
@ -446,12 +449,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-4x3x3x16, Depth-to-Space
//!IN INPUT, tex1, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -638,23 +644,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.y += 1u;
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,15 +2,18 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_S.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -238,6 +241,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
//!PASS 4
//!DESC Conv-4x3x3x8, Depth-to-Space
//!IN INPUT, tex1
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -282,7 +286,8 @@ float4 A4KS4(float2 pos) {
void Pass4(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -293,23 +298,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
float4 c = A4KS4(pos);
pos -= 0.5f * outputPt;
WriteToOutput(gxy, c.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
gxy.x += 1u;
OUTPUT[gxy] = float4(c.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(c.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.y += 1u;
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.x -= 1u;
OUTPUT[gxy] = float4(c.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, c.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(c.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,22 +2,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale/Anime4K_Upscale_CNN_x2_UL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_3
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -145,6 +140,15 @@ Texture2D conv2d_6_tf1;
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf2;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1929,12 +1933,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x1x1x120, Depth-to-Space
//!IN INPUT, conv2d_2_tf, conv2d_2_tf1, conv2d_2_tf2, conv2d_3_tf, conv2d_3_tf1, conv2d_3_tf2, conv2d_4_tf, conv2d_4_tf1, conv2d_4_tf2, conv2d_5_tf, conv2d_5_tf1, conv2d_5_tf2, conv2d_6_tf, conv2d_6_tf1, conv2d_6_tf2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -2088,23 +2095,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,22 +2,17 @@
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale/Anime4K_Upscale_CNN_x2_VL.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_2
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
@ -103,6 +98,15 @@ Texture2D conv2d_6_tf;
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf1;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
@ -1143,12 +1147,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
//!PASS 8
//!DESC Conv-4x1x1x112, Depth-to-Space
//!IN INPUT, conv2d_tf, conv2d_tf1, conv2d_1_tf, conv2d_1_tf1, conv2d_2_tf, conv2d_2_tf1, conv2d_3_tf, conv2d_3_tf1, conv2d_4_tf, conv2d_4_tf1, conv2d_5_tf, conv2d_5_tf1, conv2d_6_tf, conv2d_6_tf1
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
void Pass8(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -1293,23 +1300,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
float2 outputPt = GetOutputPt();
pos -= 0.5f * outputPt;
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x += 1u;
++gxy.x;
pos.x += outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
gxy.y += 1u;
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
++gxy.y;
pos.y += outputPt.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
gxy.x -= 1u;
--gxy.x;
pos.x -= outputPt.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
}
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}

View file

@ -2,8 +2,7 @@
// 移植自 https://github.com/ActualMandM/cemu_graphic_packs/blob/468d165cf27dae13a06e8bdc3d588d0af775ad91/Filters/Bicubic/output.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!GENERIC_DOWNSCALER
//!VERSION 4
//!PARAMETER
@ -27,6 +26,9 @@ float paramC;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
@ -35,7 +37,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
float weight(float x) {
const float B = paramB;
@ -93,20 +95,20 @@ float4 Pass1(float2 pos) {
int2 coord_top_left = int2(max(uv0 * inputSize, 0.5));
int2 coord_bottom_right = int2(min(uv3 * inputSize, inputSize - 0.5));
float4 top = INPUT.Load(int3(coord_top_left, 0)) * rowtaps.x;
top += INPUT.SampleLevel(sam, float2(u_middle, uv0.y), 0) * u_weight_sum;
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)) * rowtaps.w;
float4 total = top * coltaps.x;
float3 top = INPUT.Load(int3(coord_top_left, 0)).rgb * rowtaps.x;
top += INPUT.SampleLevel(sam, float2(u_middle, uv0.y), 0).rgb * u_weight_sum;
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)).rgb * rowtaps.w;
float3 total = top * coltaps.x;
float4 middle = INPUT.SampleLevel(sam, float2(uv0.x, v_middle), 0) * rowtaps.x;
middle += INPUT.SampleLevel(sam, float2(u_middle, v_middle), 0) * u_weight_sum;
middle += INPUT.SampleLevel(sam, float2(uv3.x, v_middle), 0) * rowtaps.w;
float3 middle = INPUT.SampleLevel(sam, float2(uv0.x, v_middle), 0).rgb * rowtaps.x;
middle += INPUT.SampleLevel(sam, float2(u_middle, v_middle), 0).rgb * u_weight_sum;
middle += INPUT.SampleLevel(sam, float2(uv3.x, v_middle), 0).rgb * rowtaps.w;
total += middle * v_weight_sum;
float4 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)) * rowtaps.x;
bottom += INPUT.SampleLevel(sam, float2(u_middle, uv3.y), 0) * u_weight_sum;
bottom += INPUT.Load(int3(coord_bottom_right, 0)) * rowtaps.w;
float3 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)).rgb * rowtaps.x;
bottom += INPUT.SampleLevel(sam, float2(u_middle, uv3.y), 0).rgb * u_weight_sum;
bottom += INPUT.Load(int3(coord_bottom_right, 0)).rgb * rowtaps.w;
total += bottom * coltaps.w;
return total;
return float4(total, 1);
}

View file

@ -1,20 +1,20 @@
//!MAGPIE EFFECT
//!VERSION 3
//!GENERIC_DOWNSCALER
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
// Pixel-shader style pass: returns INPUT sampled at `pos` through `sam` at mip level 0.
// `sam` is declared with a LINEAR filter in this effect, so the returned value is the
// sampler's interpolated texel, passed through unchanged (all four channels).
float4 Pass1(float2 pos) {
return INPUT.SampleLevel(sam, pos, 0);
}

View file

@ -1,9 +1,8 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_cas.h
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
//!LABEL Sharpness
@ -16,6 +15,11 @@ float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -23,6 +27,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -35,254 +40,244 @@ SamplerState sam;
#ifdef MP_FP16
// Packed CAS sharpening filter (this variant sits in the MP_FP16 branch of the file).
// Filters two 3x3 windows at once by packing them into the two lanes of MF2 values:
//   lane .x uses src[0..2][pos..pos+2] (centre texel src[1][pos + 1]),
//   lane .y uses src[1..3][pos..pos+2] (centre texel src[2][pos + 1]).
// Parameters:
//   src  - 4x4 neighbourhood of RGB texels.
//   pos  - offset applied to the second index of src; indexing reaches pos + 2, so it
//          must be 0 or 1 to stay inside the array.
//   peak - scale applied to the shaped amplitude to obtain the neighbour weight.
//   pixR/pixG/pixB - outputs: saturated filtered red/green/blue for both lanes.
void CasFilterH(
MF3 src[4][4],
uint pos,
MF peak,
// Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = right 8x8 tile
// pix<R,G,B>.y = left 8x8 tile
// This enables later processing to easily be packed as well.
out MF2 pixR,
out MF2 pixG,
out MF2 pixB
MF3 src[4][4],
uint pos,
MF peak,
// Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = right 8x8 tile
// pix<R,G,B>.y = left 8x8 tile
// This enables later processing to easily be packed as well.
out MF2 pixR,
out MF2 pixG,
out MF2 pixB
) {
// AOS to SOA conversion.
MF2 aR = MF2(src[0][pos + 0].r, src[1][pos + 0].r);
MF2 aG = MF2(src[0][pos + 0].g, src[1][pos + 0].g);
MF2 aB = MF2(src[0][pos + 0].b, src[1][pos + 0].b);
MF2 bR = MF2(src[1][pos + 0].r, src[2][pos + 0].r);
MF2 bG = MF2(src[1][pos + 0].g, src[2][pos + 0].g);
MF2 bB = MF2(src[1][pos + 0].b, src[2][pos + 0].b);
MF2 cR = MF2(src[2][pos + 0].r, src[3][pos + 0].r);
MF2 cG = MF2(src[2][pos + 0].g, src[3][pos + 0].g);
MF2 cB = MF2(src[2][pos + 0].b, src[3][pos + 0].b);
MF2 dR = MF2(src[0][pos + 1].r, src[1][pos + 1].r);
MF2 dG = MF2(src[0][pos + 1].g, src[1][pos + 1].g);
MF2 dB = MF2(src[0][pos + 1].b, src[1][pos + 1].b);
MF2 eR = MF2(src[1][pos + 1].r, src[2][pos + 1].r);
MF2 eG = MF2(src[1][pos + 1].g, src[2][pos + 1].g);
MF2 eB = MF2(src[1][pos + 1].b, src[2][pos + 1].b);
MF2 fR = MF2(src[2][pos + 1].r, src[3][pos + 1].r);
MF2 fG = MF2(src[2][pos + 1].g, src[3][pos + 1].g);
MF2 fB = MF2(src[2][pos + 1].b, src[3][pos + 1].b);
MF2 gR = MF2(src[0][pos + 2].r, src[1][pos + 2].r);
MF2 gG = MF2(src[0][pos + 2].g, src[1][pos + 2].g);
MF2 gB = MF2(src[0][pos + 2].b, src[1][pos + 2].b);
MF2 hR = MF2(src[1][pos + 2].r, src[2][pos + 2].r);
MF2 hG = MF2(src[1][pos + 2].g, src[2][pos + 2].g);
MF2 hB = MF2(src[1][pos + 2].b, src[2][pos + 2].b);
MF2 iR = MF2(src[2][pos + 2].r, src[3][pos + 2].r);
MF2 iG = MF2(src[2][pos + 2].g, src[3][pos + 2].g);
MF2 iB = MF2(src[2][pos + 2].b, src[3][pos + 2].b);
// AOS to SOA conversion.
MF2 aR = MF2(src[0][pos + 0].r, src[1][pos + 0].r);
MF2 aG = MF2(src[0][pos + 0].g, src[1][pos + 0].g);
MF2 aB = MF2(src[0][pos + 0].b, src[1][pos + 0].b);
MF2 bR = MF2(src[1][pos + 0].r, src[2][pos + 0].r);
MF2 bG = MF2(src[1][pos + 0].g, src[2][pos + 0].g);
MF2 bB = MF2(src[1][pos + 0].b, src[2][pos + 0].b);
MF2 cR = MF2(src[2][pos + 0].r, src[3][pos + 0].r);
MF2 cG = MF2(src[2][pos + 0].g, src[3][pos + 0].g);
MF2 cB = MF2(src[2][pos + 0].b, src[3][pos + 0].b);
MF2 dR = MF2(src[0][pos + 1].r, src[1][pos + 1].r);
MF2 dG = MF2(src[0][pos + 1].g, src[1][pos + 1].g);
MF2 dB = MF2(src[0][pos + 1].b, src[1][pos + 1].b);
MF2 eR = MF2(src[1][pos + 1].r, src[2][pos + 1].r);
MF2 eG = MF2(src[1][pos + 1].g, src[2][pos + 1].g);
MF2 eB = MF2(src[1][pos + 1].b, src[2][pos + 1].b);
MF2 fR = MF2(src[2][pos + 1].r, src[3][pos + 1].r);
MF2 fG = MF2(src[2][pos + 1].g, src[3][pos + 1].g);
MF2 fB = MF2(src[2][pos + 1].b, src[3][pos + 1].b);
MF2 gR = MF2(src[0][pos + 2].r, src[1][pos + 2].r);
MF2 gG = MF2(src[0][pos + 2].g, src[1][pos + 2].g);
MF2 gB = MF2(src[0][pos + 2].b, src[1][pos + 2].b);
MF2 hR = MF2(src[1][pos + 2].r, src[2][pos + 2].r);
MF2 hG = MF2(src[1][pos + 2].g, src[2][pos + 2].g);
MF2 hB = MF2(src[1][pos + 2].b, src[2][pos + 2].b);
MF2 iR = MF2(src[2][pos + 2].r, src[3][pos + 2].r);
MF2 iG = MF2(src[2][pos + 2].g, src[3][pos + 2].g);
MF2 iB = MF2(src[2][pos + 2].b, src[3][pos + 2].b);
// Soft min and max.
MF2 mnR = min(min(fR, hR), min(min(bR, dR), eR));
MF2 mnG = min(min(fG, hG), min(min(bG, dG), eG));
MF2 mnB = min(min(fB, hB), min(min(bB, dB), eB));
// Soft min and max.
MF2 mnR = min(min(fR, hR), min(min(bR, dR), eR));
MF2 mnG = min(min(fG, hG), min(min(bG, dG), eG));
MF2 mnB = min(min(fB, hB), min(min(bB, dB), eB));
#ifdef CAS_BETTER_DIAGONALS
MF2 mnR2 = min(min(gR, iR), min(min(aR, cR), mnR));
MF2 mnG2 = min(min(gG, iG), min(min(aG, cG), mnG));
MF2 mnB2 = min(min(gB, iB), min(min(aB, cB), mnB));
mnR = mnR + mnR2;
mnG = mnG + mnG2;
mnB = mnB + mnB2;
MF2 mnR2 = min(min(gR, iR), min(min(aR, cR), mnR));
MF2 mnG2 = min(min(gG, iG), min(min(aG, cG), mnG));
MF2 mnB2 = min(min(gB, iB), min(min(aB, cB), mnB));
mnR = mnR + mnR2;
mnG = mnG + mnG2;
mnB = mnB + mnB2;
#endif
MF2 mxR = max(max(fR, hR), max(max(bR, dR), eR));
MF2 mxG = max(max(fG, hG), max(max(bG, dG), eG));
MF2 mxB = max(max(fB, hB), max(max(bB, dB), eB));
MF2 mxR = max(max(fR, hR), max(max(bR, dR), eR));
MF2 mxG = max(max(fG, hG), max(max(bG, dG), eG));
MF2 mxB = max(max(fB, hB), max(max(bB, dB), eB));
#ifdef CAS_BETTER_DIAGONALS
MF2 mxR2 = max(max(gR, iR), max(max(aR, cR), mxR));
MF2 mxG2 = max(max(gG, iG), max(max(aG, cG), mxG));
MF2 mxB2 = max(max(gB, iB), max(max(aB, cB), mxB));
mxR = mxR + mxR2;
mxG = mxG + mxG2;
mxB = mxB + mxB2;
MF2 mxR2 = max(max(gR, iR), max(max(aR, cR), mxR));
MF2 mxG2 = max(max(gG, iG), max(max(aG, cG), mxG));
MF2 mxB2 = max(max(gB, iB), max(max(aB, cB), mxB));
mxR = mxR + mxR2;
mxG = mxG + mxG2;
mxB = mxB + mxB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
MF2 rcpMR = rcp(mxR);
MF2 rcpMG = rcp(mxG);
MF2 rcpMB = rcp(mxB);
// Smooth minimum distance to signal limit divided by smooth max.
MF2 rcpMR = rcp(mxR);
MF2 rcpMG = rcp(mxG);
MF2 rcpMB = rcp(mxB);
// With CAS_BETTER_DIAGONALS the min/max above are sums of two terms, hence the 2.0 limit.
#ifdef CAS_BETTER_DIAGONALS
MF2 ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
MF2 ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
MF2 ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
MF2 ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
MF2 ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
MF2 ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
#else
MF2 ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
MF2 ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
MF2 ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
MF2 ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
MF2 ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
MF2 ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
#endif
// Shaping amount of sharpening.
// Shaping amount of sharpening.
ampR = sqrt(ampR);
ampG = sqrt(ampG);
ampB = sqrt(ampB);
ampR = sqrt(ampR);
ampG = sqrt(ampG);
ampB = sqrt(ampB);
// Filter shape.
MF2 wR = ampR * peak;
MF2 wG = ampG * peak;
MF2 wB = ampB * peak;
// Filter.
// Filter shape.
MF2 wR = ampR * peak;
MF2 wG = ampG * peak;
MF2 wB = ampB * peak;
// Filter.
// Only the green-derived weight wG is used below, for all three channels;
// wR and wB are computed but not read.
MF2 rcpWeight = rcp(1.0 + 4.0 * wG);
MF2 rcpWeight = rcp(1.0 + 4.0 * wG);
pixR = saturate((bR * wG + dR * wG + fR * wG + hR * wG + eR) * rcpWeight);
pixG = saturate((bG * wG + dG * wG + fG * wG + hG * wG + eG) * rcpWeight);
pixB = saturate((bB * wG + dB * wG + fB * wG + hB * wG + eB) * rcpWeight);
pixR = saturate((bR * wG + dR * wG + fR * wG + hR * wG + eR) * rcpWeight);
pixG = saturate((bG * wG + dG * wG + fG * wG + hG * wG + eG) * rcpWeight);
pixB = saturate((bB * wG + dB * wG + fB * wG + hB * wG + eB) * rcpWeight);
}
#else
// Scalar CAS sharpening filter for a single pixel (used when MP_FP16 is not defined).
// Parameters:
//   src  - 4x4 neighbourhood of RGB texels, indexed as src[x][y].
//   pos  - index of the centre texel inside src; the 3x3 window reads pos - 1 .. pos + 1
//          on both axes, so each component must be 1 or 2 to stay inside the array.
//   peak - scale applied to the shaped amplitude to obtain the neighbour weight.
// Returns the filtered RGB value, each channel saturated to [0, 1].
MF3 CasFilter(MF3 src[4][4], uint2 pos, MF peak) {
// a b c
// d e f
// g h i
MF3 a = src[pos.x - 1][pos.y - 1];
MF3 b = src[pos.x][pos.y - 1];
MF3 c = src[pos.x + 1][pos.y - 1];
MF3 d = src[pos.x - 1][pos.y];
MF3 e = src[pos.x][pos.y];
MF3 f = src[pos.x + 1][pos.y];
MF3 g = src[pos.x - 1][pos.y + 1];
MF3 h = src[pos.x][pos.y + 1];
MF3 i = src[pos.x + 1][pos.y + 1];
// a b c
// d e f
// g h i
MF3 a = src[pos.x - 1][pos.y - 1];
MF3 b = src[pos.x][pos.y - 1];
MF3 c = src[pos.x + 1][pos.y - 1];
MF3 d = src[pos.x - 1][pos.y];
MF3 e = src[pos.x][pos.y];
MF3 f = src[pos.x + 1][pos.y];
MF3 g = src[pos.x - 1][pos.y + 1];
MF3 h = src[pos.x][pos.y + 1];
MF3 i = src[pos.x + 1][pos.y + 1];
// Soft min and max.
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// These are 2.0x bigger (factored out the extra multiply).
MF mnR = min3(min3(d.r, e.r, f.r), b.r, h.r);
MF mnG = min3(min3(d.g, e.g, f.g), b.g, h.g);
MF mnB = min3(min3(d.b, e.b, f.b), b.b, h.b);
// Soft min and max.
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// These are 2.0x bigger (factored out the extra multiply).
MF mnR = min3(min3(d.r, e.r, f.r), b.r, h.r);
MF mnG = min3(min3(d.g, e.g, f.g), b.g, h.g);
MF mnB = min3(min3(d.b, e.b, f.b), b.b, h.b);
#ifdef CAS_BETTER_DIAGONALS
MF mnR2 = min3(min3(mnR, a.r, c.r), g.r, i.r);
MF mnG2 = min3(min3(mnG, a.g, c.g), g.g, i.g);
MF mnB2 = min3(min3(mnB, a.b, c.b), g.b, i.b);
mnR = mnR + mnR2;
mnG = mnG + mnG2;
mnB = mnB + mnB2;
MF mnR2 = min3(min3(mnR, a.r, c.r), g.r, i.r);
MF mnG2 = min3(min3(mnG, a.g, c.g), g.g, i.g);
MF mnB2 = min3(min3(mnB, a.b, c.b), g.b, i.b);
mnR = mnR + mnR2;
mnG = mnG + mnG2;
mnB = mnB + mnB2;
#endif
MF mxR = max3(max3(d.r, e.r, f.r), b.r, h.r);
MF mxG = max3(max3(d.g, e.g, f.g), b.g, h.g);
MF mxB = max3(max3(d.b, e.b, f.b), b.b, h.b);
MF mxR = max3(max3(d.r, e.r, f.r), b.r, h.r);
MF mxG = max3(max3(d.g, e.g, f.g), b.g, h.g);
MF mxB = max3(max3(d.b, e.b, f.b), b.b, h.b);
#ifdef CAS_BETTER_DIAGONALS
MF mxR2 = max3(max3(mxR, a.r, c.r), g.r, i.r);
MF mxG2 = max3(max3(mxG, a.g, c.g), g.g, i.g);
MF mxB2 = max3(max3(mxB, a.b, c.b), g.b, i.b);
mxR = mxR + mxR2;
mxG = mxG + mxG2;
mxB = mxB + mxB2;
MF mxR2 = max3(max3(mxR, a.r, c.r), g.r, i.r);
MF mxG2 = max3(max3(mxG, a.g, c.g), g.g, i.g);
MF mxB2 = max3(max3(mxB, a.b, c.b), g.b, i.b);
mxR = mxR + mxR2;
mxG = mxG + mxG2;
mxB = mxB + mxB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
// Smooth minimum distance to signal limit divided by smooth max.
MF rcpMR = rcp(mxR);
MF rcpMG = rcp(mxG);
MF rcpMB = rcp(mxB);
MF rcpMR = rcp(mxR);
MF rcpMG = rcp(mxG);
MF rcpMB = rcp(mxB);
// With CAS_BETTER_DIAGONALS the min/max above are sums of two terms, hence the 2.0 limit.
#ifdef CAS_BETTER_DIAGONALS
MF ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
MF ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
MF ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
MF ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
MF ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
MF ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
#else
MF ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
MF ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
MF ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
MF ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
MF ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
MF ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
#endif
// Shaping amount of sharpening.
ampR = sqrt(ampR);
ampG = sqrt(ampG);
ampB = sqrt(ampB);
// Shaping amount of sharpening.
ampR = sqrt(ampR);
ampG = sqrt(ampG);
ampB = sqrt(ampB);
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
MF wR = ampR * peak;
MF wG = ampG * peak;
MF wB = ampB * peak;
// Filter.
// Using green coef only, depending on dead code removal to strip out the extra overhead.
MF rcpWeight = rcp(1.0 + 4.0 * wG);
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
MF wR = ampR * peak;
MF wG = ampG * peak;
MF wB = ampB * peak;
// Filter.
// Using green coef only, depending on dead code removal to strip out the extra overhead.
MF rcpWeight = rcp(1.0 + 4.0 * wG);
return MF3(
saturate((b.r * wG + d.r * wG + f.r * wG + h.r * wG + e.r) * rcpWeight),
saturate((b.g * wG + d.g * wG + f.g * wG + h.g * wG + e.g) * rcpWeight),
saturate((b.b * wG + d.b * wG + f.b * wG + h.b * wG + e.b) * rcpWeight)
);
return MF3(
saturate((b.r * wG + d.r * wG + f.r * wG + h.r * wG + e.r) * rcpWeight),
saturate((b.g * wG + d.g * wG + f.g * wG + h.g * wG + e.g) * rcpWeight),
saturate((b.b * wG + d.b * wG + f.b * wG + h.b * wG + e.b) * rcpWeight)
);
}
#endif
// Entry point of pass 1: each thread sharpens a 2x2 group of output pixels.
// gxy is the top-left pixel of the group (the Rmp8x8 result doubled, plus blockStart).
// The thread returns early when gxy fails the bounds check. Otherwise it fills a 4x4
// neighbourhood `src` with four gathers per colour channel, evaluates the CAS filter
// at the four inner positions, and writes the results to the output.
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
if (!CheckViewport(gxy)) {
return;
}
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
float2 inputPt = GetInputPt();
uint i, j;
float2 inputPt = GetInputPt();
uint i, j;
// Both loops step by 2 over {0, 2}: one gather fills a 2x2 quad of src,
// so four gathers cover the whole 4x4 array.
MF3 src[4][4];
[unroll]
for (i = 0; i < 3; i += 2) {
[unroll]
for (j = 0; j < 3; j += 2) {
float2 tpos = (gxy + uint2(i, j)) * inputPt;
const MF4 sr = (MF4)INPUT.GatherRed(sam, tpos);
const MF4 sg = (MF4)INPUT.GatherGreen(sam, tpos);
const MF4 sb = (MF4)INPUT.GatherBlue(sam, tpos);
MF3 src[4][4];
[unroll]
for (i = 0; i < 3; i += 2) {
[unroll]
for (j = 0; j < 3; j += 2) {
float2 tpos = (gxy + uint2(i, j)) * inputPt;
const MF4 sr = (MF4)INPUT.GatherRed(sam, tpos);
const MF4 sg = (MF4)INPUT.GatherGreen(sam, tpos);
const MF4 sb = (MF4)INPUT.GatherBlue(sam, tpos);
// w z
// x y
src[i][j] = MF3(sr.w, sg.w, sb.w);
src[i][j + 1] = MF3(sr.x, sg.x, sb.x);
src[i + 1][j] = MF3(sr.z, sg.z, sb.z);
src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y);
}
}
// w z
// x y
src[i][j] = MF3(sr.w, sg.w, sb.w);
src[i][j + 1] = MF3(sr.x, sg.x, sb.x);
src[i + 1][j] = MF3(sr.z, sg.z, sb.z);
src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y);
}
}
// sharpness = 0 gives peak = -1/8, sharpness = 1 gives peak = -1/5.
const MF peak = -rcp(lerp(8.0, 5.0, (MF)sharpness));
const MF peak = -rcp(lerp(8.0, 5.0, (MF)sharpness));
#ifdef MP_FP16
// Packed path: each CasFilterH call yields two horizontally adjacent pixels
// (.x for column gxy.x, .y for column gxy.x + 1); pos 0 is the top row, pos 1 the bottom row.
MF2 pixR, pixG, pixB;
CasFilterH(src, 0, peak, pixR, pixG, pixB);
MF2 pixR, pixG, pixB;
CasFilterH(src, 0, peak, pixR, pixG, pixB);
WriteToOutput(gxy, float3(pixR.x, pixG.x, pixB.x));
OUTPUT[gxy] = float4(float3(pixR.x, pixG.x, pixB.x), 1);
++gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(pixR.y, pixG.y, pixB.y));
}
++gxy.x;
OUTPUT[gxy] = float4(float3(pixR.y, pixG.y, pixB.y), 1);
CasFilterH(src, 1, peak, pixR, pixG, pixB);
CasFilterH(src, 1, peak, pixR, pixG, pixB);
++gxy.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(pixR.y, pixG.y, pixB.y));
}
++gxy.y;
OUTPUT[gxy] = float4(float3(pixR.y, pixG.y, pixB.y), 1);
--gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, float3(pixR.x, pixG.x, pixB.x));
}
--gxy.x;
OUTPUT[gxy] = float4(float3(pixR.x, pixG.x, pixB.x), 1);
#else
// Scalar path: one CasFilter call per pixel, visiting the 2x2 group in the order
// (1,1) -> (2,1) -> (2,2) -> (1,2) in src coordinates.
WriteToOutput(gxy, CasFilter(src, uint2(1, 1), peak));
OUTPUT[gxy] = float4(CasFilter(src, uint2(1, 1), peak), 1);
++gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(src, uint2(2, 1), peak));
}
++gxy.x;
OUTPUT[gxy] = float4(CasFilter(src, uint2(2, 1), peak), 1);
++gxy.y;
OUTPUT[gxy] = float4(CasFilter(src, uint2(2, 2), peak), 1);
++gxy.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(src, uint2(2, 2), peak));
}
--gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(src, uint2(1, 2), peak));
}
--gxy.x;
OUTPUT[gxy] = float4(CasFilter(src, uint2(1, 2), peak), 1);
#endif
}

View file

@ -1,7 +1,7 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_cas.h
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
//!LABEL Sharpness
@ -14,9 +14,13 @@ float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -307,7 +311,9 @@ float3 CasFilter(uint2 ip, float4 const0, float peak) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = blockStart + Rmp8x8(threadId.x);
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -317,20 +323,14 @@ void Pass1(uint2 blockStart, uint3 threadId) {
const float peak = -rcp(lerp(8.0, 5.0, sharpness));
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
gxy.x += 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
}
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
gxy.y += 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
}
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
gxy.x -= 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
}
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
}

View file

@ -31,7 +31,7 @@
*/
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
@ -173,6 +173,9 @@ int dilation;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -181,6 +184,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them

View file

@ -24,7 +24,7 @@
*/
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!USE_DYNAMIC
@ -160,6 +160,9 @@ int interlace;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -168,6 +171,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them

View file

@ -28,7 +28,7 @@
*/
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
@ -138,6 +138,9 @@ float crtAntiRinging;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -146,6 +149,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them

View file

@ -17,7 +17,7 @@
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
//!LABEL Scanline Hardness
@ -119,6 +119,9 @@ float shape;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -126,6 +129,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
@ -303,7 +307,9 @@ float3 Mask(float2 pos) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -318,8 +324,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
outColor.rgb += Bloom(pos1, inputSize) * bloomAmount;
#endif
if (shadowMask)
if (shadowMask) {
outColor.rgb *= Mask(gxy + 0.5f);
}
WriteToOutput(gxy, pow(outColor.rgb, 1.0f / 2.2f));
OUTPUT[gxy] = float4(pow(outColor.rgb, 1.0f / 2.2f), 1);
}

View file

@ -9,7 +9,7 @@
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
@ -80,6 +80,9 @@ float contrast;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH OUTPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -153,6 +156,7 @@ float4 Pass1(float2 pos) {
//!PASS 2
//!STYLE PS
//!IN tex1
//!OUT OUTPUT
#define pi 3.14159265358
#define normalGauss(x) ((exp(-(x)*(x)*0.5))/sqrt(2.0*pi))

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,340 @@
// CuNNy 2x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N02
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
#define l0(x, y) min16float((dot(float3(-3.725e-01, -7.046e-01, -1.734e-01), O(INPUT, float2(x, y)).rgb) + 1.169e-01))
// Pass-1 kernel: maps nine scalar taps to one four-channel value.
// Each tap scales its own constant 4-vector; the products are accumulated in
// argument order and a constant bias vector is added last.
// Pass1 supplies the taps from the l0 macro at offsets (-1..1, -1..1), i.e. a
// 3x3 neighbourhood of INPUT reduced to a scalar by a fixed RGB dot product.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(-2.745e-03, -2.925e-03, 1.135e-01, 3.162e-02) * s0_0;
r += V4(4.049e-03, -3.428e-01, -7.641e-02, 2.484e-02) * s0_1;
r += V4(-8.372e-03, 3.398e-01, 1.072e-01, -5.449e-02) * s0_2;
r += V4(1.592e-02, 1.884e-02, -3.160e-02, -7.727e-02) * s0_3;
r += V4(4.429e-01, -3.936e-01, -4.134e-01, -4.287e-01) * s0_4;
r += V4(4.556e-02, 3.754e-01, -2.300e-02, 4.971e-01) * s0_5;
r += V4(-2.031e-02, -6.662e-03, 8.906e-02, 4.602e-02) * s0_6;
r += V4(-4.365e-01, 2.183e-03, 8.609e-02, 9.402e-03) * s0_7;
r += V4(-3.845e-02, 5.695e-03, 9.645e-02, -5.310e-02) * s0_8;
// Bias term.
r += V4(1.492e-02, -1.961e-02, -7.539e-03, -3.574e-03);
return r;
}
// Pass 1 ("in"): one thread per input texel. Gathers the 3x3 neighbourhood of
// INPUT around the texel through the l0 macro and stores the four channels
// returned by f0 into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	// Texel handled by this thread inside the current 8x8 block.
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Thread lies outside the texture (partial block at the edge).
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Taps are passed row by row: y = -1, 0, +1 and x = -1, 0, +1 within a row.
	t0[texel] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass-2 ("conv1") kernel: combines 18 four-channel inputs into one
// four-channel value.
// Every input is multiplied, as the left operand of mul(), by its own constant
// 4x4 matrix; the products are accumulated in argument order and a constant
// bias vector is added last.
// Pass2 calls this with s0_* = max(tap, 0) and s1_* = -max(-tap, 0) for the
// nine taps of a 3x3 neighbourhood of t0.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.218e-02, -1.208e-01, -1.955e-01, -1.217e-01, 3.123e-02, -2.317e-02, 1.961e-01, -9.984e-02, 3.038e-03, 2.863e-02, -1.042e-01, -5.529e-02, 1.266e-01, -3.877e-01, 2.315e-01, -1.334e-01));
r += mul(s0_1, M4(-1.774e-02, 1.636e-01, 1.379e-01, 7.499e-03, -7.890e-02, -3.970e-02, -6.053e-02, -1.431e-02, 4.167e-02, 9.728e-02, 3.825e-02, -2.704e-02, -2.303e-01, -3.348e-01, 2.940e-01, 4.825e-02));
r += mul(s0_2, M4(1.239e-02, 1.613e-02, -2.280e-01, 8.985e-02, 2.106e-03, 3.847e-02, -2.539e-02, -3.326e-02, -6.327e-02, -1.427e-01, 4.218e-02, 8.995e-02, -6.045e-02, -1.073e-01, -1.329e-01, -2.085e-02));
r += mul(s0_3, M4(-1.601e-01, -2.448e-01, -3.950e-01, 9.169e-03, -3.694e-02, 2.018e-01, -2.524e-01, 1.719e+00, 3.009e-02, 4.927e-02, 1.564e-01, 3.509e-02, -2.630e-02, -3.986e-01, 1.326e-01, -1.037e-02));
r += mul(s0_4, M4(-1.074e+00, -1.654e-01, 4.163e-01, 3.816e-02, 4.580e-01, 4.350e-01, -3.490e-01, -1.257e-02, 1.159e-02, -2.083e-01, -2.744e-01, -2.667e-02, 2.826e-03, 1.986e-01, -2.723e-01, 9.612e-02));
r += mul(s0_5, M4(-3.195e-01, -1.450e-01, -1.523e-01, -2.999e-03, 1.166e-01, 1.304e-01, 1.475e-01, 7.286e-02, -4.077e-02, -3.477e-02, 1.496e-01, -1.199e-02, 7.881e-02, 8.911e-02, -1.082e-01, -6.762e-02));
r += mul(s0_6, M4(2.020e-02, 1.556e-01, -9.837e-03, 1.537e-02, -1.047e-01, 2.095e-01, 2.025e-01, -3.522e-02, -3.407e-02, -8.949e-02, -7.721e-02, -8.910e-03, 9.305e-02, 2.231e-01, 2.178e-01, 1.502e-02));
r += mul(s0_7, M4(-7.936e-02, 3.096e-01, 1.869e-01, -1.950e-03, -2.452e-01, -5.098e-01, 5.304e-01, -4.921e-02, -1.073e-01, 1.062e-01, 2.527e-01, 5.909e-04, 3.797e-02, 3.291e-01, -2.395e-01, 2.768e-02));
r += mul(s0_8, M4(-5.559e-02, 1.090e-01, -1.757e-01, 1.261e-02, -1.632e-01, -2.476e-01, -5.674e-02, -4.843e-03, 1.064e-02, 1.023e-01, 2.540e-02, -1.336e-02, 1.362e-01, 1.833e-01, 3.772e-03, 5.118e-04));
r += mul(s1_0, M4(1.383e-01, 3.469e-01, 3.568e-02, -1.958e-01, -3.170e-02, -1.076e-02, -2.012e-02, -2.104e-04, 2.046e-02, -1.268e-02, -1.618e-01, -6.370e-02, 2.615e-02, 1.494e-01, -1.523e-01, 3.702e-02));
r += mul(s1_1, M4(-1.140e-02, 6.811e-01, 5.722e-02, 1.514e-01, -6.311e-02, -3.541e-02, -1.150e-01, 3.625e-02, 1.146e-01, -1.395e-03, 5.059e-01, -7.835e-02, -3.907e-01, 6.172e-02, -9.656e-02, -2.727e-02));
r += mul(s1_2, M4(1.239e-01, 1.206e-01, 7.519e-01, 2.106e-02, 8.647e-03, 1.082e-02, 5.931e-02, -4.215e-02, -2.216e-02, -4.829e-02, -1.927e-01, 1.159e-01, -1.789e-01, -9.596e-02, 1.395e-01, -6.395e-02));
r += mul(s1_3, M4(1.194e-01, -5.786e-01, -1.761e-03, -1.126e-02, -5.311e-02, -2.325e-01, 1.733e-01, 2.842e-01, -1.080e-01, -1.012e-01, 1.851e-01, 4.253e-02, 1.212e-01, 2.435e-02, -3.061e-01, -9.579e-02));
r += mul(s1_4, M4(-4.651e-02, -1.299e+00, -5.020e-01, 5.830e-02, 5.098e-01, 7.344e-02, -1.358e-01, 1.725e-02, -2.980e-01, -6.077e-01, 6.308e-01, -4.014e-02, 3.497e-01, 3.700e-01, -6.035e-01, 8.026e-02));
r += mul(s1_5, M4(-1.851e-02, -2.057e-01, 5.081e-01, -5.262e-02, 1.715e-01, 1.387e-01, -1.123e-01, 9.022e-02, -1.532e-01, -3.749e-02, -1.930e-01, 6.423e-02, 2.763e-02, 5.993e-02, 4.141e-01, -8.825e-02));
r += mul(s1_6, M4(-6.324e-03, -9.461e-02, 3.044e-02, -4.139e-03, -2.925e-02, 3.975e-01, 1.161e-01, 9.726e-03, 1.353e-01, 2.762e-01, 3.297e-03, 1.076e-02, -8.503e-02, -7.010e-01, -1.967e-01, -1.360e-03));
r += mul(s1_7, M4(1.873e-02, 1.099e-01, 1.229e-01, -1.232e-02, -5.723e-01, -4.599e-02, -1.236e-01, -2.003e-02, -4.268e-01, 5.929e-01, 2.942e-01, 3.485e-02, 4.326e-01, -9.250e-02, 3.736e-01, -2.393e-02));
r += mul(s1_8, M4(-5.991e-02, 1.199e-03, -1.349e-02, -1.321e-03, -2.036e-01, -1.937e-01, -7.888e-02, -9.144e-03, 1.557e-01, 7.018e-02, -2.646e-01, -3.360e-06, 1.742e-01, 1.814e-01, 1.385e-01, -1.030e-02));
// Bias term.
r += V4(4.789e-02, 4.713e-03, -2.854e-02, 9.967e-03);
return r;
}
// Pass 2 ("conv1"): one thread per input-sized texel. Reads the 3x3
// neighbourhood of t0, splits every tap into its non-negative part
// max(x, 0) and its non-positive part -max(-x, 0), and writes f0 of those
// 18 vectors to t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Thread lies outside the texture (partial block at the edge).
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within a row).
	const V4 k0 = l0(-1.0, -1.0);
	const V4 k1 = l0(0.0, -1.0);
	const V4 k2 = l0(1.0, -1.0);
	const V4 k3 = l0(-1.0, 0.0);
	const V4 k4 = l0(0.0, 0.0);
	const V4 k5 = l0(1.0, 0.0);
	const V4 k6 = l0(-1.0, 1.0);
	const V4 k7 = l0(0.0, 1.0);
	const V4 k8 = l0(1.0, 1.0);

	t1[texel] = f0(
		// Non-negative halves.
		max(k0, 0.0), max(k1, 0.0), max(k2, 0.0),
		max(k3, 0.0), max(k4, 0.0), max(k5, 0.0),
		max(k6, 0.0), max(k7, 0.0), max(k8, 0.0),
		// Non-positive halves.
		-max(-k0, 0.0), -max(-k1, 0.0), -max(-k2, 0.0),
		-max(-k3, 0.0), -max(-k4, 0.0), -max(-k5, 0.0),
		-max(-k6, 0.0), -max(-k7, 0.0), -max(-k8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass-3 ("conv2") kernel: combines 18 four-channel inputs into one
// four-channel value.
// Every input is multiplied, as the left operand of mul(), by its own constant
// 4x4 matrix; the products are accumulated in argument order and a constant
// bias vector is added last.
// Pass3 calls this with s0_* = max(tap, 0) and s1_* = -max(-tap, 0) for the
// nine taps of a 3x3 neighbourhood of t1.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-1.565e-01, 1.307e-02, -5.269e-02, 5.465e-02, 2.936e-01, 1.626e-01, 4.589e-02, 2.478e-02, 3.520e-01, -5.445e-02, -2.480e-01, 2.838e-02, 1.841e-04, 1.264e-02, -1.370e-02, 2.588e-02));
r += mul(s0_1, M4(2.350e-01, 2.116e-01, 2.167e-02, -1.559e-01, 2.502e-01, 4.320e-01, -7.152e-01, 2.270e-01, -2.668e-01, -2.117e-01, 5.598e-01, 2.261e-01, 4.101e-02, -4.860e-02, 3.530e-02, 8.932e-02));
r += mul(s0_2, M4(-4.398e-02, -4.486e-02, -5.040e-02, 9.803e-02, 7.515e-02, 1.203e-01, -5.357e-02, -2.803e-01, -1.435e-01, 7.150e-03, -3.118e-02, -2.636e-01, -2.969e-02, -2.011e-02, 2.658e-02, -2.572e-02));
r += mul(s0_3, M4(9.140e-02, -1.875e-01, 9.757e-02, 2.976e-02, -8.325e-02, 6.109e-02, -4.304e-02, 7.057e-02, 7.324e-01, -1.528e-01, 2.930e-01, 7.503e-02, -3.901e-02, 1.109e-03, -2.693e-02, -3.330e-02));
r += mul(s0_4, M4(-9.944e-02, 1.858e-01, -2.436e-01, 3.822e-02, 6.685e-02, -1.758e-01, 1.382e-01, -1.715e-01, 3.252e-01, 5.176e-01, -2.939e-01, 4.311e-01, -6.125e-02, 1.905e-01, 8.140e-02, 2.095e-01));
r += mul(s0_5, M4(3.193e-02, 6.029e-02, 1.869e-03, 8.627e-04, -1.402e-02, 4.288e-02, -5.756e-02, 8.813e-02, -2.758e-02, -5.267e-02, 1.702e-03, -6.676e-01, 6.373e-02, 5.766e-02, -6.325e-02, -2.744e-01));
r += mul(s0_6, M4(4.918e-02, 5.420e-04, 3.692e-02, 7.796e-03, -1.163e-02, -4.074e-02, 2.057e-02, -2.837e-02, 1.083e-01, 1.958e-01, -5.078e-02, 2.750e-02, 5.323e-02, 5.953e-03, 4.766e-02, -2.265e-03));
r += mul(s0_7, M4(-3.968e-02, -1.535e-01, 6.564e-02, -2.620e-02, 3.742e-02, 8.659e-02, -4.440e-02, 6.007e-03, -9.585e-02, -9.425e-02, -1.517e-01, 3.701e-01, -1.332e-01, -1.860e-01, -5.436e-02, 3.781e-01));
r += mul(s0_8, M4(-1.145e-02, 6.045e-02, -4.676e-02, -5.604e-02, -1.576e-02, -3.528e-02, 2.252e-02, 1.997e-02, -2.546e-02, -6.894e-02, 7.238e-02, -3.495e-01, -6.323e-02, -1.042e-01, 1.091e-01, -4.170e-01));
r += mul(s1_0, M4(-5.215e-01, 6.255e-01, 5.587e-02, -5.362e-02, 9.895e-02, -8.743e-03, 1.058e-01, -3.585e-02, -1.594e-02, -1.034e-01, 3.848e-02, -5.432e-02, -1.796e-02, 5.838e-02, 1.304e-01, -2.122e-02));
r += mul(s1_1, M4(-6.987e-02, 8.696e-01, -1.130e+00, 5.558e-03, -1.080e-01, 4.195e-02, -1.323e-01, 2.270e-01, 3.451e-02, -1.616e-02, 4.251e-03, 1.470e-01, 2.442e-01, -5.904e-02, -3.467e-01, -2.056e-02));
r += mul(s1_2, M4(4.884e-02, -1.034e-01, 5.823e-02, 1.131e-01, -4.126e-02, 6.519e-02, -1.532e-02, -2.420e-01, 1.092e-02, 1.869e-02, 1.913e-03, -1.787e-02, 1.122e-01, -1.481e-01, 1.843e-01, 3.454e-01));
r += mul(s1_3, M4(-2.906e-01, -9.847e-01, 4.092e-01, 1.655e-01, 4.092e-02, 2.913e-01, 1.306e-01, -4.682e-02, 2.568e-01, -4.528e-02, 3.207e-02, 9.888e-02, -3.928e-01, -3.546e-01, -2.367e-01, -3.239e-01));
r += mul(s1_4, M4(4.463e-01, -1.594e-01, 8.418e-01, -3.525e-01, 5.957e-01, 1.082e+00, -9.245e-01, 2.726e-01, 1.210e-01, 2.024e-01, -8.063e-03, -2.433e-01, -1.512e+00, 9.316e-01, 2.305e-01, -5.109e-01));
r += mul(s1_5, M4(-2.393e-02, 1.286e-02, -9.453e-02, 3.071e-01, -1.402e-01, -2.436e-01, 1.202e-01, -1.409e-01, -1.857e-02, 2.421e-02, -2.642e-02, -7.415e-02, 8.786e-01, 5.260e-04, -9.212e-02, 1.849e-01));
r += mul(s1_6, M4(8.958e-02, 9.057e-02, 1.712e-02, -2.838e-02, -1.405e-01, -6.455e-02, -2.695e-02, -1.110e-02, 8.731e-03, 6.531e-02, -3.752e-02, 1.194e-01, 4.585e-01, 6.270e-01, -1.367e-01, -2.529e-01));
r += mul(s1_7, M4(-4.381e-02, -1.595e-02, -4.601e-02, 7.257e-02, -8.036e-02, -1.360e-01, 1.154e-01, -7.942e-02, -4.653e-02, -7.121e-02, 2.720e-02, 8.346e-02, -1.871e+00, -8.300e-01, -6.760e-01, 7.402e-01));
r += mul(s1_8, M4(1.359e-02, -2.489e-02, 3.529e-02, -1.121e-01, -6.190e-02, -2.628e-02, -2.090e-03, 2.359e-01, -2.412e-02, -2.463e-02, 8.317e-03, -5.330e-02, 2.105e+00, 1.550e-01, 1.457e+00, -1.129e+00));
// Bias term.
r += V4(7.359e-03, -1.132e-02, 1.248e-02, 7.243e-04);
return r;
}
// Pass 3 ("conv2"): one thread per input-sized texel. Reads the 3x3
// neighbourhood of t1, splits every tap into its non-negative part
// max(x, 0) and its non-positive part -max(-x, 0), and writes f0 of those
// 18 vectors to t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Thread lies outside the texture (partial block at the edge).
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within a row).
	const V4 k0 = l0(-1.0, -1.0);
	const V4 k1 = l0(0.0, -1.0);
	const V4 k2 = l0(1.0, -1.0);
	const V4 k3 = l0(-1.0, 0.0);
	const V4 k4 = l0(0.0, 0.0);
	const V4 k5 = l0(1.0, 0.0);
	const V4 k6 = l0(-1.0, 1.0);
	const V4 k7 = l0(0.0, 1.0);
	const V4 k8 = l0(1.0, 1.0);

	t0[texel] = f0(
		// Non-negative halves.
		max(k0, 0.0), max(k1, 0.0), max(k2, 0.0),
		max(k3, 0.0), max(k4, 0.0), max(k5, 0.0),
		max(k6, 0.0), max(k7, 0.0), max(k8, 0.0),
		// Non-positive halves.
		-max(-k0, 0.0), -max(-k1, 0.0), -max(-k2, 0.0),
		-max(-k3, 0.0), -max(-k4, 0.0), -max(-k5, 0.0),
		-max(-k6, 0.0), -max(-k7, 0.0), -max(-k8, 0.0));
}
//!PASS 4
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass-4 ("out-shuffle") kernel: combines 18 four-channel inputs into one
// four-channel value and squashes it with tanh.
// Every input is multiplied, as the left operand of mul(), by its own constant
// 4x4 matrix; the products are accumulated in argument order and a constant
// bias vector is added before tanh is applied.
// Pass4 calls this with s0_* = max(tap, 0) and s1_* = -max(-tap, 0) for the
// nine taps of a 3x3 neighbourhood of t0, and uses the four result channels
// as per-pixel offsets for a 2x2 output quad.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(8.642e-03, -1.295e-02, 1.998e-02, -1.289e-03, -4.147e-02, -4.021e-03, 1.491e-04, -7.275e-03, 1.574e-02, -4.122e-03, 1.126e-02, 8.962e-03, 5.174e-02, 3.405e-02, 4.993e-02, 4.529e-02));
r += mul(s0_1, M4(-1.028e-01, -2.764e-02, -2.777e-02, -7.170e-03, -8.365e-02, 3.550e-02, 1.288e-01, 2.475e-02, 5.017e-02, 5.917e-02, 3.473e-02, 8.510e-03, 2.332e-02, 8.047e-02, 9.838e-02, 4.234e-02));
r += mul(s0_2, M4(-2.319e-02, -4.432e-02, -1.679e-02, 8.855e-03, 3.259e-02, -1.974e-01, 5.938e-02, 1.616e-01, -5.605e-04, 3.183e-02, -3.356e-03, 3.138e-02, 9.572e-03, -3.887e-02, -2.632e-02, -1.161e-02));
r += mul(s0_3, M4(-2.947e-02, -4.358e-02, 1.208e-03, -2.705e-02, -1.037e-02, -6.812e-02, -5.436e-02, -3.840e-02, 3.684e-02, 2.560e-02, 1.715e-02, -3.670e-02, -5.930e-02, -2.310e-02, -6.163e-02, -3.562e-02));
r += mul(s0_4, M4(5.520e-01, 1.213e-01, 1.753e-01, 5.436e-02, 5.879e-01, 2.281e-01, -2.703e-01, 1.519e-01, 5.739e-01, 2.959e-01, 9.449e-02, 2.473e-02, -5.998e-01, -9.548e-02, -6.035e-01, -9.663e-02));
r += mul(s0_5, M4(-9.740e-02, 2.744e-01, -1.522e-01, -7.204e-02, 1.178e-01, 6.112e-01, -4.801e-02, -5.176e-01, 1.480e-02, 8.323e-02, -6.764e-02, 4.138e-02, 1.121e-01, -8.141e-02, 1.211e-01, -8.737e-02));
r += mul(s0_6, M4(6.315e-02, 6.323e-02, 1.146e-02, 3.378e-02, -9.598e-02, -1.089e-01, 2.780e-02, -6.091e-02, -1.194e-01, -1.038e-01, -2.147e-02, -4.236e-02, -2.300e-02, -3.184e-02, -1.560e-02, -2.206e-02));
r += mul(s0_7, M4(-1.772e-01, -1.304e-01, 1.265e-01, -7.871e-02, 1.978e-01, 1.074e-01, 1.240e-02, 4.600e-02, 1.558e-02, -3.196e-02, 2.018e-01, 1.496e-01, 1.421e-01, 8.472e-02, 7.432e-02, 9.935e-02));
r += mul(s0_8, M4(1.132e-02, -2.296e-03, 1.274e-01, 3.428e-01, -5.796e-02, -6.156e-02, -2.549e-01, -2.231e-01, -8.762e-02, -9.318e-02, -2.378e-01, -3.018e-01, 5.601e-03, -2.670e-02, 2.896e-02, -3.910e-02));
r += mul(s1_0, M4(4.603e-02, -2.582e-02, -9.045e-03, 1.446e-02, -1.835e-02, -2.533e-02, 3.681e-03, -9.420e-03, -5.802e-02, 2.310e-02, 3.059e-02, 1.313e-03, 9.639e-02, 8.284e-02, 1.071e-01, -3.287e-02));
r += mul(s1_1, M4(-2.480e-02, 2.321e-03, -3.594e-02, -1.101e-01, 2.850e-02, 2.912e-02, 2.597e-02, 2.777e-02, 5.701e-02, 9.536e-04, 2.533e-02, 1.102e-02, -3.714e-03, 7.838e-02, -1.716e-02, 1.723e-01));
r += mul(s1_2, M4(-4.473e-03, 1.521e-02, -1.887e-02, 6.731e-03, 2.199e-03, 2.965e-02, -3.709e-03, 1.671e-02, 1.376e-02, -4.819e-02, -8.832e-04, 3.531e-02, -8.453e-03, -1.276e-02, -1.461e-02, 4.460e-03));
r += mul(s1_3, M4(6.139e-02, -1.511e-01, 1.102e-01, -1.428e-01, -5.114e-02, -6.594e-02, -1.693e-02, -4.651e-02, 2.440e-01, 2.010e-02, -1.900e-01, -1.243e-03, -2.397e-01, 2.002e-01, -3.506e-01, 2.171e-01));
r += mul(s1_4, M4(-6.189e-02, 5.137e-01, -8.132e-02, 4.526e-01, 3.263e-01, 2.134e-01, 1.027e-01, 2.067e-02, 2.407e-01, 2.591e-01, 4.489e-01, 2.042e-01, 1.932e-02, -4.463e-01, -1.479e-01, -6.843e-01));
r += mul(s1_5, M4(-7.571e-03, -7.787e-02, 9.918e-03, -8.469e-02, 4.056e-02, -1.926e-02, -4.968e-02, 2.416e-02, 2.699e-02, 2.783e-01, -7.854e-02, -6.549e-02, 6.835e-03, 2.288e-02, 1.048e-02, -3.273e-02));
r += mul(s1_6, M4(7.034e-02, 4.236e-02, 7.905e-02, -2.283e-03, -8.423e-02, -7.784e-02, -7.540e-03, -3.373e-02, -1.019e-01, -1.421e-01, 6.713e-02, -8.716e-02, -6.980e-02, -4.731e-02, -3.086e-02, -6.210e-03));
r += mul(s1_7, M4(-1.597e-01, -2.036e-01, 5.194e-02, 8.457e-02, 1.387e-01, 7.910e-02, 2.030e-02, 5.848e-02, 2.154e-01, 1.382e-01, -8.617e-02, 7.552e-02, 3.127e-02, 5.899e-02, 1.733e-01, 1.657e-01));
r += mul(s1_8, M4(3.595e-02, 3.243e-02, 1.450e-01, 2.046e-01, -2.939e-02, -1.306e-02, -1.587e-01, -2.607e-01, -8.980e-02, -5.350e-02, -2.627e-01, -2.861e-01, -1.585e-02, -2.032e-02, -1.662e-02, 1.560e-02));
// Bias term.
r += V4(-7.528e-04, -8.388e-04, -1.247e-03, -1.205e-03);
// tanh bounds every channel to (-1, 1).
return tanh(r);
}
// Samples INPUT with the linear sampler at `uv`, converts the colour with the
// rgb2yuv matrix, adds `residual` to the first (Y) component with saturation,
// and converts back with yuv2rgb. Alpha is always 1.
float4 ApplyLumaResidual(float2 uv, min16float residual) {
	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	const float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	return float4(mul(yuv2rgb, float3(saturate(yuv.r + residual), yuv.yz)), 1);
}

// Pass 4 ("out-shuffle"): one thread per 2x2 quad of OUTPUT. Evaluates f0 on
// the 3x3 neighbourhood of t0 around the source texel (gxy >> 1) and spreads
// the four result channels over the quad as offsets on top of a linearly
// sampled INPUT colour.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	// First pixel of the quad; the shift makes each thread step two pixels.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}

	const float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within a row).
	const V4 k0 = l0(-1.0, -1.0);
	const V4 k1 = l0(0.0, -1.0);
	const V4 k2 = l0(1.0, -1.0);
	const V4 k3 = l0(-1.0, 0.0);
	const V4 k4 = l0(0.0, 0.0);
	const V4 k5 = l0(1.0, 0.0);
	const V4 k6 = l0(-1.0, 1.0);
	const V4 k7 = l0(0.0, 1.0);
	const V4 k8 = l0(1.0, 1.0);

	const V4 r = f0(
		// Non-negative halves.
		max(k0, 0.0), max(k1, 0.0), max(k2, 0.0),
		max(k3, 0.0), max(k4, 0.0), max(k5, 0.0),
		max(k6, 0.0), max(k7, 0.0), max(k8, 0.0),
		// Non-positive halves.
		-max(-k0, 0.0), -max(-k1, 0.0), -max(-k2, 0.0),
		-max(-k3, 0.0), -max(-k4, 0.0), -max(-k5, 0.0),
		-max(-k6, 0.0), -max(-k7, 0.0), -max(-k8, 0.0));

	// Walk the quad as (0,0) -> (1,0) -> (1,1) -> (0,1), moving the sample
	// coordinate by one output texel per step. The running add/subtract
	// sequence is kept on purpose so the coordinates round the same way.
	const float2 opt = float2(GetOutputPt());
	float2 uv = pos - 0.5f * opt;
	OUTPUT[dst] = ApplyLumaResidual(uv, r.x);

	++dst.x;
	uv.x += opt.x;
	OUTPUT[dst] = ApplyLumaResidual(uv, r.y);

	++dst.y;
	uv.y += opt.y;
	OUTPUT[dst] = ApplyLumaResidual(uv, r.w);

	--dst.x;
	uv.x -= opt.x;
	OUTPUT[dst] = ApplyLumaResidual(uv, r.z);
}

View file

@ -0,0 +1,340 @@
// CuNNy 2x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N02
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
#define l0(x, y) min16float((dot(float3(-6.049e-01, -1.145e+00, -2.540e-01), O(INPUT, float2(x, y)).rgb) + 1.794e+00))
// Pass-1 kernel: maps nine scalar taps to one four-channel value.
// Each tap scales its own constant 4-vector; the products are accumulated in
// argument order and a constant bias vector is added last.
// Pass1 supplies the taps from the l0 macro at offsets (-1..1, -1..1), i.e. a
// 3x3 neighbourhood of INPUT reduced to a scalar by a fixed RGB dot product.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(1.411e-01, -9.763e-03, -1.361e-01, -9.610e-04) * s0_0;
r += V4(6.068e-02, 7.238e-03, -1.182e-01, -1.535e-02) * s0_1;
r += V4(-8.549e-02, -2.876e-03, -8.740e-03, 1.652e-02) * s0_2;
r += V4(-3.249e-01, 5.392e-02, -8.518e-02, -7.437e-03) * s0_3;
r += V4(2.435e-02, -6.191e-01, 7.147e-01, 5.862e-01) * s0_4;
r += V4(1.968e-01, 1.868e-02, -1.723e-01, -5.801e-01) * s0_5;
r += V4(1.528e-01, -4.489e-02, 5.871e-03, 4.528e-03) * s0_6;
r += V4(-4.619e-01, 6.152e-01, -1.313e-01, -5.326e-02) * s0_7;
r += V4(2.902e-01, -1.801e-02, -6.907e-02, 5.105e-02) * s0_8;
// Bias term.
r += V4(4.440e-03, -1.956e-04, 1.215e-03, 1.790e-03);
return r;
}
// Pass 1 ("in"): one thread per input texel. Gathers the 3x3 neighbourhood of
// INPUT around the texel through the l0 macro and stores the four channels
// returned by f0 into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	// Texel handled by this thread inside the current 8x8 block.
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Thread lies outside the texture (partial block at the edge).
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Taps are passed row by row: y = -1, 0, +1 and x = -1, 0, +1 within a row.
	t0[texel] = f0(
		l0(-1.0, -1.0), l0(0.0, -1.0), l0(1.0, -1.0),
		l0(-1.0, 0.0), l0(0.0, 0.0), l0(1.0, 0.0),
		l0(-1.0, 1.0), l0(0.0, 1.0), l0(1.0, 1.0));
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass-2 ("conv1") kernel: combines 18 four-channel inputs into one
// four-channel value.
// Every input is multiplied, as the left operand of mul(), by its own constant
// 4x4 matrix; the products are accumulated in argument order and a constant
// bias vector is added last.
// Pass2 calls this with s0_* = max(tap, 0) and s1_* = -max(-tap, 0) for the
// nine taps of a 3x3 neighbourhood of t0.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.120e-01, 8.150e-03, 7.146e-02, -4.942e-02, 3.623e-01, -1.678e-01, 1.189e-01, 1.372e-01, 1.225e-01, -2.568e-02, 6.959e-02, 1.788e-02, 1.962e-01, -1.870e-01, -6.548e-03, -4.334e-02));
r += mul(s0_1, M4(1.805e-01, 4.881e-02, -2.342e-03, 2.035e-02, -2.427e-01, -2.197e-02, -2.036e-02, 3.919e-01, -3.037e-01, 7.047e-02, 3.426e-02, -8.694e-02, 2.144e-01, 1.431e-01, -7.851e-02, 2.247e-01));
r += mul(s0_2, M4(6.328e-02, -4.140e-02, 3.362e-02, 5.204e-02, -1.052e-01, 1.698e-01, -2.727e-03, 1.110e-01, 7.156e-02, -1.108e-02, -2.717e-02, 5.680e-02, -6.118e-02, 2.435e-02, 1.743e-02, 8.179e-02));
r += mul(s0_3, M4(1.557e-01, 1.189e-01, 8.836e-02, 2.178e-02, -3.954e-01, 2.466e-01, -2.166e-01, -7.051e-02, -2.857e-01, -1.611e-02, -8.667e-02, 1.895e-04, 2.744e-01, 1.499e-01, 8.228e-02, 2.938e-02));
r += mul(s0_4, M4(2.441e-01, -3.694e-01, 1.751e-01, 6.833e-01, -1.087e-01, -2.065e-01, -1.557e-01, -6.945e-02, -1.403e-02, 2.171e-02, 3.748e-02, 2.646e-01, -3.718e-01, -1.188e-01, 1.569e-01, 8.554e-02));
r += mul(s0_5, M4(-5.069e-02, 2.646e-01, -5.754e-02, -3.545e-01, 1.404e-01, 1.123e-01, 4.577e-02, -1.465e-01, -2.119e-02, -1.115e-02, 1.661e-01, -4.029e-01, -2.123e-01, 2.774e-01, -1.905e-02, -1.093e-02));
r += mul(s0_6, M4(2.593e-02, -1.801e-02, 9.053e-02, -2.721e-02, 6.658e-03, 3.802e-02, -3.282e-02, -1.116e-01, 1.201e-01, 2.095e-02, -2.061e-02, 2.498e-03, -1.831e-01, -1.743e-01, 1.062e-01, -6.113e-01));
r += mul(s0_7, M4(-1.172e-01, -1.130e-02, -6.727e-02, 7.753e-02, -3.958e-03, -9.790e-02, -1.635e-01, 1.049e-01, 2.862e-01, -2.733e-02, -1.566e-01, -2.900e-01, -1.050e-01, -3.441e-01, -8.690e-02, 8.659e-02));
r += mul(s0_8, M4(2.145e-01, 4.613e-02, 1.590e-02, -4.749e-02, 3.291e-01, 1.012e-01, 8.647e-03, -2.282e-01, 2.215e-01, 1.713e-01, 1.414e-01, -3.916e-01, -2.488e-01, 1.458e-01, 2.518e-02, -9.979e-02));
r += mul(s1_0, M4(-2.127e-02, 3.575e-02, 9.372e-02, -2.662e-02, 4.467e-02, 1.304e-02, 3.849e-02, 5.186e-02, 7.417e-02, 3.647e-02, 4.960e-02, -3.988e-02, -3.998e-02, 1.173e-01, 7.752e-03, -2.263e-02));
r += mul(s1_1, M4(-1.283e-01, -1.460e-01, 1.963e-02, -1.108e-01, -4.171e-01, 2.397e-01, -5.886e-02, 7.788e-02, -2.820e-02, -1.719e-01, 9.334e-03, -1.255e-01, 1.392e-01, 9.532e-03, -5.163e-02, 8.641e-02));
r += mul(s1_2, M4(-1.889e-01, 1.933e-01, 5.574e-02, 6.723e-02, -1.015e-01, -3.316e-01, -1.460e-02, -1.606e-01, 1.052e-01, 1.027e-02, -4.626e-02, 5.368e-02, -9.160e-03, -9.514e-02, 2.577e-02, 7.122e-02));
r += mul(s1_3, M4(-1.958e-01, 1.276e-01, 7.303e-02, -1.135e-01, -2.277e-01, 2.017e-01, -5.223e-02, 1.379e-01, -1.737e-01, 4.871e-02, -8.142e-02, 1.392e-01, 8.113e-02, 4.415e-01, -1.174e-01, 1.910e-02));
r += mul(s1_4, M4(-3.233e-01, -4.158e-01, 8.391e-02, 2.017e-01, 9.790e-02, -4.865e-02, -2.172e-01, 2.607e-01, -2.458e-01, -4.931e-01, 3.016e-01, 2.198e-01, -7.173e-02, -5.683e-01, -7.447e-02, -1.264e-01));
r += mul(s1_5, M4(-4.189e-01, 3.271e-01, 8.844e-02, -5.295e-01, 6.365e-02, -1.513e-01, 1.246e-02, -2.005e-01, 1.764e-01, 5.796e-01, 7.286e-02, -1.428e-01, -1.130e-01, -6.883e-02, -1.303e-02, -1.091e-01));
r += mul(s1_6, M4(-6.621e-02, 9.901e-03, 9.472e-02, -3.568e-02, 1.067e-01, -3.318e-02, 3.152e-01, -5.261e-02, 1.108e-01, 7.081e-02, -1.289e-01, 6.477e-03, 1.036e-01, -1.477e-03, 1.035e+00, -9.204e-02));
r += mul(s1_7, M4(-2.721e-01, -5.458e-02, -1.707e-01, -1.096e-02, -1.302e-01, -9.074e-02, 1.694e-01, 6.307e-02, 4.233e-01, -5.112e-02, -3.545e-01, -2.589e-01, 8.276e-02, -3.975e-01, 7.705e-02, 4.482e-01));
r += mul(s1_8, M4(1.175e-01, 2.212e-03, 5.751e-02, -8.666e-02, 2.532e-01, 1.303e-01, 7.291e-02, -2.126e-01, 4.815e-01, 1.649e-01, -4.748e-02, -3.330e-01, -1.252e-01, -8.987e-03, -4.285e-03, -1.106e-01));
// Bias term.
r += V4(3.566e-03, 2.403e-03, -1.451e-03, 4.304e-03);
return r;
}
// Pass 2 ("conv1"): one thread per input-sized texel. Reads the 3x3
// neighbourhood of t0, splits every tap into its non-negative part
// max(x, 0) and its non-positive part -max(-x, 0), and writes f0 of those
// 18 vectors to t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());

	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// Thread lies outside the texture (partial block at the edge).
		return;
	}

	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, +1; x = -1, 0, +1 within a row).
	const V4 k0 = l0(-1.0, -1.0);
	const V4 k1 = l0(0.0, -1.0);
	const V4 k2 = l0(1.0, -1.0);
	const V4 k3 = l0(-1.0, 0.0);
	const V4 k4 = l0(0.0, 0.0);
	const V4 k5 = l0(1.0, 0.0);
	const V4 k6 = l0(-1.0, 1.0);
	const V4 k7 = l0(0.0, 1.0);
	const V4 k8 = l0(1.0, 1.0);

	t1[texel] = f0(
		// Non-negative halves.
		max(k0, 0.0), max(k1, 0.0), max(k2, 0.0),
		max(k3, 0.0), max(k4, 0.0), max(k5, 0.0),
		max(k6, 0.0), max(k7, 0.0), max(k8, 0.0),
		// Non-positive halves.
		-max(-k0, 0.0), -max(-k1, 0.0), -max(-k2, 0.0),
		-max(-k3, 0.0), -max(-k4, 0.0), -max(-k5, 0.0),
		-max(-k6, 0.0), -max(-k7, 0.0), -max(-k8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass-3 ("conv2") kernel: combines 18 four-channel inputs into one
// four-channel value.
// Every input is multiplied, as the left operand of mul(), by its own constant
// 4x4 matrix; the products are accumulated in argument order and a constant
// bias vector is added last.
// Pass3 calls this with s0_* = max(tap, 0) and s1_* = -max(-tap, 0) for the
// nine taps of a 3x3 neighbourhood of t1.
// NOTE(review): the constants look machine-generated (presumably trained
// network weights) - keep values and accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-1.173e-02, 2.762e-03, -2.225e-03, -6.814e-03, 8.328e-02, -1.275e-02, 6.091e-02, -6.470e-02, -6.067e-02, -1.086e-01, 7.501e-02, 1.227e-01, -1.551e-02, -1.728e-02, -2.694e-02, 7.490e-02));
r += mul(s0_1, M4(5.326e-02, 1.003e-02, 3.989e-02, -1.908e-03, -4.580e-02, -4.303e-03, 4.333e-02, 8.324e-02, 8.170e-01, 8.040e-01, -3.975e-01, -1.034e+00, 1.362e-01, 3.776e-04, -1.102e-02, -5.030e-02));
r += mul(s0_2, M4(-6.068e-02, 6.212e-02, -4.979e-02, 9.626e-03, 1.301e-02, -2.045e-02, 1.798e-02, 2.091e-02, -2.290e-01, 3.612e-01, -7.014e-02, 1.669e-01, -5.191e-03, 1.304e-02, 9.444e-05, -2.137e-02));
r += mul(s0_3, M4(-3.235e-02, -6.238e-02, 3.894e-02, 5.893e-02, -3.530e-02, -1.063e-01, 8.668e-02, 1.232e-02, -3.851e-02, 2.952e-02, 6.132e-02, -5.755e-02, 8.317e-02, 8.340e-02, -8.227e-02, 6.481e-03));
r += mul(s0_4, M4(2.118e-02, 2.725e-01, -1.393e-01, -2.377e-01, 4.872e-01, 2.235e-01, -1.746e-02, -3.662e-01, -3.945e-01, -1.862e-01, -9.132e-02, 8.777e-02, -5.084e-01, -3.300e-01, -3.443e-02, 4.203e-01));
r += mul(s0_5, M4(1.165e-01, -1.743e-01, 4.169e-03, -1.518e-01, 1.174e-01, -3.314e-02, 2.295e-02, -9.160e-02, -1.854e-01, -6.999e-02, -6.985e-02, 4.875e-04, -1.147e-01, 1.722e-01, -2.588e-02, 1.185e-01));
r += mul(s0_6, M4(-8.881e-03, 1.907e-03, 9.002e-03, 8.085e-03, -8.728e-03, -1.074e-01, 7.035e-02, 6.519e-02, 4.323e-02, -4.675e-02, 4.382e-02, 1.091e-02, 3.357e-02, 4.384e-02, -8.031e-03, -1.945e-02));
r += mul(s0_7, M4(-7.981e-02, 1.492e-02, -9.399e-02, -3.750e-02, -1.274e-01, -3.235e-02, -3.169e-02, 6.420e-02, 4.304e-02, 9.302e-02, 1.250e-02, 3.906e-03, 1.752e-01, -1.211e-02, 9.058e-02, -6.273e-02));
r += mul(s0_8, M4(-1.290e-02, -4.309e-02, 3.384e-02, 3.819e-02, -3.309e-02, 3.986e-02, 3.783e-03, 5.361e-02, 5.473e-02, 1.574e-02, -2.385e-02, -7.630e-02, -1.778e-02, 1.375e-02, -2.936e-02, -1.778e-02));
r += mul(s1_0, M4(1.219e-01, 1.166e-02, -5.932e-02, 1.191e-02, -2.487e-03, -5.945e-02, 6.637e-02, 5.775e-02, -1.705e-02, 5.538e-02, -5.130e-02, -3.602e-02, 5.461e-02, -1.253e-01, 6.953e-02, 1.066e-01));
r += mul(s1_1, M4(6.504e-01, -9.638e-01, 1.371e+00, 5.682e-02, 1.583e-02, -2.371e-02, 5.201e-02, 3.845e-02, 3.478e-02, -1.477e-01, 1.763e-01, 5.129e-02, 2.992e-01, -3.335e-01, 2.490e-02, 4.873e-01));
r += mul(s1_2, M4(2.415e-02, 8.838e-02, -1.519e-01, 9.012e-02, -6.676e-02, 3.422e-02, -2.380e-02, 5.608e-02, -1.744e-01, -9.595e-02, -7.627e-02, -5.823e-02, -9.466e-02, 5.554e-02, -1.024e-01, -1.763e-01));
r += mul(s1_3, M4(8.380e-02, -7.972e-02, 8.813e-02, 3.371e-02, 5.392e-03, 4.385e-02, 1.207e-02, -5.728e-02, -3.427e-03, -2.027e-03, 1.211e-03, -7.897e-03, 3.360e-02, 4.603e-02, -1.240e-02, -2.219e-02));
r += mul(s1_4, M4(-6.699e-01, -3.512e-01, -2.153e-01, 3.218e-01, -5.100e-01, 4.324e-03, 2.713e-01, -2.073e-01, 1.547e-01, -2.123e-03, 7.928e-02, -5.698e-02, 2.450e-02, -4.866e-02, 9.436e-02, 7.900e-02));
r += mul(s1_5, M4(1.609e-01, -7.910e-02, 1.112e-01, -2.959e-02, -3.877e-01, -2.803e-01, -1.071e-01, -6.881e-03, 1.922e-02, 2.433e-02, -3.581e-02, -5.264e-02, -3.287e-01, -1.037e-02, -6.159e-02, 8.219e-02));
r += mul(s1_6, M4(-4.263e-02, -6.372e-02, 2.607e-02, 5.285e-02, -6.156e-02, -7.837e-02, 7.299e-03, 8.959e-02, -8.706e-03, -1.642e-02, 1.825e-02, 1.850e-02, 2.735e-02, 2.413e-02, -3.236e-02, -9.612e-03));
r += mul(s1_7, M4(-5.849e-02, 1.530e-01, -6.767e-02, -1.392e-02, -3.430e-01, -1.851e-01, -1.013e-01, 2.465e-01, -1.715e-02, 4.970e-03, -1.850e-02, -4.214e-03, 1.889e-02, -5.787e-02, 7.154e-02, 9.237e-02));
r += mul(s1_8, M4(-2.084e-02, -2.484e-01, 5.767e-02, -2.550e-02, -9.126e-02, 4.292e-01, 1.983e-02, 2.979e-01, -3.807e-03, -3.367e-03, 1.835e-03, 8.694e-03, -9.074e-02, 4.820e-02, -2.886e-02, 5.975e-02));
// Bias term.
r += V4(5.508e-03, 4.690e-03, -5.708e-04, -7.674e-03);
return r;
}
// Pass 3 entry point. A thread resolves one coordinate (gxy) inside the
// input-sized grid, fetches the 3x3 neighbourhood around it with l0(), splits
// every tap into max(x, 0) and -max(-x, 0), and stores f0() of those 18
// vectors into t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are never referenced by name below.
	// NOTE(review): presumably the l0() macro (defined outside this function)
	// reads them, so they must keep exactly these names — confirm.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 bounds = GetInputSize();
	// Threads that land outside the input do nothing.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// f0 takes the nine positive parts first, then the nine negative parts.
	t0[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	             lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 4
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// l0(x, y): fetch t0 through the O() macro with the offset (x, y) and convert
// the result to V4.
// NOTE(review): O() is defined elsewhere in this file; presumably it reads
// `pos` and `pt` from the calling scope — confirm before renaming either.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: weighted sum for the out-shuffle pass.
// Each of the 18 V4 arguments is the vector operand of a mul() with its own
// constant 4x4 matrix. The products are accumulated into r in argument order,
// a constant bias vector is added, and tanh() of the total is returned.
// Pass4 supplies s0_0..s0_8 as the max(x, 0) parts and s1_0..s1_8 as the
// -max(-x, 0) parts of the nine l0() taps taken at offsets -1..1 in x and y.
// NOTE(review): the constants look like generated/trained weights — treat them
// as data and do not reorder or reformat the accumulation.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-1.841e-04, -5.677e-02, 9.249e-03, -8.726e-03, 4.041e-02, -1.295e-01, 1.154e-01, 2.765e-02, 1.833e-01, -8.427e-02, 1.078e-01, -1.432e-01, 1.068e-01, -1.222e-01, 2.535e-02, 5.316e-02));
r += mul(s0_1, M4(-3.609e-03, 5.812e-02, -4.650e-02, -2.093e-02, -3.442e-02, 7.643e-02, 1.424e-02, 7.195e-02, 1.552e-01, -8.291e-01, 1.547e-01, 4.354e-01, -2.851e-02, 1.023e-01, -8.481e-03, -6.567e-02));
r += mul(s0_2, M4(1.724e-02, -1.165e-02, 1.007e-02, -3.008e-02, -9.814e-04, -2.007e-02, -5.905e-03, 6.714e-03, -1.736e-01, 2.035e-01, -1.333e-01, 1.250e-01, -9.118e-03, -4.989e-02, 2.142e-02, -4.038e-03));
r += mul(s0_3, M4(7.885e-02, -8.350e-02, -6.025e-03, -1.139e-01, -8.380e-02, -6.836e-02, -5.589e-01, -4.614e-01, -6.742e-01, 2.118e-01, -4.442e-01, 2.197e-01, -5.873e-02, 1.902e-01, -4.687e-01, -4.712e-01));
r += mul(s0_4, M4(-4.506e-01, 2.396e-01, -1.350e-02, 4.072e-01, 3.249e-01, 9.930e-02, 1.576e-02, -2.456e-01, 1.506e+00, 6.047e-02, 8.841e-01, -1.927e+00, -4.337e-01, -5.801e-01, 3.334e-01, 8.276e-02));
r += mul(s0_5, M4(5.049e-02, -1.870e-01, 7.413e-02, -2.569e-02, -2.152e-02, 1.139e-01, -3.874e-02, 1.634e-02, -1.325e-01, 4.002e-02, -1.874e-01, 1.204e-01, 2.267e-02, 1.380e-02, -1.055e-02, 5.504e-02));
r += mul(s0_6, M4(-2.855e-02, 1.255e-02, 3.941e-02, 4.466e-03, 4.814e-05, -9.003e-03, 1.231e-01, 5.676e-02, 5.020e-02, -5.407e-02, -1.951e-01, 4.240e-02, 3.525e-02, -1.021e-01, 4.517e-01, 2.399e-01));
r += mul(s0_7, M4(-5.781e-02, -4.964e-02, -3.981e-01, -1.716e-01, 3.430e-02, -1.644e-02, 2.352e-01, 1.938e-01, 1.266e-01, -1.061e-01, 7.754e-01, 5.337e-01, 2.664e-01, 3.669e-01, -1.113e+00, -1.742e-01));
r += mul(s0_8, M4(2.948e-02, 3.723e-02, 2.739e-02, -5.215e-02, -1.542e-02, -2.173e-02, -1.944e-02, 1.856e-02, -4.535e-02, 1.163e-02, -5.014e-02, 8.660e-02, 1.421e-01, 2.314e-01, 1.171e-02, -4.975e-01));
r += mul(s1_0, M4(-4.408e-02, -3.573e-02, 3.842e-02, 2.571e-02, 2.872e-01, -4.960e-01, 2.569e-01, -6.254e-02, 2.158e-02, -6.452e-02, 7.495e-02, 1.997e-02, 4.094e-02, -9.741e-02, 3.542e-02, -8.115e-03));
r += mul(s1_1, M4(3.480e-02, 1.949e-04, 1.780e-02, 4.483e-02, -2.814e-01, 4.229e-01, -5.482e-02, 1.512e-02, -3.120e-02, 3.945e-02, 4.626e-02, 7.013e-02, -6.686e-03, 5.832e-02, -4.408e-02, -1.262e-02));
r += mul(s1_2, M4(-9.847e-03, 1.973e-03, 1.457e-02, 2.290e-02, 4.741e-02, 2.270e-02, 8.902e-04, 1.152e-02, -2.473e-02, -1.948e-02, -3.475e-03, 4.431e-02, 2.044e-02, 1.571e-04, 9.470e-03, -2.825e-02));
r += mul(s1_3, M4(5.918e-02, -1.939e-02, -4.628e-02, -7.774e-02, -3.040e-01, 8.634e-02, -5.254e-01, -6.906e-01, -1.218e-01, -6.178e-02, -3.115e-01, -2.697e-01, -2.402e-02, -2.149e-02, -3.878e-01, -3.453e-01));
r += mul(s1_4, M4(2.920e-01, 3.711e-01, -2.753e-01, -4.654e-02, 1.379e-01, 3.908e-01, -4.798e-01, 6.668e-01, 4.870e-01, -1.634e-01, -7.790e-02, -2.683e-01, -4.834e-01, -1.822e-02, -8.492e-03, 7.620e-02));
r += mul(s1_5, M4(-4.786e-02, 2.412e-02, 4.992e-02, -1.913e-01, 9.058e-02, -4.485e-02, 8.249e-02, -9.418e-02, 3.555e-02, 3.543e-01, -1.140e-01, -1.358e-01, 5.079e-02, -2.007e-01, 6.132e-02, -2.373e-03));
r += mul(s1_6, M4(6.553e-03, -7.804e-03, 8.569e-02, 4.875e-02, 5.085e-02, 1.728e-02, 6.949e-02, 1.313e-01, 1.825e-02, -5.557e-02, -7.548e-03, -5.534e-02, 7.059e-02, 4.382e-02, 2.807e-01, 1.919e-01));
r += mul(s1_7, M4(-1.071e-01, -3.709e-02, -4.757e-01, -1.943e-01, 8.182e-02, -3.334e-02, 4.170e-01, 6.716e-02, 1.563e-01, 1.382e-01, 7.441e-01, 4.082e-01, -9.101e-02, -3.943e-02, -5.142e-01, -1.910e-01));
r += mul(s1_8, M4(4.255e-03, 4.204e-02, 5.834e-02, -6.508e-02, -3.675e-02, 1.165e-02, -2.694e-02, -2.212e-02, -3.036e-02, -4.393e-02, 1.855e-03, 1.909e-01, 3.812e-02, 3.309e-02, 3.942e-02, -7.422e-02));
// Constant bias added after all 18 products.
r += V4(-1.734e-03, -1.825e-03, -1.635e-03, -1.665e-03);
// tanh keeps every component inside (-1, 1).
return tanh(r);
}
// Pass 4 entry point (out-shuffle). A thread owns one 2x2 quad of OUTPUT whose
// top-left pixel is gxy. It evaluates f0() once on the 3x3 l0() neighbourhood
// of the matching half-resolution coordinate, then for each of the four output
// pixels samples INPUT with the linear sampler, adds one component of the f0()
// result to the first YUV channel and writes the RGB conversion.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are presumably read by the l0() macro, so
	// they must keep exactly these names — confirm.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 bounds = GetOutputSize();
	// Quads whose top-left pixel is outside the output are skipped.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// One component per output pixel of the quad: x, y, w, z in visiting order.
	V4 lumaDelta = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	                  lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);

	static const float3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081
	};
	static const float3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099
	};

	float2 opt = float2(GetOutputPt());
	// Step back half an output texel in both axes: first pixel of the quad.
	pos -= 0.5f * opt;

	// (0, 0) of the quad.
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.x);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (1, 0): one pixel to the right.
	gxy.x += 1;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.y);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (1, 1): one pixel down.
	gxy.y += 1;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.w);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (0, 1): back one pixel to the left.
	gxy.x -= 1;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.z);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);
}

View file

@ -0,0 +1,413 @@
// CuNNy 3x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N03
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with the point sampler SP at `pos + p * pt`, mip 0.
// The expansion reads `pos` and `pt` from the calling scope, so every pass
// function must declare both under exactly those names.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the min16float vector and matrix types used by the passes.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): sample INPUT through O() at offset (x, y), reduce its rgb to one
// scalar (dot product with fixed coefficients plus a constant offset) and cast
// the result to min16float. O() reads `pos` and `pt` from the calling scope.
#define l0(x, y) min16float((dot(float3(-2.683e-01, -5.217e-01, -1.382e-01), O(INPUT, float2(x, y)).rgb) + 7.973e-01))
// f0: weighted sum for pass 1.
// Each scalar tap s0_k scales its own constant V4; the nine scaled vectors and
// a constant bias vector are accumulated into r in argument order and returned.
// Pass1 supplies the nine l0() taps taken at offsets -1..1 in x and y.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(1.850e-01, -2.860e-02, -5.321e-01, 2.390e-03) * s0_0;
r += V4(-4.299e-01, -2.946e-02, -1.180e-01, -5.652e-02) * s0_1;
r += V4(-4.798e-01, -2.276e-02, 3.201e-02, 4.870e-02) * s0_2;
r += V4(2.783e-01, -2.262e-03, -1.864e-01, 1.793e-01) * s0_3;
r += V4(9.435e-04, 8.115e-01, 7.806e-01, -7.793e-01) * s0_4;
r += V4(2.180e-01, -2.564e-05, 2.774e-03, -7.015e-02) * s0_5;
r += V4(1.479e-03, -4.675e-02, 3.323e-02, 3.392e-01) * s0_6;
r += V4(1.203e-01, 1.509e-02, 5.239e-02, 3.194e-01) * s0_7;
r += V4(7.680e-02, -4.310e-02, -7.203e-02, 1.255e-02) * s0_8;
// Constant bias added after the nine taps.
r += V4(3.156e-02, 7.379e-02, 1.078e-02, -5.510e-04);
return r;
}
// Pass 1 entry point. A thread resolves one coordinate (gxy) inside the
// input-sized grid, reads the nine scalar l0() taps of its 3x3 neighbourhood
// and stores f0() of them into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 bounds = GetInputSize();
	// Threads that land outside the input do nothing.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Taps row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	min16float tap0 = l0(-1.0, -1.0);
	min16float tap1 = l0(0.0, -1.0);
	min16float tap2 = l0(1.0, -1.0);
	min16float tap3 = l0(-1.0, 0.0);
	min16float tap4 = l0(0.0, 0.0);
	min16float tap5 = l0(1.0, 0.0);
	min16float tap6 = l0(-1.0, 1.0);
	min16float tap7 = l0(0.0, 1.0);
	min16float tap8 = l0(1.0, 1.0);

	t0[gxy] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): sample t0 through O() at offset (x, y) and convert the result to
// V4. O() reads `pos` and `pt` from the calling scope.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: weighted sum for pass 2 (conv1).
// Each of the 18 V4 arguments is the vector operand of a mul() with its own
// constant 4x4 matrix. The products are accumulated into r in argument order,
// a constant bias vector is added, and r is returned without further processing.
// Pass2 supplies s0_0..s0_8 as the max(x, 0) parts and s1_0..s1_8 as the
// -max(-x, 0) parts of the nine l0() taps taken at offsets -1..1 in x and y.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.949e-01, -1.247e-01, -7.307e-02, 8.783e-02, -4.773e-02, 6.012e-02, 8.043e-02, -8.489e-02, 6.760e-02, -7.809e-02, -4.745e-02, -1.304e-02, -1.402e-01, -1.248e-01, 3.334e-01, -1.498e-01));
r += mul(s0_1, M4(7.053e-02, 9.895e-02, 1.655e-01, 2.251e-01, 3.511e-02, -1.010e-01, -2.736e-01, 1.174e-01, -2.551e-01, 1.100e-01, 1.518e-01, -4.343e-02, -9.293e-01, 5.327e-01, -2.723e-01, 4.006e-01));
r += mul(s0_2, M4(-2.390e-02, 8.154e-03, -2.332e-02, -3.708e-02, 2.814e-02, 5.506e-02, -2.627e-01, -8.081e-02, -1.062e-01, -6.819e-02, -9.498e-02, -2.749e-01, -2.457e-01, 6.868e-01, 6.527e-03, 7.676e-01));
r += mul(s0_3, M4(2.704e-01, 4.055e-02, -4.756e-01, 2.506e-01, -9.498e-02, 5.838e-02, 1.733e-01, 3.420e-03, -7.051e-02, -8.233e-02, -3.006e-01, 6.824e-02, -1.308e-01, 1.196e-01, 2.560e-01, 8.304e-02));
r += mul(s0_4, M4(4.190e-01, -1.207e-01, 2.708e-01, -6.375e-01, 1.740e-01, 1.955e-03, -1.816e-01, -7.933e-02, -9.308e-01, 1.333e-01, -1.335e-01, -1.401e-01, 3.447e-01, 3.389e-01, 6.660e-01, -3.387e-01));
r += mul(s0_5, M4(7.310e-02, 1.403e-02, 8.114e-02, 7.400e-02, -2.552e-02, -1.607e-01, -1.208e-01, -3.943e-02, -2.743e-02, -7.229e-03, -1.749e-03, 3.062e-01, 1.429e-01, 8.105e-01, 3.562e-01, 4.580e-01));
r += mul(s0_6, M4(2.115e-01, -1.686e-01, -1.948e-01, -1.191e-01, -5.798e-02, 3.493e-02, 8.264e-02, 1.579e-01, -1.081e-01, -1.775e-01, -8.196e-02, -2.085e-01, 6.791e-02, 1.652e-02, -4.933e-03, 2.833e-02));
r += mul(s0_7, M4(-2.160e-01, -3.858e-01, -8.407e-01, -1.091e-01, 8.415e-03, 8.626e-02, 2.340e-01, 9.177e-02, -4.697e-01, -6.623e-02, -5.176e-01, 6.762e-02, -3.437e-03, 6.570e-02, 7.630e-02, 8.988e-02));
r += mul(s0_8, M4(6.527e-02, -6.320e-02, 1.192e-02, -1.196e-01, -1.605e-02, -9.294e-03, 1.955e-01, -2.356e-02, -3.582e-02, 1.377e-02, 9.253e-02, -2.362e-02, 3.578e-02, 1.822e-01, 3.329e-01, 1.489e-01));
r += mul(s1_0, M4(1.154e-01, -1.822e-01, -2.122e-01, 3.031e-02, 6.550e-01, -4.855e-02, 6.554e-02, 4.432e-02, 1.671e-02, -4.477e-02, -9.428e-03, 4.413e-03, -3.185e-02, -1.529e-01, -1.222e-01, 6.523e-02));
r += mul(s1_1, M4(-4.920e-02, -1.697e-02, 4.141e-02, 1.997e-01, 6.972e-01, -5.157e-01, 2.031e-01, 2.829e-02, -5.005e-02, 2.335e-01, 2.985e-01, 6.871e-02, -5.232e-01, 2.146e-02, -1.418e+00, 2.193e-01));
r += mul(s1_2, M4(-6.472e-02, 2.595e-02, -2.610e-02, -2.279e-02, 4.165e-01, -7.745e-01, 1.261e-01, -3.845e-01, 3.279e-02, 2.445e-02, 1.796e-01, -2.581e-01, -3.838e-01, 6.280e-02, -4.893e-01, -1.475e-01));
r += mul(s1_3, M4(9.330e-02, 1.742e-01, -1.685e-01, 2.376e-02, -9.586e-01, -1.236e+00, -7.271e-01, -7.674e-01, 2.500e-01, -3.709e-02, -1.303e-01, 1.490e-01, -2.746e-01, -1.376e-01, -2.321e-02, -1.967e-02));
r += mul(s1_4, M4(3.660e-01, 4.772e-02, 5.524e-01, -2.804e-01, -2.756e+00, -1.336e+00, 2.038e-01, 2.593e+00, 2.156e-01, 3.281e-01, 3.152e-01, 8.064e-01, 3.970e-01, -1.379e-01, -7.518e-02, -2.723e-01));
r += mul(s1_5, M4(5.214e-03, 1.695e-02, 1.024e-01, 1.333e-01, -2.250e-01, -1.298e+00, 4.673e-01, 1.317e+00, 3.036e-01, -1.273e-01, 2.900e-01, 2.249e-02, -1.870e-01, -1.124e-01, -5.879e-01, 6.314e-02));
r += mul(s1_6, M4(-8.225e-02, -1.149e-01, 1.598e-04, -3.662e-01, -8.572e-02, -8.909e-01, 9.891e-02, 1.818e-01, 1.715e-01, -2.348e-01, 1.178e-01, -6.289e-02, 1.522e-02, 1.973e-02, 3.707e-02, 2.911e-02));
r += mul(s1_7, M4(-6.380e-02, 8.661e-02, -2.666e-01, 9.586e-02, -1.257e+00, -2.231e+00, -1.232e+00, 5.642e-01, 5.730e-02, -3.294e-01, -1.151e-01, 2.382e-01, 4.529e-02, 4.927e-02, 9.893e-02, 8.365e-02));
r += mul(s1_8, M4(1.906e-02, -8.920e-02, 8.931e-02, -6.752e-02, -3.680e-01, -1.282e+00, -1.388e-01, -7.545e-02, 6.262e-02, -1.695e-01, 2.278e-01, -3.066e-01, -7.412e-02, 1.145e-02, 4.667e-02, -4.205e-04));
// Constant bias added after all 18 products.
r += V4(1.427e-02, -1.982e-02, 4.114e-03, -2.883e-02);
return r;
}
// Pass 2 entry point (conv1). A thread resolves one coordinate (gxy) inside
// the input-sized grid, fetches the 3x3 neighbourhood around it with l0(),
// splits every tap into max(x, 0) and -max(-x, 0), and stores f0() of those
// 18 vectors into t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 bounds = GetInputSize();
	// Threads that land outside the input do nothing.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// f0 takes the nine positive parts first, then the nine negative parts.
	t1[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	             lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): sample t1 through O() at offset (x, y) and convert the result to
// V4. O() reads `pos` and `pt` from the calling scope.
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0: weighted sum for pass 3 (conv2).
// Each of the 18 V4 arguments is the vector operand of a mul() with its own
// constant 4x4 matrix. The products are accumulated into r in argument order,
// a constant bias vector is added, and r is returned without further processing.
// Pass3 supplies s0_0..s0_8 as the max(x, 0) parts and s1_0..s1_8 as the
// -max(-x, 0) parts of the nine l0() taps taken at offsets -1..1 in x and y.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(2.965e-01, -1.919e-01, 9.202e-02, 8.775e-03, -4.948e-02, 1.061e-01, -3.754e-02, -1.900e-01, -2.114e-01, 1.267e-01, 1.989e-02, 2.570e-02, 4.634e-03, -2.718e-01, 2.171e-01, 1.512e-01));
r += mul(s0_1, M4(-5.527e-01, -4.825e-01, 4.325e-01, 4.447e-01, -6.577e-02, 5.161e-01, 3.286e-02, -3.800e-01, 2.625e-02, 3.835e-01, -7.794e-02, -5.489e-02, -2.647e-01, -4.952e-01, 1.587e-01, 1.471e-01));
r += mul(s0_2, M4(-3.687e-01, -1.096e-01, 1.849e-01, -6.915e-02, 2.257e-01, 2.760e-01, -8.875e-02, -8.871e-02, -8.394e-02, -6.714e-02, 5.322e-03, -3.252e-01, -7.885e-02, -2.723e-01, 6.149e-02, 2.998e-01));
r += mul(s0_3, M4(1.606e-01, -1.199e-01, 3.573e-01, 2.833e-02, 6.514e-03, -2.242e-02, -6.231e-02, 6.702e-02, -8.717e-02, -2.227e-01, -1.626e-01, 5.313e-02, -1.411e-01, -2.445e-02, 1.194e-01, -1.101e-01));
r += mul(s0_4, M4(-1.127e+00, 1.823e-01, 1.358e-01, -1.618e-01, -4.171e-04, -7.771e-02, 2.147e-01, 6.493e-01, 4.989e-01, 3.955e-01, -1.017e-01, -2.861e-01, 3.878e-01, -6.653e-01, -4.968e-01, -5.063e-01));
r += mul(s0_5, M4(-2.270e-01, -3.965e-01, -2.794e-02, 1.487e-01, -2.667e-01, -1.410e-02, 1.475e-01, -4.992e-01, -1.071e-01, 2.096e-01, 1.159e-01, -6.073e-02, -7.157e-02, -2.446e-01, -4.807e-02, 1.968e-01));
r += mul(s0_6, M4(8.199e-02, 8.336e-02, -3.090e-02, -1.287e-02, -6.954e-02, -7.544e-02, 1.272e-01, 7.930e-02, -3.647e-02, -2.685e-02, -4.235e-02, 3.214e-02, -4.526e-02, 1.479e-01, -4.963e-02, -3.035e-02));
r += mul(s0_7, M4(-2.012e-02, -1.497e-02, -2.952e-01, -6.026e-02, 2.135e-03, 2.979e-02, -2.713e-02, 7.951e-03, -8.069e-02, -2.374e-01, 1.865e-01, 1.048e-01, -9.076e-02, 6.683e-02, 9.576e-02, -2.432e-02));
r += mul(s0_8, M4(1.455e-01, 2.613e-01, -1.616e-01, -3.564e-01, 1.229e-01, -3.778e-02, 3.316e-02, 5.927e-02, -1.831e-01, -1.388e-01, 5.986e-02, 2.083e-02, -1.368e-03, 2.394e-01, -1.623e-01, -2.768e-02));
r += mul(s1_0, M4(7.711e-03, -6.696e-04, -3.229e-02, 1.549e-02, -1.596e-01, 2.068e-01, -6.162e-02, -9.571e-02, -1.500e-01, 1.743e-01, 2.746e-02, -5.845e-02, -7.649e-03, -4.265e-03, 4.154e-03, 3.950e-03));
r += mul(s1_1, M4(2.764e-01, -4.505e-02, 4.280e-02, 6.044e-02, 3.396e-02, 2.750e-01, -1.910e-01, -2.153e-01, 9.633e-02, -2.194e-02, -2.131e-01, -1.181e-01, -1.343e-01, 6.123e-02, 1.904e-02, -6.568e-02));
r += mul(s1_2, M4(-3.643e-01, -1.709e-02, 1.528e-01, -1.405e-01, 3.307e-01, -1.979e-03, -1.819e-01, 7.635e-02, 1.266e-01, 2.162e-01, -7.492e-02, -9.075e-02, 4.120e-02, 1.521e-01, -2.790e-03, -4.330e-02));
r += mul(s1_3, M4(1.913e-02, -5.373e-02, 5.748e-02, -1.443e-02, -2.776e-01, -1.162e-01, -1.994e-01, 1.430e-01, 9.058e-02, -3.720e-02, -3.585e-02, -8.516e-02, -2.228e-02, 7.507e-02, -9.620e-02, -1.013e-01));
r += mul(s1_4, M4(-3.592e-01, 1.415e-01, 1.018e+00, -1.555e-01, 5.378e-01, 8.818e-02, 2.190e-01, 1.997e-01, -1.128e-01, 3.331e-02, -1.410e-01, 2.844e-01, 4.756e-01, -5.850e-02, -3.757e-01, -1.716e-01));
r += mul(s1_5, M4(2.636e-02, -3.596e-01, -3.280e-01, 2.027e-01, 3.000e-01, -2.297e-01, 4.282e-02, 1.776e-01, 5.222e-02, 1.751e-01, 4.529e-02, -8.347e-02, -3.409e-01, -2.640e-01, 1.753e-01, -5.672e-01));
r += mul(s1_6, M4(-1.699e-02, 4.941e-02, -2.642e-02, -1.406e-04, -1.655e-01, -1.464e-02, -4.353e-02, 1.946e-01, 6.067e-02, -1.429e-01, 1.170e-01, -4.644e-02, -6.567e-02, -2.264e-02, 6.666e-02, 9.009e-02));
r += mul(s1_7, M4(7.805e-02, 2.173e-02, -3.276e-01, 2.004e-03, -7.789e-02, -1.466e-02, -1.560e-01, -1.126e-01, -3.823e-02, -2.446e-03, 1.465e-01, -2.744e-01, -2.129e-01, -2.141e-02, 4.456e-01, 1.240e-01));
r += mul(s1_8, M4(1.315e-02, 2.686e-01, -1.987e-01, -2.093e-01, 3.184e-02, -8.723e-02, 3.012e-01, 3.580e-01, 1.198e-02, -2.655e-01, 1.455e-01, 7.602e-02, -4.605e-02, 3.276e-01, -2.036e-01, -2.590e-01));
// Constant bias added after all 18 products.
r += V4(-1.292e-02, 8.156e-04, -2.055e-03, -3.100e-03);
return r;
}
// Pass 3 entry point (conv2). A thread resolves one coordinate (gxy) inside
// the input-sized grid, fetches the 3x3 neighbourhood around it with l0(),
// splits every tap into max(x, 0) and -max(-x, 0), and stores f0() of those
// 18 vectors into t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 bounds = GetInputSize();
	// Threads that land outside the input do nothing.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// f0 takes the nine positive parts first, then the nine negative parts.
	t0[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	             lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): sample t0 through O() at offset (x, y) and convert the result to
// V4. O() reads `pos` and `pt` from the calling scope.
#define l0(x, y) V4(O(t0, float2(x, y)))
// f0: weighted sum for pass 4 (conv3).
// Each of the 18 V4 arguments is the vector operand of a mul() with its own
// constant 4x4 matrix. The products are accumulated into r in argument order,
// a constant bias vector is added, and r is returned without further processing.
// Pass4 supplies s0_0..s0_8 as the max(x, 0) parts and s1_0..s1_8 as the
// -max(-x, 0) parts of the nine l0() taps taken at offsets -1..1 in x and y.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(2.151e-02, -4.754e-02, 3.454e-02, -1.338e-03, -4.337e-02, 4.608e-02, -1.116e-01, -2.296e-02, -2.839e-02, -3.878e-01, -2.317e-02, 5.774e-02, 4.317e-03, 6.680e-02, 6.325e-02, -1.449e-01));
r += mul(s0_1, M4(-1.173e-01, -8.942e-02, -1.017e-01, 6.496e-02, 5.558e-02, 2.788e-02, 2.184e-02, -2.837e-03, -1.057e-01, -2.075e-01, -3.255e-02, -1.297e-02, -2.643e-02, -1.695e-02, -9.425e-02, 3.942e-02));
r += mul(s0_2, M4(-1.773e-02, -4.118e-02, -2.141e-02, 4.282e-02, 4.234e-02, -1.221e-02, -3.375e-03, 4.469e-02, -2.586e-01, -1.112e-01, -7.688e-02, 3.426e-02, 8.170e-02, -2.355e-02, -3.737e-02, 3.004e-02));
r += mul(s0_3, M4(2.192e-01, 1.955e+00, 2.012e-01, -2.598e-02, -7.453e-02, 5.510e-02, -1.517e-01, -2.571e-01, -2.182e-02, -2.345e-02, -5.767e-02, -5.534e-02, -1.996e-02, 2.329e-01, 4.447e-04, -1.111e-01));
r += mul(s0_4, M4(3.476e-01, -4.368e-01, -1.180e-01, 5.371e-01, 5.294e-01, 1.509e-01, 2.456e-01, -7.875e-02, 2.055e-01, 9.732e-02, 1.285e-01, 5.178e-01, 3.256e-01, -2.842e-01, 4.421e-02, 3.426e-01));
r += mul(s0_5, M4(6.119e-01, -1.393e-01, -1.144e-02, 2.438e-01, -5.126e-02, -1.049e-01, -7.847e-02, 9.942e-02, 5.371e-01, 9.985e-02, 9.193e-02, -3.067e-02, -1.962e-01, -4.272e-02, -7.821e-03, 2.557e-02));
r += mul(s0_6, M4(1.224e-02, -5.098e-01, 3.052e-01, 5.332e-01, 2.249e-01, 4.201e-02, 5.423e-01, 1.106e-01, -1.056e-02, -4.091e-03, -1.267e-02, -5.280e-02, 1.898e-02, 9.430e-03, 1.470e-02, 7.235e-02));
r += mul(s0_7, M4(-4.342e-01, 2.385e-01, -3.834e-02, -7.654e-02, -9.043e-01, -3.139e-01, -1.511e-01, 3.800e-01, -8.848e-02, -3.911e-02, -7.025e-03, -1.196e-02, -3.322e-03, -1.455e-01, 2.084e-02, 1.106e-01));
r += mul(s0_8, M4(1.382e-01, -1.894e-01, -8.814e-02, 1.373e-01, 1.362e-01, -1.298e-01, -1.007e-01, 1.166e-01, -1.553e-02, 8.530e-02, 2.744e-02, -1.083e-01, -5.606e-02, 5.965e-02, 1.406e-02, -4.496e-02));
r += mul(s1_0, M4(-4.828e-03, -1.035e-01, -5.021e-02, 1.972e-02, -9.942e-03, -3.057e-01, -7.373e-03, 4.274e-02, -3.475e-03, 4.653e-02, 9.115e-03, -5.794e-02, 1.170e-02, 1.322e-01, 1.195e-01, -2.535e-02));
r += mul(s1_1, M4(-5.424e-02, -1.541e-01, -9.945e-02, 8.862e-02, -1.198e-01, -3.591e-05, 4.305e-02, -1.079e-01, 1.605e-02, -3.377e-02, -5.398e-02, 1.201e-02, 3.432e-02, 1.090e-02, 8.871e-02, 3.186e-02));
r += mul(s1_2, M4(-1.108e-01, -3.481e-02, -1.616e-02, -4.136e-03, -3.382e-02, 1.836e-02, -3.071e-02, -3.186e-02, -1.014e-01, -1.412e-01, -7.790e-02, 9.763e-02, -1.624e-02, -2.520e-02, -2.152e-02, 2.524e-02));
r += mul(s1_3, M4(3.337e-03, -1.439e-02, 2.317e-03, 2.097e-01, 5.091e-03, 4.138e-02, -5.988e-02, -2.348e-02, -5.626e-03, 1.695e-02, 2.371e-02, -1.652e-02, 8.541e-02, -1.851e-01, 1.130e+00, -1.181e-01));
r += mul(s1_4, M4(1.184e-01, -3.385e-02, 2.659e-02, 3.233e-01, 2.333e-01, 1.694e-01, 1.915e-01, 1.162e-01, 4.309e-02, -3.793e-02, 1.412e-01, -1.345e-02, -6.074e-01, -2.408e-01, -1.306e-01, 1.033e-01));
r += mul(s1_5, M4(3.452e-01, 1.401e-01, 3.650e-02, -4.950e-02, 1.755e-01, -1.210e-01, -1.041e-02, 1.281e-01, 4.262e-01, 2.166e-02, 3.851e-02, 1.295e-01, -1.910e-01, -2.029e-02, -2.151e-02, -1.537e-02));
r += mul(s1_6, M4(4.989e-03, -5.730e-02, 5.803e-02, 2.946e-02, 1.825e-02, 2.660e-02, -4.900e-03, 3.848e-03, 1.078e-02, 1.823e-02, -4.751e-03, 4.219e-02, -1.024e-01, 7.721e-02, -6.709e-01, 8.423e-02));
r += mul(s1_7, M4(-1.567e-01, 4.125e-02, -2.721e-02, -1.831e-01, 9.470e-03, -1.205e-01, 1.793e-02, 1.160e-01, -4.874e-02, -4.902e-02, -1.479e-01, 7.102e-02, 6.699e-01, -1.383e-01, 1.314e-01, 2.999e-01));
r += mul(s1_8, M4(-2.625e-01, -9.735e-02, -6.038e-02, 3.588e-03, 2.247e-02, 4.993e-02, 1.171e-02, -2.071e-02, 2.066e-01, 2.852e-01, -5.781e-02, -3.231e-01, 6.922e-02, 8.960e-02, 9.107e-02, -2.880e-02));
// Constant bias added after all 18 products.
r += V4(3.045e-03, 3.707e-03, -6.011e-03, -5.162e-03);
return r;
}
// Pass 4 entry point (conv3). A thread resolves one coordinate (gxy) inside
// the input-sized grid, fetches the 3x3 neighbourhood around it with l0(),
// splits every tap into max(x, 0) and -max(-x, 0), and stores f0() of those
// 18 vectors into t1.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 bounds = GetInputSize();
	// Threads that land outside the input do nothing.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// f0 takes the nine positive parts first, then the nine negative parts.
	t1[gxy] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	             lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT
// l0(x, y): sample t1 through O() at offset (x, y) and convert the result to
// V4. O() reads `pos` and `pt` from the calling scope.
#define l0(x, y) V4(O(t1, float2(x, y)))
// f0: weighted sum for pass 5 (out-shuffle).
// Each of the 18 V4 arguments is the vector operand of a mul() with its own
// constant 4x4 matrix. The products are accumulated into r in argument order,
// a constant bias vector is added, and tanh() of the total is returned.
// Pass5 supplies s0_0..s0_8 as the max(x, 0) parts and s1_0..s1_8 as the
// -max(-x, 0) parts of the nine l0() taps taken at offsets -1..1 in x and y,
// and adds one component of the result to each pixel of a 2x2 output quad.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.116e-01, 1.402e-01, 1.439e-02, 5.091e-02, -1.526e-02, -2.562e-02, -1.193e-02, -1.365e-02, -6.156e-02, -3.463e-02, 2.155e-02, -2.192e-02, -2.937e-02, -1.072e-01, -4.538e-02, -3.302e-02));
r += mul(s0_1, M4(-1.192e-02, -1.724e-02, 9.899e-03, -5.861e-03, -1.552e-02, 2.422e-02, 4.929e-03, 7.339e-03, 4.700e-02, 1.993e-01, -6.323e-02, 5.778e-02, 1.499e-01, 3.916e-01, -4.578e-02, -2.026e-02));
r += mul(s0_2, M4(5.431e-03, 1.916e-03, -2.064e-03, -6.545e-04, -1.731e-02, -8.081e-02, 1.391e-02, -7.036e-03, 7.739e-02, -1.588e-01, 2.970e-02, 3.357e-02, 3.869e-02, -7.824e-02, 1.813e-02, -6.252e-02));
r += mul(s0_3, M4(5.283e-01, 8.076e-02, 3.430e-01, 2.332e-01, -3.540e-02, 1.903e-02, -1.354e-02, -1.415e-02, -1.644e-01, -1.319e-02, -9.781e-02, -3.256e-02, 2.768e-02, -3.914e-02, 1.596e-01, -1.067e-01));
r += mul(s0_4, M4(-1.638e-02, 4.385e-01, -1.479e-01, -1.789e-02, -1.399e-01, -5.884e-02, -7.306e-02, -2.036e-03, 5.196e-01, -1.849e-01, 8.771e-01, 3.595e-01, -7.094e-01, 2.485e-02, -3.977e-02, 7.246e-01));
r += mul(s0_5, M4(-1.647e-03, -6.027e-03, -3.787e-03, -1.975e-02, -4.810e-02, -4.557e-01, 4.921e-02, -1.313e-01, -2.044e-02, 3.533e-01, -7.591e-02, 1.249e-02, 2.648e-02, -5.215e-01, 1.204e-01, -2.254e-01));
r += mul(s0_6, M4(-2.852e-02, -1.630e-02, 1.249e-01, -1.758e-02, 4.285e-02, 1.425e-02, -1.595e-02, 2.618e-02, 4.460e-03, 1.266e-02, -3.914e-02, 1.111e-02, 5.378e-02, 2.199e-02, 2.561e-03, 2.125e-02));
r += mul(s0_7, M4(-6.567e-02, -4.333e-02, -4.153e-03, 1.692e-01, 5.376e-02, 5.736e-02, -1.860e-01, -9.094e-02, 3.357e-02, -3.186e-02, 1.244e-01, -9.606e-02, 6.227e-02, 6.827e-02, -2.086e-01, -6.625e-02));
r += mul(s0_8, M4(4.553e-05, -3.116e-02, 1.023e-02, 2.322e-02, 8.623e-02, 1.125e-01, 2.802e-02, -2.768e-01, -1.003e-01, -2.143e-02, -2.413e-02, 1.460e-01, 5.421e-02, 5.798e-02, 3.478e-03, -1.421e-01));
r += mul(s1_0, M4(2.165e-01, 1.123e-01, -3.653e-02, -6.070e-03, -1.021e-01, -6.901e-04, 6.256e-03, -3.182e-03, -4.285e-02, -6.763e-02, 2.278e-02, -1.860e-02, -2.689e-02, 2.567e-02, 2.634e-03, 3.600e-02));
r += mul(s1_1, M4(-1.159e-01, -1.198e-01, 2.991e-02, -6.143e-02, 1.038e-01, -5.076e-02, -1.785e-02, -3.611e-02, 6.860e-02, 9.302e-02, -1.125e-02, 3.332e-02, 6.457e-02, -3.919e-02, 4.158e-03, -1.201e-02));
r += mul(s1_2, M4(-6.554e-03, 3.359e-02, -2.003e-02, -2.227e-04, 3.354e-02, -3.700e-02, -9.588e-03, -3.740e-02, -1.336e-02, -2.556e-04, -4.733e-03, -1.636e-02, 1.127e-02, 1.421e-02, -1.019e-02, -2.731e-02));
r += mul(s1_3, M4(3.642e-01, -3.756e-03, 6.584e-01, 1.773e-01, -1.638e-02, 1.109e-02, -7.427e-02, -1.572e-02, -1.869e-01, -3.059e-02, -8.088e-02, -5.092e-02, -5.794e-02, -4.431e-02, -7.912e-02, -9.767e-02));
r += mul(s1_4, M4(-3.255e-02, 3.115e-01, -2.109e-01, 2.804e-01, -6.504e-01, -1.342e-02, 1.355e-01, 3.623e-01, 5.142e-01, 2.124e-01, 1.866e-01, 2.268e-01, -2.470e-02, 1.629e-01, 1.163e-01, 1.663e-01));
r += mul(s1_5, M4(-1.093e-02, -1.640e-04, -3.502e-02, -3.746e-02, 1.836e-02, -5.959e-01, 1.323e-01, -2.388e-01, 3.482e-02, 1.823e-01, -3.895e-02, 5.164e-03, -7.314e-02, -3.897e-01, 6.275e-02, -3.974e-02));
r += mul(s1_6, M4(7.922e-03, -3.284e-02, 1.274e-01, -2.930e-02, 6.307e-02, 2.548e-02, -4.094e-02, 2.130e-02, -1.123e-02, 1.824e-03, -9.595e-02, 1.808e-02, 7.955e-02, 3.285e-02, 4.592e-02, 7.153e-02));
r += mul(s1_7, M4(-6.410e-02, -1.423e-02, -4.912e-02, 1.461e-01, 6.612e-02, 9.838e-02, -2.153e-01, -1.067e-01, -1.108e-02, -1.048e-01, 2.778e-01, -1.116e-01, 4.569e-02, 2.955e-02, -1.440e-01, -3.364e-02));
r += mul(s1_8, M4(1.721e-02, 1.171e-02, 1.096e-02, -2.832e-02, 7.446e-02, 4.785e-02, 8.270e-03, -1.640e-01, -8.912e-02, -6.617e-02, 3.225e-03, 9.894e-02, 4.367e-02, 8.102e-02, -1.779e-02, -2.410e-01));
// Constant bias added after all 18 products.
r += V4(1.708e-05, 2.435e-04, 1.267e-03, 1.926e-03);
// tanh keeps every component inside (-1, 1).
return tanh(r);
}
// Pass 5 entry point (out-shuffle). A thread owns one 2x2 quad of OUTPUT whose
// top-left pixel is gxy. It evaluates f0() once on the 3x3 l0() neighbourhood
// of the matching half-resolution coordinate, then for each of the four output
// pixels samples INPUT with the linear sampler, adds one component of the f0()
// result to the first YUV channel and writes the RGB conversion.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0() reads them.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 bounds = GetOutputSize();
	// Quads whose top-left pixel is outside the output are skipped.
	if (any(gxy >= bounds)) {
		return;
	}
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw taps, row by row (y = -1, 0, 1; x = -1, 0, 1 inside each row).
	V4 tap0 = l0(-1.0, -1.0);
	V4 tap1 = l0(0.0, -1.0);
	V4 tap2 = l0(1.0, -1.0);
	V4 tap3 = l0(-1.0, 0.0);
	V4 tap4 = l0(0.0, 0.0);
	V4 tap5 = l0(1.0, 0.0);
	V4 tap6 = l0(-1.0, 1.0);
	V4 tap7 = l0(0.0, 1.0);
	V4 tap8 = l0(1.0, 1.0);

	// Split each tap: hiN = max(x, 0), loN = -max(-x, 0).
	V4 hi0 = max(tap0, 0.0), lo0 = -max(-tap0, 0.0);
	V4 hi1 = max(tap1, 0.0), lo1 = -max(-tap1, 0.0);
	V4 hi2 = max(tap2, 0.0), lo2 = -max(-tap2, 0.0);
	V4 hi3 = max(tap3, 0.0), lo3 = -max(-tap3, 0.0);
	V4 hi4 = max(tap4, 0.0), lo4 = -max(-tap4, 0.0);
	V4 hi5 = max(tap5, 0.0), lo5 = -max(-tap5, 0.0);
	V4 hi6 = max(tap6, 0.0), lo6 = -max(-tap6, 0.0);
	V4 hi7 = max(tap7, 0.0), lo7 = -max(-tap7, 0.0);
	V4 hi8 = max(tap8, 0.0), lo8 = -max(-tap8, 0.0);

	// One component per output pixel of the quad: x, y, w, z in visiting order.
	V4 lumaDelta = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8,
	                  lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);

	static const float3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081
	};
	static const float3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099
	};

	float2 opt = float2(GetOutputPt());
	// Step back half an output texel in both axes: first pixel of the quad.
	pos -= 0.5f * opt;

	// (0, 0) of the quad.
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.x);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (1, 0): one pixel to the right.
	gxy.x += 1;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.y);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (1, 1): one pixel down.
	gxy.y += 1;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.w);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

	// (0, 1): back one pixel to the left.
	gxy.x -= 1;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	yuv.x = saturate(yuv.x + lumaDelta.z);
	OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);
}

View file

@ -0,0 +1,413 @@
// CuNNy 3x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N03
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with the point sampler SP at `pos + p * pt`, mip 0.
// The expansion reads `pos` and `pt` from the calling scope, so every pass
// function must declare both under exactly those names.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the min16float vector and matrix types used by the passes.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): samples INPUT at an offset of (x, y) * pt from pos (via the O() macro),
// reduces the RGB value to one scalar with a fixed weighted sum plus a constant offset,
// and casts it to min16float. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) min16float((dot(float3(6.094e-01, 1.148e+00, 2.568e-01), O(INPUT, float2(x, y)).rgb) + -1.542e+00))
// First layer ("in"): combines nine scalar inputs into one 4-component vector.
// Each input scales its own constant 4-vector; the nine products and a constant
// bias vector are summed. Pass1 calls this with the 3x3 neighbourhood produced by l0().
// The constants are generated network weights - do not edit by hand.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(-6.372e-02, 1.685e-01, -2.573e-02, -2.185e-02) * s0_0;
r += V4(-3.502e-02, -2.984e-03, 5.048e-02, -2.445e-01) * s0_1;
r += V4(9.644e-02, -7.557e-03, -1.770e-02, 3.162e-02) * s0_2;
r += V4(7.199e-02, -6.233e-01, -4.180e-01, 1.392e-01) * s0_3;
r += V4(-5.683e-01, 1.451e-01, -8.148e-02, 9.768e-02) * s0_4;
r += V4(4.702e-01, -1.319e-03, 3.745e-03, -4.204e-02) * s0_5;
r += V4(9.855e-03, 3.213e-01, 5.098e-01, 4.001e-02) * s0_6;
r += V4(8.216e-02, -1.219e-02, -3.347e-02, 5.017e-02) * s0_7;
r += V4(-6.691e-02, 5.417e-03, 1.235e-02, -9.640e-03) * s0_8;
// Bias term.
r += V4(-4.952e-03, -2.750e-03, -9.137e-04, 6.736e-02);
return r;
}
void Pass1(uint2 blockStart, uint3 tid) {
	// Pass 1 ("in"): one thread per texel. Reads the 3x3 neighbourhood of INPUT through
	// l0() and stores the first-layer result in t0.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}

	// Sampling position of this texel's centre (presumably normalised UV - pt comes from GetInputPt()).
	const float2 pos = (gxy + 0.5) * pt;

	// Neighbourhood taps, row by row: top, middle, bottom.
	const min16float n0 = l0(-1.0, -1.0);
	const min16float n1 = l0(0.0, -1.0);
	const min16float n2 = l0(1.0, -1.0);
	const min16float n3 = l0(-1.0, 0.0);
	const min16float n4 = l0(0.0, 0.0);
	const min16float n5 = l0(1.0, 0.0);
	const min16float n6 = l0(-1.0, 1.0);
	const min16float n7 = l0(0.0, 1.0);
	const min16float n8 = l0(1.0, 1.0);

	t0[gxy] = f0(n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): samples t0 at an offset of (x, y) * pt from pos (via the O() macro) and
// converts the result to V4. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Layer "conv1": each of the eighteen 4-component inputs is multiplied by its own
// constant 4x4 matrix with mul(vector, matrix); the products and a constant bias are summed.
// As called from Pass2, s0_* carry the positive parts and s1_* the negative parts of the
// 3x3 neighbourhood samples.
// The constants are generated network weights - do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(3.169e-01, 3.467e-01, -2.365e-01, 2.253e-01, 6.307e-02, 1.727e-01, -1.053e-01, 9.324e-02, -4.901e-02, -2.112e-01, 8.983e-02, -1.851e-01, -1.987e-01, 6.645e-02, 2.188e-02, 1.988e-02));
r += mul(s0_1, M4(4.393e-02, 2.078e-01, -1.967e-01, 4.673e-02, -7.991e-02, 2.461e-01, -6.028e-02, 9.252e-02, 3.871e-01, 6.138e-02, -3.603e-01, -1.485e-01, 2.466e-01, 5.251e-02, -6.181e-02, 8.932e-02));
r += mul(s0_2, M4(-1.707e-02, 2.598e-02, 1.641e-02, 2.780e-02, 2.425e-02, 1.769e-01, -8.461e-02, 1.067e-01, -2.503e-01, 6.051e-01, -2.782e-01, 1.311e-01, -8.456e-03, -1.370e-02, -6.391e-02, 6.935e-02));
r += mul(s0_3, M4(-8.251e-01, -4.981e-01, -1.726e-01, -1.815e-01, 1.411e-01, 2.889e-02, -3.115e-01, -3.255e-01, 1.812e-03, -4.529e-02, 2.350e-01, 1.999e-01, -1.993e-01, -1.868e-02, 4.249e-02, -1.117e-01));
r += mul(s0_4, M4(-4.732e-02, -5.673e-02, 1.274e-01, 4.894e-02, 9.126e-02, 1.717e-01, -3.294e-01, -2.378e-01, -7.089e-02, -8.116e-02, 2.510e-01, 7.381e-02, 1.275e-01, 8.030e-02, -1.671e-01, -1.824e-02));
r += mul(s0_5, M4(3.373e-02, -4.163e-02, -4.077e-02, -2.085e-02, 1.265e-01, -4.133e-01, 7.433e-02, 7.763e-02, -1.466e-01, 3.291e-01, -7.784e-02, 9.472e-02, 2.725e-01, -2.393e-01, -6.913e-02, -9.445e-02));
r += mul(s0_6, M4(3.043e-02, -9.985e-02, 1.538e-01, -2.529e-01, 2.379e-01, 1.079e-01, -1.517e-01, -9.289e-02, -1.396e-01, -4.354e-02, 8.463e-02, 7.052e-02, 5.629e-02, 3.293e-03, 5.342e-02, -1.606e-01));
r += mul(s0_7, M4(3.626e-02, -1.421e-01, 4.017e-02, -3.963e-02, 2.148e-03, 5.522e-02, 3.174e-01, 2.270e-02, -5.590e-02, -9.875e-02, -1.683e-01, 5.415e-02, 1.509e-01, 7.709e-02, -1.161e-01, 1.440e-01));
r += mul(s0_8, M4(-1.132e-02, 2.337e-02, 1.264e-02, 2.638e-03, -6.582e-02, -1.965e-01, 2.803e-01, 1.333e-01, 9.171e-02, 1.567e-01, -2.419e-01, -1.602e-01, -2.271e-01, 3.614e-02, 2.179e-01, 4.826e-02));
r += mul(s1_0, M4(1.452e-01, 1.313e-01, -6.140e-02, 2.412e-01, -3.691e-02, 7.355e-02, -4.209e-02, 1.343e-01, -2.509e-02, -1.266e-01, 9.017e-02, -1.854e-02, -4.280e-01, -1.004e-01, 2.319e-01, 4.211e-02));
r += mul(s1_1, M4(4.894e-02, 7.564e-02, -9.350e-02, 5.422e-02, -6.111e-02, 6.969e-02, -4.398e-02, 6.622e-02, 7.113e-01, 3.461e-01, -5.254e-01, -8.808e-02, 4.481e-01, 3.171e-01, -2.198e-01, 1.048e-01));
r += mul(s1_2, M4(-3.483e-02, 3.150e-03, 2.215e-02, 2.616e-02, 1.468e-01, -1.295e-01, -1.470e-01, 3.371e-02, -4.514e-02, 4.677e-02, -1.313e-01, -1.176e-01, 1.507e-03, 2.290e-01, -2.163e-01, 3.895e-02));
r += mul(s1_3, M4(-2.258e-01, -1.353e-01, -4.873e-01, -1.236e+00, 1.660e-01, -1.803e-02, -2.797e-01, -4.092e-01, -1.525e-01, -8.178e-02, 2.665e-01, 3.652e-01, -1.853e-01, -3.819e-02, 1.627e-01, -3.896e-01));
r += mul(s1_4, M4(-1.005e-01, -3.821e-02, 9.917e-02, -1.324e-01, -2.040e-01, -3.586e-01, 9.776e-02, -1.376e-01, 2.065e-01, 2.017e-01, -1.320e-01, -2.225e-02, 2.944e-01, 5.393e-02, -4.301e-01, -7.240e-02));
r += mul(s1_5, M4(5.353e-02, -4.257e-02, -4.131e-02, -3.943e-02, -6.151e-02, 3.059e-01, -1.481e-02, 3.662e-01, 3.098e-02, -8.774e-02, 1.790e-02, -1.332e-01, 8.670e-02, -6.985e-02, -1.359e-01, 2.063e-01));
r += mul(s1_6, M4(-9.271e-02, 2.259e-01, 2.200e-02, -2.390e-01, 3.258e-01, 1.082e-01, -1.499e-01, -3.063e-02, -2.775e-01, -9.008e-02, 1.294e-01, 3.533e-02, 1.011e-02, 4.294e-02, 4.935e-02, -1.005e-01));
r += mul(s1_7, M4(1.321e-02, -7.160e-02, 7.229e-02, -3.050e-02, 4.303e-02, -1.518e-01, 5.137e-01, 4.029e-02, 4.896e-02, 5.334e-02, -3.545e-01, 2.370e-02, 1.645e-01, 3.433e-02, -9.552e-03, 1.032e-01));
r += mul(s1_8, M4(8.370e-03, -2.408e-02, 2.693e-02, -8.183e-03, -2.375e-02, -2.973e-01, 1.889e-01, 1.096e-01, 1.093e-02, 2.310e-01, -1.613e-01, -1.343e-01, -1.718e-01, -2.165e-02, 1.384e-01, 9.956e-02));
// Bias term.
r += V4(-1.511e-02, -2.848e-03, 7.160e-03, -2.555e-03);
return r;
}
void Pass2(uint2 blockStart, uint3 tid) {
	// Pass 2 ("conv1"): one thread per texel. Reads the 3x3 neighbourhood of t0, splits
	// every sample into its positive and negative part, and stores f0(...) in t1.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Unclamped neighbourhood taps, row by row: top, middle, bottom.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the nine positive parts max(v, 0) first, then the nine negative parts -max(-v, 0).
	t1[gxy] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): samples t1 at an offset of (x, y) * pt from pos (via the O() macro) and
// converts the result to V4. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Layer "conv2": each of the eighteen 4-component inputs is multiplied by its own
// constant 4x4 matrix with mul(vector, matrix); the products and a constant bias are summed.
// As called from Pass3, s0_* carry the positive parts and s1_* the negative parts of the
// 3x3 neighbourhood samples.
// The constants are generated network weights - do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(6.983e-02, 8.935e-03, -1.644e-01, -4.232e-04, -1.981e-01, 9.265e-02, 1.769e-01, 1.705e-01, -2.300e-02, -7.408e-03, -4.221e-02, -1.617e-02, -6.026e-02, -9.185e-03, -7.420e-02, -4.238e-02));
r += mul(s0_1, M4(1.832e-01, -1.117e-01, 1.784e-02, 6.345e-02, -9.651e-02, 5.753e-02, 1.480e-01, 1.284e-01, 3.957e-01, -2.684e-01, 2.853e-02, -5.823e-02, -8.184e-02, 1.062e-01, -2.604e-02, -7.579e-02));
r += mul(s0_2, M4(-1.753e-01, 5.019e-03, -1.285e-01, 8.470e-02, -2.566e-01, 6.556e-02, -9.751e-02, 7.653e-03, -9.466e-02, 3.098e-02, -9.617e-02, -4.826e-02, 3.951e-02, -5.446e-02, 1.297e-01, 1.076e-01));
r += mul(s0_3, M4(-7.377e-02, -2.183e-01, 9.806e-02, 1.735e-01, 2.795e-01, 3.730e-01, 1.906e-01, 1.313e-01, 2.115e-01, 2.222e-01, 1.880e-01, 2.427e-01, -1.177e-01, 2.587e-02, -1.928e-01, -1.489e-01));
r += mul(s0_4, M4(-3.487e-01, -3.194e-01, 7.963e-01, -1.044e-01, 3.136e-01, -5.467e-02, 5.059e-01, -4.801e-02, -4.943e-01, -1.466e-01, -5.938e-02, -9.473e-01, 2.661e-01, -1.545e-01, 1.986e-01, -2.172e-02));
r += mul(s0_5, M4(-3.450e-01, 1.931e-01, -2.303e-01, -1.880e-01, -1.323e-01, 1.839e-01, -1.130e-01, -5.181e-02, 3.049e-02, 9.834e-02, -1.342e-01, -1.072e-01, 1.925e-02, -9.652e-02, 1.169e-01, 2.084e-01));
r += mul(s0_6, M4(1.543e-02, 2.202e-01, 4.809e-02, 1.085e-01, 3.076e-02, -4.127e-01, 4.606e-02, 9.444e-02, 7.886e-02, -1.314e-01, -1.638e-02, 4.353e-02, 9.790e-02, -6.783e-02, -1.008e-01, -1.558e-01));
r += mul(s0_7, M4(-4.453e-02, 3.133e-01, -2.217e-01, -5.271e-02, -2.055e-01, -1.000e-01, 8.374e-02, 6.141e-02, 2.147e-02, -3.844e-01, -2.203e-01, -1.105e-01, -3.596e-02, 2.026e-01, 3.174e-01, 1.519e-01));
r += mul(s0_8, M4(-5.107e-03, 2.380e-01, 2.147e-02, -8.032e-02, -9.743e-02, 6.943e-02, 9.403e-02, 3.742e-02, -1.822e-02, -4.950e-02, 7.963e-02, -1.338e-01, -1.491e-01, 1.655e-02, -5.817e-02, 1.164e-01));
r += mul(s1_0, M4(8.679e-02, -7.335e-02, -5.999e-02, -4.504e-02, -3.329e-02, 4.349e-03, -4.883e-02, 3.159e-02, -7.948e-02, 3.308e-02, 6.579e-02, 1.607e-01, 1.336e-01, -1.042e-01, -2.368e-01, -1.546e-01));
r += mul(s1_1, M4(2.764e-01, -6.665e-02, 1.661e-02, -4.103e-02, 1.095e-01, -1.159e-01, -1.142e-01, -1.412e-01, 4.033e-01, -8.697e-02, 2.387e-01, 1.762e-01, 4.948e-01, -1.533e-01, 7.816e-02, 5.700e-02));
r += mul(s1_2, M4(1.187e-01, -6.571e-02, 4.698e-02, 4.931e-02, -5.523e-02, 3.925e-02, -7.453e-02, -8.429e-02, -2.202e-01, 6.090e-02, -1.460e-01, 2.777e-02, 4.405e-01, 6.445e-03, 3.494e-01, 3.311e-01));
r += mul(s1_3, M4(-4.333e-02, -8.517e-02, 1.372e-01, 2.066e-01, 4.728e-01, 1.195e-01, -2.627e-01, -2.280e-01, 1.606e-01, 2.216e-01, 2.269e-01, 3.505e-01, -2.499e-01, -3.977e-01, -3.659e-02, 1.460e-02));
r += mul(s1_4, M4(-4.640e-01, -7.221e-01, -2.524e-01, -6.513e-01, 6.699e-01, -1.727e-01, 4.444e-01, -3.115e-01, -6.748e-01, 1.063e-01, 6.487e-01, -3.195e-01, -5.136e-01, -8.272e-01, 4.014e-01, 4.914e-01));
r += mul(s1_5, M4(-1.112e-03, -1.293e-02, 1.567e-02, -1.266e-01, 1.185e-01, 4.940e-02, -9.925e-02, -1.034e-01, -1.041e-01, 1.822e-01, -4.277e-02, 1.313e-01, -6.459e-01, -1.562e-01, -3.961e-01, -7.262e-02));
r += mul(s1_6, M4(1.499e-02, 3.135e-01, 2.187e-01, 2.386e-01, 1.171e-01, -4.899e-01, -1.987e-01, -1.717e-01, 5.232e-02, -1.984e-01, 9.338e-04, 1.092e-01, 1.545e-01, 4.183e-01, 1.180e-01, 1.102e-01));
r += mul(s1_7, M4(-1.411e-01, 2.619e-01, -2.549e-01, -2.113e-01, -1.109e-01, -3.038e-01, 7.579e-02, -3.585e-02, -1.373e-03, -2.713e-01, -5.527e-02, 7.052e-02, -1.648e-01, 7.324e-01, 3.974e-01, 2.306e-01));
r += mul(s1_8, M4(-1.861e-02, 9.414e-02, -6.739e-02, -8.921e-02, -2.337e-02, -2.657e-02, -3.376e-03, -7.209e-02, -1.042e-01, -2.504e-02, 1.287e-01, -1.459e-02, -1.617e-01, 2.384e-01, -6.969e-01, -3.760e-01));
// Bias term.
r += V4(-3.514e-03, 2.350e-03, 2.221e-03, 1.089e-03);
return r;
}
void Pass3(uint2 blockStart, uint3 tid) {
	// Pass 3 ("conv2"): one thread per texel. Reads the 3x3 neighbourhood of t1, splits
	// every sample into its positive and negative part, and stores f0(...) in t0.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Unclamped neighbourhood taps, row by row: top, middle, bottom.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the nine positive parts max(v, 0) first, then the nine negative parts -max(-v, 0).
	t0[gxy] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): samples t0 at an offset of (x, y) * pt from pos (via the O() macro) and
// converts the result to V4. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Layer "conv3": each of the eighteen 4-component inputs is multiplied by its own
// constant 4x4 matrix with mul(vector, matrix); the products and a constant bias are summed.
// As called from Pass4, s0_* carry the positive parts and s1_* the negative parts of the
// 3x3 neighbourhood samples.
// The constants are generated network weights - do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.869e-01, 8.774e-02, -6.451e-02, 6.682e-02, 8.374e-02, 1.313e-02, -2.649e-02, 2.741e-02, -3.609e-02, -9.330e-02, -8.233e-02, 1.117e-01, -1.203e-01, 1.719e-02, 1.288e-01, -9.851e-02));
r += mul(s0_1, M4(3.100e-01, 5.063e-02, 1.169e-01, -3.828e-02, 3.428e-01, 4.869e-02, -1.232e-02, -1.003e-02, 2.756e-01, 3.916e-01, 1.450e-01, 1.078e-01, -2.568e-01, -2.157e-01, -1.057e-01, -1.338e-01));
r += mul(s0_2, M4(1.199e-01, -1.890e-01, 5.870e-03, -5.995e-03, 2.255e-01, -2.325e-03, 7.916e-03, -2.038e-02, 1.353e-01, -9.590e-02, -2.119e-02, -5.860e-02, -7.698e-02, -3.608e-02, -3.571e-02, 2.010e-02));
r += mul(s0_3, M4(9.889e-02, -2.665e-02, -2.627e-01, 3.583e-01, 7.891e-02, 8.737e-02, 5.322e-02, 5.246e-04, -5.188e-02, -8.491e-02, -4.991e-02, -3.735e-02, 5.711e-02, 4.482e-02, 5.660e-02, -1.322e-01));
r += mul(s0_4, M4(-5.488e-01, 2.898e-01, 1.046e+00, 6.036e-01, -3.180e-01, -6.309e-01, -2.627e-01, 1.734e-01, -2.067e-01, 3.775e-02, -2.881e-01, -9.242e-02, 3.369e-01, 2.554e-02, -1.645e-01, 4.973e-01));
r += mul(s0_5, M4(6.976e-03, -1.830e-01, 2.842e-01, 2.570e-02, -2.902e-01, 5.059e-01, 1.944e-01, 1.794e-02, -1.333e-01, 2.341e-01, 4.161e-01, -5.179e-02, 8.176e-02, -2.435e-02, -1.598e-02, 6.211e-02));
r += mul(s0_6, M4(-2.668e-02, -6.958e-02, -5.015e-02, 8.035e-02, 4.451e-02, -1.290e-03, -7.688e-02, 1.708e-01, -5.133e-02, -2.768e-02, -1.780e-02, -6.317e-02, -9.692e-03, -2.748e-03, 9.070e-03, -1.314e-01));
r += mul(s0_7, M4(1.402e-01, 4.997e-02, -4.973e-02, 6.839e-01, 2.079e-02, -2.511e-02, 3.403e-01, -3.077e-01, -2.831e-02, 4.816e-02, -9.142e-02, -8.176e-02, -2.999e-02, -5.749e-03, -5.579e-02, -2.355e-01));
r += mul(s0_8, M4(-1.783e-02, -2.882e-02, 9.841e-02, 4.473e-02, 4.128e-02, -3.071e-02, -2.378e-01, 1.347e-01, -2.285e-02, 1.317e-02, -1.632e-02, 1.058e-01, -3.696e-02, -6.864e-03, -8.989e-02, -7.315e-02));
r += mul(s1_0, M4(8.857e-02, 3.169e-02, -1.896e-02, 1.258e-02, 7.086e-02, 5.699e-02, 1.550e-02, -1.836e-02, 1.209e-01, 5.334e-02, -1.557e-02, -2.374e-02, -1.411e-02, 1.543e-02, 1.769e-02, -4.332e-02));
r += mul(s1_1, M4(1.199e-01, -8.203e-03, -1.695e-02, -3.214e-02, 5.918e-01, 3.458e-01, 7.684e-02, -5.137e-01, 2.827e-01, -2.008e-02, -1.848e-01, 2.147e-01, 7.212e-02, -3.906e-03, -2.220e-01, -1.918e-01));
r += mul(s1_2, M4(4.464e-02, 4.035e-02, 4.265e-03, 1.350e-02, -4.623e-01, -1.882e-01, 9.929e-02, -2.295e-01, 2.010e-01, 6.059e-01, 3.648e-01, -1.670e-02, -6.763e-02, -2.588e-01, -1.741e-01, 3.358e-02));
r += mul(s1_3, M4(1.003e-01, -2.961e-02, -1.715e-01, 1.057e-01, 3.275e-03, 1.877e-02, -4.995e-02, 1.181e-01, 3.600e-02, 2.101e-02, -1.050e-01, 8.035e-02, -8.107e-02, -1.067e-01, -5.457e-02, 5.339e-02));
r += mul(s1_4, M4(3.875e-01, 3.638e-01, 1.178e-01, -4.404e-02, 6.128e-02, -1.193e-01, -3.161e-01, 3.510e-01, -3.482e-02, -2.842e-01, -3.917e-01, 4.525e-01, 1.969e-01, 5.299e-01, 4.720e-01, -2.266e-01));
r += mul(s1_5, M4(-1.420e-02, 2.325e-02, -8.697e-02, -4.296e-03, 8.697e-02, 7.490e-02, 1.773e-01, 4.010e-01, 2.380e-01, -1.182e-01, 9.121e-01, 2.252e-01, 1.348e-01, -7.448e-02, -8.496e-01, -3.335e-01));
r += mul(s1_6, M4(-7.923e-02, -2.533e-02, -4.896e-02, -5.473e-02, -5.329e-03, 1.285e-02, -1.763e-02, 7.009e-02, 9.670e-04, -1.889e-02, -1.008e-01, 1.149e-01, 7.259e-03, 4.080e-02, 1.042e-01, -2.627e-01));
r += mul(s1_7, M4(-9.746e-02, 6.679e-02, -1.421e-01, -2.202e-01, -9.918e-03, -2.413e-02, -1.554e-02, 7.011e-03, -3.226e-02, -3.024e-02, -5.431e-02, 7.446e-02, 5.860e-02, 2.851e-02, -2.367e-01, 2.562e-02));
r += mul(s1_8, M4(-4.627e-02, 4.226e-02, -8.654e-02, -3.312e-02, 1.600e-02, 2.983e-02, 8.834e-03, -3.871e-02, -4.137e-03, 1.767e-02, 2.492e-02, -5.391e-02, 8.133e-03, 1.430e-02, -2.428e-02, -1.132e-01));
// Bias term.
r += V4(-4.349e-03, -3.760e-03, 4.684e-03, 4.745e-03);
return r;
}
void Pass4(uint2 blockStart, uint3 tid) {
	// Pass 4 ("conv3"): one thread per texel. Reads the 3x3 neighbourhood of t0, splits
	// every sample into its positive and negative part, and stores f0(...) in t1.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Unclamped neighbourhood taps, row by row: top, middle, bottom.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the nine positive parts max(v, 0) first, then the nine negative parts -max(-v, 0).
	t1[gxy] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 5
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t1
//!OUT OUTPUT
// l0(x, y): samples t1 at an offset of (x, y) * pt from pos (via the O() macro) and
// converts the result to V4. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Output layer ("out-shuffle"): each of the eighteen 4-component inputs is multiplied by
// its own constant 4x4 matrix with mul(vector, matrix); the products and a constant bias
// are summed and the sum is passed through tanh().
// Pass5 adds the four returned components to the first YUV component of the four pixels
// of one 2x2 output quad.
// The constants are generated network weights - do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-4.783e-03, 7.235e-03, -7.275e-03, -2.802e-03, 4.921e-02, 7.543e-02, -3.357e-02, 1.213e-02, 2.900e-02, 2.380e-03, -9.028e-03, -2.594e-02, 1.576e-03, 3.334e-04, -2.460e-02, -1.285e-02));
r += mul(s0_1, M4(4.582e-02, 9.378e-04, 2.217e-02, 5.083e-02, -1.054e-02, 8.518e-02, -1.884e-02, -5.149e-02, 1.983e-02, -1.106e-02, -4.317e-03, 5.384e-02, -5.193e-02, 1.089e-02, -9.384e-03, 3.137e-02));
r += mul(s0_2, M4(-5.241e-03, 3.821e-02, -1.136e-02, -3.033e-02, 3.186e-02, -3.270e-03, 1.422e-02, 2.401e-02, -1.360e-02, 1.024e-01, -6.042e-02, -2.325e-02, -1.248e-01, -1.377e-01, 1.654e-02, -1.347e-02));
r += mul(s0_3, M4(-3.552e-02, -3.211e-02, -2.282e-03, 1.775e-02, 1.360e-01, 2.808e-02, 1.082e-01, -1.311e-02, -1.699e-02, -2.628e-02, 3.430e-02, -3.880e-03, 2.514e-02, -3.171e-02, 4.675e-02, -2.711e-02));
r += mul(s0_4, M4(4.756e-01, 2.686e-01, 4.514e-02, -8.813e-02, 2.636e-01, -4.893e-01, 1.301e-01, 1.304e-01, 3.778e-01, 2.765e-01, 3.369e-01, 8.811e-02, 5.080e-02, 2.783e-01, -1.131e-01, 2.487e-01));
r += mul(s0_5, M4(-2.961e-02, 7.757e-02, -8.471e-02, -4.636e-02, -6.862e-02, 1.733e-01, -7.301e-02, -1.408e-02, 1.636e-02, 9.982e-02, 5.704e-02, 2.568e-01, -2.224e-02, -2.588e-01, -2.202e-01, -4.898e-01));
r += mul(s0_6, M4(1.058e-01, -2.810e-02, -2.960e-02, -8.398e-02, -9.106e-02, 6.642e-02, -2.574e-02, 7.841e-02, -1.978e-02, -3.700e-02, -1.504e-02, -3.186e-02, 2.438e-03, 6.191e-03, -1.155e-02, -1.161e-02));
r += mul(s0_7, M4(-6.316e-01, -7.748e-02, 8.006e-01, 3.936e-01, 1.300e-01, -1.999e-01, 2.351e-01, -7.485e-01, -7.151e-02, -4.285e-02, -2.277e-02, 2.849e-02, -2.207e-02, -2.585e-02, -2.498e-02, -3.308e-02));
r += mul(s0_8, M4(-2.002e-01, -6.934e-01, -1.093e-01, 3.325e-01, -5.778e-02, 2.138e-02, -2.930e-02, 1.794e-01, -3.028e-03, 2.300e-03, 5.845e-03, -1.959e-02, 1.403e-02, 1.565e-02, 1.840e-02, -6.027e-04));
r += mul(s1_0, M4(2.228e-02, -8.352e-03, -1.007e-02, -1.911e-02, -1.489e-02, 2.785e-03, -9.190e-03, 5.858e-03, 2.420e-02, -7.701e-03, -2.327e-02, -2.494e-02, -8.526e-03, -2.384e-02, -2.601e-02, -4.833e-02));
r += mul(s1_1, M4(5.671e-02, 3.666e-02, 3.309e-02, 1.011e-02, -8.053e-03, 4.673e-02, -5.358e-02, -2.451e-02, 3.779e-01, 5.642e-02, -2.324e-01, -3.499e-02, -3.479e-01, 1.179e-01, -4.630e-02, 1.118e-01));
r += mul(s1_2, M4(-1.650e-02, 6.203e-04, -1.322e-02, -1.996e-02, 2.118e-02, -9.244e-03, 2.813e-02, 9.773e-03, -2.654e-02, -8.373e-02, 6.663e-04, -6.860e-02, -3.436e-02, -7.207e-01, 2.389e-01, 1.903e-01));
r += mul(s1_3, M4(-8.045e-02, -2.073e-02, 3.380e-02, 1.327e-02, 1.247e-01, 1.129e-02, 6.421e-02, -8.326e-03, -4.675e-02, 4.920e-02, -3.699e-02, 4.601e-02, 3.389e-02, -4.151e-02, 3.012e-02, -2.241e-02));
r += mul(s1_4, M4(5.223e-01, 1.394e-01, 1.222e-01, -7.687e-03, -3.115e-01, 3.989e-02, -1.679e-01, 2.607e-01, 4.393e-01, -1.821e-01, 1.006e+00, -2.920e-01, 8.062e-02, 2.231e-01, -1.282e-02, 2.495e-01));
r += mul(s1_5, M4(-1.146e-01, 6.738e-02, -1.655e-02, 1.178e-02, -3.058e-02, 1.093e-01, 9.367e-03, 1.382e-02, -7.397e-02, 2.300e-01, -4.202e-02, 1.765e-01, -4.671e-02, -1.375e-02, -3.662e-01, -5.254e-01));
r += mul(s1_6, M4(5.090e-02, 8.633e-03, -1.128e-02, -3.186e-02, -6.263e-02, 4.143e-02, -2.214e-02, 5.270e-02, -1.370e-02, -1.692e-02, -2.644e-02, -9.847e-03, -2.147e-03, -7.941e-03, -1.323e-04, -5.173e-03));
r += mul(s1_7, M4(-9.353e-02, 6.696e-02, 2.744e-01, 2.743e-01, 9.809e-02, -1.439e-01, -2.583e-02, -3.717e-01, -5.135e-02, -1.889e-02, -1.775e-02, 9.383e-03, -2.496e-02, -2.936e-02, -2.578e-02, -1.586e-02));
r += mul(s1_8, M4(-1.565e-02, -1.635e-01, -1.800e-01, -2.607e-01, 1.975e-02, 1.594e-02, -4.568e-02, 1.218e-01, -6.668e-03, 7.923e-03, -4.625e-02, 1.324e-02, -6.838e-03, 2.045e-02, 1.141e-02, 2.717e-02));
// Bias term.
r += V4(7.204e-05, -6.226e-05, 2.867e-04, -3.251e-05);
return tanh(r);
}
// Pass 5 ("out-shuffle"): each thread evaluates the output layer once for one source
// texel and writes the 2x2 quad of OUTPUT pixels that starts at gxy.
// The four components of the f0() result are added to the first YUV component of a
// linearly sampled INPUT colour: r.x -> (gxy.x, gxy.y), r.y -> (gxy.x + 1, gxy.y),
// r.w -> (gxy.x + 1, gxy.y + 1), r.z -> (gxy.x, gxy.y + 1).
void Pass5(uint2 blockStart, uint3 tid) {
float2 pt = float2(GetInputPt());
// The shift by one doubles the thread's local coordinate, so gxy advances in steps of two.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
uint2 size = GetOutputSize();
// Only the first pixel of the quad is range-checked.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// Sampling position of the source texel (gxy >> 1) that this quad is built from.
// NOTE: the l0() taps below read `pos`, so they must stay ahead of the `pos` updates further down.
float2 pos = ((gxy >> 1) + 0.5) * pt;
V4 s0_0 = l0(-1.0, -1.0);
V4 s0_1 = l0(0.0, -1.0);
V4 s0_2 = l0(1.0, -1.0);
V4 s0_3 = l0(-1.0, 0.0);
V4 s0_4 = l0(0.0, 0.0);
V4 s0_5 = l0(1.0, 0.0);
V4 s0_6 = l0(-1.0, 1.0);
V4 s0_7 = l0(0.0, 1.0);
V4 s0_8 = l0(1.0, 1.0);
// Negative parts (-max(-v, 0)); taken before s0_* are clamped in place below.
V4 s1_0 = -max(-s0_0, 0.0);
V4 s1_1 = -max(-s0_1, 0.0);
V4 s1_2 = -max(-s0_2, 0.0);
V4 s1_3 = -max(-s0_3, 0.0);
V4 s1_4 = -max(-s0_4, 0.0);
V4 s1_5 = -max(-s0_5, 0.0);
V4 s1_6 = -max(-s0_6, 0.0);
V4 s1_7 = -max(-s0_7, 0.0);
V4 s1_8 = -max(-s0_8, 0.0);
// Positive parts (max(v, 0)).
s0_0 = max(s0_0, 0.0);
s0_1 = max(s0_1, 0.0);
s0_2 = max(s0_2, 0.0);
s0_3 = max(s0_3, 0.0);
s0_4 = max(s0_4, 0.0);
s0_5 = max(s0_5, 0.0);
s0_6 = max(s0_6, 0.0);
s0_7 = max(s0_7, 0.0);
s0_8 = max(s0_8, 0.0);
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
// Colour conversion matrices applied with mul(matrix, vector); per their names, RGB -> YUV and back.
static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
float2 opt = float2(GetOutputPt());
// Step back half an output texel from the source texel centre; with OUTPUT declared at
// twice the INPUT size this is presumably the centre of output pixel gxy - TODO confirm GetOutputPt().
pos -= 0.5f * opt;
// Pixel (gxy.x, gxy.y): only the first YUV component is adjusted (and saturated); yuv.yz pass through.
float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);
// Pixel one step to the right.
++gxy.x;
pos.x += opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);
// Pixel one step right and one step down (uses r.w, not r.z).
++gxy.y;
pos.y += opt.y;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);
// Pixel one step down from the first one.
--gxy.x;
pos.x -= opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,486 @@
// CuNNy 4x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N04
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): samples INPUT at an offset of (x, y) * pt from pos (via the O() macro),
// reduces the RGB value to one scalar with a fixed weighted sum plus a constant offset,
// and casts it to min16float. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) min16float((dot(float3(2.428e-01, 4.714e-01, 1.229e-01), O(INPUT, float2(x, y)).rgb) + -7.696e-02))
// First layer ("in"): combines nine scalar inputs into one 4-component vector.
// Each input scales its own constant 4-vector; the nine products and a constant
// bias vector are summed. Pass1 calls this with the 3x3 neighbourhood produced by l0().
// The constants are generated network weights - do not edit by hand.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(9.154e-02, 3.758e-01, 2.353e-02, -5.798e-02) * s0_0;
r += V4(-5.382e-01, 1.688e-01, -1.190e-01, 4.082e-02) * s0_1;
r += V4(2.460e-02, -5.810e-02, 7.788e-02, 3.018e-02) * s0_2;
r += V4(1.211e-01, -1.552e-01, -9.990e-02, 3.963e-02) * s0_3;
r += V4(-2.611e-01, -4.835e-01, -6.965e-01, -4.893e-01) * s0_4;
r += V4(-3.017e-01, -4.435e-02, 1.836e-01, 4.600e-01) * s0_5;
r += V4(1.275e-01, 2.485e-01, 7.354e-02, -4.648e-02) * s0_6;
r += V4(2.527e-01, 1.279e-01, 3.053e-01, 3.957e-02) * s0_7;
r += V4(1.003e-02, 1.193e-01, 2.476e-01, -2.051e-02) * s0_8;
// Bias term.
r += V4(1.690e-02, 8.856e-03, -9.136e-04, 2.267e-02);
return r;
}
void Pass1(uint2 blockStart, uint3 tid) {
	// Pass 1 ("in"): one thread per texel. Reads the 3x3 neighbourhood of INPUT through
	// l0() and stores the first-layer result in t0.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}

	// Sampling position of this texel's centre (presumably normalised UV - pt comes from GetInputPt()).
	const float2 pos = (gxy + 0.5) * pt;

	// Neighbourhood taps, row by row: top, middle, bottom.
	const min16float n0 = l0(-1.0, -1.0);
	const min16float n1 = l0(0.0, -1.0);
	const min16float n2 = l0(1.0, -1.0);
	const min16float n3 = l0(-1.0, 0.0);
	const min16float n4 = l0(0.0, 0.0);
	const min16float n5 = l0(1.0, 0.0);
	const min16float n6 = l0(-1.0, 1.0);
	const min16float n7 = l0(0.0, 1.0);
	const min16float n8 = l0(1.0, 1.0);

	t0[gxy] = f0(n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): samples t0 at an offset of (x, y) * pt from pos (via the O() macro) and
// converts the result to V4. `pos` and `pt` must be in scope where the macro is expanded.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Layer "conv1": each of the eighteen 4-component inputs is multiplied by its own
// constant 4x4 matrix with mul(vector, matrix); the products and a constant bias are summed.
// As called from Pass2, s0_* carry the positive parts and s1_* the negative parts of the
// 3x3 neighbourhood samples.
// The constants are generated network weights - do not edit by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-4.540e-03, -2.499e-01, 4.202e-02, 1.132e-02, 2.910e-02, -3.788e-02, 3.330e-02, -2.254e-02, -1.953e-01, 1.226e-01, -1.907e-01, -1.378e-01, 9.555e-02, -2.443e-01, 6.124e-02, -7.256e-03));
r += mul(s0_1, M4(-1.225e-01, -1.812e-01, -1.238e-02, 4.088e-01, -9.977e-02, 4.395e-02, -2.394e-02, -5.584e-03, 2.939e-01, 4.102e-01, 6.228e-02, 3.822e-01, 8.618e-02, -1.109e-01, 1.776e-01, -7.505e-02));
r += mul(s0_2, M4(2.047e-01, -6.853e-02, 1.880e-02, -9.030e-03, 1.505e-01, 7.782e-02, 1.347e-02, 5.566e-01, -6.951e-02, -1.352e-01, 1.941e-03, 3.975e-02, 1.637e-01, 6.708e-02, 1.501e-02, 1.373e-01));
r += mul(s0_3, M4(-1.974e-01, 1.068e-01, -1.102e-01, 5.909e-02, 2.355e-03, 1.275e-01, -5.986e-02, -5.288e-02, 8.785e-04, -1.440e-01, -3.369e-01, -9.128e-02, 2.030e-01, 4.937e-01, -1.637e-01, 4.814e-02));
r += mul(s0_4, M4(-3.954e-01, 4.772e-01, -5.841e-01, -8.070e-02, -2.056e-01, -2.335e-01, -2.091e-01, 1.223e-01, -2.686e-01, 1.240e+00, 7.095e-02, 6.502e-01, 1.044e-01, -3.071e-01, -2.892e-01, 4.861e-01));
r += mul(s0_5, M4(5.943e-02, 2.245e-01, 4.014e-01, -1.063e-01, -1.869e-01, 1.384e-01, 2.996e-01, -1.928e-01, 1.212e-01, 2.849e-01, 2.093e-01, -3.821e-01, -8.705e-02, 1.976e-01, 5.176e-01, -7.461e-02));
r += mul(s0_6, M4(1.048e-01, 2.374e-02, 2.730e-01, 1.446e-01, -5.406e-02, -1.587e-02, -2.014e-01, -3.422e-02, -2.114e-01, -5.198e-01, 2.674e-02, -6.078e-02, -2.293e-01, -9.914e-02, -2.110e-01, 7.008e-02));
r += mul(s0_7, M4(5.799e-02, 4.932e-01, 4.559e-01, -3.118e-02, 4.706e-02, -2.242e-01, -3.165e-01, -9.912e-02, 4.041e-01, 7.241e-01, -1.696e-01, 1.990e-01, 4.697e-01, 9.965e-03, -1.141e-02, -1.365e-02));
r += mul(s0_8, M4(-1.744e-01, -7.119e-02, 3.632e-01, -2.802e-01, -3.155e-01, 4.455e-01, -1.866e-02, -2.667e-02, 1.255e-01, -5.762e-01, -2.226e-02, 2.812e-02, -2.349e-01, 1.552e-01, -6.424e-03, 7.450e-02));
r += mul(s1_0, M4(6.159e-02, -4.426e-02, 2.277e-02, 1.040e-01, -6.306e-04, -1.704e-01, 3.807e-02, -8.670e-02, -1.403e-01, 1.644e-01, -9.679e-02, -1.055e-01, 2.394e-01, -5.504e-02, 8.006e-02, 6.312e-02));
r += mul(s1_1, M4(-1.134e-01, -1.030e-01, -2.777e-02, 2.955e-01, -1.225e-01, -4.096e-02, -2.748e-02, 9.404e-02, 2.890e-01, -2.441e-01, 1.560e-01, 1.694e-01, 1.853e-01, 3.311e-01, 3.408e-01, -8.678e-02));
r += mul(s1_2, M4(1.821e-01, 3.898e-02, -2.560e-02, 1.160e-01, 2.382e-01, -1.638e-01, -1.345e-01, 3.193e-01, -1.839e-01, -2.638e-01, 5.265e-02, 2.415e-01, 2.803e-01, 1.919e-01, -7.340e-02, 1.762e-02));
r += mul(s1_3, M4(-2.606e-01, -1.263e-01, -3.067e-02, -1.695e-02, 4.665e-03, 2.947e-02, -1.965e-02, -2.658e-02, -7.935e-02, -1.566e-01, -3.246e-01, -1.075e-03, 1.896e-01, -2.937e-01, -1.020e-01, -1.513e-01));
r += mul(s1_4, M4(-3.696e-01, 8.901e-02, -1.890e-01, -2.804e-02, -2.998e-01, -6.597e-02, -2.613e-01, 3.877e-01, -1.032e+00, -2.328e-01, 7.941e-02, 5.733e-01, 8.618e-02, 4.213e-02, -1.242e+00, 5.861e-01));
r += mul(s1_5, M4(1.919e-02, -5.609e-02, 3.295e-01, -2.364e-01, -4.238e-01, -6.041e-01, 3.389e-01, -4.460e-01, 4.482e-02, 1.077e-03, 8.990e-02, -2.725e-01, -4.829e-02, 1.184e-01, 1.941e-01, -3.646e-01));
r += mul(s1_6, M4(2.968e-01, 2.018e-01, 2.695e-01, 8.891e-02, -5.857e-02, 6.005e-02, -2.440e-01, -1.349e-02, -7.572e-02, -3.213e-01, 6.274e-02, -1.229e-02, -7.589e-01, -2.313e-01, -1.627e-01, 2.538e-01));
r += mul(s1_7, M4(-5.728e-02, 1.333e-01, 2.492e-01, -3.609e-02, 1.936e-01, -1.276e-01, -3.034e-01, -1.091e-01, 1.390e-01, 3.356e-01, -1.183e-01, 2.047e-01, 3.779e-01, -3.353e-01, 2.019e-01, 4.337e-02));
r += mul(s1_8, M4(-1.386e-01, 1.179e-01, 2.340e-01, -1.604e-01, -4.890e-01, -5.407e-01, -1.546e-01, -1.826e-01, 1.596e-01, -1.784e-01, 5.777e-02, 3.961e-02, -2.290e-01, 2.752e-01, -4.260e-02, 9.649e-02));
// Bias term.
r += V4(-4.697e-03, -2.213e-02, 3.898e-01, -1.481e-02);
return r;
}
void Pass2(uint2 blockStart, uint3 tid) {
	// Pass 2 ("conv1"): one thread per texel. Reads the 3x3 neighbourhood of t0, splits
	// every sample into its positive and negative part, and stores f0(...) in t1.
	// `pt` and `pos` are read by name inside the O()/l0() macros, so they keep their names.
	const float2 pt = float2(GetInputPt());

	// Threads that land outside the input extent have nothing to write.
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(gxy >= extent)) {
		return;
	}
	const float2 pos = (gxy + 0.5) * pt;

	// Unclamped neighbourhood taps, row by row: top, middle, bottom.
	const V4 n0 = l0(-1.0, -1.0);
	const V4 n1 = l0(0.0, -1.0);
	const V4 n2 = l0(1.0, -1.0);
	const V4 n3 = l0(-1.0, 0.0);
	const V4 n4 = l0(0.0, 0.0);
	const V4 n5 = l0(1.0, 0.0);
	const V4 n6 = l0(-1.0, 1.0);
	const V4 n7 = l0(0.0, 1.0);
	const V4 n8 = l0(1.0, 1.0);

	// f0 expects the nine positive parts max(v, 0) first, then the nine negative parts -max(-v, 0).
	t1[gxy] = f0(
		max(n0, 0.0), max(n1, 0.0), max(n2, 0.0),
		max(n3, 0.0), max(n4, 0.0), max(n5, 0.0),
		max(n6, 0.0), max(n7, 0.0), max(n8, 0.0),
		-max(-n0, 0.0), -max(-n1, 0.0), -max(-n2, 0.0),
		-max(-n3, 0.0), -max(-n4, 0.0), -max(-n5, 0.0),
		-max(-n6, 0.0), -max(-n7, 0.0), -max(-n8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): fetch t1 at tap offset (x, y) through the O() macro and convert the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv2 kernel for one output pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass3 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns the sum of mul(tap, constant M4) over all eighteen taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(-1.362e-01, -5.847e-02, 2.766e-02, 2.969e-02, 9.796e-02, 6.555e-02, -3.067e-02, -5.139e-02, 1.512e-01, 1.401e-01, -3.820e-03, 2.649e-02, -1.802e-01, -2.099e-02, -6.604e-02, 4.042e-02));
r += mul(s0_1, M4(-2.144e-01, -1.437e-01, 4.670e-02, -2.348e-01, 9.990e-02, -5.186e-02, 1.658e-01, 9.557e-02, -1.353e-01, -1.146e-01, -9.837e-02, -8.956e-02, 1.229e-01, 2.354e-01, -2.342e-01, -1.343e-01));
r += mul(s0_2, M4(5.918e-01, 2.130e-02, 5.753e-01, -6.941e-02, -3.156e-02, -4.438e-02, -6.348e-02, 2.682e-02, -1.078e-02, 9.727e-03, 8.472e-02, 1.460e-01, -1.921e-01, 1.872e-01, 6.067e-02, 3.762e-02));
r += mul(s0_3, M4(1.341e-01, 1.082e-01, -4.460e-02, -1.008e-02, -1.262e-01, -7.942e-02, 5.610e-02, 4.418e-02, -1.725e-01, -1.158e-01, 6.377e-03, -1.171e-01, -3.447e-02, 4.459e-02, 2.822e-04, -7.623e-02));
r += mul(s0_4, M4(1.994e-01, -2.251e-01, -2.432e-01, 2.467e-02, 3.717e-02, 3.275e-01, 2.005e-01, 1.427e-01, 1.122e-01, 2.864e-01, 1.478e-01, 3.701e-01, 3.111e-01, -1.704e-01, -1.410e-01, -7.490e-01));
r += mul(s0_5, M4(-1.392e-01, -2.284e-02, 2.819e-01, -5.560e-02, -2.624e-01, 7.282e-02, -2.417e-01, -5.534e-02, -6.351e-03, -1.714e-01, -1.505e-01, -3.035e-01, -3.580e-02, 4.429e-02, 1.628e-01, -1.101e-01));
r += mul(s0_6, M4(8.306e-04, 3.258e-02, -2.746e-02, -3.143e-02, -1.301e-02, -5.828e-02, 2.411e-03, 1.395e-02, 3.728e-02, -8.319e-02, 3.326e-02, 1.294e-01, -6.226e-02, 5.103e-02, -1.218e-02, 2.411e-01));
r += mul(s0_7, M4(-6.323e-02, -1.343e-02, 3.400e-02, -1.727e-02, 3.683e-02, 6.325e-02, 4.834e-04, 3.849e-02, 9.424e-03, -2.010e-02, -3.447e-02, -1.330e-01, -4.107e-01, -7.682e-02, 4.138e-01, 5.994e-02));
r += mul(s0_8, M4(7.556e-02, 1.846e-02, 1.847e-02, 1.057e-01, -1.140e-01, -2.834e-02, -3.141e-02, -1.045e-01, -2.025e-02, 4.729e-02, -2.822e-02, -4.072e-02, 3.368e-01, 6.871e-02, 1.184e-01, 1.536e-01));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(-6.688e-02, 2.483e-02, 1.598e-01, -4.834e-02, 2.141e-01, -4.911e-02, -4.452e-02, -4.879e-02, -9.473e-01, 6.527e-01, -6.118e-01, -2.436e-01, -3.017e-02, -3.402e-01, 1.343e-01, 9.397e-02));
r += mul(s1_1, M4(-1.330e-01, 2.557e-01, 6.838e-02, -3.936e-01, 4.806e-01, 1.828e-01, 5.073e-01, 4.502e-01, -1.404e+00, -2.954e-01, -6.745e-02, 5.594e-02, 2.640e-01, 2.330e-02, 1.331e-02, -2.700e-02));
r += mul(s1_2, M4(2.695e-01, -1.004e-01, 9.104e-02, -4.919e-01, 3.357e-01, 4.895e-02, 4.062e-01, -3.494e-02, -4.352e-01, -1.232e-01, 8.889e-03, 3.472e-01, -1.174e-01, 7.690e-02, 6.341e-02, 9.255e-02));
r += mul(s1_3, M4(1.805e-01, 2.494e-01, 3.474e-02, 3.930e-02, 2.671e-02, -1.438e-02, 7.294e-02, 4.854e-02, -2.864e+00, -5.832e-01, 4.350e-01, -4.265e-01, -2.643e-02, -6.234e-01, 1.283e-01, 5.168e-02));
r += mul(s1_4, M4(-2.192e-01, 2.982e-01, -2.860e-01, -4.050e-01, 8.612e-02, 5.008e-02, 5.366e-01, 5.256e-01, -6.222e-01, 1.169e+00, 1.897e+00, 3.009e+00, 9.105e-02, -2.369e-01, -4.718e-01, -2.725e-01));
r += mul(s1_5, M4(-7.441e-01, -1.820e-01, -5.828e-02, -6.348e-01, 5.721e-01, 1.143e-01, 2.871e-01, 3.254e-01, -1.446e-01, 1.446e-01, -8.526e-02, 7.228e-01, -9.749e-02, -1.665e-01, -1.116e-01, -2.705e-01));
r += mul(s1_6, M4(-6.357e-02, -2.576e-02, 1.277e-02, -3.956e-02, 2.724e-02, -2.141e-02, 9.778e-02, 7.199e-03, -1.153e+00, -6.945e-01, -4.788e-01, -1.246e+00, 1.909e-01, 1.315e-01, 4.454e-02, 2.678e-01));
r += mul(s1_7, M4(-1.022e-01, 1.572e-01, 9.404e-02, 6.768e-02, 2.191e-01, -3.163e-02, 1.257e-01, 1.058e-01, -6.394e-01, 7.223e-03, -6.930e-01, -2.963e-01, -2.666e-01, 3.461e-03, 2.203e-01, -1.212e-01));
r += mul(s1_8, M4(-1.179e-01, 7.311e-02, 1.371e-01, -4.039e-02, 2.171e-01, 3.131e-02, 2.219e-01, 1.564e-02, -4.895e-01, -5.067e-03, -4.528e-01, 5.694e-02, 6.858e-02, 6.808e-03, -1.017e-01, 6.675e-03));
// Constant offset added once per pixel.
r += V4(-8.341e-03, 1.434e-02, 5.791e-03, -1.033e-02);
return r;
}
// Pass 3 entry point. Each invocation handles one pixel of the input-sized grid:
// it reads the 3x3 neighbourhood of t1 through l0(), splits every sample into
// max(v, 0) and -max(-v, 0), and stores f0() of those eighteen vectors in t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: the l0() sampling macro is expected
	// to pick them up by name at the expansion site.
	const float2 pt = float2(GetInputPt());
	const uint2 pixel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	// Nothing to write when the pixel lies outside the input grid.
	if (any(pixel >= extent)) {
		return;
	}
	const float2 pos = (pixel + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const V4 r0c0 = l0(-1.0, -1.0);
	const V4 r0c1 = l0(0.0, -1.0);
	const V4 r0c2 = l0(1.0, -1.0);
	const V4 r1c0 = l0(-1.0, 0.0);
	const V4 r1c1 = l0(0.0, 0.0);
	const V4 r1c2 = l0(1.0, 0.0);
	const V4 r2c0 = l0(-1.0, 1.0);
	const V4 r2c1 = l0(0.0, 1.0);
	const V4 r2c2 = l0(1.0, 1.0);

	t0[pixel] = f0(
		// s0_0 .. s0_8: non-negative parts
		max(r0c0, 0.0), max(r0c1, 0.0), max(r0c2, 0.0),
		max(r1c0, 0.0), max(r1c1, 0.0), max(r1c2, 0.0),
		max(r2c0, 0.0), max(r2c1, 0.0), max(r2c2, 0.0),
		// s1_0 .. s1_8: non-positive parts
		-max(-r0c0, 0.0), -max(-r0c1, 0.0), -max(-r0c2, 0.0),
		-max(-r1c0, 0.0), -max(-r1c1, 0.0), -max(-r1c2, 0.0),
		-max(-r2c0, 0.0), -max(-r2c1, 0.0), -max(-r2c2, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): fetch t0 at tap offset (x, y) through the O() macro and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv3 kernel for one output pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass4 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns the sum of mul(tap, constant M4) over all eighteen taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(-6.123e-02, 9.666e-03, 4.969e-02, 3.030e-02, 1.714e-02, -3.117e-02, -9.470e-02, 2.078e-03, 4.109e-02, -5.560e-02, 3.757e-02, -3.667e-03, -3.500e-02, -8.151e-02, 1.104e-01, -1.219e-01));
r += mul(s0_1, M4(9.596e-02, -6.361e-02, 1.162e-02, -3.138e-02, -1.277e-02, -4.005e-02, 1.805e-02, -1.459e-02, -7.903e-03, 1.138e-02, 1.542e-02, -2.357e-02, -1.421e-01, -2.953e-01, 1.322e-01, 6.480e-03));
r += mul(s0_2, M4(1.571e-01, -1.081e-01, 1.345e-01, -5.616e-02, -1.211e-02, 4.515e-02, 1.797e-02, 6.143e-02, -9.605e-02, 7.782e-02, -1.421e-01, 3.195e-02, 1.841e-01, -7.735e-02, 1.082e-01, 1.785e-02));
r += mul(s0_3, M4(1.739e-03, -4.187e-02, 1.093e-01, 1.042e-01, -6.538e-03, 5.025e-02, -7.052e-03, -1.033e-01, -1.394e-01, -4.638e-01, 4.354e-02, -1.188e-02, 7.809e-04, 2.484e-01, -8.330e-01, -2.787e-01));
r += mul(s0_4, M4(-6.489e-03, -6.309e-01, 7.169e-01, 1.557e-01, 1.478e-01, 2.977e-01, -2.818e-01, 5.129e-02, 7.598e-01, 8.124e-01, -1.262e-02, -1.325e-01, -2.764e-01, 3.485e-01, 4.717e-01, -2.467e-01));
r += mul(s0_5, M4(2.022e-02, -1.396e-01, 1.865e-01, 1.568e-02, 3.924e-01, -2.466e-01, 4.990e-01, 3.971e-02, -1.176e-01, 1.792e-01, -2.861e-01, 3.555e-02, -1.428e-01, 2.528e-01, -2.085e-01, -1.311e-01));
r += mul(s0_6, M4(3.340e-02, -1.203e-01, 1.014e-01, 1.154e-01, -9.031e-03, -5.586e-02, -5.700e-03, 2.391e-02, -3.509e-01, 6.729e-02, 1.004e-01, -3.277e-01, 1.026e-01, 3.286e-03, -6.603e-02, -3.238e-03));
r += mul(s0_7, M4(-6.854e-01, 1.013e-01, -6.298e-02, -5.464e-01, 2.486e-01, -2.186e-01, 3.986e-02, 3.800e-01, -1.267e-01, 1.037e-01, 1.538e-01, -2.069e-01, 9.431e-02, 5.337e-02, -8.507e-02, 2.015e-01));
r += mul(s0_8, M4(-5.009e-03, 1.493e-01, -3.010e-02, -2.429e-02, -3.137e-01, -2.276e-01, 1.556e-01, 1.452e-02, 2.063e-01, 3.699e-02, -1.675e-03, 8.221e-02, -6.732e-02, 8.296e-02, -8.474e-02, -1.458e-01));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(-3.003e-02, -9.777e-03, 1.239e-02, -3.907e-02, 1.841e-01, -8.959e-02, 9.257e-02, 1.333e-01, 5.703e-04, -1.367e-01, -1.026e-01, 6.398e-02, 1.262e-02, 1.101e-02, 4.291e-02, -4.238e-02));
r += mul(s1_1, M4(5.516e-02, 9.884e-04, -5.383e-02, -1.048e-02, 2.529e-01, 9.819e-02, 1.255e-01, 3.149e-02, -8.249e-02, -1.386e-02, 6.214e-02, 2.957e-02, 1.001e-01, 1.590e-01, 1.159e-02, 5.273e-02));
r += mul(s1_2, M4(4.571e-02, -6.277e-03, 1.496e-01, -4.044e-02, 4.089e-02, -3.801e-02, -3.690e-02, -1.037e-01, -6.031e-02, 2.117e-03, -9.644e-02, 6.392e-02, 5.093e-02, -2.512e-02, 1.131e-01, 1.304e-01));
r += mul(s1_3, M4(-3.118e-02, 2.185e-02, 1.763e-01, 8.327e-02, 6.337e-02, 8.724e-02, 6.808e-02, -4.070e-01, -6.922e-02, -2.417e-01, -1.175e-01, -1.845e-01, -3.773e-03, -1.869e-01, -9.345e-02, -2.340e-01));
r += mul(s1_4, M4(-1.159e-01, -4.476e-01, 2.989e-01, 2.794e-01, 5.756e-01, -4.803e-01, -5.979e-02, -1.959e-01, 5.261e-02, -2.399e-01, -6.616e-02, -9.243e-01, 4.622e-01, 1.139e-01, 2.482e-01, 2.254e-01));
r += mul(s1_5, M4(1.064e-01, -1.989e-02, 8.581e-02, 3.218e-02, 3.344e-01, -5.684e-01, 4.009e-01, 4.482e-01, 7.737e-02, 8.716e-02, -1.382e-01, -7.145e-02, -1.225e-01, 1.471e-01, -1.866e-01, 3.674e-02));
r += mul(s1_6, M4(5.376e-02, -6.192e-03, -1.760e-01, 7.590e-02, -3.279e-02, -1.888e-01, 2.057e-01, 2.114e-01, -3.941e-01, 5.584e-03, 9.400e-03, -4.289e-01, -2.289e-01, 1.880e-01, 3.184e-02, -4.442e-01));
r += mul(s1_7, M4(-4.174e-01, -1.344e-01, 3.866e-02, 4.521e-02, -4.215e-01, 1.479e-01, 2.476e-01, -7.051e-01, -4.153e-01, 3.373e-01, 8.098e-02, -6.680e-01, 3.920e-01, -1.023e-01, -2.166e-02, 3.816e-01));
r += mul(s1_8, M4(-3.441e-02, 3.404e-03, -4.958e-02, 9.652e-03, -1.930e-02, -2.470e-01, 1.610e-01, 1.112e-01, 2.574e-02, 2.310e-01, 3.643e-02, -5.044e-02, 7.788e-02, 1.923e-03, -7.115e-02, -6.575e-03));
// Constant offset added once per pixel.
r += V4(1.370e-02, 1.151e-02, 2.567e-03, -1.881e-03);
return r;
}
// Pass 4 entry point. Each invocation handles one pixel of the input-sized grid:
// it reads the 3x3 neighbourhood of t0 through l0(), splits every sample into
// max(v, 0) and -max(-v, 0), and stores f0() of those eighteen vectors in t1.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: the l0() sampling macro is expected
	// to pick them up by name at the expansion site.
	const float2 pt = float2(GetInputPt());
	const uint2 pixel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	// Nothing to write when the pixel lies outside the input grid.
	if (any(pixel >= extent)) {
		return;
	}
	const float2 pos = (pixel + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const V4 r0c0 = l0(-1.0, -1.0);
	const V4 r0c1 = l0(0.0, -1.0);
	const V4 r0c2 = l0(1.0, -1.0);
	const V4 r1c0 = l0(-1.0, 0.0);
	const V4 r1c1 = l0(0.0, 0.0);
	const V4 r1c2 = l0(1.0, 0.0);
	const V4 r2c0 = l0(-1.0, 1.0);
	const V4 r2c1 = l0(0.0, 1.0);
	const V4 r2c2 = l0(1.0, 1.0);

	t1[pixel] = f0(
		// s0_0 .. s0_8: non-negative parts
		max(r0c0, 0.0), max(r0c1, 0.0), max(r0c2, 0.0),
		max(r1c0, 0.0), max(r1c1, 0.0), max(r1c2, 0.0),
		max(r2c0, 0.0), max(r2c1, 0.0), max(r2c2, 0.0),
		// s1_0 .. s1_8: non-positive parts
		-max(-r0c0, 0.0), -max(-r0c1, 0.0), -max(-r0c2, 0.0),
		-max(-r1c0, 0.0), -max(-r1c1, 0.0), -max(-r1c2, 0.0),
		-max(-r2c0, 0.0), -max(-r2c1, 0.0), -max(-r2c2, 0.0));
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): fetch t1 at tap offset (x, y) through the O() macro and convert the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv4 kernel for one output pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass5 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns the sum of mul(tap, constant M4) over all eighteen taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(2.376e-02, 2.931e-02, 7.304e-02, -5.238e-02, -6.500e-03, -3.887e-02, 2.506e-02, 5.201e-03, 5.599e-02, -1.951e-01, -3.847e-01, 8.685e-02, -1.106e-01, -3.954e-02, 1.571e-01, 2.293e-02));
r += mul(s0_1, M4(-2.738e-02, 1.554e-01, 1.120e-01, 1.856e-02, 9.513e-03, -2.222e-01, -2.174e-01, -1.065e-02, 3.001e-02, 7.638e-02, -7.497e-02, -2.727e-02, -1.521e-02, 1.843e-01, 3.547e-01, -1.642e-02));
r += mul(s0_2, M4(-2.533e-02, -1.959e-02, -6.274e-02, 8.121e-03, -8.703e-03, 5.091e-02, 6.548e-02, 1.988e-02, 4.089e-02, -4.827e-02, -4.089e-02, -4.361e-02, -1.112e-02, -1.101e-02, 2.968e-02, -2.196e-03));
r += mul(s0_3, M4(1.813e-02, -2.087e-01, -2.474e-01, -1.066e-01, 2.549e-01, 6.466e-01, 3.169e-01, -1.109e-01, -1.551e-02, -3.119e-01, -3.959e-01, 2.141e-01, 1.121e-01, 3.268e-01, 1.038e-01, -5.818e-02));
r += mul(s0_4, M4(-3.147e-01, 2.716e-01, 1.304e-01, 3.887e-01, 9.396e-02, -9.787e-02, -1.596e-01, -7.138e-02, -2.462e-01, -3.027e-01, 6.980e-01, -1.546e-01, 3.730e-02, -7.502e-02, -4.408e-02, 3.814e-02));
r += mul(s0_5, M4(-4.177e-02, -1.326e-02, -7.497e-02, 1.168e-03, 5.595e-03, 3.603e-02, 2.589e-02, -2.179e-02, 1.998e-02, -3.544e-03, 1.125e-01, 2.648e-03, -2.417e-02, -1.876e-02, 4.009e-02, 5.481e-02));
r += mul(s0_6, M4(-7.181e-02, -2.968e-02, -3.169e-02, -1.899e-02, -3.692e-02, -2.156e-02, 9.595e-02, 1.055e-01, -1.274e-01, -2.576e-02, 8.706e-02, 1.895e-01, 6.316e-04, -4.574e-02, 2.201e-02, 1.199e-01));
r += mul(s0_7, M4(-2.193e-01, 1.563e-02, 1.287e-01, 2.403e-01, 2.222e-01, -1.748e-02, 1.486e-02, -7.685e-02, 4.971e-01, 2.920e-01, -2.253e-01, -8.145e-01, 3.018e-01, -4.559e-02, -1.509e-01, -3.003e-01));
r += mul(s0_8, M4(1.685e-02, -1.082e-02, 3.539e-03, -2.765e-02, -5.968e-03, -4.628e-03, 3.847e-02, 6.426e-02, -6.284e-02, 5.455e-02, -3.291e-02, 1.636e-01, 5.828e-02, -5.613e-02, -4.404e-02, -1.715e-02));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(1.875e-02, 7.150e-02, 3.015e-02, -4.917e-02, 9.333e-03, -1.519e-01, -1.153e-01, 4.344e-02, -1.603e-02, -4.775e-02, -4.484e-02, 6.567e-02, -6.714e-02, 2.569e-01, 4.638e-01, 3.038e-02));
r += mul(s1_1, M4(-4.046e-02, 1.372e-01, 2.476e-01, 6.565e-02, 6.481e-04, -1.529e-02, 1.376e-02, 1.367e-02, 2.941e-04, 1.423e-01, 2.311e-01, 7.538e-03, -6.762e-02, -3.992e-01, -1.160e-02, 3.123e-02));
r += mul(s1_2, M4(-3.926e-02, 1.709e-04, -4.761e-02, -8.731e-03, 5.123e-03, 7.039e-02, 1.061e-01, -1.322e-03, 4.069e-02, -1.182e-01, -3.698e-04, -7.746e-02, -3.827e-02, 9.957e-02, 9.991e-02, 5.215e-02));
r += mul(s1_3, M4(-1.865e-01, -9.784e-01, -5.871e-01, 1.384e-01, 2.097e-01, -1.229e-01, -4.912e-01, -4.254e-02, 3.395e-04, -8.968e-02, -6.923e-02, -4.916e-02, 2.424e-01, 7.730e-01, 2.573e-01, -2.380e-01));
r += mul(s1_4, M4(-9.293e-01, 6.176e-01, 1.970e-01, 3.467e-01, 4.341e-01, 9.866e-01, 3.035e-01, -1.062e-01, -1.501e-01, 2.709e-01, 1.991e-01, -2.164e-01, 2.881e-01, -1.696e-01, -4.141e-01, -1.004e+00));
r += mul(s1_5, M4(-8.323e-02, -1.285e-02, -3.468e-02, 1.551e-01, 1.330e-01, -1.238e-01, -1.675e-03, 5.588e-02, 2.128e-01, -2.327e-01, -2.891e-02, 1.567e-01, -1.448e-01, 8.781e-02, 3.254e-02, 7.142e-02));
r += mul(s1_6, M4(1.231e-01, 5.139e-02, -9.426e-02, -2.822e-01, 1.761e-03, 6.853e-03, 1.165e-01, 7.861e-02, -9.715e-03, 5.489e-03, -1.066e-02, -8.332e-03, -9.111e-02, 3.911e-02, 1.757e-01, 2.222e-01));
r += mul(s1_7, M4(2.275e-02, 1.199e-01, 5.904e-02, -2.051e-01, 6.950e-01, 1.592e-02, -9.888e-02, -6.701e-01, -9.096e-02, 3.203e-02, 1.204e-01, 2.153e-01, 1.448e-01, -5.225e-03, 6.786e-02, 2.005e-02));
r += mul(s1_8, M4(-3.290e-02, -3.758e-02, -3.158e-02, 8.713e-02, 3.917e-02, 4.275e-02, -2.450e-02, 3.970e-02, 1.928e-01, 5.498e-02, -5.673e-02, -3.743e-01, 4.981e-02, -1.785e-02, 1.958e-02, 3.487e-02));
// Constant offset added once per pixel.
r += V4(7.249e-03, 2.949e-03, 5.297e-03, 3.693e-03);
return r;
}
// Pass 5 entry point. Each invocation handles one pixel of the input-sized grid:
// it reads the 3x3 neighbourhood of t1 through l0(), splits every sample into
// max(v, 0) and -max(-v, 0), and stores f0() of those eighteen vectors in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: the l0() sampling macro is expected
	// to pick them up by name at the expansion site.
	const float2 pt = float2(GetInputPt());
	const uint2 pixel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	// Nothing to write when the pixel lies outside the input grid.
	if (any(pixel >= extent)) {
		return;
	}
	const float2 pos = (pixel + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const V4 r0c0 = l0(-1.0, -1.0);
	const V4 r0c1 = l0(0.0, -1.0);
	const V4 r0c2 = l0(1.0, -1.0);
	const V4 r1c0 = l0(-1.0, 0.0);
	const V4 r1c1 = l0(0.0, 0.0);
	const V4 r1c2 = l0(1.0, 0.0);
	const V4 r2c0 = l0(-1.0, 1.0);
	const V4 r2c1 = l0(0.0, 1.0);
	const V4 r2c2 = l0(1.0, 1.0);

	t0[pixel] = f0(
		// s0_0 .. s0_8: non-negative parts
		max(r0c0, 0.0), max(r0c1, 0.0), max(r0c2, 0.0),
		max(r1c0, 0.0), max(r1c1, 0.0), max(r1c2, 0.0),
		max(r2c0, 0.0), max(r2c1, 0.0), max(r2c2, 0.0),
		// s1_0 .. s1_8: non-positive parts
		-max(-r0c0, 0.0), -max(-r0c1, 0.0), -max(-r0c2, 0.0),
		-max(-r1c0, 0.0), -max(-r1c1, 0.0), -max(-r1c2, 0.0),
		-max(-r2c0, 0.0), -max(-r2c1, 0.0), -max(-r2c2, 0.0));
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// l0(x, y): fetch t0 at tap offset (x, y) through the O() macro and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Output kernel of the out-shuffle pass for one input pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass6 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns tanh() of the sum of mul(tap, constant M4) over all eighteen taps plus a
// constant V4. Pass6 adds the four components to the first YUV channel of the four
// output pixels it writes.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(2.340e-02, 8.171e-02, -1.124e-01, -5.065e-02, -5.505e-02, -5.540e-02, -3.000e-03, -1.346e-02, 3.800e-02, 4.944e-02, -2.084e-02, 6.388e-03, 8.566e-02, 2.480e-02, 1.184e-01, -1.075e-04));
r += mul(s0_1, M4(-2.188e-02, -2.056e-01, 1.480e-02, -7.451e-02, 5.240e-02, 4.098e-02, -4.668e-03, 1.810e-02, -2.533e-02, -6.403e-02, 1.984e-02, -5.716e-02, -3.356e-03, -2.173e-01, 1.218e-01, 1.179e-01));
r += mul(s0_2, M4(7.330e-03, 2.521e-02, 1.372e-02, 3.411e-02, -1.438e-02, -1.009e-02, 7.676e-03, -1.712e-02, 5.980e-03, 2.040e-02, -8.766e-03, 3.442e-02, -1.623e-02, -2.557e-02, -6.086e-03, 5.413e-04));
r += mul(s0_3, M4(1.754e-01, 6.364e-02, 2.842e-01, 2.378e-01, -1.684e-01, -1.911e-02, -3.838e-01, -2.622e-02, 2.065e-01, 3.951e-02, 4.217e-01, 4.374e-02, -1.028e-02, 2.417e-02, -1.595e-02, 6.305e-02));
r += mul(s0_4, M4(-5.620e-02, -8.609e-02, -1.256e-01, -3.166e-01, -1.712e-01, -1.602e-01, -1.577e-01, -4.901e-01, -5.012e-02, 1.082e-01, -7.271e-02, 4.072e-01, -7.789e-02, -1.725e-01, -1.397e-01, -4.507e-01));
r += mul(s0_5, M4(1.401e-02, 4.716e-02, 1.486e-02, 4.642e-02, 1.131e-02, 3.865e-02, -9.865e-03, 9.301e-02, 3.441e-03, -8.098e-03, -6.012e-03, -1.549e-01, 1.486e-02, 1.872e-02, -2.469e-03, 1.294e-02));
r += mul(s0_6, M4(-3.894e-02, -4.136e-05, -3.022e-02, 1.045e-03, -3.730e-02, -1.838e-02, -5.573e-02, -2.760e-02, 3.516e-02, 1.602e-02, 6.358e-02, 3.111e-02, -3.045e-02, -7.728e-03, -4.189e-02, -1.102e-02));
r += mul(s0_7, M4(-1.184e-02, 1.728e-02, 7.925e-03, 6.763e-02, 2.590e-03, -9.456e-03, -4.407e-02, -2.044e-02, 4.472e-02, 2.228e-02, 7.233e-02, 4.863e-02, -1.814e-02, -2.034e-03, -4.994e-02, -2.460e-02));
r += mul(s0_8, M4(-3.292e-03, -9.015e-03, -3.171e-03, -2.504e-02, 2.120e-03, 3.064e-02, 2.108e-02, 4.592e-02, 2.258e-03, -2.192e-04, -3.576e-03, 3.733e-02, -1.931e-03, -5.083e-03, 5.877e-03, -1.764e-02));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(4.321e-02, -8.135e-02, -1.567e-01, -6.888e-03, -6.542e-02, -1.656e-02, 1.236e-02, -7.563e-03, 4.657e-02, 9.222e-03, -6.696e-03, -3.545e-03, -6.401e-01, 1.189e-01, 1.509e-01, 2.417e-01));
r += mul(s1_1, M4(-2.058e-02, 1.174e-01, -2.482e-02, -8.423e-02, -1.692e-02, -1.094e-02, 3.530e-02, 1.780e-02, -9.937e-02, -9.030e-02, 2.304e-02, 1.294e-02, 7.976e-02, -3.096e-01, 1.382e-01, 2.456e-01));
r += mul(s1_2, M4(4.491e-02, -1.336e-02, 3.593e-02, -3.503e-02, -8.630e-03, -4.295e-03, -1.356e-02, 3.843e-02, 9.887e-03, 1.913e-03, 2.247e-03, 1.113e-02, -7.234e-04, -3.058e-02, 2.833e-03, -1.707e-02));
r += mul(s1_3, M4(2.007e-01, 6.756e-02, 9.393e-01, 9.057e-02, -3.701e-01, -1.729e-02, -4.136e-01, 2.233e-02, 2.783e-01, 3.590e-02, 3.564e-01, 8.342e-03, 1.333e-01, 7.944e-02, -2.312e-01, 8.354e-02));
r += mul(s1_4, M4(-3.334e-01, -2.705e-01, -4.072e-01, 3.946e-01, 5.159e-03, -5.860e-01, 1.578e-01, -3.614e-01, 5.366e-01, 4.699e-01, -3.700e-01, 9.463e-02, -4.090e-02, -9.767e-02, -7.999e-02, -4.859e-01));
r += mul(s1_5, M4(5.700e-02, 6.092e-02, 4.114e-02, -1.564e-02, -1.345e-02, 9.692e-02, 1.456e-03, 9.371e-02, -3.845e-02, -4.751e-02, -2.509e-02, -2.842e-01, 2.938e-03, 2.387e-02, -6.191e-04, -3.120e-04));
r += mul(s1_6, M4(3.888e-02, 4.969e-02, -1.851e-01, -9.866e-03, -3.527e-02, -1.377e-02, -7.594e-02, -2.619e-02, 3.259e-02, 9.636e-03, 8.622e-03, 1.788e-02, -3.505e-02, -1.048e-03, -1.329e-02, 1.425e-02));
r += mul(s1_7, M4(6.891e-03, 8.118e-02, -6.443e-02, -1.487e-01, 2.183e-02, 1.106e-03, 6.656e-02, -9.506e-02, 7.418e-04, -6.015e-02, 3.594e-01, 1.039e-02, -3.600e-02, -7.771e-03, -3.406e-02, 2.935e-02));
r += mul(s1_8, M4(-4.598e-03, -4.678e-03, 1.595e-02, -8.273e-03, 6.740e-03, 1.175e-02, -2.997e-02, -6.116e-03, -3.788e-02, -9.471e-02, -2.149e-02, 4.139e-02, -9.614e-03, -5.573e-03, -1.643e-02, -1.712e-02));
// Constant offset added once per pixel.
r += V4(2.510e-03, 4.409e-03, 2.891e-03, 4.977e-03);
// tanh bounds every component to (-1, 1).
return tanh(r);
}
// Pass 6 entry point (out-shuffle). Each invocation evaluates f0() once for one
// t0 pixel and writes a 2x2 quad of OUTPUT pixels: every output pixel is INPUT
// sampled with SL, converted with rgb2yuv, given one component of the f0()
// result on its first channel (saturated), and converted back with yuv2rgb.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: the l0() sampling macro is expected
	// to pick them up by name at the expansion site.
	const float2 pt = float2(GetInputPt());
	// Origin of this invocation's 2x2 quad in output coordinates.
	const uint2 quad = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 extent = GetOutputSize();
	if (any(quad >= extent)) {
		return;
	}
	// Centre of the source pixel (quad >> 1) in normalized input coordinates.
	const float2 pos = ((quad >> 1) + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const V4 r0c0 = l0(-1.0, -1.0);
	const V4 r0c1 = l0(0.0, -1.0);
	const V4 r0c2 = l0(1.0, -1.0);
	const V4 r1c0 = l0(-1.0, 0.0);
	const V4 r1c1 = l0(0.0, 0.0);
	const V4 r1c2 = l0(1.0, 0.0);
	const V4 r2c0 = l0(-1.0, 1.0);
	const V4 r2c1 = l0(0.0, 1.0);
	const V4 r2c2 = l0(1.0, 1.0);

	const V4 delta = f0(
		// s0_0 .. s0_8: non-negative parts
		max(r0c0, 0.0), max(r0c1, 0.0), max(r0c2, 0.0),
		max(r1c0, 0.0), max(r1c1, 0.0), max(r1c2, 0.0),
		max(r2c0, 0.0), max(r2c1, 0.0), max(r2c2, 0.0),
		// s1_0 .. s1_8: non-positive parts
		-max(-r0c0, 0.0), -max(-r0c1, 0.0), -max(-r0c2, 0.0),
		-max(-r1c0, 0.0), -max(-r1c1, 0.0), -max(-r1c2, 0.0),
		-max(-r2c0, 0.0), -max(-r2c1, 0.0), -max(-r2c2, 0.0));

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};

	const float2 opt = float2(GetOutputPt());
	// Sampling position, stepped around the quad with the same sequence of float
	// updates as the output pixels: (0,0) -> (1,0) -> (1,1) -> (0,1).
	float2 uv = pos - 0.5f * opt;
	float3 yuv;

	// Quad pixel (0, 0) takes delta.x.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// Quad pixel (1, 0) takes delta.y.
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[quad + uint2(1, 0)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// Quad pixel (1, 1) takes delta.w.
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[quad + uint2(1, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// Quad pixel (0, 1) takes delta.z.
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[quad + uint2(0, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}

View file

@ -0,0 +1,486 @@
// CuNNy 4x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N04
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with the POINT-filter sampler SP at mip level 0, at
// pos + p * pt. `pos` and `pt` are not parameters: they must be in scope wherever
// the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the minimum-16-bit-precision vector and matrix types used by the passes.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): sample INPUT at tap offset (x, y) via O() and reduce its RGB to a single
// min16float: a dot product with fixed per-channel weights plus a constant offset.
#define l0(x, y) min16float((dot(float3(-4.174e-01, -7.873e-01, -1.763e-01), O(INPUT, float2(x, y)).rgb) + 1.011e+00))
// Input-layer kernel for one pixel.
// s0_0..s0_8 are nine scalar taps; Pass1 supplies the 3x3 neighbourhood of l0()
// samples row by row (offsets (-1,-1) .. (1,1)).
// Returns the sum of (constant V4 * tap) over the nine taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(1.222e-01, 7.038e-03, 1.179e-01, 1.876e-01) * s0_0;
r += V4(1.025e-01, -2.993e-01, 3.154e-01, -1.050e-01) * s0_1;
r += V4(5.656e-02, -3.117e-03, -6.665e-02, -2.044e-01) * s0_2;
r += V4(-5.045e-01, -4.189e-01, -3.076e-01, -3.691e-01) * s0_3;
r += V4(1.365e-01, 6.699e-01, 3.389e-01, 4.561e-01) * s0_4;
r += V4(-7.690e-02, 2.655e-02, -1.044e-02, 7.271e-02) * s0_5;
r += V4(1.358e-02, 3.378e-03, -1.802e-01, -1.936e-01) * s0_6;
r += V4(8.227e-02, 1.550e-02, -1.820e-01, -1.670e-01) * s0_7;
r += V4(9.988e-03, 1.413e-03, -2.486e-02, 3.258e-01) * s0_8;
// Constant offset added once per pixel.
r += V4(3.566e-02, -1.308e-03, -5.595e-03, -5.246e-03);
return r;
}
// Pass 1 entry point ("in"). Each invocation handles one pixel of the input-sized
// grid: it gathers the nine scalar l0() samples of the 3x3 neighbourhood of INPUT
// and stores f0() of them in t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: O(), used inside l0(), reads them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 pixel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	// Nothing to write when the pixel lies outside the input grid.
	if (any(pixel >= extent)) {
		return;
	}
	const float2 pos = (pixel + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const min16float r0c0 = l0(-1.0, -1.0);
	const min16float r0c1 = l0(0.0, -1.0);
	const min16float r0c2 = l0(1.0, -1.0);
	const min16float r1c0 = l0(-1.0, 0.0);
	const min16float r1c1 = l0(0.0, 0.0);
	const min16float r1c2 = l0(1.0, 0.0);
	const min16float r2c0 = l0(-1.0, 1.0);
	const min16float r2c1 = l0(0.0, 1.0);
	const min16float r2c2 = l0(1.0, 1.0);

	t0[pixel] = f0(
		r0c0, r0c1, r0c2,
		r1c0, r1c1, r1c2,
		r2c0, r2c1, r2c2);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-sample t0 at tap offset (x, y) via O() and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv1 kernel for one output pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass2 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns the sum of mul(tap, constant M4) over all eighteen taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(1.282e-01, 1.199e-01, 1.156e-01, -4.091e-02, -1.771e-02, -1.431e-01, -1.478e-02, 4.041e-02, -1.559e-01, 1.231e-02, -8.571e-02, 2.159e-02, -6.484e-02, 3.819e-02, -3.386e-02, -3.344e-02));
r += mul(s0_1, M4(6.131e-02, 1.493e-01, 1.954e-01, -2.565e-01, 1.570e-01, -3.852e-01, -2.313e-01, 9.262e-02, 1.038e-01, -4.169e-01, -2.446e-01, 9.953e-02, -1.830e-01, -9.774e-02, -1.498e-01, 8.626e-02));
r += mul(s0_2, M4(9.908e-02, 1.372e-01, -1.254e-02, 4.486e-03, 1.023e-01, 6.484e-02, 1.645e-01, -4.932e-02, -4.221e-02, -1.919e-01, -2.135e-02, 6.955e-02, -1.406e-01, 8.082e-02, -7.935e-02, 3.010e-02));
r += mul(s0_3, M4(-7.203e-02, -1.210e-01, 1.084e-01, -6.958e-03, 1.303e-01, 1.030e-01, -2.392e-01, -1.084e-01, 2.173e-01, -7.864e-02, -2.983e-01, -3.510e-01, -3.076e-01, 4.533e-02, 1.940e-01, 4.051e-01));
r += mul(s0_4, M4(9.270e-02, -4.072e-01, 2.338e-01, 4.098e-01, -1.440e-01, 6.971e-01, 5.515e-01, 2.682e-01, -1.401e-01, 3.504e-02, 1.366e-01, 6.149e-01, -3.330e-01, 1.880e-01, -4.170e-01, 3.244e-01));
r += mul(s0_5, M4(-5.380e-01, -7.843e-02, -1.293e-01, -9.225e-02, 1.393e-01, -2.588e-01, 4.618e-01, -2.264e-02, -5.369e-02, 1.321e-01, -3.029e-02, 7.983e-02, -1.048e-01, 3.279e-02, -5.969e-02, -3.766e-03));
r += mul(s0_6, M4(3.432e-02, 1.518e-02, 1.940e-02, -1.086e-01, 1.052e-01, -5.430e-02, -3.343e-02, 1.824e-01, -9.831e-02, 1.097e-02, 6.281e-02, 1.194e-01, 3.253e-02, 4.046e-02, -2.183e-02, -1.328e-01));
r += mul(s0_7, M4(1.538e-01, 6.796e-02, -4.870e-01, 7.139e-02, -2.497e-01, 2.916e-02, 6.191e-01, -2.650e-01, -4.194e-02, 1.782e-01, -3.431e-01, -9.707e-02, 2.173e-02, -1.150e-01, -8.162e-03, 4.551e-02));
r += mul(s0_8, M4(5.804e-02, 5.436e-02, -1.604e-01, 8.077e-02, 2.685e-01, 4.741e-02, 1.225e-01, -1.033e-01, -4.358e-02, -1.091e-01, 8.815e-02, -3.121e-02, -2.569e-02, -1.093e-02, -2.550e-02, -1.571e-02));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(8.760e-02, 1.254e-01, 9.299e-02, -1.140e-02, 4.179e-02, -1.333e-01, 3.048e-03, -3.111e-02, -6.091e-02, 6.563e-03, 4.609e-03, -4.717e-02, -6.470e-02, -5.791e-02, -5.529e-03, 8.697e-02));
r += mul(s1_1, M4(6.935e-02, 9.805e-02, 1.851e-01, -2.726e-01, 1.731e-01, -2.863e-01, -2.267e-01, -3.813e-02, 1.104e-01, -3.193e-01, -1.958e-01, 9.567e-02, 1.819e-01, -2.054e-01, 1.228e-01, 3.906e-02));
r += mul(s1_2, M4(-1.957e-01, 7.733e-02, -2.023e-01, 1.297e-01, -1.646e-01, 1.304e-01, -1.728e-02, -4.396e-02, 7.828e-02, -2.639e-01, 3.389e-02, 1.101e-01, 1.388e-01, -4.075e-03, 1.023e-01, -7.785e-03));
r += mul(s1_3, M4(-2.828e-02, -7.018e-02, 4.269e-02, -1.386e-01, 2.143e-02, 2.504e-01, -2.134e-01, -2.483e-01, 1.075e-01, -2.671e-02, -2.588e-01, -3.271e-01, 1.173e-01, -6.103e-02, 5.539e-01, 5.341e-01));
r += mul(s1_4, M4(-2.415e-01, -2.975e-01, -6.622e-02, 4.027e-01, -5.871e-01, 7.506e-01, 1.939e-02, -1.680e-01, 4.796e-01, -2.840e-01, 5.077e-01, 9.122e-02, 1.463e-01, 2.124e-01, 6.358e-02, 2.993e-01));
r += mul(s1_5, M4(4.298e-01, -1.754e-01, 5.357e-01, -1.440e-01, -4.439e-01, -3.819e-01, -1.009e-01, 2.113e-02, -2.275e-02, -1.842e-02, 1.441e-01, 6.590e-03, 2.627e-02, 3.381e-02, 9.956e-02, -1.935e-02));
r += mul(s1_6, M4(-5.557e-02, 3.378e-02, -2.451e-02, -1.718e-01, -2.037e-01, 1.631e-02, -2.822e-01, -7.724e-02, -6.657e-02, -2.282e-02, 2.673e-02, 8.716e-02, 1.291e-01, 9.472e-03, 3.810e-02, -1.134e-01));
r += mul(s1_7, M4(1.441e-01, 4.331e-02, -4.741e-01, 2.165e-01, -5.974e-01, -2.669e-02, -4.949e-02, -3.179e-01, 1.007e-01, 1.512e-01, -4.138e-02, -7.470e-02, 8.828e-02, -1.400e-01, 5.797e-02, -4.988e-03));
r += mul(s1_8, M4(-2.478e-01, 1.392e-01, -8.663e-02, -3.629e-02, 1.823e-01, 7.573e-03, -2.445e-01, -1.641e-02, -5.197e-02, -8.804e-02, 1.244e-01, 2.095e-02, 1.683e-02, -4.073e-02, -5.207e-03, -3.854e-03));
// Constant offset added once per pixel.
r += V4(-4.317e-03, 2.687e-03, -1.530e-03, 4.681e-04);
return r;
}
// Pass 2 entry point ("conv1"). Each invocation handles one pixel of the
// input-sized grid: it reads the 3x3 neighbourhood of t0 through l0(), splits every
// sample into max(v, 0) and -max(-v, 0), and stores f0() of those eighteen vectors
// in t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep their exact names: O(), used inside l0(), reads them by name.
	const float2 pt = float2(GetInputPt());
	const uint2 pixel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	// Nothing to write when the pixel lies outside the input grid.
	if (any(pixel >= extent)) {
		return;
	}
	const float2 pos = (pixel + 0.5) * pt;

	// Taps are named r<row>c<col> with row = y + 1 and col = x + 1.
	const V4 r0c0 = l0(-1.0, -1.0);
	const V4 r0c1 = l0(0.0, -1.0);
	const V4 r0c2 = l0(1.0, -1.0);
	const V4 r1c0 = l0(-1.0, 0.0);
	const V4 r1c1 = l0(0.0, 0.0);
	const V4 r1c2 = l0(1.0, 0.0);
	const V4 r2c0 = l0(-1.0, 1.0);
	const V4 r2c1 = l0(0.0, 1.0);
	const V4 r2c2 = l0(1.0, 1.0);

	t1[pixel] = f0(
		// s0_0 .. s0_8: non-negative parts
		max(r0c0, 0.0), max(r0c1, 0.0), max(r0c2, 0.0),
		max(r1c0, 0.0), max(r1c1, 0.0), max(r1c2, 0.0),
		max(r2c0, 0.0), max(r2c1, 0.0), max(r2c2, 0.0),
		// s1_0 .. s1_8: non-positive parts
		-max(-r0c0, 0.0), -max(-r0c1, 0.0), -max(-r0c2, 0.0),
		-max(-r1c0, 0.0), -max(-r1c1, 0.0), -max(-r1c2, 0.0),
		-max(-r2c0, 0.0), -max(-r2c1, 0.0), -max(-r2c2, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-sample t1 at tap offset (x, y) via O() and convert the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv2 kernel for one output pixel.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine V4 taps. Pass3 supplies the 3x3
// neighbourhood row by row (offsets (-1,-1) .. (1,1)), with max(v, 0) in s0_* and
// -max(-v, 0) in s1_*.
// Returns the sum of mul(tap, constant M4) over all eighteen taps plus a constant V4.
// NOTE(review): the literals look like generated network weights - presumably not
// meant to be edited by hand; confirm against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the s0_* taps.
r += mul(s0_0, M4(1.921e-01, -2.132e-02, -5.460e-03, -6.681e-02, 9.988e-02, -2.228e-02, 4.719e-02, 9.124e-03, -1.072e-01, 1.506e-01, 2.070e-02, -4.671e-02, 2.244e-01, -4.895e-02, -8.150e-03, -9.520e-02));
r += mul(s0_1, M4(8.226e-02, 4.651e-02, -1.842e-01, -3.376e-02, 1.349e-01, 2.148e-02, -1.746e-01, 1.671e-02, 9.761e-02, 7.581e-02, 1.470e-01, -8.582e-02, -1.149e-01, 2.143e-02, -1.597e-01, 1.626e-01));
r += mul(s0_2, M4(-5.810e-04, -3.566e-02, 4.708e-02, -3.068e-02, 1.578e-02, 5.503e-03, 3.081e-02, -4.174e-02, 3.394e-01, 7.398e-02, -9.467e-02, -1.127e-01, -1.314e-01, 1.511e-02, 1.538e-01, -5.695e-03));
r += mul(s0_3, M4(2.959e-01, 3.316e-02, -5.716e-02, -2.233e-01, 5.020e-01, -1.416e-01, -6.082e-02, -3.393e-01, 3.292e-01, -6.813e-02, 9.009e-02, -1.638e-01, 1.190e-01, -2.728e-02, -6.042e-02, -1.360e-01));
r += mul(s0_4, M4(5.902e-01, 3.040e-01, -2.870e-01, 2.228e-02, -1.646e-01, 2.078e-02, -1.480e-01, 2.083e-01, -4.397e-01, -2.549e-01, -1.168e-01, -4.199e-01, 2.199e-01, 2.596e-02, 2.598e-02, -1.313e-01));
r += mul(s0_5, M4(1.043e-01, 1.050e-02, -5.654e-02, -1.265e-01, -1.978e-01, 3.772e-02, 2.474e-01, 1.395e-01, 2.041e-01, 6.617e-02, -2.602e-01, -1.601e-01, -5.577e-02, -1.591e-02, 2.096e-01, 2.594e-02));
r += mul(s0_6, M4(7.245e-02, 6.156e-02, 5.317e-02, -3.912e-01, 1.871e-01, -2.079e-02, -2.552e-02, -6.961e-02, 2.686e-01, 8.518e-02, -1.026e-01, -4.040e-01, -6.324e-02, 7.999e-03, 1.317e-02, 1.619e-02));
r += mul(s0_7, M4(1.240e-01, -8.349e-02, -1.258e-01, -3.269e-01, 6.624e-01, -1.357e-01, -6.738e-01, -5.998e-01, -8.375e-04, 2.226e-01, -1.880e-01, 5.678e-02, -8.383e-02, -3.455e-02, -1.399e-02, 4.540e-02));
r += mul(s0_8, M4(-3.130e-02, 9.691e-02, 1.763e-01, -1.847e-02, -1.193e-01, -7.494e-03, 1.485e-02, 1.244e-02, 9.559e-02, 3.116e-02, 8.046e-03, -1.264e-01, -2.403e-01, 6.389e-02, 2.999e-01, 1.484e-01));
// Contributions of the s1_* taps.
r += mul(s1_0, M4(2.569e-01, -8.689e-03, -1.806e-02, -3.993e-02, 9.155e-02, -2.022e-02, 1.034e-02, -3.455e-02, -1.534e-01, 1.836e-02, -1.176e-03, 3.593e-03, 2.642e-01, -6.587e-02, -4.169e-02, -2.237e-01));
r += mul(s1_1, M4(1.398e-01, 1.020e-02, -2.478e-01, 2.747e-02, 7.152e-02, 1.835e-02, -2.013e-01, 1.151e-02, -2.586e-01, -3.622e-02, 2.529e-01, 1.465e-01, -3.973e-01, 5.907e-02, -9.450e-02, 3.761e-02));
r += mul(s1_2, M4(3.157e-02, 7.847e-03, 8.109e-03, -3.333e-02, -3.333e-02, -6.401e-03, -6.632e-03, 3.296e-02, -1.433e-02, 2.167e-02, 1.194e-01, -1.028e-01, -2.104e-01, 1.352e-02, -6.835e-02, 1.901e-01));
r += mul(s1_3, M4(3.443e-01, -1.004e-01, -6.176e-02, -3.047e-01, 4.779e-01, -7.928e-02, -8.134e-02, -4.873e-01, -1.421e-01, 3.972e-02, 7.459e-02, 2.099e-01, 1.118e-01, -1.022e-02, -8.584e-02, -1.657e-01));
r += mul(s1_4, M4(-1.721e-01, 2.625e-02, -7.292e-03, 2.646e-01, 2.505e-02, 1.479e-01, -3.357e-01, 1.088e-01, 1.016e-01, -1.902e-01, -1.622e-01, -6.326e-02, -4.305e-01, 4.763e-01, -1.357e-03, -5.685e-01));
r += mul(s1_5, M4(3.324e-03, 1.692e-02, -5.726e-02, 2.853e-02, -3.135e-01, -4.534e-03, 2.549e-01, 1.183e-01, -1.277e-01, -5.030e-02, 9.190e-02, 1.145e-01, 3.445e-01, 6.425e-02, -2.707e-01, -1.701e-01));
r += mul(s1_6, M4(2.164e-02, 1.998e-02, 1.667e-02, -6.126e-02, 2.400e-01, -9.253e-02, -4.525e-02, 8.615e-03, 5.148e-02, -1.803e-02, -7.495e-02, -7.102e-02, -2.646e-02, 6.819e-02, 1.465e-01, 1.904e-01));
r += mul(s1_7, M4(-2.339e-02, 3.350e-02, -1.274e-01, 5.525e-02, 9.120e-01, -9.074e-01, -6.856e-01, -7.422e-02, 4.849e-02, -1.377e-02, -1.409e-01, -5.792e-02, -1.044e-01, 9.079e-02, 2.520e-01, 2.053e-01));
r += mul(s1_8, M4(1.891e-02, -1.562e-02, -1.024e-02, -2.686e-02, -1.038e-01, -3.210e-02, 4.222e-01, -2.084e-01, -1.841e-01, 3.231e-02, 7.320e-02, 1.727e-01, 2.861e-01, 2.506e-02, -2.266e-01, -3.940e-01));
// Constant offset added once per pixel.
r += V4(-1.043e-03, 3.601e-03, 5.622e-03, -7.848e-04);
return r;
}
// Pass3 entry point. Each thread handles one input texel: it gathers the 3x3
// neighbourhood through l0, separates every tap into its non-negative and
// non-positive part, and stores the value returned by f0 in t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are never handed to l0 as arguments, so the macro
	// presumably picks them up by name from this scope - keep both names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread maps to a texel outside the input; nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Unmodified taps, row by row from offset (-1, -1) to (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of every tap (first nine arguments of f0).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	// Non-positive part of every tap (last nine arguments of f0).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// Reads texture t0 at offset (x, y) through the O helper and converts the
// result to V4. NOTE(review): O is defined outside this excerpt; presumably it
// samples relative to the caller's `pos` in steps of `pt` - confirm there.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv3 kernel used by Pass4. Every one of the 18 inputs is multiplied by its
// own constant 4x4 matrix (M4); the products are summed in the order written
// and a constant 4-component offset is added last.
// As called from Pass4, s0_0..s0_8 are the nine taps of t0 (offsets (-1,-1)
// through (1,1), row by row) clamped to >= 0, and s1_0..s1_8 are the same taps
// clamped to <= 0.
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-7.801e-03, 7.517e-03, 5.348e-02, 7.686e-02, -8.770e-03, 1.144e-02, -2.398e-02, 1.355e-02, -4.642e-02, 5.880e-02, 3.263e-02, 1.860e-01, -4.443e-02, -2.732e-02, -2.133e-02, -1.166e-01));
r += mul(s0_1, M4(-1.751e-02, -1.230e-02, -1.218e-01, -1.231e-01, 4.092e-03, -8.769e-03, -2.251e-03, 5.142e-02, 4.354e-03, -4.445e-02, -2.369e-01, -1.616e-01, 4.495e-03, -1.326e-01, -5.371e-01, -5.119e-01));
r += mul(s0_2, M4(3.143e-02, 2.366e-02, 8.884e-02, -1.819e-02, 2.358e-03, 3.812e-04, -4.972e-02, -5.311e-02, 1.729e-02, 1.523e-02, 7.798e-02, -1.705e-05, -2.295e-02, 6.567e-02, 1.422e-01, 1.890e-01));
r += mul(s0_3, M4(2.363e-02, 1.555e-02, -1.307e-01, -8.190e-02, 1.026e-02, 9.724e-03, 5.358e-02, -2.783e-01, 7.268e-03, 1.659e-01, -5.801e-02, 3.076e-01, -1.575e-01, -9.567e-02, 3.294e-02, -7.694e-01));
r += mul(s0_4, M4(1.677e-02, -1.324e-01, 4.019e-01, -2.902e-01, -6.051e-02, -4.625e-02, 8.409e-01, 4.756e-01, -1.135e-01, -3.213e-01, 6.389e-02, -2.083e-01, -1.219e+00, 2.280e-01, 9.667e-01, -3.604e-01));
r += mul(s0_5, M4(-5.948e-02, 1.567e-01, 3.883e-02, -4.843e-03, -2.153e-02, 3.439e-02, -1.160e-01, -1.325e-02, -5.312e-02, 1.136e-01, -5.260e-02, -3.524e-02, 7.315e-02, 3.527e-01, 6.186e-01, -7.505e-02));
r += mul(s0_6, M4(-3.841e-02, 1.620e-03, 9.449e-02, -8.648e-02, -2.656e-02, -1.676e-03, 2.364e-03, -7.221e-02, -9.590e-02, 4.160e-02, -1.278e-02, -3.171e-02, 6.213e-02, 2.673e-02, -7.931e-02, 2.588e-01));
r += mul(s0_7, M4(-3.636e-02, -1.558e-01, 2.151e-01, 1.188e-01, 1.275e-01, -8.114e-02, -8.376e-02, -3.690e-02, -1.968e-02, -1.038e-01, 8.994e-02, 3.846e-02, -1.499e-01, 6.457e-01, -8.201e-02, -3.935e-01));
r += mul(s0_8, M4(-2.833e-03, 2.529e-01, -3.350e-03, -3.433e-02, 1.943e-02, -2.796e-02, 3.313e-02, 1.582e-02, 1.702e-02, 5.663e-02, -1.647e-02, -2.229e-02, -4.865e-01, 3.285e-01, -4.462e-01, -4.307e-01));
r += mul(s1_0, M4(-6.004e-02, 4.898e-03, 3.591e-02, 1.900e-01, -3.816e-02, -3.269e-02, 1.459e-01, -3.464e-03, -1.235e-02, -3.737e-02, 1.569e-02, 2.559e-01, -3.173e-04, 1.268e-02, 8.886e-03, 2.960e-02));
r += mul(s1_1, M4(-1.582e-02, -7.507e-02, -2.026e-01, 2.027e-01, -6.107e-02, 2.055e-02, -5.811e-02, 5.420e-03, 1.028e-02, -1.374e-02, -6.152e-01, -2.259e-01, -3.408e-03, -1.800e-02, 4.574e-02, -9.590e-02));
r += mul(s1_2, M4(4.210e-02, 2.126e-02, 8.277e-02, 2.079e-02, -1.733e-01, -2.483e-02, 2.686e-01, 1.498e-01, 7.352e-02, -2.511e-02, 3.159e-02, 5.775e-02, 5.942e-02, 3.383e-02, 1.274e-01, -5.928e-02));
r += mul(s1_3, M4(5.614e-02, 7.561e-02, -8.328e-02, 2.427e-01, 7.214e-02, -1.122e-01, 9.434e-02, -2.602e-01, -1.052e-02, -6.944e-02, -3.023e-02, -1.655e-01, 1.236e-03, 4.025e-03, -3.082e-02, -1.533e-01));
r += mul(s1_4, M4(6.675e-01, -2.254e-01, 1.173e+00, -8.261e-02, 5.655e-01, -2.000e-01, 8.301e-01, 1.458e+00, -2.497e-01, -1.091e+00, -4.698e-01, -1.876e-01, -3.358e-02, -2.854e-01, 5.032e-01, -1.558e-01));
r += mul(s1_5, M4(-1.444e-02, 1.502e-01, -4.221e-02, -4.864e-02, 3.236e-01, -2.572e-01, 1.344e-01, 8.562e-02, -1.030e-01, 2.690e-01, 1.238e-01, 3.309e-02, -3.849e-02, 1.860e-01, 6.528e-03, 2.840e-02));
r += mul(s1_6, M4(-1.161e-01, 5.405e-02, -3.101e-02, -1.009e-01, -9.594e-02, -1.207e-02, -3.836e-02, -6.894e-02, -1.770e-02, -2.958e-02, 8.484e-02, -2.284e-02, 2.585e-04, -2.764e-02, 4.972e-02, -5.968e-02));
r += mul(s1_7, M4(-4.113e-02, -1.948e-01, -2.728e-02, -3.142e-02, -2.894e-01, -1.111e-01, 7.492e-02, -2.892e-02, 9.054e-02, 4.350e-02, 2.183e-01, 1.489e-01, 1.167e-02, -6.678e-02, 3.696e-02, -1.315e-02));
r += mul(s1_8, M4(2.532e-02, 4.585e-02, -3.694e-02, -6.244e-02, -1.673e-01, 6.180e-02, -4.475e-02, 1.028e-02, -1.658e-02, 8.923e-02, 1.711e-02, 3.037e-03, 4.651e-02, 1.652e-01, 7.863e-03, -3.387e-02));
// Constant per-component offset, added after all 18 products.
r += V4(-6.562e-04, 7.371e-04, -4.319e-03, -8.757e-04);
return r;
}
// Pass4 entry point (conv3). Each thread handles one input texel: it gathers
// the 3x3 neighbourhood of t0 through l0, separates every tap into its
// non-negative and non-positive part, and stores the value returned by f0 in t1.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are never handed to l0 as arguments, so the macro
	// presumably picks them up by name from this scope - keep both names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread maps to a texel outside the input; nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Unmodified taps, row by row from offset (-1, -1) to (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of every tap (first nine arguments of f0).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	// Non-positive part of every tap (last nine arguments of f0).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	t1[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// Reads texture t1 at offset (x, y) through the O helper and converts the
// result to V4. NOTE(review): O is defined outside this excerpt; presumably it
// samples relative to the caller's `pos` in steps of `pt` - confirm there.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv4 kernel used by Pass5. Every one of the 18 inputs is multiplied by its
// own constant 4x4 matrix (M4); the products are summed in the order written
// and a constant 4-component offset is added last.
// As called from Pass5, s0_0..s0_8 are the nine taps of t1 (offsets (-1,-1)
// through (1,1), row by row) clamped to >= 0, and s1_0..s1_8 are the same taps
// clamped to <= 0.
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-1.087e-01, -5.083e-02, 3.146e-01, -4.241e-02, 4.462e-02, -4.358e-02, -1.562e-01, -2.609e-03, 5.918e-02, -2.526e-02, -3.132e-02, -1.150e-02, -8.799e-03, 3.070e-02, -1.680e-02, -1.046e-02));
r += mul(s0_1, M4(1.762e-01, 8.784e-01, -2.704e+00, -1.565e+00, -1.473e-01, -5.723e-01, 7.838e-02, -7.420e-03, -1.769e-01, -2.041e-01, -1.783e-03, -4.944e-03, 1.304e-02, 2.646e-01, -1.708e-01, 7.483e-03));
r += mul(s0_2, M4(-1.907e-01, 1.514e-01, -3.657e-01, -5.840e-01, -4.943e-02, -1.014e-02, -2.869e-03, 6.488e-03, 2.266e-02, -3.850e-02, 6.125e-03, 1.899e-02, -3.541e-02, -2.011e-01, 1.567e-01, 1.008e-02));
r += mul(s0_3, M4(-3.061e-01, -1.768e-01, 9.163e-02, -2.243e-01, 4.945e-02, 1.106e-01, -1.137e-01, 1.755e-02, 2.640e-01, -9.298e-02, -1.704e-01, 3.935e-02, 1.506e-01, -3.284e-02, 4.719e-02, 5.543e-02));
r += mul(s0_4, M4(-4.579e-01, -6.198e-02, -9.889e-01, -4.446e-01, -1.612e-01, 1.518e-01, 2.588e-01, 1.075e-02, -1.527e+00, -7.923e-01, 8.120e-02, -1.116e-01, -2.079e-01, -1.206e-01, -4.422e-01, -1.951e-01));
r += mul(s0_5, M4(1.064e-01, -1.684e-01, 2.316e-01, 4.211e-01, -9.153e-02, 9.155e-02, -7.649e-02, -1.385e-01, 9.422e-02, -1.631e-01, 8.278e-02, 3.318e-01, 7.284e-02, 3.489e-01, -2.303e-02, -6.554e-01));
r += mul(s0_6, M4(-6.320e-02, -4.390e-02, 1.453e-02, 3.187e-02, 2.166e-02, 2.423e-03, 1.573e-03, -2.226e-02, 1.401e-01, 2.026e-01, -2.249e-01, 6.471e-02, 3.593e-02, -1.575e-02, -3.186e-02, 1.339e-02));
r += mul(s0_7, M4(2.778e-02, 7.495e-02, -1.086e-01, 8.862e-02, -2.352e-02, 1.477e-02, 2.741e-02, 4.345e-02, -2.865e-01, 9.405e-02, 1.880e-01, -3.610e-01, -7.797e-02, -5.710e-03, 3.386e-02, 2.830e-02));
r += mul(s0_8, M4(-3.734e-02, 3.357e-02, 5.657e-03, -1.596e-01, -7.661e-03, 1.603e-02, -3.137e-02, -7.023e-03, 6.522e-03, -2.715e-02, 2.765e-02, 4.724e-02, 1.922e-02, 3.944e-02, -8.276e-02, -1.915e-02));
r += mul(s1_0, M4(-7.121e-02, -2.276e-02, 7.266e-02, -4.411e-03, -5.600e-01, 4.502e-01, -1.817e-01, -2.906e-01, -5.675e-02, 2.653e-02, 3.284e-02, -1.925e-03, -4.729e-03, -1.554e-03, -6.081e-03, -2.195e-02));
r += mul(s1_1, M4(2.212e-01, 3.154e-01, -2.765e-01, 4.432e-02, 1.402e+00, 2.159e-01, 4.402e-01, 2.537e-01, 6.697e-02, 1.207e-01, -5.192e-02, 2.638e-02, 5.366e-02, 5.855e-02, -3.687e-02, 4.389e-03));
r += mul(s1_2, M4(3.137e-02, -1.157e-01, 9.497e-02, -3.724e-02, 5.241e-02, 7.793e-02, 2.277e-04, -4.033e-01, 1.432e-02, 4.622e-02, -1.636e-02, -5.840e-03, -1.593e-02, -7.447e-02, 3.943e-02, -3.517e-03));
r += mul(s1_3, M4(-1.209e-02, -1.350e-01, 3.018e-01, 1.233e-01, -1.262e-03, 2.194e-01, -2.919e-01, -8.031e-03, 4.620e-03, 5.318e-02, 1.247e-02, -4.260e-02, 7.155e-02, 3.256e-02, -9.839e-02, -6.741e-04));
r += mul(s1_4, M4(3.291e-01, 2.397e-01, -2.820e-01, 5.703e-01, 7.831e-03, 5.816e-02, -1.696e-02, -1.957e-01, -1.851e-01, 3.696e-02, -2.611e-01, 7.039e-03, -1.562e-01, -7.676e-01, 9.080e-01, 7.823e-02));
r += mul(s1_5, M4(9.918e-03, 6.364e-02, 3.364e-02, -3.291e-01, 1.393e-02, 3.139e-02, 1.701e-02, -5.675e-02, 5.085e-02, -2.050e-01, 1.160e-01, 4.875e-02, -1.189e-01, 2.310e-01, -1.353e-01, 2.046e-02));
r += mul(s1_6, M4(-5.477e-03, -1.704e-02, 9.510e-03, -1.701e-02, 1.391e-02, -8.760e-03, -3.355e-02, -6.898e-03, -9.203e-03, -2.442e-02, 7.547e-03, 1.817e-02, 1.871e-02, -1.149e-02, 6.458e-02, 1.403e-02));
r += mul(s1_7, M4(-5.073e-03, -5.454e-02, -2.710e-02, 1.292e-02, 2.458e-02, 1.739e-02, -2.319e-03, 3.865e-02, 5.399e-02, -1.176e-02, -1.315e-01, 1.489e-01, -7.903e-02, 8.120e-02, 4.749e-02, 1.961e-01));
r += mul(s1_8, M4(4.163e-02, -1.603e-02, 8.659e-03, 1.023e-01, 5.233e-03, -2.900e-03, -5.293e-03, -5.829e-03, -1.453e-02, 2.467e-02, 7.198e-02, -2.407e-01, -4.023e-02, 1.009e-01, -1.560e-01, -1.567e-01));
// Constant per-component offset, added after all 18 products.
r += V4(-3.709e-04, 2.029e-04, -3.042e-03, -2.970e-04);
return r;
}
// Pass5 entry point (conv4). Each thread handles one input texel: it gathers
// the 3x3 neighbourhood of t1 through l0, separates every tap into its
// non-negative and non-positive part, and stores the value returned by f0 in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are never handed to l0 as arguments, so the macro
	// presumably picks them up by name from this scope - keep both names.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread maps to a texel outside the input; nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Unmodified taps, row by row from offset (-1, -1) to (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of every tap (first nine arguments of f0).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	// Non-positive part of every tap (last nine arguments of f0).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// Reads texture t0 at offset (x, y) through the O helper and converts the
// result to V4. NOTE(review): O is defined outside this excerpt; presumably it
// samples relative to the caller's `pos` in steps of `pt` - confirm there.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Output kernel used by Pass6. Every one of the 18 inputs is multiplied by its
// own constant 4x4 matrix (M4); the products are summed in the order written,
// a constant 4-component offset is added, and the sum is passed through tanh.
// As called from Pass6, s0_0..s0_8 are the nine taps of t0 (offsets (-1,-1)
// through (1,1), row by row) clamped to >= 0, and s1_0..s1_8 are the same taps
// clamped to <= 0. Pass6 adds the four returned components to the Y value of
// the four pixels of one 2x2 output block (x: top-left, y: top-right,
// z: bottom-left, w: bottom-right).
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-6.857e-02, -6.042e-02, 3.293e-03, -2.389e-03, -1.606e-01, -1.556e-02, -5.115e-02, -4.602e-02, -3.762e-02, 1.994e-02, -2.370e-02, 3.558e-02, -7.142e-01, 8.184e-01, -1.361e-01, 1.228e-01));
r += mul(s0_1, M4(-1.887e-01, -2.260e-01, 1.293e-02, -1.757e-02, 1.257e-01, 1.304e-01, -4.525e-02, 4.471e-02, 6.895e-01, -4.096e-01, 4.096e-02, 1.817e-02, -1.343e-01, -4.170e-01, 3.991e-03, 1.516e-03));
r += mul(s0_2, M4(-2.667e-01, -8.692e-02, 1.481e-01, -1.466e-01, 6.142e-02, -2.084e-02, 1.942e-02, 6.700e-04, 3.942e-02, 3.109e-01, -1.323e-02, 2.240e-02, -2.306e-02, -4.749e-02, -1.155e-02, 1.843e-03));
r += mul(s0_3, M4(-1.004e-01, -1.184e-02, -8.590e-02, -1.018e-01, 6.862e-02, -4.700e-02, -1.537e-01, -1.096e-01, -1.228e-01, 1.462e-02, -1.715e-01, 1.862e-02, 3.668e-01, -1.138e-01, 8.494e-04, 6.113e-01));
r += mul(s0_4, M4(4.389e-01, -5.527e-01, -4.972e-01, -7.620e-01, 1.684e-01, 5.375e-02, 1.032e+00, 5.723e-01, 4.427e-02, -2.447e-01, 1.132e+00, -5.297e-01, 1.150e-01, 3.877e-01, 1.224e-01, 1.294e-01));
r += mul(s0_5, M4(-1.023e+00, 1.567e+00, -9.747e-01, 1.051e+00, 1.537e-02, 1.993e-01, -1.679e-01, 1.139e-01, -7.358e-02, -1.782e-01, -1.938e-01, 4.419e-02, 2.001e-02, 5.881e-02, 8.971e-03, 3.368e-03));
r += mul(s0_6, M4(-5.126e-03, 1.449e-02, -7.018e-02, 2.929e-02, 4.748e-02, -4.443e-03, -5.791e-02, -3.490e-02, 3.817e-02, 1.007e-02, -5.501e-02, -1.488e-02, -8.848e-03, 4.884e-02, -6.548e-02, 3.392e-02));
r += mul(s0_7, M4(-4.449e-02, 7.313e-02, 3.311e-01, 3.138e-02, -6.466e-02, 5.666e-02, 1.929e-01, 8.274e-02, 3.994e-02, 2.105e-02, -1.821e-01, -1.539e-02, -9.333e-03, -4.728e-02, 6.975e-03, -3.292e-03));
r += mul(s0_8, M4(2.038e-01, -2.356e-01, -1.987e-01, -3.746e-02, -1.499e-02, -7.007e-02, -9.546e-02, 1.905e-02, -9.802e-03, 1.990e-02, 2.140e-02, -8.164e-03, 5.109e-03, -2.081e-02, -2.386e-02, 1.183e-02));
r += mul(s1_0, M4(-7.067e-02, -4.613e-02, -5.433e-04, -2.191e-02, -1.125e-01, -3.650e-02, -1.298e-02, -3.479e-02, -1.118e-01, -1.521e-02, -4.731e-03, -7.478e-03, 1.802e-01, 4.872e-02, -1.599e-03, -1.452e-02));
r += mul(s1_1, M4(-2.920e-01, -1.831e-01, -1.305e-02, 4.031e-02, 1.989e-01, 3.120e-03, 2.025e-02, 5.432e-02, 2.607e-01, 2.403e-02, 1.863e-02, 8.423e-02, -3.372e-01, -1.327e-01, -1.248e-01, -1.247e-01));
r += mul(s1_2, M4(-9.286e-02, -1.948e-01, -8.532e-03, 7.416e-03, 4.578e-02, 1.581e-01, 1.473e-03, -3.796e-02, 1.011e-01, 2.393e-01, 2.742e-02, -4.224e-02, -9.579e-03, -9.888e-02, -2.065e-03, 7.685e-03));
r += mul(s1_3, M4(-2.056e-01, -3.479e-02, -2.666e-01, -5.344e-02, 1.579e-01, -6.091e-02, -1.655e-01, -1.575e-01, -8.230e-02, -4.748e-02, -1.304e-01, -7.186e-02, 2.953e-01, 6.950e-02, 1.865e-01, 7.567e-02));
r += mul(s1_4, M4(3.408e-01, -1.054e-01, -2.613e-01, -6.084e-01, 3.193e-01, 6.366e-01, 4.251e-01, 4.066e-01, -3.742e-01, -8.521e-02, 5.906e-01, 1.870e-01, 2.044e-02, 2.495e-01, 1.046e-01, 3.018e-01));
r += mul(s1_5, M4(4.748e-03, 2.086e-01, 4.231e-03, -7.764e-03, 3.933e-02, 3.446e-03, -3.431e-02, 8.415e-02, -3.798e-02, -3.428e-01, -7.206e-02, 2.392e-01, 2.157e-02, 2.692e-02, 3.313e-02, 1.841e-02));
r += mul(s1_6, M4(1.813e-02, 2.306e-03, -3.402e-02, 1.009e-03, 4.408e-02, -2.307e-02, -3.394e-02, -3.912e-02, 3.822e-02, -1.051e-02, -1.023e-01, -4.626e-02, -4.871e-02, 6.250e-03, 1.367e-01, 3.674e-02));
r += mul(s1_7, M4(-1.170e-02, 3.747e-02, 1.548e-01, 1.243e-01, -1.074e-01, -9.848e-03, 2.627e-01, 1.132e-01, 4.550e-02, 5.050e-02, -1.194e-01, -6.091e-02, -2.180e-02, -6.381e-02, -5.949e-02, 1.580e-02));
r += mul(s1_8, M4(-1.146e-04, -1.852e-02, -1.515e-02, 2.488e-02, -1.877e-02, -7.739e-02, -6.812e-02, 7.656e-03, 2.688e-02, 5.650e-02, 4.285e-02, -3.270e-02, 1.163e-03, 8.328e-04, -1.998e-02, -2.282e-02));
// Constant per-component offset, added after all 18 products.
r += V4(-3.259e-04, -3.197e-04, 4.954e-04, 4.568e-04);
// tanh keeps each returned component inside (-1, 1).
return tanh(r);
}
// Helper for Pass6: samples INPUT at `uv` with the linear sampler, converts the
// colour to YUV, adds `lumaDelta` to Y (saturated to [0, 1]) while leaving U and
// V untouched, and converts back to an opaque RGB colour.
float4 Pass6_Compose(float2 uv, float lumaDelta) {
	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	const float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	return float4(mul(yuv2rgb, float3(saturate(yuv.r + lumaDelta), yuv.yz)), 1);
}

// Pass6 entry point (out-shuffle). Each thread handles one 2x2 block of OUTPUT:
// it evaluates f0 once on the 3x3 neighbourhood of t0 around the matching input
// texel and uses the four returned components as luma corrections for the four
// output pixels.
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are never handed to l0 as arguments, so the macro
	// presumably picks them up by name from this scope - keep both names.
	const float2 pt = float2(GetInputPt());
	// Top-left output pixel of this thread's 2x2 block.
	const uint2 corner = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(corner >= extent)) {
		// This thread maps to a block outside the output; nothing to write.
		return;
	}
	// Centre of the input texel covered by the block (output coordinate halved).
	const float2 pos = ((corner >> 1) + 0.5) * pt;

	// Unmodified taps, row by row from offset (-1, -1) to (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Non-negative part of every tap (first nine arguments of f0).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	// Non-positive part of every tap (last nine arguments of f0).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	const V4 r = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);

	// Walk the 2x2 block clockwise, moving the destination pixel and the INPUT
	// sample position together by one output texel per step. The position is
	// updated incrementally (not recomputed) so the arithmetic stays unchanged.
	const float2 opt = float2(GetOutputPt());
	uint2 dst = corner;
	float2 uv = pos - 0.5f * opt;

	// Top-left.
	OUTPUT[dst] = Pass6_Compose(uv, r.x);

	// Top-right.
	++dst.x;
	uv.x += opt.x;
	OUTPUT[dst] = Pass6_Compose(uv, r.y);

	// Bottom-right.
	++dst.y;
	uv.y += opt.y;
	OUTPUT[dst] = Pass6_Compose(uv, r.w);

	// Bottom-left.
	--dst.x;
	uv.x -= opt.x;
	OUTPUT[dst] = Pass6_Compose(uv, r.z);
}

View file

@ -0,0 +1,921 @@
// CuNNy 4x8C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D08N04
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): samples texture `t` at mip level 0 with the point sampler SP at
// `pos + p * pt`. `pos` and `pt` are not macro parameters - they must be
// variables in scope wherever O is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the minimum-precision 4-vector and 4x4 matrix types.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1
// Reduces the INPUT sample at offset (x, y) (see O) to a single min16float:
// a fixed weighted sum of its R, G and B channels plus a constant offset.
#define l0(x, y) min16float((dot(float3(2.214e-01, 4.385e-01, 1.006e-01), O(INPUT, float2(x, y)).rgb) + -6.858e-01))
// First of the two input kernels of pass 1; Pass1 stores the result in t0.
// Each of the nine scalar inputs scales its own constant 4-vector; the scaled
// vectors are summed in the order written and a constant 4-vector is added last.
// As called from Pass1, s0_0..s0_8 are the l0 values at offsets (-1,-1) through
// (1,1), row by row.
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(-2.401e-02, 1.817e-03, -1.218e-01, 2.796e-02) * s0_0;
r += V4(3.256e-02, 3.929e-03, -5.850e-02, -5.602e-02) * s0_1;
r += V4(4.497e-04, -1.812e-02, 5.241e-02, 3.698e-02) * s0_2;
r += V4(5.371e-01, -2.302e-01, -1.373e-01, -4.038e-03) * s0_3;
r += V4(1.565e-01, -6.067e-02, 3.397e-01, -3.741e-01) * s0_4;
r += V4(-2.095e-03, 4.044e-02, -3.770e-02, 5.665e-02) * s0_5;
r += V4(-1.993e-01, -2.645e-01, -8.892e-02, 1.948e-02) * s0_6;
r += V4(-4.865e-01, 5.400e-01, -1.396e-01, 1.270e-01) * s0_7;
r += V4(-1.667e-02, -9.433e-03, -1.324e-02, -1.803e-03) * s0_8;
// Constant per-component offset.
r += V4(2.880e-04, 1.418e-02, 1.413e-02, -1.036e-01);
return r;
}
// Second of the two input kernels of pass 1; Pass1 stores the result in t1.
// Each of the nine scalar inputs scales its own constant 4-vector; the scaled
// vectors are summed in the order written and a constant 4-vector is added last.
// As called from Pass1, s0_0..s0_8 are the l0 values at offsets (-1,-1) through
// (1,1), row by row.
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f1(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(-4.610e-02, -6.199e-01, 8.493e-03, -1.532e-02) * s0_0;
r += V4(-7.178e-02, 5.957e-01, 1.575e-03, 1.807e-02) * s0_1;
r += V4(1.106e-01, 3.625e-03, 3.713e-02, -4.124e-03) * s0_2;
r += V4(1.288e-01, -5.582e-02, 5.082e-02, 1.674e-02) * s0_3;
r += V4(-6.074e-01, 8.818e-02, -3.371e-01, -6.663e-01) * s0_4;
r += V4(-8.030e-02, -4.780e-03, -3.421e-01, 5.358e-02) * s0_5;
r += V4(4.990e-01, 7.623e-03, 1.778e-03, 2.401e-02) * s0_6;
r += V4(9.546e-02, -1.656e-02, 6.935e-04, 6.387e-01) * s0_7;
r += V4(-2.302e-02, 5.209e-03, 5.835e-02, -6.361e-02) * s0_8;
// Constant per-component offset.
r += V4(-4.485e-04, -2.620e-04, 2.449e-02, -7.403e-04);
return r;
}
// Pass1 entry point (in). Each thread handles one input texel: it reduces the
// 3x3 neighbourhood of INPUT to nine scalars through l0 and writes the two
// 4-component results of f0 and f1 to t0 and t1.
void Pass1(uint2 blockStart, uint3 tid) {
	// pt and pos must keep these names: the O macro used by l0 expands to an
	// expression that reads them from this scope.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread maps to a texel outside the input; nothing to write.
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Scalar taps, row by row from offset (-1, -1) to (1, 1).
	const min16float tap0 = l0(-1.0, -1.0);
	const min16float tap1 = l0(0.0, -1.0);
	const min16float tap2 = l0(1.0, -1.0);
	const min16float tap3 = l0(-1.0, 0.0);
	const min16float tap4 = l0(0.0, 0.0);
	const min16float tap5 = l0(1.0, 0.0);
	const min16float tap6 = l0(-1.0, 1.0);
	const min16float tap7 = l0(0.0, 1.0);
	const min16float tap8 = l0(1.0, 1.0);

	// Both kernels consume the same nine taps.
	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[texel] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// l0 / l1: sample t0 / t1 at offset (x, y) through O (point sampler, relative
// to the caller's `pos` in steps of `pt`) and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// First conv1 kernel of pass 2. Every one of the 36 inputs (four groups
// s0..s3 of nine) is multiplied by its own constant 4x4 matrix (M4); the
// products are summed in the order written and a constant 4-component offset
// is added last.
// NOTE(review): the calling Pass2 lies outside this excerpt. Presumably the
// four groups are the 3x3 taps of t0 and t1, each split into a non-negative
// and a non-positive part as the other passes do - confirm against Pass2.
// NOTE(review): the constants look like generated network weights - do not
// edit or reorder them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
r += mul(s0_0, M4(7.103e-02, 1.495e-01, -1.731e-02, -9.952e-02, -1.539e-01, -1.103e-01, 7.099e-02, 2.023e-01, 2.681e-02, 5.202e-03, 1.954e-02, -6.822e-02, -1.650e-01, 3.710e-01, -6.020e-01, 4.879e-01));
r += mul(s0_1, M4(-1.168e-02, 2.587e-01, -4.670e-01, -3.986e-02, -1.268e-01, 3.619e-02, 5.712e-02, 1.722e-01, 4.473e-02, -1.224e-01, 8.228e-02, -3.981e-02, 4.044e-01, -3.039e-01, -3.390e-01, 5.925e-02));
r += mul(s0_2, M4(4.083e-02, 7.140e-02, -5.864e-01, 1.188e-01, 2.214e-01, -2.826e-01, 2.294e-01, -2.199e-01, -9.048e-02, 1.787e-01, -6.887e-02, -6.645e-02, -1.285e-01, -8.261e-02, -1.975e-01, 2.428e-01));
r += mul(s0_3, M4(-5.801e-02, -3.381e-02, -2.285e-01, 9.377e-02, 1.878e-01, 9.285e-02, -1.001e-01, -5.059e-02, -2.155e-02, -9.098e-02, -1.279e-02, 9.801e-02, 1.178e-01, -1.967e-01, -4.792e-02, -1.106e-01));
r += mul(s0_4, M4(3.048e-01, 2.731e-01, -2.351e-01, -1.516e-01, -1.382e-02, 1.296e-01, -9.530e-02, 2.975e-02, 2.411e-01, 2.343e-02, 1.731e-02, -2.331e-01, -2.161e-01, 4.114e-01, 4.417e-01, 1.225e+00));
r += mul(s0_5, M4(3.337e-01, 2.844e-01, 1.065e-01, -2.391e-01, -1.265e-01, -3.625e-02, -7.062e-02, 3.529e-02, 2.208e-02, -8.459e-03, -1.366e-01, -1.563e-02, -1.648e-01, -5.919e-01, 4.061e-01, -4.975e-02));
r += mul(s0_6, M4(6.213e-03, -2.020e-02, 2.520e-03, 2.167e-02, -2.361e-01, -1.421e-01, -4.579e-02, -1.353e-01, -2.883e-01, -5.900e-04, 2.720e-02, 1.591e-01, -5.120e-01, -4.253e-01, -3.397e-02, -4.633e-01));
r += mul(s0_7, M4(2.456e-01, -6.978e-02, 5.668e-02, -9.795e-03, -1.925e-01, -4.841e-02, -1.273e-02, 1.282e-02, -1.223e-01, -4.080e-02, 2.975e-02, 1.595e-01, -3.345e-01, -1.504e-01, 1.080e-01, 8.549e-01));
r += mul(s0_8, M4(8.700e-02, 1.611e-02, 8.589e-02, -3.284e-02, -1.637e-01, 2.627e-01, 1.851e-02, 2.843e-02, 1.224e-01, 6.163e-02, 4.991e-02, -1.510e-01, 1.885e-01, -5.951e-02, -3.463e-02, 2.172e-01));
r += mul(s1_0, M4(1.856e-01, -1.041e-01, 1.900e-01, 8.420e-02, -3.223e-01, 6.258e-02, -9.766e-02, -6.517e-01, 3.066e-02, -7.562e-02, 1.015e-02, -1.139e-01, 1.569e-02, -3.684e-02, -2.813e-02, 8.835e-02));
r += mul(s1_1, M4(-7.107e-02, -1.146e-01, 5.488e-01, -2.960e-01, 3.743e-01, -5.368e-01, -2.219e-01, -3.122e-01, 2.468e-02, -7.477e-01, 1.858e-01, 3.498e-01, 1.771e-03, 4.215e-03, 8.478e-02, 9.318e-02));
r += mul(s1_2, M4(-2.350e-03, -3.382e-01, 5.964e-01, -2.321e-01, 2.011e-01, 1.890e-01, -2.062e-01, -3.725e-02, -1.003e-01, -1.464e-01, 1.040e-01, 9.994e-02, -7.113e-02, -3.827e-02, -1.258e-01, -1.584e-01));
r += mul(s1_3, M4(-1.609e-01, -1.460e-01, -4.804e-03, 5.503e-02, 2.784e-01, -1.475e-02, 9.395e-02, -1.128e-01, 1.032e-02, -1.969e-01, 2.170e-01, 2.335e-01, -1.371e-01, 4.853e-02, 8.945e-03, -2.698e-01));
r += mul(s1_4, M4(7.739e-02, -1.105e-01, 3.348e-01, 1.093e-01, -7.745e-02, -1.642e-01, -2.191e-01, -2.674e-02, 4.199e-01, -3.302e-01, 1.445e-01, -2.815e-01, -3.154e-01, 6.646e-02, 8.520e-02, -1.053e-01));
r += mul(s1_5, M4(-4.165e-01, -8.545e-02, 2.291e-01, -1.042e-01, 3.791e-01, -7.209e-02, -6.332e-02, -3.174e-01, 1.038e-01, 8.122e-03, -9.715e-02, 6.808e-01, -9.362e-02, -4.634e-02, 5.184e-03, 1.295e-01));
r += mul(s1_6, M4(-8.179e-02, -8.513e-02, 4.470e-02, -7.799e-02, -1.092e-01, -1.851e-01, -1.025e-01, -4.220e-02, -3.853e-01, 3.040e-02, -9.081e-02, 1.439e-01, -2.730e-02, -5.086e-02, 5.352e-03, -5.102e-03));
r += mul(s1_7, M4(7.601e-02, -1.423e-01, 3.421e-01, 2.574e-03, 1.165e-01, 6.863e-03, 1.250e-02, -4.862e-02, -3.859e-01, -1.108e-01, 2.515e-02, 5.564e-01, 2.485e-01, 2.230e-01, -3.839e-02, 3.605e-02));
r += mul(s1_8, M4(-9.424e-02, 1.248e-01, 1.980e-01, -1.671e-01, 1.098e-01, 6.555e-02, -7.194e-02, -1.626e-01, -1.439e-01, -2.086e-01, -1.925e-02, 1.520e-01, 2.139e-01, -7.764e-02, 6.469e-02, 7.875e-03));
r += mul(s2_0, M4(4.572e-02, 3.661e-02, -3.845e-01, -1.383e-01, 1.729e-02, 1.780e-02, 3.664e-02, -6.961e-02, -9.001e-03, -1.853e-02, -6.735e-02, -1.864e-02, 1.695e-01, -1.420e-01, 2.679e-01, -1.525e-01));
r += mul(s2_1, M4(9.967e-02, -2.869e-01, -2.251e-01, 8.470e-02, 3.178e-02, -9.701e-03, 9.260e-02, 4.087e-04, -8.081e-02, 1.341e-01, 5.882e-03, 1.043e-02, 8.559e-03, 6.534e-02, -4.619e-01, -3.010e-01));
r += mul(s2_2, M4(-1.676e-02, -3.339e-01, 1.848e-01, -2.562e-01, -8.563e-02, 2.487e-02, 2.495e-01, 9.448e-02, 2.189e-02, -3.018e-02, 5.698e-02, 6.041e-02, -4.869e-02, -2.627e-02, 1.602e-01, 1.092e-01));
r += mul(s2_3, M4(6.867e-02, -1.693e-01, -1.614e-01, -1.944e-01, 1.992e-01, 1.720e-01, 2.393e-01, 1.219e-02, 4.866e-02, -1.165e-01, -1.285e-01, 2.929e-01, 2.043e-01, -1.399e-02, 1.595e-02, -2.746e-01));
r += mul(s2_4, M4(-4.477e-01, -5.696e-01, -1.760e-02, 1.362e-01, 1.472e-01, 3.113e-01, -2.419e-01, 8.650e-02, -8.358e-02, 1.081e-01, 3.881e-02, -1.400e-01, -2.071e-01, 3.977e-02, -3.149e-01, 2.525e-01));
r += mul(s2_5, M4(5.496e-02, -9.963e-02, -1.227e-01, -1.892e-01, 4.361e-02, -3.776e-01, -6.576e-01, 2.628e-01, -8.215e-02, -8.123e-02, 2.248e-03, 1.261e-01, 1.193e-01, 2.608e-01, 2.567e-01, 8.120e-02));
r += mul(s2_6, M4(-1.587e-01, -9.849e-02, 1.122e-01, -5.963e-02, -9.176e-02, 7.341e-03, 1.164e-03, -5.660e-02, 1.567e-01, -6.958e-02, -3.780e-02, 4.238e-04, -6.186e-02, 1.777e-01, 2.398e-01, 6.853e-03));
r += mul(s2_7, M4(1.062e-01, -1.498e-01, 5.492e-02, 1.108e-01, -3.248e-01, -2.901e-01, -4.360e-01, 1.128e-01, 7.346e-02, 8.659e-02, 9.740e-02, -1.434e-01, 1.538e-01, 1.349e-01, 1.408e-01, -1.367e-01));
r += mul(s2_8, M4(1.412e-01, -8.889e-02, 2.029e-02, -1.523e-01, 4.847e-01, -7.432e-01, -1.181e-01, 4.132e-01, 3.119e-02, -5.840e-02, -2.292e-02, -3.125e-02, 2.440e-02, 2.815e-02, 2.759e-01, -8.781e-02));
r += mul(s3_0, M4(2.147e-02, 2.192e-01, 2.489e-01, -3.436e-02, 1.086e-02, -2.680e-02, -9.925e-02, 3.978e-02, 1.239e-01, 3.645e-02, 5.463e-01, 5.005e-01, 1.039e-01, -1.694e-01, -3.816e-02, 3.834e-01));
r += mul(s3_1, M4(1.418e-01, 5.806e-02, 1.317e-01, 2.227e-01, 1.486e-02, -4.235e-03, -5.750e-02, -1.548e-01, -7.700e-01, 3.263e-01, -1.193e-02, 3.537e-01, -2.841e-01, 4.657e-01, -1.576e-01, -9.526e-02));
r += mul(s3_2, M4(7.641e-02, 8.195e-01, 1.080e-01, 1.814e-01, -5.471e-02, 2.211e-02, -4.212e-02, -1.249e-02, 2.469e-02, 5.436e-01, 3.805e-01, -9.622e-02, -6.358e-02, -3.739e-01, -3.504e-01, -2.627e-01));
r += mul(s3_3, M4(-9.359e-02, -1.830e-02, -7.015e-02, -7.774e-02, 2.286e-01, -6.321e-02, -5.124e-02, -2.799e-03, -5.063e-01, -1.835e-01, 3.716e-01, 1.130e+00, 3.259e-01, -2.045e-01, -1.792e-01, 4.892e-01));
r += mul(s3_4, M4(-7.478e-01, -1.192e-01, 1.022e-01, 8.111e-01, 7.253e-02, 2.280e-01, -1.116e-01, -2.828e-01, -2.364e-01, -1.233e+00, -1.125e+00, 1.750e+00, -1.215e+00, 4.973e-02, 2.070e-01, 6.996e-01));
r += mul(s3_5, M4(-4.115e-02, 3.613e-01, 2.694e-01, 4.126e-02, 7.046e-02, 6.242e-02, 9.300e-02, -1.965e-01, -3.211e-01, 8.504e-01, 2.518e-01, -5.622e-01, 5.663e-02, -1.139e-01, 1.150e-01, -1.954e-01));
r += mul(s3_6, M4(-1.870e-01, -9.168e-02, -8.947e-02, 6.127e-03, 1.163e-02, 3.733e-04, -3.330e-01, 1.935e-01, 3.424e-01, 1.313e-01, -6.732e-01, 8.256e-02, 6.713e-02, 2.980e-02, -6.912e-02, 1.715e-01));
r += mul(s3_7, M4(1.636e-01, 1.212e-01, 2.280e-02, 1.552e-01, -4.955e-01, 8.376e-01, 1.476e-01, 2.192e-01, 9.746e-01, -3.148e-01, 8.206e-01, -8.104e-01, -7.918e-02, -1.604e-01, 5.505e-02, 7.640e-02));
r += mul(s3_8, M4(1.248e-01, 2.878e-01, -4.182e-02, -9.214e-02, -1.210e-01, 4.382e-01, 8.062e-02, -3.051e-01, -1.803e-01, -3.041e-01, 1.368e-01, -1.030e-01, 2.941e-02, -2.724e-01, 3.480e-02, 1.396e-02));
// Constant per-component offset, added after all 36 products.
r += V4(-3.046e-02, 3.515e-02, 4.880e-02, 4.740e-03);
return r;
}
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
r += mul(s0_0, M4(1.574e-01, -6.104e-03, -2.288e-01, 5.024e-03, -2.149e-03, -8.674e-02, 1.209e-01, 7.107e-02, 1.242e-01, 2.312e-03, -5.300e-02, -2.285e-01, -8.824e-02, 7.402e-02, -4.447e-01, 1.117e+00));
r += mul(s0_1, M4(-5.617e-02, 3.613e-01, -4.666e-01, 1.795e-01, 1.718e-01, -1.005e-01, -2.593e-01, 4.103e-01, 2.477e-01, 1.883e-01, 3.928e-02, -3.635e-01, -7.353e-01, 3.209e-01, 2.171e-01, 3.924e-01));
r += mul(s0_2, M4(-3.304e-01, 6.332e-01, -3.898e-01, 2.704e-01, 4.110e-02, -2.786e-01, -2.513e-01, 1.800e-01, 9.402e-03, -1.975e-01, -4.040e-02, -2.047e-01, -3.239e-01, -1.623e-01, 1.001e-01, 3.053e-02));
r += mul(s0_3, M4(3.935e-01, 5.218e-02, 4.630e-02, 2.202e-02, -2.172e-01, -1.530e-02, -1.782e-01, -9.327e-02, -6.425e-02, -2.402e-02, -2.919e-02, -4.034e-02, 6.589e-01, -4.900e-02, 7.783e-02, 6.334e-01));
r += mul(s0_4, M4(-5.475e-02, 1.543e-01, -1.597e-01, -2.500e-01, 4.990e-02, 5.780e-02, 1.162e-01, 1.140e-01, -2.980e-01, -2.524e-02, -2.103e-01, 4.297e-01, 4.528e-01, -3.098e-01, -1.415e-01, 7.565e-01));
r += mul(s0_5, M4(-1.097e-01, 3.376e-01, -5.685e-01, 1.347e-01, 1.155e-01, -1.396e-01, -2.840e-01, -1.373e-01, 1.442e-01, 8.711e-02, 1.357e-01, -1.110e-01, 2.095e-01, -2.901e-01, -1.007e-01, -2.473e-01));
r += mul(s0_6, M4(-7.405e-02, 9.320e-02, -5.870e-02, -2.569e-01, 6.017e-03, -8.078e-02, -3.798e-02, 2.334e-01, 1.440e-01, -1.852e-01, -6.627e-03, 3.514e-03, -1.499e-02, -6.237e-02, 3.665e-01, 3.270e-01));
r += mul(s0_7, M4(2.443e-01, 8.076e-02, -2.143e-01, 1.120e-01, 1.187e-01, 1.317e-01, 1.811e-01, 1.918e-01, -2.164e-02, -1.829e-01, 2.105e-01, 3.085e-01, 3.155e-01, 2.801e-01, -6.834e-01, 2.861e-01));
r += mul(s0_8, M4(9.974e-03, 9.704e-02, -2.363e-01, 1.829e-01, 1.844e-02, 9.298e-02, -5.319e-02, -5.899e-02, -2.154e-01, 2.555e-02, -8.374e-02, 1.254e-01, -2.736e-01, -4.065e-02, 4.838e-02, 3.338e-02));
r += mul(s1_0, M4(-1.239e-02, -1.316e-01, 8.694e-02, -8.443e-02, -1.143e-01, -6.018e-02, -9.054e-02, 7.381e-02, 2.722e-01, 1.030e-01, -8.583e-02, -4.433e-01, -1.339e-01, 1.264e-01, 8.581e-02, -1.947e-01));
r += mul(s1_1, M4(3.030e-01, -3.527e-02, 4.665e-01, -3.372e-02, -2.301e-02, 7.308e-01, 5.938e-01, -5.901e-01, 4.766e-01, 1.081e-01, 8.809e-02, 3.482e-01, -1.938e-01, -8.091e-02, 3.649e-02, 9.321e-02));
r += mul(s1_2, M4(1.376e-01, -4.460e-01, 4.298e-01, -4.809e-02, -3.819e-01, 5.216e-01, 2.687e-01, 1.359e-01, 2.936e-01, 1.222e-02, 3.706e-01, 2.481e-01, -4.716e-02, -1.798e-02, 2.731e-02, -7.140e-02));
r += mul(s1_3, M4(1.657e-01, -3.624e-02, 1.541e-01, -5.006e-03, -4.051e-01, -9.782e-02, 3.008e-02, 1.962e-01, -6.146e-02, 1.866e-03, -3.052e-01, -2.202e-01, 1.057e-01, -1.151e-01, -6.310e-02, 3.914e-01));
r += mul(s1_4, M4(-2.629e-01, 1.029e-01, 1.812e-02, -2.950e-01, -1.191e-01, 2.580e-01, -4.833e-01, 1.095e-01, 2.309e-02, 4.519e-02, 1.086e-01, 5.362e-01, -1.349e-01, -1.278e-01, 7.109e-02, -1.992e-01));
r += mul(s1_5, M4(-1.815e-01, 2.898e-01, 3.446e-01, -1.587e-01, -6.360e-02, 1.662e-01, 5.187e-01, 1.701e-01, -2.770e-02, -5.932e-01, 2.467e-01, 3.940e-01, 1.022e-01, 1.033e-01, -5.084e-02, -6.520e-02));
r += mul(s1_6, M4(-1.494e-01, 3.180e-02, 9.864e-02, -3.409e-01, 1.397e-02, 9.932e-03, -2.110e-01, 2.636e-01, 1.353e-01, -8.495e-02, -2.680e-03, -2.287e-01, 1.136e-01, -1.047e-01, 2.910e-02, 9.922e-02));
r += mul(s1_7, M4(1.533e-01, 4.819e-04, 1.735e-01, 2.027e-01, 1.316e-01, 1.029e-01, 1.446e-01, 1.737e-01, 4.855e-02, 4.781e-02, 2.025e-01, 1.587e-01, 1.661e-01, 7.134e-02, 5.853e-02, -1.530e-01));
r += mul(s1_8, M4(-1.476e-01, -4.916e-02, 1.989e-01, 1.159e-01, 4.753e-02, 1.694e-01, 4.343e-02, -6.974e-03, 3.382e-02, 2.275e-01, 3.466e-01, -7.178e-03, -1.104e-01, 2.059e-03, -7.101e-02, 8.934e-02));
r += mul(s2_0, M4(-3.467e-01, 8.471e-04, 1.580e-01, 2.685e-01, -2.680e-02, -6.444e-02, 8.843e-02, 5.232e-03, 2.576e-02, -3.756e-02, -7.913e-03, -3.871e-02, -5.374e-02, -6.060e-02, -7.688e-02, 6.738e-01));
r += mul(s2_1, M4(-3.963e-01, 1.295e-01, 2.623e-01, 2.565e-01, -1.831e-01, -6.054e-02, 1.817e-01, -8.944e-02, 1.974e-01, -2.800e-04, -3.964e-02, 1.232e-01, -3.477e-01, 3.791e-01, 1.438e-01, -7.862e-02));
r += mul(s2_2, M4(2.540e-02, 1.123e-01, 6.461e-01, -3.856e-03, 3.373e-02, -5.719e-02, 1.556e-01, -1.100e-01, -3.499e-02, 9.146e-02, -4.624e-02, 9.774e-02, -1.148e-01, -2.280e-01, 4.977e-01, -1.568e-01));
r += mul(s2_3, M4(5.352e-02, -1.293e-01, -6.991e-03, 4.190e-01, -2.334e-03, -4.433e-02, -8.470e-02, 1.162e-01, -1.045e-01, -7.444e-02, 8.951e-02, -1.124e-01, 4.295e-01, 1.086e-01, 1.336e-01, 2.645e-01));
r += mul(s2_4, M4(-4.062e-01, -6.781e-02, 4.629e-01, -4.931e-01, -1.875e-01, 1.958e-01, -4.560e-01, -2.286e-02, -2.066e-01, 1.151e-01, -5.924e-02, 1.350e-01, -1.752e-01, 2.244e-01, -3.564e-02, -6.129e-01));
r += mul(s2_5, M4(6.644e-02, 4.611e-01, 9.200e-02, 6.845e-03, -1.628e-02, 8.352e-02, -1.119e-01, -4.386e-02, -5.822e-02, -4.769e-02, -3.224e-02, -1.235e-01, -3.296e-01, 5.835e-03, 2.231e-01, 5.535e-02));
r += mul(s2_6, M4(-2.961e-02, -5.230e-02, 5.124e-02, 6.542e-02, 2.004e-01, 1.189e-01, -1.797e-01, -1.535e-02, 6.469e-02, 1.134e-01, -1.204e-04, -7.606e-02, 2.436e-02, -1.630e-02, 1.841e-01, -2.529e-01));
r += mul(s2_7, M4(-1.147e-02, 3.246e-02, 7.626e-02, -1.013e-01, 1.075e-01, 5.871e-01, -5.227e-01, -3.076e-01, 1.609e-01, 5.768e-02, -1.912e-02, 5.898e-02, -7.530e-02, -1.307e-01, 5.828e-02, -1.456e-02));
r += mul(s2_8, M4(-7.053e-02, 8.728e-02, 1.211e-01, 1.410e-01, -2.160e-01, 9.970e-02, -5.345e-01, 1.141e-01, 8.112e-04, -4.348e-02, 9.858e-02, 2.780e-02, -1.116e-01, -2.331e-01, 1.545e-01, 7.984e-02));
r += mul(s3_0, M4(-5.412e-02, 6.012e-03, -2.395e-01, -1.209e-02, -5.734e-02, 3.058e-02, -7.202e-02, -7.514e-02, 7.241e-03, -1.702e-01, 1.020e+00, 2.997e-01, -2.173e-01, 4.518e-02, -2.703e-02, -4.087e-02));
r += mul(s3_1, M4(-5.670e-02, -9.713e-03, -2.091e-01, -1.621e-01, -5.370e-03, -5.579e-02, 1.042e-01, 2.220e-02, 4.788e-01, -6.623e-01, 5.548e-01, 8.186e-01, 2.462e-01, -7.624e-01, -9.065e-02, -1.105e-02));
r += mul(s3_2, M4(4.043e-02, -1.577e-01, -3.166e-01, -1.256e-01, -9.515e-02, -8.852e-02, -4.960e-02, 1.129e-01, 1.690e-01, 2.314e-01, -5.134e-01, 9.584e-02, -3.085e-02, 2.399e-01, -3.381e-01, -7.233e-02));
r += mul(s3_3, M4(1.750e-01, -9.450e-02, -2.230e-01, 4.190e-01, 8.900e-02, 2.306e-02, 2.783e-01, -3.295e-01, 2.697e+00, 8.855e-02, 5.728e-01, -8.682e-01, 6.085e-02, 5.010e-02, 1.343e-01, 1.137e-01));
r += mul(s3_4, M4(9.857e-02, 3.310e-01, -3.584e-01, -5.586e-01, 5.751e-01, -4.023e-01, 3.838e-01, 1.240e-01, -1.482e-01, -1.233e-01, -5.953e-01, 1.534e+00, 3.390e-01, -2.022e-02, 1.619e-01, -2.959e-01));
r += mul(s3_5, M4(1.528e-01, 1.593e-01, -1.886e-01, 2.281e-02, 2.174e-01, -8.846e-01, 5.726e-02, 7.369e-03, -1.490e-01, 3.377e-01, -4.669e-02, 1.206e-01, -1.251e-01, 2.600e-01, -2.439e-01, 2.067e-01));
r += mul(s3_6, M4(4.090e-02, -2.118e-02, -9.012e-02, -8.624e-03, 1.464e-01, 6.929e-02, 1.492e-01, -4.039e-01, 6.123e-01, 2.679e-01, -2.284e-01, -3.609e-01, -6.598e-02, 1.341e-01, -2.371e-02, -2.899e-01));
r += mul(s3_7, M4(5.189e-02, -3.928e-02, 1.670e-01, -1.536e-01, 5.066e-01, -3.768e-01, 6.577e-01, 1.140e-01, -1.537e-01, -1.941e-01, -9.152e-02, -3.571e-02, 1.068e-01, 4.803e-02, -3.180e-01, 4.361e-02));
r += mul(s3_8, M4(-8.453e-02, -1.454e-02, 3.613e-02, 8.974e-03, -1.258e-01, -5.842e-01, 3.264e-01, 2.910e-01, 1.306e-01, 4.552e-01, 4.524e-01, 1.065e-02, -1.792e-02, 1.875e-02, -2.206e-01, 2.028e-01));
r += V4(3.015e-03, -4.690e-02, 3.573e-02, -1.486e-02);
return r;
}
// Pass 2 body for one thread: reads a 3x3 neighbourhood from each of the two
// inputs through the l0/l1 macros, splits every tap into a non-negative part
// max(v, 0) and a non-positive part -max(-v, 0), and stores f0(...) to t2 and
// f1(...) to t3 at the thread's pixel.
// NOTE(review): Rmp8x8, GetInputPt, GetInputSize and the O macro behind l0/l1
// are defined elsewhere; presumably gxy is the pixel coordinate and pos its
// centre in normalised texture space - confirm against those definitions.
void Pass2(uint2 blockStart, uint3 tid) {
	// pt, pos, gxy and size keep these exact names on purpose: l0/l1 expand to
	// O(...), which is defined outside this function and may refer to them.
	uint2 size = GetInputSize();
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);

	// Threads that fall outside the input do nothing.
	bool inside = gxy.x < size.x && gxy.y < size.y;
	if (!inside) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps of the first input, row by row from offset (-1,-1) to (1,1).
	V4 rawA_0 = l0(-1.0, -1.0), rawA_1 = l0(0.0, -1.0), rawA_2 = l0(1.0, -1.0);
	V4 rawA_3 = l0(-1.0, 0.0), rawA_4 = l0(0.0, 0.0), rawA_5 = l0(1.0, 0.0);
	V4 rawA_6 = l0(-1.0, 1.0), rawA_7 = l0(0.0, 1.0), rawA_8 = l0(1.0, 1.0);

	// Raw taps of the second input, same ordering.
	V4 rawB_0 = l1(-1.0, -1.0), rawB_1 = l1(0.0, -1.0), rawB_2 = l1(1.0, -1.0);
	V4 rawB_3 = l1(-1.0, 0.0), rawB_4 = l1(0.0, 0.0), rawB_5 = l1(1.0, 0.0);
	V4 rawB_6 = l1(-1.0, 1.0), rawB_7 = l1(0.0, 1.0), rawB_8 = l1(1.0, 1.0);

	// Sign split of the first input.
	V4 plusA_0 = max(rawA_0, 0.0), minusA_0 = -max(-rawA_0, 0.0);
	V4 plusA_1 = max(rawA_1, 0.0), minusA_1 = -max(-rawA_1, 0.0);
	V4 plusA_2 = max(rawA_2, 0.0), minusA_2 = -max(-rawA_2, 0.0);
	V4 plusA_3 = max(rawA_3, 0.0), minusA_3 = -max(-rawA_3, 0.0);
	V4 plusA_4 = max(rawA_4, 0.0), minusA_4 = -max(-rawA_4, 0.0);
	V4 plusA_5 = max(rawA_5, 0.0), minusA_5 = -max(-rawA_5, 0.0);
	V4 plusA_6 = max(rawA_6, 0.0), minusA_6 = -max(-rawA_6, 0.0);
	V4 plusA_7 = max(rawA_7, 0.0), minusA_7 = -max(-rawA_7, 0.0);
	V4 plusA_8 = max(rawA_8, 0.0), minusA_8 = -max(-rawA_8, 0.0);

	// Sign split of the second input.
	V4 plusB_0 = max(rawB_0, 0.0), minusB_0 = -max(-rawB_0, 0.0);
	V4 plusB_1 = max(rawB_1, 0.0), minusB_1 = -max(-rawB_1, 0.0);
	V4 plusB_2 = max(rawB_2, 0.0), minusB_2 = -max(-rawB_2, 0.0);
	V4 plusB_3 = max(rawB_3, 0.0), minusB_3 = -max(-rawB_3, 0.0);
	V4 plusB_4 = max(rawB_4, 0.0), minusB_4 = -max(-rawB_4, 0.0);
	V4 plusB_5 = max(rawB_5, 0.0), minusB_5 = -max(-rawB_5, 0.0);
	V4 plusB_6 = max(rawB_6, 0.0), minusB_6 = -max(-rawB_6, 0.0);
	V4 plusB_7 = max(rawB_7, 0.0), minusB_7 = -max(-rawB_7, 0.0);
	V4 plusB_8 = max(rawB_8, 0.0), minusB_8 = -max(-rawB_8, 0.0);

	// Argument order expected by f0/f1: plus(A), minus(A), plus(B), minus(B).
	t2[gxy] = f0(
		plusA_0, plusA_1, plusA_2, plusA_3, plusA_4, plusA_5, plusA_6, plusA_7, plusA_8,
		minusA_0, minusA_1, minusA_2, minusA_3, minusA_4, minusA_5, minusA_6, minusA_7, minusA_8,
		plusB_0, plusB_1, plusB_2, plusB_3, plusB_4, plusB_5, plusB_6, plusB_7, plusB_8,
		minusB_0, minusB_1, minusB_2, minusB_3, minusB_4, minusB_5, minusB_6, minusB_7, minusB_8);
	t3[gxy] = f1(
		plusA_0, plusA_1, plusA_2, plusA_3, plusA_4, plusA_5, plusA_6, plusA_7, plusA_8,
		minusA_0, minusA_1, minusA_2, minusA_3, minusA_4, minusA_5, minusA_6, minusA_7, minusA_8,
		plusB_0, plusB_1, plusB_2, plusB_3, plusB_4, plusB_5, plusB_6, plusB_7, plusB_8,
		minusB_0, minusB_1, minusB_2, minusB_3, minusB_4, minusB_5, minusB_6, minusB_7, minusB_8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Sampling shorthands for pass 3: l0 reads t2 and l1 reads t3 (this pass's //!IN
// textures) through the O macro with a float2(x, y) offset, then builds a V4 from
// the result.
// NOTE(review): O is defined outside this chunk; presumably the offset is in texels
// relative to the current pixel - confirm against its definition.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// Pass 3 (conv2), first output: weighted sum of all 36 input vectors plus a bias.
// Inputs, as supplied by Pass3:
//   s0_k = max(l0 tap k, 0)      s1_k = -max(-(l0 tap k), 0)   (sign split of t2)
//   s2_k = max(l1 tap k, 0)      s3_k = -max(-(l1 tap k), 0)   (sign split of t3)
//   k = 0..8 walks the 3x3 neighbourhood row by row, from offset (-1,-1) to (1,1).
// Every input is multiplied by its own M4 of 16 constants with mul() and the products
// are accumulated in the order listed; keep that order, since floating-point sums are
// order-sensitive.
// Pass3 stores the returned value to t0[gxy].
// NOTE(review): V4 and M4 are declared outside this chunk - presumably a 4-component
// vector and a 4x4 matrix; the constants look like trained network weights - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// Non-negative part of the l0 taps.
r += mul(s0_0, M4(8.130e-02, -1.243e-01, -7.648e-02, -2.424e-01, -4.742e-02, -5.420e-02, 4.117e-02, 1.568e-01, -3.621e-02, 2.032e-01, 4.484e-02, 1.249e-02, -1.505e-01, 7.294e-02, 4.943e-02, -6.336e-02));
r += mul(s0_1, M4(-1.474e-01, -3.366e-01, -5.670e-01, 4.113e-02, -1.260e-01, -1.539e-01, -5.421e-02, 1.779e-01, -1.072e-01, 1.209e-01, 4.423e-02, 2.454e-01, -5.430e-02, -1.442e-01, -1.501e-02, -4.731e-02));
r += mul(s0_2, M4(6.444e-02, 1.509e-01, 1.452e-01, -2.840e-02, 9.365e-02, 2.016e-01, 1.002e-01, -3.226e-02, -1.186e-01, 1.535e-01, -1.652e-01, -1.104e-02, 4.170e-02, -4.404e-02, 1.189e-01, 1.007e-02));
r += mul(s0_3, M4(-4.618e-02, 1.024e-01, -1.723e-01, -1.354e-01, 1.981e-01, -1.992e-01, 1.670e-01, 3.857e-01, -6.927e-03, 9.087e-02, 1.176e-01, 3.314e-01, 9.860e-02, 4.009e-04, 1.061e-01, -6.930e-02));
r += mul(s0_4, M4(4.923e-01, -9.248e-02, 8.616e-03, 4.541e-02, -1.148e-01, 3.990e-03, -3.218e-02, 8.942e-02, 3.219e-02, -9.786e-02, 6.813e-02, 2.492e-01, -3.165e-01, 6.925e-02, -9.826e-02, 4.518e-01));
r += mul(s0_5, M4(2.222e-02, 1.046e-01, -2.327e-02, -7.823e-02, 3.540e-01, 3.363e-01, -4.089e-02, 1.292e-02, -2.530e-01, 4.606e-01, -6.191e-02, -3.673e-02, -2.764e-01, -1.360e-01, -2.947e-03, 2.534e-02));
r += mul(s0_6, M4(-2.067e-02, -1.566e-01, -8.968e-02, 1.386e-03, -8.841e-02, -1.077e-01, 1.646e-01, 1.987e-01, -3.098e-01, 2.764e-01, 1.935e-01, 1.847e-01, 1.116e-01, -1.514e-01, -5.175e-02, 8.710e-02));
r += mul(s0_7, M4(1.266e-02, -2.119e-01, -1.610e-01, -6.512e-02, -1.679e-01, 2.247e-01, -5.854e-02, 1.200e-02, -1.406e-01, 4.393e-01, -8.517e-02, 3.281e-02, -1.177e-01, -1.861e-01, -3.241e-01, -2.918e-02));
r += mul(s0_8, M4(-3.015e-02, -1.605e-01, -1.001e-01, 7.795e-03, -5.873e-02, -7.686e-02, -1.448e-01, -1.851e-02, -2.172e-01, 1.977e-01, -1.333e-01, -8.894e-02, -8.939e-03, 1.675e-01, -7.976e-03, 4.020e-02));
// Non-positive part of the l0 taps.
r += mul(s1_0, M4(1.165e-01, -4.833e-02, 4.750e-02, -4.032e-02, -2.287e-02, -4.825e-02, 9.058e-02, 2.136e-01, 1.009e-01, -2.133e-02, 4.162e-02, -6.816e-02, -9.863e-02, -4.160e-03, -2.467e-02, -9.096e-02));
r += mul(s1_1, M4(8.597e-02, -2.205e-01, 1.515e-01, -2.918e-02, -1.099e-01, -4.171e-02, 3.893e-04, -5.273e-03, -2.046e-02, -3.905e-03, 7.793e-04, 5.930e-02, 2.653e-02, -2.546e-01, -8.456e-02, -6.554e-02));
r += mul(s1_2, M4(-1.058e-01, 3.302e-01, 1.812e-01, 6.427e-02, -4.601e-02, -1.589e-02, 4.405e-02, -1.366e-02, -5.996e-03, -5.402e-04, 3.237e-02, -5.725e-02, -7.486e-02, 1.358e-01, 4.739e-02, -2.432e-02));
r += mul(s1_3, M4(3.333e-02, 5.179e-01, -1.939e-03, 7.798e-02, 2.011e-02, -2.959e-01, 1.135e-01, 3.122e-01, 8.651e-02, -2.708e-02, 7.183e-03, 4.554e-02, -3.342e-02, 9.136e-03, -7.067e-02, -1.867e-01));
r += mul(s1_4, M4(6.231e-01, 9.512e-01, 3.523e-01, 3.744e-01, 2.388e-01, -2.827e-01, 9.968e-02, -5.306e-02, -4.498e-02, -2.222e-01, -5.865e-02, 2.967e-02, -3.029e-01, -2.137e-01, -5.363e-01, 8.872e-02));
r += mul(s1_5, M4(-4.862e-02, 7.326e-01, 1.354e-01, 5.607e-02, 1.667e-01, -1.184e-01, -1.304e-01, 6.817e-02, 3.287e-02, 3.310e-01, 1.521e-01, -3.212e-02, -8.947e-02, 4.250e-02, -9.770e-02, -8.344e-02));
r += mul(s1_6, M4(-9.242e-04, 4.835e-03, 1.322e-01, 3.745e-02, 9.613e-02, -8.310e-03, 4.718e-02, 2.763e-02, -1.616e-02, 6.167e-02, -3.382e-02, 3.624e-02, 1.213e-02, -2.014e-01, -2.776e-03, 4.360e-02));
r += mul(s1_7, M4(-6.861e-02, 4.772e-02, -3.779e-02, 7.567e-02, -8.548e-02, -1.028e-02, 1.881e-02, 2.421e-03, 1.378e-01, 1.305e-01, 2.177e-02, -1.118e-03, 5.861e-02, -1.416e-01, -3.140e-01, -9.031e-02));
r += mul(s1_8, M4(-4.147e-02, 1.546e-01, 5.650e-02, 4.098e-02, -1.460e-01, -5.779e-02, -1.959e-02, -2.318e-02, 3.538e-02, -5.044e-02, 3.304e-02, -3.517e-03, -1.176e-01, -3.185e-01, -1.738e-01, -4.349e-02));
// Non-negative part of the l1 taps.
r += mul(s2_0, M4(-3.428e-03, 6.059e-02, 7.024e-02, 2.739e-02, 1.313e-02, -5.748e-02, 9.005e-03, -7.139e-03, 1.165e-01, -1.541e-01, 1.493e-01, 2.725e-01, 3.254e-02, -2.934e-02, 1.115e-02, -2.844e-02));
r += mul(s2_1, M4(-8.601e-03, -3.177e-03, 1.878e-01, 1.106e-01, 1.951e-02, 8.194e-02, 4.971e-02, 5.805e-02, 2.515e-02, -2.529e-01, -2.250e-01, 3.498e-02, 7.183e-02, -8.617e-02, -8.616e-02, 1.623e-01));
r += mul(s2_2, M4(-8.072e-02, -1.234e-01, 3.482e-02, -2.873e-02, -4.049e-02, 4.828e-03, 1.940e-02, 3.828e-02, -5.156e-03, 4.585e-03, 2.326e-02, 2.346e-02, -8.908e-02, -1.384e-03, -2.366e-02, 1.290e-02));
r += mul(s2_3, M4(4.921e-02, 1.726e-01, 3.832e-02, -2.490e-01, -1.152e-01, -1.722e-01, -1.705e-01, 4.228e-01, -8.215e-02, -1.478e-02, 1.554e-01, 3.701e-01, -8.863e-02, 1.068e-01, 8.890e-03, 6.324e-02));
r += mul(s2_4, M4(1.307e-01, 2.312e-01, -1.734e-01, 2.083e-02, -1.966e-01, -3.991e-01, -8.681e-02, 1.976e-03, -3.177e-01, 1.528e-01, -2.329e-01, 2.569e-01, -6.230e-03, 6.020e-02, 4.969e-02, -2.039e-01));
r += mul(s2_5, M4(1.660e-01, 1.642e-02, 7.203e-02, -1.613e-01, 6.225e-02, 6.470e-02, 3.305e-03, 2.230e-02, -2.455e-02, 6.599e-02, -1.740e-01, 7.887e-02, 3.463e-03, 1.003e-01, -1.850e-01, 7.885e-02));
r += mul(s2_6, M4(-2.170e-02, 1.372e-01, 7.445e-02, -9.419e-02, -1.851e-01, 4.957e-02, -2.454e-01, 5.879e-02, -5.800e-02, -1.122e-01, 7.445e-02, 1.190e-01, 2.695e-02, -5.701e-02, -5.166e-02, -5.058e-02));
r += mul(s2_7, M4(5.390e-01, 1.674e-01, 1.213e-01, -1.147e-01, -6.939e-02, -1.218e-01, -2.891e-01, 2.682e-02, -2.636e-01, -1.104e-01, -1.556e-01, 3.774e-02, -4.121e-02, -2.431e-01, -1.248e-01, 1.275e-01));
r += mul(s2_8, M4(1.053e-01, 2.238e-01, -1.104e-01, 5.372e-02, 6.179e-02, -2.431e-03, -4.843e-02, 3.820e-02, -7.539e-02, 7.898e-02, 7.562e-03, 1.596e-02, 7.298e-02, -1.553e-01, -3.545e-01, 1.990e-02));
// Non-positive part of the l1 taps.
r += mul(s3_0, M4(8.232e-02, -6.815e-02, -7.421e-02, -3.191e-02, -1.592e-01, 2.814e-01, 5.009e-02, 3.669e-02, -5.908e-02, -5.445e-02, 4.873e-02, 1.538e-01, 1.065e-01, -2.194e-01, -2.612e-02, -2.297e-02));
r += mul(s3_1, M4(1.431e-02, -7.835e-02, -2.790e-03, 9.305e-02, -2.975e-01, 1.527e-01, 1.888e-01, -1.279e-02, -1.938e-02, -1.022e-01, -2.197e-02, -2.919e-02, 2.192e-01, -8.056e-02, 1.328e-03, 3.478e-02));
r += mul(s3_2, M4(4.920e-03, -6.286e-02, -7.779e-02, 1.075e-01, -1.092e-01, 2.909e-01, 3.056e-01, -9.017e-02, -3.625e-02, 1.079e-01, 1.107e-01, 6.613e-02, 1.696e-01, -1.852e-01, -1.253e-01, -9.675e-02));
r += mul(s3_3, M4(-6.350e-02, 1.137e-01, -3.559e-02, -1.684e-01, -2.044e-01, -9.368e-02, 2.283e-01, 8.052e-01, 4.476e-03, -1.599e-01, 2.594e-02, 1.582e-01, -2.483e-02, 9.216e-02, 5.719e-02, 2.237e-01));
r += mul(s3_4, M4(-1.694e-01, 1.597e-01, -3.311e-01, 1.880e-01, 2.614e-01, -2.584e-01, 5.296e-02, 9.726e-02, -3.932e-02, -7.518e-02, -1.749e-01, 1.604e-01, 1.008e-01, 2.920e-01, 5.358e-01, -6.383e-01));
r += mul(s3_5, M4(-2.706e-01, -2.716e-01, -4.196e-01, 1.023e-01, 2.201e-01, -1.412e-01, 1.003e-01, -6.972e-02, 3.727e-02, -8.424e-02, -7.870e-02, 2.294e-02, 2.836e-01, -4.165e-01, -2.974e-01, -3.567e-02));
r += mul(s3_6, M4(-3.434e-02, 6.420e-02, -8.729e-02, -8.600e-02, -2.041e-01, 1.646e-02, 9.025e-02, 1.724e-01, -4.951e-02, -3.894e-02, -7.985e-02, 1.580e-02, 2.554e-01, -3.100e-01, -2.769e-01, 8.336e-05));
r += mul(s3_7, M4(-6.557e-02, 3.865e-02, -3.263e-02, 4.621e-02, -2.077e-01, 2.705e-02, -3.354e-01, 1.480e-01, 4.155e-02, -2.143e-01, -2.626e-01, 1.091e-02, 1.382e-01, -1.706e-01, -1.355e-01, -7.700e-02));
r += mul(s3_8, M4(-2.004e-01, 4.575e-01, -1.812e-01, 6.102e-02, 3.469e-01, -6.634e-02, 1.302e-01, -9.621e-02, 4.023e-02, 1.048e-01, -9.194e-02, 5.130e-03, 4.272e-01, -5.971e-01, -2.025e-01, -1.364e-01));
// Constant term added once after all products (bias).
r += V4(3.575e-03, 3.041e-03, 1.241e-02, -2.230e-03);
return r;
}
// Pass 3 (conv2), second output: same structure as this pass's f0 but with its own
// weight constants and bias.
// Inputs, as supplied by Pass3:
//   s0_k = max(l0 tap k, 0)      s1_k = -max(-(l0 tap k), 0)   (sign split of t2)
//   s2_k = max(l1 tap k, 0)      s3_k = -max(-(l1 tap k), 0)   (sign split of t3)
//   k = 0..8 walks the 3x3 neighbourhood row by row, from offset (-1,-1) to (1,1).
// Every input is multiplied by its own M4 of 16 constants with mul() and the products
// are accumulated in the order listed; keep that order, since floating-point sums are
// order-sensitive.
// Pass3 stores the returned value to t1[gxy].
// NOTE(review): V4 and M4 are declared outside this chunk - presumably a 4-component
// vector and a 4x4 matrix; the constants look like trained network weights - confirm.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// Non-negative part of the l0 taps.
r += mul(s0_0, M4(-1.437e-01, -9.784e-02, 2.649e-01, -8.638e-02, -1.746e-01, 2.031e-01, 1.203e-01, -8.812e-02, -2.317e-01, 2.311e-01, 3.171e-02, -3.619e-02, -7.798e-02, -2.507e-02, 1.902e-01, 5.780e-02));
r += mul(s0_1, M4(2.504e-01, 1.577e-01, -5.397e-02, 4.599e-01, -1.392e-01, 2.560e-01, 1.018e-01, 7.968e-02, 2.247e-01, -2.962e-03, 1.421e-03, -1.201e-01, -3.622e-01, 1.378e-01, 1.392e-01, 1.641e-01));
r += mul(s0_2, M4(-6.143e-02, -6.336e-02, 1.131e-01, 6.811e-02, -5.817e-02, 7.362e-02, 1.407e-01, 1.823e-02, 4.880e-01, -2.282e-01, -2.704e-01, -4.287e-01, -2.741e-01, 3.163e-02, 1.098e-01, 1.514e-01));
r += mul(s0_3, M4(1.794e-01, 1.720e-01, -4.092e-01, 1.277e-01, -1.938e-01, 3.107e-01, 2.915e-01, 2.279e-01, 2.259e-01, 2.136e-01, 5.867e-02, 2.359e-01, -1.589e-01, 1.132e-01, 6.871e-02, 2.837e-01));
r += mul(s0_4, M4(-3.070e-01, -4.494e-01, 5.817e-02, 5.153e-01, 5.215e-01, 5.410e-01, 1.286e-01, -5.596e-01, 4.287e-01, 1.821e-01, 1.542e-01, 3.755e-01, 3.820e-01, 2.953e-01, -2.768e-01, -6.977e-02));
r += mul(s0_5, M4(-4.881e-02, 2.327e-02, 9.209e-02, -2.102e-02, -1.394e-01, -8.093e-03, 2.263e-01, -4.307e-01, 1.998e-01, -8.793e-02, -1.057e-01, -1.899e-01, 1.577e-01, 3.435e-01, 6.721e-02, 3.093e-01));
r += mul(s0_6, M4(7.516e-02, -1.224e-01, 1.257e-02, -6.769e-02, -8.618e-02, 1.283e-01, 2.060e-01, -1.966e-01, 8.166e-02, -1.263e-01, -2.269e-01, -3.272e-01, -3.439e-02, -2.849e-01, 2.105e-01, -3.015e-03));
r += mul(s0_7, M4(7.447e-02, -8.731e-02, 2.804e-02, -4.819e-02, -3.311e-01, 3.824e-01, 7.766e-02, 5.672e-02, 4.014e-01, -4.037e-03, 2.287e-01, 5.626e-02, 3.481e-01, -1.010e-01, -1.156e-01, -2.865e-01));
r += mul(s0_8, M4(5.454e-02, -5.590e-02, 3.408e-02, 3.551e-03, 1.262e-02, 8.638e-02, 1.222e-01, 3.418e-01, -2.154e-01, 1.868e-01, 1.210e-01, -2.330e-01, -4.810e-02, -5.190e-02, -8.587e-02, -2.145e-01));
// Non-positive part of the l0 taps.
r += mul(s1_0, M4(-3.063e-01, -1.830e-02, 5.167e-01, 4.813e-02, -7.310e-02, 1.443e-01, 1.654e-01, 1.158e-01, 4.789e-02, -3.030e-02, -1.358e-01, 2.986e-02, -4.855e-02, -7.736e-02, 4.514e-01, -1.797e-02));
r += mul(s1_1, M4(4.322e-01, -1.369e-01, 9.431e-02, 3.921e-01, 2.708e-02, -1.218e-02, -9.091e-02, 1.871e-01, 3.763e-02, -9.213e-02, -1.209e-01, -1.587e-01, 3.014e-03, 1.816e-01, 3.099e-01, 3.210e-01));
r += mul(s1_2, M4(-7.234e-02, 1.685e-02, 4.444e-01, -1.886e-01, -9.543e-03, 3.966e-02, 1.105e-01, 4.870e-02, 9.471e-02, -5.263e-02, -1.085e-01, 4.226e-02, -1.565e-01, -3.812e-02, 1.708e-01, 1.457e-01));
r += mul(s1_3, M4(2.370e-01, -3.354e-02, -9.648e-02, 1.531e-01, -3.468e-01, -3.957e-02, 3.152e-01, 3.402e-02, 3.762e-02, 9.507e-02, 7.836e-02, 9.088e-03, -1.614e-01, 4.377e-02, 4.748e-02, 1.055e-01));
r += mul(s1_4, M4(2.342e-01, -5.059e-01, 2.781e-01, 2.906e-01, 1.656e-01, 1.268e-01, 1.183e-01, -2.458e-02, 2.290e-01, 1.779e-01, -8.310e-02, 1.389e-01, 7.282e-02, 1.050e-01, -3.525e-01, 6.810e-02));
r += mul(s1_5, M4(1.078e-01, -4.451e-02, 7.031e-02, -2.977e-01, 3.596e-02, 3.359e-02, 9.589e-03, 9.070e-02, -1.862e-01, -1.863e-01, -9.652e-02, -5.039e-02, 1.004e-01, 1.598e-01, 1.466e-01, 2.349e-01));
r += mul(s1_6, M4(1.109e-02, -1.607e-01, 1.578e-02, -1.971e-01, 5.020e-02, -7.597e-02, 7.238e-02, 7.241e-02, 2.025e-02, -2.246e-02, 4.652e-02, -8.760e-02, -1.111e-02, 1.890e-02, 1.046e-01, -2.233e-03));
r += mul(s1_7, M4(1.252e-01, -8.046e-02, -1.321e-01, -3.724e-01, -1.383e-01, 1.151e-01, 5.397e-02, -1.422e-01, 8.319e-02, 9.089e-02, -2.620e-02, 1.662e-01, 2.847e-02, -1.255e-01, 6.933e-02, -1.636e-01));
r += mul(s1_8, M4(-1.517e-01, 3.661e-02, -3.135e-01, -3.395e-01, -1.139e-01, 1.973e-01, 8.547e-03, -3.118e-02, -8.869e-02, -1.209e-01, 1.867e-02, -4.531e-02, 1.016e-01, -6.909e-02, 1.436e-01, 1.663e-01));
// Non-negative part of the l1 taps.
r += mul(s2_0, M4(-9.314e-02, 1.395e-02, -1.741e-02, -7.208e-02, -5.164e-02, -5.743e-02, 5.702e-02, 1.342e-01, 6.011e-03, 1.626e-01, 1.101e-01, -1.130e-01, 6.127e-02, -8.956e-03, -7.149e-02, -6.488e-03));
r += mul(s2_1, M4(-2.534e-01, 1.086e-01, -1.007e-01, -3.067e-02, -1.074e-01, 7.219e-03, 6.768e-02, -1.012e-01, 2.019e-01, 4.263e-03, -7.411e-02, -1.173e-01, 1.961e-01, -5.619e-02, -2.390e-01, -1.323e-01));
r += mul(s2_2, M4(-1.039e-01, -9.899e-02, -2.206e-01, -2.187e-01, -8.739e-03, 6.607e-02, 4.125e-02, 5.363e-02, -6.572e-03, 3.014e-02, 1.314e-01, -9.560e-02, 2.106e-01, 1.237e-02, -8.354e-02, -4.939e-03));
r += mul(s2_3, M4(-4.682e-02, -1.357e-01, 3.481e-02, -2.187e-01, 1.113e-01, 8.812e-02, -1.211e-01, -2.011e-02, 1.567e-01, -2.216e-02, -4.920e-03, -2.458e-01, 2.263e-02, 6.741e-02, -1.234e-02, 2.338e-02));
r += mul(s2_4, M4(5.105e-02, -3.845e-01, 1.812e-01, -1.927e-01, 2.840e-01, -2.094e-01, 5.673e-02, 4.405e-02, 5.957e-01, 1.734e-02, -1.158e-01, -6.956e-01, -2.077e-01, 5.130e-03, 4.744e-01, -1.540e-02));
r += mul(s2_5, M4(1.601e-01, -2.680e-01, -1.678e-01, -1.207e-01, -4.648e-02, -6.454e-02, 1.122e-01, -6.567e-02, 1.638e-01, -1.259e-01, -2.470e-02, -3.547e-01, -1.333e-01, -1.219e-02, -7.710e-02, -3.881e-01));
r += mul(s2_6, M4(-6.060e-02, 1.662e-01, -2.082e-01, 3.193e-01, -1.317e-01, 1.395e-04, 2.436e-01, -1.480e-01, 6.104e-03, -2.009e-01, -6.729e-02, -2.207e-01, -7.784e-02, -7.589e-02, 7.569e-02, 3.261e-03));
r += mul(s2_7, M4(-2.951e-01, -2.050e-01, 2.827e-02, 3.739e-01, 1.947e-01, 5.411e-01, -2.262e-01, -8.808e-03, 2.262e-01, -9.010e-02, -1.476e-01, -3.582e-01, -1.718e-01, 2.844e-02, 7.832e-02, 1.414e-03));
r += mul(s2_8, M4(3.534e-01, 1.695e-01, -1.247e-01, 4.750e-01, 4.171e-02, 2.338e-02, -4.525e-02, -4.955e-02, 2.934e-01, -3.865e-02, -1.125e-01, -2.127e-01, 1.326e-01, 5.967e-02, 6.215e-02, 1.048e-01));
// Non-positive part of the l1 taps.
r += mul(s3_0, M4(4.186e-02, -5.378e-02, 7.641e-02, -3.524e-02, -2.447e-01, -5.374e-02, -1.380e-01, -4.221e-01, -3.797e-02, -7.623e-03, -4.826e-02, 1.791e-02, -1.390e-01, 1.115e-01, 2.252e-01, -9.103e-03));
r += mul(s3_1, M4(1.339e-01, 3.093e-01, -3.615e-02, 8.684e-02, -4.098e-01, -1.216e-01, 2.372e-01, -1.247e-01, -5.358e-02, -1.660e-01, -8.435e-02, 3.871e-02, 2.722e-01, -1.145e-01, -3.944e-01, -5.003e-02));
r += mul(s3_2, M4(-4.430e-02, -3.135e-02, 1.019e-01, -1.129e-01, -2.647e-01, -1.317e-01, 8.715e-02, -5.466e-02, -3.946e-02, 7.216e-02, 1.677e-01, 9.349e-02, 8.069e-02, -1.097e-01, -9.659e-03, -8.460e-02));
r += mul(s3_3, M4(-5.036e-03, 4.992e-02, 1.086e-01, -1.339e-02, 2.792e-01, 3.294e-01, -1.578e-01, 4.592e-01, -7.749e-02, 4.384e-02, -4.212e-02, 2.287e-02, 1.456e-01, 4.774e-02, -1.264e-01, 7.437e-02));
r += mul(s3_4, M4(3.022e-01, -2.197e-01, -4.347e-02, -2.198e-01, 3.922e-02, 8.609e-02, 8.862e-02, 3.418e-01, 8.117e-02, -2.026e-02, -3.236e-01, -2.539e-01, -6.030e-02, -2.409e-01, 7.879e-02, -8.457e-02));
r += mul(s3_5, M4(3.525e-01, 2.622e-01, -4.994e-02, -1.932e-01, -1.508e-01, 1.229e-01, 1.359e-01, 1.613e-01, 1.830e-01, -4.473e-02, -5.438e-02, -1.041e-01, 4.534e-01, -4.660e-01, -7.405e-02, -1.001e-01));
r += mul(s3_6, M4(-1.224e-02, -5.840e-03, 8.031e-02, -2.279e-02, -2.128e-01, 1.477e-01, -9.937e-03, 4.142e-02, -3.726e-02, -1.013e-01, -2.940e-03, -1.333e-01, 1.353e-01, -2.192e-01, -3.858e-01, -1.100e-01));
r += mul(s3_7, M4(-8.882e-02, 1.341e-01, 2.707e-01, 2.212e-01, 2.628e-01, 3.454e-01, -3.703e-01, 4.902e-01, 1.527e-01, 8.567e-03, -1.742e-01, -1.884e-01, -7.710e-01, 1.028e-01, 3.233e-01, -3.897e-01));
r += mul(s3_8, M4(3.715e-02, 2.936e-01, -1.195e-01, -1.295e-01, -1.313e-01, -1.222e-01, -2.876e-01, 5.694e-02, 6.813e-02, -1.738e-02, -1.154e-01, 1.649e-02, 1.755e-01, -1.639e-01, 3.212e-02, 3.504e-01));
// Constant term added once after all products (bias).
r += V4(-8.611e-03, -6.529e-03, -1.098e-03, 4.669e-03);
return r;
}
// Pass 3 body for one thread: reads a 3x3 neighbourhood of t2 (via l0) and of
// t3 (via l1), splits every tap into a non-negative part max(v, 0) and a
// non-positive part -max(-v, 0), and stores f0(...) to t0 and f1(...) to t1 at
// the thread's pixel.
// NOTE(review): Rmp8x8, GetInputPt, GetInputSize and the O macro behind l0/l1
// are defined elsewhere; presumably gxy is the pixel coordinate and pos its
// centre in normalised texture space - confirm against those definitions.
void Pass3(uint2 blockStart, uint3 tid) {
	// pt, pos, gxy and size keep these exact names on purpose: l0/l1 expand to
	// O(...), which is defined outside this function and may refer to them.
	uint2 size = GetInputSize();
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);

	// Threads that fall outside the input do nothing.
	bool inside = gxy.x < size.x && gxy.y < size.y;
	if (!inside) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps of t2, row by row from offset (-1,-1) to (1,1).
	V4 rawA_0 = l0(-1.0, -1.0), rawA_1 = l0(0.0, -1.0), rawA_2 = l0(1.0, -1.0);
	V4 rawA_3 = l0(-1.0, 0.0), rawA_4 = l0(0.0, 0.0), rawA_5 = l0(1.0, 0.0);
	V4 rawA_6 = l0(-1.0, 1.0), rawA_7 = l0(0.0, 1.0), rawA_8 = l0(1.0, 1.0);

	// Raw taps of t3, same ordering.
	V4 rawB_0 = l1(-1.0, -1.0), rawB_1 = l1(0.0, -1.0), rawB_2 = l1(1.0, -1.0);
	V4 rawB_3 = l1(-1.0, 0.0), rawB_4 = l1(0.0, 0.0), rawB_5 = l1(1.0, 0.0);
	V4 rawB_6 = l1(-1.0, 1.0), rawB_7 = l1(0.0, 1.0), rawB_8 = l1(1.0, 1.0);

	// Sign split of the t2 taps.
	V4 plusA_0 = max(rawA_0, 0.0), minusA_0 = -max(-rawA_0, 0.0);
	V4 plusA_1 = max(rawA_1, 0.0), minusA_1 = -max(-rawA_1, 0.0);
	V4 plusA_2 = max(rawA_2, 0.0), minusA_2 = -max(-rawA_2, 0.0);
	V4 plusA_3 = max(rawA_3, 0.0), minusA_3 = -max(-rawA_3, 0.0);
	V4 plusA_4 = max(rawA_4, 0.0), minusA_4 = -max(-rawA_4, 0.0);
	V4 plusA_5 = max(rawA_5, 0.0), minusA_5 = -max(-rawA_5, 0.0);
	V4 plusA_6 = max(rawA_6, 0.0), minusA_6 = -max(-rawA_6, 0.0);
	V4 plusA_7 = max(rawA_7, 0.0), minusA_7 = -max(-rawA_7, 0.0);
	V4 plusA_8 = max(rawA_8, 0.0), minusA_8 = -max(-rawA_8, 0.0);

	// Sign split of the t3 taps.
	V4 plusB_0 = max(rawB_0, 0.0), minusB_0 = -max(-rawB_0, 0.0);
	V4 plusB_1 = max(rawB_1, 0.0), minusB_1 = -max(-rawB_1, 0.0);
	V4 plusB_2 = max(rawB_2, 0.0), minusB_2 = -max(-rawB_2, 0.0);
	V4 plusB_3 = max(rawB_3, 0.0), minusB_3 = -max(-rawB_3, 0.0);
	V4 plusB_4 = max(rawB_4, 0.0), minusB_4 = -max(-rawB_4, 0.0);
	V4 plusB_5 = max(rawB_5, 0.0), minusB_5 = -max(-rawB_5, 0.0);
	V4 plusB_6 = max(rawB_6, 0.0), minusB_6 = -max(-rawB_6, 0.0);
	V4 plusB_7 = max(rawB_7, 0.0), minusB_7 = -max(-rawB_7, 0.0);
	V4 plusB_8 = max(rawB_8, 0.0), minusB_8 = -max(-rawB_8, 0.0);

	// Argument order expected by f0/f1: plus(A), minus(A), plus(B), minus(B).
	t0[gxy] = f0(
		plusA_0, plusA_1, plusA_2, plusA_3, plusA_4, plusA_5, plusA_6, plusA_7, plusA_8,
		minusA_0, minusA_1, minusA_2, minusA_3, minusA_4, minusA_5, minusA_6, minusA_7, minusA_8,
		plusB_0, plusB_1, plusB_2, plusB_3, plusB_4, plusB_5, plusB_6, plusB_7, plusB_8,
		minusB_0, minusB_1, minusB_2, minusB_3, minusB_4, minusB_5, minusB_6, minusB_7, minusB_8);
	t1[gxy] = f1(
		plusA_0, plusA_1, plusA_2, plusA_3, plusA_4, plusA_5, plusA_6, plusA_7, plusA_8,
		minusA_0, minusA_1, minusA_2, minusA_3, minusA_4, minusA_5, minusA_6, minusA_7, minusA_8,
		plusB_0, plusB_1, plusB_2, plusB_3, plusB_4, plusB_5, plusB_6, plusB_7, plusB_8,
		minusB_0, minusB_1, minusB_2, minusB_3, minusB_4, minusB_5, minusB_6, minusB_7, minusB_8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// Sampling shorthands for pass 4: l0 reads t0 and l1 reads t1 (this pass's //!IN
// textures) through the O macro with a float2(x, y) offset, then builds a V4 from
// the result.
// NOTE(review): O is defined outside this chunk; presumably the offset is in texels
// relative to the current pixel - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// conv3, first output vector: accumulates mul(s, M4(...)) over all 36 input
// vectors, in argument order, then adds a constant V4 and returns the sum.
// As called from Pass4: s0_*/s1_* are the max(x, 0) and -max(-x, 0) parts of
// the nine t0 taps, s2_*/s3_* the same for the nine t1 taps; index 0..8 walks
// the 3x3 offsets row by row from (-1, -1) to (1, 1). Pass4 stores the result
// in t2[gxy].
// NOTE(review): V4/M4 are defined outside this chunk. The coefficients look
// machine-generated (presumably trained weights) - do not hand-edit; confirm
// against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// One 16-coefficient matrix per input vector.
r += mul(s0_0, M4(7.356e-02, 8.402e-03, 1.287e-01, 6.762e-02, 2.134e-01, -6.620e-02, -2.788e-01, -5.744e-02, -3.896e-02, -3.993e-02, -7.161e-02, -1.982e-01, -6.734e-02, 8.804e-03, -4.739e-02, 6.502e-02));
r += mul(s0_1, M4(2.249e-01, 4.958e-02, 1.138e-01, 3.152e-01, 2.008e-01, 1.703e-01, 5.817e-02, -9.482e-02, -2.371e-01, 3.975e-02, -1.755e-01, -2.666e-01, 2.819e-01, -2.640e-02, 1.405e-01, -6.009e-02));
r += mul(s0_2, M4(2.065e-01, -3.027e-02, -3.447e-02, 3.226e-03, -1.252e-02, -7.589e-03, 2.344e-03, -1.704e-02, -8.894e-02, 3.136e-02, -1.517e-01, -2.176e-02, 8.920e-02, -5.322e-02, -9.529e-02, 8.355e-02));
r += mul(s0_3, M4(1.136e-01, 1.015e-01, -2.730e-02, -2.144e-01, -9.526e-02, -2.857e-01, 2.711e-01, -1.991e-01, 2.596e-01, 1.602e-01, -2.169e-01, -1.097e-01, 3.353e-02, 6.231e-02, 8.753e-03, 3.707e-01));
r += mul(s0_4, M4(-1.945e-01, 3.081e-01, -2.270e-01, -5.963e-02, -1.666e-01, -3.408e-01, 1.161e-01, -6.384e-02, -6.823e-01, -4.014e-01, -6.276e-01, -1.672e-01, 2.986e-03, -1.351e-01, 1.668e-01, -3.133e-01));
r += mul(s0_5, M4(4.802e-02, -4.275e-02, 1.978e-03, -7.602e-02, -4.082e-03, 5.572e-02, -3.341e-02, 9.101e-03, -1.038e-01, 1.622e-01, 2.334e-02, 1.768e-01, 9.416e-03, -2.287e-01, 1.048e-01, -2.926e-01));
r += mul(s0_6, M4(5.333e-04, 3.089e-02, 2.721e-02, -3.601e-02, -5.081e-02, -1.152e-01, 6.752e-02, 1.701e-01, -2.951e-02, 2.450e-01, -1.684e-01, -4.702e-02, -1.580e-02, 1.200e-02, -1.266e-02, 4.937e-02));
r += mul(s0_7, M4(-1.351e-02, -6.248e-02, -3.060e-03, 4.140e-02, -2.090e-01, -6.831e-01, -8.857e-02, 2.536e-01, -2.333e-02, 1.521e-01, -8.033e-02, 2.124e-01, -6.615e-02, 1.317e-01, 1.847e-01, -2.150e-01));
r += mul(s0_8, M4(4.605e-02, 1.013e-01, 6.834e-03, -6.411e-02, -1.476e-02, -2.845e-01, -4.312e-02, -1.171e-02, 6.985e-02, -6.859e-02, -2.785e-02, -3.226e-02, 5.186e-02, 1.102e-01, -2.071e-02, -1.250e-01));
r += mul(s1_0, M4(1.952e-01, -3.342e-02, -3.770e-02, -2.026e-01, 4.850e-02, -3.174e-02, -1.987e-01, -2.886e-02, -1.298e-01, 1.994e-02, 1.131e-01, 2.950e-02, -1.791e-02, -4.533e-02, 4.695e-02, -6.907e-02));
r += mul(s1_1, M4(2.401e-01, 1.809e-01, -5.151e-02, -6.271e-02, -1.409e-01, 9.215e-03, 1.176e-01, 2.717e-02, 1.130e-01, -3.228e-02, -9.086e-02, -1.202e-03, 1.642e-03, -7.943e-03, 1.097e-01, 1.842e-01));
r += mul(s1_2, M4(8.774e-02, -1.486e-02, -4.808e-02, 4.089e-02, 6.244e-02, -7.645e-02, 5.614e-02, -5.706e-02, -2.386e-02, 4.407e-02, -1.378e-01, -5.880e-02, 2.936e-02, 2.285e-02, -3.924e-02, 5.724e-02));
r += mul(s1_3, M4(2.603e-01, -1.455e-01, 1.429e-01, -2.992e-02, -6.288e-02, -5.216e-02, -1.802e-01, 1.060e-01, -2.473e-02, -6.795e-03, 2.843e-02, 7.745e-02, -4.868e-03, -9.998e-02, -7.961e-02, 5.068e-02));
r += mul(s1_4, M4(2.018e-01, -1.293e-01, -5.291e-02, -4.763e-02, 3.484e-02, -1.648e-01, 8.786e-02, -6.101e-02, -1.083e-01, 5.522e-02, -1.814e-01, -2.392e-01, 6.427e-02, -1.908e-02, 2.643e-01, 1.294e-01));
r += mul(s1_5, M4(-7.897e-02, -5.967e-02, -2.620e-01, 1.274e-02, -2.583e-02, 5.654e-02, -7.639e-02, -7.534e-03, -5.812e-02, -7.887e-02, -3.738e-03, 7.664e-02, 1.753e-02, -2.842e-01, -3.237e-01, 2.077e-02));
r += mul(s1_6, M4(6.558e-02, -9.890e-02, 1.849e-02, 3.242e-04, 1.021e-02, 1.234e-01, 1.224e-02, -4.322e-02, -2.778e-02, 3.860e-02, -5.257e-02, -1.466e-02, -1.001e-02, -1.291e-03, 1.724e-01, -9.167e-02));
r += mul(s1_7, M4(-5.291e-02, -2.764e-01, -6.402e-02, 4.327e-02, 1.921e-02, -1.484e-01, 3.286e-02, 4.051e-02, 1.636e-02, 3.932e-01, -5.432e-02, 4.540e-02, 3.947e-02, -1.385e-01, -1.065e-01, 1.569e-01));
r += mul(s1_8, M4(-1.729e-02, 8.177e-02, -4.479e-02, -1.275e-01, -3.302e-03, -1.265e-01, -2.922e-02, 3.720e-02, 1.560e-02, 5.266e-02, -1.572e-02, -4.840e-02, 3.991e-03, 1.003e-01, -1.423e-01, 7.414e-02));
r += mul(s2_0, M4(-1.207e-02, -2.418e-02, -7.769e-03, -1.401e-01, 1.660e-01, -6.347e-03, -1.092e-02, -1.830e-02, -1.252e-01, -5.217e-02, 9.898e-03, 1.461e-02, 2.654e-02, 1.219e-02, -3.769e-02, 1.897e-02));
r += mul(s2_1, M4(-3.650e-02, 1.317e-01, 1.299e-02, -5.512e-02, -1.287e-01, 2.438e-02, -1.609e-03, 1.759e-01, 1.824e-02, 6.477e-03, 2.905e-02, -8.644e-02, 7.496e-02, -9.920e-02, 1.147e-02, 1.889e-01));
r += mul(s2_2, M4(-7.005e-03, -4.482e-02, -1.853e-02, 3.441e-02, 1.251e-01, -3.162e-02, -1.701e-01, -5.231e-02, -1.647e-01, 2.261e-02, 8.255e-02, -3.730e-02, 1.811e-01, -9.052e-02, 1.728e-02, 1.911e-02));
r += mul(s2_3, M4(2.359e-02, -1.334e-01, 2.761e-02, -1.251e-01, 1.455e-01, 4.076e-02, -3.260e-02, -1.782e-01, -3.575e-02, 1.411e-02, 1.322e-01, -9.592e-02, 5.423e-02, 7.989e-03, -1.460e-01, 8.895e-02));
r += mul(s2_4, M4(1.304e-01, 1.296e-01, -7.250e-02, -6.647e-02, 8.382e-02, 1.111e-01, 8.976e-02, -5.914e-02, -2.228e-01, -4.772e-02, -1.931e-03, 8.499e-02, 4.483e-01, 1.327e-01, 5.086e-02, -4.795e-01));
r += mul(s2_5, M4(4.674e-02, 7.104e-02, -5.312e-02, -7.730e-02, 2.647e-03, 8.893e-03, -8.889e-02, -5.714e-02, -4.546e-02, -4.002e-02, -1.514e-01, -2.989e-02, -8.669e-02, -5.441e-03, 1.460e-02, -2.327e-02));
r += mul(s2_6, M4(1.146e-01, -1.154e-01, 8.289e-03, 7.655e-02, -2.194e-02, -3.908e-02, -2.191e-02, 2.363e-03, 4.527e-02, -7.852e-02, -4.728e-02, 1.066e-01, 4.023e-02, -5.192e-02, -4.180e-02, -3.879e-02));
r += mul(s2_7, M4(2.446e-01, -2.295e-01, -5.819e-02, -2.646e-02, 8.106e-02, -8.799e-02, -3.455e-02, 6.900e-02, 5.579e-02, -1.551e-01, 1.609e-01, 9.954e-02, -1.499e-01, 8.628e-02, 1.114e-01, 1.313e-02));
r += mul(s2_8, M4(1.028e-02, 9.150e-02, -6.161e-02, 5.124e-03, 3.822e-02, 1.533e-02, 2.329e-02, -1.106e-01, -1.541e-03, -1.818e-01, -9.577e-02, -3.402e-02, 1.784e-02, -1.152e-01, 6.896e-02, -1.111e-01));
r += mul(s3_0, M4(-7.349e-02, -4.782e-02, 3.080e-02, -1.668e-01, 9.572e-02, 5.307e-02, 5.573e-03, 6.483e-02, 1.104e-01, -5.707e-02, -8.579e-02, -1.754e-02, 1.038e-01, 1.706e-02, -1.185e-01, 5.863e-02));
r += mul(s3_1, M4(-1.639e-01, -6.808e-03, 1.836e-02, -1.482e-01, 1.032e-01, 2.612e-02, -1.751e-01, -1.527e-01, 3.169e-03, 5.272e-02, 7.983e-02, 5.066e-02, 1.191e-01, 3.658e-02, 3.275e-02, -1.122e-01));
r += mul(s3_2, M4(-8.279e-02, -1.068e-02, 3.848e-02, -8.857e-03, -3.783e-02, 9.934e-02, -7.181e-02, 2.801e-02, -1.524e-01, -7.166e-02, 1.038e-01, -9.840e-04, -7.254e-03, -3.252e-02, -1.435e-02, 6.052e-03));
r += mul(s3_3, M4(-3.534e-02, -2.891e-02, 3.778e-01, -2.472e-01, -4.015e-02, -5.651e-02, 2.006e-01, 1.249e-02, -8.408e-02, -1.160e-02, 2.881e-01, -6.805e-03, 1.340e-02, -1.237e-01, -1.617e-01, 1.894e-02));
r += mul(s3_4, M4(-1.512e-02, 3.232e-01, -1.441e-01, -3.778e-01, -1.475e-01, -2.644e-03, -3.149e-01, 3.225e-02, 1.227e-01, -3.620e-02, -1.175e-01, -3.857e-01, 4.834e-02, -1.567e-01, 1.632e-01, -1.292e-01));
r += mul(s3_5, M4(-1.592e-01, 3.426e-02, -1.506e-01, 1.215e-01, 1.314e-01, -7.432e-02, -8.767e-02, 1.685e-01, 6.875e-02, 2.804e-01, -3.279e-02, -1.870e-01, 1.049e-01, -9.061e-02, 8.573e-02, -9.407e-02));
r += mul(s3_6, M4(5.310e-02, -1.089e-01, -1.496e-01, 2.134e-01, 5.599e-02, -1.565e-01, -6.842e-02, -1.362e-02, 6.861e-02, -2.548e-02, -1.614e-01, -3.698e-02, -2.731e-02, 1.138e-02, 1.288e-02, -1.789e-02));
r += mul(s3_7, M4(-7.967e-02, -2.461e-01, -2.139e-01, 3.193e-01, 1.377e-01, -1.213e-01, 8.415e-02, 1.224e-02, 1.192e-01, 1.785e-01, 1.978e-01, 1.008e-01, 3.016e-02, 9.868e-02, 3.118e-03, -3.294e-02));
r += mul(s3_8, M4(1.121e-01, -4.625e-02, 3.331e-02, -7.687e-02, 5.520e-02, 6.326e-02, 1.369e-02, 1.850e-02, 4.062e-02, -1.561e-01, -8.640e-02, 1.105e-01, 8.446e-03, -1.746e-03, 4.572e-02, -1.015e-01));
// Constant per-component offset, presumably the layer bias - TODO confirm.
r += V4(-1.057e-02, -1.114e-02, 1.597e-04, 1.132e-02);
return r;
}
// conv3, second output vector: same structure and argument list as this
// pass's f0 but with its own coefficient set. Accumulates mul(s, M4(...)) over
// all 36 input vectors, in argument order, then adds a constant V4.
// As called from Pass4, the arguments are identical to those passed to f0 and
// the result is stored in t3[gxy].
// NOTE(review): V4/M4 are defined outside this chunk. The coefficients look
// machine-generated (presumably trained weights) - do not hand-edit; confirm
// against the generator.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// One 16-coefficient matrix per input vector.
r += mul(s0_0, M4(2.399e-01, 1.190e-01, 9.941e-02, -5.908e-03, 2.176e-01, -3.861e-02, -4.997e-02, -3.036e-02, -6.079e-02, 2.294e-02, -1.260e-01, 6.001e-02, -7.690e-02, -4.805e-02, 6.117e-03, 4.358e-02));
r += mul(s0_1, M4(-4.669e-02, -1.150e-01, 9.700e-03, 2.351e-02, 3.215e-01, -1.737e-03, 2.091e-01, -1.245e-01, -8.592e-02, 1.866e-01, 2.826e-01, -6.728e-01, 1.528e-01, 5.511e-02, -4.930e-02, -1.959e-02));
r += mul(s0_2, M4(-2.182e-02, -4.512e-02, 6.864e-02, 8.299e-02, 8.483e-03, -4.855e-02, -1.500e-01, 1.325e-02, 6.098e-02, -1.867e-02, 1.276e-02, 1.721e-02, 5.918e-03, -1.130e-01, 7.066e-04, -1.824e-03));
r += mul(s0_3, M4(4.594e-02, 1.518e-01, -2.067e-01, 1.546e-02, -1.548e-02, 1.126e-01, -4.502e-03, -2.014e-02, 2.417e-01, -1.530e-01, -1.095e-01, -4.966e-02, 2.291e-01, -4.598e-03, 2.836e-01, 5.562e-02));
r += mul(s0_4, M4(5.432e-02, -3.003e-01, 7.389e-01, -1.497e-01, -2.439e-01, -3.298e-01, 4.081e-01, -2.105e-01, -4.267e-01, 3.913e-01, 5.470e-01, 5.594e-01, -1.221e-01, -5.444e-02, -4.180e-01, 1.515e-01));
r += mul(s0_5, M4(2.205e-01, -5.813e-03, 7.451e-03, 8.130e-02, -3.312e-02, -9.387e-02, -9.824e-02, 4.493e-02, 8.187e-02, -2.042e-01, 1.644e-01, 1.562e-01, -8.427e-02, 2.057e-01, -1.668e-02, -2.356e-01));
r += mul(s0_6, M4(1.150e-02, 1.442e-02, -1.973e-02, -4.599e-02, -9.680e-02, 3.962e-02, 1.731e-02, -2.402e-02, 3.936e-02, 6.512e-03, 2.103e-02, 2.025e-03, -1.308e-02, -5.259e-02, 5.631e-02, 3.037e-02));
r += mul(s0_7, M4(-1.306e-02, -3.164e-02, 1.196e-01, 2.798e-02, -2.533e-01, -1.204e-01, 1.860e-01, 1.564e-01, -4.731e-02, -7.323e-02, 1.441e-03, -9.049e-02, -3.371e-02, -2.801e-04, 2.952e-02, -2.632e-02));
r += mul(s0_8, M4(3.024e-02, -1.034e-02, -7.595e-02, -7.550e-02, 3.562e-02, -4.589e-02, -3.066e-02, 7.995e-02, -1.866e-02, 1.022e-01, -2.624e-02, -1.074e-01, 2.176e-02, 1.434e-01, -5.664e-02, -3.473e-02));
r += mul(s1_0, M4(2.252e-01, 9.801e-02, -5.786e-02, -6.661e-02, 7.599e-02, -9.244e-02, 4.437e-02, -1.203e-01, -1.577e-01, -3.797e-02, -1.335e-02, 4.540e-02, -3.540e-03, -9.094e-03, -4.076e-02, -8.099e-02));
r += mul(s1_1, M4(2.557e-01, -2.549e-01, 2.306e-01, -4.389e-02, -3.677e-02, 5.796e-02, 4.505e-02, -1.209e-01, -4.484e-02, 1.229e-01, -5.686e-02, 2.778e-02, 9.876e-02, -6.893e-04, 9.771e-02, 1.264e-01));
r += mul(s1_2, M4(-5.324e-02, -9.632e-02, -1.092e-02, -1.426e-02, 3.082e-02, 9.196e-02, -1.381e-01, -1.013e-01, 7.758e-03, -3.290e-02, 1.630e-02, -4.979e-03, -7.297e-02, -7.534e-02, 2.040e-02, -1.983e-01));
r += mul(s1_3, M4(1.951e-01, 3.566e-02, 4.220e-02, 8.086e-02, -5.114e-02, -5.626e-02, -6.912e-02, 1.462e-01, 2.268e-03, -2.592e-02, 3.527e-02, -3.832e-02, 4.756e-02, 1.234e-01, -5.494e-03, 4.695e-02));
r += mul(s1_4, M4(4.147e-01, -2.431e-01, 2.372e-01, 2.574e-04, -4.485e-02, 5.014e-02, 3.928e-02, -2.817e-02, 3.512e-01, 2.983e-01, -1.260e-01, 4.326e-01, -2.366e-01, -6.912e-02, 2.259e-01, -4.534e-01));
r += mul(s1_5, M4(1.323e-01, 5.260e-03, 2.693e-02, 1.841e-01, -1.105e-01, 6.002e-02, -1.233e-01, 1.012e-02, -9.410e-02, -1.260e-01, 1.264e-02, -3.910e-02, 3.656e-01, -1.103e-01, 5.059e-01, 4.280e-01));
r += mul(s1_6, M4(-7.537e-02, -2.153e-02, -4.511e-02, -5.184e-02, -1.745e-02, -1.165e-02, 1.352e-02, -1.951e-02, -4.888e-02, 2.249e-02, -3.915e-02, -4.557e-03, -9.946e-03, -1.633e-04, -3.200e-02, -1.356e-02));
r += mul(s1_7, M4(-1.509e-01, -2.227e-02, 1.640e-01, 2.693e-02, 4.846e-02, 3.303e-02, -5.390e-02, 3.607e-02, -2.818e-02, -7.170e-02, 3.311e-02, -9.203e-02, -1.946e-03, -8.577e-02, -2.925e-02, 1.238e-01));
r += mul(s1_8, M4(-3.295e-02, 1.995e-02, -1.689e-01, -4.353e-02, -4.138e-02, -7.439e-03, -2.343e-02, 6.997e-02, 8.031e-02, 1.117e-01, 4.894e-02, -6.214e-02, -1.960e-01, -1.630e-01, 8.586e-02, -8.213e-02));
r += mul(s2_0, M4(-9.883e-02, -1.168e-02, -1.110e-01, -2.148e-01, 1.452e-01, 3.417e-03, -4.513e-02, 8.845e-02, -7.791e-02, 2.326e-02, -4.188e-02, -3.659e-02, 3.105e-02, -1.318e-02, -4.552e-03, 7.109e-02));
r += mul(s2_1, M4(1.958e-02, -6.995e-02, 2.588e-01, -6.431e-02, -2.211e-01, 5.281e-02, 5.399e-02, 8.884e-02, -5.135e-02, -4.768e-02, 1.363e-01, -2.064e-01, -1.391e-01, 1.106e-01, -2.611e-01, 2.038e-01));
r += mul(s2_2, M4(-6.883e-02, -1.360e-03, -1.628e-01, 7.301e-02, 1.213e-01, -5.159e-03, 1.194e-01, -1.148e-02, -1.285e-01, -1.448e-01, 1.776e-02, -1.414e-01, -3.022e-02, 1.382e-01, 6.695e-02, -4.201e-02));
r += mul(s2_3, M4(-1.194e-01, 1.524e-03, -1.945e-01, -1.496e-01, 1.413e-03, -8.697e-04, -1.542e-01, -1.798e-03, -4.991e-02, -7.944e-03, -1.094e-01, -5.578e-02, 1.526e-01, -6.170e-02, 1.598e-01, 1.306e-01));
r += mul(s2_4, M4(3.583e-02, -1.213e-01, 2.087e-01, -4.616e-02, 2.125e-01, -1.242e-01, 2.776e-01, -8.100e-02, -1.733e-01, 1.016e-01, 2.949e-01, 1.489e-01, 5.059e-01, 3.526e-01, -4.764e-01, -1.105e-02));
r += mul(s2_5, M4(7.240e-02, 1.034e-01, -1.103e-01, 2.351e-02, -2.711e-02, 1.506e-02, -1.534e-01, 1.093e-01, 5.065e-02, -2.686e-01, 1.423e-01, -4.993e-02, 7.167e-02, 1.084e-01, -8.139e-03, 4.460e-02));
r += mul(s2_6, M4(1.243e-01, 1.281e-02, 7.048e-02, 1.117e-01, -1.145e-01, -1.703e-02, -1.470e-02, -3.647e-02, 3.796e-03, 2.441e-02, -8.422e-02, 1.955e-02, -2.861e-02, -6.963e-02, 6.894e-02, -4.071e-02));
r += mul(s2_7, M4(2.315e-01, 7.446e-02, -7.632e-02, 1.319e-01, -2.392e-02, 2.525e-02, 4.687e-02, 7.645e-02, 4.250e-02, -4.733e-02, 2.179e-01, -3.843e-02, -3.526e-01, 9.675e-02, -1.837e-01, -1.563e-01));
r += mul(s2_8, M4(5.933e-02, 1.490e-01, -5.844e-02, 9.363e-02, 7.616e-04, -1.075e-02, -1.365e-01, -6.094e-02, 7.094e-03, -1.218e-01, 7.021e-02, 3.101e-02, -4.184e-02, 3.989e-02, -7.167e-02, -1.179e-01));
r += mul(s3_0, M4(-7.835e-02, 6.392e-02, -5.802e-02, -1.483e-01, 1.374e-01, 3.699e-02, 2.043e-03, 1.554e-01, -6.873e-02, -1.174e-02, -1.518e-01, -1.405e-02, 4.783e-03, -1.131e-01, 4.121e-02, -8.849e-02));
r += mul(s3_1, M4(-1.463e-01, 5.240e-02, -1.651e-02, -2.410e-01, 1.092e-01, -3.146e-02, -1.629e-02, -2.974e-02, -7.838e-02, -7.374e-03, 2.745e-01, -1.408e-01, 1.335e-01, 8.634e-02, 1.073e-02, -1.407e-02));
r += mul(s3_2, M4(-7.340e-02, 2.321e-02, 1.922e-02, -1.112e-01, 2.932e-02, -2.587e-02, 1.333e-01, 4.721e-02, -1.514e-01, -3.395e-02, -1.264e-01, 1.777e-02, -8.692e-02, 1.186e-02, -7.424e-02, -2.402e-02));
r += mul(s3_3, M4(5.052e-02, 2.790e-03, 3.121e-02, -1.839e-01, 3.910e-02, 2.279e-02, 6.041e-02, -8.205e-03, -5.819e-02, 5.701e-04, 5.763e-02, -1.835e-02, -7.273e-02, -1.017e-01, -4.708e-02, 3.331e-02));
r += mul(s3_4, M4(-4.521e-02, -3.700e-02, -1.199e-01, -3.863e-01, -4.641e-01, -2.451e-01, 1.512e-03, -3.424e-01, -1.194e-01, 1.119e-01, -1.183e-01, 1.918e-01, 8.865e-02, 1.866e-01, -4.503e-02, 2.355e-03));
r += mul(s3_5, M4(5.461e-02, -1.461e-01, 2.827e-01, 2.041e-01, -8.786e-03, 1.079e-02, 1.593e-01, 2.173e-01, 4.916e-01, -1.773e-01, 2.149e-02, -1.461e-01, -5.435e-02, 1.909e-01, -2.171e-01, -7.547e-02));
r += mul(s3_6, M4(2.543e-02, 5.455e-02, -6.107e-02, 5.194e-03, 9.984e-02, 8.664e-02, -7.757e-04, 3.957e-02, 1.432e-01, 3.805e-02, -1.005e-03, 7.600e-02, -4.304e-02, -6.326e-02, 3.996e-02, 3.872e-03));
r += mul(s3_7, M4(-1.234e-01, -1.276e-01, 1.312e-01, 8.454e-02, 1.539e-01, 6.822e-02, -1.455e-02, 1.223e-01, -1.060e-01, -3.708e-02, -1.480e-01, -7.922e-02, 6.503e-02, 1.105e-01, -1.249e-01, -3.210e-02));
r += mul(s3_8, M4(1.676e-01, -7.072e-03, 4.581e-02, -1.006e-01, -1.056e-02, -8.209e-02, -4.804e-02, 2.427e-02, -1.165e-01, -8.224e-02, 2.940e-01, -9.220e-03, 5.420e-02, 1.802e-01, -1.190e-01, 1.433e-02));
// Constant per-component offset, presumably the layer bias - TODO confirm.
r += V4(-4.712e-03, -1.187e-02, 1.287e-02, -6.625e-03);
return r;
}
// Entry point of pass 4 (conv3). For one pixel it reads a 3x3 neighbourhood
// from each of the inputs t0 and t1, splits every tap into max(x, 0) and
// -max(-x, 0) parts, and writes f0(...) to t2 and f1(...) to t3.
//   blockStart - added to the per-thread coordinate to form the pixel position
//   tid        - thread id; only tid.x is used
// NOTE(review): Rmp8x8, GetInputPt, GetInputSize, V4 and the O macro behind
// l0/l1 are defined outside this chunk; Rmp8x8 presumably maps the flat thread
// index to a 2D position inside the 8x8 block - confirm.
void Pass4(uint2 blockStart, uint3 tid) {
float2 pt = float2(GetInputPt());
uint2 gxy = Rmp8x8(tid.x) + blockStart;
uint2 size = GetInputSize();
// Threads whose pixel lies outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// pos and pt are not referenced directly below; presumably the O macro used
// by l0/l1 reads them - confirm before renaming or removing either.
float2 pos = (gxy + 0.5) * pt;
// 3x3 taps of t0, row by row from offset (-1, -1) to (1, 1).
V4 s0_0 = l0(-1.0, -1.0);
V4 s0_1 = l0(0.0, -1.0);
V4 s0_2 = l0(1.0, -1.0);
V4 s0_3 = l0(-1.0, 0.0);
V4 s0_4 = l0(0.0, 0.0);
V4 s0_5 = l0(1.0, 0.0);
V4 s0_6 = l0(-1.0, 1.0);
V4 s0_7 = l0(0.0, 1.0);
V4 s0_8 = l0(1.0, 1.0);
// s1_* keeps the non-positive part of each t0 tap. It must be computed before
// the s0_* values are overwritten below.
V4 s1_0 = -max(-s0_0, 0.0);
V4 s1_1 = -max(-s0_1, 0.0);
V4 s1_2 = -max(-s0_2, 0.0);
V4 s1_3 = -max(-s0_3, 0.0);
V4 s1_4 = -max(-s0_4, 0.0);
V4 s1_5 = -max(-s0_5, 0.0);
V4 s1_6 = -max(-s0_6, 0.0);
V4 s1_7 = -max(-s0_7, 0.0);
V4 s1_8 = -max(-s0_8, 0.0);
// s0_* is reduced in place to the non-negative part of each t0 tap.
s0_0 = max(s0_0, 0.0);
s0_1 = max(s0_1, 0.0);
s0_2 = max(s0_2, 0.0);
s0_3 = max(s0_3, 0.0);
s0_4 = max(s0_4, 0.0);
s0_5 = max(s0_5, 0.0);
s0_6 = max(s0_6, 0.0);
s0_7 = max(s0_7, 0.0);
s0_8 = max(s0_8, 0.0);
// Same procedure for the 3x3 taps of t1: s2_* raw taps, s3_* non-positive
// parts, then s2_* reduced to the non-negative parts.
V4 s2_0 = l1(-1.0, -1.0);
V4 s2_1 = l1(0.0, -1.0);
V4 s2_2 = l1(1.0, -1.0);
V4 s2_3 = l1(-1.0, 0.0);
V4 s2_4 = l1(0.0, 0.0);
V4 s2_5 = l1(1.0, 0.0);
V4 s2_6 = l1(-1.0, 1.0);
V4 s2_7 = l1(0.0, 1.0);
V4 s2_8 = l1(1.0, 1.0);
V4 s3_0 = -max(-s2_0, 0.0);
V4 s3_1 = -max(-s2_1, 0.0);
V4 s3_2 = -max(-s2_2, 0.0);
V4 s3_3 = -max(-s2_3, 0.0);
V4 s3_4 = -max(-s2_4, 0.0);
V4 s3_5 = -max(-s2_5, 0.0);
V4 s3_6 = -max(-s2_6, 0.0);
V4 s3_7 = -max(-s2_7, 0.0);
V4 s3_8 = -max(-s2_8, 0.0);
s2_0 = max(s2_0, 0.0);
s2_1 = max(s2_1, 0.0);
s2_2 = max(s2_2, 0.0);
s2_3 = max(s2_3, 0.0);
s2_4 = max(s2_4, 0.0);
s2_5 = max(s2_5, 0.0);
s2_6 = max(s2_6, 0.0);
s2_7 = max(s2_7, 0.0);
s2_8 = max(s2_8, 0.0);
// Both outputs are computed from the same 36 input vectors.
t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Tap helpers for this pass: l0 reads input t2 and l1 reads input t3 at the
// offset (x, y), converting the result to V4.
// NOTE(review): O and V4 are defined outside this chunk; O presumably samples
// the texture at a texel offset relative to the current position - confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// conv4, first output vector: accumulates mul(s, M4(...)) over all 36 input
// vectors, in argument order, then adds a constant V4 and returns the sum.
// In this pass l0/l1 read t2/t3, and Pass5 stores the result in t0[gxy].
// The s0_*..s3_* arguments follow the same naming as the caller's locals
// (s0/s1 derived from the l0 taps, s2/s3 from the l1 taps, index 0..8 over the
// 3x3 offsets from (-1, -1) to (1, 1)).
// NOTE(review): V4/M4 are defined outside this chunk. The coefficients look
// machine-generated (presumably trained weights) - do not hand-edit; confirm
// against the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// One 16-coefficient matrix per input vector.
r += mul(s0_0, M4(1.060e-02, 9.173e-03, 9.548e-04, -7.886e-02, -1.324e-02, 4.660e-02, -4.997e-02, -5.676e-02, -3.290e-02, 6.253e-02, -5.777e-02, 1.265e-02, 6.136e-03, 7.179e-02, 3.102e-02, 4.961e-02));
r += mul(s0_1, M4(5.787e-03, -2.090e-03, -1.489e-01, 4.380e-02, 1.259e-01, 5.508e-01, 1.211e-01, 3.385e-01, 2.399e-02, -1.436e-01, 2.987e-03, -2.839e-02, -3.021e-02, -8.641e-03, 1.716e-01, -1.328e-02));
r += mul(s0_2, M4(-3.284e-02, -5.196e-02, -2.983e-02, -2.858e-02, 1.729e-02, 7.665e-02, 1.387e-01, 1.037e-01, 4.289e-02, 1.274e-01, 3.348e-02, 1.911e-02, -1.786e-02, -4.888e-02, 6.323e-02, -2.989e-02));
r += mul(s0_3, M4(2.473e-02, -6.550e-02, -1.373e-01, 3.680e-02, 1.575e-01, -8.270e-02, 3.186e-02, -3.836e-02, 4.508e-02, 4.254e-02, 5.656e-03, -9.132e-02, 1.334e-01, -5.076e-02, -2.445e-02, -4.735e-02));
r += mul(s0_4, M4(-5.346e-01, 1.950e-01, 2.121e-01, -3.694e-01, 5.004e-02, 1.610e-02, 2.249e-01, -5.962e-02, -6.243e-02, -3.270e-01, 1.851e-01, 4.051e-02, -2.310e-01, -2.300e-01, -1.314e-01, 3.374e-01));
r += mul(s0_5, M4(-4.686e-02, -3.968e-01, 2.772e-02, 2.495e-02, 4.541e-02, 8.724e-02, 4.401e-02, -1.515e-02, -6.453e-02, -7.210e-02, -1.250e-02, 4.044e-02, 3.057e-02, 2.485e-01, 2.228e-02, 6.774e-02));
r += mul(s0_6, M4(-1.518e-01, -6.862e-02, -8.148e-02, -2.030e-01, -4.453e-02, -2.133e-03, -6.081e-02, -8.941e-02, -5.417e-02, 1.564e-02, -5.425e-02, 5.875e-02, -8.805e-02, -1.910e-02, 2.099e-02, -1.402e-02));
r += mul(s0_7, M4(-1.730e-02, -6.152e-02, -2.764e-01, -8.728e-02, 9.519e-03, -2.799e-02, -5.662e-02, 3.249e-02, 8.716e-02, 2.809e-02, -7.241e-02, 3.046e-02, 1.368e-01, 2.723e-02, 1.130e-01, -4.615e-02));
r += mul(s0_8, M4(-5.021e-02, -3.352e-02, 5.072e-02, -1.434e-02, 6.511e-02, 6.519e-02, -8.987e-02, 2.193e-02, 1.583e-04, 2.714e-02, -2.315e-02, -3.077e-02, 7.792e-03, 2.782e-02, 9.282e-02, 5.011e-02));
r += mul(s1_0, M4(-2.541e-02, -9.530e-03, -2.089e-01, -2.421e-02, 1.340e-02, 1.228e-01, 8.861e-02, -1.063e-02, -7.461e-02, 5.226e-02, -7.276e-02, 3.544e-02, -1.591e-02, 1.851e-02, 9.562e-03, 4.559e-02));
r += mul(s1_1, M4(2.747e-02, -7.982e-02, -1.475e-01, 4.885e-02, -1.175e-02, -9.209e-02, -9.273e-02, -7.428e-02, 3.696e-02, -2.012e-01, 4.627e-02, 3.609e-02, 1.096e-01, -5.087e-02, 2.170e-01, 5.311e-02));
r += mul(s1_2, M4(2.410e-02, 6.970e-02, 2.315e-02, 2.908e-02, 2.961e-05, 1.661e-02, 8.374e-02, 5.064e-02, 2.637e-02, 1.330e-01, 5.175e-02, -5.518e-02, -4.871e-03, 1.162e-01, 8.451e-02, 1.741e-02));
r += mul(s1_3, M4(4.863e-02, -7.095e-02, 3.927e-03, -9.085e-02, 2.639e-02, -8.297e-02, -1.865e-01, -9.647e-02, 6.967e-02, 1.376e-02, 1.222e-01, -2.819e-01, 1.563e-01, -1.399e-02, -4.367e-02, -5.187e-02));
r += mul(s1_4, M4(9.322e-02, 9.848e-02, 1.680e-01, -2.298e-01, -6.183e-02, -4.167e-02, -1.103e-02, -9.856e-03, -2.983e-03, -3.805e-01, -3.115e-01, -4.107e-01, -1.341e-01, -3.703e-01, -3.661e-01, -4.633e-01));
r += mul(s1_5, M4(-2.785e-03, -2.188e-02, -2.790e-03, -4.276e-04, 7.082e-02, 1.004e-01, -3.532e-03, 1.740e-03, 6.693e-03, -5.230e-01, 2.119e-01, 2.878e-02, 3.915e-03, 1.842e-01, -1.630e-02, -3.874e-02));
r += mul(s1_6, M4(2.313e-02, -6.545e-02, 1.631e-02, -1.278e-01, -4.216e-02, -4.147e-02, 6.827e-02, -1.725e-02, -5.254e-02, -3.942e-02, -2.400e-02, -8.124e-02, -3.250e-02, -1.806e-03, -3.947e-02, -7.056e-02));
r += mul(s1_7, M4(8.445e-03, 1.147e-01, -7.772e-02, 1.091e-01, 1.842e-02, -6.040e-03, -7.053e-02, 1.824e-02, 2.212e-01, -8.777e-02, -1.003e-01, 6.533e-03, 2.090e-01, 4.588e-02, 9.886e-02, 6.176e-02));
r += mul(s1_8, M4(4.046e-02, 1.872e-02, -5.723e-02, -4.997e-02, 5.232e-03, 1.795e-02, -2.747e-02, -1.507e-02, -1.704e-01, 7.849e-02, -1.475e-01, -4.255e-02, 7.807e-02, 4.185e-02, 3.849e-02, 3.137e-02));
r += mul(s2_0, M4(-8.062e-03, 6.677e-02, 6.217e-02, 1.833e-01, -1.475e-01, 2.782e-01, 3.524e-02, -6.275e-02, 4.315e-02, 1.484e-02, 3.820e-02, -3.304e-02, 1.659e-03, -9.567e-03, -3.360e-02, -2.623e-02));
r += mul(s2_1, M4(9.928e-02, -2.526e-01, -2.613e-02, 2.043e-01, 1.710e-02, -1.137e-01, 1.798e-01, -1.427e-01, 4.676e-03, 1.728e-01, 8.082e-02, -5.413e-02, -1.710e-02, -3.169e-02, -6.860e-02, 1.496e-02));
r += mul(s2_2, M4(1.785e-02, 1.092e-01, -7.685e-02, 7.691e-02, 5.271e-03, -5.168e-02, 3.395e-02, 1.726e-02, 2.936e-02, -1.321e-02, 5.364e-02, -6.785e-03, 2.429e-02, -4.442e-02, -6.348e-02, 3.035e-02));
r += mul(s2_3, M4(2.676e-01, 4.022e-03, -5.435e-02, -2.723e-01, -1.412e-01, -6.091e-01, 1.576e-02, 6.829e-02, -1.410e-01, 5.578e-03, 3.833e-03, 1.863e-01, -2.274e-02, 6.034e-03, 1.518e-01, -5.434e-02));
r += mul(s2_4, M4(7.884e-02, 5.377e-01, -4.655e-02, -3.752e-01, 1.490e-01, -4.235e-02, -5.390e-02, 2.610e-01, 1.979e-01, -5.718e-02, 1.773e-02, 5.727e-02, 1.703e-02, 7.533e-01, -3.023e-02, 5.456e-02));
r += mul(s2_5, M4(-4.898e-02, 4.237e-02, 6.311e-02, -4.635e-02, 3.660e-03, 2.139e-01, -3.722e-02, -6.738e-02, -3.009e-02, -6.140e-02, 2.777e-02, 3.917e-02, -1.421e-01, -4.041e-01, -1.524e-01, -9.837e-02));
r += mul(s2_6, M4(6.071e-02, 1.084e-01, -6.370e-02, 1.323e-01, -7.251e-02, -1.079e-01, 1.208e-01, -4.495e-02, -2.115e-03, -4.107e-02, 2.465e-02, -1.230e-01, -6.064e-02, -4.263e-02, -1.388e-01, 6.519e-02));
r += mul(s2_7, M4(9.042e-02, -8.032e-02, 1.186e-01, -1.537e-02, -6.566e-03, -3.216e-02, 3.412e-02, -3.207e-02, -1.586e-01, -2.988e-03, -2.358e-03, 2.172e-02, 6.775e-02, -3.590e-01, -4.123e-01, -3.506e-01));
r += mul(s2_8, M4(8.486e-02, 4.731e-02, 5.779e-02, 1.000e-01, 9.121e-03, -3.421e-02, 4.891e-02, 4.916e-02, 3.343e-03, 4.437e-03, -2.002e-02, -3.856e-02, -1.319e-01, -4.022e-02, -1.752e-01, -9.250e-02));
r += mul(s3_0, M4(-6.652e-03, 6.416e-02, 9.292e-03, 6.520e-02, 1.213e-02, 4.177e-02, 7.038e-02, -3.160e-02, 2.146e-02, -9.523e-02, -1.436e-01, -8.325e-02, -1.234e-02, -1.222e-02, -3.877e-02, -4.175e-02));
r += mul(s3_1, M4(-5.171e-02, 1.011e-01, 7.998e-02, -8.804e-02, 1.067e-02, 1.516e-01, 6.508e-02, -7.724e-02, 2.717e-02, -4.901e-02, -6.059e-03, 4.013e-02, -3.833e-02, 1.538e-01, 5.948e-02, -4.945e-02));
r += mul(s3_2, M4(1.023e-02, -1.230e-01, -1.861e-02, -2.570e-02, 2.512e-02, -4.630e-02, 6.354e-02, 3.897e-02, -2.146e-02, 2.446e-01, 1.906e-03, -9.068e-03, 1.754e-02, -7.082e-02, 1.107e-04, -9.604e-03));
r += mul(s3_3, M4(7.380e-02, 2.216e-02, -2.608e-02, -6.491e-02, 4.018e-02, -6.657e-02, 1.116e-01, 9.405e-02, -7.168e-02, -3.646e-01, 8.387e-02, 1.352e-02, -4.589e-02, 2.235e-02, 1.881e-01, 1.759e-01));
r += mul(s3_4, M4(-7.735e-02, -8.574e-02, -6.380e-02, 1.221e-01, 5.556e-02, -1.281e-01, 1.461e-01, 2.757e-01, 8.144e-01, -1.075e-01, 3.165e-03, -2.036e-01, 1.814e-01, 1.744e-01, -1.745e-01, 3.724e-02));
r += mul(s3_5, M4(-6.864e-02, 1.273e-02, 7.502e-02, 4.164e-02, 1.301e-02, 1.407e-01, -9.985e-02, -8.079e-02, 1.428e-01, 3.034e-01, -1.564e-02, 6.091e-02, -1.271e-02, -2.153e-01, -7.843e-02, -4.063e-02));
r += mul(s3_6, M4(-5.115e-02, 6.016e-02, -2.719e-02, 4.668e-02, -3.214e-02, -2.274e-02, -6.954e-03, -9.099e-03, 4.861e-02, 1.007e-01, -2.150e-01, -1.607e-01, -3.578e-02, 1.230e-02, -5.095e-02, 1.622e-02));
r += mul(s3_7, M4(9.498e-02, -6.763e-02, 1.451e-01, 3.408e-03, -3.253e-02, 1.145e-01, 8.122e-03, -9.192e-02, 5.071e-02, -6.317e-02, 1.097e-01, 5.913e-02, 8.494e-02, 2.731e-04, -3.736e-01, -6.110e-03));
r += mul(s3_8, M4(1.881e-02, 1.750e-02, 5.956e-02, 4.179e-02, -4.554e-02, -9.824e-02, 8.917e-03, 3.348e-02, 4.160e-02, 6.525e-02, 1.484e-02, -2.331e-02, -8.092e-02, -2.834e-02, -1.284e-01, -7.521e-02));
// Constant per-component offset, presumably the layer bias - TODO confirm.
r += V4(-5.270e-03, 1.390e-02, 8.622e-03, 1.255e-02);
return r;
}
// conv4, second output vector: same structure and argument list as this
// pass's f0 but with its own coefficient set. Accumulates mul(s, M4(...)) over
// all 36 input vectors, in argument order, then adds a constant V4.
// Pass5 calls it with the same arguments as f0 and stores the result in
// t1[gxy].
// NOTE(review): V4/M4 are defined outside this chunk. The coefficients look
// machine-generated (presumably trained weights) - do not hand-edit; confirm
// against the generator.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// One 16-coefficient matrix per input vector.
r += mul(s0_0, M4(-4.868e-02, -7.333e-02, -1.029e-02, -7.011e-04, 2.404e-02, -9.301e-02, 1.457e-01, 2.242e-02, 6.850e-02, -1.328e-03, -2.557e-02, -4.854e-04, 1.071e-01, 3.788e-04, 1.408e-01, 5.354e-03));
r += mul(s0_1, M4(7.892e-03, 5.832e-02, -1.077e-01, -6.140e-02, -1.003e-02, -4.887e-01, 8.263e-01, 2.416e-01, -3.434e-02, 1.089e-02, -1.984e-02, -3.615e-02, -5.692e-03, 1.615e-02, -5.680e-02, 2.041e-02));
r += mul(s0_2, M4(2.136e-02, -2.731e-02, -1.742e-02, 2.592e-02, -4.319e-02, 6.426e-03, 2.110e-02, -6.338e-02, -6.921e-03, -1.288e-03, 4.579e-02, -2.155e-03, 3.041e-02, 1.946e-02, 1.238e-02, 6.906e-02));
r += mul(s0_3, M4(-2.058e-02, 3.187e-02, -1.057e-01, 2.407e-01, -3.813e-02, -2.640e-02, 3.941e-02, 1.362e-01, 2.406e-02, 1.518e-02, -4.224e-02, 3.455e-02, 5.443e-02, -6.617e-02, -8.858e-02, -1.949e-02));
r += mul(s0_4, M4(3.205e-01, -6.490e-01, -3.962e-01, -1.142e-01, -3.091e-02, 4.755e-01, -2.822e-01, -1.328e-01, -5.487e-01, 5.932e-02, -2.439e-02, -1.689e-01, -3.681e-02, -8.227e-02, 3.967e-02, -8.989e-02));
r += mul(s0_5, M4(-5.668e-02, 3.658e-02, 1.227e-02, 8.117e-02, 1.161e-01, 9.350e-02, 9.971e-02, -1.220e-01, 7.876e-02, 5.186e-02, -4.261e-02, 1.436e-01, -2.114e-02, 6.113e-02, 2.251e-02, 2.534e-02));
r += mul(s0_6, M4(-5.365e-02, -2.678e-02, -2.565e-02, 7.923e-02, -2.138e-02, -4.932e-02, -6.107e-03, 1.685e-02, 5.425e-02, -1.012e-02, -9.037e-03, 8.218e-04, -1.210e-02, 5.623e-02, -2.094e-02, -2.325e-02));
r += mul(s0_7, M4(2.031e-02, -3.187e-02, 8.229e-02, 1.457e-01, 1.044e-01, -4.475e-02, 2.858e-02, -7.345e-02, -3.919e-02, -5.753e-02, 1.684e-02, -1.669e-01, 9.680e-03, 1.254e-01, 2.022e-03, -9.900e-02));
r += mul(s0_8, M4(-1.164e-02, 5.171e-02, -5.704e-02, -1.643e-01, 2.554e-02, -9.988e-02, 3.699e-02, -3.752e-02, -8.076e-04, -2.527e-02, -2.081e-02, 3.110e-02, 1.484e-03, 4.064e-02, 2.481e-02, 2.225e-01));
r += mul(s1_0, M4(4.384e-02, -1.401e-01, -4.071e-02, -1.137e-02, -4.979e-03, 6.159e-02, 1.275e-01, 6.544e-02, 1.288e-01, -4.421e-02, -4.471e-02, 2.682e-02, 5.621e-02, -4.062e-02, 1.034e-01, 6.606e-02));
r += mul(s1_1, M4(2.799e-02, -1.333e-01, 1.521e-01, -5.025e-02, -1.895e-01, -7.913e-02, -2.321e-01, -6.526e-02, -1.330e-02, 1.499e-02, 1.620e-01, 6.936e-02, -6.816e-02, 1.353e-01, 1.107e-01, 5.514e-02));
r += mul(s1_2, M4(5.826e-03, -5.941e-03, -2.338e-02, -2.826e-02, 9.265e-02, 3.608e-02, 1.114e-01, 1.274e-01, -1.291e-01, 1.284e-02, -7.540e-02, -3.458e-02, -1.006e-02, 2.083e-02, -8.393e-02, 8.186e-02));
r += mul(s1_3, M4(-3.893e-02, -1.137e-02, 1.243e-01, 1.118e-01, 7.397e-02, -1.316e-01, -1.303e-01, -7.808e-05, 1.468e-02, -4.172e-03, -5.014e-02, -2.610e-02, 8.366e-02, -2.755e-02, 1.646e-04, -2.938e-02));
r += mul(s1_4, M4(-1.114e-01, -2.017e-01, 2.898e-03, -7.984e-02, -8.403e-02, 2.626e-02, 1.563e-01, -1.397e-02, 2.724e-02, -6.698e-01, 2.358e-01, -6.466e-01, -9.650e-03, -6.742e-01, 1.411e-01, -3.343e-01));
r += mul(s1_5, M4(7.380e-02, 6.420e-02, 7.990e-02, 6.014e-02, 5.950e-02, 6.212e-02, -7.881e-02, -2.782e-02, 1.087e-01, -3.347e-02, 3.819e-01, 1.988e-01, 5.813e-02, 2.239e-02, 3.012e-01, 1.275e-01));
r += mul(s1_6, M4(-9.473e-02, -2.417e-02, -2.870e-02, 7.718e-02, -2.223e-02, 2.306e-02, 8.255e-03, -1.818e-02, -2.983e-02, -3.495e-02, 1.540e-02, 8.013e-02, 1.651e-02, -1.298e-02, 2.377e-02, 5.523e-02));
r += mul(s1_7, M4(1.414e-01, 1.346e-01, 4.336e-03, -7.594e-02, -2.044e-02, -9.596e-03, -1.087e-03, 5.324e-02, -2.041e-02, -6.328e-02, 7.533e-02, -3.971e-01, 5.408e-04, 1.087e-01, 9.749e-03, -2.047e-01));
r += mul(s1_8, M4(4.656e-02, -4.771e-02, -2.210e-02, -2.060e-02, -6.953e-03, -3.366e-02, -7.290e-03, -3.300e-02, -1.354e-01, -5.015e-02, -2.887e-02, 2.802e-01, 2.605e-02, -1.972e-02, 1.168e-03, 1.422e-01));
r += mul(s2_0, M4(8.826e-02, -4.751e-02, 2.493e-01, 4.446e-02, 1.752e-01, 5.741e-03, -1.820e-01, 1.371e-02, -6.855e-02, 1.164e-02, -5.215e-02, -7.373e-04, -1.491e-02, 7.033e-03, -5.440e-02, -9.302e-05));
r += mul(s2_1, M4(-6.871e-02, -9.419e-03, 3.276e-01, 2.826e-02, 5.675e-02, -3.974e-03, 1.104e-01, -2.975e-02, 3.281e-02, 8.429e-03, 1.129e-01, -4.830e-02, -4.374e-02, -6.905e-02, 8.143e-02, 3.180e-03));
r += mul(s2_2, M4(-7.197e-02, -1.804e-02, -9.024e-02, -1.527e-03, 2.403e-02, 6.062e-02, 3.346e-02, 4.784e-02, -1.462e-02, 4.216e-02, 2.800e-02, 4.034e-04, -4.216e-02, -4.431e-03, -3.496e-02, -3.005e-02));
r += mul(s2_3, M4(2.710e-02, -6.523e-02, 1.559e-01, -6.059e-02, 1.965e-01, -1.608e-01, -9.293e-03, -2.404e-01, -4.061e-02, 8.819e-02, 2.112e-02, 2.398e-01, -1.463e-01, 5.373e-02, -1.346e-02, 3.025e-02));
r += mul(s2_4, M4(1.846e-02, 3.857e-01, -4.128e-01, -2.530e-01, -2.312e-01, 3.354e-02, -3.948e-01, -1.465e-01, 1.072e-01, -8.544e-02, -7.428e-02, 4.751e-02, 2.139e-01, 3.097e-01, -3.761e-01, 5.621e-02));
r += mul(s2_5, M4(-1.203e-02, -9.598e-02, 4.101e-01, 1.578e-01, -5.394e-02, -6.714e-02, -6.320e-02, 8.249e-03, 5.620e-02, -3.219e-02, 9.398e-03, 6.809e-02, -7.400e-02, -1.431e-01, -1.425e-01, -2.358e-02));
r += mul(s2_6, M4(4.035e-02, 5.655e-02, -3.307e-03, -3.497e-02, 6.522e-02, 1.103e-01, -9.802e-02, -2.655e-01, -5.802e-02, -4.359e-02, 3.459e-03, 1.592e-01, -2.566e-02, -1.156e-01, 2.646e-02, 3.806e-02));
r += mul(s2_7, M4(-3.211e-02, -1.509e-01, 2.028e-03, -1.702e-01, 4.576e-02, -5.340e-02, 5.503e-02, 1.257e-02, -5.581e-02, 9.818e-02, -2.745e-02, 1.486e-01, 1.063e-01, -3.707e-01, 1.116e-01, -6.709e-02));
r += mul(s2_8, M4(9.062e-04, -7.371e-03, 8.420e-02, 1.629e-01, -2.707e-02, -5.219e-03, 6.567e-02, 1.766e-01, 4.554e-04, 1.178e-02, -1.124e-02, 3.477e-02, -5.473e-02, -7.643e-02, 9.083e-03, -4.250e-02));
r += mul(s3_0, M4(8.537e-02, 7.246e-02, 5.043e-02, 3.850e-02, -3.951e-02, 9.224e-03, 1.640e-02, 1.906e-02, -1.333e-01, -8.517e-02, -1.410e-01, 4.781e-02, -1.641e-02, 2.463e-03, -7.445e-02, -4.602e-02));
r += mul(s3_1, M4(7.934e-02, 7.380e-03, -1.062e-01, -4.154e-03, -2.611e-02, -3.119e-02, 9.679e-02, -1.394e-02, -1.108e-01, 1.158e-02, 1.850e-01, -6.765e-02, 5.765e-02, 3.392e-02, -1.560e-02, -6.052e-02));
r += mul(s3_2, M4(-3.056e-02, -8.450e-03, 1.524e-02, -1.007e-02, 1.030e-02, 4.032e-02, 9.837e-02, 9.371e-03, 4.740e-02, -4.795e-02, -3.356e-02, 8.602e-03, 2.484e-02, -9.889e-03, 6.734e-02, -2.287e-02));
r += mul(s3_3, M4(6.303e-02, -7.328e-02, -5.082e-02, -5.070e-02, -8.129e-03, 7.948e-03, 7.351e-02, 5.601e-02, -4.169e-01, -8.219e-03, 3.373e-01, 1.781e-01, -5.139e-02, 1.471e-01, 3.415e-02, 7.690e-02));
r += mul(s3_4, M4(1.455e-01, 6.664e-02, 5.792e-02, -1.276e-01, 2.360e-01, 5.978e-02, 5.147e-02, 2.707e-01, -7.581e-01, -2.740e-01, -3.000e-01, -2.017e-02, 2.005e-01, 6.934e-02, 2.996e-02, -3.036e-01));
r += mul(s3_5, M4(-1.708e-01, -6.628e-02, 9.170e-02, 3.461e-02, -1.028e-01, -4.244e-02, -1.955e-01, -1.875e-01, 8.215e-02, 2.976e-02, -4.637e-03, 1.512e-01, -4.626e-02, 3.819e-03, -2.222e-01, -1.078e-01));
r += mul(s3_6, M4(9.585e-02, 3.354e-02, -5.140e-03, -2.883e-02, -9.057e-02, 3.950e-02, -5.781e-02, -2.509e-02, -7.106e-02, -1.351e-01, -4.933e-02, 8.332e-02, -2.922e-02, -3.890e-02, 3.291e-03, 7.054e-02));
r += mul(s3_7, M4(-8.208e-02, 1.377e-02, -2.475e-02, -1.353e-01, 9.728e-02, 7.136e-02, -3.984e-02, 1.374e-01, -1.160e-01, 4.362e-02, 6.714e-02, 5.038e-03, 1.866e-01, -2.349e-01, 8.599e-02, -1.510e-01));
r += mul(s3_8, M4(-3.279e-02, 4.634e-02, 1.698e-02, 1.410e-01, -2.615e-02, 1.178e-02, 5.268e-02, 5.209e-02, 1.624e-02, 2.431e-02, -3.280e-02, 7.160e-02, -6.704e-02, -1.195e-01, -4.136e-02, -2.048e-01));
// Constant per-component offset, presumably the layer bias - TODO confirm.
r += V4(-7.279e-03, 1.016e-02, -7.400e-03, 4.979e-03);
return r;
}
// Pass 5 entry point. Each thread handles one input-resolution pixel: it fetches
// the 3x3 neighbourhoods exposed by l0() and l1(), splits every sample into its
// positive part max(x, 0) and negative part -max(-x, 0), and stores the results
// of f0/f1 into t0/t1.
void Pass5(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();
	// Threads mapped outside the input image have nothing to write.
	if (any(gxy >= inputSize)) {
		return;
	}
	// `pt` and `pos` are not used directly below; presumably the l0()/l1() sampling
	// macros (defined outside this excerpt) refer to them by name, so keep the names.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood read through l0(), row-major, y = -1 row first.
	const V4 a0 = l0(-1.0, -1.0);
	const V4 a1 = l0(0.0, -1.0);
	const V4 a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0);
	const V4 a4 = l0(0.0, 0.0);
	const V4 a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0);
	const V4 a7 = l0(0.0, 1.0);
	const V4 a8 = l0(1.0, 1.0);

	// Raw 3x3 neighbourhood read through l1(), same ordering.
	const V4 b0 = l1(-1.0, -1.0);
	const V4 b1 = l1(0.0, -1.0);
	const V4 b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0);
	const V4 b4 = l1(0.0, 0.0);
	const V4 b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0);
	const V4 b7 = l1(0.0, 1.0);
	const V4 b8 = l1(1.0, 1.0);

	// Positive parts of the l0() samples.
	const V4 ap0 = max(a0, 0.0);
	const V4 ap1 = max(a1, 0.0);
	const V4 ap2 = max(a2, 0.0);
	const V4 ap3 = max(a3, 0.0);
	const V4 ap4 = max(a4, 0.0);
	const V4 ap5 = max(a5, 0.0);
	const V4 ap6 = max(a6, 0.0);
	const V4 ap7 = max(a7, 0.0);
	const V4 ap8 = max(a8, 0.0);

	// Negative parts of the l0() samples.
	const V4 an0 = -max(-a0, 0.0);
	const V4 an1 = -max(-a1, 0.0);
	const V4 an2 = -max(-a2, 0.0);
	const V4 an3 = -max(-a3, 0.0);
	const V4 an4 = -max(-a4, 0.0);
	const V4 an5 = -max(-a5, 0.0);
	const V4 an6 = -max(-a6, 0.0);
	const V4 an7 = -max(-a7, 0.0);
	const V4 an8 = -max(-a8, 0.0);

	// Positive parts of the l1() samples.
	const V4 bp0 = max(b0, 0.0);
	const V4 bp1 = max(b1, 0.0);
	const V4 bp2 = max(b2, 0.0);
	const V4 bp3 = max(b3, 0.0);
	const V4 bp4 = max(b4, 0.0);
	const V4 bp5 = max(b5, 0.0);
	const V4 bp6 = max(b6, 0.0);
	const V4 bp7 = max(b7, 0.0);
	const V4 bp8 = max(b8, 0.0);

	// Negative parts of the l1() samples.
	const V4 bn0 = -max(-b0, 0.0);
	const V4 bn1 = -max(-b1, 0.0);
	const V4 bn2 = -max(-b2, 0.0);
	const V4 bn3 = -max(-b3, 0.0);
	const V4 bn4 = -max(-b4, 0.0);
	const V4 bn5 = -max(-b5, 0.0);
	const V4 bn6 = -max(-b6, 0.0);
	const V4 bn7 = -max(-b7, 0.0);
	const V4 bn8 = -max(-b8, 0.0);

	// Argument order expected by f0/f1: l0 positive, l0 negative, l1 positive, l1 negative.
	t0[gxy] = f0(ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8, an0, an1, an2, an3, an4, an5, an6, an7, an8, bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8, bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
	t1[gxy] = f1(ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8, an0, an1, an2, an3, an4, an5, an6, an7, an8, bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8, bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT
// l0(x, y) / l1(x, y): read t0 / t1 through the O() helper at offset (x, y) from the
// current position and convert the result to V4. O() is defined outside this excerpt;
// the offset is presumably expressed in input texels.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Output layer of pass 6.
// Takes 36 four-component vectors, multiplies each one by its own constant 4x4
// weight matrix, sums the products, adds a constant bias vector and returns
// tanh() of the total.
// Pass6 supplies the arguments as: s0_* = positive parts of the 3x3 t0 neighbourhood,
// s1_* = negative parts of the same samples, s2_* / s3_* = the same for t1.
// The numeric constants are presumably trained network weights (generated code);
// the accumulation order is kept exactly as generated.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// Contributions of the s0_* inputs.
r += mul(s0_0, M4(1.838e-01, -1.901e-02, 9.627e-03, -5.113e-02, -2.616e-02, -2.850e-02, -1.739e-02, 9.125e-03, 1.563e-02, -1.253e-02, 1.902e-02, -1.512e-02, 3.495e-03, -1.497e-02, 4.974e-03, -1.115e-02));
r += mul(s0_1, M4(-4.288e-01, 2.549e-01, -1.017e-01, -1.945e-01, 4.749e-02, 5.258e-02, -8.284e-03, -2.265e-02, -7.010e-02, -1.542e-02, 6.889e-03, 4.028e-02, -1.355e-02, 4.321e-03, 3.330e-02, 8.256e-02));
r += mul(s0_2, M4(-1.045e-03, -2.140e-01, -3.928e-02, 1.364e-02, 1.787e-03, -9.427e-03, 1.927e-03, -2.145e-03, -4.617e-03, 2.814e-02, 2.045e-02, 1.776e-02, 3.188e-02, 3.187e-02, -1.498e-02, -2.180e-02));
r += mul(s0_3, M4(1.059e-02, -5.522e-02, 6.447e-02, -4.395e-02, -4.846e-02, -2.209e-02, 1.866e-02, -5.920e-02, -2.419e-02, 1.032e-02, 7.736e-04, 1.092e-03, 1.854e-02, -4.388e-03, 3.893e-02, 9.549e-03));
r += mul(s0_4, M4(3.358e-02, 7.431e-03, 1.780e-01, 5.353e-01, 2.674e-01, 2.227e-01, 1.450e-01, 2.085e-01, -8.104e-03, -3.561e-02, -1.231e-01, -1.932e-01, -6.538e-02, 3.378e-02, -1.314e-01, -2.862e-02));
r += mul(s0_5, M4(-7.887e-02, 4.783e-02, -1.380e-01, -3.877e-01, 3.436e-03, 4.712e-02, -1.250e-02, 2.247e-02, 2.920e-02, 7.190e-02, -2.005e-02, 6.169e-02, 5.594e-03, -4.630e-02, 9.661e-02, 3.625e-02));
r += mul(s0_6, M4(1.963e-02, -1.873e-02, 1.489e-02, -1.253e-02, -9.356e-03, 1.334e-02, -3.747e-02, -1.115e-02, 7.741e-04, -6.463e-03, 3.707e-03, -3.598e-03, 1.783e-02, -5.539e-03, 9.899e-03, -9.354e-03));
r += mul(s0_7, M4(-2.952e-03, 3.132e-02, -6.679e-02, 2.883e-02, -3.721e-02, -3.573e-02, 1.204e-01, -6.785e-02, -1.208e-02, -1.355e-04, 2.872e-02, 2.196e-02, -1.655e-02, 3.784e-02, -5.921e-03, 2.494e-02));
r += mul(s0_8, M4(-1.215e-02, -2.947e-02, -2.454e-03, -6.326e-02, 2.248e-03, 2.302e-02, -2.863e-03, 5.834e-02, 2.187e-02, 9.973e-03, 2.158e-02, 4.902e-02, -2.207e-02, -3.485e-02, -5.118e-02, -6.696e-02));
// Contributions of the s1_* inputs.
r += mul(s1_0, M4(8.377e-02, -3.093e-02, 2.280e-02, -2.664e-02, -4.333e-02, -3.292e-02, -8.109e-03, 1.105e-02, 1.507e-02, 9.138e-03, 2.597e-02, -1.926e-02, 4.537e-02, -9.080e-03, -1.629e-02, -1.180e-02));
r += mul(s1_1, M4(-7.478e-02, 1.238e-01, -5.092e-02, -3.473e-02, 4.269e-02, 4.444e-02, 7.295e-03, 1.274e-04, -1.646e-01, -1.551e-03, 3.424e-02, 4.906e-02, -2.056e-01, -5.847e-02, 5.262e-02, 1.049e-01));
r += mul(s1_2, M4(-6.323e-02, -1.179e-01, -1.982e-02, -4.065e-02, 3.089e-03, -9.469e-03, 2.850e-03, 3.314e-03, -1.819e-03, -1.065e-01, 1.882e-02, 9.349e-03, 1.624e-02, -2.906e-02, -2.029e-02, -5.020e-02));
r += mul(s1_3, M4(1.101e-01, -3.490e-02, 1.327e-01, -2.853e-02, -5.027e-03, -5.703e-02, 6.484e-03, -6.473e-02, -4.310e-02, 3.882e-02, -3.100e-02, -7.837e-04, -5.501e-02, -1.261e-02, 7.285e-02, 4.648e-02));
r += mul(s1_4, M4(-6.214e-02, 1.841e-01, -9.546e-02, 3.700e-01, 2.824e-01, 3.400e-01, 2.309e-01, 2.237e-01, 3.482e-01, -1.294e-01, -4.546e-01, -3.556e-01, -4.730e-01, -1.392e-01, 4.776e-01, 1.210e-01));
r += mul(s1_5, M4(-5.408e-02, -1.286e-01, -6.571e-02, -1.230e-01, 9.991e-03, 6.421e-02, 4.305e-03, 1.780e-02, 2.254e-02, 3.661e-01, 6.275e-02, 8.004e-02, -1.834e-02, -3.465e-01, 9.274e-02, 3.935e-01));
r += mul(s1_6, M4(-1.620e-03, -1.423e-02, 2.785e-02, -1.252e-02, -1.218e-02, 2.842e-03, -3.496e-02, -2.927e-02, -2.106e-02, -7.099e-03, 2.545e-02, 2.484e-02, 2.973e-02, 4.563e-04, 2.010e-04, -1.839e-02));
r += mul(s1_7, M4(1.400e-04, 3.080e-02, -6.992e-03, 8.032e-02, -2.280e-02, -4.436e-02, 7.600e-02, 1.165e-02, -9.494e-02, -2.207e-02, 2.783e-01, 2.095e-01, 2.645e-02, 5.203e-02, -7.492e-02, -1.303e-02));
r += mul(s1_8, M4(-5.360e-03, -2.277e-02, -2.252e-02, -6.191e-02, 1.263e-02, 1.540e-02, 9.566e-03, 3.637e-02, -1.265e-02, -3.092e-02, -1.298e-02, -1.187e-02, 1.286e-02, 1.181e-02, -5.675e-02, -5.487e-02));
// Contributions of the s2_* inputs.
r += mul(s2_0, M4(6.275e-02, 3.332e-02, 2.458e-02, -1.910e-02, -1.764e-02, 2.292e-02, -3.220e-02, -1.127e-02, 4.114e-02, 4.303e-02, -3.355e-02, -8.882e-03, 1.881e-02, 1.788e-02, -3.354e-03, -1.345e-02));
r += mul(s2_1, M4(-1.562e-01, -7.407e-02, -5.684e-02, -3.194e-03, 1.150e-01, -2.700e-02, -9.666e-03, -3.629e-02, 5.862e-02, 6.747e-02, -1.085e-02, -3.454e-02, 7.263e-05, 2.167e-02, 5.491e-03, -6.472e-02));
r += mul(s2_2, M4(5.068e-03, 4.899e-02, 1.480e-02, -2.153e-02, 1.102e-02, 2.831e-02, -5.931e-03, 1.021e-02, -1.267e-02, -1.569e-02, 5.418e-04, 1.030e-02, -3.280e-02, -3.072e-02, -2.688e-02, -2.208e-02));
r += mul(s2_3, M4(-7.105e-02, 1.664e-03, -3.108e-02, 6.985e-02, 3.176e-02, 2.312e-02, -3.835e-02, 3.884e-02, -1.038e-01, 6.660e-02, -1.372e-01, 2.432e-02, -2.888e-04, -2.049e-02, 2.271e-02, 9.383e-03));
r += mul(s2_4, M4(4.697e-01, -3.721e-01, 1.705e-01, -2.767e-01, -1.791e-02, -7.276e-02, 2.503e-01, 1.040e-01, 1.180e-02, -5.212e-01, 4.014e-01, 1.946e-01, -2.547e-02, -1.567e-02, -5.652e-02, 9.687e-02));
r += mul(s2_5, M4(3.024e-02, 1.618e-02, 2.619e-02, 8.868e-02, -5.217e-02, -7.642e-02, -3.704e-02, -2.374e-02, -4.639e-02, 5.743e-02, -3.967e-02, -2.450e-02, 2.091e-02, -1.108e-02, 6.949e-03, -1.502e-02));
r += mul(s2_6, M4(5.298e-03, -6.810e-03, -1.982e-02, 1.960e-04, 3.645e-03, 5.483e-03, 3.357e-03, 3.697e-02, -1.339e-02, 3.253e-02, 3.649e-02, 4.492e-03, 2.076e-02, -9.046e-03, 2.043e-02, -7.803e-03));
r += mul(s2_7, M4(-2.707e-02, 7.878e-02, 1.816e-01, -1.506e-03, 9.060e-03, -1.418e-02, -6.983e-02, -5.833e-02, 3.309e-02, -2.537e-02, -3.298e-01, -1.735e-01, -2.132e-03, 5.241e-02, 1.155e-02, 4.817e-02));
r += mul(s2_8, M4(-1.781e-02, 1.652e-02, -7.188e-03, 2.114e-03, -1.105e-02, -1.137e-02, -9.037e-03, -5.600e-02, -1.220e-02, 8.292e-03, -3.404e-03, -4.211e-02, -1.018e-02, -1.004e-02, 1.505e-03, -4.591e-03));
// Contributions of the s3_* inputs.
r += mul(s3_0, M4(7.349e-02, 2.926e-02, 2.398e-02, -1.821e-02, -1.290e-02, 1.201e-02, 5.000e-03, 1.316e-02, -1.567e-02, 2.025e-02, -2.171e-02, -3.941e-04, -7.948e-03, 6.116e-02, -9.445e-03, 1.911e-02));
r += mul(s3_1, M4(-1.294e-01, -6.121e-02, -4.576e-02, 9.211e-03, 1.371e-02, -1.964e-02, -3.133e-03, 4.701e-03, 9.544e-02, 6.692e-03, 4.665e-04, -2.056e-02, 3.455e-01, -2.495e-01, 1.027e-02, -7.393e-02));
r += mul(s3_2, M4(1.532e-02, 9.402e-03, 6.812e-04, -3.241e-02, 1.245e-03, 6.504e-03, 3.970e-03, 7.168e-03, 9.435e-03, 1.574e-02, -5.118e-03, 5.232e-03, -2.659e-02, -6.011e-02, -2.446e-02, 8.062e-04));
r += mul(s3_3, M4(-6.714e-02, 3.454e-03, -1.486e-02, 5.921e-02, 5.177e-02, 3.766e-02, -1.473e-01, 4.371e-02, -7.118e-02, 2.462e-02, -1.810e-02, 3.430e-02, -5.552e-02, 3.047e-02, -5.066e-02, 5.769e-02));
r += mul(s3_4, M4(3.191e-02, -1.387e-01, -5.992e-02, -1.554e-01, 4.660e-01, 3.655e-01, 2.406e-02, -3.902e-01, 3.973e-02, -1.333e-01, 1.792e-01, 8.854e-02, 2.477e-01, -3.115e-01, 6.035e-01, -4.717e-01));
r += mul(s3_5, M4(7.241e-02, 1.273e-01, 6.810e-02, 1.118e-01, -5.454e-02, -1.728e-02, -1.007e-01, -2.265e-02, -4.534e-02, -5.171e-02, -2.524e-02, -3.337e-02, -2.366e-03, -1.723e-02, 2.300e-02, -9.889e-02));
r += mul(s3_6, M4(3.383e-02, -7.898e-03, 1.681e-02, -5.131e-03, 3.687e-02, 1.929e-02, -7.695e-03, 2.145e-03, -2.814e-02, 3.366e-02, -8.788e-02, 3.614e-02, 2.951e-02, -6.964e-03, 2.272e-02, 1.581e-02));
r += mul(s3_7, M4(7.020e-03, 6.046e-02, 2.975e-02, 5.663e-02, 2.155e-02, -1.786e-02, -2.588e-01, -1.310e-01, -6.372e-02, -1.218e-01, -7.160e-02, -3.058e-01, 2.297e-03, 3.050e-02, -2.346e-02, 4.674e-02));
r += mul(s3_8, M4(-1.071e-02, -1.089e-02, 9.286e-03, 5.202e-02, -2.291e-02, -2.655e-02, 2.386e-02, -3.231e-02, 4.599e-03, 1.114e-02, -1.630e-02, -1.693e-02, -2.194e-03, 1.842e-02, -2.522e-02, 5.265e-02));
// Bias term.
r += V4(-1.667e-03, -2.914e-03, -1.783e-03, -1.113e-03);
// tanh bounds each component to (-1, 1) before Pass6 adds it to the luma channel.
return tanh(r);
}
// Pass 6 entry point ("out-shuffle").
// Each thread evaluates f0 once at an input-resolution position and uses its four
// components as per-pixel luma offsets for a 2x2 block of OUTPUT pixels: the
// INPUT colour is sampled with the linear sampler, converted to YUV, the offset
// is added to Y (clamped to [0, 1]), and the result is converted back to RGB.
void Pass6(uint2 blockStart, uint3 tid) {
float2 pt = float2(GetInputPt());
// Thread coordinates are doubled because every thread owns a 2x2 output block.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
uint2 size = GetOutputSize();
// Skip threads whose block starts outside the output image.
if (gxy.x >= size.x || gxy.y >= size.y) {
return;
}
// (gxy >> 1) is the input-resolution pixel this block belongs to; `pos` and `pt`
// are referenced by the l0()/l1() macros (through O(), defined outside this excerpt).
float2 pos = ((gxy >> 1) + 0.5) * pt;
// 3x3 neighbourhood of t0.
V4 s0_0 = l0(-1.0, -1.0);
V4 s0_1 = l0(0.0, -1.0);
V4 s0_2 = l0(1.0, -1.0);
V4 s0_3 = l0(-1.0, 0.0);
V4 s0_4 = l0(0.0, 0.0);
V4 s0_5 = l0(1.0, 0.0);
V4 s0_6 = l0(-1.0, 1.0);
V4 s0_7 = l0(0.0, 1.0);
V4 s0_8 = l0(1.0, 1.0);
// s1_* keep the negative part of each t0 sample (zero where the sample is positive).
V4 s1_0 = -max(-s0_0, 0.0);
V4 s1_1 = -max(-s0_1, 0.0);
V4 s1_2 = -max(-s0_2, 0.0);
V4 s1_3 = -max(-s0_3, 0.0);
V4 s1_4 = -max(-s0_4, 0.0);
V4 s1_5 = -max(-s0_5, 0.0);
V4 s1_6 = -max(-s0_6, 0.0);
V4 s1_7 = -max(-s0_7, 0.0);
V4 s1_8 = -max(-s0_8, 0.0);
// s0_* are then reduced in place to the positive part.
s0_0 = max(s0_0, 0.0);
s0_1 = max(s0_1, 0.0);
s0_2 = max(s0_2, 0.0);
s0_3 = max(s0_3, 0.0);
s0_4 = max(s0_4, 0.0);
s0_5 = max(s0_5, 0.0);
s0_6 = max(s0_6, 0.0);
s0_7 = max(s0_7, 0.0);
s0_8 = max(s0_8, 0.0);
// 3x3 neighbourhood of t1, split the same way into s3_* (negative) and s2_* (positive).
V4 s2_0 = l1(-1.0, -1.0);
V4 s2_1 = l1(0.0, -1.0);
V4 s2_2 = l1(1.0, -1.0);
V4 s2_3 = l1(-1.0, 0.0);
V4 s2_4 = l1(0.0, 0.0);
V4 s2_5 = l1(1.0, 0.0);
V4 s2_6 = l1(-1.0, 1.0);
V4 s2_7 = l1(0.0, 1.0);
V4 s2_8 = l1(1.0, 1.0);
V4 s3_0 = -max(-s2_0, 0.0);
V4 s3_1 = -max(-s2_1, 0.0);
V4 s3_2 = -max(-s2_2, 0.0);
V4 s3_3 = -max(-s2_3, 0.0);
V4 s3_4 = -max(-s2_4, 0.0);
V4 s3_5 = -max(-s2_5, 0.0);
V4 s3_6 = -max(-s2_6, 0.0);
V4 s3_7 = -max(-s2_7, 0.0);
V4 s3_8 = -max(-s2_8, 0.0);
s2_0 = max(s2_0, 0.0);
s2_1 = max(s2_1, 0.0);
s2_2 = max(s2_2, 0.0);
s2_3 = max(s2_3, 0.0);
s2_4 = max(s2_4, 0.0);
s2_5 = max(s2_5, 0.0);
s2_6 = max(s2_6, 0.0);
s2_7 = max(s2_7, 0.0);
s2_8 = max(s2_8, 0.0);
// r holds four luma offsets, one per pixel of the 2x2 block:
// x -> (0, 0), y -> (+1, 0), z -> (0, +1), w -> (+1, +1).
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
// Colour-space matrices; only the first (luma) component is modified below.
static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
float2 opt = float2(GetOutputPt());
// Step back half an output texel: presumably moves `pos` from the centre of the
// input texel to the centre of the first output pixel of the block.
pos -= 0.5f * opt;
// Pixel (0, 0) of the block uses r.x.
float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);
// Pixel (+1, 0) uses r.y.
++gxy.x;
pos.x += opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);
// Pixel (+1, +1) uses r.w; the block is walked clockwise, so w comes before z.
++gxy.y;
pos.y += opt.y;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);
// Pixel (0, +1) uses r.z.
--gxy.x;
pos.x -= opt.x;
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

View file

@ -0,0 +1,921 @@
// CuNNy 4x8C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D08N04
//!TEXTURE
// Source image of the effect; pass 1 lists it in its //!IN directive.
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
// Result image, declared at twice the input width and height.
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
// Point-filtered sampler; the O() macro in the COMMON block samples with it.
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
// Linear-filtered sampler.
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with the point sampler SP at `pos + p * pt`, mip level 0.
// `pos` and `pt` are not parameters: they must be local variables at the expansion
// site (the pass entry points set pt from GetInputPt() and pos to (gxy + 0.5) * pt).
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Shorthands for the minimum-precision 4-vector and 4x4 matrix types used by the layers.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
// t0..t3 are intermediate textures at input resolution with four signed-normalized
// 8-bit channels each. Pass 1 writes t0 and t1.
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
// Pass 2 reads t0/t1 and writes t2 and t3.
Texture2D t2;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1
// l0(x, y): point-sample INPUT at offset (x, y) via O(), reduce its RGB value to one
// scalar with a fixed weighted sum (dot product) plus a constant offset, and convert
// it to min16float. This scalar is the per-tap input of the pass 1 layer functions.
#define l0(x, y) min16float((dot(float3(2.329e-01, 4.438e-01, 9.598e-02), O(INPUT, float2(x, y)).rgb) + -5.664e-01))
// Pass 1 layer, first output group (Pass1 stores the result in t0).
// Each of the nine scalar taps s0_0..s0_8 (the 3x3 neighbourhood produced by l0())
// scales its own constant weight vector; the products are summed and a constant
// bias vector is added. No activation is applied here.
// The constants are presumably trained weights emitted by a generator.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(-1.368e-01, -5.123e-02, -2.270e-01, -9.888e-02) * s0_0;
r += V4(3.682e-01, 4.625e-02, 1.372e-01, 3.834e-01) * s0_1;
r += V4(-9.245e-02, 7.555e-03, 3.923e-02, 1.252e-02) * s0_2;
r += V4(-2.312e-01, 2.012e-02, 1.660e-01, 4.386e-01) * s0_3;
r += V4(-3.965e-02, -4.834e-01, 3.729e-01, -7.207e-01) * s0_4;
r += V4(2.190e-01, -9.021e-02, -1.087e-01, -9.632e-03) * s0_5;
r += V4(4.088e-02, 1.183e-01, 8.976e-02, -1.710e-03) * s0_6;
r += V4(-5.188e-03, 5.274e-01, -8.856e-02, -6.446e-03) * s0_7;
r += V4(-7.160e-02, -9.349e-02, -3.823e-01, 1.947e-03) * s0_8;
// Bias term.
r += V4(3.244e-02, 2.492e-04, 8.562e-04, 1.261e-04);
return r;
}
// Pass 1 layer, second output group (Pass1 stores the result in t1).
// Same structure as the sibling f0: nine scalar taps each scale a constant weight
// vector, the products are summed and a constant bias vector is added; the sum is
// returned without an activation.
V4 f1(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(2.403e-02, 8.569e-03, -8.618e-02, 2.022e-02) * s0_0;
r += V4(4.893e-01, 2.383e-02, 2.423e-02, -3.486e-01) * s0_1;
r += V4(-3.682e-02, 2.437e-03, 1.872e-01, 1.135e-01) * s0_2;
r += V4(-2.361e-02, 2.588e-02, 7.348e-02, -8.229e-03) * s0_3;
r += V4(-4.433e-01, -5.131e-01, -3.778e-01, 6.107e-02) * s0_4;
r += V4(-4.423e-02, 2.098e-02, 9.260e-03, 4.444e-02) * s0_5;
r += V4(-1.370e-02, 1.009e-02, 3.020e-01, 1.159e-02) * s0_6;
r += V4(-3.030e-03, 8.145e-03, -2.789e-02, -7.085e-03) * s0_7;
r += V4(2.648e-02, 4.731e-03, -1.067e-01, -4.477e-03) * s0_8;
// Bias term.
r += V4(-1.971e-02, 8.202e-02, 4.706e-03, -6.665e-02);
return r;
}
// Pass 1 entry point ("in"). Each thread handles one input pixel: it gathers the
// nine scalar l0() taps of the 3x3 neighbourhood around that pixel and writes the
// two four-channel results of f0 and f1 to t0 and t1.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 inputSize = GetInputSize();
	// Threads mapped outside the input image have nothing to write.
	if (any(gxy >= inputSize)) {
		return;
	}
	// The O() macro behind l0() reads `pt` and `pos` by name, so the names are fixed.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (gxy + 0.5) * pt;

	// 3x3 taps in row-major order, y = -1 row first.
	const min16float tap0 = l0(-1.0, -1.0);
	const min16float tap1 = l0(0.0, -1.0);
	const min16float tap2 = l0(1.0, -1.0);
	const min16float tap3 = l0(-1.0, 0.0);
	const min16float tap4 = l0(0.0, 0.0);
	const min16float tap5 = l0(1.0, 0.0);
	const min16float tap6 = l0(-1.0, 1.0);
	const min16float tap7 = l0(0.0, 1.0);
	const min16float tap8 = l0(1.0, 1.0);

	t0[gxy] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[gxy] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
// l0(x, y) / l1(x, y): point-sample t0 / t1 via O() at offset (x, y) from the current
// pixel (the offset is scaled by `pt` inside O()) and convert the texel to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Pass 2 ("conv1") layer, first output group (Pass2 stores the result in t2).
// Takes 36 four-component vectors, multiplies each by its own constant 4x4 weight
// matrix, sums the products and adds a constant bias vector. The sum is returned
// as is; no activation is applied inside this function.
// Pass2 supplies the arguments as: s0_* = positive parts of the 3x3 t0 neighbourhood,
// s1_* = negative parts of the same samples, s2_* / s3_* = the same for t1.
// The constants are presumably trained weights emitted by a generator; the
// accumulation order is kept exactly as generated.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// Contributions of the s0_* inputs.
r += mul(s0_0, M4(1.205e-01, 8.504e-02, -7.328e-02, 1.539e-01, -9.103e-03, -2.708e-02, -1.401e-01, -2.159e-01, -2.552e-01, 7.462e-02, 5.919e-02, 8.905e-02, 1.169e-01, -4.383e-03, -1.997e-01, -1.379e-01));
r += mul(s0_1, M4(2.844e-02, 2.238e-02, 2.143e-01, -1.624e-01, 1.885e-01, 1.316e-01, -1.276e-01, -1.713e-01, 2.553e-03, -1.343e-01, 4.700e-02, 4.762e-01, -2.676e-01, 1.784e-01, -4.065e-02, 1.015e-01));
r += mul(s0_2, M4(-4.442e-03, 3.253e-01, 2.650e-02, -2.907e-01, 2.749e-01, -3.510e-01, 8.545e-02, -2.446e-01, -1.579e-01, 9.398e-02, -4.544e-02, -9.123e-02, -2.529e-01, -2.538e-01, -2.686e-01, 2.607e-01));
r += mul(s0_3, M4(1.518e-01, -1.515e-01, -1.597e-01, 2.163e-01, -6.933e-02, 7.220e-02, 2.114e-01, -2.227e-01, -3.743e-01, 9.056e-02, 2.612e-02, 3.036e-01, -1.583e-02, -8.293e-02, -1.068e-01, 6.201e-02));
r += mul(s0_4, M4(-2.305e-02, 9.029e-02, -1.003e-01, -2.375e-01, -1.891e-01, 3.623e-01, -2.999e-01, -4.511e-01, 1.460e-01, -3.825e-01, 1.231e-01, 6.391e-01, -6.041e-01, 5.588e-01, -3.508e-01, -3.131e-01));
r += mul(s0_5, M4(8.812e-02, 2.197e-01, -8.630e-03, 2.287e-02, -1.918e-01, -6.428e-01, 1.496e-01, 2.272e-01, 3.445e-02, -7.188e-03, -8.518e-02, 1.948e-01, 1.606e-01, -8.707e-01, 2.092e-02, -4.993e-01));
r += mul(s0_6, M4(9.718e-03, 8.373e-03, 7.436e-02, -1.552e-01, 8.410e-02, -1.728e-02, -1.971e-01, 2.255e-02, -8.645e-02, 1.863e-02, -9.399e-02, -8.424e-02, -1.533e-03, 1.223e-01, 2.715e-01, -1.268e-01));
r += mul(s0_7, M4(-4.246e-01, -1.034e-01, 3.236e-01, 5.680e-01, -1.213e-02, 1.577e-01, -9.408e-02, -7.294e-02, -6.410e-02, 4.264e-02, -8.392e-03, 2.192e-01, 1.656e-01, 4.681e-02, 9.146e-01, -6.311e-02));
r += mul(s0_8, M4(-1.847e-01, -9.105e-02, -3.260e-02, 2.506e-01, -6.470e-02, 4.430e-02, -1.242e-02, -1.097e-01, 5.488e-02, 9.106e-02, 3.144e-02, -3.367e-05, 2.468e-01, -2.535e-01, 1.409e-01, -5.311e-01));
// Contributions of the s1_* inputs.
r += mul(s1_0, M4(1.294e-01, 1.098e-01, 7.497e-03, 1.016e-01, 1.377e-02, -1.480e-02, -2.694e-02, -3.417e-02, -1.083e-01, -2.575e-03, 1.137e-01, -2.616e-01, -1.260e-01, -2.567e-02, -1.958e-01, 6.103e-02));
r += mul(s1_1, M4(-1.355e-01, 1.168e-01, 2.368e-01, -2.379e-01, 8.556e-01, 1.401e-01, 3.238e-01, 2.737e-01, 8.041e-02, -1.662e-01, 9.181e-02, -3.488e-01, -1.586e-01, 1.407e-01, -1.126e-01, 1.825e-01));
r += mul(s1_2, M4(-1.881e-02, 4.604e-01, -1.712e-02, 3.453e-02, 3.171e-01, -1.126e-01, 6.510e-02, 2.908e-01, -9.125e-02, 7.793e-02, -5.580e-02, -3.603e-01, 9.996e-02, -2.647e-01, -2.114e-01, 2.330e-01));
r += mul(s1_3, M4(2.957e-01, -1.252e-01, -2.840e-01, 1.815e-01, -2.900e-01, 1.027e-01, 1.404e-01, -1.123e-01, -1.767e-01, 1.535e-03, -3.568e-03, -2.824e-01, 2.015e-01, -7.712e-02, -6.140e-02, 6.517e-02));
r += mul(s1_4, M4(-2.439e-01, 7.096e-02, -2.116e-01, -1.980e-01, -3.221e-01, 2.007e-01, -4.243e-01, -5.013e-01, 1.181e-01, -3.735e-01, 1.812e-01, -5.095e-01, 3.646e-01, 4.013e-01, -8.028e-02, 1.287e-01));
r += mul(s1_5, M4(-8.389e-02, -1.091e-01, 6.962e-02, 2.605e-01, -3.435e-03, -5.146e-01, 4.125e-01, 5.487e-01, -1.481e-01, 6.810e-02, -1.450e-01, -9.583e-02, 3.305e-01, -1.238e+00, 2.036e-01, 1.879e-01));
r += mul(s1_6, M4(-8.033e-02, 5.944e-03, 2.453e-01, -2.971e-01, -5.652e-02, -1.251e-02, -1.449e-01, -5.344e-02, -1.377e-01, 9.383e-03, -1.862e-01, -2.528e-01, -3.825e-02, 7.296e-02, 2.373e-01, -1.935e-01));
r += mul(s1_7, M4(-1.795e-01, 1.597e-01, 2.709e-01, -3.738e-01, 2.604e-02, 1.678e-01, -8.718e-02, -9.483e-03, -3.844e-02, 6.235e-02, -1.344e-01, 1.837e-02, -3.074e-02, 2.568e-02, 1.030e+00, 1.831e-01));
r += mul(s1_8, M4(4.299e-02, 6.530e-03, -2.571e-02, 3.382e-01, -1.327e-01, 2.975e-02, -2.861e-02, 1.963e-01, 8.130e-04, 9.743e-02, -1.177e-02, -1.273e-01, -1.265e-01, -3.003e-01, 2.635e-01, 5.426e-02));
// Contributions of the s2_* inputs.
r += mul(s2_0, M4(-1.538e-01, 1.580e-01, 1.392e-01, -1.077e-01, -1.228e-01, 1.853e-01, -1.010e-01, 3.144e-02, 2.203e-01, -3.309e-02, 6.819e-02, 2.708e-01, 1.720e-01, 2.635e-01, -1.290e-01, -2.932e-01));
r += mul(s2_1, M4(1.615e-01, -1.424e-01, -2.346e-01, -1.008e-01, 1.386e-01, -2.281e-01, -1.313e-01, -5.902e-02, -3.376e-02, 1.925e-01, -1.172e-01, 7.865e-02, 2.112e-01, -7.280e-02, -1.953e-01, -1.198e-02));
r += mul(s2_2, M4(1.280e-01, -1.353e-01, 1.251e-01, 3.212e-02, -1.144e-01, -1.492e-01, -1.499e-01, 2.211e-01, 1.307e-01, 1.336e-01, 1.977e-01, -1.429e-02, -5.395e-02, -2.772e-02, -3.214e-01, -1.907e-01));
r += mul(s2_3, M4(-2.703e-01, 3.122e-01, 1.951e-01, -2.005e-01, 1.463e-01, 3.000e-01, 1.058e-01, 8.352e-02, 1.567e-01, -1.256e-01, -1.854e-01, -2.018e-01, 3.248e-01, 8.780e-02, 1.586e-01, -9.757e-03));
r += mul(s2_4, M4(3.941e-02, -1.430e-01, 1.023e-01, 2.878e-01, 8.414e-02, 1.385e-01, 8.032e-02, -6.330e-02, -1.020e-01, 2.731e-01, -6.877e-02, -3.492e-01, 3.758e-01, -7.526e-02, 4.955e-01, -5.595e-01));
r += mul(s2_5, M4(2.684e-01, -1.924e-02, -2.975e-02, 7.205e-01, 6.611e-02, -1.645e-01, 1.267e-01, 6.066e-02, 1.695e-01, -4.367e-01, -1.450e-01, -4.074e-02, 4.469e-01, -7.176e-03, 4.177e-01, -4.565e-01));
r += mul(s2_6, M4(-1.843e-01, 2.522e-01, 3.324e-01, -1.821e-01, -1.327e-01, 1.182e-01, 1.158e-01, -2.494e-01, -6.459e-03, -6.606e-03, 1.333e-01, 2.229e-01, 2.481e-01, -2.018e-01, 2.456e-01, 2.351e-01));
r += mul(s2_7, M4(-6.894e-03, -2.822e-01, -1.863e-01, -2.252e-01, 6.755e-02, -1.766e-01, 8.884e-02, -2.720e-03, -4.431e-02, -2.119e-02, 2.876e-01, -5.268e-01, -3.635e-01, -1.001e-01, -8.433e-01, 5.160e-01));
r += mul(s2_8, M4(-1.786e-01, 2.208e-01, 4.289e-01, 1.663e-01, -2.341e-01, 8.148e-03, -7.557e-02, 7.817e-02, -1.340e-01, -2.341e-01, 3.123e-02, 1.120e-01, -7.753e-01, 2.056e-01, -2.926e-01, -1.222e-01));
// Contributions of the s3_* inputs.
r += mul(s3_0, M4(-4.903e-02, 1.377e-01, 6.984e-02, -1.053e-02, -5.115e-01, 2.891e-01, -4.612e-01, -6.693e-01, 4.752e-02, -5.287e-02, -2.183e-02, 4.134e-01, 1.073e-02, 2.383e-01, -2.142e-01, 1.384e-01));
r += mul(s3_1, M4(1.680e-01, -1.307e-01, -1.038e-01, -2.130e-02, -1.231e+00, -2.602e-01, -5.456e-01, 3.295e-01, -5.588e-02, 1.505e-01, -4.784e-02, -1.493e-01, 1.202e-01, -2.349e-01, -1.452e-01, -5.111e-02));
r += mul(s3_2, M4(-8.858e-02, -1.293e-01, 9.441e-02, -1.295e-01, -3.373e-01, -1.841e-01, -1.818e-01, 1.570e+00, -8.336e-02, 2.012e-01, 1.362e-01, 1.830e-01, -6.053e-02, -1.725e-03, -2.011e-01, -1.021e-01));
r += mul(s3_3, M4(-2.017e-01, 3.505e-01, 3.541e-02, 2.044e-01, -3.839e-01, 5.124e-01, 1.104e-01, 1.311e-01, 1.022e-01, -1.111e-01, -2.883e-01, 1.086e-01, 9.932e-02, 1.308e-01, 2.954e-01, -1.416e-02));
r += mul(s3_4, M4(6.088e-02, -4.532e-02, -1.302e-01, -1.067e-01, -4.196e+00, 7.383e-01, -2.786e-01, -2.053e+00, -3.758e-01, 2.955e-01, -1.898e-01, 1.875e-01, 1.263e-01, 9.931e-03, 1.016e-01, 5.201e-02));
r += mul(s3_5, M4(9.722e-03, -5.478e-02, -1.823e-01, -3.983e-02, -2.434e+00, -4.700e-01, 4.168e-01, 3.938e-01, 1.251e-01, -2.933e-01, -2.054e-02, 8.827e-02, 2.048e-02, 6.212e-02, 1.448e-01, 1.042e-01));
r += mul(s3_6, M4(-1.605e-02, 1.851e-01, 2.427e-01, 4.894e-02, -6.032e-01, -3.413e-02, 4.158e-01, 6.903e-01, -1.865e-02, -1.318e-02, 1.003e-01, 3.193e-01, 4.503e-02, 1.880e-01, -4.608e-02, -3.137e-01));
r += mul(s3_7, M4(-4.125e-02, -1.494e-01, 8.853e-01, -1.540e-01, -2.445e-01, 2.292e-01, 1.684e+00, 1.098e+00, 5.576e-02, -8.241e-02, 2.507e-01, -1.086e-01, 1.392e-01, -2.115e-01, -2.600e-01, 9.268e-02));
r += mul(s3_8, M4(5.677e-02, 9.206e-02, 5.863e-02, 5.663e-02, -2.019e+00, -1.006e-01, -1.769e-01, -3.617e-01, 1.293e-02, -2.766e-01, 2.843e-02, 3.331e-01, -2.316e-01, -1.762e-01, -6.013e-03, -2.482e-02));
// Bias term.
r += V4(3.430e-02, -1.031e-02, -1.631e-02, -3.189e-02);
return r;
}
// Pass 2 ("conv1") layer, second output group (Pass2 stores the result in t3).
// Same structure as the sibling f0 with a different set of constants: 36 input
// vectors, each multiplied by its own 4x4 weight matrix, summed, plus a bias
// vector. The sum is returned without an activation.
// Argument layout (as passed by Pass2): s0_* / s1_* = positive / negative parts of
// the 3x3 t0 neighbourhood, s2_* / s3_* = positive / negative parts of the t1 one.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// Contributions of the s0_* inputs.
r += mul(s0_0, M4(3.260e-02, 1.675e-01, 8.130e-02, -2.153e-01, -1.987e-01, -9.443e-02, 3.512e-01, 2.289e-02, 9.481e-02, -1.921e-01, -3.818e-01, 1.373e-01, -9.032e-02, 7.892e-02, 1.392e-01, -6.033e-02));
r += mul(s0_1, M4(-8.203e-02, -1.015e-01, -1.313e-02, -5.337e-02, -2.948e-01, -2.678e-01, -2.321e-01, -5.995e-01, 1.364e-01, 1.030e-01, 1.546e-01, -1.179e-02, 1.996e-01, 2.244e-01, -2.304e-01, -1.304e-02));
r += mul(s0_2, M4(-2.319e-02, -2.236e-02, 3.976e-02, 1.804e-01, 6.474e-02, 1.315e-01, -1.456e-02, -1.538e-01, 3.061e-02, -1.998e-02, -1.918e-02, -8.662e-02, -1.980e-01, -1.596e-01, -4.624e-01, -3.728e-01));
r += mul(s0_3, M4(-3.171e-03, -2.887e-02, 3.107e-01, -8.532e-02, 1.489e-02, -2.798e-01, -2.458e-02, 2.922e-01, 5.196e-02, 2.333e-02, -4.100e-01, 3.851e-01, 8.566e-02, 1.655e-01, 3.680e-01, -3.572e-01));
r += mul(s0_4, M4(4.618e-02, -3.100e-02, -1.849e-01, 2.228e-02, -2.182e-01, -5.806e-01, -6.298e-02, 2.421e-01, 4.266e-01, 7.738e-02, 4.856e-03, -1.191e-01, 3.469e-01, -8.683e-02, -2.397e-01, 6.512e-02));
r += mul(s0_5, M4(8.363e-02, -9.745e-02, 2.398e-01, -1.335e-01, -1.585e-01, -1.161e-02, 2.482e-02, 1.319e-03, -4.696e-02, -6.675e-02, -7.519e-02, 1.125e-01, -1.199e-01, -9.094e-03, -2.590e-01, -8.812e-01));
r += mul(s0_6, M4(7.745e-02, 3.414e-02, 6.378e-02, -8.388e-02, 4.456e-02, 1.354e-02, -1.138e-02, 1.131e-01, 2.361e-01, 1.828e-01, -2.135e-01, -1.100e-02, 1.683e-01, 2.134e-01, 1.832e-01, 8.420e-02));
r += mul(s0_7, M4(-3.223e-01, -4.870e-02, -1.457e-01, 1.996e-01, -1.632e-01, -1.811e-01, -1.625e-01, 4.046e-02, -8.959e-02, 1.432e-01, -2.360e-02, -9.415e-02, -1.547e-01, 1.379e-01, 5.098e-01, -4.069e-01));
r += mul(s0_8, M4(1.568e-01, -2.510e-02, -9.894e-02, 1.124e-01, -1.372e-01, 5.952e-03, 4.501e-02, 9.591e-03, 1.430e-01, 6.422e-02, -1.412e-03, 1.042e-02, 4.601e-02, -5.133e-02, -7.936e-02, -1.621e-01));
// Contributions of the s1_* inputs.
r += mul(s1_0, M4(1.380e-01, 1.774e-01, 2.958e-01, -2.044e-01, -2.085e-01, 7.192e-03, -7.903e-02, 6.119e-02, -3.542e-02, -1.060e-01, -1.832e-01, 3.603e-01, -3.854e-02, 5.092e-02, -1.092e-01, -2.074e-01));
r += mul(s1_1, M4(-5.638e-02, -1.659e-01, -1.006e-02, 5.355e-02, -2.243e-01, 3.533e-01, -2.130e-01, 6.480e-02, 4.462e-02, 1.065e-01, 1.598e-01, 5.025e-03, -3.810e-02, 1.012e-01, 2.123e-02, 2.124e-01));
r += mul(s1_2, M4(5.207e-02, -1.428e-01, 1.745e-01, 2.563e-01, 4.058e-01, 5.320e-02, 3.527e-03, -4.664e-02, -1.641e-03, -2.830e-02, 1.453e-02, 1.169e-01, -5.840e-01, -1.545e-01, 3.880e-01, 1.250e-01));
r += mul(s1_3, M4(-2.089e-01, 3.070e-02, 3.770e-01, -2.868e-01, -1.965e-01, -2.499e-01, -2.145e-01, 5.348e-02, -1.201e-01, -3.454e-01, -5.723e-01, 4.313e-01, -7.068e-02, -6.358e-02, -2.426e-02, -2.841e-01));
r += mul(s1_4, M4(1.315e-01, 2.464e-01, -2.505e-01, -1.589e-01, 4.124e-01, 4.860e-01, -2.493e-01, 1.201e-01, -1.304e-01, -1.620e-01, 2.228e-01, 4.485e-02, 6.945e-02, -2.261e-01, -8.190e-04, 5.678e-01));
r += mul(s1_5, M4(3.529e-01, 1.800e-02, -9.794e-02, -1.160e-01, 7.052e-01, 4.176e-01, 5.822e-02, -5.300e-02, -1.144e-01, -1.890e-01, 1.337e-01, 1.163e-01, -5.024e-01, 9.977e-01, 1.831e-01, 2.166e-02));
r += mul(s1_6, M4(-1.239e-01, 1.465e-01, 3.700e-01, -1.638e-01, -1.022e-01, -3.216e-02, -2.412e-02, -2.505e-02, 5.450e-02, -1.325e-02, -2.760e-01, 5.219e-02, -5.604e-02, 3.602e-02, -1.026e-01, 4.063e-02));
r += mul(s1_7, M4(1.669e-01, 2.580e-01, -2.923e-01, -2.497e-01, 1.135e-01, -1.599e-01, -2.419e-01, -1.202e-01, -3.903e-01, -2.141e-01, 9.642e-02, -6.096e-02, -6.762e-01, 5.614e-01, 3.076e-01, -4.187e-01));
r += mul(s1_8, M4(5.456e-02, -6.641e-02, -3.839e-01, 8.629e-02, 1.149e-01, 1.204e-02, -2.509e-02, -1.413e-03, -1.329e-02, -5.670e-02, -6.186e-02, 5.108e-02, 3.592e-02, 4.563e-01, -7.450e-02, -2.259e-01));
// Contributions of the s2_* inputs.
r += mul(s2_0, M4(1.013e-01, -2.126e-02, -1.260e-01, 8.480e-03, -3.292e-02, 6.069e-04, 4.154e-02, 5.578e-02, 1.586e-02, 8.252e-02, 1.237e-01, -1.312e-01, 1.489e-01, 2.561e-01, -9.917e-02, -1.060e-01));
r += mul(s2_1, M4(-1.285e-01, -8.314e-02, 1.521e-02, 1.037e-01, -1.021e-02, 7.112e-02, -2.319e-02, 7.051e-04, -1.101e-01, -1.896e-01, -2.458e-01, -7.399e-02, -4.133e-02, 1.606e-01, -1.511e-01, -2.425e-01));
r += mul(s2_2, M4(7.543e-02, 9.235e-02, 2.139e-01, 2.879e-01, 9.583e-02, 4.372e-02, -8.231e-02, 2.498e-01, 1.241e-01, 1.377e-02, 2.380e-01, 2.586e-02, -1.926e-01, -1.406e-01, -3.627e-01, -8.414e-02));
r += mul(s2_3, M4(9.655e-03, -9.581e-02, -6.071e-02, 2.231e-01, -1.148e-01, -3.513e-02, -2.013e-02, -1.094e-01, -1.606e-01, 9.180e-02, 3.498e-01, -2.726e-01, -7.696e-03, -4.007e-01, -8.497e-02, -6.989e-01));
r += mul(s2_4, M4(4.965e-03, -1.346e-01, -4.517e-02, 2.043e-01, -1.348e-01, 1.451e-01, 8.113e-02, -8.530e-02, -1.414e-01, 7.261e-02, -2.368e-01, 1.601e-01, -2.438e-02, -2.554e-01, 4.057e-01, -2.224e-01));
r += mul(s2_5, M4(-8.716e-02, 1.496e-01, -4.429e-02, 6.451e-01, -9.547e-03, -3.189e-02, -1.096e-01, -5.416e-02, -5.032e-01, 1.331e-01, 2.389e-02, 1.028e-01, -3.186e-01, -2.524e-01, 2.663e-02, -9.995e-03));
r += mul(s2_6, M4(-2.465e-01, 1.585e-01, 3.196e-01, -9.098e-02, 2.765e-02, -1.793e-01, 1.519e-01, -9.565e-04, -1.160e-01, -3.035e-02, -1.082e-01, 3.172e-02, 5.502e-01, -6.251e-01, -4.487e-01, 1.932e-01));
r += mul(s2_7, M4(-5.017e-01, -5.180e-01, -2.682e-01, -4.715e-01, 1.958e-02, -7.007e-02, -3.332e-02, -8.389e-02, -1.135e-01, -2.956e-02, 1.994e-01, 2.315e-02, -2.553e-01, -3.153e-03, 4.275e-01, 1.669e+00));
r += mul(s2_8, M4(1.400e-01, 6.775e-01, 5.287e-02, 2.007e-02, 1.213e-01, -1.460e-03, -2.313e-02, 1.282e-01, -8.355e-02, 2.399e-01, -5.277e-02, -1.499e-01, 7.246e-02, -2.553e-02, 2.185e-01, 8.662e-01));
// Contributions of the s3_* inputs.
r += mul(s3_0, M4(3.069e-02, -3.668e-02, -3.646e-02, 1.140e-01, -7.882e-02, 2.759e-01, 9.170e-01, 2.779e-01, 1.459e-01, 3.766e-02, -1.214e-01, 5.718e-03, -3.323e-02, 9.705e-02, -1.282e-02, -1.401e-01));
r += mul(s3_1, M4(-1.405e-02, 2.809e-02, 1.466e-01, -1.286e-01, 4.754e-01, 8.076e-01, 5.775e-02, -5.403e-01, 1.919e-01, -2.015e-01, -1.976e-01, -8.544e-02, -8.431e-02, 9.302e-02, 6.560e-02, 2.011e-02));
r += mul(s3_2, M4(2.107e-01, 2.334e-02, -2.591e-01, -1.023e-01, 6.461e-01, 1.138e+00, 3.917e-01, 2.270e-01, 4.023e-01, 6.135e-02, 4.125e-02, -5.551e-02, 1.871e-02, -1.344e-01, -1.534e-01, 1.216e-01));
r += mul(s3_3, M4(8.077e-02, -1.149e-01, 6.733e-02, -9.044e-03, -6.431e-02, -1.755e-02, 2.617e+00, 5.203e-01, 8.910e-02, 9.642e-02, 3.720e-01, -2.326e-01, -1.142e-01, -4.017e-02, 2.351e-01, -1.062e-01));
r += mul(s3_4, M4(-2.427e-01, -4.425e-03, 4.260e-01, -6.273e-02, 4.224e+00, -2.047e+00, -1.911e+00, 2.329e+00, 2.987e-01, -3.286e-01, -1.115e-01, 2.053e-01, -5.309e-02, -8.751e-02, -1.275e-02, -2.105e-01));
r += mul(s3_5, M4(-1.413e-02, -4.404e-01, -1.525e-01, -1.703e-01, -9.999e-01, 5.276e-01, 4.779e-01, -5.145e-01, 4.772e-01, 2.730e-02, -7.651e-02, -2.235e-01, -1.122e-01, -1.686e-01, 9.595e-02, -1.169e-01));
r += mul(s3_6, M4(-1.162e-01, 3.109e-01, -2.686e-01, -1.492e-01, 2.122e-01, 6.911e-01, 7.412e-01, 3.675e-02, 1.420e-01, -3.979e-02, -3.526e-02, -1.170e-01, 2.192e-01, 6.369e-02, 2.568e-01, 1.606e-02));
r += mul(s3_7, M4(-2.482e-02, 6.355e-01, 4.230e-01, -4.331e-01, -1.462e+00, -9.944e-01, 1.154e+00, 8.760e-01, 3.625e-01, 2.127e-01, 3.382e-01, 6.009e-02, 1.431e-01, 9.892e-02, -2.409e-01, 4.223e-02));
r += mul(s3_8, M4(-1.832e-02, 7.811e-02, -1.928e-02, 1.448e-01, -1.288e+00, 1.805e-01, 6.324e-01, -2.704e-02, 6.456e-02, -6.364e-02, 4.971e-02, -6.535e-03, 1.766e-01, 5.142e-02, -1.375e-01, 2.532e-01));
// Bias term.
r += V4(8.007e-03, 2.570e-02, 2.487e-03, -2.496e-02);
return r;
}
// Per-thread entry point of pass 2.
// Reads a 3x3 neighbourhood from each of the two inputs behind l0/l1, splits
// every sample into its non-negative and non-positive halves, and writes the
// results of f0/f1 to t2/t3 at this thread's pixel.
//   blockStart - origin of the block this thread group processes
//   tid        - thread id; only tid.x is used (remapped through Rmp8x8)
void Pass2(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are not referenced directly below; they are
	// presumably consumed by the l0/l1 sampling macros (defined outside this
	// function), so their names must stay as they are - confirm against O().
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();

	// Threads that land outside the input have nothing to write.
	const bool insideInput = gxy.x < size.x && gxy.y < size.y;
	if (!insideInput) {
		return;
	}

	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 samples of the first input, row by row (top-left to bottom-right).
	const V4 rawA_0 = l0(-1.0, -1.0);
	const V4 rawA_1 = l0(0.0, -1.0);
	const V4 rawA_2 = l0(1.0, -1.0);
	const V4 rawA_3 = l0(-1.0, 0.0);
	const V4 rawA_4 = l0(0.0, 0.0);
	const V4 rawA_5 = l0(1.0, 0.0);
	const V4 rawA_6 = l0(-1.0, 1.0);
	const V4 rawA_7 = l0(0.0, 1.0);
	const V4 rawA_8 = l0(1.0, 1.0);

	// Raw 3x3 samples of the second input, same tap order.
	const V4 rawB_0 = l1(-1.0, -1.0);
	const V4 rawB_1 = l1(0.0, -1.0);
	const V4 rawB_2 = l1(1.0, -1.0);
	const V4 rawB_3 = l1(-1.0, 0.0);
	const V4 rawB_4 = l1(0.0, 0.0);
	const V4 rawB_5 = l1(1.0, 0.0);
	const V4 rawB_6 = l1(-1.0, 1.0);
	const V4 rawB_7 = l1(0.0, 1.0);
	const V4 rawB_8 = l1(1.0, 1.0);

	// Non-negative half of each first-input sample: max(x, 0).
	const V4 posA_0 = max(rawA_0, 0.0);
	const V4 posA_1 = max(rawA_1, 0.0);
	const V4 posA_2 = max(rawA_2, 0.0);
	const V4 posA_3 = max(rawA_3, 0.0);
	const V4 posA_4 = max(rawA_4, 0.0);
	const V4 posA_5 = max(rawA_5, 0.0);
	const V4 posA_6 = max(rawA_6, 0.0);
	const V4 posA_7 = max(rawA_7, 0.0);
	const V4 posA_8 = max(rawA_8, 0.0);

	// Non-positive half of each first-input sample: -max(-x, 0).
	const V4 negA_0 = -max(-rawA_0, 0.0);
	const V4 negA_1 = -max(-rawA_1, 0.0);
	const V4 negA_2 = -max(-rawA_2, 0.0);
	const V4 negA_3 = -max(-rawA_3, 0.0);
	const V4 negA_4 = -max(-rawA_4, 0.0);
	const V4 negA_5 = -max(-rawA_5, 0.0);
	const V4 negA_6 = -max(-rawA_6, 0.0);
	const V4 negA_7 = -max(-rawA_7, 0.0);
	const V4 negA_8 = -max(-rawA_8, 0.0);

	// Non-negative half of each second-input sample.
	const V4 posB_0 = max(rawB_0, 0.0);
	const V4 posB_1 = max(rawB_1, 0.0);
	const V4 posB_2 = max(rawB_2, 0.0);
	const V4 posB_3 = max(rawB_3, 0.0);
	const V4 posB_4 = max(rawB_4, 0.0);
	const V4 posB_5 = max(rawB_5, 0.0);
	const V4 posB_6 = max(rawB_6, 0.0);
	const V4 posB_7 = max(rawB_7, 0.0);
	const V4 posB_8 = max(rawB_8, 0.0);

	// Non-positive half of each second-input sample.
	const V4 negB_0 = -max(-rawB_0, 0.0);
	const V4 negB_1 = -max(-rawB_1, 0.0);
	const V4 negB_2 = -max(-rawB_2, 0.0);
	const V4 negB_3 = -max(-rawB_3, 0.0);
	const V4 negB_4 = -max(-rawB_4, 0.0);
	const V4 negB_5 = -max(-rawB_5, 0.0);
	const V4 negB_6 = -max(-rawB_6, 0.0);
	const V4 negB_7 = -max(-rawB_7, 0.0);
	const V4 negB_8 = -max(-rawB_8, 0.0);

	// Argument order expected by f0/f1: first-input positive, first-input
	// negative, second-input positive, second-input negative (9 taps each).
	t2[gxy] = f0(
		posA_0, posA_1, posA_2, posA_3, posA_4, posA_5, posA_6, posA_7, posA_8,
		negA_0, negA_1, negA_2, negA_3, negA_4, negA_5, negA_6, negA_7, negA_8,
		posB_0, posB_1, posB_2, posB_3, posB_4, posB_5, posB_6, posB_7, posB_8,
		negB_0, negB_1, negB_2, negB_3, negB_4, negB_5, negB_6, negB_7, negB_8);
	t3[gxy] = f1(
		posA_0, posA_1, posA_2, posA_3, posA_4, posA_5, posA_6, posA_7, posA_8,
		negA_0, negA_1, negA_2, negA_3, negA_4, negA_5, negA_6, negA_7, negA_8,
		posB_0, posB_1, posB_2, posB_3, posB_4, posB_5, posB_6, posB_7, posB_8,
		negB_0, negB_1, negB_2, negB_3, negB_4, negB_5, negB_6, negB_7, negB_8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// First of the two V4 outputs of pass 3 (DESC: conv2); Pass3 stores it in t0.
//
// Inputs, as supplied by Pass3:
//   s0_k = max(x, 0)   and s1_k = -max(-x, 0)  where x = l0(...) (texture t2)
//   s2_k = max(x, 0)   and s3_k = -max(-x, 0)  where x = l1(...) (texture t3)
//   k = 0..8 enumerates the 3x3 offsets (-1,-1), (0,-1), (1,-1), (-1,0), (0,0),
//   (1,0), (-1,1), (0,1), (1,1) passed to l0/l1.
//
// Each input is transformed with mul() by its own 16-constant M4 and the
// results are summed; a constant V4 is added last. V4 and M4 are defined
// outside this part of the file.
// NOTE(review): the constants look like generated (trained) weights and a bias;
// presumably they should not be edited by hand - confirm with the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: non-negative half of the l0 samples.
r += mul(s0_0, M4(2.802e-01, -3.301e-02, -1.047e-01, 6.427e-02, 1.357e-02, -8.015e-02, 7.763e-02, -9.646e-02, 1.136e-01, -1.443e-01, -3.950e-02, 2.744e-01, 8.414e-03, -1.005e-01, -1.683e-01, -5.766e-02));
r += mul(s0_1, M4(2.907e-01, 1.339e-01, -7.005e-02, 9.074e-02, -2.491e-03, 6.498e-02, 1.121e-01, -9.272e-02, 3.415e-01, 1.949e-01, -2.613e-01, -2.328e-01, 1.311e-01, 1.285e-01, 1.685e-02, -4.780e-02));
r += mul(s0_2, M4(1.671e-01, -2.228e-02, -5.777e-02, -5.853e-02, 1.243e-02, -3.269e-02, 8.757e-03, -1.478e-01, -4.190e-02, 3.164e-02, 2.922e-01, -3.017e-01, -6.631e-02, 5.380e-02, -2.750e-02, -7.771e-02));
r += mul(s0_3, M4(-2.454e-02, 2.148e-01, -1.116e-01, -1.125e-01, -1.792e-01, -7.021e-01, -2.183e-01, 2.920e-01, -1.698e-01, 1.827e-01, -6.779e-02, 9.333e-02, -2.153e-01, 2.441e-01, 9.794e-02, -2.729e-01));
r += mul(s0_4, M4(-6.750e-02, 1.324e-01, -5.087e-02, 2.746e-01, 1.579e-01, -1.909e-01, -7.631e-01, -4.744e-01, -1.732e-01, -2.741e-01, 4.145e-02, -2.124e-01, 7.946e-02, -1.579e-01, 2.856e-01, 5.090e-02));
r += mul(s0_5, M4(8.392e-02, -1.504e-01, 2.815e-01, -1.174e-01, 3.942e-02, 1.918e-02, 1.561e-01, -1.457e-01, -5.976e-02, 1.230e-01, -2.539e-01, -1.965e-01, 1.869e-01, -1.795e-01, -1.283e-01, -3.447e-02));
r += mul(s0_6, M4(-3.547e-03, -6.576e-03, -5.087e-02, 3.466e-02, -3.130e-03, -3.176e-01, 8.737e-02, 4.018e-02, -6.489e-02, -1.580e-03, -8.784e-03, -4.500e-02, 2.343e-03, 5.945e-02, -5.201e-02, -3.127e-02));
r += mul(s0_7, M4(-3.546e-02, 1.145e-01, -4.773e-02, 8.280e-02, 6.746e-03, -1.036e-01, -6.616e-02, -1.224e-01, 7.156e-02, -1.941e-01, 9.307e-02, -3.567e-02, -2.215e-01, 2.437e-01, -5.542e-04, 1.208e-01));
r += mul(s0_8, M4(-1.115e-02, -4.687e-02, -3.210e-02, -1.470e-01, -4.609e-02, 4.657e-02, -6.476e-02, -1.372e-01, -4.956e-03, 1.024e-01, -2.349e-01, -8.472e-02, -2.757e-02, -1.707e-02, 2.065e-01, 1.863e-02));
// s1_*: non-positive half of the l0 samples.
r += mul(s1_0, M4(3.728e-02, -7.100e-02, -4.937e-02, 6.239e-02, -7.377e-03, -3.033e-02, 1.675e-01, -1.863e-02, -2.631e-02, -9.633e-02, -1.130e-01, -1.201e-01, 1.414e-01, -1.737e-01, -8.031e-02, -6.951e-02));
r += mul(s1_1, M4(-3.703e-02, 4.012e-02, -2.289e-02, 3.332e-02, 2.161e-02, 8.828e-02, 5.544e-02, 1.017e-01, 3.684e-01, 3.149e-01, 3.662e-01, 4.298e-02, 1.966e-01, -2.697e-02, 2.216e-02, 7.540e-02));
r += mul(s1_2, M4(-4.974e-02, -3.826e-02, -2.810e-02, -8.318e-02, 3.356e-02, -7.605e-02, -1.087e-01, 1.987e-02, -1.153e-01, -1.039e-01, -5.868e-02, -3.313e-02, -1.750e-02, 3.884e-03, -9.170e-02, -1.011e-01));
r += mul(s1_3, M4(2.119e-01, -1.340e-01, -3.650e-02, 2.219e-01, 3.634e-01, 3.474e-01, 2.302e-01, 7.494e-02, -2.253e-01, 1.239e-01, -6.032e-02, 1.293e-01, 9.583e-02, 4.424e-02, -3.920e-02, -1.870e-01));
r += mul(s1_4, M4(-2.664e-01, 8.462e-02, -4.745e-01, 1.985e-01, 2.803e-01, 7.429e-02, 7.814e-01, 4.658e-01, 3.661e-01, -2.319e-02, 3.324e-01, 2.860e-01, 3.178e-01, 9.301e-02, 1.316e-01, 4.547e-02));
r += mul(s1_5, M4(5.369e-02, 6.912e-02, 2.659e-01, -1.491e-01, 4.462e-02, -4.823e-02, 1.130e-01, 1.710e-02, -7.604e-02, -7.003e-02, 3.093e-01, 2.537e-01, 2.466e-01, -1.039e-01, 2.413e-02, -1.256e-01));
r += mul(s1_6, M4(-1.188e-01, 1.026e-01, 4.215e-02, -9.677e-02, 2.443e-03, 1.957e-01, 2.961e-02, -5.553e-02, -3.488e-02, 2.515e-02, -4.840e-03, 1.814e-02, 9.644e-02, -8.802e-02, 3.516e-03, -2.940e-03));
r += mul(s1_7, M4(-1.792e-01, 1.391e-01, 1.322e-02, -1.514e-02, -2.173e-01, 1.743e-01, 1.530e-01, 5.286e-02, -8.655e-02, 2.541e-01, 6.282e-02, 1.167e-01, 9.664e-02, 2.304e-01, -1.538e-01, -1.298e-01));
r += mul(s1_8, M4(-1.720e-01, 4.693e-02, 2.790e-01, 2.187e-02, -4.386e-02, 7.714e-03, 9.800e-02, 6.484e-03, -5.497e-02, 1.216e-01, 3.924e-02, 5.162e-02, 1.403e-01, -5.364e-03, -6.795e-03, -6.163e-02));
// s2_*: non-negative half of the l1 samples.
r += mul(s2_0, M4(2.905e-01, -3.799e-02, 1.332e-01, 2.496e-02, 7.202e-02, -3.659e-01, -2.940e-02, -1.028e-03, -1.221e-01, 1.147e-01, 3.613e-02, 9.125e-02, -8.760e-03, 1.489e-02, -9.652e-02, 4.452e-03));
r += mul(s2_1, M4(4.027e-01, -2.178e-01, -8.478e-02, 2.903e-01, 2.463e-02, 9.527e-03, -2.835e-01, 2.066e-01, -6.698e-02, -2.653e-01, -6.667e-02, 4.320e-02, -2.610e-01, -1.351e-01, 7.826e-02, -5.429e-02));
r += mul(s2_2, M4(-1.249e-01, 4.376e-02, -6.245e-02, 1.702e-01, -5.731e-02, 8.022e-02, -1.335e-01, 1.528e-01, -2.969e-02, 1.062e-01, -1.303e-01, 1.226e-01, 2.030e-02, 5.205e-02, -1.877e-01, 4.309e-02));
r += mul(s2_3, M4(-6.329e-02, -1.286e-01, -7.222e-02, 5.592e-03, -3.023e-02, 9.502e-02, -4.077e-02, -2.299e-01, -1.038e-01, -5.742e-02, -5.106e-04, 5.143e-02, 3.098e-02, -1.235e-01, 1.987e-02, 1.477e-02));
r += mul(s2_4, M4(1.113e-01, -1.761e-01, 5.038e-02, -1.304e-01, 3.668e-01, -3.430e-01, 2.169e-01, 3.877e-01, -3.750e-02, 2.473e-01, 3.416e-02, 2.184e-01, 5.168e-01, -7.132e-02, 3.818e-01, -1.508e-01));
r += mul(s2_5, M4(1.479e-01, -8.656e-02, -1.700e-01, 3.874e-01, 2.286e-02, -8.854e-02, 3.305e-02, -4.668e-03, -1.481e-01, 5.115e-02, 2.686e-01, 4.113e-01, -3.740e-01, -2.013e-01, 9.838e-04, 3.008e-01));
r += mul(s2_6, M4(3.428e-01, -3.200e-01, 7.593e-02, 1.911e-01, 1.219e-01, 1.211e-02, -5.694e-02, -5.767e-02, 3.119e-02, -7.609e-02, 6.471e-02, 1.215e-01, -2.793e-04, 1.650e-02, 7.190e-03, -4.468e-02));
r += mul(s2_7, M4(3.970e-01, -3.192e-01, -5.639e-02, 8.182e-02, -2.831e-02, 4.036e-02, 7.004e-02, 1.095e-01, -3.655e-02, 2.443e-01, 5.606e-02, -4.974e-02, 9.825e-02, 1.158e-01, -5.104e-02, -2.986e-02));
r += mul(s2_8, M4(1.440e-01, 5.504e-02, -2.020e-01, 2.618e-03, -1.098e-02, -3.678e-02, 7.661e-02, 5.652e-02, -7.426e-02, 5.461e-02, 4.239e-01, 2.093e-01, 9.316e-03, -3.679e-02, 6.108e-02, 2.036e-01));
// s3_*: non-positive half of the l1 samples.
r += mul(s3_0, M4(1.806e-02, 2.233e-02, 5.056e-02, 1.758e-01, 3.566e-02, -1.383e-01, 5.349e-02, 1.066e-01, 3.314e-02, -1.258e-01, -2.885e-02, -6.648e-02, -6.860e-03, -2.283e-02, -1.052e-01, -1.623e-02));
r += mul(s3_1, M4(7.369e-02, -3.141e-02, 3.877e-03, 8.113e-03, -1.773e-01, 5.122e-03, -3.198e-01, 9.005e-02, 7.291e-02, -1.519e-01, -1.501e-01, -8.202e-02, -4.729e-02, -2.877e-02, -4.056e-02, 7.599e-02));
r += mul(s3_2, M4(1.282e-01, 2.477e-03, 6.185e-02, 3.967e-02, -1.343e-01, 8.884e-02, 5.299e-02, -7.324e-02, 1.842e-01, -3.053e-02, -1.335e-01, -6.790e-03, -8.128e-02, 6.665e-02, 1.583e-03, -5.358e-02));
r += mul(s3_3, M4(1.135e-01, 9.360e-03, 1.646e-01, 1.844e-01, 1.104e-02, 7.072e-02, -9.632e-02, -1.169e-01, -1.458e-01, 2.540e-02, -5.132e-02, -1.627e-01, -1.066e-01, -4.819e-02, -4.340e-02, -5.074e-02));
r += mul(s3_4, M4(-1.198e-01, -7.965e-02, -2.989e-01, -4.946e-01, -1.666e-02, -2.136e-01, -3.575e-02, 1.351e-01, -8.546e-02, 2.553e-02, -7.878e-02, -3.233e-01, -2.955e-01, -7.765e-02, 1.450e-01, -2.114e-01));
r += mul(s3_5, M4(-7.593e-02, -1.849e-03, -1.688e-01, 3.626e-02, 4.408e-03, 4.014e-02, -1.401e-01, -2.239e-01, 9.538e-02, -2.310e-01, 2.831e-02, 5.065e-02, 1.135e-01, 2.542e-02, -4.365e-01, 4.393e-02));
r += mul(s3_6, M4(-5.217e-02, -1.327e-02, -1.851e-02, 2.806e-02, 4.648e-02, -9.047e-04, 2.961e-02, -2.922e-02, 6.360e-02, -3.494e-02, 2.573e-02, 1.309e-02, -2.512e-03, -4.086e-02, -2.086e-03, -6.018e-02));
r += mul(s3_7, M4(9.887e-02, -9.515e-03, 1.306e-01, 5.290e-02, 1.832e-01, -2.549e-01, -4.640e-02, -1.256e-01, 4.915e-02, -5.163e-02, 3.044e-02, -9.871e-02, 8.168e-03, -7.112e-02, -5.743e-02, 3.687e-02));
r += mul(s3_8, M4(-6.440e-02, 2.530e-02, -2.166e-03, -4.680e-02, 8.009e-02, -6.634e-02, -1.390e-01, -2.524e-02, 6.524e-02, -1.120e-01, -4.252e-02, -8.413e-03, -2.017e-02, 1.444e-02, -4.483e-02, 4.690e-02));
// Constant per-channel offset added after all 36 products.
r += V4(3.009e-03, -1.445e-03, 8.191e-03, -7.852e-03);
return r;
}
// Second of the two V4 outputs of pass 3 (DESC: conv2); Pass3 stores it in t1.
//
// Takes the same 36 inputs as the pass-3 f0 (Pass3 passes identical arguments):
//   s0_k / s1_k = max(x, 0) / -max(-x, 0) of the l0 samples (texture t2)
//   s2_k / s3_k = max(x, 0) / -max(-x, 0) of the l1 samples (texture t3)
//   k = 0..8 enumerates the 3x3 offsets from (-1,-1) to (1,1), row by row.
//
// Each input is transformed with mul() by its own 16-constant M4, the products
// are accumulated in order, and a constant V4 is added last.
// NOTE(review): the constants look like generated (trained) weights and a bias;
// presumably they should not be edited by hand - confirm with the generator.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: non-negative half of the l0 samples.
r += mul(s0_0, M4(-1.005e-01, -6.367e-02, 4.428e-02, 1.687e-02, -9.639e-02, -1.209e-01, -1.374e-02, 4.932e-02, -9.949e-02, -2.569e-01, 1.199e-01, 1.077e-02, 5.110e-02, -1.129e-01, 6.104e-02, -4.656e-03));
r += mul(s0_1, M4(-2.156e-01, 8.505e-02, 4.815e-04, -1.042e-01, -2.724e-01, -1.870e-01, 3.876e-02, 7.840e-02, -4.018e-01, -8.239e-01, 2.611e-01, -3.623e-01, -6.999e-03, 1.848e-02, 6.095e-02, -2.318e-02));
r += mul(s0_2, M4(-2.195e-01, -6.727e-02, 7.111e-02, 5.119e-02, 7.396e-02, 1.116e-02, -1.261e-02, 9.531e-02, -3.892e-01, 1.430e-01, -9.840e-02, -2.423e-01, 2.669e-01, 3.009e-02, -2.478e-02, 1.168e-01));
r += mul(s0_3, M4(-4.344e-01, 8.202e-02, 9.272e-03, -8.384e-02, -8.136e-02, -4.359e-01, 2.361e-01, -2.183e-01, 4.609e-02, -2.144e-02, 9.525e-03, -7.197e-02, -9.339e-02, 1.927e-01, -1.687e-02, 3.193e-02));
r += mul(s0_4, M4(4.702e-01, 1.415e-04, 1.097e-01, 2.415e-01, 1.899e-01, -7.324e-01, -4.745e-03, -1.237e-01, -2.043e-01, 2.674e-02, 6.899e-01, 8.700e-02, 5.083e-02, 2.271e-01, 4.884e-02, 3.767e-01));
r += mul(s0_5, M4(6.758e-02, -4.638e-02, 9.477e-02, -8.290e-02, -1.994e-01, 1.090e-01, -5.148e-02, -1.470e-01, 7.433e-02, 3.404e-01, 1.020e-01, -8.353e-02, 1.793e-01, -1.368e-01, 6.375e-02, 5.993e-02));
r += mul(s0_6, M4(1.596e-02, 3.589e-02, 1.177e-02, 1.541e-01, -1.159e-01, -1.621e-02, 2.451e-01, 2.767e-01, -3.754e-04, 4.995e-02, -6.760e-02, -9.945e-02, 4.017e-01, 4.413e-02, 2.189e-02, 4.126e-02));
r += mul(s0_7, M4(1.635e-01, -1.853e-01, -1.823e-01, -1.003e-01, -4.884e-02, 1.686e-01, 7.826e-02, 5.419e-01, -1.017e-01, 7.007e-02, 2.084e-01, 2.030e-01, 5.150e-01, -1.861e-01, -3.037e-01, -3.846e-01));
r += mul(s0_8, M4(1.162e-01, 9.675e-02, -9.807e-02, 7.794e-02, 1.154e-01, 7.680e-02, 7.823e-02, 1.665e-01, 1.414e-01, 4.509e-02, -1.327e-02, 1.752e-01, -2.721e-01, -9.636e-04, 2.198e-02, -9.405e-02));
// s1_*: non-positive half of the l0 samples.
r += mul(s1_0, M4(-3.554e-02, 7.673e-02, -1.735e-02, 3.910e-02, -9.934e-02, 1.798e-01, -4.244e-02, -2.008e-02, -1.586e-01, 7.918e-02, 6.812e-02, 1.784e-01, -2.173e-01, 8.736e-02, -3.130e-02, -1.487e-02));
r += mul(s1_1, M4(1.142e-01, 2.330e-02, -7.096e-03, 5.291e-02, -3.702e-01, 2.102e-01, 7.156e-02, -1.416e-01, 1.017e-01, 3.888e-01, -5.335e-02, 9.686e-02, -1.093e-01, -1.631e-02, -2.884e-03, -4.091e-02));
r += mul(s1_2, M4(4.795e-02, 4.423e-03, 1.494e-02, 2.666e-02, 1.261e-01, -7.251e-02, 2.103e-02, 1.095e-01, 2.166e-01, -1.249e-01, 8.981e-03, 1.792e-01, -3.697e-02, 6.864e-03, -1.141e-02, 2.430e-02));
r += mul(s1_3, M4(-1.206e-01, 1.584e-03, -1.789e-02, -1.335e-02, 2.398e-01, 8.681e-01, -1.241e-01, -4.454e-02, -7.396e-02, 1.759e-02, -9.138e-02, 1.573e-01, -2.025e-01, 8.569e-02, 2.132e-02, 9.791e-02));
r += mul(s1_4, M4(-4.834e-02, -7.974e-01, 2.858e-01, -2.441e-01, 4.163e-01, -1.650e-01, -1.897e-01, 1.309e-01, 4.031e-02, -8.242e-02, 3.338e-01, 3.567e-01, -1.532e-01, 2.807e-01, -7.324e-02, 5.093e-03));
r += mul(s1_5, M4(-1.538e-01, 9.244e-02, -7.570e-02, -4.333e-02, -1.407e-01, -4.201e-02, -4.186e-02, -1.603e-01, -2.031e-01, 6.309e-02, -8.191e-02, 9.121e-02, -8.138e-02, -4.037e-02, 3.793e-02, 4.240e-02));
r += mul(s1_6, M4(1.780e-01, 1.059e-01, -5.233e-03, 1.087e-01, 1.808e-01, -1.409e-01, 1.162e-02, -1.312e-01, 6.866e-02, 1.401e-02, 6.420e-02, 5.614e-02, -6.830e-02, 1.731e-02, 5.889e-02, 2.257e-02));
r += mul(s1_7, M4(2.057e-01, -2.093e-02, -1.741e-01, 9.891e-02, -3.673e-02, 3.314e-02, -2.223e-01, -3.177e-01, 2.374e-01, -5.871e-02, -5.086e-02, -9.418e-02, -1.935e-02, -1.902e-02, -1.255e-01, -2.744e-01));
r += mul(s1_8, M4(1.654e-01, 7.328e-02, 2.874e-02, 1.256e-01, -2.608e-01, 1.926e-03, 4.500e-02, -7.882e-02, -1.035e-02, -3.478e-02, -1.061e-01, -8.474e-02, -2.438e-01, -6.889e-02, -7.579e-02, -1.871e-01));
// s2_*: non-negative half of the l1 samples.
r += mul(s2_0, M4(6.493e-02, 1.357e-01, -6.197e-02, -5.055e-02, 2.568e-01, -5.699e-02, -1.266e-01, -1.411e-02, 2.936e-02, -5.234e-02, -5.882e-03, -8.014e-02, -5.334e-02, -8.555e-02, 5.632e-02, 8.296e-03));
r += mul(s2_1, M4(-3.582e-01, 2.351e-01, -1.636e-01, 2.172e-01, -1.840e-01, 9.838e-02, -7.565e-02, 1.535e-01, 8.151e-02, 3.002e-02, 1.149e-01, 1.180e-01, 1.323e-01, -7.682e-03, 5.013e-02, -2.190e-02));
r += mul(s2_2, M4(-1.957e-01, -5.823e-02, -1.131e-01, -7.025e-02, 3.355e-01, 1.378e-01, -2.046e-01, 2.575e-01, 1.663e-01, 2.567e-02, -3.703e-02, -9.489e-02, -6.431e-02, -6.700e-02, 9.598e-02, 4.460e-03));
r += mul(s2_3, M4(-1.522e-01, 1.335e-01, -2.140e-01, 3.368e-02, -5.076e-02, 2.412e-01, 6.141e-03, 2.456e-02, -9.105e-03, 1.014e-02, -1.056e-02, 1.368e-01, 8.030e-02, -2.874e-02, -7.499e-02, -2.675e-02));
r += mul(s2_4, M4(2.115e-02, -6.849e-02, -8.528e-02, -3.270e-01, 2.112e-02, 7.309e-02, -3.852e-02, 2.604e-01, 1.772e-01, 4.115e-01, -2.443e-01, 3.100e-01, 3.139e-01, 3.829e-01, -2.701e-01, 1.463e-01));
r += mul(s2_5, M4(2.664e-03, 4.352e-02, -2.378e-01, 5.316e-02, -1.369e-01, -1.293e-01, 1.587e-01, 2.153e-01, 3.820e-01, -1.515e-01, -4.429e-02, 2.391e-01, -3.720e-01, -1.154e-01, -1.196e-01, 3.172e-01));
r += mul(s2_6, M4(-3.174e-01, -2.340e-01, 1.286e-01, -1.076e-01, 5.834e-02, 6.138e-02, -6.854e-03, 5.658e-02, 5.314e-02, -1.751e-02, 9.115e-03, 8.328e-03, 8.394e-03, 2.608e-02, 1.125e-01, 1.593e-01));
r += mul(s2_7, M4(-6.600e-01, 1.899e-01, 1.094e-01, 1.665e-02, 1.089e-01, -1.034e-01, -1.811e-01, -3.040e-01, 4.782e-01, 3.160e-02, -4.648e-02, 1.286e-01, 1.070e-01, -1.022e-01, 5.693e-02, -5.195e-02));
r += mul(s2_8, M4(3.748e-03, -4.142e-02, -7.021e-02, -2.596e-01, -2.444e-01, -6.341e-05, 4.125e-02, -7.382e-02, 4.456e-02, 3.144e-02, -5.055e-02, -1.724e-01, -1.835e-01, 4.462e-02, -1.398e-01, -2.631e-02));
// s3_*: non-positive half of the l1 samples.
r += mul(s3_0, M4(-1.892e-01, -2.298e-01, 7.045e-02, -6.423e-02, 7.789e-02, -9.540e-02, -3.161e-02, -5.171e-02, -3.656e-02, -6.148e-02, -1.413e-02, -8.995e-02, 2.536e-02, 1.995e-03, 3.317e-02, 1.918e-02));
r += mul(s3_1, M4(1.245e-02, -4.971e-03, 1.026e-02, -7.525e-02, -2.233e-01, -4.502e-01, -4.530e-03, -1.802e-01, -1.799e-01, 1.915e-02, 1.043e-02, 4.008e-02, 1.524e-01, 1.881e-03, -7.387e-02, 1.566e-02));
r += mul(s3_2, M4(-1.750e-01, 3.216e-03, -1.033e-03, -7.055e-02, -1.263e-01, 1.586e-01, 2.603e-02, -1.282e-01, 5.606e-02, -1.498e-02, -3.338e-02, -8.978e-03, -2.218e-02, -5.852e-02, -3.208e-03, -1.352e-02));
r += mul(s3_3, M4(-9.577e-02, -8.859e-02, 7.921e-02, -1.569e-02, -7.962e-02, 2.890e-02, 4.107e-02, -5.870e-02, 2.510e-02, 1.765e-02, 4.458e-02, 1.891e-02, 7.541e-02, 3.492e-02, 3.160e-02, 1.201e-02));
r += mul(s3_4, M4(-6.228e-02, 9.576e-02, -1.743e-01, -1.935e-01, 2.054e-01, 1.479e-01, 8.056e-04, 3.321e-02, -1.362e-01, 5.003e-01, 9.071e-02, 8.153e-02, 2.283e-01, -3.484e-01, 4.509e-02, -4.658e-01));
r += mul(s3_5, M4(2.528e-01, -9.286e-04, -2.468e-02, 1.338e-01, 4.431e-02, 3.503e-02, 1.304e-01, 1.652e-01, 4.628e-01, -2.670e-01, 1.880e-01, 1.516e-01, -1.538e-01, 1.379e-01, -3.334e-02, 2.977e-02));
r += mul(s3_6, M4(1.385e-01, -6.592e-02, -1.225e-01, -1.381e-01, -4.498e-02, -6.343e-03, 4.811e-02, 9.639e-02, 1.635e-02, -3.467e-02, 3.640e-03, -3.186e-02, 6.265e-02, 2.282e-01, 9.661e-02, 1.295e-01));
r += mul(s3_7, M4(-3.053e-03, 7.999e-02, 2.407e-01, 2.655e-01, -3.969e-01, -9.502e-03, 1.900e-02, 9.557e-02, -6.199e-02, -3.574e-02, 8.350e-02, -7.837e-02, -1.442e-02, -5.281e-03, 4.503e-01, 4.026e-01));
r += mul(s3_8, M4(-1.313e-01, 4.424e-02, -1.155e-02, 6.769e-02, 2.192e-02, 6.721e-02, 5.694e-03, 7.376e-02, -2.155e-01, -7.512e-02, 6.252e-03, -3.428e-01, 3.324e-01, 2.784e-03, -5.606e-02, 2.108e-01));
// Constant per-channel offset added after all 36 products.
r += V4(-6.039e-04, -3.875e-03, -3.020e-03, 2.282e-03);
return r;
}
// Per-thread entry point of pass 3 (DESC: conv2, IN t2/t3, OUT t0/t1).
// Reads a 3x3 neighbourhood from t2 (via l0) and t3 (via l1), splits every
// sample into its non-negative and non-positive halves, and writes the results
// of f0/f1 to t0/t1 at this thread's pixel.
//   blockStart - origin of the block this thread group processes
//   tid        - thread id; only tid.x is used (remapped through Rmp8x8)
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are not referenced directly below; they are
	// presumably consumed by the O() macro behind l0/l1, so their names must
	// stay as they are - confirm against the definition of O().
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();

	// Threads that land outside the input have nothing to write.
	const bool insideInput = gxy.x < size.x && gxy.y < size.y;
	if (!insideInput) {
		return;
	}

	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 samples of t2, row by row (top-left to bottom-right).
	const V4 rawA_0 = l0(-1.0, -1.0);
	const V4 rawA_1 = l0(0.0, -1.0);
	const V4 rawA_2 = l0(1.0, -1.0);
	const V4 rawA_3 = l0(-1.0, 0.0);
	const V4 rawA_4 = l0(0.0, 0.0);
	const V4 rawA_5 = l0(1.0, 0.0);
	const V4 rawA_6 = l0(-1.0, 1.0);
	const V4 rawA_7 = l0(0.0, 1.0);
	const V4 rawA_8 = l0(1.0, 1.0);

	// Raw 3x3 samples of t3, same tap order.
	const V4 rawB_0 = l1(-1.0, -1.0);
	const V4 rawB_1 = l1(0.0, -1.0);
	const V4 rawB_2 = l1(1.0, -1.0);
	const V4 rawB_3 = l1(-1.0, 0.0);
	const V4 rawB_4 = l1(0.0, 0.0);
	const V4 rawB_5 = l1(1.0, 0.0);
	const V4 rawB_6 = l1(-1.0, 1.0);
	const V4 rawB_7 = l1(0.0, 1.0);
	const V4 rawB_8 = l1(1.0, 1.0);

	// Non-negative half of each t2 sample: max(x, 0).
	const V4 posA_0 = max(rawA_0, 0.0);
	const V4 posA_1 = max(rawA_1, 0.0);
	const V4 posA_2 = max(rawA_2, 0.0);
	const V4 posA_3 = max(rawA_3, 0.0);
	const V4 posA_4 = max(rawA_4, 0.0);
	const V4 posA_5 = max(rawA_5, 0.0);
	const V4 posA_6 = max(rawA_6, 0.0);
	const V4 posA_7 = max(rawA_7, 0.0);
	const V4 posA_8 = max(rawA_8, 0.0);

	// Non-positive half of each t2 sample: -max(-x, 0).
	const V4 negA_0 = -max(-rawA_0, 0.0);
	const V4 negA_1 = -max(-rawA_1, 0.0);
	const V4 negA_2 = -max(-rawA_2, 0.0);
	const V4 negA_3 = -max(-rawA_3, 0.0);
	const V4 negA_4 = -max(-rawA_4, 0.0);
	const V4 negA_5 = -max(-rawA_5, 0.0);
	const V4 negA_6 = -max(-rawA_6, 0.0);
	const V4 negA_7 = -max(-rawA_7, 0.0);
	const V4 negA_8 = -max(-rawA_8, 0.0);

	// Non-negative half of each t3 sample.
	const V4 posB_0 = max(rawB_0, 0.0);
	const V4 posB_1 = max(rawB_1, 0.0);
	const V4 posB_2 = max(rawB_2, 0.0);
	const V4 posB_3 = max(rawB_3, 0.0);
	const V4 posB_4 = max(rawB_4, 0.0);
	const V4 posB_5 = max(rawB_5, 0.0);
	const V4 posB_6 = max(rawB_6, 0.0);
	const V4 posB_7 = max(rawB_7, 0.0);
	const V4 posB_8 = max(rawB_8, 0.0);

	// Non-positive half of each t3 sample.
	const V4 negB_0 = -max(-rawB_0, 0.0);
	const V4 negB_1 = -max(-rawB_1, 0.0);
	const V4 negB_2 = -max(-rawB_2, 0.0);
	const V4 negB_3 = -max(-rawB_3, 0.0);
	const V4 negB_4 = -max(-rawB_4, 0.0);
	const V4 negB_5 = -max(-rawB_5, 0.0);
	const V4 negB_6 = -max(-rawB_6, 0.0);
	const V4 negB_7 = -max(-rawB_7, 0.0);
	const V4 negB_8 = -max(-rawB_8, 0.0);

	// Argument order expected by f0/f1: t2 positive, t2 negative,
	// t3 positive, t3 negative (9 taps each).
	t0[gxy] = f0(
		posA_0, posA_1, posA_2, posA_3, posA_4, posA_5, posA_6, posA_7, posA_8,
		negA_0, negA_1, negA_2, negA_3, negA_4, negA_5, negA_6, negA_7, negA_8,
		posB_0, posB_1, posB_2, posB_3, posB_4, posB_5, posB_6, posB_7, posB_8,
		negB_0, negB_1, negB_2, negB_3, negB_4, negB_5, negB_6, negB_7, negB_8);
	t1[gxy] = f1(
		posA_0, posA_1, posA_2, posA_3, posA_4, posA_5, posA_6, posA_7, posA_8,
		negA_0, negA_1, negA_2, negA_3, negA_4, negA_5, negA_6, negA_7, negA_8,
		posB_0, posB_1, posB_2, posB_3, posB_4, posB_5, posB_6, posB_7, posB_8,
		negB_0, negB_1, negB_2, negB_3, negB_4, negB_5, negB_6, negB_7, negB_8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// First of the two V4 outputs of pass 4 (DESC: conv3); Pass4 stores it in t2.
//
// Inputs, as supplied by Pass4:
//   s0_k = max(x, 0)   and s1_k = -max(-x, 0)  where x = l0(...) (texture t0)
//   s2_k = max(x, 0)   and s3_k = -max(-x, 0)  where x = l1(...) (texture t1)
//   k = 0..8 enumerates the 3x3 offsets (-1,-1), (0,-1), (1,-1), (-1,0), (0,0),
//   (1,0), (-1,1), (0,1), (1,1) passed to l0/l1.
//
// Each input is transformed with mul() by its own 16-constant M4 and the
// results are summed; a constant V4 is added last. V4 and M4 are defined
// outside this part of the file.
// NOTE(review): the constants look like generated (trained) weights and a bias;
// presumably they should not be edited by hand - confirm with the generator.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: non-negative half of the l0 samples.
r += mul(s0_0, M4(8.947e-02, -1.234e-01, -3.169e-02, -9.158e-02, -1.406e-01, 6.941e-02, -1.367e-02, -1.406e-02, 9.073e-02, 5.642e-01, -2.007e-02, 9.725e-02, 7.122e-03, -1.956e-03, 6.532e-03, -5.457e-02));
r += mul(s0_1, M4(-1.130e-01, -4.645e-02, 3.624e-02, 3.391e-02, 3.882e-01, 2.453e-01, -2.237e-01, -2.271e-01, 2.803e-01, 1.718e-01, 3.255e-02, -2.046e-01, 1.441e-01, -1.880e-03, 2.335e-02, -1.232e-01));
r += mul(s0_2, M4(2.016e-01, 1.243e-01, -3.895e-02, -1.135e-01, -2.167e-02, 1.465e-02, -7.776e-02, -1.213e-01, -7.195e-03, 4.404e-03, 6.598e-02, -5.135e-02, -2.062e-01, -3.725e-02, -8.296e-03, 8.739e-03));
r += mul(s0_3, M4(-2.068e-02, -3.876e-02, 5.737e-02, 9.886e-02, -9.663e-02, -2.569e-01, 6.761e-02, -1.454e-01, 4.660e-02, 7.810e-01, -2.254e-01, 1.899e-01, -9.628e-02, 8.080e-02, -1.093e-02, 1.451e-02));
r += mul(s0_4, M4(3.133e-01, 2.759e-01, -9.917e-02, -3.134e-01, 1.137e-01, -5.446e-01, -2.044e-03, -5.215e-01, -6.867e-02, 5.254e-01, -1.466e-01, -3.048e-01, 3.408e-01, 5.791e-01, -2.594e-01, -4.879e-04));
r += mul(s0_5, M4(6.871e-02, -1.221e-01, -5.702e-02, -2.731e-02, 6.025e-01, 1.350e-01, -3.119e-01, -4.130e-01, 2.091e-01, 1.003e-01, 4.509e-02, -1.541e-01, 1.151e-01, -1.558e-01, 6.309e-03, -2.192e-01));
r += mul(s0_6, M4(2.139e-02, 1.540e-02, -9.451e-02, 8.898e-02, 1.983e-02, -1.259e-01, 2.162e-01, -9.477e-02, -2.253e-01, -1.456e-01, -2.432e-02, 9.649e-02, 2.147e-02, -9.523e-02, 2.042e-02, -7.790e-02));
r += mul(s0_7, M4(-3.105e-03, 1.944e-01, -1.808e-01, -3.058e-02, 4.007e-01, 5.645e-01, -2.452e-01, -7.366e-02, 1.279e-02, 3.212e-02, -1.573e-01, -1.267e-01, 1.613e-02, -1.976e-01, -1.519e-01, -2.687e-02));
r += mul(s0_8, M4(-1.906e-04, 8.306e-02, 2.480e-02, 1.696e-02, 1.275e-01, 1.372e-01, 1.205e-01, 1.120e-02, 1.424e-02, -1.526e-01, -6.629e-02, -9.104e-02, 2.042e-02, -1.167e-01, 1.050e-01, 1.560e-02));
// s1_*: non-positive half of the l0 samples.
r += mul(s1_0, M4(-2.398e-02, -1.009e-01, 2.671e-02, -8.841e-02, -7.277e-03, -4.411e-02, -1.240e-02, -5.367e-04, -1.223e-01, -7.251e-02, 4.941e-02, 7.545e-02, 6.688e-02, 1.727e-02, -1.144e-02, -7.713e-02));
r += mul(s1_1, M4(-1.507e-01, -3.095e-01, 5.017e-02, -1.145e-01, 3.430e-02, -2.241e-01, -9.050e-02, -8.470e-02, -8.624e-02, -1.021e-02, -1.620e-02, 3.932e-03, 7.775e-02, -2.376e-02, 6.270e-02, -7.896e-02));
r += mul(s1_2, M4(3.578e-02, -3.242e-02, 9.400e-03, -2.998e-02, -1.545e-02, -1.481e-01, -6.667e-02, 3.496e-02, 6.722e-02, 7.676e-04, -8.215e-04, 2.142e-03, 4.007e-02, 9.690e-02, -1.652e-03, 3.858e-02));
r += mul(s1_3, M4(6.321e-02, -1.472e-01, 6.571e-02, -1.929e-01, -7.340e-02, -8.067e-02, 1.715e-02, 2.182e-02, -8.623e-02, -2.195e-01, -6.101e-02, 8.246e-02, -4.908e-02, -3.293e-02, -7.341e-02, -1.941e-01));
r += mul(s1_4, M4(5.609e-01, 5.581e-01, -1.143e-01, -1.052e-01, 2.477e-01, 2.387e-01, 1.272e-01, 3.284e-03, -3.135e-01, 8.385e-02, -7.393e-02, -2.270e-01, 4.403e-01, -1.179e-01, -1.620e-01, 2.978e-01));
r += mul(s1_5, M4(-3.015e-02, 1.055e-01, 1.072e-01, 1.177e-01, 3.838e-01, 3.206e-02, -4.556e-03, -5.072e-02, 4.250e-02, -1.665e-02, -1.759e-02, 2.822e-02, -2.408e-01, -2.204e-02, -3.440e-02, 6.520e-02));
r += mul(s1_6, M4(9.180e-04, 3.395e-02, -1.211e-02, -5.605e-03, -7.356e-03, -2.439e-02, -2.498e-02, -6.361e-04, -5.167e-02, -1.009e-02, 7.202e-02, 3.652e-02, 3.036e-03, -7.672e-03, -2.822e-02, -9.942e-02));
r += mul(s1_7, M4(-7.041e-02, -2.366e-01, -1.556e-01, 1.499e-01, -2.674e-02, 6.601e-03, -1.490e-01, 1.329e-02, -1.127e-01, 8.363e-03, -1.333e-01, 1.038e-02, -1.219e-02, -1.366e-01, 8.814e-02, 4.260e-03));
r += mul(s1_8, M4(-1.397e-02, 2.863e-02, 5.459e-03, -1.166e-02, -1.201e-02, 1.346e-01, 5.461e-02, 1.584e-02, -8.155e-02, 8.451e-03, -3.444e-02, 3.920e-02, 2.082e-02, -4.174e-02, 6.205e-02, 5.646e-02));
// s2_*: non-negative half of the l1 samples.
r += mul(s2_0, M4(5.465e-02, 7.303e-02, 1.200e-01, 8.938e-03, -8.960e-02, -2.248e-01, -1.073e-02, 6.882e-02, 4.637e-02, -1.215e-01, -2.319e-02, -2.049e-01, -8.235e-02, -2.689e-02, 8.521e-02, 2.612e-02));
r += mul(s2_1, M4(-1.284e-01, -8.509e-02, 6.859e-02, 2.538e-02, -7.401e-02, 2.860e-01, -2.240e-01, 1.754e-01, -2.073e-01, -9.333e-02, -9.310e-02, -3.311e-01, 2.251e-01, 1.948e-01, -1.091e-01, 2.448e-02));
r += mul(s2_2, M4(4.550e-03, 2.884e-02, -1.023e-02, -1.793e-02, 1.472e-01, 1.728e-02, -5.533e-02, -4.606e-02, -1.128e-01, 1.845e-01, -9.297e-02, 7.245e-02, 2.303e-02, -1.293e-01, -2.277e-02, -1.523e-02));
r += mul(s2_3, M4(5.703e-02, 4.629e-03, -7.495e-02, -7.220e-02, -1.245e-01, 1.142e-01, -1.688e-03, -9.906e-03, 9.714e-02, -2.851e-02, 7.069e-03, -3.250e-01, -5.029e-03, -1.421e-01, -4.162e-02, 1.032e-01));
r += mul(s2_4, M4(5.200e-02, -3.414e-02, -3.809e-02, -9.742e-02, 8.686e-01, 1.140e+00, 2.062e-01, 8.598e-02, 4.073e-01, -3.313e-01, 2.673e-01, 1.050e-01, -9.355e-02, 1.764e-01, 8.423e-02, 1.156e-01));
r += mul(s2_5, M4(-5.260e-03, 8.804e-02, 3.636e-02, 3.074e-03, 1.724e-01, 2.433e-01, -1.126e-02, -2.652e-01, -1.229e-01, 3.135e-02, 1.187e-02, -6.661e-02, -1.872e-02, -6.508e-02, -7.109e-02, 1.141e-01));
r += mul(s2_6, M4(6.180e-03, 2.059e-03, -1.768e-02, 4.877e-03, -7.838e-02, 1.366e-01, -7.231e-02, -2.826e-02, 6.251e-02, 7.375e-02, 2.531e-02, 2.038e-02, -4.462e-03, -4.896e-02, -4.376e-02, -7.998e-03));
r += mul(s2_7, M4(1.011e-01, 8.753e-02, -5.554e-02, 6.949e-04, 4.137e-02, 2.710e-01, -3.203e-01, 6.752e-02, 9.720e-02, 3.447e-02, -5.777e-02, -1.723e-02, -9.154e-03, 5.461e-02, 1.248e-01, -3.906e-04));
r += mul(s2_8, M4(4.126e-02, 3.442e-02, 9.763e-03, -4.560e-02, -4.233e-04, -1.519e-01, 2.421e-02, -4.043e-02, -1.281e-02, 1.166e-02, 2.489e-04, -3.061e-02, -4.476e-02, 4.493e-03, -4.164e-02, 9.694e-03));
// s3_*: non-positive half of the l1 samples.
r += mul(s3_0, M4(-1.352e-01, -1.938e-01, 7.285e-02, -4.706e-02, 1.920e-02, 1.891e-02, 1.233e-02, 3.876e-02, 1.342e-02, 2.020e-01, 3.292e-02, 2.778e-02, -5.017e-02, 3.560e-02, 7.028e-02, 7.562e-03));
r += mul(s3_1, M4(3.014e-01, 1.243e-01, -2.656e-02, -9.796e-02, 1.585e-01, 2.259e-01, -6.651e-02, 4.080e-02, 1.902e-01, 2.705e-01, -9.774e-02, -1.144e-02, -4.653e-01, -3.536e-01, 2.515e-02, 9.628e-02));
r += mul(s3_2, M4(-7.724e-02, 1.181e-01, 2.182e-02, 1.999e-02, -7.114e-02, -4.414e-02, -5.748e-06, -8.931e-03, 4.985e-03, 6.360e-02, 4.422e-02, 6.005e-02, 1.335e-01, -8.144e-03, -3.979e-02, 6.952e-03));
r += mul(s3_3, M4(-1.826e-03, 2.390e-02, 4.665e-03, -3.357e-02, 2.088e-02, 1.436e-01, -2.474e-02, 1.100e-02, 2.727e-02, -1.649e-02, -9.539e-02, -1.112e-01, -2.427e-02, 1.811e-01, -4.267e-02, 1.060e-01));
r += mul(s3_4, M4(9.873e-02, -1.417e-01, -1.365e-01, -3.187e-01, -7.583e-03, 3.047e-01, -2.480e-02, 2.623e-01, -3.193e-01, -1.539e-01, -2.986e-01, 2.350e-01, 4.367e-01, 2.441e-01, -3.426e-01, -5.108e-02));
r += mul(s3_5, M4(-8.384e-02, 1.343e-01, 1.653e-01, 7.978e-02, 6.329e-02, 7.040e-02, 2.203e-02, -2.280e-01, 2.531e-02, -9.408e-02, -5.137e-02, -1.717e-01, -1.577e-01, 4.030e-02, -2.802e-01, 1.155e-01));
r += mul(s3_6, M4(1.895e-02, 1.436e-01, 3.568e-04, -6.075e-02, 3.213e-02, -3.462e-02, -2.821e-02, -8.374e-03, 3.451e-02, -2.349e-02, 9.517e-03, 2.092e-02, -8.229e-02, 6.530e-02, 6.116e-03, -2.414e-02));
r += mul(s3_7, M4(1.530e-01, 2.073e-01, -7.258e-02, -6.975e-02, -6.610e-03, -3.885e-02, -5.636e-03, 1.227e-01, 8.913e-02, 4.336e-02, -1.931e-03, -3.869e-02, -2.019e-02, -1.340e-02, -1.506e-02, 1.591e-02));
r += mul(s3_8, M4(2.536e-02, -3.220e-02, 6.413e-02, -1.835e-02, -9.124e-02, -8.098e-02, -5.479e-02, -1.361e-02, -3.146e-03, 1.204e-01, -4.020e-02, -6.924e-02, -1.030e-01, -1.301e-01, 1.634e-02, 1.029e-01));
// Constant per-channel offset added after all 36 products.
r += V4(8.385e-03, 1.035e-02, -6.465e-04, -6.502e-03);
return r;
}
// Second of the two V4 outputs of pass 4 (DESC: conv3); Pass4 stores it in t3.
//
// Takes the same 36 inputs as the pass-4 f0 (Pass4 passes identical arguments):
//   s0_k / s1_k = max(x, 0) / -max(-x, 0) of the l0 samples (texture t0)
//   s2_k / s3_k = max(x, 0) / -max(-x, 0) of the l1 samples (texture t1)
//   k = 0..8 enumerates the 3x3 offsets from (-1,-1) to (1,1), row by row.
//
// Each input is transformed with mul() by its own 16-constant M4, the products
// are accumulated in order, and a constant V4 is added last.
// NOTE(review): the constants look like generated (trained) weights and a bias;
// presumably they should not be edited by hand - confirm with the generator.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: non-negative half of the l0 samples.
r += mul(s0_0, M4(-5.222e-02, -3.069e-02, 2.456e-03, -1.117e-02, 4.933e-02, 5.166e-02, -6.284e-03, -9.151e-02, 1.439e-02, 1.755e-02, -8.848e-02, -9.796e-02, -2.835e-02, 3.699e-02, 2.912e-02, 4.373e-02));
r += mul(s0_1, M4(6.333e-03, -2.767e-02, -7.247e-02, 8.441e-02, -3.433e-02, -4.699e-02, -1.193e-02, -1.729e-01, 2.481e-02, -4.121e-02, -2.861e-01, -1.202e-02, 2.687e-02, -1.313e-01, 1.747e-02, -9.108e-02));
r += mul(s0_2, M4(1.309e-02, -1.968e-02, -1.246e-01, -3.915e-02, -1.159e-01, -6.491e-03, 3.316e-01, -6.851e-02, -2.940e-02, -1.787e-02, -5.850e-03, -6.207e-02, 5.272e-02, 9.800e-02, 4.709e-02, 7.491e-02));
r += mul(s0_3, M4(-1.127e-01, -3.748e-02, -1.091e-01, 1.788e-01, -7.982e-02, -7.528e-02, 1.898e-01, -1.355e-01, -1.568e-01, 9.648e-02, 2.337e-01, -9.666e-02, -7.316e-02, 2.915e-02, 2.259e-02, -1.310e-02));
r += mul(s0_4, M4(1.689e-02, -1.028e-01, 1.304e-01, -6.012e-02, -8.030e-02, -1.823e-01, 4.179e-01, -3.553e-01, 9.095e-04, 9.972e-02, 3.227e-01, -4.967e-02, -2.329e-01, 1.272e-01, 4.332e-01, -8.456e-01));
r += mul(s0_5, M4(1.815e-02, -5.743e-02, 7.236e-02, -8.782e-02, 1.161e-01, 2.258e-01, 7.053e-01, -2.993e-01, 6.605e-02, -2.666e-03, -4.733e-02, -1.087e-01, -1.101e-01, 1.554e-01, 1.656e-01, 2.530e-01));
r += mul(s0_6, M4(-7.750e-02, -6.619e-02, 2.202e-02, 4.186e-02, -1.519e-01, -8.918e-03, -1.919e-01, -7.085e-02, -1.356e-01, -1.363e-01, 1.782e-01, -1.499e-02, 9.670e-02, 1.450e-03, 5.675e-02, -3.337e-02));
r += mul(s0_7, M4(-9.267e-02, 1.661e-01, 1.306e-01, -2.387e-01, -2.261e-02, 2.870e-01, -2.711e-01, 6.281e-02, 2.181e-02, 1.010e-01, 2.979e-01, -9.254e-02, 1.307e-01, -2.024e-02, 2.013e-01, -1.862e-02));
r += mul(s0_8, M4(-7.233e-02, 8.276e-02, 1.279e-02, -3.778e-02, -3.737e-01, -2.422e-01, -1.352e-01, -1.631e-01, 6.518e-02, 2.511e-01, 1.588e-01, -3.599e-02, 8.821e-02, 3.757e-02, -1.340e-01, 1.006e-01));
// s1_*: non-positive half of the l0 samples.
r += mul(s1_0, M4(1.034e-02, 8.194e-02, 9.844e-02, -1.052e-01, 4.683e-03, 4.432e-03, 8.420e-03, 7.511e-03, 7.210e-02, -8.697e-03, -9.834e-02, 1.366e-01, 3.221e-04, 1.836e-02, 1.307e-02, -6.823e-02));
r += mul(s1_1, M4(-7.232e-02, 1.103e-01, 2.975e-01, 4.747e-02, -1.075e-01, -6.863e-02, 2.378e-01, -2.994e-02, 6.426e-02, 2.459e-02, -1.361e-01, 4.394e-02, 4.558e-02, -5.684e-02, -3.386e-02, 8.075e-02));
r += mul(s1_2, M4(-1.568e-02, 6.463e-02, 4.001e-02, 3.549e-02, -3.385e-02, -1.547e-02, 2.510e-01, 3.198e-02, 2.533e-02, -6.612e-02, -5.453e-02, 1.387e-03, 3.071e-02, -5.115e-03, -9.345e-02, 1.790e-02));
r += mul(s1_3, M4(1.723e-01, 2.119e-02, -3.394e-01, -1.101e-01, 7.882e-03, -4.188e-02, -6.882e-02, 5.060e-02, 4.902e-02, 2.919e-02, 7.773e-02, 1.080e-01, 8.944e-02, -2.819e-02, -1.252e-02, -2.744e-01));
r += mul(s1_4, M4(2.682e-01, 8.840e-03, -3.974e-01, 2.436e-01, 1.156e-02, 3.806e-04, -5.090e-01, -1.339e-02, 1.677e-02, -1.337e-01, -1.050e-01, 2.647e-01, -1.971e-01, -1.145e-02, 1.471e-01, -7.814e-02));
r += mul(s1_5, M4(-5.376e-02, 2.321e-02, -1.908e-01, -1.538e-01, 5.032e-03, 2.979e-02, -3.934e-02, -1.754e-01, 3.674e-02, 8.713e-03, -7.429e-02, -2.768e-03, -1.878e-01, -1.382e-01, 1.114e-01, 4.843e-02));
r += mul(s1_6, M4(4.390e-03, 1.082e-02, 6.300e-03, -2.220e-02, -1.578e-02, -3.883e-02, 6.290e-02, 5.752e-03, 9.478e-02, 5.108e-03, 6.174e-02, 8.270e-02, -5.128e-02, -3.664e-02, 3.095e-02, -1.575e-01));
r += mul(s1_7, M4(2.131e-01, 8.669e-03, 8.288e-02, 1.767e-01, -8.764e-02, -6.440e-03, 1.179e-01, -9.407e-02, -1.114e-01, -1.384e-01, 7.349e-02, 2.379e-02, 6.264e-02, -6.347e-02, -1.973e-01, 3.150e-02));
r += mul(s1_8, M4(6.920e-02, 2.737e-01, 5.444e-02, -1.065e-01, -8.435e-02, 1.268e-01, -7.219e-03, -4.022e-02, -3.687e-02, -3.873e-02, 5.773e-02, 1.171e-02, 5.552e-02, -2.870e-02, -4.903e-02, 2.162e-02));
// s2_*: non-negative half of the l1 samples.
r += mul(s2_0, M4(-6.811e-02, 3.915e-02, -1.970e-02, 5.496e-02, -3.225e-02, -5.284e-02, -3.737e-03, -1.864e-03, -1.361e-01, -7.308e-02, -4.948e-02, -1.634e-01, 5.283e-02, 1.746e-02, -8.374e-02, 7.123e-02));
r += mul(s2_1, M4(4.868e-03, 7.851e-02, 1.067e-01, 5.576e-02, 1.276e-01, -7.837e-02, -2.875e-01, 3.754e-02, -1.315e-01, -9.095e-02, 8.041e-02, -1.156e-01, 1.309e-02, 1.086e-01, -1.335e-01, 9.059e-02));
r += mul(s2_2, M4(-1.092e-02, 1.501e-01, -3.542e-02, 2.500e-02, 1.500e-02, -1.832e-01, -3.447e-01, -2.562e-02, -1.110e-01, 1.362e-01, 1.634e-01, -5.146e-02, -1.184e-02, -1.154e-01, 4.862e-02, 1.344e-03));
r += mul(s2_3, M4(3.103e-02, -2.009e-02, 2.266e-02, 5.094e-02, 5.909e-01, 1.844e-01, -3.418e-02, -1.460e-01, 1.218e-02, -3.631e-02, -2.582e-01, -2.230e-01, 9.666e-02, -6.432e-02, 7.267e-02, 7.577e-02));
r += mul(s2_4, M4(8.062e-02, -3.981e-02, -3.232e-02, -1.032e-01, -9.859e-02, 6.539e-01, 5.533e-01, -1.046e-02, -5.348e-01, 1.009e-02, -3.879e-01, 1.190e-01, -1.151e-01, 1.835e-01, -7.797e-02, 1.418e-01));
r += mul(s2_5, M4(-1.404e-02, -1.730e-01, -4.516e-02, -2.158e-02, 2.544e-01, 4.463e-01, 1.404e-01, -6.854e-02, -9.712e-02, -4.920e-01, -2.485e-02, -6.416e-02, 3.612e-02, 2.451e-01, 2.327e-02, -1.251e-03));
r += mul(s2_6, M4(6.507e-02, -2.267e-02, -7.660e-02, 3.043e-02, 3.541e-01, 2.804e-01, 2.783e-01, -2.580e-01, -1.185e-01, 8.028e-02, -1.395e-01, -4.988e-03, 4.702e-02, -5.327e-02, 4.580e-02, 3.130e-03));
r += mul(s2_7, M4(9.806e-02, 6.990e-02, -4.317e-02, -2.415e-02, -2.263e-01, -1.723e-01, 2.669e-02, -3.393e-01, 9.368e-02, -6.775e-02, -1.883e-01, -8.601e-02, -2.278e-01, 1.612e-01, 1.625e-01, 8.821e-02));
r += mul(s2_8, M4(-1.921e-02, 1.119e-01, 3.717e-02, -2.554e-02, 2.852e-02, 8.987e-02, 1.246e-01, 6.463e-03, 2.548e-02, -2.950e-02, 7.289e-02, 1.802e-02, 2.576e-02, 5.798e-02, 6.021e-02, -5.030e-03));
// s3_*: non-positive half of the l1 samples.
r += mul(s3_0, M4(-1.023e-01, -3.759e-02, -2.437e-02, 1.032e-01, -2.143e-02, -4.189e-02, -6.139e-02, 9.887e-02, -9.094e-03, 3.087e-02, -1.056e-01, 1.376e-01, 1.702e-02, 3.138e-02, -1.243e-01, -5.115e-02));
r += mul(s3_1, M4(3.439e-02, -1.018e-01, -3.260e-01, 6.226e-02, 3.794e-02, -6.747e-02, -1.743e-01, -9.149e-02, 6.116e-02, -3.539e-02, -3.971e-01, -2.458e-02, -1.436e-01, 4.323e-02, 5.595e-01, 1.160e-01));
r += mul(s3_2, M4(-7.596e-02, -9.502e-02, -1.112e-02, -7.256e-02, -1.625e-02, -1.013e-01, -7.450e-02, 2.969e-03, -1.481e-02, -1.199e-01, -8.230e-02, 2.952e-02, -3.199e-02, 8.852e-02, -1.541e-02, 1.722e-02));
r += mul(s3_3, M4(2.768e-03, -9.600e-02, 1.333e-01, -1.174e-01, -7.190e-02, 1.265e-02, 8.135e-02, -6.909e-03, 9.249e-02, -2.800e-02, 2.029e-01, -1.212e-02, 9.955e-02, -2.791e-02, -1.172e-01, 2.079e-01));
r += mul(s3_4, M4(-1.948e-01, -1.936e-01, 5.127e-01, -7.970e-02, -1.135e-01, 1.060e-01, 1.226e-01, -3.195e-01, -4.980e-01, -5.665e-03, 3.167e-01, -2.413e-01, 2.036e-01, 1.519e-01, 7.793e-04, -1.316e-01));
r += mul(s3_5, M4(-8.284e-02, -1.590e-01, 5.041e-03, -2.936e-02, 1.485e-01, 8.341e-02, -3.804e-02, 3.576e-02, 1.499e-01, -8.989e-02, 7.085e-02, -4.898e-02, 1.070e-01, 5.825e-02, 1.863e-01, -9.850e-03));
r += mul(s3_6, M4(-3.057e-01, 2.794e-02, -7.737e-02, -4.168e-02, 2.696e-02, 1.279e-02, 2.638e-02, 8.177e-02, 1.217e-01, 2.531e-02, -1.188e-01, 1.018e-01, -5.486e-02, -6.606e-03, 1.868e-01, -1.050e-01));
r += mul(s3_7, M4(-3.018e-01, -1.795e-01, -1.578e-01, -1.809e-01, 1.241e-01, -4.960e-02, -1.067e-01, -1.004e-02, -8.835e-02, 6.620e-02, 1.309e-01, -1.399e-01, 4.651e-02, 4.837e-02, -9.106e-02, 1.670e-01));
r += mul(s3_8, M4(1.081e-02, -9.947e-02, 1.643e-02, -2.769e-02, 9.803e-02, -8.389e-02, -2.782e-02, -2.689e-02, 3.693e-02, -3.436e-03, 1.229e-02, -2.929e-02, -1.751e-01, -5.859e-03, 1.543e-01, 8.225e-02));
// Constant per-channel offset added after all 36 products.
r += V4(-2.994e-04, -3.163e-05, 4.528e-03, -1.285e-02);
return r;
}
// Entry point of pass 4. Each thread handles one texel: it gathers the 3x3
// neighbourhood of both inputs, splits every tap into its positive and
// negative part, and writes the two f0()/f1() results to t2 and t3.
void Pass4(uint2 blockStart, uint3 tid) {
	// Texel handled by this thread; threads that fall outside the input do nothing.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		return;
	}

	// NOTE(review): l0()/l1() are defined above this function (outside this
	// section) and presumably read `pos` and `pt` by name - keep these identifiers.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// 3x3 taps of the first input, row-major from (-1,-1) to (1,1).
	const V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// s0_* keeps the positive part of each tap, s1_* the negative part.
	const V4 s0_0 = max(a0, 0.0), s1_0 = -max(-a0, 0.0);
	const V4 s0_1 = max(a1, 0.0), s1_1 = -max(-a1, 0.0);
	const V4 s0_2 = max(a2, 0.0), s1_2 = -max(-a2, 0.0);
	const V4 s0_3 = max(a3, 0.0), s1_3 = -max(-a3, 0.0);
	const V4 s0_4 = max(a4, 0.0), s1_4 = -max(-a4, 0.0);
	const V4 s0_5 = max(a5, 0.0), s1_5 = -max(-a5, 0.0);
	const V4 s0_6 = max(a6, 0.0), s1_6 = -max(-a6, 0.0);
	const V4 s0_7 = max(a7, 0.0), s1_7 = -max(-a7, 0.0);
	const V4 s0_8 = max(a8, 0.0), s1_8 = -max(-a8, 0.0);

	// Same 3x3 taps for the second input.
	const V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// s2_* keeps the positive part of each tap, s3_* the negative part.
	const V4 s2_0 = max(b0, 0.0), s3_0 = -max(-b0, 0.0);
	const V4 s2_1 = max(b1, 0.0), s3_1 = -max(-b1, 0.0);
	const V4 s2_2 = max(b2, 0.0), s3_2 = -max(-b2, 0.0);
	const V4 s2_3 = max(b3, 0.0), s3_3 = -max(-b3, 0.0);
	const V4 s2_4 = max(b4, 0.0), s3_4 = -max(-b4, 0.0);
	const V4 s2_5 = max(b5, 0.0), s3_5 = -max(-b5, 0.0);
	const V4 s2_6 = max(b6, 0.0), s3_6 = -max(-b6, 0.0);
	const V4 s2_7 = max(b7, 0.0), s3_7 = -max(-b7, 0.0);
	const V4 s2_8 = max(b8, 0.0), s3_8 = -max(-b8, 0.0);

	// Both outputs consume the same 36 vectors with different weights.
	t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1
// Tap helpers for pass 5: fetch t2 (l0) or t3 (l1) at the texel offset (x, y)
// through O() and convert the result to V4.
// NOTE(review): O() is defined outside this section; presumably it samples
// relative to the caller's `pos`/`pt` locals - confirm against the COMMON block.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
// Pass 5 ("conv4"), first output: Pass5 stores the returned value in t0.
// The 36 inputs are named s<g>_<k>:
//   g = 0 / 1: positive / negative part of the t2 taps,
//   g = 2 / 3: positive / negative part of the t3 taps (see Pass5),
//   k = 0..8 : position in the 3x3 neighbourhood, row-major from (-1,-1) to (1,1).
// Every input is multiplied, as a row vector, by its own constant 4x4 matrix;
// the products are accumulated in the order written and a constant 4-vector
// is added last. No activation is applied to the returned sum.
// The literals are presumably trained network weights - treat them as data.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: positive part of the t2 taps.
r += mul(s0_0, M4(5.681e-02, -7.933e-02, -1.161e-02, -3.257e-02, -1.507e-02, 2.248e-02, -1.351e-02, 2.789e-02, -1.713e-01, 9.482e-02, 2.715e-02, 9.506e-02, 1.714e-01, -1.090e-01, -7.237e-02, -1.563e-01));
r += mul(s0_1, M4(-9.622e-03, -7.774e-04, -4.095e-02, 1.106e-02, -3.592e-02, -4.358e-02, 1.983e-02, -1.134e-02, -1.313e-02, -1.086e-01, 1.102e-01, -3.091e-01, 1.982e-01, 1.438e-01, -6.038e-02, 9.579e-02));
r += mul(s0_2, M4(-3.893e-02, 1.554e-02, -7.763e-05, 1.610e-02, 3.470e-03, 9.915e-03, -9.881e-03, 5.331e-02, -9.152e-02, 6.899e-02, -3.615e-02, 1.558e-01, -3.300e-02, 4.493e-02, 2.148e-02, -3.677e-02));
r += mul(s0_3, M4(1.939e-01, -7.700e-02, -1.449e-01, -1.942e-02, 9.649e-02, -3.580e-03, -1.767e-02, 2.394e-02, -1.299e-01, 1.160e-01, 8.000e-02, 9.737e-02, 2.751e-01, -4.435e-01, 1.013e-01, -1.782e-01));
r += mul(s0_4, M4(-2.745e-01, 2.922e-01, -2.008e-01, 1.636e-01, -4.843e-02, 4.172e-01, 3.097e-02, 3.326e-01, -1.798e-02, -3.860e-01, 3.246e-02, 4.225e-01, -1.057e-01, 2.302e-01, -7.879e-02, 4.832e-02));
r += mul(s0_5, M4(-6.834e-04, -3.372e-02, -9.351e-02, 1.547e-02, 5.621e-02, -1.195e-02, -9.402e-03, 6.439e-02, 8.787e-02, 1.499e-02, 1.928e-01, 6.693e-02, 6.516e-02, -1.145e-01, -6.610e-02, 3.986e-02));
r += mul(s0_6, M4(7.682e-02, -9.222e-02, 1.566e-01, -1.438e-02, 5.080e-02, -2.762e-02, -3.121e-02, -1.242e-02, 2.046e-02, -1.131e-02, 4.555e-02, -3.006e-02, 1.125e-01, -7.883e-02, 1.063e-01, 3.027e-03));
r += mul(s0_7, M4(-1.395e-01, 4.847e-02, 1.605e-01, 1.363e-01, 6.243e-02, -1.464e-02, 3.336e-02, -8.862e-02, 3.286e-02, -2.398e-02, -2.326e-02, -8.408e-02, 1.274e-01, -4.997e-02, 1.548e-01, -8.650e-02));
r += mul(s0_8, M4(4.236e-02, 3.116e-02, 7.690e-02, 3.084e-02, 6.290e-03, 1.016e-02, 7.155e-02, -9.786e-02, -1.453e-02, -4.564e-04, -3.654e-02, 7.179e-03, -2.110e-02, -2.766e-02, 1.022e-01, -6.664e-02));
// s1_*: negative part of the t2 taps.
r += mul(s1_0, M4(-2.814e-02, 6.473e-02, 5.209e-02, 6.202e-02, -1.898e-02, 6.061e-02, -1.557e-02, 3.561e-02, 2.137e-01, -1.913e-01, 2.387e-03, -1.470e-01, 4.553e-02, -3.358e-02, 1.936e-03, -4.798e-02));
r += mul(s1_1, M4(4.947e-03, -8.431e-02, -3.362e-03, -1.057e-01, -6.735e-02, 8.463e-03, -4.622e-02, -2.022e-02, -1.450e-01, -1.687e-03, -1.541e-02, -1.116e-02, 4.447e-02, 5.088e-02, -7.198e-03, 3.279e-02));
r += mul(s1_2, M4(1.202e-03, -2.591e-02, -5.357e-03, -3.844e-02, -7.403e-03, 3.771e-02, -6.171e-02, 8.820e-02, 6.744e-03, -4.156e-02, -1.377e-02, 9.398e-02, -2.643e-02, 4.991e-02, -2.000e-02, 1.056e-02));
r += mul(s1_3, M4(3.923e-01, 3.525e-02, -1.294e-01, 1.478e-02, 9.667e-02, 1.289e-01, 8.960e-02, 1.946e-02, 3.128e-01, -3.315e-01, -3.019e-01, 1.021e-01, 2.095e-01, -1.488e-01, -9.439e-02, -9.635e-02));
r += mul(s1_4, M4(-3.641e-01, -9.985e-02, -3.482e-01, -2.646e-01, -5.257e-01, 9.475e-01, 1.714e-01, 5.842e-01, -2.199e-01, -6.131e-02, -4.597e-01, 5.556e-01, 7.933e-02, -2.150e-01, -3.469e-01, -1.978e-01));
r += mul(s1_5, M4(7.883e-05, -2.207e-02, -1.735e-02, 2.167e-02, 4.628e-02, 8.814e-02, -4.837e-02, 6.515e-02, 1.617e-01, -4.460e-02, -1.002e-01, 7.496e-02, -1.180e-01, 5.540e-02, -5.708e-02, 5.715e-02));
r += mul(s1_6, M4(1.680e-01, -5.262e-02, 6.143e-02, -4.758e-02, -5.343e-02, 4.332e-02, 1.191e-01, 8.545e-03, 1.171e-01, -8.169e-02, 1.535e-02, -2.281e-01, 8.009e-02, -9.744e-02, 6.114e-02, 8.379e-03));
r += mul(s1_7, M4(-9.744e-02, 2.573e-02, 6.125e-02, 1.265e-01, 9.253e-02, -1.227e-01, 3.224e-01, -2.402e-01, 1.083e-01, 1.607e-02, 1.155e-01, -4.014e-01, -2.347e-02, -3.821e-02, 2.379e-01, 2.605e-02));
r += mul(s1_8, M4(5.428e-02, -5.434e-02, -2.345e-02, -2.189e-03, 1.274e-02, 7.503e-02, 1.442e-01, -8.839e-02, -3.480e-02, 1.444e-02, -3.859e-02, -1.089e-01, -3.183e-02, 9.172e-02, 1.092e-01, 6.688e-02));
// s2_*: positive part of the t3 taps.
r += mul(s2_0, M4(2.283e-01, 3.872e-02, -5.533e-02, -1.704e-02, -1.533e-02, 1.459e-02, 3.842e-02, 6.367e-02, -4.041e-02, -6.411e-03, -5.052e-03, -8.331e-03, 2.786e-03, -5.502e-02, 6.695e-03, -1.982e-02));
r += mul(s2_1, M4(-4.716e-01, 4.092e-01, -1.581e-01, 4.209e-01, 1.255e-01, -7.138e-02, 7.300e-02, -1.357e-01, -6.908e-02, -1.986e-02, 1.801e-02, -4.505e-02, -1.611e-01, -1.216e-01, -6.522e-02, -9.093e-02));
r += mul(s2_2, M4(1.019e-01, -3.650e-02, 1.353e-02, 2.487e-01, -1.344e-04, 4.653e-02, 1.721e-02, 4.005e-02, 7.572e-03, -4.357e-02, -3.720e-02, 2.091e-02, 6.051e-03, -6.957e-02, -9.009e-02, -1.788e-02));
r += mul(s2_3, M4(2.159e-02, -3.325e-02, 3.084e-02, 1.091e-01, -9.662e-02, 1.040e-01, 1.078e-01, -2.572e-02, 2.237e-04, -2.571e-02, -2.335e-02, -1.554e-02, 1.275e-01, -4.579e-02, -1.772e-02, 3.282e-02));
r += mul(s2_4, M4(4.984e-02, 2.302e-01, 6.568e-02, 1.279e-01, 6.857e-02, -1.499e-01, -4.461e-02, -1.977e-01, -1.903e-01, 1.430e-01, 3.271e-02, 1.978e-01, 2.410e-01, 5.980e-01, -1.394e-01, 2.261e-01));
r += mul(s2_5, M4(2.188e-02, -8.976e-03, 2.475e-02, 1.340e-02, -4.458e-02, 5.360e-02, 2.628e-02, -1.405e-02, 6.166e-02, -4.895e-02, 1.348e-03, 5.680e-02, -1.123e-01, 7.224e-02, -6.458e-02, 1.314e-01));
r += mul(s2_6, M4(3.252e-02, -2.389e-02, -2.067e-02, -6.871e-02, -8.327e-02, 7.793e-02, 7.681e-03, 5.095e-02, -1.693e-02, -3.622e-02, 3.065e-02, -1.582e-02, -6.963e-03, 2.835e-02, 6.805e-02, -1.475e-02));
r += mul(s2_7, M4(4.783e-02, -2.945e-02, 4.732e-02, -9.789e-04, -1.619e-02, -2.603e-02, -1.368e-01, 2.956e-02, 9.844e-02, -1.214e-01, 1.776e-01, -1.461e-01, -5.165e-02, -1.055e-02, 1.793e-01, -4.355e-02));
r += mul(s2_8, M4(2.619e-03, 4.801e-02, 6.393e-02, -2.399e-02, -1.280e-03, -2.210e-02, -4.649e-02, 1.561e-03, -1.789e-02, 5.576e-02, 1.200e-01, 3.338e-03, 4.475e-02, -2.957e-02, 9.300e-02, -7.837e-02));
// s3_*: negative part of the t3 taps.
r += mul(s3_0, M4(-1.536e-01, -3.593e-03, -1.064e-02, 1.740e-02, 9.197e-02, 2.772e-01, 5.258e-01, 5.745e-01, 2.331e-02, 8.995e-02, 2.611e-02, 5.463e-02, 4.872e-02, -8.230e-03, -1.742e-02, 3.405e-03));
r += mul(s3_1, M4(4.799e-02, 1.088e-01, -7.562e-02, 5.926e-02, 4.190e-01, -4.922e-01, -1.822e-01, -2.309e-01, 1.776e-01, 1.799e-01, 1.213e-01, 3.198e-01, -1.565e-01, 2.118e-02, -5.914e-02, 1.048e-01));
r += mul(s3_2, M4(-6.867e-02, -2.488e-02, 2.563e-02, -3.161e-02, -4.038e-02, 5.042e-02, 2.474e-02, 3.962e-03, -4.263e-02, 4.382e-02, -6.197e-03, 5.435e-02, 8.477e-02, -7.694e-02, -2.473e-02, -2.000e-02));
r += mul(s3_3, M4(-6.567e-02, 7.271e-02, -2.275e-02, -4.345e-03, -4.825e-02, -7.541e-01, 5.163e-01, 9.170e-01, -1.040e-01, -9.911e-03, 3.569e-02, 2.347e-01, 2.350e-02, 6.202e-02, 7.421e-03, 2.377e-02));
r += mul(s3_4, M4(-3.371e-02, -2.738e-02, 1.670e-01, 2.607e-01, -5.009e-02, 5.743e-03, -6.991e-01, -2.858e-02, -6.907e-02, -4.016e-01, 3.462e-01, 9.128e-01, -1.622e-01, 1.392e-01, 2.250e-01, 1.183e-01));
r += mul(s3_5, M4(-8.330e-03, 1.029e-01, 1.045e-01, 2.013e-01, 2.609e-02, 7.939e-02, -1.054e-01, 6.487e-02, 1.165e-01, -6.250e-02, 1.274e-01, 2.396e-01, 2.390e-01, -2.468e-01, 1.178e-02, 6.794e-02));
r += mul(s3_6, M4(5.411e-02, -5.669e-02, 2.831e-02, -3.762e-02, 1.186e-01, 1.750e-01, -2.862e-01, -9.876e-02, 5.851e-02, 2.750e-02, 7.348e-03, -2.151e-01, -3.151e-02, 5.225e-02, 3.178e-02, 1.438e-02));
r += mul(s3_7, M4(-2.053e-03, 2.875e-02, -4.633e-02, -7.843e-02, -5.216e-02, -1.497e-04, -2.534e-01, -5.098e-01, 3.092e-02, -4.215e-02, -1.330e-01, -9.137e-02, 5.062e-02, 5.514e-02, -1.958e-01, 6.162e-03));
r += mul(s3_8, M4(3.627e-02, 1.482e-02, 2.228e-02, -7.151e-02, -1.770e-02, 6.009e-02, 2.013e-01, 2.403e-02, 1.912e-03, -9.001e-03, 1.673e-02, -3.465e-02, 5.222e-02, -3.027e-02, -4.458e-03, -6.391e-02));
// Constant per-channel offset (presumably the layer bias).
r += V4(-3.261e-03, 1.350e-04, -6.605e-05, 1.307e-04);
return r;
}
// Pass 5 ("conv4"), second output: Pass5 stores the returned value in t1.
// Takes the same 36 inputs as f0 of this pass, with a different set of constants:
//   s0_* / s1_*: positive / negative part of the t2 taps,
//   s2_* / s3_*: positive / negative part of the t3 taps (see Pass5),
//   suffix 0..8: position in the 3x3 neighbourhood, row-major from (-1,-1) to (1,1).
// Every input is multiplied, as a row vector, by its own constant 4x4 matrix;
// the products are accumulated in the order written and a constant 4-vector
// is added last. No activation is applied to the returned sum.
// The literals are presumably trained network weights - treat them as data.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: positive part of the t2 taps.
r += mul(s0_0, M4(-5.528e-02, 2.435e-02, -2.728e-02, 5.042e-02, -2.357e-02, 1.752e-02, 6.730e-02, -1.869e-02, 5.562e-02, 2.108e-03, -2.535e-02, -7.791e-02, -6.984e-02, 8.842e-02, 7.203e-02, 3.709e-02));
r += mul(s0_1, M4(-6.164e-02, -1.824e-02, 8.179e-02, -3.238e-02, 5.338e-02, -5.506e-02, -1.020e-01, 1.520e-02, 1.953e-01, -2.850e-02, 8.323e-02, -8.899e-02, 5.112e-02, 6.369e-02, -5.510e-02, 1.997e-02));
r += mul(s0_2, M4(6.117e-02, -1.311e-02, -9.258e-03, -1.479e-02, -2.710e-02, 2.958e-02, 2.946e-02, -9.472e-03, 4.257e-02, -7.053e-02, -5.896e-02, 5.475e-02, 6.131e-02, -1.827e-02, -2.909e-02, -6.470e-02));
r += mul(s0_3, M4(-1.411e-01, 1.597e-01, 2.142e-01, 6.972e-02, 1.704e-02, 4.423e-02, -8.405e-02, 4.993e-02, 1.176e-02, -8.471e-02, 4.062e-02, -1.001e-01, -3.805e-02, 3.820e-02, -6.258e-01, 2.568e-01));
r += mul(s0_4, M4(3.384e-01, -2.619e-01, 1.799e-01, -3.175e-01, 3.472e-03, -1.186e-01, 7.886e-02, -1.126e-01, 1.378e-01, -3.772e-02, -1.396e-02, 6.889e-02, -1.383e-01, 1.958e-01, 7.297e-02, -1.066e+00));
r += mul(s0_5, M4(-4.115e-04, 8.733e-03, 3.432e-02, 5.650e-02, 9.203e-02, 6.899e-02, -9.987e-03, 5.139e-02, 2.075e-01, -1.229e-02, 5.912e-02, -2.866e-02, -1.602e-01, 1.654e-01, 6.957e-02, 5.472e-02));
r += mul(s0_6, M4(-1.000e-01, 9.401e-02, -3.864e-02, 1.160e-01, 1.108e-03, 8.814e-02, 6.570e-04, 2.167e-02, 6.762e-05, -1.080e-02, -1.670e-02, -4.178e-03, -9.704e-03, 2.164e-01, 3.748e-02, -1.258e-02));
r += mul(s0_7, M4(7.557e-02, -2.360e-01, -2.727e-02, -7.688e-02, -3.110e-02, 1.671e-02, -4.238e-02, 5.553e-02, 6.518e-02, 3.357e-02, -2.725e-02, -2.524e-02, -1.352e-01, -1.005e-01, -4.108e-02, 2.664e-01));
r += mul(s0_8, M4(9.624e-02, 5.754e-03, 8.412e-02, -2.955e-02, 2.850e-02, 8.830e-03, -4.162e-02, -1.337e-02, -4.374e-02, -2.352e-02, -1.566e-02, 1.822e-02, 7.979e-02, -9.058e-02, -1.071e-01, -3.379e-03));
// s1_*: negative part of the t2 taps.
r += mul(s1_0, M4(1.395e-02, 1.801e-02, 1.899e-03, -3.313e-02, 2.251e-02, -3.697e-03, 5.577e-02, -3.001e-02, -6.090e-02, 1.645e-01, -1.047e-01, 1.483e-01, -6.634e-03, 3.917e-04, -1.999e-02, 2.114e-02));
r += mul(s1_1, M4(2.859e-03, 5.455e-02, 4.336e-02, -2.717e-02, 9.302e-02, -9.807e-02, 7.046e-02, -3.707e-02, -1.275e-01, -3.463e-02, -1.160e-01, -4.227e-02, 3.162e-02, 3.583e-02, 4.579e-02, -1.196e-02));
r += mul(s1_2, M4(-7.086e-03, 2.542e-03, 1.500e-03, -6.273e-03, 5.711e-02, -5.317e-02, -5.455e-03, 4.847e-02, 8.830e-02, 5.991e-02, 3.356e-02, 1.214e-03, -5.272e-03, -5.211e-02, -2.142e-02, -1.246e-02));
r += mul(s1_3, M4(-4.807e-02, 4.530e-02, 2.719e-01, -1.035e-02, 4.911e-02, -5.824e-03, -6.478e-02, -1.051e-03, -1.348e-02, 6.405e-01, -4.257e-01, 3.690e-01, -9.665e-02, 2.101e-01, 6.571e-02, 9.738e-02));
r += mul(s1_4, M4(2.423e-01, -2.074e-01, -4.394e-01, -2.830e-02, 5.415e-02, -2.337e-01, 6.080e-01, -1.843e-01, -5.128e-01, 1.559e-01, -2.033e-01, -6.040e-02, -6.726e-02, 2.589e-01, 1.901e-01, -9.598e-02));
r += mul(s1_5, M4(-1.456e-01, 6.484e-02, 1.125e-01, -1.183e-02, 2.186e-01, 2.930e-02, -4.285e-02, 6.272e-02, 1.500e-01, 1.033e-01, 2.173e-01, -3.328e-02, -6.785e-02, -7.882e-02, -1.450e-01, 7.182e-02));
r += mul(s1_6, M4(-4.062e-02, 9.988e-02, -5.106e-02, 1.546e-01, 5.122e-02, -7.398e-02, -5.320e-03, -5.669e-02, -4.188e-02, 2.035e-01, -5.253e-02, -7.554e-03, -6.233e-02, 1.285e-01, 1.152e-02, 7.495e-02));
r += mul(s1_7, M4(1.168e-01, -1.061e-01, -8.798e-02, -2.456e-01, -1.274e-01, -9.338e-02, 6.064e-04, 1.255e-01, 2.944e-02, -9.599e-02, -1.606e-01, 1.477e-01, -5.541e-02, -9.992e-02, -5.652e-02, 1.402e-02));
r += mul(s1_8, M4(-8.447e-02, -2.272e-02, 3.291e-02, 1.141e-01, 2.835e-01, 2.747e-02, 9.338e-03, -1.271e-01, 1.118e-03, -3.543e-02, -3.201e-02, 5.803e-02, 1.793e-01, -6.889e-02, -3.139e-02, -1.000e-01));
// s2_*: positive part of the t3 taps.
r += mul(s2_0, M4(3.477e-02, 8.152e-03, -8.100e-03, 3.869e-02, 4.675e-02, 8.080e-02, -4.909e-02, 6.764e-03, -2.946e-03, -7.021e-02, -1.191e-02, -1.660e-02, -5.967e-02, -1.872e-02, -3.485e-02, 3.391e-02));
r += mul(s2_1, M4(1.685e-01, -2.681e-01, -2.340e-01, -1.748e-01, -1.593e-01, 7.496e-02, 3.748e-02, 1.562e-02, 5.150e-02, -3.648e-02, 3.739e-02, -4.384e-02, -1.521e-02, -1.061e-01, -1.381e-01, 1.733e-02));
r += mul(s2_2, M4(1.573e-01, 1.415e-01, 1.714e-01, -5.175e-02, -2.442e-02, 1.054e-02, 3.047e-03, -5.944e-03, -6.027e-03, 1.034e-02, -3.381e-02, 4.299e-02, -9.763e-02, 4.729e-02, 9.642e-02, -1.450e-02));
r += mul(s2_3, M4(8.191e-03, 1.353e-01, -6.018e-02, 5.677e-02, -1.725e-02, -1.324e-01, 1.646e-01, -1.154e-01, -9.796e-03, 3.066e-02, -5.975e-02, 2.878e-02, -1.381e-01, 1.550e-01, 3.556e-02, 8.926e-02));
r += mul(s2_4, M4(1.715e-01, -2.115e-02, 8.179e-02, -2.066e-01, 1.275e-01, 1.599e-01, 2.325e-02, -9.637e-03, 6.565e-02, -1.901e-01, 7.185e-02, -1.559e-01, 1.106e-01, -6.210e-02, -3.672e-01, 6.248e-02));
r += mul(s2_5, M4(-3.453e-03, 5.284e-02, -1.031e-01, 5.091e-02, 1.538e-02, -9.971e-02, -5.610e-02, -2.585e-02, 6.441e-02, 1.113e-01, 3.085e-02, 6.860e-02, -6.167e-02, -6.774e-02, -6.898e-02, -4.397e-03));
r += mul(s2_6, M4(-1.561e-02, 5.106e-02, 2.999e-03, -7.663e-03, 6.665e-02, -1.217e-01, -9.529e-03, -2.096e-02, -2.825e-02, 4.854e-02, -2.196e-02, -7.191e-03, 2.274e-03, 1.698e-02, -1.727e-02, 1.967e-03));
r += mul(s2_7, M4(3.534e-02, -1.077e-02, 1.607e-02, 4.542e-02, -7.989e-02, 1.294e-01, 4.920e-02, -6.332e-02, -9.402e-02, 2.028e-02, -6.305e-03, 9.061e-02, 2.225e-03, 2.352e-02, -4.032e-03, -4.985e-02));
r += mul(s2_8, M4(7.112e-02, -1.427e-02, -2.352e-02, -2.989e-02, -5.633e-02, -6.039e-03, 3.496e-03, 2.535e-02, 1.265e-01, -4.541e-02, -5.393e-02, -5.355e-02, 1.498e-03, 2.057e-02, 1.278e-02, 5.662e-02));
// s3_*: negative part of the t3 taps.
r += mul(s3_0, M4(9.523e-02, -7.183e-02, -2.740e-01, -1.569e-02, 1.008e-01, 3.065e+00, -2.003e-01, 1.938e-01, 7.503e-02, -1.096e-01, -3.177e-02, -4.074e-02, 1.090e-03, -2.250e-02, -4.727e-02, 2.528e-02));
r += mul(s3_1, M4(-7.789e-02, 7.186e-03, 3.838e-01, -1.314e-01, -4.119e-01, 1.344e-01, 5.252e-02, -4.478e-02, -2.421e-01, 8.221e-02, 1.588e-01, 5.943e-02, -6.960e-02, -7.055e-02, -5.857e-02, -2.367e-02));
r += mul(s3_2, M4(1.578e-01, -5.477e-02, -1.343e-01, 7.698e-02, 9.761e-02, -2.725e-02, -6.329e-02, -5.552e-02, -6.854e-02, 1.143e-02, -8.043e-02, 1.416e-02, 5.387e-02, 1.371e-01, 1.146e-01, -5.881e-04));
r += mul(s3_3, M4(7.307e-03, -8.177e-02, 5.634e-02, -1.149e-01, -4.060e-01, 1.613e+00, -3.145e-01, 2.057e-02, -9.555e-02, 2.548e-01, 5.932e-02, 7.789e-02, 7.174e-03, -6.399e-03, -2.315e-02, 8.381e-03));
r += mul(s3_4, M4(1.200e-01, 1.356e-01, 8.711e-03, 7.537e-02, -1.751e-01, 3.458e-02, 2.391e-01, -1.111e-01, 1.506e-01, -3.165e-01, -4.619e-01, -9.386e-02, -4.377e-02, -1.492e-01, -5.002e-01, 9.821e-02));
r += mul(s3_5, M4(1.539e-01, 7.309e-02, 4.257e-03, -1.539e-01, -4.757e-01, 1.070e-01, 1.702e-02, 9.709e-02, -1.140e-01, 1.938e-01, 1.982e-01, -3.215e-02, -3.822e-01, 3.408e-01, 1.647e-01, 1.597e-01));
r += mul(s3_6, M4(-3.320e-02, 4.854e-02, -1.957e-02, 3.353e-02, 1.823e-01, 8.532e-02, 3.236e-02, -1.874e-01, -1.073e-02, -6.598e-03, -2.954e-02, -2.175e-02, 1.184e-02, -3.856e-02, 2.166e-02, -2.608e-02));
r += mul(s3_7, M4(2.038e-02, -4.606e-02, -3.841e-02, -4.008e-02, -2.542e-01, -1.076e-01, -2.891e-02, 1.837e-01, 3.842e-02, 1.753e-01, 3.043e-02, -3.298e-02, 2.990e-02, 1.215e-01, 9.583e-02, -5.860e-02));
r += mul(s3_8, M4(6.138e-02, 3.405e-02, 3.364e-04, 6.037e-03, 1.811e-01, 9.691e-04, 3.497e-02, -1.810e-02, -3.940e-02, -1.159e-01, -7.007e-02, 1.170e-01, 1.829e-02, -2.216e-02, -1.689e-02, 1.150e-01));
// Constant per-channel offset (presumably the layer bias).
r += V4(-1.782e-04, -1.204e-03, 6.004e-04, -1.736e-03);
return r;
}
// Entry point of pass 5 ("conv4"). Each thread handles one texel: it gathers
// the 3x3 neighbourhood of t2 and t3, splits every tap into its positive and
// negative part, and writes the f0()/f1() results to t0 and t1.
void Pass5(uint2 blockStart, uint3 tid) {
	// Texel handled by this thread; threads that fall outside the input do nothing.
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		return;
	}

	// NOTE(review): l0()/l1() expand through O(), which is defined outside this
	// section and presumably reads `pos` and `pt` by name - keep these identifiers.
	float2 pt = float2(GetInputPt());
	float2 pos = (gxy + 0.5) * pt;

	// 3x3 taps of t2, row-major from (-1,-1) to (1,1).
	const V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// s0_* keeps the positive part of each t2 tap, s1_* the negative part.
	const V4 s0_0 = max(a0, 0.0), s1_0 = -max(-a0, 0.0);
	const V4 s0_1 = max(a1, 0.0), s1_1 = -max(-a1, 0.0);
	const V4 s0_2 = max(a2, 0.0), s1_2 = -max(-a2, 0.0);
	const V4 s0_3 = max(a3, 0.0), s1_3 = -max(-a3, 0.0);
	const V4 s0_4 = max(a4, 0.0), s1_4 = -max(-a4, 0.0);
	const V4 s0_5 = max(a5, 0.0), s1_5 = -max(-a5, 0.0);
	const V4 s0_6 = max(a6, 0.0), s1_6 = -max(-a6, 0.0);
	const V4 s0_7 = max(a7, 0.0), s1_7 = -max(-a7, 0.0);
	const V4 s0_8 = max(a8, 0.0), s1_8 = -max(-a8, 0.0);

	// 3x3 taps of t3, same ordering.
	const V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// s2_* keeps the positive part of each t3 tap, s3_* the negative part.
	const V4 s2_0 = max(b0, 0.0), s3_0 = -max(-b0, 0.0);
	const V4 s2_1 = max(b1, 0.0), s3_1 = -max(-b1, 0.0);
	const V4 s2_2 = max(b2, 0.0), s3_2 = -max(-b2, 0.0);
	const V4 s2_3 = max(b3, 0.0), s3_3 = -max(-b3, 0.0);
	const V4 s2_4 = max(b4, 0.0), s3_4 = -max(-b4, 0.0);
	const V4 s2_5 = max(b5, 0.0), s3_5 = -max(-b5, 0.0);
	const V4 s2_6 = max(b6, 0.0), s3_6 = -max(-b6, 0.0);
	const V4 s2_7 = max(b7, 0.0), s3_7 = -max(-b7, 0.0);
	const V4 s2_8 = max(b8, 0.0), s3_8 = -max(-b8, 0.0);

	// Both outputs consume the same 36 vectors with different weights.
	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT
// Tap helpers for pass 6: fetch t0 (l0) or t1 (l1) at the texel offset (x, y)
// through O() and convert the result to V4.
// NOTE(review): O() is defined outside this section; presumably it samples
// relative to the caller's `pos`/`pt` locals - confirm against the COMMON block.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
// Pass 6 ("out-shuffle"): final weighted sum. Pass6 uses the four components
// of the result as luma offsets for the 2x2 output pixels of one input texel.
// The 36 inputs are named s<g>_<k>:
//   g = 0 / 1: positive / negative part of the t0 taps,
//   g = 2 / 3: positive / negative part of the t1 taps (see Pass6),
//   k = 0..8 : position in the 3x3 neighbourhood, row-major from (-1,-1) to (1,1).
// Every input is multiplied, as a row vector, by its own constant 4x4 matrix;
// the products are accumulated in the order written, a constant 4-vector is
// added, and the sum is passed through tanh(), so each component is in [-1, 1].
// The literals are presumably trained network weights - treat them as data.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
V4 r = 0.0;
// s0_*: positive part of the t0 taps.
r += mul(s0_0, M4(-6.910e-02, 1.215e-03, -2.039e-03, -1.079e-04, 8.088e-02, -2.119e-02, -1.929e-02, 1.865e-02, -6.142e-02, 2.499e-02, -4.185e-03, 1.951e-03, -1.099e-02, 1.071e-02, 3.133e-03, -9.539e-03));
r += mul(s0_1, M4(-2.129e-02, 6.812e-02, 2.738e-02, -2.965e-02, -1.569e-01, -7.369e-02, 6.714e-02, -2.416e-02, 6.421e-02, -3.329e-02, 4.397e-03, 1.902e-02, 1.426e-01, 7.469e-02, -3.306e-02, 1.260e-02));
r += mul(s0_2, M4(-2.521e-02, -1.556e-02, -1.880e-02, 1.813e-02, -2.926e-03, -3.967e-02, -2.562e-02, 1.669e-02, 1.699e-03, 2.545e-02, 9.862e-03, 1.052e-02, -1.392e-02, 1.215e-02, 2.436e-02, 2.113e-04));
r += mul(s0_3, M4(1.800e-02, -2.761e-02, 1.145e-02, -6.469e-02, 1.392e-01, 1.033e-02, 1.406e-01, -7.326e-03, -2.077e-02, 2.985e-03, -1.102e-01, 2.804e-02, -1.544e-02, 5.050e-02, 2.915e-02, 2.396e-02));
r += mul(s0_4, M4(1.242e-01, -4.463e-01, -3.829e-01, 1.871e-01, -8.392e-02, 6.470e-02, -3.115e-01, -1.970e-01, -1.186e-01, -1.204e-01, -2.296e-02, -1.763e-01, -1.265e-01, -1.919e-01, 6.718e-02, 8.923e-02));
r += mul(s0_5, M4(-2.493e-02, 3.014e-02, 2.446e-02, -1.488e-01, 1.299e-02, -5.759e-02, 2.138e-02, -9.211e-02, -8.051e-03, -4.216e-02, -1.327e-02, -9.724e-04, 3.675e-02, 7.968e-03, -3.353e-02, -4.044e-02));
r += mul(s0_6, M4(2.027e-02, 3.813e-03, -2.557e-03, -2.670e-02, 2.068e-02, 1.886e-02, 6.014e-02, 3.191e-02, -1.917e-03, -2.659e-03, 1.273e-02, 3.109e-03, 9.881e-03, -4.410e-04, 7.569e-03, 1.276e-02));
r += mul(s0_7, M4(-1.802e-03, 4.820e-02, 4.201e-02, 4.574e-02, 2.826e-02, 2.044e-02, 1.196e-01, 9.132e-02, 1.800e-02, 2.670e-02, -3.398e-03, 1.359e-02, 1.247e-02, 1.268e-02, 1.628e-03, -1.067e-02));
r += mul(s0_8, M4(5.233e-03, 3.648e-02, 2.719e-02, 2.838e-02, 1.857e-03, -1.999e-03, 1.703e-02, 5.921e-02, 7.925e-03, -2.543e-03, 5.431e-03, -1.102e-02, -1.116e-02, -5.510e-03, -9.183e-03, -8.054e-03));
// s1_*: negative part of the t0 taps.
r += mul(s1_0, M4(-6.423e-02, -5.758e-03, -8.948e-03, -2.227e-03, 5.802e-02, -2.252e-02, -8.134e-03, 1.448e-02, -3.642e-02, 4.476e-03, 7.865e-03, 3.269e-03, 1.053e-02, 1.269e-02, -1.530e-03, -9.628e-03));
r += mul(s1_1, M4(-2.553e-02, 4.747e-02, 4.136e-02, -2.368e-02, -1.401e-01, -4.967e-02, 6.372e-02, -1.788e-04, 3.663e-01, 2.193e-01, -8.228e-02, -8.507e-02, 1.404e-01, 8.229e-02, -5.862e-02, -1.161e-02));
r += mul(s1_2, M4(-2.216e-02, -7.521e-03, -2.522e-02, 2.337e-02, -2.651e-03, -3.786e-02, -9.854e-03, 2.033e-02, 9.696e-03, 1.237e-01, 6.173e-03, 2.898e-02, -1.335e-02, 2.948e-02, 9.778e-03, -1.243e-02));
r += mul(s1_3, M4(-1.598e-02, -1.677e-02, -4.726e-02, -2.250e-02, 2.076e-01, -2.825e-02, 1.389e-01, -2.552e-02, 3.209e-02, -3.267e-03, -9.876e-02, 3.775e-02, -5.440e-02, 6.367e-02, 8.425e-02, 7.583e-03));
r += mul(s1_4, M4(-2.339e-01, -8.617e-02, -3.313e-01, 1.470e-01, -1.249e-01, 3.994e-01, -7.191e-01, -2.121e-01, 2.521e-02, 4.601e-02, -3.584e-01, -4.014e-01, -4.299e-01, -4.828e-01, 4.034e-01, 3.633e-01));
r += mul(s1_5, M4(3.413e-02, -4.685e-03, 4.308e-02, -1.211e-01, 3.722e-02, -1.000e-01, 5.938e-02, -1.900e-01, 3.286e-03, 6.076e-03, 2.628e-02, -1.190e-01, 3.968e-02, -3.583e-02, -4.724e-02, 5.713e-02));
r += mul(s1_6, M4(3.008e-02, -2.083e-02, 7.970e-03, -2.011e-02, -8.809e-03, 9.741e-03, 7.228e-02, 1.875e-02, -8.374e-03, -2.245e-03, 1.642e-02, -9.996e-03, 2.093e-02, 6.393e-03, 6.227e-03, -6.775e-03));
r += mul(s1_7, M4(1.113e-02, 5.783e-02, -1.430e-02, 2.826e-02, -1.250e-02, -3.106e-02, 1.754e-01, 2.001e-01, -1.431e-02, -1.368e-02, 4.329e-02, 4.832e-02, 4.089e-02, 3.702e-02, -5.774e-03, 8.701e-03));
r += mul(s1_8, M4(1.395e-03, 3.747e-02, 2.706e-02, 4.675e-02, -1.191e-02, -2.163e-02, 3.137e-02, 7.056e-02, 4.929e-03, -6.465e-03, 1.083e-03, 1.816e-02, -3.896e-03, 1.081e-02, -1.507e-02, -1.412e-02));
// s2_*: positive part of the t1 taps.
r += mul(s2_0, M4(5.551e-02, 3.061e-02, 2.172e-02, -4.435e-04, 7.341e-02, -4.254e-03, -3.710e-02, 2.005e-02, 3.528e-02, 1.764e-02, 4.547e-03, -6.460e-03, 1.949e-01, 2.466e-02, 7.886e-02, -2.722e-03));
r += mul(s2_1, M4(-1.216e-03, 4.895e-02, -2.548e-02, 1.354e-02, 1.184e-01, -2.592e-01, 3.262e-02, 3.213e-02, -7.885e-02, -2.429e-02, -5.811e-02, 1.909e-02, 3.185e-02, -7.057e-02, -2.388e-02, 1.018e-01));
r += mul(s2_2, M4(-4.325e-03, 8.278e-03, -7.126e-04, -3.013e-03, -2.277e-02, 6.470e-02, -3.258e-02, 6.558e-03, 2.954e-02, 9.175e-03, -1.066e-03, -1.931e-02, 3.523e-03, 1.347e-03, -1.837e-03, -3.765e-03));
r += mul(s2_3, M4(-1.063e-01, 1.364e-02, -1.031e-01, 7.569e-02, -3.770e-02, 3.667e-02, 2.683e-02, 5.980e-02, -1.057e-01, -1.107e-02, -7.272e-02, 5.094e-02, 7.605e-02, 1.566e-02, 1.708e-01, 2.124e-01));
r += mul(s2_4, M4(1.344e-02, -6.091e-02, 2.694e-02, -2.727e-02, 2.786e-01, 5.187e-02, 6.738e-01, -9.220e-01, 1.745e-01, -1.468e-02, 1.843e-01, -1.866e-01, -9.396e-02, -1.505e-01, 2.471e-01, -1.138e+00));
r += mul(s2_5, M4(6.506e-03, 7.226e-03, 9.650e-03, 3.959e-03, -2.858e-02, -1.124e-01, -5.599e-02, 8.081e-02, -3.923e-02, 6.977e-02, 2.327e-03, 1.164e-01, 1.242e-02, -1.947e-02, -4.582e-02, 2.119e-02));
r += mul(s2_6, M4(-1.730e-02, -2.202e-02, -2.408e-02, -6.448e-02, -3.767e-03, 2.506e-02, -4.165e-02, 4.527e-02, 1.431e-02, -2.421e-02, -1.170e-02, -6.665e-02, -1.236e-02, 5.709e-03, -6.345e-03, -3.440e-02));
r += mul(s2_7, M4(-4.211e-02, -5.191e-02, -9.762e-02, -1.275e-01, 2.079e-02, -1.004e-01, 7.470e-02, 1.084e-02, -1.789e-02, 8.006e-02, 3.170e-02, 1.111e-01, -4.772e-02, -6.100e-02, 2.375e-02, 2.545e-03));
r += mul(s2_8, M4(-7.109e-03, 1.968e-03, -9.159e-03, -1.523e-02, -1.024e-02, -5.787e-04, -4.581e-02, -1.496e-02, 2.302e-02, -1.568e-02, 2.850e-02, 9.731e-03, -1.219e-02, 1.316e-03, -1.859e-02, 8.662e-02));
// s3_*: negative part of the t1 taps.
r += mul(s3_0, M4(2.241e-01, 1.599e-02, -3.007e-02, -8.278e-02, -2.343e-02, -1.323e-02, 6.153e-03, 8.030e-03, 1.988e-02, 1.870e-02, 7.620e-03, -1.035e-02, 2.443e-01, 4.061e-02, 3.123e-02, -4.152e-03));
r += mul(s3_1, M4(-1.500e-02, -2.365e-02, -2.046e-02, 4.369e-02, 7.611e-03, -9.342e-03, 4.413e-03, -1.110e-03, -1.238e-01, -3.394e-02, -4.442e-02, 2.423e-02, -9.742e-02, -2.324e-02, -3.479e-02, 4.742e-02));
r += mul(s3_2, M4(5.839e-03, 1.560e-02, -3.631e-03, 6.730e-03, -2.371e-03, -1.011e-02, -3.821e-03, 1.830e-03, 2.255e-02, 1.426e-02, -1.146e-02, -1.650e-02, 9.035e-03, 5.831e-03, 2.660e-03, -4.854e-03));
r += mul(s3_3, M4(-1.694e-01, -2.771e-01, 6.449e-01, -2.979e-01, 9.108e-02, -2.277e-02, -5.309e-02, -3.552e-02, -1.626e-01, 2.544e-02, -7.033e-02, 7.145e-02, -1.334e-01, 1.008e-01, 1.121e-01, 1.733e-01));
r += mul(s3_4, M4(-1.019e-01, 1.989e-01, -6.682e-02, -7.066e-02, -3.795e-02, 1.362e-01, 4.307e-02, -4.383e-02, 6.286e-01, -3.881e-01, 1.970e-01, -3.421e-01, -5.374e-03, -2.446e-01, -8.874e-02, -4.099e-01));
r += mul(s3_5, M4(1.279e-02, -1.406e-02, 7.997e-03, 1.743e-02, 2.251e-02, -4.285e-02, -2.154e-03, -1.441e-02, -2.329e-02, 1.667e-02, 4.333e-02, 1.229e-01, -2.284e-03, -2.450e-02, -8.000e-03, -1.712e-02));
r += mul(s3_6, M4(7.251e-02, 9.488e-03, -1.511e-01, -6.947e-02, -2.728e-02, 7.342e-03, 2.289e-02, 1.443e-02, 1.492e-02, -8.903e-03, -5.817e-02, -4.836e-02, -1.677e-03, 1.964e-02, -6.858e-03, -1.328e-02));
r += mul(s3_7, M4(-8.618e-02, -5.596e-02, -1.276e-01, -1.230e-01, 4.851e-03, -5.676e-02, 2.939e-02, -4.192e-02, -2.508e-02, 4.430e-02, 1.352e-01, 2.072e-02, -8.584e-03, -3.983e-02, 1.177e-02, -4.721e-02));
r += mul(s3_8, M4(6.050e-03, -3.781e-04, -3.124e-03, -1.667e-02, -1.291e-02, -1.315e-02, -2.106e-02, -5.240e-03, 1.412e-02, -2.504e-02, 3.138e-02, -2.989e-02, -6.363e-03, -1.480e-04, 1.157e-03, 1.933e-02));
// Constant per-channel offset (presumably the layer bias).
r += V4(-8.480e-04, -1.222e-04, -8.629e-04, -1.828e-04);
// tanh bounds every component of the result to [-1, 1].
return tanh(r);
}
// Entry point of pass 6 ("out-shuffle"). Each thread handles one input texel
// and writes the 2x2 block of OUTPUT pixels it maps to: f0() yields four luma
// offsets, each added to the luma of INPUT sampled at that output pixel.
void Pass6(uint2 blockStart, uint3 tid) {
	// Top-left output pixel of this thread's 2x2 block; bounds are checked
	// against the output size because gxy is in output coordinates here.
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}

	// Centre of the matching input texel (gxy >> 1).
	// NOTE(review): l0()/l1() expand through O(), which is defined outside this
	// section and presumably reads `pos` and `pt` by name - keep these identifiers.
	float2 pt = float2(GetInputPt());
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// 3x3 taps of t0, row-major from (-1,-1) to (1,1).
	const V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// s0_* keeps the positive part of each t0 tap, s1_* the negative part.
	const V4 s0_0 = max(a0, 0.0), s1_0 = -max(-a0, 0.0);
	const V4 s0_1 = max(a1, 0.0), s1_1 = -max(-a1, 0.0);
	const V4 s0_2 = max(a2, 0.0), s1_2 = -max(-a2, 0.0);
	const V4 s0_3 = max(a3, 0.0), s1_3 = -max(-a3, 0.0);
	const V4 s0_4 = max(a4, 0.0), s1_4 = -max(-a4, 0.0);
	const V4 s0_5 = max(a5, 0.0), s1_5 = -max(-a5, 0.0);
	const V4 s0_6 = max(a6, 0.0), s1_6 = -max(-a6, 0.0);
	const V4 s0_7 = max(a7, 0.0), s1_7 = -max(-a7, 0.0);
	const V4 s0_8 = max(a8, 0.0), s1_8 = -max(-a8, 0.0);

	// 3x3 taps of t1, same ordering.
	const V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// s2_* keeps the positive part of each t1 tap, s3_* the negative part.
	const V4 s2_0 = max(b0, 0.0), s3_0 = -max(-b0, 0.0);
	const V4 s2_1 = max(b1, 0.0), s3_1 = -max(-b1, 0.0);
	const V4 s2_2 = max(b2, 0.0), s3_2 = -max(-b2, 0.0);
	const V4 s2_3 = max(b3, 0.0), s3_3 = -max(-b3, 0.0);
	const V4 s2_4 = max(b4, 0.0), s3_4 = -max(-b4, 0.0);
	const V4 s2_5 = max(b5, 0.0), s3_5 = -max(-b5, 0.0);
	const V4 s2_6 = max(b6, 0.0), s3_6 = -max(-b6, 0.0);
	const V4 s2_7 = max(b7, 0.0), s3_7 = -max(-b7, 0.0);
	const V4 s2_8 = max(b8, 0.0), s3_8 = -max(-b8, 0.0);

	// One luma offset per output pixel: x = top-left, y = top-right,
	// z = bottom-left, w = bottom-right.
	const V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};

	// Move from the input texel centre to the centre of the top-left output
	// pixel, then walk the 2x2 block clockwise. The incremental pos/gxy updates
	// are kept in this exact order so the sample positions are unchanged.
	const float2 opt = float2(GetOutputPt());
	pos -= 0.5f * opt;

	// Top-left: only luma is modified, chroma comes from the INPUT sample.
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.x + r.x), yuv.yz)), 1);

	// Top-right.
	gxy.x += 1;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.x + r.y), yuv.yz)), 1);

	// Bottom-right.
	gxy.y += 1;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.x + r.w), yuv.yz)), 1);

	// Bottom-left.
	gxy.x -= 1;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.x + r.z), yuv.yz)), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,778 @@
// CuNNy 8x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D04N08
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState SP;
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): sample texture t with sampler SP at mip level 0, at `pos + p * pt`.
// `pos` and `pt` are not macro parameters: the calling pass function must
// declare locals with exactly these names before using O() or the l0()/l1() wrappers.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Shorthand for the minimum-16-bit-precision float vector and matrix types.
#define V4 min16float4
#define M4 min16float4x4
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): point-samples INPUT at texel offset (x, y) from `pos` and reduces
// its RGB to one min16float: a weighted sum of the three channels plus a constant.
#define l0(x, y) min16float((dot(float3(-1.880e-01, -3.696e-01, -8.936e-02), O(INPUT, float2(x, y)).rgb) + 5.137e-01))
// Pass 1 kernel: turns nine scalar taps (s0_0..s0_8) into one 4-channel value.
// Each tap scales its own constant V4 weight vector and the products are summed
// in tap order. Tap order matches the l0() calls in Pass1: row by row, from
// offset (-1, -1) to offset (1, 1).
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
V4 r = 0.0;
r += V4(6.049e-03, -3.524e-01, -1.308e-01, -6.691e-02) * s0_0;
r += V4(1.720e-02, -7.092e-02, -3.030e-01, 1.654e-01) * s0_1;
r += V4(-6.706e-03, 2.289e-01, 1.982e-03, -5.756e-02) * s0_2;
r += V4(-2.761e-02, 5.050e-01, -2.036e-01, 1.265e-01) * s0_3;
r += V4(-8.654e-01, -6.035e-01, -2.119e-01, 5.055e-01) * s0_4;
r += V4(-7.114e-03, 2.325e-02, 5.721e-02, 4.585e-02) * s0_5;
r += V4(2.796e-01, 1.680e-01, 1.353e-01, 1.286e-02) * s0_6;
r += V4(5.684e-01, 3.022e-01, 6.426e-01, 8.931e-02) * s0_7;
r += V4(3.723e-02, -2.036e-01, 2.732e-02, -4.101e-02) * s0_8;
// Constant per-channel offset, added after all taps.
r += V4(1.324e-02, -9.379e-05, 8.452e-03, 5.165e-02);
return r;
}
// Pass 1 entry point: one thread per input texel. Gathers the 3x3 neighbourhood
// of INPUT through l0() and stores the f0() result in t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Taps in the order f0 expects: row by row, offsets (-1, -1) .. (1, 1).
	const min16float tap0 = l0(-1.0, -1.0);
	const min16float tap1 = l0(0.0, -1.0);
	const min16float tap2 = l0(1.0, -1.0);
	const min16float tap3 = l0(-1.0, 0.0);
	const min16float tap4 = l0(0.0, 0.0);
	const min16float tap5 = l0(1.0, 0.0);
	const min16float tap6 = l0(-1.0, 1.0);
	const min16float tap7 = l0(0.0, 1.0);
	const min16float tap8 = l0(1.0, 1.0);

	t0[texel] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 2 (conv1) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass2 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(2.216e-02, 1.062e-01, -3.433e-03, -1.923e-01, 6.300e-02, -4.594e-01, 2.025e-01, 8.655e-03, -5.497e-02, 1.694e-01, -1.806e-01, 2.115e-01, -6.176e-02, 1.167e-02, -5.987e-02, 1.167e-01));
r += mul(s0_1, M4(-1.646e-01, -5.524e-01, -1.352e-01, 1.704e-01, 3.398e-02, -2.598e-01, 1.616e-01, -1.772e-01, -5.648e-02, 2.755e-01, 2.638e-02, -2.657e-02, 3.774e-02, -6.833e-02, -1.141e-01, -2.438e-01));
r += mul(s0_2, M4(-1.459e-01, 9.939e-02, -6.457e-04, 2.352e-02, 5.006e-02, -7.759e-01, -4.862e-02, -3.366e-02, 9.508e-02, 1.537e-01, -6.771e-02, -1.260e-01, 1.067e-01, -5.893e-02, -9.811e-02, -1.060e-02));
r += mul(s0_3, M4(-2.901e-01, 2.907e-01, 2.178e-01, -3.877e-01, 9.034e-03, 8.718e-03, -1.213e-01, 9.252e-02, 3.286e-01, -8.247e-02, -5.573e-02, -3.852e-01, -1.371e-01, 1.877e-01, 2.337e-01, 5.324e-01));
r += mul(s0_4, M4(-9.182e-01, 1.013e-01, 2.969e-01, 7.117e-01, -2.367e-01, -7.128e-02, 1.828e-01, 5.993e-01, -2.965e-01, 1.323e-01, 3.117e-02, -3.215e-01, -1.410e-01, 5.359e-02, -1.137e-01, -2.603e-01));
r += mul(s0_5, M4(-1.071e-01, -8.801e-02, 9.524e-03, -2.937e-02, 7.723e-02, 1.195e-01, -9.056e-02, 6.161e-02, 1.962e-01, -2.740e-01, -9.418e-02, 1.141e-01, 6.203e-02, -1.084e-01, 2.402e-01, -2.066e-01));
r += mul(s0_6, M4(2.226e-01, -2.259e-01, -2.499e-02, -9.184e-02, -1.499e-01, -3.737e-02, 1.576e-01, 1.084e-01, -2.221e-01, -1.080e-02, 2.643e-02, -1.023e-01, 1.068e-01, 1.193e-01, -2.781e-01, 3.396e-01));
r += mul(s0_7, M4(7.520e-01, -1.043e-01, -4.535e-02, 2.775e-01, 1.577e-01, -1.526e-01, 1.796e-01, 1.085e-01, -1.012e+00, 4.333e-02, 1.270e-02, -1.692e-01, 1.127e-01, -2.847e-01, -1.784e-01, -3.956e-01));
r += mul(s0_8, M4(2.206e-01, 1.370e-01, -7.453e-02, 1.050e-01, 8.412e-02, -1.396e-01, 1.707e-02, -1.654e-02, -2.116e-01, -7.944e-02, 1.244e-01, -6.709e-02, -5.577e-02, 1.619e-01, -2.818e-01, 1.460e-01));
r += mul(s1_0, M4(1.180e-01, -2.345e-01, 5.406e-02, -1.102e-01, 1.559e-02, -3.865e-01, -1.077e-01, 1.442e-02, -1.405e-01, 1.578e-01, -3.338e-02, 1.157e-01, -1.676e-01, 4.656e-02, -1.507e-01, 2.590e-02));
r += mul(s1_1, M4(-3.112e-02, -5.537e-01, -3.626e-01, -2.915e-01, 7.495e-02, 4.473e-01, -1.847e-01, -8.743e-02, -3.290e-02, 3.660e-02, 1.252e-01, 1.058e-02, 1.193e-01, 6.421e-02, -1.456e-01, -1.693e-01));
r += mul(s1_2, M4(-1.047e-01, -4.306e-01, 6.486e-03, 1.137e-01, 2.935e-02, -3.608e-01, 5.242e-02, -2.374e-02, 1.130e-01, -4.864e-02, -7.302e-02, -2.205e-02, 8.227e-02, -8.403e-02, -9.468e-02, 8.095e-02));
r += mul(s1_3, M4(-3.759e-02, 2.709e-01, 1.269e-01, -4.994e-01, -1.577e-02, 1.871e-01, -2.532e-01, 8.960e-02, 2.298e-01, -2.462e-01, -1.634e-02, -3.955e-01, 2.750e-02, -4.812e-02, -2.441e-01, 9.926e-01));
r += mul(s1_4, M4(-7.288e-01, 5.644e-01, 1.042e+00, 6.160e-01, -4.271e-01, 4.419e-01, 1.437e-01, 3.840e-01, -1.220e-01, -8.627e-01, 6.664e-02, -1.220e-02, 5.260e-02, 1.505e-01, -2.182e-01, -6.116e-01));
r += mul(s1_5, M4(1.659e-01, 2.566e-01, -5.954e-02, -9.187e-02, -8.251e-02, 1.091e-01, -1.506e-01, 1.370e-01, 3.056e-01, -3.512e-01, -4.956e-03, 7.008e-02, 1.320e-01, -3.995e-01, -8.603e-03, -3.542e-01));
r += mul(s1_6, M4(2.549e-01, -7.946e-02, -1.755e-01, -2.902e-02, -1.912e-01, 2.349e-01, 6.770e-02, 9.683e-02, -2.690e-01, -1.715e-01, 5.692e-02, -1.064e-01, 2.998e-01, 7.619e-02, 8.040e-03, 2.706e-01));
r += mul(s1_7, M4(7.320e-01, 1.397e-01, -5.600e-02, 9.609e-02, -1.267e-01, 6.841e-02, 2.429e-01, 3.167e-02, -6.816e-01, -3.313e-03, 5.622e-02, -4.727e-02, -3.420e-01, 4.283e-02, -3.250e-01, -4.118e-01));
r += mul(s1_8, M4(1.607e-01, 1.581e-01, -6.049e-02, 9.118e-02, -1.583e-02, 2.918e-01, 1.703e-02, -1.206e-01, -2.114e-01, -1.248e-01, 6.689e-02, -2.131e-02, -7.779e-02, 1.069e-01, -1.181e-01, 2.230e-01));
// Constant per-channel offset, added after all taps.
r += V4(1.959e-02, -5.807e-03, 9.415e-02, 7.247e-03);
return r;
}
// Pass 2 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t0, splits every tap into its positive and negative part, and stores the
// f0() result in t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t1[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-samples t1 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 3 (conv2) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass3 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(9.727e-02, 1.849e-01, 2.125e-02, 1.933e-01, 9.183e-02, 8.307e-03, -9.035e-02, 3.241e-02, 1.141e-01, 8.739e-02, -9.547e-02, 1.616e-01, 2.912e-02, -1.780e-02, 5.433e-02, 2.720e-02));
r += mul(s0_1, M4(-1.524e-01, -9.138e-02, 8.798e-02, -1.691e-01, 8.519e-03, 3.597e-02, -1.784e-02, 3.049e-02, 3.078e-02, 1.823e-01, 1.051e-02, -5.317e-02, -1.977e-01, 1.013e-01, 1.215e-01, 4.261e-02));
r += mul(s0_2, M4(-1.992e-02, -1.191e-01, -1.365e-03, 3.976e-02, 3.452e-03, 7.503e-03, 4.850e-03, 8.970e-03, -7.652e-03, 1.166e-01, 9.888e-02, 3.423e-03, -3.354e-01, -3.335e-01, -2.226e-02, -1.509e-01));
r += mul(s0_3, M4(-7.994e-02, 1.374e-01, -1.701e-02, -2.530e-01, 2.153e-01, -6.957e-03, -1.405e-01, -6.175e-02, 7.274e-03, 1.734e-01, -9.107e-02, -1.303e-01, -1.265e-01, 1.669e-02, 3.494e-02, -8.377e-02));
r += mul(s0_4, M4(-1.124e+00, 1.355e-02, -1.979e-01, -4.092e-01, -1.276e-01, -1.096e-01, 5.949e-02, 1.073e-01, -4.780e-02, 1.378e-01, 1.905e-01, -9.525e-02, -5.999e-01, 1.274e-01, 8.416e-01, 2.483e-01));
r += mul(s0_5, M4(3.312e-01, 2.036e-01, -5.231e-02, 5.357e-02, 1.666e-03, -2.102e-03, -3.213e-03, 4.747e-02, 1.130e-01, 3.492e-01, -1.263e-01, 4.100e-01, -5.859e-01, 4.875e-02, 2.227e-01, 3.127e-01));
r += mul(s0_6, M4(-3.699e-02, 6.066e-02, 3.448e-03, -4.158e-03, -4.048e-03, -3.619e-02, -8.830e-02, -8.917e-03, 2.990e-02, 6.919e-03, 9.803e-02, 2.188e-02, 5.674e-02, -3.122e-02, -6.793e-02, 8.573e-02));
r += mul(s0_7, M4(-1.255e-01, 1.754e-01, -1.332e-01, -1.124e-01, -2.163e-01, 1.552e-02, -7.485e-04, 4.194e-02, -1.899e-01, 1.334e-01, -1.721e-01, -3.487e-01, 3.847e-01, -3.823e-02, 1.121e-02, -7.128e-02));
r += mul(s0_8, M4(7.152e-02, -1.631e-02, 4.810e-02, 1.435e-01, 3.881e-02, -3.596e-02, -7.544e-03, -1.071e-01, -8.509e-02, 1.110e-01, 8.542e-02, 1.980e-02, -1.134e-01, -7.967e-02, -1.586e-01, 2.511e-01));
r += mul(s1_0, M4(2.326e-01, 4.791e-02, -1.996e-01, 1.352e-02, -9.909e-03, 1.117e-01, 2.198e-02, -6.683e-02, 1.356e-01, 2.830e-01, -8.418e-02, 2.137e-01, -1.401e-02, -7.056e-02, 5.360e-02, 6.243e-02));
r += mul(s1_1, M4(7.739e-01, -3.172e-01, -2.031e-01, 2.054e-01, -1.263e-01, -7.571e-03, 8.090e-02, -1.372e-01, 1.053e-01, 2.982e-01, -6.235e-02, 1.452e-02, 1.973e-01, 9.233e-02, -1.067e-01, 1.088e-01));
r += mul(s1_2, M4(-1.136e-01, -1.332e-01, -7.369e-02, 2.046e-01, -9.302e-02, 2.722e-02, 9.461e-02, -1.895e-01, 1.216e-02, 2.595e-01, 1.028e-01, 8.413e-02, -1.339e-01, -2.259e-01, -1.047e-01, 5.994e-02));
r += mul(s1_3, M4(1.224e-01, -3.713e-02, -2.383e-01, -1.743e-01, -1.876e-01, 1.155e-01, 2.212e-01, -1.375e-01, 1.618e-01, 2.628e-01, -1.161e-01, -1.826e-01, 8.003e-02, -1.961e-02, -6.278e-02, -5.710e-02));
r += mul(s1_4, M4(-2.647e-01, -1.603e-01, -7.731e-01, 1.958e-01, -4.093e-01, -1.110e-01, 3.352e-01, -3.093e-02, -6.201e-01, 3.073e-01, 3.779e-01, -2.733e-01, 4.035e-01, 1.230e-01, -1.606e-01, 9.421e-02));
r += mul(s1_5, M4(1.981e-01, -8.801e-03, -9.874e-03, -4.003e-02, 2.686e-03, -1.346e-01, -1.813e-02, -1.003e-01, 1.561e-01, 3.252e-01, -1.189e-01, 2.014e-01, 1.343e-01, 4.088e-02, -9.918e-02, 1.025e+00));
r += mul(s1_6, M4(-2.323e-02, 3.284e-02, -5.099e-03, -3.025e-02, -1.458e-02, -1.640e-02, 1.268e-01, -3.787e-02, 5.078e-02, 4.529e-02, 1.050e-02, -8.079e-03, -1.530e-02, -6.509e-02, -1.620e-01, 6.662e-02));
r += mul(s1_7, M4(3.972e-02, 8.570e-02, -8.723e-02, -3.746e-02, -1.902e-01, 5.121e-02, 1.161e-01, -4.624e-02, -6.268e-02, 1.852e-01, -1.535e-01, -2.023e-01, 2.476e-01, -2.211e-02, -1.590e-01, -3.109e-02));
r += mul(s1_8, M4(-8.025e-03, -4.798e-02, 5.162e-02, 6.616e-02, -2.416e-02, -5.815e-02, -1.334e-02, -1.029e-01, 5.381e-02, 1.539e-01, 4.511e-02, 1.426e-01, -5.511e-02, -9.311e-02, -3.072e-02, 1.572e-01));
// Constant per-channel offset, added after all taps.
r += V4(3.240e-02, -1.989e-01, -2.700e-02, 6.578e-03);
return r;
}
// Pass 3 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t1, splits every tap into its positive and negative part, and stores the
// f0() result in t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 4 (conv3) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass4 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(9.384e-02, 1.183e-01, 5.136e-02, -4.583e-01, -1.060e-01, 6.124e-02, -1.479e-01, -2.457e-01, -5.881e-02, 4.756e-03, -2.540e-02, -5.047e-02, -1.897e-01, 4.062e-02, 1.226e-02, 1.465e-01));
r += mul(s0_1, M4(-1.890e-01, -9.535e-02, 2.627e-01, 3.224e-01, 1.050e-01, -3.922e-02, -3.551e-01, -2.632e-01, -2.349e-01, -5.605e-02, -2.856e-01, 4.331e-01, -2.614e-02, -6.027e-02, -3.236e-02, 2.873e-01));
r += mul(s0_2, M4(-1.702e-01, 7.462e-02, 2.168e-01, 4.212e-01, 8.150e-03, 6.671e-02, -2.781e-01, -1.322e-01, -3.933e-02, 2.698e-02, -3.420e-01, -1.116e-02, -1.788e-02, 8.701e-03, -1.044e-01, 1.264e-01));
r += mul(s0_3, M4(3.573e-01, -4.592e-02, 4.539e-01, 2.854e-01, -6.463e-01, -1.763e-01, 6.236e-01, 7.125e-02, 4.126e-01, -1.621e-02, 1.685e-02, 2.328e-01, -5.456e-01, -2.113e-01, 1.424e-01, 1.414e-01));
r += mul(s0_4, M4(3.838e-01, -1.008e+00, 4.023e-01, 1.302e+00, -1.503e-01, 4.245e-02, 1.496e+00, -3.479e-01, -3.763e-01, -7.877e-01, 4.081e-01, -2.192e-01, -2.853e-01, 2.123e-01, -3.407e-01, 2.423e-01));
r += mul(s0_5, M4(5.073e-03, -2.123e-01, 1.851e-01, 1.482e-01, -2.814e-01, 1.262e-01, 6.890e-01, -2.317e-01, 6.427e-02, -5.801e-02, -3.684e-02, 7.526e-02, 1.309e-02, -2.125e-02, -7.760e-02, 4.795e-02));
r += mul(s0_6, M4(1.409e-01, -1.062e-01, 1.665e-01, 5.277e-01, 6.676e-01, -1.872e-01, 1.251e+00, 1.165e-01, -2.287e-02, -5.235e-02, -2.028e-03, -3.305e-02, -1.968e-01, 1.898e-01, -9.538e-02, -1.418e-01));
r += mul(s0_7, M4(7.353e-02, -3.073e-01, 1.789e-01, 2.137e-01, -6.435e-01, -6.052e-01, 2.259e+00, 2.884e-02, 7.105e-04, 1.247e-01, -7.393e-02, 2.539e-02, 1.194e-01, 1.870e-01, -1.126e-01, 2.444e-02));
r += mul(s0_8, M4(3.853e-02, -2.242e-01, 1.470e-01, 1.701e-02, 4.586e-02, 2.027e-01, 7.448e-01, -4.414e-01, 9.096e-03, 1.277e-01, 4.010e-02, 1.064e-02, 2.401e-02, 1.901e-02, 1.956e-02, 8.744e-02));
r += mul(s1_0, M4(-4.741e-02, 1.819e-03, -8.321e-02, -1.496e-01, -1.801e-02, 4.682e-02, -6.041e-02, -7.243e-02, -1.478e-01, 4.970e-02, 6.424e-02, -5.378e-02, -9.117e-02, 5.496e-02, -2.648e-02, -4.042e-02));
r += mul(s1_1, M4(-8.815e-02, 5.938e-02, -2.433e-01, 1.737e-01, 1.095e-01, -5.108e-02, -5.729e-02, 8.334e-03, -2.763e-01, -6.431e-02, -2.454e-02, 4.055e-01, 2.113e-02, -1.298e-01, -3.908e-02, -1.780e-02));
r += mul(s1_2, M4(-1.905e-02, 3.894e-02, -1.293e-01, 8.303e-03, -7.800e-03, -5.508e-03, 8.606e-02, -7.501e-02, 1.542e-02, 3.046e-02, -2.920e-01, -4.240e-02, -3.932e-02, -1.813e-02, -8.213e-02, 1.017e-01));
r += mul(s1_3, M4(1.965e-01, 3.626e-02, 3.418e-02, 9.779e-02, -6.664e-02, -2.295e-02, -2.736e-02, 1.091e-01, 1.129e-01, -3.896e-02, 1.171e-02, -2.870e-02, -1.382e-01, -1.691e-01, 3.018e-01, -1.186e-01));
r += mul(s1_4, M4(1.075e-01, -6.894e-01, 1.714e-01, 5.097e-01, 9.868e-03, 1.087e-01, 2.107e-01, -6.591e-02, -3.233e-01, -9.792e-01, -1.189e-01, -5.480e-01, -1.157e-01, 5.941e-02, -5.770e-01, -1.030e-01));
r += mul(s1_5, M4(3.289e-02, 3.941e-02, 1.824e-01, 7.260e-04, -9.787e-03, 3.128e-02, -1.333e-01, 1.352e-01, 5.954e-03, -2.520e-01, -8.536e-02, -3.566e-01, 2.998e-02, -5.941e-02, -8.531e-02, -4.232e-02));
r += mul(s1_6, M4(2.592e-02, -7.528e-02, -1.956e-02, 1.002e-01, 2.992e-02, -1.673e-01, 4.413e-02, 1.683e-01, 1.440e-02, -1.047e-02, 1.425e-02, -1.292e-01, -1.777e-01, 1.220e-01, -6.381e-02, 4.174e-02));
r += mul(s1_7, M4(-3.107e-02, -8.612e-02, 1.248e-02, -8.544e-02, -1.161e-01, 7.718e-02, -1.150e-01, -1.699e-01, -1.392e-02, 7.590e-02, -5.195e-02, -3.599e-01, 4.872e-02, 1.381e-01, -1.143e-01, -1.473e-03));
r += mul(s1_8, M4(1.277e-02, 3.020e-02, 4.658e-02, 8.071e-02, 6.867e-02, -2.693e-02, 7.897e-02, -1.264e-02, -1.035e-03, 1.509e-01, 4.169e-02, -1.716e-01, -4.694e-03, 1.627e-02, 7.171e-03, -4.496e-02));
// Constant per-channel offset, added after all taps.
r += V4(4.014e-03, -2.020e-02, 1.560e-02, -2.352e-02);
return r;
}
// Pass 4 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t0, splits every tap into its positive and negative part, and stores the
// f0() result in t1.
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t1[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-samples t1 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 5 (conv4) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass5 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(3.174e-02, -2.020e-01, -6.843e-03, 1.049e-01, 1.680e-01, -6.387e-01, -1.541e-01, -1.952e-01, -4.586e-02, -1.580e-01, -5.507e-02, 1.065e-01, -5.257e-03, -9.464e-02, -9.788e-02, 1.221e-01));
r += mul(s0_1, M4(1.365e-01, -4.220e-02, -4.186e-02, -1.569e-01, -5.527e-01, -1.180e-01, -2.274e-01, -2.007e-01, 2.207e-02, 1.190e-02, 3.746e-02, -1.565e-01, -2.808e-02, 1.657e-02, -5.376e-02, -1.093e-02));
r += mul(s0_2, M4(-7.935e-02, -3.809e-02, -3.727e-02, -4.730e-02, -8.556e-02, 3.451e-04, -8.191e-02, 8.086e-02, 2.051e-02, 7.072e-03, 2.537e-02, 2.793e-02, 9.384e-04, -3.624e-02, -2.171e-02, 7.103e-02));
r += mul(s0_3, M4(-1.261e-02, 2.716e-01, 2.739e-01, -7.349e-02, -2.130e-02, -4.131e-01, -1.851e-01, 1.065e-01, -7.827e-02, 2.868e-01, -1.500e-01, -1.442e-01, -1.842e-02, -2.983e-01, -4.232e-02, 1.395e-01));
r += mul(s0_4, M4(2.733e-01, 4.015e-01, 4.102e-01, -2.027e-01, 4.229e-01, 2.213e-01, 3.628e-01, -1.011e-01, -4.893e-01, 1.333e-01, -4.245e-02, -8.133e-02, -1.086e-02, -1.089e-01, -8.720e-02, 1.513e-01));
r += mul(s0_5, M4(8.521e-02, 1.460e-01, 1.589e-01, -2.075e-01, -5.391e-02, 7.449e-03, -6.763e-02, -2.352e-01, 4.055e-02, -1.812e-02, -1.413e-02, 9.240e-02, -3.070e-02, -4.975e-03, -8.972e-02, -2.225e-02));
r += mul(s0_6, M4(1.880e-01, -1.481e-01, 1.001e-01, 6.339e-02, -6.208e-02, -2.814e-02, -5.944e-03, 1.002e-01, -7.822e-02, 1.010e-01, -2.161e-02, 9.175e-02, 1.495e-02, 1.645e-02, 8.901e-03, -3.865e-02));
r += mul(s0_7, M4(4.449e-01, -1.089e-01, -1.249e-01, -8.911e-01, 3.096e-02, 1.724e-01, 5.605e-02, -7.605e-02, -9.644e-02, -1.191e-01, -1.332e-01, 2.544e-02, 5.659e-02, -2.706e-04, -9.886e-02, 9.218e-02));
r += mul(s0_8, M4(7.394e-02, -2.112e-01, 1.505e-02, -1.236e-01, -1.848e-02, -2.716e-02, -6.663e-02, 2.764e-02, -1.120e-02, 3.440e-03, -1.443e-02, 1.745e-02, -3.847e-02, -4.228e-03, -8.888e-02, 2.134e-02));
r += mul(s1_0, M4(6.588e-03, -6.764e-02, -2.660e-02, -3.967e-02, 6.459e-02, -6.345e-01, -5.784e-01, 9.294e-02, 2.426e-02, -9.858e-02, -9.036e-02, -9.545e-02, 2.094e-02, -1.001e-01, -1.145e-01, -6.470e-02));
r += mul(s1_1, M4(-2.633e-03, 5.849e-02, 3.154e-02, -7.386e-02, -6.412e-01, -4.405e-01, -5.885e-01, 1.657e-01, -1.757e-01, -1.882e-02, -1.023e-01, -1.713e-01, -1.047e-01, -1.558e-01, -1.509e-01, -2.815e-01));
r += mul(s1_2, M4(1.880e-01, -3.790e-02, 1.112e-01, 1.672e-02, -1.713e-01, 2.611e-02, -9.008e-02, 9.359e-02, -6.567e-02, 9.399e-02, 3.743e-02, 3.662e-02, 3.190e-02, -1.466e-01, -1.154e-01, 1.692e-02));
r += mul(s1_3, M4(-1.733e-02, 1.381e-01, 8.342e-02, -5.893e-02, -1.467e-02, -4.365e-01, -3.057e-01, 1.506e-01, 7.300e-02, 6.777e-01, -5.484e-03, -3.499e-01, 1.978e-01, -6.846e-01, -2.921e-01, -1.173e-01));
r += mul(s1_4, M4(-1.829e-01, -4.506e-01, -5.685e-02, 8.260e-01, 3.056e-01, 1.803e-01, 1.908e-01, -2.029e-01, -1.578e-01, 5.039e-01, 3.016e-01, -4.971e-01, -4.977e-01, 4.537e-01, -4.268e-01, 7.878e-01));
r += mul(s1_5, M4(-3.251e-01, -1.229e-01, -1.447e-01, 3.290e-01, -2.134e-01, -6.542e-03, -7.109e-02, -1.004e-01, 3.887e-02, -1.008e-01, -7.490e-02, 6.126e-02, 2.757e-01, -1.980e-01, -1.792e-01, 2.722e-01));
r += mul(s1_6, M4(4.765e-02, -5.401e-02, 4.164e-02, 1.847e-03, -3.178e-02, -4.201e-02, -2.504e-02, 1.350e-02, -1.436e-01, 1.654e-01, -1.099e-02, -3.733e-02, 1.118e-01, -2.529e-01, -1.353e-01, -9.309e-02));
r += mul(s1_7, M4(1.684e-01, -1.978e-01, 2.645e-02, -9.582e-02, 2.618e-02, 9.350e-02, -2.281e-02, -1.901e-01, 1.176e-02, -1.571e-01, 1.491e-02, -2.105e-01, -1.685e-01, -2.459e-01, -2.166e-01, 1.082e-01));
r += mul(s1_8, M4(2.225e-02, 7.813e-02, -4.112e-02, 6.166e-02, -4.143e-02, -2.160e-02, -7.478e-02, -2.251e-02, -1.306e-02, -6.002e-02, -7.496e-02, -2.538e-03, 7.824e-02, 9.597e-02, -3.546e-03, -1.794e-01));
// Constant per-channel offset, added after all taps.
r += V4(-5.942e-03, -2.718e-02, -1.234e-02, 3.307e-02);
return r;
}
// Pass 5 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t1, splits every tap into its positive and negative part, and stores the
// f0() result in t0.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): point-samples t0 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 6 (conv5) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass6 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-1.069e-01, 1.009e-01, -5.972e-02, -1.732e-02, -9.217e-02, 9.177e-03, -3.127e-02, -5.872e-02, -1.364e-02, -9.990e-04, 1.518e-01, 5.861e-02, -9.835e-02, -1.155e-01, 6.714e-02, -5.142e-02));
r += mul(s0_1, M4(1.404e-02, 1.372e-01, -2.759e-01, -4.361e-02, -1.407e-01, 1.570e-01, -1.216e-01, -7.289e-02, 3.088e-01, -1.285e-01, 1.107e-01, 1.651e-01, 1.596e-01, -1.569e-01, 1.437e-02, -1.455e-01));
r += mul(s0_2, M4(-4.001e-02, 1.772e-01, -2.761e-01, 4.916e-02, -1.489e-01, 1.680e-01, -5.244e-02, 1.334e-01, 1.245e-01, -2.321e-01, 5.371e-01, -2.549e-01, -9.624e-02, -1.072e-01, 2.322e-01, -2.261e-01));
r += mul(s0_3, M4(-2.291e-01, 7.774e-04, -1.015e-02, 6.036e-02, -1.133e-01, 7.554e-02, 1.081e-01, 1.704e-01, 2.123e-01, -2.065e-01, 4.928e-02, 2.352e-03, -2.488e-01, -1.765e-01, 2.044e-01, 1.302e-02));
r += mul(s0_4, M4(3.195e-01, -5.410e-01, -4.771e-01, -1.713e-01, 2.778e-01, -1.028e-01, 8.603e-02, 2.162e-01, 1.466e-02, 2.633e-02, -3.299e-01, -5.183e-02, -3.598e-01, -4.015e-01, 5.674e-02, -1.429e-01));
r += mul(s0_5, M4(-1.480e-01, 2.440e-01, -2.189e-01, 1.407e-01, -3.439e-01, 2.624e-01, 4.947e-01, 7.813e-01, 1.067e-01, -6.781e-02, -5.271e-02, -1.331e-02, -2.133e-01, -1.038e-01, 4.267e-01, -4.026e-01));
r += mul(s0_6, M4(-1.086e-01, 2.607e-01, -1.897e-01, -1.710e-01, 6.096e-02, -1.121e-01, 8.797e-02, -8.204e-02, 4.825e-02, -9.364e-02, 8.472e-02, -1.923e-02, -1.755e-01, 1.086e-01, -3.987e-02, 1.737e-02));
r += mul(s0_7, M4(5.606e-02, 4.516e-02, -7.352e-02, 7.654e-02, -6.706e-02, 2.674e-01, -2.388e-01, -1.997e-01, 9.871e-02, -9.055e-02, 1.274e-01, 1.854e-01, -1.765e-01, -1.779e-01, 1.114e-01, -1.882e-01));
r += mul(s0_8, M4(-4.811e-02, 2.057e-01, -2.913e-01, 1.265e-01, 1.304e-01, 1.462e-01, -4.432e-03, 4.191e-01, 6.606e-02, -1.382e-01, 1.052e-01, -3.990e-01, 9.737e-02, -9.675e-02, 6.216e-02, -2.130e-01));
r += mul(s1_0, M4(-1.183e-01, -5.696e-02, 9.372e-02, 3.074e-03, -2.694e-02, -2.272e-02, -3.489e-02, -2.667e-02, 1.635e-01, -5.761e-04, -1.677e-03, -1.076e-01, -5.411e-02, -1.100e-02, 1.742e-02, 6.403e-02));
r += mul(s1_1, M4(-4.462e-03, 3.912e-02, -1.208e-01, -9.360e-02, -1.260e-01, 1.602e-02, -1.047e-01, -1.252e-01, 2.940e-01, 1.068e-01, -2.602e-01, 1.692e-01, 1.120e-01, -2.613e-02, -1.083e-02, 1.754e-02));
r += mul(s1_2, M4(2.307e-02, 1.240e-01, -2.024e-01, 1.761e-01, -2.326e-01, 3.209e-02, 5.352e-02, 3.399e-02, 1.754e-01, -3.059e-01, 4.554e-01, -2.412e-01, 4.242e-03, 3.919e-02, 7.769e-02, -1.155e-01));
r += mul(s1_3, M4(-1.946e-01, -9.445e-02, 1.698e-01, 1.165e-01, -1.571e-01, 1.700e-02, 5.682e-02, 4.628e-02, 4.425e-01, -1.872e-01, 3.713e-02, 8.537e-02, 4.211e-02, -6.178e-02, 1.398e-02, 5.929e-02));
r += mul(s1_4, M4(5.957e-01, -6.855e-01, -3.668e-01, -2.565e-01, -4.383e-02, -8.094e-02, -2.101e-02, -2.446e-01, -7.781e-02, 5.879e-01, -5.272e-01, -1.786e-01, -2.396e-01, -4.148e-01, 5.226e-02, 9.011e-02));
r += mul(s1_5, M4(-4.655e-02, 1.107e-01, -1.109e-01, 3.601e-01, -2.103e-01, 3.712e-01, 1.666e-01, 3.972e-01, -2.227e-02, -2.115e-02, -7.054e-02, -1.216e-01, 4.739e-03, 1.201e-01, 1.335e-01, -1.775e-01));
r += mul(s1_6, M4(-7.542e-02, 9.157e-02, 1.143e-02, -7.961e-02, -3.812e-02, 1.722e-02, 1.396e-02, -3.920e-02, -6.220e-03, -6.723e-02, 9.364e-02, -4.804e-02, -8.885e-02, 1.313e-01, -7.872e-02, 2.733e-02));
r += mul(s1_7, M4(-3.879e-01, -2.705e-01, 3.305e-01, -1.542e-01, -1.179e-01, 9.695e-02, -1.353e-01, -2.320e-01, 1.433e-02, -2.689e-01, 2.066e-01, 3.704e-01, -5.587e-02, -6.296e-02, 6.326e-02, 1.881e-02));
r += mul(s1_8, M4(-4.722e-02, -5.909e-02, 4.089e-02, -8.851e-02, 2.017e-01, -2.652e-02, 9.432e-02, 3.252e-01, -2.219e-01, 2.142e-02, -4.496e-02, 5.456e-02, 2.364e-02, 1.081e-01, -9.898e-02, 9.928e-02));
// Constant per-channel offset, added after all taps.
r += V4(1.102e-03, 4.481e-03, 3.096e-03, -9.818e-03);
return r;
}
// Pass 6 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t0, splits every tap into its positive and negative part, and stores the
// f0() result in t1.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t1[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): point-samples t1 at texel offset (x, y) from `pos`, as a V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 7 (conv6) kernel: combines 18 V4 inputs into one V4 output.
// s0_0..s0_8 and s1_0..s1_8 are two sets of nine taps; Pass7 passes the positive
// parts as s0_* and the negative parts as s1_*. Each input is multiplied by its
// own constant 4x4 matrix via mul(vector, matrix) and the results are summed in
// the order written below (floating-point sums depend on that order).
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-3.488e-02, 3.507e-02, 3.848e-02, -5.906e-02, 9.669e-02, 3.121e-02, -2.182e-02, 1.691e-01, -1.132e-01, -7.602e-02, -5.000e-02, -6.017e-03, 3.962e-02, 1.086e-01, -3.343e-04, 9.002e-02));
r += mul(s0_1, M4(9.453e-02, -1.793e-01, -6.074e-02, 5.317e-03, 1.056e-01, 3.460e-01, 5.291e-02, 7.825e-02, 5.510e-02, 4.818e-02, -1.119e-02, 3.913e-02, -8.177e-02, -1.060e-01, -9.989e-03, -9.245e-02));
r += mul(s0_2, M4(-8.190e-02, 1.375e-01, -4.322e-02, -6.721e-02, 1.645e-02, -1.392e-01, 7.103e-02, -1.950e-02, 4.302e-03, -3.213e-02, -7.517e-03, -3.406e-03, -2.132e-02, 1.333e-01, -6.553e-02, 7.300e-02));
r += mul(s0_3, M4(-1.102e-01, 3.005e-01, -8.521e-02, 3.002e-01, 1.866e-01, 1.089e-01, -2.968e-02, 1.271e-01, -3.566e-01, 1.224e-01, -7.462e-02, -2.765e-01, 5.175e-02, 1.567e-01, 1.450e-01, -1.948e-01));
r += mul(s0_4, M4(1.558e-01, 3.780e-02, 9.697e-02, -2.485e-01, -3.560e-01, -3.667e-01, 1.396e-01, 1.020e+00, -2.319e-01, -2.878e-01, -2.849e-01, 5.648e-01, 2.094e-01, -5.684e-01, 1.482e-01, -6.172e-01));
r += mul(s0_5, M4(-1.276e-01, -1.685e-01, 4.271e-01, -1.489e-01, 2.154e-01, 2.661e-01, -1.093e-01, -7.859e-02, 6.618e-02, 9.795e-02, 2.778e-02, -1.286e-01, -1.527e-01, -3.586e-01, 2.523e-01, 9.196e-02));
r += mul(s0_6, M4(-1.354e-01, -6.680e-02, 5.541e-02, -5.314e-02, 1.639e-02, -1.639e-01, -1.856e-01, -1.863e-01, -1.519e-01, -5.459e-02, 1.027e-01, 6.492e-02, 3.482e-02, -9.074e-03, 1.861e-01, 1.393e-01));
r += mul(s0_7, M4(1.907e-02, 1.189e-02, -5.038e-01, -8.478e-02, 3.643e-01, 1.086e-02, 3.067e-01, 1.071e-01, -6.552e-01, 1.505e-01, -7.394e-01, 1.155e-01, -1.815e-01, -1.739e-02, -2.723e-01, -1.607e-01));
r += mul(s0_8, M4(-8.319e-02, -2.563e-02, -1.127e-01, -7.792e-02, 1.295e-01, 1.091e-01, 2.920e-02, -5.761e-02, -9.443e-02, 7.429e-03, -2.117e-01, -3.670e-02, -7.118e-02, -4.469e-02, -6.460e-02, -1.261e-02));
r += mul(s1_0, M4(2.400e-02, -2.740e-02, -3.394e-02, 5.817e-02, -6.716e-02, -5.672e-02, -7.339e-02, -3.921e-02, -9.506e-02, -3.805e-02, -3.235e-02, -8.145e-02, 1.265e-02, 7.308e-02, -5.707e-02, 1.141e-01));
r += mul(s1_1, M4(-1.565e-01, 1.052e-01, -8.934e-02, -6.945e-02, 3.804e-02, 2.091e-01, -1.102e-01, 2.394e-01, 6.041e-02, -9.942e-02, -6.054e-03, 4.857e-02, -7.265e-02, 1.596e-02, 9.135e-02, -8.397e-02));
r += mul(s1_2, M4(-9.449e-02, 1.121e-01, -1.101e-01, -2.980e-02, 5.100e-02, -6.337e-02, 1.692e-01, -5.062e-02, -3.931e-02, 1.083e-01, 3.952e-03, 9.801e-04, -6.425e-02, 8.015e-02, -1.628e-01, 8.317e-02));
r += mul(s1_3, M4(7.400e-02, 8.412e-02, 2.984e-02, 8.693e-02, -1.474e-01, -3.529e-02, -6.134e-02, -1.107e-01, -3.264e-01, 8.009e-02, -2.261e-01, -1.472e-01, -4.683e-02, -1.258e-01, 1.061e-01, -1.125e-01));
r += mul(s1_4, M4(4.970e-01, -1.211e-01, 2.379e-01, 2.124e-01, -1.003e-01, -5.656e-01, 5.001e-02, 4.959e-01, 1.538e-01, -7.985e-01, -2.085e-01, 2.220e-01, 7.247e-02, 6.581e-02, -9.437e-02, -3.066e-01));
r += mul(s1_5, M4(-8.611e-02, -9.199e-02, 2.518e-01, -7.482e-02, -1.208e-01, 1.015e-01, 3.428e-02, -1.354e-01, 1.038e-01, -4.497e-02, 2.744e-01, -4.281e-02, 4.090e-02, -2.726e-01, 1.839e-01, 1.138e-01));
r += mul(s1_6, M4(-8.703e-02, -4.776e-02, -1.477e-01, -1.870e-02, -1.072e-01, 3.204e-02, -8.396e-03, 1.175e-01, 1.685e-01, -1.427e-01, 2.152e-01, -2.155e-01, 1.898e-03, 5.924e-02, -1.089e-02, 5.197e-02));
r += mul(s1_7, M4(-9.679e-02, 1.961e-02, 1.636e-01, -6.049e-02, -7.071e-03, 1.519e-01, -6.303e-01, 4.739e-02, -3.331e-01, 8.291e-02, -5.944e-01, -7.677e-02, -1.164e-01, -4.580e-02, 1.419e-01, 6.839e-02));
r += mul(s1_8, M4(6.174e-02, -4.004e-02, 1.256e-02, -4.981e-02, 4.659e-03, 8.371e-02, -1.664e-01, -2.897e-02, -1.253e-01, 2.381e-02, -1.147e-01, -8.724e-02, -3.736e-02, 1.140e-02, -1.550e-01, 6.350e-03));
// Constant per-channel offset, added after all taps.
r += V4(-6.918e-03, -1.945e-03, -7.751e-03, 1.645e-02);
return r;
}
// Pass 7 entry point: one thread per input texel. Reads the 3x3 neighbourhood
// of t1, splits every tap into its positive and negative part, and stores the
// f0() result in t0.
void Pass7(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/l0() macros expand to
	// expressions that reference them.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= bounds)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps, row by row, offsets (-1, -1) .. (1, 1).
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// Negative part of each tap (zero where the tap is positive).
	const V4 lo0 = -max(-tap0, 0.0);
	const V4 lo1 = -max(-tap1, 0.0);
	const V4 lo2 = -max(-tap2, 0.0);
	const V4 lo3 = -max(-tap3, 0.0);
	const V4 lo4 = -max(-tap4, 0.0);
	const V4 lo5 = -max(-tap5, 0.0);
	const V4 lo6 = -max(-tap6, 0.0);
	const V4 lo7 = -max(-tap7, 0.0);
	const V4 lo8 = -max(-tap8, 0.0);

	// Positive part of each tap (zero where the tap is negative).
	const V4 hi0 = max(tap0, 0.0);
	const V4 hi1 = max(tap1, 0.0);
	const V4 hi2 = max(tap2, 0.0);
	const V4 hi3 = max(tap3, 0.0);
	const V4 hi4 = max(tap4, 0.0);
	const V4 hi5 = max(tap5, 0.0);
	const V4 hi6 = max(tap6, 0.0);
	const V4 hi7 = max(tap7, 0.0);
	const V4 hi8 = max(tap8, 0.0);

	t0[texel] = f0(hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, hi8, lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7, lo8);
}
//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): reads t0 through the O() macro at offset (x, y) and converts the
// result to V4. NOTE(review): O() is defined outside this pass; presumably it
// samples relative to the pass-local `pos` and `pt`.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv7 weighted sum. Multiplies each of the 18 V4 inputs by its own 4x4
// matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns the total.
// Pass8 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(-4.116e-02, -3.385e-02, -4.697e-02, 4.650e-02, -3.488e-02, 1.006e-01, -4.538e-03, 4.637e-02, 1.288e-01, 7.769e-03, 1.150e-01, -7.930e-03, 1.045e-02, 4.849e-02, 2.767e-02, 4.909e-02));
r += mul(s0_1, M4(-5.332e-01, -5.254e-01, -3.541e-01, -3.525e-01, 1.117e-02, 2.929e-02, 6.817e-02, 9.115e-02, 1.055e+00, 6.141e-02, 3.976e-01, 4.649e-02, 2.561e-01, -1.191e-01, 1.230e-03, -1.047e-01));
r += mul(s0_2, M4(-1.450e-01, -2.829e-01, -6.857e-01, -4.294e-01, 3.217e-02, 2.745e-02, 5.242e-02, 3.556e-02, 1.284e-01, 4.292e-01, 7.161e-01, 2.220e-01, -1.508e-02, 1.802e-01, 1.842e-01, 9.827e-02));
r += mul(s0_3, M4(1.381e-01, 5.690e-02, 5.107e-02, 6.625e-02, -1.173e-01, -7.448e-02, -1.152e-01, -1.808e-01, -1.470e-01, -1.833e-01, -1.653e-01, -1.217e-01, 9.096e-02, 5.579e-02, 1.128e-02, 9.791e-02));
r += mul(s0_4, M4(5.936e-01, -2.579e-01, 5.761e-01, -7.051e-01, -7.023e-01, 2.824e-01, 2.057e-01, 3.628e-01, 1.006e-02, 3.209e-01, 6.969e-02, -3.464e-01, 4.768e-01, -3.194e-01, -4.817e-02, 3.050e-02));
r += mul(s0_5, M4(2.145e-01, -1.899e-01, 1.446e-01, 2.497e-02, -8.750e-02, -3.154e-01, -5.060e-01, -7.413e-02, -8.542e-02, -4.198e-02, -1.528e-01, -1.812e-01, -2.597e-01, 8.374e-02, -5.592e-01, -2.557e-01));
r += mul(s0_6, M4(5.713e-02, -4.294e-03, 2.388e-02, -7.124e-02, -2.163e-02, -3.642e-03, 3.839e-02, -6.934e-02, -9.052e-02, -1.153e-02, 1.213e-02, 7.120e-02, -3.698e-02, 4.260e-02, -7.245e-02, 7.898e-02));
r += mul(s0_7, M4(2.780e-02, 1.944e-02, 1.415e-01, 1.216e-01, 9.163e-02, -3.069e-02, -1.829e-02, -2.182e-01, 5.815e-02, -1.923e-02, -5.934e-02, -3.487e-02, -1.082e-01, 1.362e-01, 8.120e-02, 2.621e-01));
r += mul(s0_8, M4(9.334e-03, -1.300e-02, 4.936e-02, 1.751e-01, -1.214e-01, 1.629e-02, -1.131e-01, 7.402e-02, 1.134e-02, 1.663e-03, -5.887e-03, -8.862e-02, 1.029e-01, -5.629e-02, 9.127e-02, -6.668e-02));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(1.165e-02, 4.389e-02, 6.299e-03, 7.939e-02, -2.769e-02, 9.353e-02, 6.239e-02, 1.341e-02, 4.713e-02, -2.731e-03, 5.256e-02, -3.515e-02, -8.911e-02, -1.425e-01, -7.889e-02, -1.627e-01));
r += mul(s1_1, M4(-2.869e-01, 2.838e-02, -2.541e-02, 5.216e-02, 2.660e-01, -2.095e-01, 1.375e-01, -2.562e-02, 2.715e-01, 1.694e-01, 9.471e-02, -7.292e-03, 3.257e-01, -2.247e-01, 7.698e-03, -2.076e-01));
r += mul(s1_2, M4(1.832e-02, -1.860e-01, -4.951e-02, -1.392e-03, 9.307e-02, 7.671e-02, 1.043e-01, -3.675e-02, 1.433e-03, 1.219e-01, 1.978e-01, 5.960e-02, 9.624e-02, 1.448e-01, 3.561e-01, 3.054e-02));
r += mul(s1_3, M4(-4.647e-02, 4.225e-03, 3.830e-02, -3.233e-02, -1.532e-01, -6.289e-01, -3.037e-01, -4.131e-01, -1.794e-01, -4.090e-02, -9.644e-02, -4.828e-02, 7.978e-02, 6.792e-03, 5.043e-02, 4.905e-02));
r += mul(s1_4, M4(2.967e-01, -5.750e-02, 1.168e-01, -2.681e-02, 1.232e-01, -2.481e-03, 8.164e-01, 2.468e-01, -3.721e-01, -5.041e-02, -4.796e-01, -2.778e-02, 3.623e-01, -8.387e-01, -5.229e-01, -4.492e-01));
r += mul(s1_5, M4(-1.507e-02, 1.343e-01, 8.567e-02, 8.923e-02, -2.766e-03, -1.548e-01, -2.588e-01, -1.295e-01, 1.777e-02, -9.243e-02, -4.495e-02, -5.528e-02, -1.071e-01, -1.284e-01, -4.142e-01, -1.800e-01));
r += mul(s1_6, M4(-4.695e-03, -9.431e-04, -1.256e-02, -4.959e-03, 1.607e-01, -8.763e-02, 2.039e-01, -1.243e-01, 3.725e-02, -5.612e-02, -3.615e-03, -2.475e-02, 4.955e-02, 4.065e-02, -1.879e-02, -1.195e-01));
r += mul(s1_7, M4(-1.039e-02, 5.631e-02, 2.655e-02, 7.419e-02, 1.286e-01, -6.430e-02, 4.800e-02, -4.480e-02, 5.067e-03, -4.197e-02, -3.342e-02, -7.461e-02, -3.225e-02, 6.062e-03, 5.391e-02, -7.135e-02));
r += mul(s1_8, M4(5.195e-02, 3.799e-02, 1.130e-01, -8.811e-03, -4.285e-02, 1.609e-02, -8.972e-03, 3.530e-02, -6.932e-02, -3.013e-03, -5.208e-02, 5.823e-02, -4.561e-02, -1.068e-01, -1.458e-01, -5.739e-02));
// Constant per-channel offset.
r += V4(1.569e-02, 1.505e-02, 2.765e-02, 1.258e-02);
return r;
}
// Pass 8 entry point (conv7). Each thread handles one texel: it gathers the
// 3x3 neighbourhood of t0 through l0(), splits every tap into a non-negative
// half max(v, 0) and a non-positive half -max(-v, 0), and stores f0() of the
// 18 resulting vectors into t1.
void Pass8(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are not used directly below; presumably the
	// l0()/O() macro expansion reads them, so their names must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: non-negative halves; last nine: non-positive halves.
	t1[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): reads t1 through the O() macro at offset (x, y) and converts the
// result to V4. NOTE(review): O() is defined outside this pass; presumably it
// samples relative to the pass-local `pos` and `pt`.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv8 weighted sum. Multiplies each of the 18 V4 inputs by its own 4x4
// matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns the total.
// Pass9 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(4.724e-03, 6.987e-03, -3.797e-03, 2.147e-02, -5.616e-03, 1.123e-02, -2.768e-02, 8.185e-03, -4.051e-03, 5.608e-05, -9.522e-02, 2.924e-02, -5.976e-03, 8.331e-03, 7.513e-02, -2.513e-02));
r += mul(s0_1, M4(-2.224e-02, 1.093e-04, 5.901e-02, 2.350e-02, 1.167e-01, -7.837e-02, 1.939e-01, 1.987e-01, 5.530e-02, -4.759e-05, 1.221e-01, 4.764e-02, -8.813e-02, 7.695e-02, -4.577e-01, 1.671e-02));
r += mul(s0_2, M4(-5.696e-02, 6.005e-03, -5.620e-02, -8.978e-02, 4.014e-02, -3.822e-02, 1.081e-01, -6.532e-03, 9.444e-03, 7.498e-03, -3.228e-02, 4.908e-02, -2.043e-02, 2.374e-02, 2.163e-02, -4.505e-02));
r += mul(s0_3, M4(-8.098e-02, 1.943e-02, -5.744e-02, 3.824e-02, -2.071e-01, 1.036e-01, -6.926e-02, -2.348e-01, 2.378e-01, -1.069e-01, -5.307e-02, 1.161e-01, 1.881e-01, -5.785e-02, -6.570e-02, 2.227e-01));
r += mul(s0_4, M4(7.577e-02, -4.125e-02, 1.714e-01, -6.934e-01, -2.448e-01, 1.146e-01, 2.354e-01, -4.935e-01, -2.321e-01, -8.273e-02, 5.890e-02, 5.704e-01, 4.833e-02, 2.875e-02, 1.163e-01, -1.802e-01));
r += mul(s0_5, M4(2.287e-01, -3.461e-02, -2.542e-02, 2.882e-02, 7.142e-02, -1.556e-01, 4.055e-02, 1.534e-02, -1.647e-01, 3.087e-03, -6.811e-02, -3.896e-02, 1.334e-01, 1.188e-01, -1.847e-01, 4.293e-02));
r += mul(s0_6, M4(-3.094e-02, 2.712e-03, 3.387e-03, 1.877e-02, 9.494e-02, -2.863e-02, -4.239e-02, -3.402e-02, 5.541e-03, -1.178e-02, 1.795e-02, -3.515e-02, -3.044e-02, -2.463e-02, -1.320e-02, 8.952e-02));
r += mul(s0_7, M4(1.035e-01, -3.181e-02, 1.902e-02, 3.973e-03, 2.267e-01, -2.620e-01, 1.821e-01, 1.631e-01, 1.494e-02, 6.125e-02, -6.176e-02, -2.497e-02, -1.364e-02, 7.542e-02, -8.480e-02, -4.648e-02));
r += mul(s0_8, M4(-1.466e-01, 3.028e-02, 2.798e-02, -7.887e-02, -4.370e-02, 1.408e-02, -6.161e-02, -3.034e-02, 6.567e-02, 2.071e-02, 3.126e-02, 6.993e-02, -5.556e-02, 1.507e-02, 2.991e-02, -4.924e-02));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(1.637e-02, -2.767e-02, 8.568e-02, -4.254e-02, 3.215e-02, 1.987e-04, -3.697e-02, 3.787e-02, 2.236e-02, -6.576e-02, 7.400e-02, 1.093e-01, 3.271e-03, 1.809e-03, 1.011e-02, 1.509e-01));
r += mul(s1_1, M4(5.538e-02, -5.865e-02, 4.351e-01, 2.494e-01, 1.101e-01, -1.484e-02, 5.176e-01, 3.999e-02, -4.782e-03, 1.155e-01, -2.099e-01, 5.012e-03, -1.919e-01, 2.292e-01, -5.378e-01, -1.223e-01));
r += mul(s1_2, M4(-5.691e-02, 7.653e-02, -2.572e-01, -1.332e-01, -5.652e-02, -5.008e-02, 7.840e-02, -3.729e-02, 6.942e-02, 6.483e-04, -2.243e-05, 8.430e-02, -6.848e-02, -2.096e-02, -3.908e-02, -9.062e-02));
r += mul(s1_3, M4(2.725e-01, -1.841e-01, -6.710e-03, 3.965e-01, -1.298e-01, -4.014e-03, 2.007e-01, -3.700e-01, 5.329e-01, -4.014e-01, 2.619e-02, 1.606e-01, 2.179e-01, -1.403e-01, 4.227e-02, 8.568e-02));
r += mul(s1_4, M4(4.188e-01, -7.320e-01, -5.609e-01, -6.087e-01, -7.521e-01, 7.363e-01, -6.253e-01, -2.011e-01, -1.017e+00, 3.331e-02, -2.135e-02, 2.084e-01, 6.074e-01, -9.824e-01, 5.154e-01, 1.748e-01));
r += mul(s1_5, M4(1.733e-01, 5.176e-01, -7.335e-02, 1.899e-02, 1.028e-01, -6.330e-02, -1.632e-01, 9.241e-05, -1.357e-01, -1.131e-01, 9.644e-02, -1.424e-02, -1.835e-02, 7.296e-01, -3.204e-01, -2.966e-02));
r += mul(s1_6, M4(4.798e-02, -1.047e-01, 3.646e-02, 6.703e-02, 5.371e-02, 4.759e-02, -2.975e-02, -6.945e-02, 7.985e-02, -1.101e-01, 3.034e-02, -1.472e-02, -3.827e-02, 9.839e-03, -4.922e-03, 4.307e-03));
r += mul(s1_7, M4(-8.950e-02, -1.253e-02, -1.730e-05, 3.862e-02, 2.692e-01, -4.645e-01, 2.399e-01, 2.744e-01, -4.503e-02, 1.724e-01, -7.935e-02, -5.200e-02, -2.132e-03, -1.926e-02, 2.926e-02, -2.288e-02));
r += mul(s1_8, M4(-1.072e-01, -1.145e-02, 6.605e-03, -1.090e-01, -7.524e-03, 8.598e-02, -7.698e-02, -6.976e-02, 5.869e-02, -5.499e-02, 3.529e-02, 7.813e-02, -1.794e-01, 4.212e-02, -4.479e-03, -7.253e-02));
// Constant per-channel offset.
r += V4(8.112e-05, 3.290e-03, -6.342e-04, 1.340e-02);
return r;
}
// Pass 9 entry point (conv8). Each thread handles one texel: it gathers the
// 3x3 neighbourhood of t1 through l0(), splits every tap into a non-negative
// half max(v, 0) and a non-positive half -max(-v, 0), and stores f0() of the
// 18 resulting vectors into t0.
void Pass9(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are not used directly below; presumably the
	// l0()/O() macro expansion reads them, so their names must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: non-negative halves; last nine: non-positive halves.
	t0[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// l0(x, y): reads t0 through the O() macro at offset (x, y) and converts the
// result to V4. NOTE(review): O() is defined outside this pass; presumably it
// samples relative to the pass-local `pos` and `pt`.
#define l0(x, y) V4(O(t0, float2(x, y)))
// out-shuffle weighted sum. Multiplies each of the 18 V4 inputs by its own
// 4x4 matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns tanh() of the total.
// Pass10 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps, and adds one component of the result to each of four
// output pixels.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(-1.348e-01, -9.107e-02, -4.849e-02, 4.484e-04, -3.384e-02, -6.768e-02, -9.628e-03, -1.766e-02, -9.939e-03, -2.182e-02, -1.288e-02, 8.518e-03, 2.218e-02, -1.184e-03, 1.240e-03, 1.065e-02));
r += mul(s0_1, M4(7.301e-02, 1.014e-01, -1.363e-02, -4.850e-02, -2.842e-01, -3.060e-02, -4.154e-02, 4.057e-03, -3.458e-02, -6.335e-02, -2.660e-02, -1.335e-02, -7.944e-03, 8.560e-03, 4.588e-02, 7.580e-03));
r += mul(s0_2, M4(-9.349e-05, -2.142e-02, -1.258e-04, -9.330e-03, -5.058e-03, 3.912e-02, -2.976e-02, 2.410e-02, -8.512e-03, 4.954e-02, -2.093e-02, -2.582e-03, 1.648e-02, 7.942e-03, 1.520e-02, 3.414e-02));
r += mul(s0_3, M4(-1.021e-01, -1.140e-01, 2.412e-01, 1.289e-02, -1.192e-01, -1.140e-01, 2.395e-01, 6.930e-03, -2.027e-01, -4.824e-02, 1.243e-01, 3.820e-03, 9.280e-03, 2.866e-02, 2.106e-02, 3.644e-03));
r += mul(s0_4, M4(8.727e-02, 8.162e-02, 1.478e-01, 5.348e-01, -3.115e-01, -2.605e-02, 1.510e-01, 7.249e-01, -8.110e-02, -6.698e-01, 1.080e-01, -8.090e-02, -3.492e-01, -1.891e-01, -1.877e-01, -1.319e-01));
r += mul(s0_5, M4(-5.622e-03, 2.237e-02, -4.008e-03, -1.980e-02, -1.837e-02, -3.311e-02, 4.289e-02, 3.256e-02, -2.178e-02, 2.653e-02, -1.722e-03, 8.373e-02, -8.042e-02, -2.962e-01, -4.643e-03, -6.865e-02));
r += mul(s0_6, M4(6.248e-03, -2.320e-02, 1.883e-03, -1.430e-02, 1.224e-02, 5.634e-03, -1.964e-02, -1.627e-02, 2.010e-02, 1.174e-02, -3.919e-02, 9.559e-04, 3.016e-02, -2.836e-03, 7.667e-02, 3.552e-02));
r += mul(s0_7, M4(-6.141e-03, 1.380e-02, 1.024e-02, -1.210e-02, 4.548e-02, 3.626e-02, -9.142e-02, -7.666e-02, -3.241e-02, -2.296e-02, -3.244e-02, -2.870e-01, 4.427e-02, 8.899e-02, -1.327e-01, 4.920e-02));
r += mul(s0_8, M4(-2.801e-03, -9.930e-04, -2.770e-03, 1.623e-02, 2.158e-03, -1.258e-02, -3.089e-02, 3.211e-02, -9.620e-03, 1.776e-02, -4.337e-03, 4.676e-02, 1.130e-02, -7.436e-03, -3.572e-02, -1.742e-01));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(-4.306e-02, -6.039e-02, -1.642e-02, -1.966e-02, -5.996e-02, -1.743e-01, -3.128e-02, 1.714e-02, -4.357e-03, -7.720e-03, -4.532e-03, 4.571e-03, 3.988e-02, 2.067e-02, 1.548e-02, -2.964e-04));
r += mul(s1_1, M4(-6.070e-02, -9.324e-02, 7.472e-03, 2.173e-02, -7.996e-02, -5.139e-02, -5.545e-02, -1.891e-02, -1.767e-02, -1.527e-02, -2.906e-02, 1.310e-02, 2.594e-02, 7.495e-02, -7.681e-03, -4.678e-03));
r += mul(s1_2, M4(4.883e-02, -3.167e-02, 2.862e-02, 3.357e-02, -8.454e-03, 9.265e-03, -1.657e-02, -8.086e-03, -1.170e-02, -3.549e-02, 7.437e-03, 1.425e-02, 1.441e-02, -1.961e-02, 1.560e-02, -1.122e-02));
r += mul(s1_3, M4(-6.156e-02, -6.763e-02, 1.987e-01, 2.459e-02, -5.710e-02, -2.009e-01, 4.581e-01, -1.181e-02, -9.054e-02, -5.658e-02, 3.432e-02, 2.004e-02, 6.965e-03, -1.655e-02, 5.178e-03, -1.236e-02));
r += mul(s1_4, M4(-1.301e-01, -5.093e-02, 7.676e-01, 6.003e-01, -9.216e-02, -2.228e-03, 7.034e-02, 1.851e-01, -5.318e-01, -1.852e-01, -2.980e-02, 1.919e-02, -8.147e-01, -1.773e-01, -2.675e-01, 2.314e-02));
r += mul(s1_5, M4(-9.962e-03, -1.888e-01, -1.877e-02, 1.190e-01, -2.283e-02, -1.241e-02, 2.969e-04, 3.894e-02, -5.077e-02, 2.300e-01, -6.192e-02, 1.793e-01, 5.384e-03, -4.378e-01, 2.970e-02, -1.125e-01));
r += mul(s1_6, M4(1.376e-02, -2.891e-03, 9.292e-03, -1.288e-03, 2.615e-02, 2.656e-02, -8.111e-02, -1.779e-02, -7.512e-03, 9.174e-03, -4.553e-02, -1.139e-02, 2.259e-02, 4.351e-03, 4.963e-02, 2.002e-02));
r += mul(s1_7, M4(9.993e-03, 1.250e-02, -2.090e-02, 6.839e-03, 3.502e-03, 2.070e-03, -5.530e-02, -2.855e-03, 1.144e-02, -4.191e-02, -8.395e-02, -2.056e-01, 6.909e-02, 7.425e-02, -2.374e-01, 8.636e-02));
r += mul(s1_8, M4(-1.762e-02, -3.804e-04, 2.643e-02, 4.383e-02, 1.748e-03, -1.201e-02, -8.452e-03, -1.216e-02, -1.203e-02, -3.454e-02, -1.957e-02, 2.212e-01, -1.375e-02, -1.094e-02, -1.245e-02, -2.124e-01));
// Constant per-channel offset, then squash through tanh.
r += V4(3.107e-03, 3.655e-03, 5.416e-04, 5.397e-04);
return tanh(r);
}
// Pass 10 entry point (out-shuffle). Each thread covers a 2x2 group of OUTPUT
// pixels that maps to one t0 texel: it runs f0() on the texel's 3x3
// neighbourhood, then for each of the four output pixels samples INPUT with
// the linear sampler, converts to YUV, adds one component of the f0() result
// to the first (luma) channel, and writes the pixel back as RGB.
void Pass10(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are presumably read by the l0()/O() macro
	// expansion, so their names must not change.
	const float2 pt = float2(GetInputPt());
	// First pixel of this thread's 2x2 group.
	uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}
	// Centre of the t0 texel that corresponds to the 2x2 group.
	const float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: non-negative halves; last nine: non-positive halves.
	const V4 delta = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));

	static const float3x3 rgb2yuv = {
		0.299, 0.587, 0.114,
		-0.169, -0.331, 0.5,
		0.5, -0.419, -0.081
	};
	static const float3x3 yuv2rgb = {
		1, -0.00093, 1.401687,
		1, -0.3437, -0.71417,
		1, 1.77216, 0.00099
	};

	const float2 opt = float2(GetOutputPt());
	// Step back half an output pixel from the texel centre, then walk the
	// four pixels one step at a time. The coordinate is stepped incrementally
	// (not recomputed) so the sampled positions stay exactly as before.
	float2 uv = pos - 0.5f * opt;

	// Pixel (0, 0) of the group takes delta.x.
	float3 yuv00 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv00.r + delta.x), yuv00.yz)), 1);

	// Pixel (1, 0) takes delta.y.
	dst.x += 1;
	uv.x += opt.x;
	float3 yuv10 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv10.r + delta.y), yuv10.yz)), 1);

	// Pixel (1, 1) takes delta.w.
	dst.y += 1;
	uv.y += opt.y;
	float3 yuv11 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv11.r + delta.w), yuv11.yz)), 1);

	// Pixel (0, 1) takes delta.z.
	dst.x -= 1;
	uv.x -= opt.x;
	float3 yuv01 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv01.r + delta.z), yuv01.yz)), 1);
}

View file

@ -0,0 +1,778 @@
// CuNNy 8x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N08
// Effect input texture.
//!TEXTURE
Texture2D INPUT;
// Effect output texture, declared at twice the input width and height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
// Point-filtered sampler; the O() macro below samples with it.
//!SAMPLER
//!FILTER POINT
SamplerState SP;
// Linearly-filtered sampler.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;
//!COMMON
// O(t, p): samples texture t with SP at pos + p * pt, mip level 0.
// `pos` and `pt` are not macro parameters: they must be in scope wherever
// the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the minimum-16-bit-precision float vector and matrix types.
#define V4 min16float4
#define M4 min16float4x4
// Two intermediate textures at input resolution in R8G8B8A8_SNORM format.
// The passes visible below alternate between them (t0 -> t1 -> t0 ...).
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0
// l0(x, y): samples INPUT via O() at offset (x, y), reduces its RGB to a
// single scalar with a fixed-weight dot product, adds the constant -8.258e-01
// and converts the result to min16float.
#define l0(x, y) min16float((dot(float3(2.666e-01, 5.050e-01, 1.135e-01), O(INPUT, float2(x, y)).rgb) + -8.258e-01))
// Input-layer weighted sum: scales one constant V4 per scalar tap
// (s0_0..s0_8), accumulates the nine products in order, adds a constant V4
// offset and returns the total.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 acc = 0.0;
	// One weight vector per tap; the summation order is kept unchanged.
	acc += s0_0 * V4(-2.544e-02, -4.130e-01, -2.634e-01, 2.417e-02);
	acc += s0_1 * V4(1.256e-02, -8.013e-02, 9.539e-02, -7.111e-02);
	acc += s0_2 * V4(1.768e-02, -2.469e-01, -1.627e-01, 8.569e-02);
	acc += s0_3 * V4(-1.554e-01, 3.441e-02, -1.508e-01, 2.491e-02);
	acc += s0_4 * V4(1.628e-01, 8.679e-01, -1.960e-02, -5.810e-01);
	acc += s0_5 * V4(-1.237e-02, -1.704e-01, 2.915e-01, -5.922e-01);
	acc += s0_6 * V4(7.925e-01, 5.570e-03, 7.074e-02, 4.442e-04);
	acc += s0_7 * V4(-7.910e-01, -1.530e-02, -8.229e-02, 3.149e-03);
	acc += s0_8 * V4(-3.973e-03, 2.262e-02, -1.213e-01, 3.843e-02);
	// Constant per-channel offset.
	acc += V4(-8.495e-04, -1.121e-04, 1.842e-02, 5.844e-02);
	return acc;
}
// Pass 1 entry point (in). Each thread handles one texel: it evaluates l0()
// at the nine offsets of the 3x3 neighbourhood, which yields one scalar per
// tap, and stores f0() of those scalars into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by the O() macro inside l0(), so their names
	// must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Scalar taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const min16float c0 = l0(-1.0, -1.0);
	const min16float c1 = l0(0.0, -1.0);
	const min16float c2 = l0(1.0, -1.0);
	const min16float c3 = l0(-1.0, 0.0);
	const min16float c4 = l0(0.0, 0.0);
	const min16float c5 = l0(1.0, 0.0);
	const min16float c6 = l0(-1.0, 1.0);
	const min16float c7 = l0(0.0, 1.0);
	const min16float c8 = l0(1.0, 1.0);

	t0[texel] = f0(c0, c1, c2, c3, c4, c5, c6, c7, c8);
}
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): samples t0 via O() at offset (x, y) and converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv1 weighted sum. Multiplies each of the 18 V4 inputs by its own 4x4
// matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns the total.
// Pass2 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(4.254e-02, 1.997e-01, 4.636e-02, -4.800e-02, 2.043e-01, -4.096e-02, -7.212e-02, 1.408e-02, -3.916e-01, 2.630e-03, 7.016e-02, 9.613e-02, 1.773e-01, -2.723e-01, -9.458e-02, -1.890e-01));
r += mul(s0_1, M4(2.350e-01, -8.474e-01, -4.044e-01, -9.188e-01, 9.560e-03, 5.061e-02, 1.092e-02, 1.781e-01, -2.144e-01, 3.203e-02, 6.349e-02, -8.272e-02, -3.105e-01, -3.917e-02, -1.320e-02, -1.541e-01));
r += mul(s0_2, M4(-8.130e-01, -1.003e-01, 8.195e-02, -7.597e-01, 5.207e-02, 3.470e-02, -8.823e-03, -1.131e-01, -4.029e-02, 7.571e-02, -2.010e-01, 2.487e-01, 1.677e-01, -5.118e-02, -1.070e-01, 7.606e-02));
r += mul(s0_3, M4(-1.158e-02, 4.898e-02, 1.202e-02, 5.012e-01, -5.343e-02, 4.756e-02, -2.438e-01, 6.399e-02, 2.822e-01, -2.863e-02, 1.996e-01, -7.099e-02, -1.323e-01, -3.797e-01, 5.385e-02, -1.014e-01));
r += mul(s0_4, M4(2.812e-01, 7.903e-01, -1.733e-01, 6.668e-01, 4.775e-01, 5.452e-01, 7.089e-01, -1.851e-01, -2.382e-01, -5.180e-02, -3.623e-01, -3.040e-01, -4.313e-01, -1.167e-02, 1.235e-01, 1.436e-01));
r += mul(s0_5, M4(-1.291e-01, -3.022e-02, -4.083e-01, -5.939e-02, -4.249e-01, -1.750e-01, 1.094e-01, -1.176e-01, 1.374e-02, 1.342e-01, 2.086e-01, 2.841e-01, 2.347e-01, 1.450e-01, 7.604e-02, 2.176e-01));
r += mul(s0_6, M4(8.130e-02, -7.215e-02, -5.249e-02, 9.518e-03, -1.979e-01, -4.441e-02, -1.857e-01, -4.227e-01, 2.149e-01, -1.610e-01, 1.655e-01, -8.841e-02, 1.409e-01, -1.059e-01, 2.037e-01, -2.744e-03));
r += mul(s0_7, M4(-7.266e-02, 1.638e-02, -1.639e-01, 1.957e-02, -2.857e-01, 1.936e-01, -1.243e-01, -1.490e-01, 1.525e-01, -8.934e-02, 7.415e-02, -1.779e-01, 1.648e-02, -6.456e-02, 7.053e-02, -9.530e-02));
r += mul(s0_8, M4(-6.960e-02, -8.960e-02, -1.757e-02, -1.370e-01, -5.137e-01, -1.179e-01, -4.053e-01, -1.987e-01, 7.100e-02, 2.928e-02, -9.682e-02, 2.403e-01, 1.814e-01, 2.131e-02, 5.579e-02, 5.457e-02));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(-2.737e-02, 5.272e-02, -1.801e-02, -2.491e-01, 2.871e-01, -3.704e-02, -6.568e-02, 2.905e-02, 1.011e-01, -3.782e-01, -8.696e-02, 4.682e-01, 3.233e-01, -3.060e-01, -3.251e-02, 1.165e+00));
r += mul(s1_1, M4(-4.994e-01, 3.049e-02, -8.802e-02, -6.179e-02, 7.133e-02, -1.957e-02, -4.465e-02, 1.130e-01, 7.255e-02, 6.956e-03, -1.204e-01, 3.699e-01, -8.844e-02, 4.624e-01, -9.881e-02, -2.512e-01));
r += mul(s1_2, M4(-3.645e-01, 1.274e-01, 2.387e-01, -1.963e-01, -5.995e-02, -5.943e-02, 9.694e-02, -2.518e-01, -2.797e-01, 1.598e-01, -1.371e-02, 4.000e-01, 2.213e-01, 9.692e-02, -3.302e-01, 1.132e+00));
r += mul(s1_3, M4(-8.539e-03, -6.535e-02, 5.575e-02, 1.928e-01, 1.156e-01, 5.227e-02, -3.039e-01, 4.794e-01, 1.441e-01, 1.929e-01, -4.689e-02, 2.023e-02, 1.330e-01, -1.358e+00, -5.393e-01, 7.907e-01));
r += mul(s1_4, M4(1.701e-01, -3.479e-02, 5.404e-01, -2.491e-01, 4.564e-01, 6.659e-01, 7.009e-01, -2.288e-02, -7.696e-01, -4.959e-01, 2.881e-01, -4.322e-01, -9.013e-01, -4.765e-01, 5.556e-02, -1.805e-01));
r += mul(s1_5, M4(-2.424e-01, 8.034e-03, -4.699e-02, -2.628e-01, -4.682e-01, 2.977e-02, 2.258e-01, -1.419e-01, 3.514e-01, 6.860e-03, 2.147e-01, 3.806e-01, 3.747e-01, 1.403e-01, 3.106e-01, 9.680e-01));
r += mul(s1_6, M4(1.776e-01, -4.873e-02, -1.403e-01, -1.817e-02, -3.551e-01, 4.838e-04, -2.786e-01, -6.048e-01, 3.082e-01, -4.703e-01, 2.419e-01, -3.002e-01, -4.310e-01, -6.490e-01, 1.343e+00, -1.019e+00));
r += mul(s1_7, M4(4.689e-02, -2.927e-02, -7.494e-02, -3.516e-02, -2.217e-01, -3.189e-01, 2.202e-01, -2.936e-01, 4.772e-02, -1.609e-01, 9.853e-02, -4.214e-01, 2.780e-01, -1.073e-01, 1.102e-01, -2.033e-01));
r += mul(s1_8, M4(-9.468e-02, 4.428e-02, 1.269e-01, -1.086e-01, -1.106e-01, -1.367e-01, -3.356e-01, 4.656e-03, 4.648e-02, -1.743e-02, -2.074e-01, -3.745e-02, 1.281e-01, -3.233e-01, 6.533e-01, 3.705e-01));
// Constant per-channel offset.
r += V4(1.016e-03, 5.583e-03, -1.608e-02, -1.996e-04);
return r;
}
// Pass 2 entry point (conv1). Each thread handles one texel: it gathers the
// 3x3 neighbourhood of t0 through l0(), splits every tap into a non-negative
// half max(v, 0) and a non-positive half -max(-v, 0), and stores f0() of the
// 18 resulting vectors into t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by the O() macro inside l0(), so their names
	// must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: non-negative halves; last nine: non-positive halves.
	t1[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): samples t1 via O() at offset (x, y) and converts the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
// conv2 weighted sum. Multiplies each of the 18 V4 inputs by its own 4x4
// matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns the total.
// Pass3 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(-4.810e-02, 2.379e-02, -8.471e-02, 1.305e-01, -5.897e-02, 1.263e-01, -9.639e-02, 9.150e-02, 9.002e-03, -1.763e-01, 8.275e-02, -2.357e-01, 7.181e-02, -7.360e-02, 4.629e-02, -8.259e-02));
r += mul(s0_1, M4(6.774e-02, 9.108e-02, -3.750e-01, 8.014e-02, 2.890e-01, 9.986e-02, -1.262e-02, -1.285e-01, -2.789e-01, -1.145e-01, -4.982e-02, -1.101e-01, -2.051e-02, -2.271e-01, 1.343e-01, -8.643e-02));
r += mul(s0_2, M4(-5.433e-02, 6.899e-02, -3.350e-01, -7.837e-02, -1.076e-01, 1.912e-02, -9.061e-02, 1.919e-01, 9.387e-02, -4.206e-02, 1.861e-01, -4.416e-03, -1.560e-01, -4.364e-02, 4.364e-01, 8.765e-02));
r += mul(s0_3, M4(2.382e-01, 3.032e-01, -1.313e-01, -1.154e-01, 1.008e-01, 3.058e-01, -8.513e-02, 2.713e-01, -9.875e-02, 3.017e-01, 3.203e-02, 5.762e-01, -2.056e-03, -7.698e-02, 8.681e-02, 4.245e-02));
r += mul(s0_4, M4(2.643e-01, 1.750e-01, 4.850e-02, 3.131e-03, 2.785e-01, 1.598e-01, 5.772e-01, -4.118e-04, -4.270e-01, -2.447e-01, 4.486e-01, 9.155e-02, -3.428e-01, -2.583e-01, -3.721e-02, 6.278e-02));
r += mul(s0_5, M4(-1.080e-01, -5.514e-02, -3.648e-01, -2.319e-02, -2.100e-01, -4.065e-02, 1.126e-01, 3.970e-02, 9.824e-02, 1.377e-02, 1.295e-01, -2.512e-02, 1.115e-01, 7.094e-02, 3.413e-01, -5.245e-02));
r += mul(s0_6, M4(1.991e-01, 4.710e-02, -9.305e-02, -1.471e-01, -8.221e-02, 1.134e-01, -1.718e-01, -2.606e-01, -8.167e-02, -1.462e-02, -1.094e-01, -1.569e-01, 2.133e-02, 3.374e-02, 4.583e-02, 1.228e-01));
r += mul(s0_7, M4(-2.135e-01, 6.874e-02, -4.993e-02, 1.156e-02, -4.261e-01, 1.366e-01, 4.250e-02, -5.707e-02, -1.966e-01, -6.106e-02, 1.265e-01, -3.076e-03, 2.043e-03, -3.072e-02, 1.043e-01, 3.422e-01));
r += mul(s0_8, M4(7.235e-02, -3.542e-04, -1.435e-02, -3.815e-02, -8.855e-02, 8.327e-02, 1.954e-01, 1.462e-01, 1.615e-01, -4.957e-02, 1.596e-02, -8.625e-02, 6.574e-02, -9.799e-02, 5.401e-03, 7.595e-02));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(1.245e-01, -2.812e-03, 1.486e-02, 1.246e-01, -5.943e-02, 1.170e-01, -1.068e-01, 8.960e-02, 5.354e-03, -2.039e-01, 8.228e-02, -2.530e-01, -2.789e-03, -6.932e-02, -3.187e-02, -5.794e-02));
r += mul(s1_1, M4(-2.539e-02, 4.598e-02, -1.205e-01, 1.597e-01, 2.391e-01, 1.269e-01, -1.116e-02, 1.498e-02, -2.388e-01, -1.548e-01, -7.389e-02, -1.083e-02, -1.181e-01, -7.069e-02, 9.383e-03, -2.018e-01));
r += mul(s1_2, M4(-1.248e-02, 3.267e-02, -2.761e-01, -2.043e-02, -8.520e-02, 3.937e-02, -1.372e-01, 1.821e-02, 6.915e-02, -4.061e-02, 1.782e-01, -4.619e-02, 6.811e-02, -5.458e-04, 3.193e-01, 8.892e-03));
r += mul(s1_3, M4(-1.580e-01, 7.536e-02, -6.680e-02, 1.891e-01, 1.196e-01, 3.476e-01, -6.321e-02, 1.972e-01, -9.851e-02, 4.483e-01, 9.326e-03, 5.272e-01, -1.478e-01, -4.009e-02, -3.561e-02, -2.549e-01));
r += mul(s1_4, M4(-1.253e-01, 1.345e-01, 4.994e-01, 2.000e-01, 2.728e-01, 1.672e-01, 5.501e-01, -1.736e-02, -5.782e-01, -2.191e-01, 4.380e-01, 4.346e-02, -3.006e-01, -5.220e-02, -1.613e-01, 6.023e-02));
r += mul(s1_5, M4(1.276e-01, -8.319e-02, -2.115e-01, 1.471e-01, -1.669e-01, -2.484e-02, 9.906e-02, 1.836e-02, 1.010e-01, 1.847e-02, 1.027e-01, -1.680e-02, -1.880e-01, 1.377e-01, 3.823e-02, -8.256e-02));
r += mul(s1_6, M4(-3.200e-01, -7.023e-02, -1.243e-01, -2.003e-02, -7.863e-02, 6.650e-02, -1.264e-01, -1.862e-01, -9.119e-02, -4.374e-02, -1.195e-01, -6.902e-02, -1.360e-01, 3.356e-02, -3.667e-02, -1.815e-01));
r += mul(s1_7, M4(1.462e-02, 1.001e-01, 2.453e-01, -1.298e-02, -4.372e-01, 1.509e-01, 8.011e-02, -1.323e-01, -1.980e-01, -4.785e-02, 1.733e-01, 1.100e-02, -2.153e-01, 6.711e-02, 2.595e-03, 1.213e-01));
r += mul(s1_8, M4(-3.794e-03, 2.239e-02, -6.960e-02, 7.342e-02, -1.882e-01, 1.159e-01, 1.876e-01, 3.125e-02, 2.242e-01, -5.956e-02, 1.328e-02, -5.400e-02, 2.205e-02, -6.049e-02, -9.151e-02, -1.137e-01));
// Constant per-channel offset.
r += V4(-1.437e-02, -2.276e-02, 2.275e-02, 6.547e-04);
return r;
}
// Pass 3 entry point (conv2). Each thread handles one texel: it gathers the
// 3x3 neighbourhood of t1 through l0(), splits every tap into a non-negative
// half max(v, 0) and a non-positive half -max(-v, 0), and stores f0() of the
// 18 resulting vectors into t0.
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are read by the O() macro inside l0(), so their names
	// must not change.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	// Threads mapped outside the input have nothing to write.
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Raw taps at offsets (-1,-1) .. (1,1), x varying fastest.
	const V4 c0 = l0(-1.0, -1.0);
	const V4 c1 = l0(0.0, -1.0);
	const V4 c2 = l0(1.0, -1.0);
	const V4 c3 = l0(-1.0, 0.0);
	const V4 c4 = l0(0.0, 0.0);
	const V4 c5 = l0(1.0, 0.0);
	const V4 c6 = l0(-1.0, 1.0);
	const V4 c7 = l0(0.0, 1.0);
	const V4 c8 = l0(1.0, 1.0);

	// First nine arguments: non-negative halves; last nine: non-positive halves.
	t0[texel] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): samples t0 via O() at offset (x, y) and converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
// conv3 weighted sum. Multiplies each of the 18 V4 inputs by its own 4x4
// matrix (via mul), accumulates the products in the order written, adds a
// constant V4 offset and returns the total.
// Pass4 calls this with s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0) for
// the nine 3x3 taps.
// NOTE(review): the constants are presumably exported network weights; keep
// them and the accumulation order untouched.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
// Contributions of the non-negative halves (s0_*).
r += mul(s0_0, M4(3.886e-03, -1.503e-01, -6.378e-01, 4.214e-02, -1.255e-01, 1.146e-01, -1.917e-01, -6.556e-02, -3.368e-02, 6.874e-02, 2.796e-01, -2.936e-02, -3.239e-02, 3.923e-02, -6.439e-02, 1.313e-02));
r += mul(s0_1, M4(4.357e-01, -1.067e-01, 3.330e-01, -8.295e-02, -4.004e-01, 3.113e-01, -4.222e-02, 2.290e-01, -1.861e-01, 9.039e-02, -1.132e-01, 1.077e-01, -1.603e-02, 6.296e-02, 4.907e-01, 3.396e-02));
r += mul(s0_2, M4(-3.290e-01, -1.073e-01, 1.064e-02, -2.792e-03, -4.366e-01, 3.239e-01, -1.383e-01, 1.918e-01, 3.058e-02, 1.006e-01, -6.898e-02, -1.451e-02, -1.882e-01, 2.248e-01, 1.744e-02, -3.155e-02));
r += mul(s0_3, M4(2.403e-02, -1.353e-01, 1.895e-01, -2.285e-01, -1.211e-01, 1.771e-01, 2.135e-01, 1.900e-01, -4.204e-03, 3.719e-02, -4.772e-01, 2.006e-01, -2.532e-03, 5.872e-02, 2.901e-01, -9.450e-02));
r += mul(s0_4, M4(8.054e-02, 1.389e-02, -2.060e-02, -3.042e-01, -2.476e-01, 9.905e-02, -9.248e-01, 3.372e-01, -5.254e-01, 4.455e-01, 5.707e-02, 1.057e-01, -3.525e-01, 3.349e-01, -3.414e-01, 7.090e-02));
r += mul(s0_5, M4(-1.889e-01, -2.290e-01, -4.930e-02, -1.824e-01, -2.062e+00, 6.868e-02, 2.552e-01, 3.883e-01, 5.778e-02, 9.141e-02, 9.917e-02, -1.164e-01, 4.359e-02, 2.105e-01, -7.911e-02, -1.916e-01));
r += mul(s0_6, M4(-2.267e-02, -6.231e-03, -9.718e-03, 3.770e-04, -6.982e-02, 4.184e-02, -2.296e-01, -9.542e-02, 5.236e-02, -5.412e-02, -1.757e-01, -1.054e-01, 1.414e-02, -7.772e-02, -1.338e-02, 3.928e-02));
r += mul(s0_7, M4(5.776e-02, 4.703e-02, 3.914e-02, -1.617e-02, -3.606e-01, 3.037e-01, -3.096e-01, 3.562e-02, 3.108e-01, -3.684e-01, 3.725e-02, -2.050e-01, -1.494e-02, 8.741e-02, 5.992e-02, 2.655e-02));
r += mul(s0_8, M4(3.614e-02, -1.212e-01, 2.507e-02, -5.858e-02, -1.121e-01, -3.433e-01, 6.613e-02, -6.943e-01, 2.233e-02, -5.467e-02, -6.900e-03, -2.566e-01, -1.106e-01, 2.016e-02, -3.700e-02, -2.886e-01));
// Contributions of the non-positive halves (s1_*).
r += mul(s1_0, M4(-5.136e-02, -2.190e-01, -1.035e+00, -5.722e-02, 2.876e-02, 5.070e-02, 3.532e-01, -6.778e-03, 2.930e-04, -6.219e-02, 2.314e-01, -5.210e-02, 1.508e-02, -4.390e-02, -7.749e-02, -9.658e-03));
r += mul(s1_1, M4(3.663e-01, -9.746e-02, -6.582e-01, -3.676e-01, -1.694e-01, 7.883e-02, -1.613e-01, 2.328e-02, 2.595e-04, -3.763e-02, -9.946e-02, -6.137e-02, 1.429e-01, -1.964e-01, 2.439e-01, 4.898e-02));
r += mul(s1_2, M4(7.884e-02, 1.842e-01, -1.309e-01, 4.895e-02, 4.820e-02, 8.364e-02, 1.189e-02, -1.438e-02, -7.934e-02, 4.775e-02, -6.137e-02, -1.335e-02, -4.416e-02, 3.584e-02, 1.751e-04, -1.178e-02));
r += mul(s1_3, M4(-9.861e-03, -1.277e-01, 2.389e-03, -3.232e-01, -2.782e-03, 1.115e-01, -6.485e-02, 2.093e-01, 2.056e-01, 2.527e-02, -1.772e-01, 1.863e-02, 5.983e-02, -8.103e-02, 3.076e-01, -2.027e-01));
r += mul(s1_4, M4(1.001e-01, 3.476e-01, -1.305e-01, -1.653e-01, 8.890e-02, -4.170e-01, -1.530e-01, 7.048e-02, -5.605e-01, 1.093e-01, 2.038e-01, -2.320e-01, -1.287e-01, -2.173e-01, -1.630e-01, -9.691e-02));
r += mul(s1_5, M4(-2.778e-01, 1.393e-01, -2.802e-02, -5.375e-02, -4.550e-01, -1.661e-01, 2.293e-03, -5.984e-02, -5.070e-02, -8.852e-02, 7.806e-02, 2.187e-02, 1.901e-01, -3.219e-01, -1.937e-01, -2.336e-01));
r += mul(s1_6, M4(-8.489e-02, 1.968e-01, -7.760e-02, 1.388e-01, 4.713e-03, 1.527e-01, 8.535e-02, 1.643e-02, 1.429e-01, -1.558e-01, 2.339e-01, 2.762e-01, 1.694e-02, -4.245e-02, -2.793e-02, -3.332e-02));
r += mul(s1_7, M4(-4.377e-02, 3.486e-01, -1.766e-01, -1.065e-01, -1.645e-01, -8.722e-04, -1.147e-01, 1.663e-01, 6.801e-02, -3.539e-01, 1.560e-02, -1.819e-01, 1.440e-02, -1.221e-02, 3.693e-02, 5.886e-03));
r += mul(s1_8, M4(5.940e-02, 1.624e-01, 1.526e-02, 6.692e-02, 1.812e-01, -8.647e-02, 3.210e-02, -3.751e-04, 2.884e-02, -4.717e-02, 4.121e-03, 5.144e-02, -1.995e-02, -2.827e-01, 6.148e-03, 7.209e-02));
// Constant per-channel offset.
r += V4(1.575e-02, -2.007e-01, -3.519e-03, -9.082e-03);
return r;
}
// Pass 4 entry point: reads the 3x3 neighbourhood of the current pixel through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t1.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): tap of t1 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 5 ("conv4") kernel: returns the sum of the 18 products mul(s*_k, M4(...)) below
// plus a constant V4 offset.
//   s0_0..s0_8 - as passed by Pass5: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass5: -max(-tap, 0) for the same taps
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(-6.479e-02, -9.976e-02, -1.507e-01, -9.934e-02, -1.046e-02, -1.471e-01, -4.218e-02, -8.348e-04, -5.963e-02, 1.519e-03, 5.897e-03, 5.284e-02, -4.467e-01, 4.779e-01, -1.953e-02, 1.951e-01));
r += mul(s0_1, M4(-5.276e-02, -1.201e-01, -1.160e-01, 6.076e-02, -4.798e-02, -3.491e-01, -3.055e-01, -1.607e-01, -8.989e-02, 1.221e-01, -1.561e-01, 6.227e-02, -1.598e-01, -6.666e-01, 6.029e-01, -5.466e-01));
r += mul(s0_2, M4(-1.331e-01, -4.988e-02, -2.217e-02, 3.405e-02, 2.261e-02, 1.352e-01, 1.124e-02, 8.259e-02, -3.548e-02, 2.454e-01, 4.417e-02, 2.297e-01, 1.780e-01, -2.203e-01, 5.913e-02, -2.201e-01));
r += mul(s0_3, M4(1.348e-01, 5.544e-01, -4.335e-01, -3.619e-01, 1.011e-01, 2.665e-01, -2.627e-01, -1.800e-01, -1.158e-01, -8.543e-02, -7.868e-03, 2.056e-01, 1.988e-01, 1.174e+00, -1.291e-01, 1.131e-01));
r += mul(s0_4, M4(4.504e-01, 1.025e-01, -1.449e-01, -3.442e-02, -4.525e-01, -1.513e-01, -8.135e-02, -9.669e-02, -3.287e-01, 5.251e-01, -6.540e-01, 7.386e-02, 2.603e-01, -8.246e-01, -1.378e-01, 2.363e+00));
r += mul(s0_5, M4(-7.102e-02, -5.554e-02, -3.489e-02, -6.688e-02, 2.877e-01, -6.258e-02, 8.515e-02, -2.109e-01, -2.723e-01, 1.543e-01, 1.285e-01, 9.366e-02, 3.135e-02, -3.700e-01, -4.111e-01, 1.822e+00));
r += mul(s0_6, M4(-4.018e-02, -3.412e-01, 5.388e-02, 4.947e-01, -3.234e-02, -6.778e-02, 3.825e-02, 1.313e-01, -6.083e-02, 3.439e-02, -1.081e-01, 6.456e-02, 2.287e-02, -2.470e-01, 2.026e-02, -1.886e-02));
r += mul(s0_7, M4(2.410e-01, 1.529e-01, -1.370e-01, -1.389e-01, 1.549e-01, 8.308e-03, 3.064e-02, 3.925e-02, -9.013e-02, 1.131e-01, -9.240e-02, 3.740e-01, -1.009e-01, -6.576e-02, -1.491e-01, -3.452e-02));
r += mul(s0_8, M4(-1.628e-01, -2.480e-02, -6.569e-02, 3.873e-02, 1.604e-02, 1.651e-02, -4.681e-02, -1.647e-02, -1.648e-02, 1.541e-01, 2.284e-02, 6.545e-01, 1.799e-03, 1.193e-03, -1.215e-01, 5.919e-02));
r += mul(s1_0, M4(-1.115e-02, -5.014e-02, -1.499e-01, -7.414e-04, -6.944e-02, -4.168e-02, -1.254e-01, -6.576e-02, 2.946e-04, -2.669e-02, 4.109e-02, 1.949e-02, 1.242e-01, 1.753e-01, 9.717e-02, 1.446e-01));
r += mul(s1_1, M4(-1.327e-02, -1.462e-01, -8.510e-02, -1.228e-02, 1.772e-01, 1.009e-01, -4.342e-02, -8.827e-02, -6.663e-02, -1.245e-01, -4.625e-02, -4.285e-02, 7.586e-02, -1.208e-01, 2.705e-01, -1.558e-01));
r += mul(s1_2, M4(-7.024e-02, -3.045e-02, -1.916e-02, 4.979e-02, -9.145e-02, 2.285e-01, 4.612e-02, 2.217e-01, 7.690e-02, -4.332e-02, 6.032e-03, -2.370e-02, 3.802e-01, -8.124e-02, 1.982e-02, -8.310e-02));
r += mul(s1_3, M4(1.238e-01, 5.787e-01, -5.332e-01, -2.806e-01, 1.208e-01, 6.549e-02, -2.040e-01, -2.578e-02, -5.878e-02, -1.496e-01, 1.213e-01, 1.489e-02, 9.569e-02, 1.964e-01, 6.477e-02, -2.939e-01));
r += mul(s1_4, M4(5.825e-01, 2.257e-01, -1.943e-01, 1.101e-01, -3.240e-01, -2.967e-01, -4.203e-02, -3.636e-01, -1.062e-01, -3.799e-02, -4.444e-01, -7.607e-02, -3.056e-01, -2.926e-01, -4.582e-02, 2.795e-01));
r += mul(s1_5, M4(-9.076e-02, -5.130e-02, -3.718e-02, -6.163e-02, 1.831e-01, -1.199e-01, 9.176e-02, -2.456e-01, 2.362e-01, -1.854e-01, -1.394e-01, 3.560e-03, 2.070e-02, -6.903e-02, -5.061e-02, 3.068e-02));
r += mul(s1_6, M4(-4.988e-02, -3.880e-01, 3.001e-02, 3.892e-01, -2.827e-02, -2.880e-02, 4.071e-02, 2.861e-01, -4.016e-02, -1.085e-01, 9.207e-03, -7.367e-02, 9.072e-03, 8.960e-02, 5.334e-03, -6.480e-02));
r += mul(s1_7, M4(2.900e-01, 1.450e-01, -1.401e-01, -2.809e-01, 1.218e-01, -3.153e-03, -2.544e-02, 1.898e-01, -7.197e-02, -3.721e-01, 4.042e-02, 9.918e-02, -1.132e-01, 3.578e-02, 4.000e-02, 6.991e-02));
r += mul(s1_8, M4(-1.493e-01, -2.310e-02, -6.133e-02, 5.322e-02, -4.879e-02, -5.139e-02, -8.058e-02, 4.140e-02, 2.511e-01, 3.669e-02, -1.003e-01, -1.457e-01, 1.528e-01, 1.177e-01, 6.665e-02, -3.084e-02));
// Constant per-component offset, added after all taps.
r += V4(2.513e-04, -2.994e-02, -5.133e-02, -8.977e-03);
return r;
}
// Pass 5 entry point: reads the 3x3 neighbourhood of the current pixel from t1 through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t0.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t0[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): tap of t0 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 6 ("conv5") kernel: returns the sum of the 18 products mul(s*_k, M4(...)) below
// plus a constant V4 offset.
//   s0_0..s0_8 - as passed by Pass6: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass6: -max(-tap, 0) for the same taps
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(4.575e-01, 2.412e-01, 1.926e-01, 5.873e-02, 2.954e-02, -1.424e-01, 7.881e-03, 2.358e-04, -5.872e-02, -1.007e-01, -3.632e-02, 5.718e-02, 1.389e-01, -4.163e-02, -1.379e-01, 2.160e-03));
r += mul(s0_1, M4(1.347e-01, -8.074e-01, -1.155e-01, 2.242e-01, -2.673e-01, 4.053e-01, 8.867e-02, -2.840e-02, 9.443e-02, 2.632e-01, 9.207e-02, -1.793e-02, 1.519e-01, 3.302e-03, 2.027e-01, 2.643e-02));
r += mul(s0_2, M4(1.462e-02, -7.543e-02, -6.080e-02, 7.431e-02, -3.673e-02, -1.665e-01, -2.745e-01, -4.416e-02, -3.270e-01, 7.677e-01, 7.241e-01, -1.157e-01, -8.204e-03, 2.172e-02, 3.183e-01, 3.931e-02));
r += mul(s0_3, M4(1.168e+00, -8.427e-01, -3.237e-03, 5.416e-02, 1.694e-02, -1.042e-01, -2.173e-01, -1.089e-01, -9.881e-02, -1.109e-01, -1.003e-01, -5.080e-02, -9.279e-02, -1.111e-01, -2.699e-02, -2.297e-02));
r += mul(s0_4, M4(-4.884e-01, -4.472e-01, -9.701e-02, 8.789e-01, 1.962e-02, 5.041e-01, 3.221e-01, -4.622e-02, 9.039e-02, -2.531e-01, 6.228e-01, 1.590e-02, 1.804e-02, 7.795e-02, -8.005e-02, -6.310e-03));
r += mul(s0_5, M4(-6.567e-02, -5.161e-02, 5.550e-02, 5.285e-02, -6.147e-02, -1.840e-01, 2.028e-01, 4.014e-01, 4.070e-01, -1.022e-01, 1.414e+00, -3.126e-01, 7.508e-03, 1.013e-01, -7.300e-02, -4.282e-01));
r += mul(s0_6, M4(1.721e+00, 1.776e-01, -8.690e-02, -1.102e-01, -8.467e-02, -2.165e-02, 6.238e-02, 2.052e-02, 2.763e-01, -3.472e-02, -1.179e-01, 2.993e-02, -6.860e-02, 1.887e-02, 3.140e-02, -6.853e-02));
r += mul(s0_7, M4(1.937e-01, 1.975e-01, -2.456e-01, -1.360e+00, 1.792e-01, -5.969e-02, -7.670e-02, 2.606e-01, 1.355e-01, -9.109e-03, 2.756e-01, 6.674e-02, 1.312e-02, -1.542e-02, 2.236e-02, 1.997e-01));
r += mul(s0_8, M4(4.255e-02, -1.452e-02, -8.732e-02, -1.084e-01, 1.495e-02, 1.302e-02, -9.151e-02, -2.814e-01, 5.197e-02, 2.866e-02, 5.490e-01, 4.310e-01, 3.666e-02, -3.380e-03, -2.830e-02, -8.223e-02));
r += mul(s1_0, M4(2.549e-02, 7.469e-02, -5.290e-02, -4.972e-02, -2.340e-01, -1.875e-01, 1.656e-01, 5.697e-02, -8.570e-02, -1.520e-01, -2.622e-02, 1.043e-02, -2.377e-01, -3.927e-02, 1.539e-01, 4.528e-02));
r += mul(s1_1, M4(-1.188e-02, -9.781e-02, 1.606e-01, 5.138e-02, -4.165e-01, 8.262e-01, 1.709e-01, -1.063e-01, 8.393e-03, 7.300e-02, -9.347e-02, -6.226e-02, -3.633e-01, -4.453e-01, 2.190e-01, 2.415e-01));
r += mul(s1_2, M4(-4.011e-02, 3.404e-02, 1.013e-01, 3.551e-02, 9.692e-02, -2.109e-01, 1.897e-01, -2.192e-01, -1.703e-01, 5.317e-01, 1.354e-01, -2.027e-01, -3.658e-01, -1.845e-01, -5.465e-01, 1.436e-01));
r += mul(s1_3, M4(7.674e-01, 1.677e-01, -7.875e-02, 7.537e-03, -4.911e-01, -1.083e-01, 7.183e-03, -1.107e-01, -2.514e-02, -1.257e-01, -5.070e-02, -3.886e-02, 1.368e-01, -1.991e-02, -1.698e-01, -7.850e-03));
r += mul(s1_4, M4(-5.096e-02, 7.912e-02, -2.105e-01, 1.149e-01, 9.798e-02, 2.243e-01, -3.434e-01, 3.492e-01, -1.265e-01, -1.839e-01, -1.337e-01, -6.909e-02, -8.552e-01, 1.334e-01, 8.652e-01, -3.408e-01));
r += mul(s1_5, M4(-2.933e-02, 1.424e-01, 6.542e-02, -1.710e-01, -1.459e-01, -3.069e-02, -1.275e-01, -9.443e-02, 2.657e-01, -4.784e-04, -6.729e-03, -1.910e-01, -4.628e-01, 3.808e-02, -1.470e-01, 1.480e-01));
r += mul(s1_6, M4(1.512e-01, -1.755e-02, -5.440e-02, 1.317e-02, -7.181e-02, -6.842e-03, -7.375e-02, -8.356e-02, 7.332e-02, -9.437e-02, -1.008e-01, -4.731e-02, -9.102e-02, -8.192e-03, 7.862e-04, 6.417e-02));
r += mul(s1_7, M4(2.457e-01, -1.058e-01, -2.777e-02, -1.532e-03, 7.609e-02, 3.452e-02, 1.774e-01, 3.296e-01, 6.779e-02, -6.683e-02, 1.485e-01, 7.321e-02, -3.082e-02, -4.348e-02, 3.558e-03, 9.111e-03));
r += mul(s1_8, M4(1.104e-01, 5.040e-03, 9.642e-03, -8.991e-02, -2.134e-01, 3.758e-02, -1.244e-01, -1.987e-01, -7.007e-02, 6.792e-03, 1.369e-01, 5.332e-01, -5.354e-02, -2.024e-02, -1.038e-01, -4.812e-02));
// Constant per-component offset, added after all taps.
r += V4(4.102e-03, 1.192e-03, -2.598e-03, -2.812e-03);
return r;
}
// Pass 6 entry point: reads the 3x3 neighbourhood of the current pixel from t0 through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t1.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass6(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): tap of t1 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 7 ("conv6") kernel: returns the sum of the 18 products mul(s*_k, M4(...)) below
// plus a constant V4 offset.
//   s0_0..s0_8 - as passed by Pass7: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass7: -max(-tap, 0) for the same taps
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(6.200e-02, 5.385e-02, -5.478e-02, 3.955e-02, -1.722e-02, -1.194e-01, 8.331e-02, -9.296e-02, -2.161e-02, 8.716e-02, -5.918e-02, 1.032e-01, 4.954e-02, -3.822e-02, 8.472e-02, -2.191e-01));
r += mul(s0_1, M4(2.503e-01, 5.635e-02, 7.355e-03, -2.025e-01, 7.104e-02, -1.324e-01, -3.051e-02, 2.246e-02, -4.480e-02, 6.693e-03, 4.467e-02, 3.388e-02, 4.262e-01, 1.488e-01, -8.809e-01, 5.350e-01));
r += mul(s0_2, M4(-7.511e-03, 1.921e-01, -3.653e-01, 2.096e-02, 2.413e-02, 4.846e-02, -1.538e-01, 3.359e-02, 5.958e-03, -1.033e-02, 2.389e-02, 1.283e-02, -5.270e-02, 2.842e-01, 5.681e-02, -3.578e-02));
r += mul(s0_3, M4(-2.198e-02, -1.674e-02, 3.330e-02, 3.249e-02, -4.430e-02, 9.217e-02, -3.348e-02, -3.546e-01, 1.228e-01, 3.875e-02, 7.220e-03, 6.719e-02, -8.768e-01, -1.165e-02, -3.862e-02, -2.045e-02));
r += mul(s0_4, M4(-6.935e-01, -4.898e-01, 2.252e-01, -1.647e-01, -6.408e-02, 4.562e-01, -6.617e-01, 1.220e-01, 1.053e-02, -9.937e-02, -1.118e-02, 3.272e-01, -9.081e-02, 2.353e-02, 4.776e-01, -1.238e-01));
r += mul(s0_5, M4(2.481e-01, -3.296e-01, -3.372e-02, -2.008e-02, 5.924e-03, 1.762e-02, 3.642e-01, -1.182e-01, -2.219e-02, -4.332e-02, -9.762e-02, 3.537e-02, 2.114e-02, -5.440e-02, 3.124e-01, 5.069e-02));
r += mul(s0_6, M4(-5.465e-02, -5.352e-03, -3.419e-03, -6.733e-02, -8.079e-02, -6.569e-02, -1.494e-02, -3.462e-01, -8.125e-03, 2.572e-03, -3.894e-02, -3.246e-02, -1.566e-02, -3.004e-02, 1.145e-01, 6.794e-02));
r += mul(s0_7, M4(4.788e-02, 7.675e-03, -7.030e-02, -2.384e-02, -3.070e-01, -7.080e-01, -2.017e-01, 9.579e-02, 1.259e-01, -1.004e-02, -1.287e-01, 3.334e-02, -9.642e-02, -8.073e-02, 2.546e-02, 5.204e-02));
r += mul(s0_8, M4(-6.015e-02, 1.650e-01, -5.471e-02, -1.454e-01, -2.785e-02, -1.831e-01, 1.123e-01, 3.453e-02, -1.179e-02, 1.722e-02, -1.068e-02, -2.608e-02, 1.514e-04, -1.287e-02, -7.741e-03, -9.765e-03));
r += mul(s1_0, M4(-4.922e-02, -5.675e-03, -2.161e-02, 3.164e-02, -2.003e-02, -3.890e-02, 5.198e-02, -1.811e-03, -3.385e-02, -1.510e-02, -2.289e-02, 1.009e-01, 4.427e-02, -1.763e-01, 1.255e-01, -5.073e-02));
r += mul(s1_1, M4(1.057e-01, -8.124e-02, 1.131e-01, -1.361e-01, 4.740e-02, -6.425e-02, 8.930e-03, 5.318e-02, 5.266e-02, -6.003e-02, 1.320e-01, 4.163e-02, 1.277e-01, -2.404e-01, -1.696e-01, 2.204e-01));
r += mul(s1_2, M4(2.723e-02, 1.918e-01, -2.822e-01, -1.877e-02, -4.599e-03, 7.591e-02, -1.128e-01, -6.519e-03, 2.311e-02, -1.684e-01, 2.293e-01, -1.042e-01, -1.882e-02, 4.970e-02, -1.309e-01, -8.894e-03));
r += mul(s1_3, M4(4.883e-02, 2.819e-02, 4.318e-02, 3.186e-02, 7.782e-02, 1.741e-01, -8.927e-02, 4.005e-02, 5.888e-02, -1.057e-01, 9.692e-02, 8.032e-02, -1.086e-01, 6.323e-02, -8.520e-02, -1.273e-02));
r += mul(s1_4, M4(-1.746e-01, -2.834e-02, -3.694e-02, 3.226e-01, -2.541e-01, 6.860e-01, -1.436e-01, 1.705e-01, 2.614e-01, -6.751e-02, 5.646e-02, 3.666e-01, -2.621e-02, 4.951e-01, -1.090e-01, -3.168e-01));
r += mul(s1_5, M4(1.513e-01, 5.210e-02, 2.625e-01, -6.303e-02, -2.252e-02, -9.485e-02, 4.776e-01, -1.789e-01, -1.291e-01, -9.714e-02, -1.427e-01, -1.165e-01, 2.415e-02, 9.790e-02, 6.024e-02, -9.622e-02));
r += mul(s1_6, M4(3.751e-02, -2.907e-02, -1.762e-02, -9.545e-02, 2.866e-01, -7.329e-02, -9.787e-03, 4.513e-03, -9.486e-02, -2.446e-02, -2.357e-02, -5.002e-02, 4.973e-02, 6.256e-02, -2.532e-02, -1.817e-02));
r += mul(s1_7, M4(-6.855e-02, -6.762e-02, -6.269e-02, -6.947e-02, -1.389e-01, -1.915e-01, -4.806e-02, 1.870e-01, 1.298e-01, 6.268e-03, -5.985e-02, -5.396e-02, -3.048e-02, -5.396e-03, -9.720e-02, 3.289e-03));
r += mul(s1_8, M4(-2.052e-02, -8.106e-02, -1.721e-02, 9.911e-03, -8.521e-02, 4.832e-02, -1.708e-01, -6.445e-02, -9.788e-02, 8.836e-02, -1.204e-01, -1.123e-01, 1.514e-02, 1.628e-02, -5.003e-02, -6.128e-03));
// Constant per-component offset, added after all taps.
r += V4(1.448e-03, -2.432e-03, -8.004e-04, 5.896e-05);
return r;
}
// Pass 7 entry point: reads the 3x3 neighbourhood of the current pixel from t1 through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t0.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass7(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t0[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1
// l0(x, y): tap of t0 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 8 ("conv7") kernel: returns the sum of the 18 products mul(s*_k, M4(...)) below
// plus a constant V4 offset.
//   s0_0..s0_8 - as passed by Pass8: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass8: -max(-tap, 0) for the same taps
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(5.901e-02, -1.033e-01, -1.441e-01, 4.291e-02, 2.355e-02, -1.199e-01, -1.741e-01, -5.263e-03, -6.030e-03, -4.043e-02, 1.910e-01, 8.326e-03, 2.913e-02, 1.969e-02, -1.380e-01, 9.492e-02));
r += mul(s0_1, M4(-1.616e-01, 1.649e-01, -1.133e-02, -1.037e-01, -1.060e-02, 2.299e-01, -5.302e-02, -2.329e-01, -8.540e-02, 2.232e-01, 2.647e-01, 3.922e-01, 5.387e-02, 5.841e-01, -1.264e-01, -1.440e-01));
r += mul(s0_2, M4(-1.944e-02, -7.262e-02, 9.583e-02, 3.448e-02, 4.402e-02, 5.319e-02, -2.384e-02, 4.652e-02, 6.280e-02, -4.195e-02, 1.573e-02, 7.059e-02, 1.029e-01, -1.784e-02, -3.735e-02, -4.952e-02));
r += mul(s0_3, M4(7.393e-02, -1.825e-01, -2.983e-01, -5.798e-02, -2.475e-01, -4.958e-02, 6.660e-01, -2.202e-01, -9.158e-02, 4.280e-04, 2.472e-01, -2.979e-01, -9.887e-02, 6.188e-02, 2.163e-01, -9.358e-03));
r += mul(s0_4, M4(-8.664e-01, 2.357e-01, 3.390e-01, -5.275e-01, -2.213e-01, -4.992e-01, 5.479e-01, 4.245e-01, -7.542e-02, 4.854e-01, -3.525e-01, 3.950e-01, 3.619e-01, -3.968e-01, -3.447e-01, 5.089e-01));
r += mul(s0_5, M4(-9.239e-02, -6.370e-01, -7.252e-02, -3.435e-01, -1.057e-01, 1.616e-01, -4.413e-02, 1.824e-01, 2.001e-02, -1.343e-01, -5.730e-02, 7.302e-02, -2.361e-02, -9.044e-02, -1.041e-01, 2.971e-01));
r += mul(s0_6, M4(-2.803e-02, -8.707e-02, -1.407e-01, -2.685e-02, 1.099e-01, 1.721e-01, 1.612e-01, 6.962e-02, -1.659e-02, 7.845e-02, 2.165e-01, -7.067e-02, 1.666e-02, 7.051e-02, 6.373e-02, 4.391e-02));
r += mul(s0_7, M4(-1.560e-01, -2.698e-02, -5.684e-01, -1.184e-01, 7.742e-01, -1.023e-03, -8.177e-02, 2.857e-01, 2.253e-02, -1.400e-02, -6.523e-02, 7.644e-02, 1.789e-01, -8.433e-03, 1.041e-01, 7.009e-02));
r += mul(s0_8, M4(-1.491e-01, -2.037e-01, -2.499e-01, -7.730e-02, 1.051e-01, -1.718e-02, -1.762e-01, 4.808e-02, -3.068e-03, 1.737e-02, -3.772e-04, 4.732e-02, 7.205e-02, 7.901e-02, -1.759e-02, 8.476e-02));
r += mul(s1_0, M4(4.810e-02, -1.822e-02, -1.150e-01, -1.679e-02, -5.481e-02, -7.544e-02, 2.213e-01, 2.615e-02, -2.628e-03, -1.482e-01, -5.570e-02, 5.137e-02, -1.381e-02, -1.878e-03, -3.132e-02, -3.309e-02));
r += mul(s1_1, M4(1.101e-01, 1.003e-01, -4.307e-01, -2.520e-02, 1.138e-02, -1.966e-01, 6.664e-02, 1.114e-01, -1.431e-01, 3.634e-01, 4.274e-02, -8.279e-02, -5.291e-02, 3.540e-01, 8.995e-02, -1.401e-01));
r += mul(s1_2, M4(7.230e-02, 4.684e-01, -6.542e-02, -2.792e-01, 2.936e-02, 3.476e-03, -1.024e-02, 1.880e-01, 1.898e-02, 2.529e-02, 8.537e-03, -6.073e-03, 1.025e-01, -2.320e-01, -1.804e-02, 5.471e-02));
r += mul(s1_3, M4(-9.258e-03, -7.731e-03, 4.285e-02, -4.725e-02, -3.878e-02, -1.749e-02, -1.681e-02, -1.020e-01, -3.975e-02, 1.609e-02, 8.299e-02, -1.824e-01, -2.500e-02, 3.516e-02, 8.591e-02, 1.714e-02));
r += mul(s1_4, M4(-2.210e-01, 1.534e-01, 3.410e-01, -2.552e-01, -5.090e-02, 1.582e-02, 1.802e-01, -1.333e-01, -5.371e-01, 3.751e-01, -1.323e-01, 3.018e-01, 1.756e-01, -9.756e-02, -4.873e-01, 4.985e-01));
r += mul(s1_5, M4(-1.073e-02, 2.919e-01, -2.025e-01, 3.240e-01, 4.318e-02, -1.972e-02, -1.612e-01, 3.528e-01, -6.472e-02, -6.212e-02, 3.146e-02, 6.391e-02, 4.950e-02, -6.270e-01, -1.985e-02, 4.680e-02));
r += mul(s1_6, M4(-2.215e-02, 1.836e-02, 5.021e-02, -3.016e-02, -7.854e-03, 1.135e-02, 3.407e-02, -2.923e-02, -5.384e-03, 6.570e-02, 2.437e-01, -8.712e-02, 2.275e-02, -2.291e-03, -7.378e-02, 5.231e-02));
r += mul(s1_7, M4(-4.186e-02, 6.944e-02, 8.353e-02, -1.927e-02, 3.937e-02, 2.105e-02, 7.152e-02, 5.635e-03, 1.114e-01, -3.772e-02, -1.853e-01, 6.636e-02, 4.654e-02, -1.008e-01, -1.625e-01, 7.888e-02));
r += mul(s1_8, M4(5.288e-02, -5.516e-02, -4.014e-02, 8.854e-02, 2.434e-02, 9.192e-02, -1.203e-02, 6.813e-02, 4.626e-02, -4.892e-02, 4.700e-03, 7.578e-02, -5.040e-02, 3.497e-02, 3.176e-02, -9.741e-02));
// Constant per-component offset, added after all taps.
r += V4(2.671e-03, -5.536e-03, -4.013e-03, 4.378e-03);
return r;
}
// Pass 8 entry point: reads the 3x3 neighbourhood of the current pixel from t0 through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t1.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass8(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t1[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0
// l0(x, y): tap of t1 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t1, float2(x, y)))
// Pass 9 ("conv8") kernel: returns the sum of the 18 products mul(s*_k, M4(...)) below
// plus a constant V4 offset.
//   s0_0..s0_8 - as passed by Pass9: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass9: -max(-tap, 0) for the same taps
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(8.283e-02, 5.262e-02, 1.580e-02, 4.991e-02, 6.836e-02, -3.234e-02, 5.630e-02, 1.275e-01, 5.398e-03, 9.866e-04, -1.054e-02, 1.601e-02, 1.546e-02, -7.786e-02, -2.630e-02, -3.023e-02));
r += mul(s0_1, M4(9.285e-02, 3.403e-01, -4.572e-02, 1.431e-01, 2.876e-01, -3.271e-01, -8.133e-04, 5.998e-01, 4.515e-02, 9.836e-02, 2.315e-02, 1.724e-01, -8.080e-02, -1.978e-01, -5.366e-02, -4.535e-02));
r += mul(s0_2, M4(1.708e-02, -8.374e-02, -1.831e-02, 1.744e-02, 4.902e-02, -1.037e-02, -3.508e-02, 3.501e-02, 1.160e-01, 2.529e-01, 4.235e-02, 4.233e-02, -5.953e-03, -1.398e-01, -8.815e-03, 1.053e-02));
r += mul(s0_3, M4(-2.836e-03, -2.496e-01, 2.703e-02, 9.490e-02, 3.985e-01, -9.458e-02, 1.355e-01, 5.917e-01, 5.597e-03, -8.963e-02, 5.238e-02, 4.360e-02, -1.070e-01, 7.593e-02, 6.376e-02, -1.498e-01));
r += mul(s0_4, M4(3.214e-01, -8.045e-01, 6.621e-01, -1.261e-01, -1.487e+00, 1.086e+00, 3.779e-01, -1.762e+00, 2.721e-01, -3.815e-02, -1.450e-01, 4.063e-01, 2.804e-01, 3.876e-01, 2.607e-01, 2.174e-01));
r += mul(s0_5, M4(-3.896e-01, 3.340e-01, -2.529e-01, -6.519e-02, -1.815e-01, 5.542e-02, -1.669e-01, 1.732e-02, 2.995e-01, 4.942e-02, 6.557e-02, -1.386e-01, -1.392e-01, 2.822e-01, 2.016e-02, -1.313e-01));
r += mul(s0_6, M4(-2.130e-02, 4.137e-02, 7.324e-02, 4.834e-03, 9.333e-02, -2.998e-01, 4.229e-01, 9.535e-02, -2.595e-02, 2.955e-02, 7.491e-02, -3.028e-02, -2.850e-02, 1.582e-02, -1.076e-01, -3.159e-02));
r += mul(s0_7, M4(-3.601e-02, 5.993e-02, -1.190e-02, -6.800e-02, 6.894e-03, -2.095e-01, -9.548e-02, -2.539e-02, -2.390e-02, 2.947e-02, 1.581e-01, -5.305e-03, 1.029e-01, -1.456e-01, -3.526e-02, 9.251e-02));
r += mul(s0_8, M4(-7.206e-02, 9.690e-02, -4.464e-02, -6.999e-03, 3.140e-02, -4.201e-02, -6.364e-03, 5.280e-03, -1.412e-01, 1.696e-01, -1.274e-01, -9.546e-02, 5.285e-02, -1.072e-01, 5.994e-02, 1.293e-02));
r += mul(s1_0, M4(-1.808e-02, 1.243e-01, -6.814e-02, -4.219e-03, 1.273e-02, 2.752e-02, 3.764e-02, 3.650e-02, 7.663e-04, 6.843e-03, 1.380e-02, -3.235e-02, 5.400e-02, -5.352e-02, 1.190e-02, -1.028e-01));
r += mul(s1_1, M4(2.568e-01, 2.764e-01, 7.740e-02, 1.273e-01, 7.059e-02, 6.668e-02, 4.211e-02, 6.293e-02, -4.164e-02, 2.210e-01, -1.293e-02, 8.369e-02, 2.046e-01, 1.238e-01, 9.491e-02, 4.614e-02));
r += mul(s1_2, M4(-2.387e-02, 3.174e-01, 8.165e-02, -6.680e-02, -1.516e-02, 1.482e-02, -1.342e-02, 1.692e-02, -2.288e-02, -6.891e-02, -5.559e-02, 4.771e-02, 3.290e-02, 1.234e-01, 4.334e-02, -5.106e-02));
r += mul(s1_3, M4(6.216e-02, -2.114e-01, -1.616e-01, 1.664e-01, 3.796e-02, 6.036e-02, -1.106e-01, 1.398e-01, -3.139e-02, -6.274e-02, 4.988e-02, -6.274e-02, 2.296e-02, -5.131e-02, 5.052e-02, -8.866e-02));
r += mul(s1_4, M4(2.647e-01, -7.858e-01, 1.597e-01, -8.262e-01, -3.213e-01, 2.427e-01, 1.686e-01, -4.251e-01, 1.505e-01, 3.244e-02, 1.023e-01, 1.962e-01, -1.116e-01, 3.525e-01, 8.848e-01, -1.945e-01));
r += mul(s1_5, M4(-2.549e-01, -1.429e-01, -3.696e-02, 3.042e-01, -1.256e-01, 2.760e-02, -3.650e-02, 7.985e-02, -1.958e-01, 3.076e-01, -9.253e-02, -8.512e-02, -1.708e-01, -3.422e-04, -8.181e-02, 2.319e-01));
r += mul(s1_6, M4(-3.382e-02, 6.627e-02, 1.158e-01, -3.044e-02, -7.983e-03, -7.855e-02, 1.729e-02, 3.219e-04, -1.764e-02, 4.065e-02, -1.400e-02, -2.387e-02, 2.673e-03, 5.460e-03, -4.992e-02, -1.573e-02));
r += mul(s1_7, M4(-2.505e-02, 1.763e-01, -4.433e-01, -1.024e-01, 1.391e-01, -2.435e-01, -5.358e-02, 5.203e-02, 3.157e-02, 2.012e-02, 7.424e-03, 3.723e-02, -2.388e-02, 7.204e-02, -4.522e-01, -1.187e-02));
r += mul(s1_8, M4(9.737e-02, 7.067e-02, 4.072e-02, 4.303e-02, 2.890e-02, -1.810e-02, 5.156e-03, -1.953e-02, -3.503e-02, 7.492e-02, 1.402e-02, -9.796e-03, 2.320e-01, -2.135e-01, 1.462e-01, 1.194e-01));
// Constant per-component offset, added after all taps.
r += V4(-5.006e-05, -2.252e-04, -1.752e-03, 4.586e-04);
return r;
}
// Pass 9 entry point: reads the 3x3 neighbourhood of the current pixel from t1 through l0(),
// feeds the clamped halves of each tap to f0 and stores the result in t0.
//   blockStart - origin of the block handled by this thread group
//   tid        - thread id; tid.x is mapped to a 2D offset inside the block by Rmp8x8
// NOTE(review): `pt` and `pos` are never named below; presumably the O() macro behind
// l0() reads them, so their names must stay as they are - confirm against O's definition.
void Pass9(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	const uint2 gxy = blockStart + Rmp8x8(tid.x);
	const uint2 size = GetInputSize();
	// Nothing to do for threads that land outside the input.
	if (any(gxy >= size)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	t0[gxy] = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));
}
//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT
// l0(x, y): tap of t0 at offset (x, y), converted to V4. O() is defined elsewhere in this file
// (presumably it samples relative to the caller's `pos`, in units of `pt` - confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
// Pass 10 ("out-shuffle") kernel: sums the 18 products mul(s*_k, M4(...)) below, adds a
// constant V4 offset and returns tanh() of the total, so every component lies in [-1, 1].
//   s0_0..s0_8 - as passed by Pass10: max(tap, 0) for the nine 3x3 taps, row-major from (-1,-1)
//   s1_0..s1_8 - as passed by Pass10: -max(-tap, 0) for the same taps
// Pass10 uses the four returned components as luma offsets for a 2x2 block of output pixels.
// The literals look machine-generated (presumably trained weights - confirm); do not
// reorder the accumulation, float addition is order-sensitive.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
V4 r = 0.0;
r += mul(s0_0, M4(2.670e-02, -1.964e-03, 2.191e-02, 3.109e-02, 1.911e-02, -2.017e-02, -2.948e-02, -2.237e-02, -3.845e-02, -7.954e-03, -3.472e-02, -2.253e-02, -1.571e-02, -6.613e-03, -1.489e-02, -2.647e-02));
r += mul(s0_1, M4(-6.714e-02, -2.106e-02, 7.577e-03, 1.788e-02, 8.081e-02, 8.813e-02, -5.510e-02, -2.724e-02, 1.150e-01, 5.284e-02, -8.964e-02, -3.024e-02, 5.215e-02, 5.334e-02, -1.180e-02, 6.927e-03));
r += mul(s0_2, M4(1.036e-02, 1.826e-02, -8.095e-03, -9.967e-03, 1.368e-03, 3.479e-02, -1.887e-03, -2.161e-02, -3.464e-02, -1.124e-01, -4.623e-03, -5.295e-03, -7.199e-03, -4.285e-02, 8.862e-03, -1.610e-02));
r += mul(s0_3, M4(2.388e-01, -1.001e-03, 1.699e-01, -4.519e-02, -3.274e-01, 1.550e-01, 3.748e-02, 3.435e-02, -1.655e-01, 1.227e-02, -1.372e-01, 4.700e-02, -1.636e-01, 1.222e-02, -1.323e-01, 3.239e-02));
r += mul(s0_4, M4(1.698e-01, 4.561e-01, -1.355e-01, 1.831e-01, -3.815e-01, -7.832e-01, 1.738e-01, 4.516e-02, 2.803e-01, -4.239e-01, 8.945e-01, -1.339e-02, -3.701e-01, -3.731e-01, 1.765e-01, -1.343e-01));
r += mul(s0_5, M4(-4.653e-02, -8.470e-02, -1.076e-03, -7.153e-02, 1.022e-02, -2.560e-02, -1.154e-02, 2.252e-02, -1.053e-01, 4.014e-01, -1.479e-01, 3.667e-01, 9.425e-02, -8.079e-02, 5.594e-03, 4.870e-02));
r += mul(s0_6, M4(-6.274e-02, -3.430e-02, -5.955e-02, 1.220e-02, -6.075e-02, 1.284e-02, -8.384e-02, 2.143e-01, -2.050e-02, -8.887e-03, -1.445e-02, 1.797e-02, 1.436e-01, -8.067e-04, 1.013e-01, 3.847e-03));
r += mul(s0_7, M4(6.862e-02, -7.230e-02, -2.461e-01, -3.760e-01, 4.038e-02, -2.634e-02, -2.725e-01, -4.389e-01, 9.088e-03, -1.873e-02, -9.497e-02, -1.860e-01, -1.038e-01, 2.502e-01, -6.194e-01, 4.470e-02));
r += mul(s0_8, M4(-1.984e-02, 4.173e-02, 5.328e-02, 5.554e-02, 1.241e-03, -2.290e-03, 5.972e-02, 4.381e-02, -3.320e-03, -1.434e-04, -5.754e-02, -6.072e-02, -6.854e-03, 6.781e-02, 1.208e-01, -5.469e-02));
r += mul(s1_0, M4(7.050e-02, -3.676e-02, 7.009e-03, 1.431e-02, -1.258e-02, -6.854e-03, -9.803e-04, 5.955e-03, -3.077e-03, -2.372e-02, 8.060e-03, -5.992e-02, -7.957e-02, 2.905e-02, 3.914e-04, -1.408e-02));
r += mul(s1_1, M4(-1.068e-01, 4.589e-02, -1.399e-02, -8.157e-03, 1.811e-02, 7.241e-03, 9.447e-03, 3.242e-03, 5.152e-02, 8.667e-02, -2.512e-02, -2.978e-02, 1.382e-01, 5.481e-02, -2.199e-02, -2.739e-02));
r += mul(s1_2, M4(3.676e-02, 1.705e-02, -4.520e-03, -6.449e-03, 1.006e-02, 9.807e-03, -6.046e-03, -1.299e-03, -5.035e-02, -4.415e-02, 9.619e-03, -1.059e-02, -6.952e-03, -1.803e-02, -4.042e-03, -1.751e-02));
r += mul(s1_3, M4(5.123e-02, 4.500e-02, 2.099e-01, -7.254e-03, -7.977e-02, 2.822e-02, -1.546e-01, -3.748e-02, -2.378e-01, -1.836e-02, -3.508e-02, -2.147e-03, 3.371e-02, -4.720e-02, -5.574e-02, -1.592e-02));
r += mul(s1_4, M4(-5.764e-01, 5.998e-01, -2.288e-01, 7.223e-01, -1.855e-01, -3.467e-01, 5.173e-02, -8.967e-02, 3.308e-01, -8.987e-02, 2.397e-01, 3.701e-01, -7.970e-02, -9.046e-01, 2.397e-01, -1.626e-01));
r += mul(s1_5, M4(1.177e-02, -1.538e-01, 4.138e-02, -5.198e-02, 3.165e-03, 3.827e-02, -5.913e-03, 8.727e-03, 7.885e-02, 2.979e-01, -6.160e-02, 1.198e-01, 1.186e-02, 9.421e-02, -4.101e-02, 4.185e-03));
r += mul(s1_6, M4(-7.690e-02, -4.820e-03, -1.106e-01, 4.040e-02, -6.883e-02, -3.284e-02, 1.259e-02, 1.509e-01, 6.378e-03, -5.293e-04, -3.690e-02, 6.274e-02, 1.401e-01, -3.801e-03, 1.489e-01, -1.044e-02));
r += mul(s1_7, M4(1.140e-01, -1.333e-01, -1.739e-01, -1.739e-01, 4.736e-02, -1.306e-02, -3.673e-01, -6.127e-01, -3.477e-02, -6.090e-02, 2.430e-02, -2.666e-01, -6.599e-02, 2.794e-01, -1.724e-01, -2.744e-01));
r += mul(s1_8, M4(1.045e-02, 6.106e-02, 3.463e-02, 6.708e-02, -1.028e-02, -2.277e-02, 6.536e-02, 8.227e-02, -5.566e-02, -3.941e-02, -6.862e-03, -1.219e-02, -1.438e-02, -4.651e-02, 5.359e-02, 4.650e-02));
// Constant per-component offset, added after all taps.
r += V4(-1.731e-03, -2.098e-03, -1.131e-03, -1.644e-03);
// Unlike the earlier passes, the sum is passed through tanh before being returned.
return tanh(r);
}
// Pass 10 entry point: evaluates f0 once per input pixel and writes a 2x2 block of OUTPUT.
// For each of the four output pixels, INPUT is sampled at that pixel's centre, converted with
// rgb2yuv, one component of f0's result is added to the first (luma) channel and saturated,
// and the value is converted back with yuv2rgb. The other two channels pass through unchanged.
//   blockStart - origin of the block handled by this thread group (output coordinates)
//   tid        - thread id; tid.x is mapped to a 2D offset by Rmp8x8, then doubled
// NOTE(review): `pt` is never named below and `pos` only after the taps; presumably the O()
// macro behind l0() reads both, so their names must stay as they are - confirm.
void Pass10(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	// Top-left output pixel of this thread's 2x2 block.
	const uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		return;
	}
	// Centre of the source pixel that covers the block, in input texture coordinates.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw taps, row by row, starting at the top-left neighbour.
	const V4 tap0 = l0(-1.0, -1.0);
	const V4 tap1 = l0(0.0, -1.0);
	const V4 tap2 = l0(1.0, -1.0);
	const V4 tap3 = l0(-1.0, 0.0);
	const V4 tap4 = l0(0.0, 0.0);
	const V4 tap5 = l0(1.0, 0.0);
	const V4 tap6 = l0(-1.0, 1.0);
	const V4 tap7 = l0(0.0, 1.0);
	const V4 tap8 = l0(1.0, 1.0);

	// First nine arguments: max(tap, 0). Last nine: -max(-tap, 0).
	const V4 r = f0(
		max(tap0, 0.0), max(tap1, 0.0), max(tap2, 0.0),
		max(tap3, 0.0), max(tap4, 0.0), max(tap5, 0.0),
		max(tap6, 0.0), max(tap7, 0.0), max(tap8, 0.0),
		-max(-tap0, 0.0), -max(-tap1, 0.0), -max(-tap2, 0.0),
		-max(-tap3, 0.0), -max(-tap4, 0.0), -max(-tap5, 0.0),
		-max(-tap6, 0.0), -max(-tap7, 0.0), -max(-tap8, 0.0));

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};

	const float2 opt = float2(GetOutputPt());
	// Sampling position; it is stepped around the 2x2 block with the same additions and
	// subtractions, in the same order, as the walk over the output pixels.
	float2 uv = pos - 0.5f * opt;

	// (0, 0) <- r.x
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);

	// (1, 0) <- r.y
	uv.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy + uint2(1, 0)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);

	// (1, 1) <- r.w
	uv.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy + uint2(1, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);

	// (0, 1) <- r.z
	uv.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, uv, 0).rgb);
	OUTPUT[gxy + uint2(0, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -2,9 +2,7 @@
// Ported from https://github.com/haasn/gentoo-conf/blob/xor/home/nand/.mpv/shaders/deband.glsl
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
//!LABEL Threshold
@ -54,6 +52,11 @@ float grain;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
@ -66,6 +69,7 @@ SamplerState sam1;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post
// Floored modulo of x by 289: subtracts the largest multiple of 289 that does not exceed x.
float mod289(float x) {
	const float period = 289.0;
	return x - floor(x / period) * period;
}

View file

@ -29,36 +29,24 @@
<CopyFileToFolders Include="ACNet.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Bicubic.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Jinc.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Lanczos.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Bilinear.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Nearest.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="SSimDownscaler.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="Bicubic.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="ImageAdjustment.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="SMAA\SMAA.hlsli">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="SMAA\SMAA_High.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -71,20 +59,14 @@
<CopyFileToFolders Include="SMAA\SMAA_Ultra.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="FSR\FSR_EASU.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="FSR\FSR_RCAS.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="SMAA\AreaTex.dds" />
<CopyFileToFolders Include="SMAA\SearchTex.dds" />
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="Anime4K\Anime4K_3D_AA_Upscale_US.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -160,8 +142,6 @@
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_VL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="CRT\CRT_Easymode.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -177,8 +157,6 @@
<CopyFileToFolders Include="CRT\GTU_v050.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="xBRZ\xBRZ_2x.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -197,13 +175,9 @@
<CopyFileToFolders Include="xBRZ\xBRZ_Freescale.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="FXAA\FXAA.hlsli">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="FXAA\FXAA_High.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -213,56 +187,153 @@
<CopyFileToFolders Include="FXAA\FXAA_Ultra.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="RAVU\prescalers.hlsli">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R2_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R3_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R4_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R2_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R3_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R4_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3_Weights.dds" />
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_Weights.dds" />
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_RGB.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_3x_lut2_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_3x_lut3_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_3x_lut4_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lite_lut2_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lite_lut3_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lite_lut4_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lut2_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lut3_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_lut4_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_ar_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_ar_f16.dds" />
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_f16.dds" />
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x6.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x6.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x6.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x6.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x4.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x6.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="NNEDI3\prescalers.hlsli">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NIS\NIS.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="NIS\NVSharpen.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="NIS\Coef_Scale.dds" />
<CopyFileToFolders Include="NIS\Coef_USM.dds" />
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="CAS\CAS.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CAS\CAS_Scaling.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="FSRCNNX\FSRCNNX.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="FSRCNNX\FSRCNNX_LineArt.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="Sharpen\AdaptiveSharpen.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -275,8 +346,6 @@
<CopyFileToFolders Include="Sharpen\LumaSharpen.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="Pixel Art\MMPX.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
@ -286,11 +355,81 @@
<CopyFileToFolders Include="Pixel Art\SharpBilinear.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<ItemGroup>
<CopyFileToFolders Include="Deband.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Bilinear.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Nearest.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x3_L.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
<FileType>Document</FileType>
</CopyFileToFolders>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
</Project>

View file

@ -2,11 +2,8 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CopyFileToFolders Include="ACNet.hlsl" />
<CopyFileToFolders Include="Bicubic.hlsl" />
<CopyFileToFolders Include="Jinc.hlsl" />
<CopyFileToFolders Include="Lanczos.hlsl" />
<CopyFileToFolders Include="Bilinear.hlsl" />
<CopyFileToFolders Include="Nearest.hlsl" />
<CopyFileToFolders Include="SSimDownscaler.hlsl" />
<CopyFileToFolders Include="ImageAdjustment.hlsl" />
<CopyFileToFolders Include="SMAA\SMAA.hlsli">
@ -59,7 +56,7 @@
</CopyFileToFolders>
<CopyFileToFolders Include="Anime4K\Anime4K_Restore_Soft_UL.hlsl">
<Filter>Anime4K</Filter>
</CopyFileToFolders>
</CopyFileToFolders>
<CopyFileToFolders Include="Anime4K\Anime4K_Restore_Soft_VL.hlsl">
<Filter>Anime4K</Filter>
</CopyFileToFolders>
@ -156,24 +153,159 @@
<CopyFileToFolders Include="FXAA\FXAA.hlsli">
<Filter>FXAA</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\prescalers.hlsli">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_3x_lut2_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_3x_lut3_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_3x_lut4_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R2_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R3_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R4_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_3x_R4.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R4.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lite_lut2_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lite_lut3_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lite_lut4_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3_Weights.dds">
<CopyFileToFolders Include="RAVU\RAVU_Lite_R4.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lut2_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lut3_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_lut4_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R2_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R3_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R4_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_R4.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_ar_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_ar_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_f16.dds">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_Weights.dds">
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_RGB.hlsl">
<Filter>RAVU</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x4.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x6.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x4.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x6.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x4.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x6.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x4.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x6.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x4.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x6.hlsl">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NNEDI3\prescalers.hlsli">
<Filter>NNEDI3</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="NIS\NIS.hlsl">
<Filter>NIS</Filter>
</CopyFileToFolders>
@ -220,6 +352,75 @@
<Filter>Pixel Art</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="Deband.hlsl" />
<CopyFileToFolders Include="Nearest.hlsl" />
<CopyFileToFolders Include="Bilinear.hlsl" />
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x3_L.hlsl">
<Filter>Anime4K</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
<Filter>Anime4K</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
<Filter>CuNNy</Filter>
</CopyFileToFolders>
<CopyFileToFolders Include="Bicubic.hlsl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Anime4K">
@ -261,5 +462,8 @@
<Filter Include="Pixel Art">
<UniqueIdentifier>{0b58f073-84cb-4c38-919d-80176ae408bc}</UniqueIdentifier>
</Filter>
<Filter Include="CuNNy">
<UniqueIdentifier>{9157745b-aa96-42ce-bdc6-1230dffa326b}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View file

@ -2,11 +2,13 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
@ -15,6 +17,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -228,12 +231,13 @@ float3 FsrEasuF(uint2 pos, float4 con0, float4 con1, float4 con2, float2 con3) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = blockStart + Rmp8x8(threadId.x);
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
uint2 inputSize = GetInputSize();
uint2 outputSize = GetOutputSize();
float2 inputPt = GetInputPt();
float4 con0, con1, con2;
@ -271,20 +275,20 @@ void Pass1(uint2 blockStart, uint3 threadId) {
con3[0] = 0;
con3[1] = 4.0f * inputPt.y;
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
gxy.x += 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
}
gxy.y += 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
}
gxy.x -= 8u;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
}
}

View file

@ -2,9 +2,7 @@
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
@ -18,12 +16,19 @@ float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -108,7 +113,9 @@ float3 FsrRcasF(float3 b, float3 d, float3 e, float3 f, float3 h) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -126,20 +133,20 @@ void Pass1(uint2 blockStart, uint3 threadId) {
src[3][1] = INPUT.Load(int3(gxy.x + 2, gxy.y, 0)).rgb;
src[3][2] = INPUT.Load(int3(gxy.x + 2, gxy.y + 1, 0)).rgb;
WriteToOutput(gxy, FsrRcasF(src[1][0], src[0][1], src[1][1], src[2][1], src[1][2]));
OUTPUT[gxy] = float4(FsrRcasF(src[1][0], src[0][1], src[1][1], src[2][1], src[1][2]), 1);
++gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrRcasF(src[2][0], src[1][1], src[2][1], src[3][1], src[2][2]));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrRcasF(src[2][0], src[1][1], src[2][1], src[3][1], src[2][2]), 1);
}
++gxy.y;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrRcasF(src[2][1], src[1][2], src[2][2], src[3][2], src[2][3]));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrRcasF(src[2][1], src[1][2], src[2][2], src[3][2], src[2][3]), 1);
}
--gxy.x;
if (CheckViewport(gxy)) {
WriteToOutput(gxy, FsrRcasF(src[1][1], src[0][2], src[1][2], src[2][2], src[1][3]));
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
OUTPUT[gxy] = float4(FsrRcasF(src[1][1], src[0][2], src[1][2], src[2][2], src[1][3]), 1);
}
}

View file

@ -3,14 +3,17 @@
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -534,6 +537,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
//!PASS 6
//!DESC sub-pixel convolution, aggregation
//!IN tex3, tex4, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -551,7 +555,8 @@ const static float3x3 yuv2rgb = {
void Pass6(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -609,15 +614,9 @@ void Pass6(uint2 blockStart, uint3 threadId) {
for (uint j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(destPos)) {
continue;
}
}
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
const uint index = i * 2 + j;
WriteToOutput(destPos, mul(yuv2rgb, float3(result[index], originUV)));
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(result[index], originUV)), 1);
}
}
}

View file

@ -2,14 +2,17 @@
// 移植自 https://github.com/igv/FSRCNN-TensorFlow
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
@ -531,6 +534,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
//!PASS 6
//!DESC sub-pixel convolution, aggregation
//!IN tex3, tex4, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -548,7 +552,8 @@ const static float3x3 yuv2rgb = {
void Pass6(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -606,15 +611,9 @@ void Pass6(uint2 blockStart, uint3 threadId) {
for (uint j = 0; j <= 1; ++j) {
const uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(destPos)) {
continue;
}
}
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
const uint index = i * 2 + j;
WriteToOutput(destPos, mul(yuv2rgb, float3(result[index], originUV)));
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(result[index], originUV)), 1);
}
}
}

View file

@ -1,15 +1,18 @@
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME FXAA_1
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
@ -17,6 +20,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -26,7 +30,9 @@ SamplerState sam;
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
[unroll]
for (j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(gxy)) {
return;
}
}
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
}
}
}

View file

@ -1,15 +1,18 @@
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME FXAA_0
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
@ -17,6 +20,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -26,7 +30,9 @@ SamplerState sam;
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
[unroll]
for (j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(gxy)) {
return;
}
}
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
}
}
}

View file

@ -1,15 +1,18 @@
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!SORT_NAME FXAA_2
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam;
@ -17,6 +20,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
@ -26,7 +30,9 @@ SamplerState sam;
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
[unroll]
for (j = 0; j <= 1; ++j) {
uint2 destPos = gxy + uint2(i, j);
if (i != 0 || j != 0) {
if (!CheckViewport(gxy)) {
return;
}
}
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
}
}
}

View file

@ -1,9 +1,7 @@
// 移植自 https://github.com/libretro/slang-shaders/blob/3f67e1870dbd5be74ae2f09eaed0eeadce6abd15/misc/image-adjustment.slang
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!PARAMETER
@ -89,6 +87,11 @@ float b;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -97,6 +100,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
float3 RGBtoHSV(float3 c) {
float4 K = float4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0);

View file

@ -10,8 +10,7 @@
// B = 0.825 to get rid of dithering. Increase B to get a fine sharpness, though dithering returns.
//!MAGPIE EFFECT
//!VERSION 3
//!GENERIC_DOWNSCALER
//!VERSION 4
//!PARAMETER
@ -41,6 +40,9 @@ float ARStrength;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -48,6 +50,7 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 8
//!NUM_THREADS 64
@ -70,7 +73,9 @@ float4 resampler(float4 x, float wa, float wb) {
void Pass1(uint2 blockStart, uint3 threadId) {
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
if (!CheckViewport(gxy)) {
const uint2 outputSize = GetOutputSize();
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
return;
}
@ -126,5 +131,5 @@ void Pass1(uint2 blockStart, uint3 threadId) {
color = lerp(color, clamp(color, min_sample, max_sample), ARStrength);
// final sum and weight normalization
WriteToOutput(gxy, color);
OUTPUT[gxy] = float4(color, 1);
}

View file

@ -2,8 +2,7 @@
// 移植自 https://github.com/libretro/common-shaders/blob/master/windowed/shaders/lanczos6.cg
//!MAGPIE EFFECT
//!VERSION 3
//!GENERIC_DOWNSCALER
//!VERSION 4
//!PARAMETER
@ -17,6 +16,9 @@ float ARStrength;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -25,6 +27,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
#define FIX(c) max(abs(c), 1e-5)
#define PI 3.14159265359

View file

@ -1,7 +1,7 @@
// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!PARAMETER
@ -15,6 +15,9 @@ float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!TEXTURE
//!SOURCE Coef_Scale.dds
//!FORMAT R16G16B16A16_FLOAT
@ -32,6 +35,7 @@ SamplerState samplerLinearClamp;
//!PASS 1
//!IN INPUT, coef_scaler, coef_usm
//!OUT OUTPUT
//!BLOCK_SIZE 32,32
//!NUM_THREADS 256
@ -431,12 +435,18 @@ void Pass1(uint2 blockStart, uint3 threadId) {
// discretized phase
const int fx_int = int(fx * kPhaseCount);
const uint2 outputSize = GetOutputSize();
if (dstX >= outputSize.x) {
return;
}
for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
// y coord inside the output image
const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
if (!CheckViewport(int2(dstX, dstY))) {
if (dstY >= outputSize.y) {
return;
}
// y coord inside the input image
const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
@ -487,13 +497,13 @@ void Pass1(uint2 blockStart, uint3 threadId) {
// do bilinear tap for chroma upscaling
float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;
float4 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0);
const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
op.x += corr;
op.y += corr;
op.z += corr;
WriteToOutput(uint2(dstX, dstY), op);
OUTPUT[uint2(dstX, dstY)] = op;
}
}

View file

@ -1,18 +1,9 @@
// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState samplerLinearClamp;
//!PARAMETER
//!LABEL Sharpness
//!DEFAULT 0.5
@ -21,9 +12,22 @@ SamplerState samplerLinearClamp;
//!STEP 0.01
float sharpness;
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState samplerLinearClamp;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 32, 32
//!NUM_THREADS 256
@ -208,6 +212,8 @@ void Pass1(uint2 blockStart, uint3 threadId) {
GroupMemoryBarrierWithGroupSync();
const int2 outputSize = (int2)GetOutputSize();
for (int k = int(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE) {
const int2 pos = int2(uint(k) % uint(NIS_BLOCK_WIDTH), uint(k) / uint(NIS_BLOCK_WIDTH));
@ -215,7 +221,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
const int dstX = dstBlockX + pos.x;
const int dstY = dstBlockY + pos.y;
if (!CheckViewport(int2(dstX, dstY))) {
if (dstX >= outputSize.x || dstY >= outputSize.y) {
continue;
}
@ -238,9 +244,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
// final USM is a weighted sum filter outputs
const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);
float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kSrcNormX, (dstY + 0.5f) * kSrcNormY), 0).rgb;
float4 op = INPUT.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kSrcNormX, (dstY + 0.5f) * kSrcNormY), 0);
op += usmY;
WriteToOutput(uint2(dstX, dstY), op);
OUTPUT[uint2(dstX, dstY)] = op;
}
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,206 +1,732 @@
// nnedi3-nns16-win8x4
// 移植自 https://github.com/bjin/mpv-prescalers/blob/cc02ed95c1fe05b72bc21d41257c4c085e6e409b/compute/nnedi3-nns16-win8x4.hook
// 有半像素的偏移
// This file is generated by the scripts available at https://github.com/hauuau/magpie-prescalers
// Please don't edit this file directly.
// Generated by: nnedi3.py --nns 16 --win 8x4 --use-compute-shader --use-magpie
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!SORT_NAME NNEDI3_016_4
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
SamplerState sam_INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 1 * 2
//!HEIGHT INPUT_HEIGHT * 2 * 1
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;
SamplerState sam_INPUT_LINEAR;
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT * 2
//!FORMAT R16_FLOAT
Texture2D tex1;
//!WIDTH INPUT_WIDTH * 1
//!HEIGHT INPUT_HEIGHT * 2
Texture2D temp;
//!SAMPLER
//!FILTER POINT
SamplerState sam_temp;
//!COMMON
#include "prescalers.hlsli"
#define T(x) asfloat(x)
#define W(i,w0,w1,w2,w3) dot(samples[i],float4(T(w0),T(w1),T(w2),T(w3)))
#define WS(w0,w1) sum1 = exp(sum1 * mstd2 + T(w0)); sum2 = sum2 * mstd2 + T(w1); wsum += sum1; vsum += sum1*(sum2/(1.0+abs(sum2)))
#define LAST_PASS 2
//!PASS 1
//!DESC double_y
//!DESC NNEDI3 (double_y, nns16, win8x4)
//!IN INPUT
//!OUT tex1
//!BLOCK_SIZE 32,16
//!NUM_THREADS 32,8
float nnedi3(float4 samples[8]) {
//!OUT temp
//!BLOCK_SIZE 32, 16
//!NUM_THREADS 32, 8
#pragma optionNV(inline none)
float nnedi3(vec4 samples[8]) {
float sum = 0.0, sumsq = 0.0;
[unroll]
for (int i = 0; i < 8; i++) {
sum += dot(samples[i], 1.0f);
[unroll] for (int i = 0; i < 8; i++) {
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
sumsq += dot(samples[i], samples[i]);
}
float mstd0 = sum / 32.0;
float mstd1 = sumsq / 32.0 - mstd0 * mstd0;
// 不能使用 lerp否则结果可能为 nan
float mstd2 = mstd1 >= 1.192092896e-7 ? rsqrt(mstd1) : 0.0;
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
mstd1 *= mstd2;
float vsum = 0.0, wsum = 0.0, sum1, sum2;
#define T(x) intBitsToFloat(x)
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
#define WS(w0, w1) \
sum1 = exp(sum1 * mstd2 + T(w0)); \
sum2 = sum2 * mstd2 + T(w1); \
wsum += sum1; \
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
sum1 = W(0, -1123354974, -1112248839, 1046299686, -1143613552)
+ W(1, -1118620174, 1024662558, 1028038478, -1129268360)
+ W(2, 1016130204, -1087068557, 1063313277, -1103342192)
+ W(3, -1103968288, 1048182784, 1047279381, -1115088511)
+ W(4, -1101453425, 1059583965, -1088182320, 1003350800)
+ W(5, -1117908518, -1119323982, 1034186247, -1134684248)
+ W(6, -1122284590, 1027638054, -1124394588, -1111377363)
+ W(7, -1122818124, -1137723992, 978245507, 1028117438);
sum2 = W(0, -1162931039, -1131063526, 1029801649, -1117642655)
+ W(1, -1136248556, -1131086728, 1031011705, -1128864654)
+ W(2, -1115594515, -1128443230, 1042762789, -1107118398)
+ W(3, -1119907402, 1044675527, 1050674207, -1113986381)
+ W(4, 1022791334, -1107588397, 1009001220, -1186206458)
+ W(5, 1017500018, -1111169922, -1112569685, 1017255694)
+ W(6, -1156766128, -1125594766, -1148613464, 993928432)
+ W(7, 1014782692, -1135599628, -1114139175, 1007622876);
WS(1038828992, 1041685264);
sum1 = W(0, -1114329248, 1049950910, -1097681183, 1028668144) + W(1, 995958527, 1027336960, -1107326552, 1025858258)
+ W(2, -1117673776, 1060640651, -1085831405, 1033402064)
+ W(3, 1034401008, 1045782072, -1105157973, -1122828000)
+ W(4, 1038612842, -1098159517, 1053136924, -1110558370)
+ W(5, 1035088196, -1106507532, 1032016120, -1113173980)
+ W(6, 1008781376, -1124000392, 1023707152, 1012109856)
+ W(7, 1029875310, -1105439902, 1034119968, -1114749520);
sum2 =
W(0, 1031315360, -1099468189, -1112139926, 1036663822) + W(1, -1131767489, -1140834082, 1024287080, -1122285462)
+ W(2, 1023637252, -1100127579, -1117241706, 1038018354)
+ W(3, -1107869385, 1052854494, 1052996200, -1112496415)
+ W(4, -1107666272, 1034036134, 1027811452, -1110479054)
+ W(5, -1117110288, 1024451620, 1027157968, -1112615559) + W(6, -1124350185, 1003450083, -1131082337, 998992195)
+ W(7, -1110538107, 1041131277, 1035032776, -1106762474);
WS(-1086074680, 1053637716);
sum1 = W(0, -1121345387, 1042002951, -1113042450, -1121398619)
+ W(1, -1148805338, -1165378922, -1115297518, 991217235)
+ W(2, -1136570733, 1052460699, -1107443934, -1117268427)
+ W(3, 1049266593, -1094571489, -1098765182, 1036113926)
+ W(4, 1027081787, -1124281856, 1043313411, -1136658365)
+ W(5, -1133439181, 1040734807, 1006695533, -1112513138)
+ W(6, -1158465386, -1121708851, 1016359031, 1021173351)
+ W(7, -1120818857, 1035650578, 1027853163, -1106476275);
sum2 = W(0, 1026517575, -1170492850, -1138816415, -1143472678)
+ W(1, 1017334370, 1003954710, -1132363566, 998846550)
+ W(2, 1051558711, -1096673587, -1136175651, -1124275402)
+ W(3, 1071692777, -1077357700, -1098960792, 1018703670)
+ W(4, 1049822619, -1098179385, -1116986501, 1007812651) + W(5, 1020207734, 996694924, 1003290486, 1007766851)
+ W(6, 1022251878, -1122577241, -1141894102, 1009415395)
+ W(7, 1019995718, 1015494226, -1126828734, -1163222937);
WS(1051521136, 1027207116);
sum1 =
W(0, -1122694020, 1010830545, -1124291704, 1018062184) + W(1, -1121133108, -1124202632, 1037913146, -1116091286)
+ W(2, -1102175837, 1057246783, -1093542759, 1041281977) + W(3, -1116351908, 1026322980, 982577970, -1125394504)
+ W(4, 1045518980, -1089509425, 1055793637, 1008755233) + W(5, 1009393969, 1025178484, -1118947636, -1127575032)
+ W(6, 1008379217, -1117338572, 1001093793, 1015898776) + W(7, 1015772516, 1009646833, 1001810977, -1121163492);
sum2 = W(0, -1137495011, -1135527491, 1027730022, -1118108263)
+ W(1, 1013616911, -1123650952, 1024465134, -1128775579)
+ W(2, -1135578111, 1013443151, 1049128967, -1098008683)
+ W(3, 1029346938, -1114797945, 1068130737, -1080443718)
+ W(4, 1017473747, -1122100892, 1046423571, -1101482344)
+ W(5, 1012413655, -1128721387, -1143058109, -1137148015)
+ W(6, -1133405571, -1166794345, 1020545683, -1128178767)
+ W(7, 1008139351, -1156685818, -1126785325, 991435034);
WS(1057767608, -1132080751);
sum1 = W(0, 1026028453, 1025766741, 1035118319, 1012106581) + W(1, 1026017621, -1135552917, 1040474693, -1138611630)
+ W(2, -1117947285, 1051769667, -1111744027, 1030333189)
+ W(3, 1048679017, -1083959172, -1084413328, 1045191121)
+ W(4, 1025261389, -1120826122, 1049618505, -1122181545)
+ W(5, 1011196341, 1045191525, -1110336171, 1030480605) + W(6, 1015828970, 1028389741, 1028257397, 1027514349)
+ W(7, 1025013027, 1039505775, -1123719333, 1020294666);
sum2 = W(0, 1017587161, -1101123140, 1040188371, 988296658) + W(1, 1028118553, -1103020887, 1022642341, 1010063898)
+ W(2, 1008167722, -1099714612, 1039093756, 1026403646) + W(3, 1005112948, 1049070164, 1046164698, 1033545355)
+ W(4, -1125344655, 1032013714, -1111525569, 1002132020)
+ W(5, 1015776789, 1022049457, -1098832696, 1037334715)
+ W(6, -1148301500, 1009340114, -1115917000, -1139728254)
+ W(7, -1138850406, -1167693540, -1103378287, 1035581889);
WS(-1099372256, -1088618788);
sum1 = W(0, -1112538182, 1048693927, -1112344546, -1109099742)
+ W(1, -1113349022, 1033711782, -1129092599, -1110127398)
+ W(2, -1103996671, 1064716592, -1086749016, 1032699126)
+ W(3, 1024020908, -1143605597, 1044926535, -1121424940)
+ W(4, 1046614908, -1085173359, 1062252083, -1130166943)
+ W(5, -1111225386, 1004694493, 1040479887, -1106709441)
+ W(6, -1110537326, -1108087402, 1034104622, -1120726228)
+ W(7, -1114146165, -1138402062, 1042110371, -1106064827);
sum2 = W(0, 987083788, 1013472954, -1120418118, 979955865)
+ W(1, -1144106823, -1131186779, -1122269098, -1163904780)
+ W(2, -1120467381, -1139561796, 1038342084, -1115615181)
+ W(3, -1121977305, 1044091298, 1042996066, -1127292875)
+ W(4, -1118651341, 1038343490, -1118476220, -1123141745)
+ W(5, -1162389292, -1115306287, -1128689408, 1014320394)
+ W(6, -1152635694, -1155962630, -1132569906, -1135582470)
+ W(7, 964510307, -1117365756, -1141833923, 1008840046);
WS(1041282784, 1044242623);
sum1 = W(0, -1119885764, -1171512555, 1003864029, 1025494836)
+ W(1, -1119816052, -1121861252, 1040963149, -1113504879)
+ W(2, -1100880653, 1057266723, -1094412795, 1043843337)
+ W(3, -1113812594, 1010135439, -1118004569, -1125989575)
+ W(4, 1046531310, -1089952515, 1056310444, -1156936827)
+ W(5, 1015358999, 1031135156, -1114099002, -1122714492)
+ W(6, 1005085853, -1115226950, 1015234855, 1003362397) + W(7, 1021011107, 1003139037, 992693307, -1120612644);
sum2 =
W(0, 1005317381, -1142619324, -1126266146, 1026462555) + W(1, -1143827754, 1012902153, -1128784654, 1020893616)
+ W(2, 1019060164, -1114788024, -1094218173, 1054132458)
+ W(3, 1009279342, -1098688460, -1078812823, 1070492026)
+ W(4, 1014092605, -1120377499, -1099532818, 1048935725) + W(5, -1131000233, 1017453102, 1007638067, 1011358224)
+ W(6, 1012779564, -1139793504, -1130333980, 1015734963)
+ W(7, -1137528453, -1147729078, 1018177647, 987943782);
WS(1046635232, 1024078131);
sum1 = W(0, 1002735212, 1035063871, -1097977761, 1040314319) + W(1, 1025138813, 1034039879, -1105608655, 1035664624)
+ W(2, 1017042555, 1044122447, -1094991056, 1038536855)
+ W(3, -1132524982, -1110416695, 1051547730, -1114843703)
+ W(4, 1031803657, -1092481954, 1050188814, 1003107468) + W(5, 1033606155, -1094320024, 1047410847, 1019470987)
+ W(6, 1021596219, -1107502027, 1031346589, 1021345835)
+ W(7, 1015508823, -1103391009, 1046101811, -1136683190);
sum2 =
W(0, -1096475926, 1044036812, 1052862983, -1106234474) + W(1, -1112281069, -1112231286, 1024115789, -1121785528)
+ W(2, -1116645717, -1111398905, 1051331710, -1130292776)
+ W(3, 1041647377, -1096068583, 1038036111, 1037359643) + W(4, -1113263240, 1026411348, 1042458641, -1111704128)
+ W(5, 1023473494, -1114320784, 1028002558, -1123406807)
+ W(6, -1117017643, -1138574198, 1037890580, -1109714921)
+ W(7, 1039764966, -1104710548, -1106844581, 1041123403);
WS(-1088554040, -1076674880);
sum1 =
W(0, 1026292820, -1132973070, -1144171612, -1130131975) + W(1, 1016736263, 1034501898, -1110973538, 1028857234)
+ W(2, 1042339025, -1089525132, 1052671191, -1108906970)
+ W(3, -1110236986, 1037427962, -1123890785, -1112145786)
+ W(4, -1103961368, 1056478885, -1092344862, 1002874044) + W(5, 1016313655, -1118983748, 1041641985, 1025897228)
+ W(6, -1151588920, 1038469390, 1010979982, -1130905399)
+ W(7, 1014755782, -1123320716, 1017396903, 1033705562);
sum2 = W(0, 1013915195, -1133182691, -1127318198, 1020584890)
+ W(1, 1007730851, 1024414743, -1121307593, 1005058566) + W(2, 981970521, -1111248658, 1035588225, -1124411850)
+ W(3, 1028189234, 1040952978, 1057294107, 1029625115)
+ W(4, -1121038101, -1109339192, -1107404728, 1026110889)
+ W(5, -1142484934, -1094377458, 1024397525, 1023925523)
+ W(6, -1146368902, -1116592821, -1118541421, -1140327971)
+ W(7, 1010322539, -1112421528, 1019759378, -1199698720);
WS(1063581112, 1015292283);
sum1 =
W(0, -1123806598, -1125096044, 1046804719, -1117498166) + W(1, -1124445804, 1037634467, 1028314614, 1006823135)
+ W(2, 1036776315, -1083793455, 1064148787, -1106689849)
+ W(3, -1112186771, -1098422117, 1034155462, 1004978479)
+ W(4, -1102837698, 1058965073, -1089226130, 1033810693)
+ W(5, -1117642958, -1106625757, 1037373467, 1029436414) + W(6, -1137018200, 1036181095, 994321759, -1119765454)
+ W(7, 1010580432, -1127761788, 1021285644, 1034713459);
sum2 = W(0, -1127012521, -1110373665, -1121983257, 1021812843)
+ W(1, -1129458054, -1122115974, -1121551577, 1015201109)
+ W(2, -1134632819, -1118435057, -1107711610, 1039413537)
+ W(3, -1113739078, 1041258512, 1043546644, -1127386873)
+ W(4, -1106078947, 1025961773, 1048226293, -1110385416)
+ W(5, -1115241196, 1041055451, -1131486243, -1135801459)
+ W(6, -1122814807, 1025056413, -1139476701, -1132245806)
+ W(7, -1119046895, 1029845331, 1018415015, -1140149017);
WS(-1109010880, -1087548956);
sum1 = W(0, 1034947768, -1095012676, 1046023882, 1029737824) + W(1, 1034343312, -1102610188, 1039446704, 1025692706)
+ W(2, 1016751552, -1096454908, 1042564604, 1038373096)
+ W(3, 1019661856, -1091443170, -1105694067, 1039271048)
+ W(4, -1126501287, -1131030249, 1044246468, 1012879825)
+ W(5, 1017025648, 1042942296, -1103700296, 1041317114) + W(6, 1030724160, 1019936112, -1141422594, 1029263800)
+ W(7, -1140792121, 1024647464, -1107855416, 1041193844);
sum2 =
W(0, 1034034732, -1107522705, -1105460279, 1021740679) + W(1, -1113997103, -1121503695, 1038975878, -1112744336)
+ W(2, 1028771217, -1114143244, 1032873918, -1121564954) + W(3, 1025456143, -1105773446, 1059420344, 1024971971)
+ W(4, 1035315492, -1109746606, 1040681265, -1122379806)
+ W(5, -1102403849, -1106040358, 1046039582, -1106873869)
+ W(6, 1018212015, -1106459627, 1026290649, -1130313815)
+ W(7, -1099438501, 1039219872, 1046943722, -1105420350);
WS(-1086299832, -1077288694);
sum1 = W(0, 1021716686, -1099039878, -1111509136, 1039618828)
+ W(1, -1132921948, -1108540692, 1021468846, -1131678690)
+ W(2, -1113901292, -1158126306, -1096197083, 1041516082)
+ W(3, -1108835908, 1055092577, 1062013047, -1118733319)
+ W(4, 1023078294, -1089051407, 1050708993, -1122936235) + W(5, 965138311, -1113759276, 1022391342, 1015065790)
+ W(6, 998651320, -1107695832, -1133490396, 997649137) + W(7, -1130194922, -1113503632, 991635057, 1023538631);
sum2 = W(0, -1133976495, 1035891239, -1130801609, -1113698362)
+ W(1, 1027343155, 1030599513, -1108453664, 1016406968)
+ W(2, -1149877867, 1037590422, 1012747883, -1108226898)
+ W(3, -1119506980, 1054189655, -1119322812, -1120928356)
+ W(4, -1126385541, 1041308688, -1107379808, 1016225738)
+ W(5, 1016526837, -1112736561, -1119223720, 988482485) + W(6, 994153115, 1004824957, -1116360142, 1018050885)
+ W(7, -1140785051, -1120347934, -1129452107, -1117792638);
WS(-1113279936, 1066223903);
sum1 = W(0, -1128171420, 1040261344, -1112013315, -1123695998)
+ W(1, -1141738481, -1140107833, -1116929726, -1154978689)
+ W(2, -1138940153, 1050703688, -1108200895, -1123177006)
+ W(3, 1044160156, -1100167260, -1100730273, 1034288823)
+ W(4, 1020686276, -1130335589, 1040782300, -1141423761)
+ W(5, -1129655596, 1035637471, 1024316286, -1114187043) + W(6, 964173357, -1124525100, 1014134393, 1013984857)
+ W(7, -1123239900, 1032644739, 1029624526, -1108229911);
sum2 = W(0, -1115606620, 1021458196, 1009639320, -1131253088)
+ W(1, -1125272644, 1017345212, 1016051020, -1143902384)
+ W(2, -1099614716, 1047257730, -1120838650, 1020803060)
+ W(3, -1080575150, 1068148121, -1113655261, 1032085971)
+ W(4, -1102155153, 1044966894, -1132238288, 1016311348)
+ W(5, -1122847678, 1026244022, -1130782536, -1137376840)
+ W(6, -1123394906, 1017049220, 967940860, -1137115752)
+ W(7, -1129056732, 1010161976, 1004223696, -1136984808);
WS(1060545080, -1126581603);
sum1 =
W(0, 1032630360, -1112268976, 1045186906, -1125010622) + W(1, 1037657648, -1128752350, 1032285712, 1029508223)
+ W(2, 1043836232, -1090205186, 1053340438, -1108078856) + W(3, 1037448680, 1048595306, -1094666759, 1041691860)
+ W(4, 976149203, 1057651571, -1082657749, 1042698525) + W(5, 1031833596, 1035187792, -1092127852, 1040118132)
+ W(6, 1031675647, 1034806588, -1104761760, 1033087420) + W(7, 1025282125, 1043419290, -1096441814, 1034587656);
sum2 =
W(0, -1123698886, 1034075649, 998149095, -1113635181) + W(1, -1126365381, 1026991402, -1118780236, -1168196508)
+ W(2, -1135914762, 1019253181, 1023543366, -1114469118)
+ W(3, -1121651762, 1047572688, 1038479879, -1145545780) + W(4, -1118625490, 1035108181, -1114677625, 992781287)
+ W(5, -1122087574, -1115886918, 1011684618, -1139655050)
+ W(6, -1147908244, 1016718341, -1132109957, -1142844852)
+ W(7, -1134045690, -1117034488, -1137057610, 1007905050);
WS(-1083899832, -1105526146);
sum1 = W(0, 1026357515, -1119744955, -1117075907, -1111407198)
+ W(1, -1139718894, -1125720471, -1106102943, -1152407445)
+ W(2, 1044187583, -1092285679, 1048719011, -1107209883)
+ W(3, -1105573131, 1062437883, 1052836221, -1107292779)
+ W(4, -1104526300, 1058460257, -1089717563, -1122559055)
+ W(5, -1119529939, 1022150135, -1123085499, -1119739267)
+ W(6, -1125768375, 1033366698, -1114009838, -1119196243)
+ W(7, -1132776678, 1009731342, -1112611206, -1129505495);
sum2 = W(0, -1110807022, 1025172792, 1033543849, -1123816828)
+ W(1, -1129400032, -1117035240, 999654946, -1144812946)
+ W(2, -1105612607, 1035443403, 1039345667, -1120747576)
+ W(3, -1123619892, -1135427545, 1053020794, -1113498942)
+ W(4, -1131262448, -1111010692, 1047843748, -1113301822)
+ W(5, 1016529300, -1115955576, -1135856481, -1146605522)
+ W(6, -1129444600, -1117326476, 1022819536, -1119691028)
+ W(7, -1136239801, -1121250556, 998047364, -1135792457);
WS(-1107513792, 1064663354);
sum1 = W(0, 1030862455, -1113532308, 1032378968, -1123071015)
+ W(1, -1161118946, 1021510766, -1127591630, 1009770420)
+ W(2, 1040244826, -1091621085, 1051734861, -1107582956)
+ W(3, -1104300038, 1046262406, 1034822530, -1108820108)
+ W(4, -1102940181, 1054782000, -1095483267, -1125175670)
+ W(5, -1135077628, 1019068110, 1031948820, 1025488559)
+ W(6, -1135539484, 1036941280, -1172984259, -1126076542)
+ W(7, 1011863892, -1128724830, -1120336759, 1036426604);
sum2 =
W(0, -1135206239, -1140752647, 1022777359, 974924014) + W(1, -1139065871, -1123380440, 1021581075, -1133276463)
+ W(2, 1026230428, 988696695, -1122295168, 1029689087) + W(3, 1025917606, -1092786651, -1085937537, -1140169471)
+ W(4, 1027050280, 1049996339, 1032573953, -1135329695) + W(5, 1013849783, 1057784826, -1130048007, -1124883951)
+ W(6, 1016077019, 1033822297, 1032545188, 1011238415) + W(7, -1127829351, 1034470972, -1137094527, 1001568686);
WS(1058918200, -1121082995);
sum1 = W(0, -1123354974, -1112248839, 1046299686, -1143613552) + W(1, -1118620174, 1024662558, 1028038478, -1129268360) + W(2, 1016130204, -1087068557, 1063313277, -1103342192) + W(3, -1103968288, 1048182784, 1047279381, -1115088511) + W(4, -1101453425, 1059583965, -1088182320, 1003350800) + W(5, -1117908518, -1119323982, 1034186247, -1134684248) + W(6, -1122284590, 1027638054, -1124394588, -1111377363) + W(7, -1122818124, -1137723992, 978245507, 1028117438); sum2 = W(0, -1162931039, -1131063526, 1029801649, -1117642655) + W(1, -1136248556, -1131086728, 1031011705, -1128864654) + W(2, -1115594515, -1128443230, 1042762789, -1107118398) + W(3, -1119907402, 1044675527, 1050674207, -1113986381) + W(4, 1022791334, -1107588397, 1009001220, -1186206458) + W(5, 1017500018, -1111169922, -1112569685, 1017255694) + W(6, -1156766128, -1125594766, -1148613464, 993928432) + W(7, 1014782692, -1135599628, -1114139175, 1007622876); WS(1038828992, 1041685264);
sum1 = W(0, -1114329248, 1049950910, -1097681183, 1028668144) + W(1, 995958527, 1027336960, -1107326552, 1025858258) + W(2, -1117673776, 1060640651, -1085831405, 1033402064) + W(3, 1034401008, 1045782072, -1105157973, -1122828000) + W(4, 1038612842, -1098159517, 1053136924, -1110558370) + W(5, 1035088196, -1106507532, 1032016120, -1113173980) + W(6, 1008781376, -1124000392, 1023707152, 1012109856) + W(7, 1029875310, -1105439902, 1034119968, -1114749520); sum2 = W(0, 1031315360, -1099468189, -1112139926, 1036663822) + W(1, -1131767489, -1140834082, 1024287080, -1122285462) + W(2, 1023637252, -1100127579, -1117241706, 1038018354) + W(3, -1107869385, 1052854494, 1052996200, -1112496415) + W(4, -1107666272, 1034036134, 1027811452, -1110479054) + W(5, -1117110288, 1024451620, 1027157968, -1112615559) + W(6, -1124350185, 1003450083, -1131082337, 998992195) + W(7, -1110538107, 1041131277, 1035032776, -1106762474); WS(-1086074680, 1053637716);
sum1 = W(0, -1121345387, 1042002951, -1113042450, -1121398619) + W(1, -1148805338, -1165378922, -1115297518, 991217235) + W(2, -1136570733, 1052460699, -1107443934, -1117268427) + W(3, 1049266593, -1094571489, -1098765182, 1036113926) + W(4, 1027081787, -1124281856, 1043313411, -1136658365) + W(5, -1133439181, 1040734807, 1006695533, -1112513138) + W(6, -1158465386, -1121708851, 1016359031, 1021173351) + W(7, -1120818857, 1035650578, 1027853163, -1106476275); sum2 = W(0, 1026517575, -1170492850, -1138816415, -1143472678) + W(1, 1017334370, 1003954710, -1132363566, 998846550) + W(2, 1051558711, -1096673587, -1136175651, -1124275402) + W(3, 1071692777, -1077357700, -1098960792, 1018703670) + W(4, 1049822619, -1098179385, -1116986501, 1007812651) + W(5, 1020207734, 996694924, 1003290486, 1007766851) + W(6, 1022251878, -1122577241, -1141894102, 1009415395) + W(7, 1019995718, 1015494226, -1126828734, -1163222937); WS(1051521136, 1027207116);
sum1 = W(0, -1122694020, 1010830545, -1124291704, 1018062184) + W(1, -1121133108, -1124202632, 1037913146, -1116091286) + W(2, -1102175837, 1057246783, -1093542759, 1041281977) + W(3, -1116351908, 1026322980, 982577970, -1125394504) + W(4, 1045518980, -1089509425, 1055793637, 1008755233) + W(5, 1009393969, 1025178484, -1118947636, -1127575032) + W(6, 1008379217, -1117338572, 1001093793, 1015898776) + W(7, 1015772516, 1009646833, 1001810977, -1121163492); sum2 = W(0, -1137495011, -1135527491, 1027730022, -1118108263) + W(1, 1013616911, -1123650952, 1024465134, -1128775579) + W(2, -1135578111, 1013443151, 1049128967, -1098008683) + W(3, 1029346938, -1114797945, 1068130737, -1080443718) + W(4, 1017473747, -1122100892, 1046423571, -1101482344) + W(5, 1012413655, -1128721387, -1143058109, -1137148015) + W(6, -1133405571, -1166794345, 1020545683, -1128178767) + W(7, 1008139351, -1156685818, -1126785325, 991435034); WS(1057767608, -1132080751);
sum1 = W(0, 1026028453, 1025766741, 1035118319, 1012106581) + W(1, 1026017621, -1135552917, 1040474693, -1138611630) + W(2, -1117947285, 1051769667, -1111744027, 1030333189) + W(3, 1048679017, -1083959172, -1084413328, 1045191121) + W(4, 1025261389, -1120826122, 1049618505, -1122181545) + W(5, 1011196341, 1045191525, -1110336171, 1030480605) + W(6, 1015828970, 1028389741, 1028257397, 1027514349) + W(7, 1025013027, 1039505775, -1123719333, 1020294666); sum2 = W(0, 1017587161, -1101123140, 1040188371, 988296658) + W(1, 1028118553, -1103020887, 1022642341, 1010063898) + W(2, 1008167722, -1099714612, 1039093756, 1026403646) + W(3, 1005112948, 1049070164, 1046164698, 1033545355) + W(4, -1125344655, 1032013714, -1111525569, 1002132020) + W(5, 1015776789, 1022049457, -1098832696, 1037334715) + W(6, -1148301500, 1009340114, -1115917000, -1139728254) + W(7, -1138850406, -1167693540, -1103378287, 1035581889); WS(-1099372256, -1088618788);
sum1 = W(0, -1112538182, 1048693927, -1112344546, -1109099742) + W(1, -1113349022, 1033711782, -1129092599, -1110127398) + W(2, -1103996671, 1064716592, -1086749016, 1032699126) + W(3, 1024020908, -1143605597, 1044926535, -1121424940) + W(4, 1046614908, -1085173359, 1062252083, -1130166943) + W(5, -1111225386, 1004694493, 1040479887, -1106709441) + W(6, -1110537326, -1108087402, 1034104622, -1120726228) + W(7, -1114146165, -1138402062, 1042110371, -1106064827); sum2 = W(0, 987083788, 1013472954, -1120418118, 979955865) + W(1, -1144106823, -1131186779, -1122269098, -1163904780) + W(2, -1120467381, -1139561796, 1038342084, -1115615181) + W(3, -1121977305, 1044091298, 1042996066, -1127292875) + W(4, -1118651341, 1038343490, -1118476220, -1123141745) + W(5, -1162389292, -1115306287, -1128689408, 1014320394) + W(6, -1152635694, -1155962630, -1132569906, -1135582470) + W(7, 964510307, -1117365756, -1141833923, 1008840046); WS(1041282784, 1044242623);
sum1 = W(0, -1119885764, -1171512555, 1003864029, 1025494836) + W(1, -1119816052, -1121861252, 1040963149, -1113504879) + W(2, -1100880653, 1057266723, -1094412795, 1043843337) + W(3, -1113812594, 1010135439, -1118004569, -1125989575) + W(4, 1046531310, -1089952515, 1056310444, -1156936827) + W(5, 1015358999, 1031135156, -1114099002, -1122714492) + W(6, 1005085853, -1115226950, 1015234855, 1003362397) + W(7, 1021011107, 1003139037, 992693307, -1120612644); sum2 = W(0, 1005317381, -1142619324, -1126266146, 1026462555) + W(1, -1143827754, 1012902153, -1128784654, 1020893616) + W(2, 1019060164, -1114788024, -1094218173, 1054132458) + W(3, 1009279342, -1098688460, -1078812823, 1070492026) + W(4, 1014092605, -1120377499, -1099532818, 1048935725) + W(5, -1131000233, 1017453102, 1007638067, 1011358224) + W(6, 1012779564, -1139793504, -1130333980, 1015734963) + W(7, -1137528453, -1147729078, 1018177647, 987943782); WS(1046635232, 1024078131);
sum1 = W(0, 1002735212, 1035063871, -1097977761, 1040314319) + W(1, 1025138813, 1034039879, -1105608655, 1035664624) + W(2, 1017042555, 1044122447, -1094991056, 1038536855) + W(3, -1132524982, -1110416695, 1051547730, -1114843703) + W(4, 1031803657, -1092481954, 1050188814, 1003107468) + W(5, 1033606155, -1094320024, 1047410847, 1019470987) + W(6, 1021596219, -1107502027, 1031346589, 1021345835) + W(7, 1015508823, -1103391009, 1046101811, -1136683190); sum2 = W(0, -1096475926, 1044036812, 1052862983, -1106234474) + W(1, -1112281069, -1112231286, 1024115789, -1121785528) + W(2, -1116645717, -1111398905, 1051331710, -1130292776) + W(3, 1041647377, -1096068583, 1038036111, 1037359643) + W(4, -1113263240, 1026411348, 1042458641, -1111704128) + W(5, 1023473494, -1114320784, 1028002558, -1123406807) + W(6, -1117017643, -1138574198, 1037890580, -1109714921) + W(7, 1039764966, -1104710548, -1106844581, 1041123403); WS(-1088554040, -1076674880);
sum1 = W(0, 1026292820, -1132973070, -1144171612, -1130131975) + W(1, 1016736263, 1034501898, -1110973538, 1028857234) + W(2, 1042339025, -1089525132, 1052671191, -1108906970) + W(3, -1110236986, 1037427962, -1123890785, -1112145786) + W(4, -1103961368, 1056478885, -1092344862, 1002874044) + W(5, 1016313655, -1118983748, 1041641985, 1025897228) + W(6, -1151588920, 1038469390, 1010979982, -1130905399) + W(7, 1014755782, -1123320716, 1017396903, 1033705562); sum2 = W(0, 1013915195, -1133182691, -1127318198, 1020584890) + W(1, 1007730851, 1024414743, -1121307593, 1005058566) + W(2, 981970521, -1111248658, 1035588225, -1124411850) + W(3, 1028189234, 1040952978, 1057294107, 1029625115) + W(4, -1121038101, -1109339192, -1107404728, 1026110889) + W(5, -1142484934, -1094377458, 1024397525, 1023925523) + W(6, -1146368902, -1116592821, -1118541421, -1140327971) + W(7, 1010322539, -1112421528, 1019759378, -1199698720); WS(1063581112, 1015292283);
sum1 = W(0, -1123806598, -1125096044, 1046804719, -1117498166) + W(1, -1124445804, 1037634467, 1028314614, 1006823135) + W(2, 1036776315, -1083793455, 1064148787, -1106689849) + W(3, -1112186771, -1098422117, 1034155462, 1004978479) + W(4, -1102837698, 1058965073, -1089226130, 1033810693) + W(5, -1117642958, -1106625757, 1037373467, 1029436414) + W(6, -1137018200, 1036181095, 994321759, -1119765454) + W(7, 1010580432, -1127761788, 1021285644, 1034713459); sum2 = W(0, -1127012521, -1110373665, -1121983257, 1021812843) + W(1, -1129458054, -1122115974, -1121551577, 1015201109) + W(2, -1134632819, -1118435057, -1107711610, 1039413537) + W(3, -1113739078, 1041258512, 1043546644, -1127386873) + W(4, -1106078947, 1025961773, 1048226293, -1110385416) + W(5, -1115241196, 1041055451, -1131486243, -1135801459) + W(6, -1122814807, 1025056413, -1139476701, -1132245806) + W(7, -1119046895, 1029845331, 1018415015, -1140149017); WS(-1109010880, -1087548956);
sum1 = W(0, 1034947768, -1095012676, 1046023882, 1029737824) + W(1, 1034343312, -1102610188, 1039446704, 1025692706) + W(2, 1016751552, -1096454908, 1042564604, 1038373096) + W(3, 1019661856, -1091443170, -1105694067, 1039271048) + W(4, -1126501287, -1131030249, 1044246468, 1012879825) + W(5, 1017025648, 1042942296, -1103700296, 1041317114) + W(6, 1030724160, 1019936112, -1141422594, 1029263800) + W(7, -1140792121, 1024647464, -1107855416, 1041193844); sum2 = W(0, 1034034732, -1107522705, -1105460279, 1021740679) + W(1, -1113997103, -1121503695, 1038975878, -1112744336) + W(2, 1028771217, -1114143244, 1032873918, -1121564954) + W(3, 1025456143, -1105773446, 1059420344, 1024971971) + W(4, 1035315492, -1109746606, 1040681265, -1122379806) + W(5, -1102403849, -1106040358, 1046039582, -1106873869) + W(6, 1018212015, -1106459627, 1026290649, -1130313815) + W(7, -1099438501, 1039219872, 1046943722, -1105420350); WS(-1086299832, -1077288694);
sum1 = W(0, 1021716686, -1099039878, -1111509136, 1039618828) + W(1, -1132921948, -1108540692, 1021468846, -1131678690) + W(2, -1113901292, -1158126306, -1096197083, 1041516082) + W(3, -1108835908, 1055092577, 1062013047, -1118733319) + W(4, 1023078294, -1089051407, 1050708993, -1122936235) + W(5, 965138311, -1113759276, 1022391342, 1015065790) + W(6, 998651320, -1107695832, -1133490396, 997649137) + W(7, -1130194922, -1113503632, 991635057, 1023538631); sum2 = W(0, -1133976495, 1035891239, -1130801609, -1113698362) + W(1, 1027343155, 1030599513, -1108453664, 1016406968) + W(2, -1149877867, 1037590422, 1012747883, -1108226898) + W(3, -1119506980, 1054189655, -1119322812, -1120928356) + W(4, -1126385541, 1041308688, -1107379808, 1016225738) + W(5, 1016526837, -1112736561, -1119223720, 988482485) + W(6, 994153115, 1004824957, -1116360142, 1018050885) + W(7, -1140785051, -1120347934, -1129452107, -1117792638); WS(-1113279936, 1066223903);
sum1 = W(0, -1128171420, 1040261344, -1112013315, -1123695998) + W(1, -1141738481, -1140107833, -1116929726, -1154978689) + W(2, -1138940153, 1050703688, -1108200895, -1123177006) + W(3, 1044160156, -1100167260, -1100730273, 1034288823) + W(4, 1020686276, -1130335589, 1040782300, -1141423761) + W(5, -1129655596, 1035637471, 1024316286, -1114187043) + W(6, 964173357, -1124525100, 1014134393, 1013984857) + W(7, -1123239900, 1032644739, 1029624526, -1108229911); sum2 = W(0, -1115606620, 1021458196, 1009639320, -1131253088) + W(1, -1125272644, 1017345212, 1016051020, -1143902384) + W(2, -1099614716, 1047257730, -1120838650, 1020803060) + W(3, -1080575150, 1068148121, -1113655261, 1032085971) + W(4, -1102155153, 1044966894, -1132238288, 1016311348) + W(5, -1122847678, 1026244022, -1130782536, -1137376840) + W(6, -1123394906, 1017049220, 967940860, -1137115752) + W(7, -1129056732, 1010161976, 1004223696, -1136984808); WS(1060545080, -1126581603);
sum1 = W(0, 1032630360, -1112268976, 1045186906, -1125010622) + W(1, 1037657648, -1128752350, 1032285712, 1029508223) + W(2, 1043836232, -1090205186, 1053340438, -1108078856) + W(3, 1037448680, 1048595306, -1094666759, 1041691860) + W(4, 976149203, 1057651571, -1082657749, 1042698525) + W(5, 1031833596, 1035187792, -1092127852, 1040118132) + W(6, 1031675647, 1034806588, -1104761760, 1033087420) + W(7, 1025282125, 1043419290, -1096441814, 1034587656); sum2 = W(0, -1123698886, 1034075649, 998149095, -1113635181) + W(1, -1126365381, 1026991402, -1118780236, -1168196508) + W(2, -1135914762, 1019253181, 1023543366, -1114469118) + W(3, -1121651762, 1047572688, 1038479879, -1145545780) + W(4, -1118625490, 1035108181, -1114677625, 992781287) + W(5, -1122087574, -1115886918, 1011684618, -1139655050) + W(6, -1147908244, 1016718341, -1132109957, -1142844852) + W(7, -1134045690, -1117034488, -1137057610, 1007905050); WS(-1083899832, -1105526146);
sum1 = W(0, 1026357515, -1119744955, -1117075907, -1111407198) + W(1, -1139718894, -1125720471, -1106102943, -1152407445) + W(2, 1044187583, -1092285679, 1048719011, -1107209883) + W(3, -1105573131, 1062437883, 1052836221, -1107292779) + W(4, -1104526300, 1058460257, -1089717563, -1122559055) + W(5, -1119529939, 1022150135, -1123085499, -1119739267) + W(6, -1125768375, 1033366698, -1114009838, -1119196243) + W(7, -1132776678, 1009731342, -1112611206, -1129505495); sum2 = W(0, -1110807022, 1025172792, 1033543849, -1123816828) + W(1, -1129400032, -1117035240, 999654946, -1144812946) + W(2, -1105612607, 1035443403, 1039345667, -1120747576) + W(3, -1123619892, -1135427545, 1053020794, -1113498942) + W(4, -1131262448, -1111010692, 1047843748, -1113301822) + W(5, 1016529300, -1115955576, -1135856481, -1146605522) + W(6, -1129444600, -1117326476, 1022819536, -1119691028) + W(7, -1136239801, -1121250556, 998047364, -1135792457); WS(-1107513792, 1064663354);
sum1 = W(0, 1030862455, -1113532308, 1032378968, -1123071015) + W(1, -1161118946, 1021510766, -1127591630, 1009770420) + W(2, 1040244826, -1091621085, 1051734861, -1107582956) + W(3, -1104300038, 1046262406, 1034822530, -1108820108) + W(4, -1102940181, 1054782000, -1095483267, -1125175670) + W(5, -1135077628, 1019068110, 1031948820, 1025488559) + W(6, -1135539484, 1036941280, -1172984259, -1126076542) + W(7, 1011863892, -1128724830, -1120336759, 1036426604); sum2 = W(0, -1135206239, -1140752647, 1022777359, 974924014) + W(1, -1139065871, -1123380440, 1021581075, -1133276463) + W(2, 1026230428, 988696695, -1122295168, 1029689087) + W(3, 1025917606, -1092786651, -1085937537, -1140169471) + W(4, 1027050280, 1049996339, 1032573953, -1135329695) + W(5, 1013849783, 1057784826, -1130048007, -1124883951) + W(6, 1016077019, 1033822297, 1032545188, 1011238415) + W(7, -1127829351, 1034470972, -1137094527, 1001568686); WS(1058918200, -1121082995);
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
}
// Returns the luma of an RGB color: the weighted sum
// 0.299 * R + 0.587 * G + 0.114 * B (the Rec. 601 luma coefficients).
float GetLuma(float3 color) {
	const float3 lumaWeights = float3(0.299f, 0.587f, 0.114f);
	return dot(lumaWeights, color);
}
// Group-shared scratch buffer for pass 1. 429 = 39 * 11, matching the
// id / 11 and id % 11 decomposition used when Pass1 fills it.
// NOTE(review): `inp` is declared twice below, once with the GLSL `shared`
// qualifier and once with the HLSL `groupshared` qualifier. This looks like the
// old and new versions of the same line shown together; confirm which is live.
shared float inp[429];
groupshared float inp[429];
// Index of the pass these definitions belong to; compared against LAST_PASS
// inside Pass1.
#define CURRENT_PASS 1
// Reduces a color to a single value by dotting its rgb with rgb2y
// (rgb2y is defined outside this section).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// GLSL-style imageStore shim: ignores the image argument and forwards only the
// x component of `val` to imageStoreOverride.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Writes a single float to `temp` at `pos` (temp is declared outside this section).
void imageStoreOverride(uint2 pos, float value) { temp[pos] = (value); }
// Samples INPUT at `pos` and converts the result with GET_SAMPLE.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Cached float2 copies of GetInputSize() / GetInputPt().
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());
// HOOKED_* names are plain aliases of the INPUT_* equivalents.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt
// Pass1: fills the group-shared array `inp` with a 39x11 tile of luma samples,
// then each thread gathers an 8x4 window from it
// (samples[i][j] = inp[local_pos + i * 11 + j]) and writes the existing sample
// plus the nnedi3() result to two vertically adjacent output texels.
//
// NOTE(review): this span contains both HLSL statements (uint2/float4,
// GroupMemoryBarrierWithGroupSync, tex1[...]) and GLSL-style statements
// (ivec2/vec4, gl_* builtins, barrier(), imageStore) for the same steps, and
// the braces do not balance (6 opening, 4 closing). It looks like the old and
// new versions of the function shown together; confirm which lines are live
// before editing.
void Pass1(uint2 blockStart, uint3 threadId) {
const float2 inputPt = GetInputPt();
// blockStart.y is halved here; presumably because each input row yields two
// output rows (see the destPos writes below) — TODO confirm.
const uint2 group_base = uint2(blockStart.x, blockStart.y >> 1);
// Cooperative fill: the threads of the group stride over the 429 entries of
// inp; id maps to tile column x = id / 11 and tile row y = id % 11.
for (int id = threadId.x * MP_NUM_THREADS_Y + threadId.y; id < 429; id += MP_NUM_THREADS_X * MP_NUM_THREADS_Y) {
ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
int local_pos = int(gl_LocalInvocationID.x) * 11 + int(gl_LocalInvocationID.y);
for (int id = int(gl_LocalInvocationIndex); id < 429; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
uint x = (uint)id / 11, y = (uint)id % 11;
// Sample at the texel centre (+0.5); the tile is offset by -3 in x and -1 in y
// relative to group_base.
inp[id] = GetLuma(INPUT.SampleLevel(sam, inputPt * float2(group_base.x + x - 3 + 0.5, group_base.y + y - 1 + 0.5), 0).rgb);
inp[id] =
HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (3)) + 0.5, float(group_base.y + y - (1)) + 0.5)).x;
}
// Wait until every thread in the group has finished writing inp before any
// thread reads from it.
GroupMemoryBarrierWithGroupSync();
float4 ret = 0.0;
float4 ret0 = 0.0;
float4 samples[8];
// Flattened index of this thread's window origin in inp (11 entries per x step).
const uint local_pos = threadId.x * 11 + threadId.y;
[unroll]
for (int i = 0; i < 8; ++i) {
[unroll]
for (int j = 0; j < 4; ++j) {
samples[i][j] = inp[local_pos + i * 11 + j];
}
barrier();
vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);
vec4 samples[8];
// Explicit form of the same gather: samples[i][j] = inp[local_pos + 11 * i + j].
samples[0][0] = inp[local_pos + 0];
samples[0][1] = inp[local_pos + 1];
samples[0][2] = inp[local_pos + 2];
samples[0][3] = inp[local_pos + 3];
samples[1][0] = inp[local_pos + 11];
samples[1][1] = inp[local_pos + 12];
samples[1][2] = inp[local_pos + 13];
samples[1][3] = inp[local_pos + 14];
samples[2][0] = inp[local_pos + 22];
samples[2][1] = inp[local_pos + 23];
samples[2][2] = inp[local_pos + 24];
samples[2][3] = inp[local_pos + 25];
samples[3][0] = inp[local_pos + 33];
samples[3][1] = inp[local_pos + 34];
samples[3][2] = inp[local_pos + 35];
samples[3][3] = inp[local_pos + 36];
samples[4][0] = inp[local_pos + 44];
samples[4][1] = inp[local_pos + 45];
samples[4][2] = inp[local_pos + 46];
samples[4][3] = inp[local_pos + 47];
samples[5][0] = inp[local_pos + 55];
samples[5][1] = inp[local_pos + 56];
samples[5][2] = inp[local_pos + 57];
samples[5][3] = inp[local_pos + 58];
samples[6][0] = inp[local_pos + 66];
samples[6][1] = inp[local_pos + 67];
samples[6][2] = inp[local_pos + 68];
samples[6][3] = inp[local_pos + 69];
samples[7][0] = inp[local_pos + 77];
samples[7][1] = inp[local_pos + 78];
samples[7][2] = inp[local_pos + 79];
samples[7][3] = inp[local_pos + 80];
ret[0] = nnedi3(samples);
// local_pos + 34 = 3 * 11 + 1, i.e. the same element as samples[3][1].
ret0[0] = inp[local_pos + 34];
// Only compiled when this pass is the last one (LAST_PASS is defined outside
// this section): threads whose destination lies outside the output return early.
#if CURRENT_PASS == LAST_PASS
uint2 destPos = blockStart + threadId.xy * 2;
uint2 outputSize = GetOutputSize();
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
return;
}
// destPos receives the existing sample samples[3][1]; the texel directly below
// it receives the nnedi3() result.
const uint2 destPos = blockStart + uint2(threadId.x, threadId.y * 2);
tex1[destPos] = samples[3][1];
tex1[destPos + uint2(0, 1)] = nnedi3(samples);
#endif
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2), ret0);
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2) + ivec2(0, 1), ret);
}
//!PASS 2
//!DESC double_x
//!IN tex1, INPUT
//!BLOCK_SIZE 64,8
//!NUM_THREADS 32,8
float nnedi3(float4 samples[8]) {
//!DESC NNEDI3 (double_x, nns16, win8x4)
//!IN INPUT, temp
//!OUT OUTPUT
//!BLOCK_SIZE 64, 8
//!NUM_THREADS 32, 8
#pragma optionNV(inline none)
float nnedi3(vec4 samples[8]) {
float sum = 0.0, sumsq = 0.0;
[unroll]
for (int i = 0; i < 8; i++) {
sum += dot(samples[i], 1.0f);
[unroll] for (int i = 0; i < 8; i++) {
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
sumsq += dot(samples[i], samples[i]);
}
float mstd0 = sum / 32.0;
float mstd1 = sumsq / 32.0 - mstd0 * mstd0;
// Do not use lerp here; otherwise the result may be NaN
float mstd2 = mstd1 >= 1.192092896e-7 ? rsqrt(mstd1) : 0.0;
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
mstd1 *= mstd2;
float vsum = 0.0, wsum = 0.0, sum1, sum2;
#define T(x) intBitsToFloat(x)
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
#define WS(w0, w1) \
sum1 = exp(sum1 * mstd2 + T(w0)); \
sum2 = sum2 * mstd2 + T(w1); \
wsum += sum1; \
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
sum1 = W(0, -1123354974, -1118620174, 1016130204, -1103968288)
+ W(1, -1101453425, -1117908518, -1122284590, -1122818124)
+ W(2, -1112248839, 1024662558, -1087068557, 1048182784)
+ W(3, 1059583965, -1119323982, 1027638054, -1137723992) + W(4, 1046299686, 1028038478, 1063313277, 1047279381)
+ W(5, -1088182320, 1034186247, -1124394588, 978245507)
+ W(6, -1143613552, -1129268360, -1103342192, -1115088511)
+ W(7, 1003350800, -1134684248, -1111377363, 1028117438);
sum2 =
W(0, -1162931039, -1136248556, -1115594515, -1119907402) + W(1, 1022791334, 1017500018, -1156766128, 1014782692)
+ W(2, -1131063526, -1131086728, -1128443230, 1044675527)
+ W(3, -1107588397, -1111169922, -1125594766, -1135599628)
+ W(4, 1029801649, 1031011705, 1042762789, 1050674207) + W(5, 1009001220, -1112569685, -1148613464, -1114139175)
+ W(6, -1117642655, -1128864654, -1107118398, -1113986381)
+ W(7, -1186206458, 1017255694, 993928432, 1007622876);
WS(1038828992, 1041685264);
sum1 = W(0, -1114329248, 995958527, -1117673776, 1034401008) + W(1, 1038612842, 1035088196, 1008781376, 1029875310)
+ W(2, 1049950910, 1027336960, 1060640651, 1045782072)
+ W(3, -1098159517, -1106507532, -1124000392, -1105439902)
+ W(4, -1097681183, -1107326552, -1085831405, -1105157973)
+ W(5, 1053136924, 1032016120, 1023707152, 1034119968) + W(6, 1028668144, 1025858258, 1033402064, -1122828000)
+ W(7, -1110558370, -1113173980, 1012109856, -1114749520);
sum2 = W(0, 1031315360, -1131767489, 1023637252, -1107869385)
+ W(1, -1107666272, -1117110288, -1124350185, -1110538107)
+ W(2, -1099468189, -1140834082, -1100127579, 1052854494)
+ W(3, 1034036134, 1024451620, 1003450083, 1041131277) + W(4, -1112139926, 1024287080, -1117241706, 1052996200)
+ W(5, 1027811452, 1027157968, -1131082337, 1035032776)
+ W(6, 1036663822, -1122285462, 1038018354, -1112496415)
+ W(7, -1110479054, -1112615559, 998992195, -1106762474);
WS(-1086074680, 1053637716);
sum1 = W(0, -1121345387, -1148805338, -1136570733, 1049266593)
+ W(1, 1027081787, -1133439181, -1158465386, -1120818857)
+ W(2, 1042002951, -1165378922, 1052460699, -1094571489)
+ W(3, -1124281856, 1040734807, -1121708851, 1035650578)
+ W(4, -1113042450, -1115297518, -1107443934, -1098765182)
+ W(5, 1043313411, 1006695533, 1016359031, 1027853163) + W(6, -1121398619, 991217235, -1117268427, 1036113926)
+ W(7, -1136658365, -1112513138, 1021173351, -1106476275);
sum2 = W(0, 1026517575, 1017334370, 1051558711, 1071692777) + W(1, 1049822619, 1020207734, 1022251878, 1019995718)
+ W(2, -1170492850, 1003954710, -1096673587, -1077357700)
+ W(3, -1098179385, 996694924, -1122577241, 1015494226)
+ W(4, -1138816415, -1132363566, -1136175651, -1098960792)
+ W(5, -1116986501, 1003290486, -1141894102, -1126828734)
+ W(6, -1143472678, 998846550, -1124275402, 1018703670)
+ W(7, 1007812651, 1007766851, 1009415395, -1163222937);
WS(1051521136, 1027207116);
sum1 = W(0, -1122694020, -1121133108, -1102175837, -1116351908)
+ W(1, 1045518980, 1009393969, 1008379217, 1015772516) + W(2, 1010830545, -1124202632, 1057246783, 1026322980)
+ W(3, -1089509425, 1025178484, -1117338572, 1009646833)
+ W(4, -1124291704, 1037913146, -1093542759, 982577970) + W(5, 1055793637, -1118947636, 1001093793, 1001810977)
+ W(6, 1018062184, -1116091286, 1041281977, -1125394504)
+ W(7, 1008755233, -1127575032, 1015898776, -1121163492);
sum2 = W(0, -1137495011, 1013616911, -1135578111, 1029346938)
+ W(1, 1017473747, 1012413655, -1133405571, 1008139351)
+ W(2, -1135527491, -1123650952, 1013443151, -1114797945)
+ W(3, -1122100892, -1128721387, -1166794345, -1156685818)
+ W(4, 1027730022, 1024465134, 1049128967, 1068130737) + W(5, 1046423571, -1143058109, 1020545683, -1126785325)
+ W(6, -1118108263, -1128775579, -1098008683, -1080443718)
+ W(7, -1101482344, -1137148015, -1128178767, 991435034);
WS(1057767608, -1132080751);
sum1 =
W(0, 1026028453, 1026017621, -1117947285, 1048679017) + W(1, 1025261389, 1011196341, 1015828970, 1025013027)
+ W(2, 1025766741, -1135552917, 1051769667, -1083959172) + W(3, -1120826122, 1045191525, 1028389741, 1039505775)
+ W(4, 1035118319, 1040474693, -1111744027, -1084413328)
+ W(5, 1049618505, -1110336171, 1028257397, -1123719333) + W(6, 1012106581, -1138611630, 1030333189, 1045191121)
+ W(7, -1122181545, 1030480605, 1027514349, 1020294666);
sum2 = W(0, 1017587161, 1028118553, 1008167722, 1005112948)
+ W(1, -1125344655, 1015776789, -1148301500, -1138850406)
+ W(2, -1101123140, -1103020887, -1099714612, 1049070164)
+ W(3, 1032013714, 1022049457, 1009340114, -1167693540) + W(4, 1040188371, 1022642341, 1039093756, 1046164698)
+ W(5, -1111525569, -1098832696, -1115917000, -1103378287)
+ W(6, 988296658, 1010063898, 1026403646, 1033545355) + W(7, 1002132020, 1037334715, -1139728254, 1035581889);
WS(-1099372256, -1088618788);
sum1 = W(0, -1112538182, -1113349022, -1103996671, 1024020908)
+ W(1, 1046614908, -1111225386, -1110537326, -1114146165)
+ W(2, 1048693927, 1033711782, 1064716592, -1143605597)
+ W(3, -1085173359, 1004694493, -1108087402, -1138402062)
+ W(4, -1112344546, -1129092599, -1086749016, 1044926535)
+ W(5, 1062252083, 1040479887, 1034104622, 1042110371)
+ W(6, -1109099742, -1110127398, 1032699126, -1121424940)
+ W(7, -1130166943, -1106709441, -1120726228, -1106064827);
sum2 = W(0, 987083788, -1144106823, -1120467381, -1121977305)
+ W(1, -1118651341, -1162389292, -1152635694, 964510307)
+ W(2, 1013472954, -1131186779, -1139561796, 1044091298)
+ W(3, 1038343490, -1115306287, -1155962630, -1117365756)
+ W(4, -1120418118, -1122269098, 1038342084, 1042996066)
+ W(5, -1118476220, -1128689408, -1132569906, -1141833923)
+ W(6, 979955865, -1163904780, -1115615181, -1127292875)
+ W(7, -1123141745, 1014320394, -1135582470, 1008840046);
WS(1041282784, 1044242623);
sum1 = W(0, -1119885764, -1119816052, -1100880653, -1113812594)
+ W(1, 1046531310, 1015358999, 1005085853, 1021011107) + W(2, -1171512555, -1121861252, 1057266723, 1010135439)
+ W(3, -1089952515, 1031135156, -1115226950, 1003139037)
+ W(4, 1003864029, 1040963149, -1094412795, -1118004569) + W(5, 1056310444, -1114099002, 1015234855, 992693307)
+ W(6, 1025494836, -1113504879, 1043843337, -1125989575)
+ W(7, -1156936827, -1122714492, 1003362397, -1120612644);
sum2 = W(0, 1005317381, -1143827754, 1019060164, 1009279342)
+ W(1, 1014092605, -1131000233, 1012779564, -1137528453)
+ W(2, -1142619324, 1012902153, -1114788024, -1098688460)
+ W(3, -1120377499, 1017453102, -1139793504, -1147729078)
+ W(4, -1126266146, -1128784654, -1094218173, -1078812823)
+ W(5, -1099532818, 1007638067, -1130333980, 1018177647) + W(6, 1026462555, 1020893616, 1054132458, 1070492026)
+ W(7, 1048935725, 1011358224, 1015734963, 987943782);
WS(1046635232, 1024078131);
sum1 = W(0, 1002735212, 1025138813, 1017042555, -1132524982) + W(1, 1031803657, 1033606155, 1021596219, 1015508823)
+ W(2, 1035063871, 1034039879, 1044122447, -1110416695)
+ W(3, -1092481954, -1094320024, -1107502027, -1103391009)
+ W(4, -1097977761, -1105608655, -1094991056, 1051547730)
+ W(5, 1050188814, 1047410847, 1031346589, 1046101811) + W(6, 1040314319, 1035664624, 1038536855, -1114843703)
+ W(7, 1003107468, 1019470987, 1021345835, -1136683190);
sum2 = W(0, -1096475926, -1112281069, -1116645717, 1041647377)
+ W(1, -1113263240, 1023473494, -1117017643, 1039764966)
+ W(2, 1044036812, -1112231286, -1111398905, -1096068583)
+ W(3, 1026411348, -1114320784, -1138574198, -1104710548)
+ W(4, 1052862983, 1024115789, 1051331710, 1038036111) + W(5, 1042458641, 1028002558, 1037890580, -1106844581)
+ W(6, -1106234474, -1121785528, -1130292776, 1037359643)
+ W(7, -1111704128, -1123406807, -1109714921, 1041123403);
WS(-1088554040, -1076674880);
sum1 = W(0, 1026292820, 1016736263, 1042339025, -1110236986)
+ W(1, -1103961368, 1016313655, -1151588920, 1014755782)
+ W(2, -1132973070, 1034501898, -1089525132, 1037427962)
+ W(3, 1056478885, -1118983748, 1038469390, -1123320716)
+ W(4, -1144171612, -1110973538, 1052671191, -1123890785)
+ W(5, -1092344862, 1041641985, 1010979982, 1017396903)
+ W(6, -1130131975, 1028857234, -1108906970, -1112145786)
+ W(7, 1002874044, 1025897228, -1130905399, 1033705562);
sum2 = W(0, 1013915195, 1007730851, 981970521, 1028189234) + W(1, -1121038101, -1142484934, -1146368902, 1010322539)
+ W(2, -1133182691, 1024414743, -1111248658, 1040952978)
+ W(3, -1109339192, -1094377458, -1116592821, -1112421528)
+ W(4, -1127318198, -1121307593, 1035588225, 1057294107)
+ W(5, -1107404728, 1024397525, -1118541421, 1019759378)
+ W(6, 1020584890, 1005058566, -1124411850, 1029625115)
+ W(7, 1026110889, 1023925523, -1140327971, -1199698720);
WS(1063581112, 1015292283);
sum1 = W(0, -1123806598, -1124445804, 1036776315, -1112186771)
+ W(1, -1102837698, -1117642958, -1137018200, 1010580432)
+ W(2, -1125096044, 1037634467, -1083793455, -1098422117)
+ W(3, 1058965073, -1106625757, 1036181095, -1127761788) + W(4, 1046804719, 1028314614, 1064148787, 1034155462)
+ W(5, -1089226130, 1037373467, 994321759, 1021285644) + W(6, -1117498166, 1006823135, -1106689849, 1004978479)
+ W(7, 1033810693, 1029436414, -1119765454, 1034713459);
sum2 =
W(0, -1127012521, -1129458054, -1134632819, -1113739078)
+ W(1, -1106078947, -1115241196, -1122814807, -1119046895)
+ W(2, -1110373665, -1122115974, -1118435057, 1041258512) + W(3, 1025961773, 1041055451, 1025056413, 1029845331)
+ W(4, -1121983257, -1121551577, -1107711610, 1043546644)
+ W(5, 1048226293, -1131486243, -1139476701, 1018415015) + W(6, 1021812843, 1015201109, 1039413537, -1127386873)
+ W(7, -1110385416, -1135801459, -1132245806, -1140149017);
WS(-1109010880, -1087548956);
sum1 = W(0, 1034947768, 1034343312, 1016751552, 1019661856) + W(1, -1126501287, 1017025648, 1030724160, -1140792121)
+ W(2, -1095012676, -1102610188, -1096454908, -1091443170)
+ W(3, -1131030249, 1042942296, 1019936112, 1024647464) + W(4, 1046023882, 1039446704, 1042564604, -1105694067)
+ W(5, 1044246468, -1103700296, -1141422594, -1107855416)
+ W(6, 1029737824, 1025692706, 1038373096, 1039271048) + W(7, 1012879825, 1041317114, 1029263800, 1041193844);
sum2 = W(0, 1034034732, -1113997103, 1028771217, 1025456143)
+ W(1, 1035315492, -1102403849, 1018212015, -1099438501)
+ W(2, -1107522705, -1121503695, -1114143244, -1105773446)
+ W(3, -1109746606, -1106040358, -1106459627, 1039219872)
+ W(4, -1105460279, 1038975878, 1032873918, 1059420344) + W(5, 1040681265, 1046039582, 1026290649, 1046943722)
+ W(6, 1021740679, -1112744336, -1121564954, 1024971971)
+ W(7, -1122379806, -1106873869, -1130313815, -1105420350);
WS(-1086299832, -1077288694);
sum1 = W(0, 1021716686, -1132921948, -1113901292, -1108835908) + W(1, 1023078294, 965138311, 998651320, -1130194922)
+ W(2, -1099039878, -1108540692, -1158126306, 1055092577)
+ W(3, -1089051407, -1113759276, -1107695832, -1113503632)
+ W(4, -1111509136, 1021468846, -1096197083, 1062013047) + W(5, 1050708993, 1022391342, -1133490396, 991635057)
+ W(6, 1039618828, -1131678690, 1041516082, -1118733319)
+ W(7, -1122936235, 1015065790, 997649137, 1023538631);
sum2 = W(0, -1133976495, 1027343155, -1149877867, -1119506980)
+ W(1, -1126385541, 1016526837, 994153115, -1140785051) + W(2, 1035891239, 1030599513, 1037590422, 1054189655)
+ W(3, 1041308688, -1112736561, 1004824957, -1120347934)
+ W(4, -1130801609, -1108453664, 1012747883, -1119322812)
+ W(5, -1107379808, -1119223720, -1116360142, -1129452107)
+ W(6, -1113698362, 1016406968, -1108226898, -1120928356)
+ W(7, 1016225738, 988482485, 1018050885, -1117792638);
WS(-1113279936, 1066223903);
sum1 =
W(0, -1128171420, -1141738481, -1138940153, 1044160156) + W(1, 1020686276, -1129655596, 964173357, -1123239900)
+ W(2, 1040261344, -1140107833, 1050703688, -1100167260)
+ W(3, -1130335589, 1035637471, -1124525100, 1032644739)
+ W(4, -1112013315, -1116929726, -1108200895, -1100730273)
+ W(5, 1040782300, 1024316286, 1014134393, 1029624526) + W(6, -1123695998, -1154978689, -1123177006, 1034288823)
+ W(7, -1141423761, -1114187043, 1013984857, -1108229911);
sum2 = W(0, -1115606620, -1125272644, -1099614716, -1080575150)
+ W(1, -1102155153, -1122847678, -1123394906, -1129056732)
+ W(2, 1021458196, 1017345212, 1047257730, 1068148121) + W(3, 1044966894, 1026244022, 1017049220, 1010161976)
+ W(4, 1009639320, 1016051020, -1120838650, -1113655261)
+ W(5, -1132238288, -1130782536, 967940860, 1004223696)
+ W(6, -1131253088, -1143902384, 1020803060, 1032085971)
+ W(7, 1016311348, -1137376840, -1137115752, -1136984808);
WS(1060545080, -1126581603);
sum1 = W(0, 1032630360, 1037657648, 1043836232, 1037448680) + W(1, 976149203, 1031833596, 1031675647, 1025282125)
+ W(2, -1112268976, -1128752350, -1090205186, 1048595306)
+ W(3, 1057651571, 1035187792, 1034806588, 1043419290) + W(4, 1045186906, 1032285712, 1053340438, -1094666759)
+ W(5, -1082657749, -1092127852, -1104761760, -1096441814)
+ W(6, -1125010622, 1029508223, -1108078856, 1041691860)
+ W(7, 1042698525, 1040118132, 1033087420, 1034587656);
sum2 = W(0, -1123698886, -1126365381, -1135914762, -1121651762)
+ W(1, -1118625490, -1122087574, -1147908244, -1134045690)
+ W(2, 1034075649, 1026991402, 1019253181, 1047572688) + W(3, 1035108181, -1115886918, 1016718341, -1117034488)
+ W(4, 998149095, -1118780236, 1023543366, 1038479879)
+ W(5, -1114677625, 1011684618, -1132109957, -1137057610)
+ W(6, -1113635181, -1168196508, -1114469118, -1145545780)
+ W(7, 992781287, -1139655050, -1142844852, 1007905050);
WS(-1083899832, -1105526146);
sum1 = W(0, 1026357515, -1139718894, 1044187583, -1105573131)
+ W(1, -1104526300, -1119529939, -1125768375, -1132776678)
+ W(2, -1119744955, -1125720471, -1092285679, 1062437883)
+ W(3, 1058460257, 1022150135, 1033366698, 1009731342) + W(4, -1117075907, -1106102943, 1048719011, 1052836221)
+ W(5, -1089717563, -1123085499, -1114009838, -1112611206)
+ W(6, -1111407198, -1152407445, -1107209883, -1107292779)
+ W(7, -1122559055, -1119739267, -1119196243, -1129505495);
sum2 = W(0, -1110807022, -1129400032, -1105612607, -1123619892)
+ W(1, -1131262448, 1016529300, -1129444600, -1136239801)
+ W(2, 1025172792, -1117035240, 1035443403, -1135427545)
+ W(3, -1111010692, -1115955576, -1117326476, -1121250556)
+ W(4, 1033543849, 999654946, 1039345667, 1053020794) + W(5, 1047843748, -1135856481, 1022819536, 998047364)
+ W(6, -1123816828, -1144812946, -1120747576, -1113498942)
+ W(7, -1113301822, -1146605522, -1119691028, -1135792457);
WS(-1107513792, 1064663354);
sum1 = W(0, 1030862455, -1161118946, 1040244826, -1104300038)
+ W(1, -1102940181, -1135077628, -1135539484, 1011863892)
+ W(2, -1113532308, 1021510766, -1091621085, 1046262406)
+ W(3, 1054782000, 1019068110, 1036941280, -1128724830) + W(4, 1032378968, -1127591630, 1051734861, 1034822530)
+ W(5, -1095483267, 1031948820, -1172984259, -1120336759)
+ W(6, -1123071015, 1009770420, -1107582956, -1108820108)
+ W(7, -1125175670, 1025488559, -1126076542, 1036426604);
sum2 =
W(0, -1135206239, -1139065871, 1026230428, 1025917606) + W(1, 1027050280, 1013849783, 1016077019, -1127829351)
+ W(2, -1140752647, -1123380440, 988696695, -1092786651) + W(3, 1049996339, 1057784826, 1033822297, 1034470972)
+ W(4, 1022777359, 1021581075, -1122295168, -1085937537)
+ W(5, 1032573953, -1130048007, 1032545188, -1137094527) + W(6, 974924014, -1133276463, 1029689087, -1140169471)
+ W(7, -1135329695, -1124883951, 1011238415, 1001568686);
WS(1058918200, -1121082995);
sum1 = W(0, -1123354974, -1118620174, 1016130204, -1103968288) + W(1, -1101453425, -1117908518, -1122284590, -1122818124) + W(2, -1112248839, 1024662558, -1087068557, 1048182784) + W(3, 1059583965, -1119323982, 1027638054, -1137723992) + W(4, 1046299686, 1028038478, 1063313277, 1047279381) + W(5, -1088182320, 1034186247, -1124394588, 978245507) + W(6, -1143613552, -1129268360, -1103342192, -1115088511) + W(7, 1003350800, -1134684248, -1111377363, 1028117438); sum2 = W(0, -1162931039, -1136248556, -1115594515, -1119907402) + W(1, 1022791334, 1017500018, -1156766128, 1014782692) + W(2, -1131063526, -1131086728, -1128443230, 1044675527) + W(3, -1107588397, -1111169922, -1125594766, -1135599628) + W(4, 1029801649, 1031011705, 1042762789, 1050674207) + W(5, 1009001220, -1112569685, -1148613464, -1114139175) + W(6, -1117642655, -1128864654, -1107118398, -1113986381) + W(7, -1186206458, 1017255694, 993928432, 1007622876); WS(1038828992, 1041685264);
sum1 = W(0, -1114329248, 995958527, -1117673776, 1034401008) + W(1, 1038612842, 1035088196, 1008781376, 1029875310) + W(2, 1049950910, 1027336960, 1060640651, 1045782072) + W(3, -1098159517, -1106507532, -1124000392, -1105439902) + W(4, -1097681183, -1107326552, -1085831405, -1105157973) + W(5, 1053136924, 1032016120, 1023707152, 1034119968) + W(6, 1028668144, 1025858258, 1033402064, -1122828000) + W(7, -1110558370, -1113173980, 1012109856, -1114749520); sum2 = W(0, 1031315360, -1131767489, 1023637252, -1107869385) + W(1, -1107666272, -1117110288, -1124350185, -1110538107) + W(2, -1099468189, -1140834082, -1100127579, 1052854494) + W(3, 1034036134, 1024451620, 1003450083, 1041131277) + W(4, -1112139926, 1024287080, -1117241706, 1052996200) + W(5, 1027811452, 1027157968, -1131082337, 1035032776) + W(6, 1036663822, -1122285462, 1038018354, -1112496415) + W(7, -1110479054, -1112615559, 998992195, -1106762474); WS(-1086074680, 1053637716);
sum1 = W(0, -1121345387, -1148805338, -1136570733, 1049266593) + W(1, 1027081787, -1133439181, -1158465386, -1120818857) + W(2, 1042002951, -1165378922, 1052460699, -1094571489) + W(3, -1124281856, 1040734807, -1121708851, 1035650578) + W(4, -1113042450, -1115297518, -1107443934, -1098765182) + W(5, 1043313411, 1006695533, 1016359031, 1027853163) + W(6, -1121398619, 991217235, -1117268427, 1036113926) + W(7, -1136658365, -1112513138, 1021173351, -1106476275); sum2 = W(0, 1026517575, 1017334370, 1051558711, 1071692777) + W(1, 1049822619, 1020207734, 1022251878, 1019995718) + W(2, -1170492850, 1003954710, -1096673587, -1077357700) + W(3, -1098179385, 996694924, -1122577241, 1015494226) + W(4, -1138816415, -1132363566, -1136175651, -1098960792) + W(5, -1116986501, 1003290486, -1141894102, -1126828734) + W(6, -1143472678, 998846550, -1124275402, 1018703670) + W(7, 1007812651, 1007766851, 1009415395, -1163222937); WS(1051521136, 1027207116);
sum1 = W(0, -1122694020, -1121133108, -1102175837, -1116351908) + W(1, 1045518980, 1009393969, 1008379217, 1015772516) + W(2, 1010830545, -1124202632, 1057246783, 1026322980) + W(3, -1089509425, 1025178484, -1117338572, 1009646833) + W(4, -1124291704, 1037913146, -1093542759, 982577970) + W(5, 1055793637, -1118947636, 1001093793, 1001810977) + W(6, 1018062184, -1116091286, 1041281977, -1125394504) + W(7, 1008755233, -1127575032, 1015898776, -1121163492); sum2 = W(0, -1137495011, 1013616911, -1135578111, 1029346938) + W(1, 1017473747, 1012413655, -1133405571, 1008139351) + W(2, -1135527491, -1123650952, 1013443151, -1114797945) + W(3, -1122100892, -1128721387, -1166794345, -1156685818) + W(4, 1027730022, 1024465134, 1049128967, 1068130737) + W(5, 1046423571, -1143058109, 1020545683, -1126785325) + W(6, -1118108263, -1128775579, -1098008683, -1080443718) + W(7, -1101482344, -1137148015, -1128178767, 991435034); WS(1057767608, -1132080751);
sum1 = W(0, 1026028453, 1026017621, -1117947285, 1048679017) + W(1, 1025261389, 1011196341, 1015828970, 1025013027) + W(2, 1025766741, -1135552917, 1051769667, -1083959172) + W(3, -1120826122, 1045191525, 1028389741, 1039505775) + W(4, 1035118319, 1040474693, -1111744027, -1084413328) + W(5, 1049618505, -1110336171, 1028257397, -1123719333) + W(6, 1012106581, -1138611630, 1030333189, 1045191121) + W(7, -1122181545, 1030480605, 1027514349, 1020294666); sum2 = W(0, 1017587161, 1028118553, 1008167722, 1005112948) + W(1, -1125344655, 1015776789, -1148301500, -1138850406) + W(2, -1101123140, -1103020887, -1099714612, 1049070164) + W(3, 1032013714, 1022049457, 1009340114, -1167693540) + W(4, 1040188371, 1022642341, 1039093756, 1046164698) + W(5, -1111525569, -1098832696, -1115917000, -1103378287) + W(6, 988296658, 1010063898, 1026403646, 1033545355) + W(7, 1002132020, 1037334715, -1139728254, 1035581889); WS(-1099372256, -1088618788);
sum1 = W(0, -1112538182, -1113349022, -1103996671, 1024020908) + W(1, 1046614908, -1111225386, -1110537326, -1114146165) + W(2, 1048693927, 1033711782, 1064716592, -1143605597) + W(3, -1085173359, 1004694493, -1108087402, -1138402062) + W(4, -1112344546, -1129092599, -1086749016, 1044926535) + W(5, 1062252083, 1040479887, 1034104622, 1042110371) + W(6, -1109099742, -1110127398, 1032699126, -1121424940) + W(7, -1130166943, -1106709441, -1120726228, -1106064827); sum2 = W(0, 987083788, -1144106823, -1120467381, -1121977305) + W(1, -1118651341, -1162389292, -1152635694, 964510307) + W(2, 1013472954, -1131186779, -1139561796, 1044091298) + W(3, 1038343490, -1115306287, -1155962630, -1117365756) + W(4, -1120418118, -1122269098, 1038342084, 1042996066) + W(5, -1118476220, -1128689408, -1132569906, -1141833923) + W(6, 979955865, -1163904780, -1115615181, -1127292875) + W(7, -1123141745, 1014320394, -1135582470, 1008840046); WS(1041282784, 1044242623);
sum1 = W(0, -1119885764, -1119816052, -1100880653, -1113812594) + W(1, 1046531310, 1015358999, 1005085853, 1021011107) + W(2, -1171512555, -1121861252, 1057266723, 1010135439) + W(3, -1089952515, 1031135156, -1115226950, 1003139037) + W(4, 1003864029, 1040963149, -1094412795, -1118004569) + W(5, 1056310444, -1114099002, 1015234855, 992693307) + W(6, 1025494836, -1113504879, 1043843337, -1125989575) + W(7, -1156936827, -1122714492, 1003362397, -1120612644); sum2 = W(0, 1005317381, -1143827754, 1019060164, 1009279342) + W(1, 1014092605, -1131000233, 1012779564, -1137528453) + W(2, -1142619324, 1012902153, -1114788024, -1098688460) + W(3, -1120377499, 1017453102, -1139793504, -1147729078) + W(4, -1126266146, -1128784654, -1094218173, -1078812823) + W(5, -1099532818, 1007638067, -1130333980, 1018177647) + W(6, 1026462555, 1020893616, 1054132458, 1070492026) + W(7, 1048935725, 1011358224, 1015734963, 987943782); WS(1046635232, 1024078131);
sum1 = W(0, 1002735212, 1025138813, 1017042555, -1132524982) + W(1, 1031803657, 1033606155, 1021596219, 1015508823) + W(2, 1035063871, 1034039879, 1044122447, -1110416695) + W(3, -1092481954, -1094320024, -1107502027, -1103391009) + W(4, -1097977761, -1105608655, -1094991056, 1051547730) + W(5, 1050188814, 1047410847, 1031346589, 1046101811) + W(6, 1040314319, 1035664624, 1038536855, -1114843703) + W(7, 1003107468, 1019470987, 1021345835, -1136683190); sum2 = W(0, -1096475926, -1112281069, -1116645717, 1041647377) + W(1, -1113263240, 1023473494, -1117017643, 1039764966) + W(2, 1044036812, -1112231286, -1111398905, -1096068583) + W(3, 1026411348, -1114320784, -1138574198, -1104710548) + W(4, 1052862983, 1024115789, 1051331710, 1038036111) + W(5, 1042458641, 1028002558, 1037890580, -1106844581) + W(6, -1106234474, -1121785528, -1130292776, 1037359643) + W(7, -1111704128, -1123406807, -1109714921, 1041123403); WS(-1088554040, -1076674880);
sum1 = W(0, 1026292820, 1016736263, 1042339025, -1110236986) + W(1, -1103961368, 1016313655, -1151588920, 1014755782) + W(2, -1132973070, 1034501898, -1089525132, 1037427962) + W(3, 1056478885, -1118983748, 1038469390, -1123320716) + W(4, -1144171612, -1110973538, 1052671191, -1123890785) + W(5, -1092344862, 1041641985, 1010979982, 1017396903) + W(6, -1130131975, 1028857234, -1108906970, -1112145786) + W(7, 1002874044, 1025897228, -1130905399, 1033705562); sum2 = W(0, 1013915195, 1007730851, 981970521, 1028189234) + W(1, -1121038101, -1142484934, -1146368902, 1010322539) + W(2, -1133182691, 1024414743, -1111248658, 1040952978) + W(3, -1109339192, -1094377458, -1116592821, -1112421528) + W(4, -1127318198, -1121307593, 1035588225, 1057294107) + W(5, -1107404728, 1024397525, -1118541421, 1019759378) + W(6, 1020584890, 1005058566, -1124411850, 1029625115) + W(7, 1026110889, 1023925523, -1140327971, -1199698720); WS(1063581112, 1015292283);
sum1 = W(0, -1123806598, -1124445804, 1036776315, -1112186771) + W(1, -1102837698, -1117642958, -1137018200, 1010580432) + W(2, -1125096044, 1037634467, -1083793455, -1098422117) + W(3, 1058965073, -1106625757, 1036181095, -1127761788) + W(4, 1046804719, 1028314614, 1064148787, 1034155462) + W(5, -1089226130, 1037373467, 994321759, 1021285644) + W(6, -1117498166, 1006823135, -1106689849, 1004978479) + W(7, 1033810693, 1029436414, -1119765454, 1034713459); sum2 = W(0, -1127012521, -1129458054, -1134632819, -1113739078) + W(1, -1106078947, -1115241196, -1122814807, -1119046895) + W(2, -1110373665, -1122115974, -1118435057, 1041258512) + W(3, 1025961773, 1041055451, 1025056413, 1029845331) + W(4, -1121983257, -1121551577, -1107711610, 1043546644) + W(5, 1048226293, -1131486243, -1139476701, 1018415015) + W(6, 1021812843, 1015201109, 1039413537, -1127386873) + W(7, -1110385416, -1135801459, -1132245806, -1140149017); WS(-1109010880, -1087548956);
sum1 = W(0, 1034947768, 1034343312, 1016751552, 1019661856) + W(1, -1126501287, 1017025648, 1030724160, -1140792121) + W(2, -1095012676, -1102610188, -1096454908, -1091443170) + W(3, -1131030249, 1042942296, 1019936112, 1024647464) + W(4, 1046023882, 1039446704, 1042564604, -1105694067) + W(5, 1044246468, -1103700296, -1141422594, -1107855416) + W(6, 1029737824, 1025692706, 1038373096, 1039271048) + W(7, 1012879825, 1041317114, 1029263800, 1041193844); sum2 = W(0, 1034034732, -1113997103, 1028771217, 1025456143) + W(1, 1035315492, -1102403849, 1018212015, -1099438501) + W(2, -1107522705, -1121503695, -1114143244, -1105773446) + W(3, -1109746606, -1106040358, -1106459627, 1039219872) + W(4, -1105460279, 1038975878, 1032873918, 1059420344) + W(5, 1040681265, 1046039582, 1026290649, 1046943722) + W(6, 1021740679, -1112744336, -1121564954, 1024971971) + W(7, -1122379806, -1106873869, -1130313815, -1105420350); WS(-1086299832, -1077288694);
sum1 = W(0, 1021716686, -1132921948, -1113901292, -1108835908) + W(1, 1023078294, 965138311, 998651320, -1130194922) + W(2, -1099039878, -1108540692, -1158126306, 1055092577) + W(3, -1089051407, -1113759276, -1107695832, -1113503632) + W(4, -1111509136, 1021468846, -1096197083, 1062013047) + W(5, 1050708993, 1022391342, -1133490396, 991635057) + W(6, 1039618828, -1131678690, 1041516082, -1118733319) + W(7, -1122936235, 1015065790, 997649137, 1023538631); sum2 = W(0, -1133976495, 1027343155, -1149877867, -1119506980) + W(1, -1126385541, 1016526837, 994153115, -1140785051) + W(2, 1035891239, 1030599513, 1037590422, 1054189655) + W(3, 1041308688, -1112736561, 1004824957, -1120347934) + W(4, -1130801609, -1108453664, 1012747883, -1119322812) + W(5, -1107379808, -1119223720, -1116360142, -1129452107) + W(6, -1113698362, 1016406968, -1108226898, -1120928356) + W(7, 1016225738, 988482485, 1018050885, -1117792638); WS(-1113279936, 1066223903);
sum1 = W(0, -1128171420, -1141738481, -1138940153, 1044160156) + W(1, 1020686276, -1129655596, 964173357, -1123239900) + W(2, 1040261344, -1140107833, 1050703688, -1100167260) + W(3, -1130335589, 1035637471, -1124525100, 1032644739) + W(4, -1112013315, -1116929726, -1108200895, -1100730273) + W(5, 1040782300, 1024316286, 1014134393, 1029624526) + W(6, -1123695998, -1154978689, -1123177006, 1034288823) + W(7, -1141423761, -1114187043, 1013984857, -1108229911); sum2 = W(0, -1115606620, -1125272644, -1099614716, -1080575150) + W(1, -1102155153, -1122847678, -1123394906, -1129056732) + W(2, 1021458196, 1017345212, 1047257730, 1068148121) + W(3, 1044966894, 1026244022, 1017049220, 1010161976) + W(4, 1009639320, 1016051020, -1120838650, -1113655261) + W(5, -1132238288, -1130782536, 967940860, 1004223696) + W(6, -1131253088, -1143902384, 1020803060, 1032085971) + W(7, 1016311348, -1137376840, -1137115752, -1136984808); WS(1060545080, -1126581603);
sum1 = W(0, 1032630360, 1037657648, 1043836232, 1037448680) + W(1, 976149203, 1031833596, 1031675647, 1025282125) + W(2, -1112268976, -1128752350, -1090205186, 1048595306) + W(3, 1057651571, 1035187792, 1034806588, 1043419290) + W(4, 1045186906, 1032285712, 1053340438, -1094666759) + W(5, -1082657749, -1092127852, -1104761760, -1096441814) + W(6, -1125010622, 1029508223, -1108078856, 1041691860) + W(7, 1042698525, 1040118132, 1033087420, 1034587656); sum2 = W(0, -1123698886, -1126365381, -1135914762, -1121651762) + W(1, -1118625490, -1122087574, -1147908244, -1134045690) + W(2, 1034075649, 1026991402, 1019253181, 1047572688) + W(3, 1035108181, -1115886918, 1016718341, -1117034488) + W(4, 998149095, -1118780236, 1023543366, 1038479879) + W(5, -1114677625, 1011684618, -1132109957, -1137057610) + W(6, -1113635181, -1168196508, -1114469118, -1145545780) + W(7, 992781287, -1139655050, -1142844852, 1007905050); WS(-1083899832, -1105526146);
sum1 = W(0, 1026357515, -1139718894, 1044187583, -1105573131) + W(1, -1104526300, -1119529939, -1125768375, -1132776678) + W(2, -1119744955, -1125720471, -1092285679, 1062437883) + W(3, 1058460257, 1022150135, 1033366698, 1009731342) + W(4, -1117075907, -1106102943, 1048719011, 1052836221) + W(5, -1089717563, -1123085499, -1114009838, -1112611206) + W(6, -1111407198, -1152407445, -1107209883, -1107292779) + W(7, -1122559055, -1119739267, -1119196243, -1129505495); sum2 = W(0, -1110807022, -1129400032, -1105612607, -1123619892) + W(1, -1131262448, 1016529300, -1129444600, -1136239801) + W(2, 1025172792, -1117035240, 1035443403, -1135427545) + W(3, -1111010692, -1115955576, -1117326476, -1121250556) + W(4, 1033543849, 999654946, 1039345667, 1053020794) + W(5, 1047843748, -1135856481, 1022819536, 998047364) + W(6, -1123816828, -1144812946, -1120747576, -1113498942) + W(7, -1113301822, -1146605522, -1119691028, -1135792457); WS(-1107513792, 1064663354);
sum1 = W(0, 1030862455, -1161118946, 1040244826, -1104300038) + W(1, -1102940181, -1135077628, -1135539484, 1011863892) + W(2, -1113532308, 1021510766, -1091621085, 1046262406) + W(3, 1054782000, 1019068110, 1036941280, -1128724830) + W(4, 1032378968, -1127591630, 1051734861, 1034822530) + W(5, -1095483267, 1031948820, -1172984259, -1120336759) + W(6, -1123071015, 1009770420, -1107582956, -1108820108) + W(7, -1125175670, 1025488559, -1126076542, 1036426604); sum2 = W(0, -1135206239, -1139065871, 1026230428, 1025917606) + W(1, 1027050280, 1013849783, 1016077019, -1127829351) + W(2, -1140752647, -1123380440, 988696695, -1092786651) + W(3, 1049996339, 1057784826, 1033822297, 1034470972) + W(4, 1022777359, 1021581075, -1122295168, -1085937537) + W(5, 1032573953, -1130048007, 1032545188, -1137094527) + W(6, 974924014, -1133276463, 1029689087, -1140169471) + W(7, -1135329695, -1124883951, 1011238415, 1001568686); WS(1058918200, -1121082995);
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
}
// 2x3 matrix that is multiplied (via mul) with an RGB triple to produce a
// float2 of chroma values; imageStoreOverride uses it on the sampled INPUT colour.
// NOTE(review): the coefficients look like rounded BT.601 Cb/Cr weights - confirm.
const static float2x3 rgb2uv = {
-0.169, -0.331, 0.5,
0.5, -0.419, -0.081
};
// Scratch array of 525 floats that Pass2 fills and then reads back; Pass2
// decodes an index as x = id / 15, y = id % 15, i.e. 35 groups of 15 entries.
// NOTE(review): `shared` is not defined in the visible code - presumably an
// include maps it to `groupshared`; a `groupshared float inp[525];` with the
// same name also appears a few lines below. Confirm only one is meant to remain.
shared float inp[525];
// 3x3 matrix that is multiplied (via mul) with float3(luma, chroma.xy) to get
// an RGB triple; imageStoreOverride feeds it float3(value, UV).
// NOTE(review): the very small terms (-0.00093, 0.00099) are presumably
// residue from inverting the forward matrix - confirm before rounding them away.
const static float3x3 yuv2rgb = {
1, -0.00093, 1.401687,
1, -0.3437, -0.71417,
1, 1.77216, 0.00099
};
// Index of the pass being compiled; Pass2 compares it with LAST_PASS in an
// `#if CURRENT_PASS == LAST_PASS` block.
#define CURRENT_PASS 2
// 525-float scratch array indexed by Pass2.
// NOTE(review): an `inp` array of the same size is also declared above with
// the `shared` qualifier - verify which declaration is intended to survive.
groupshared float inp[525];
// Reduces a colour to one scalar by dotting its rgb with rgb2y (rgb2y is
// defined outside this section).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// Redirects GLSL-style imageStore calls to imageStoreOverride: the image
// argument is dropped and only the x component of the value is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Writes one output pixel: `value` is used as the first (luma) component, the
// two chroma components are taken from INPUT sampled with the linear sampler
// at HOOKED_map(pos), and the recombined colour is stored with alpha 1.0.
void imageStoreOverride(uint2 pos, float value) {
	// Chroma comes from the source image, not from the scalar being stored.
	const float3 sourceRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	const float2 chroma = mul(rgb2uv, sourceRgb);

	// Recombine the supplied luma with the sampled chroma and convert to RGB.
	const float3 rgb = mul(yuv2rgb, float3(value, chroma));
	OUTPUT[pos] = float4(rgb, 1.0);
}
// Samples INPUT at pos and reduces the colour to a scalar through GET_SAMPLE.
// NOTE(review): `texture` and `vec4` are GLSL-style names not defined in the
// visible code - presumably supplied by an include.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Input size and texel size as float2, as returned by GetInputSize/GetInputPt.
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());
// Reads the x channel of the `temp` texture at pos as a float.
#define temp_tex(pos) (float(texture(temp, pos).x))
// `temp` dimensions: input width x 1, input height x 2.
static const float2 temp_size = float2(GetInputSize().x * 1, GetInputSize().y * 2);
// Reciprocal of temp_size, i.e. the size of one temp texel in UV units.
static const float2 temp_pt = float2(1.0 / (temp_size.x), 1.0 / (temp_size.y));
// For this pass the HOOKED_* aliases resolve to the `temp` texture.
#define HOOKED_tex(pos) temp_tex(pos)
#define HOOKED_size temp_size
#define HOOKED_pt temp_pt
// Entry point of pass 2: loads a tile of samples into `inp`, synchronises the
// group, gathers a 32-sample window per thread, runs nnedi3 on it and writes
// two horizontally adjacent output pixels (the pass-through sample and the
// nnedi3 result).
//
// NOTE(review): as it appears here the body interleaves two variants of the
// same pass - one written with HLSL names (blockStart/threadId,
// GroupMemoryBarrierWithGroupSync, CheckViewport, WriteToOutput) and one with
// GLSL-style names (gl_WorkGroupID, barrier, vec4, imageStore). Declarations
// such as group_base, the load loop header, destPos, ret, ret0 and samples
// occur twice and the braces do not pair up. The comments below describe
// individual statements only; reconcile the two variants before relying on
// this block as a whole.
void Pass2(uint2 blockStart, uint3 threadId) {
const float2 inputPt = GetInputPt();
const float2 outputPt = GetOutputPt();
// Variant A: tile origin; x is halved relative to blockStart.
const uint2 group_base = uint2(blockStart.x >> 1, blockStart.y);
// Variant A load loop header: each thread starts at its own linear index and
// strides by the thread count until all 525 entries are covered.
for (int id = threadId.x * MP_NUM_THREADS_Y + threadId.y; id < 525; id += MP_NUM_THREADS_X * MP_NUM_THREADS_Y) {
// Variant B: tile origin and per-thread window base (x * 15 + y).
ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);
// Variant B load loop header, same striding scheme as above.
for (int id = int(gl_LocalInvocationIndex); id < 525; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
// Decode the linear index: 15 entries per x step.
uint x = (uint)id / 15, y = (uint)id % 15;
// Variant A fetch: offset (-1, -3) from the tile origin, y scaled by 0.5.
inp[id] = tex1.SampleLevel(sam, inputPt * float2(group_base.x + x - 1 + 0.5, (group_base.y + y - 3 + 0.5) * 0.5), 0).r;
// Variant B fetch: same (-1, -3) offset, addressed in HOOKED_pt units.
inp[id] =
HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (1)) + 0.5, float(group_base.y + y - (3)) + 0.5)).x;
}
// All loads into inp must be complete before any thread reads its window.
GroupMemoryBarrierWithGroupSync();
uint2 destPos = blockStart + uint2(threadId.x * 2, threadId.y);
if (!CheckViewport(destPos)) {
barrier();
vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);
// Window gather, written out explicitly: four runs of eight consecutive
// entries of inp, the runs being 15 entries apart (offsets 0, 15, 30, 45).
vec4 samples[8];
samples[0][0] = inp[local_pos + 0];
samples[0][1] = inp[local_pos + 1];
samples[0][2] = inp[local_pos + 2];
samples[0][3] = inp[local_pos + 3];
samples[1][0] = inp[local_pos + 4];
samples[1][1] = inp[local_pos + 5];
samples[1][2] = inp[local_pos + 6];
samples[1][3] = inp[local_pos + 7];
samples[2][0] = inp[local_pos + 15];
samples[2][1] = inp[local_pos + 16];
samples[2][2] = inp[local_pos + 17];
samples[2][3] = inp[local_pos + 18];
samples[3][0] = inp[local_pos + 19];
samples[3][1] = inp[local_pos + 20];
samples[3][2] = inp[local_pos + 21];
samples[3][3] = inp[local_pos + 22];
samples[4][0] = inp[local_pos + 30];
samples[4][1] = inp[local_pos + 31];
samples[4][2] = inp[local_pos + 32];
samples[4][3] = inp[local_pos + 33];
samples[5][0] = inp[local_pos + 34];
samples[5][1] = inp[local_pos + 35];
samples[5][2] = inp[local_pos + 36];
samples[5][3] = inp[local_pos + 37];
samples[6][0] = inp[local_pos + 45];
samples[6][1] = inp[local_pos + 46];
samples[6][2] = inp[local_pos + 47];
samples[6][3] = inp[local_pos + 48];
samples[7][0] = inp[local_pos + 49];
samples[7][1] = inp[local_pos + 50];
samples[7][2] = inp[local_pos + 51];
samples[7][3] = inp[local_pos + 52];
ret[0] = nnedi3(samples);
// Pass-through value: the same entry that was stored in samples[2][3].
ret0[0] = inp[local_pos + 18];
#if CURRENT_PASS == LAST_PASS
uint2 destPos = blockStart + threadId.xy * 2;
uint2 outputSize = GetOutputSize();
// Threads whose first pixel lies outside the output do nothing.
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
return;
}
float4 ret = 0.0;
float4 ret0 = 0.0;
float4 samples[8];
const uint local_pos = threadId.x * 15 + threadId.y;
// Loop form of the window gather: (i / 2) * 15 + (i % 2) * 4 + j yields the
// same offsets as the explicit assignments above.
[unroll]
for (int i = 0; i < 8; ++i) {
[unroll]
for (int j = 0; j < 4; ++j) {
samples[i][j] = inp[local_pos + (i / 2) * 15 + (i % 2) * 4 + j];
}
}
// First pixel: existing sample samples[2][3] combined with chroma taken from INPUT.
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
WriteToOutput(destPos, mul(yuv2rgb, float3(samples[2][3], originUV)));
// Second pixel, one to the right: the nnedi3 result, skipped if outside the viewport.
++destPos.x;
if (!CheckViewport(destPos)) {
return;
}
originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
WriteToOutput(destPos, mul(yuv2rgb, float3(nnedi3(samples), originUV)));
#endif
// GLSL-style stores: ret0 goes to column 2 * x, ret to column 2 * x + 1.
// (imageStore is a macro that forwards to imageStoreOverride.)
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1), ret0);
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1) + ivec2(1, 0), ret);
}

View file

@ -0,0 +1,953 @@
// This file is generated by the scripts available at https://github.com/hauuau/magpie-prescalers
// Please don't edit this file directly.
// Generated by: nnedi3.py --nns 16 --win 8x6 --use-compute-shader --use-magpie
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME NNEDI3_016_6
//!TEXTURE
Texture2D INPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam_INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 1 * 2
//!HEIGHT INPUT_HEIGHT * 2 * 1
Texture2D OUTPUT;
//!SAMPLER
//!FILTER LINEAR
SamplerState sam_INPUT_LINEAR;
//!TEXTURE
//!FORMAT R16_FLOAT
//!WIDTH INPUT_WIDTH * 1
//!HEIGHT INPUT_HEIGHT * 2
Texture2D temp;
//!SAMPLER
//!FILTER POINT
SamplerState sam_temp;
//!COMMON
#include "prescalers.hlsli"
#define LAST_PASS 2
//!PASS 1
//!DESC NNEDI3 (double_y, nns16, win8x6)
//!IN INPUT
//!OUT temp
//!BLOCK_SIZE 32, 16
//!NUM_THREADS 32, 8
#pragma optionNV(inline none)
float nnedi3(vec4 samples[12]) {
float sum = 0.0, sumsq = 0.0;
[unroll] for (int i = 0; i < 12; i++) {
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
sumsq += dot(samples[i], samples[i]);
}
float mstd0 = sum / 48.0;
float mstd1 = sumsq / 48.0 - mstd0 * mstd0;
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
mstd1 *= mstd2;
float vsum = 0.0, wsum = 0.0, sum1, sum2;
#define T(x) intBitsToFloat(x)
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
#define WS(w0, w1) \
sum1 = exp(sum1 * mstd2 + T(w0)); \
sum2 = sum2 * mstd2 + T(w1); \
wsum += sum1; \
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
sum1 =
W(0, -1126897990, 1027745880, 1024250604, 1024642508) + W(1, -1121959908, -1149906049, -1130469888, -1121396864)
+ W(2, 1039079928, -1107295041, -1147395201, -1126556538)
+ W(3, -1113607518, 1041026790, 1022159130, 1044630722) + W(4, -1107136294, 1005058137, -1116173177, 1042195560)
+ W(5, -1098313415, 999141354, 1019497054, 1007702352) + W(6, 1015526727, 1018714920, 1042189511, -1106681307)
+ W(7, 1035880216, -1121374916, -1133977224, 1026239260)
+ W(8, -1106606352, 1038936227, -1124106064, 1025050132) + W(9, 990390561, -1131068140, 1013770942, -1122507740)
+ W(10, -1136584888, -1135809122, -1122292152, 1015308851)
+ W(11, -1122039043, 1031978820, -1116330759, 1018900008);
sum2 = W(0, 1017133506, 1011515348, -1139818306, -1123730089) + W(1, 996184056, -1138856554, 1023321012, 1029416248)
+ W(2, -1115999672, 1020129658, 1015618084, 1007066512)
+ W(3, -1119553894, 1057587887, -1090489276, -1109933138)
+ W(4, 1016266760, -1145378916, -1112177411, 1071604647)
+ W(5, -1079392139, -1097028615, 1028448562, 1008681896)
+ W(6, -1165256880, 1051025857, -1098617840, -1105405946)
+ W(7, -1155286464, 1000343320, -1133574805, 1035052104)
+ W(8, -1139515542, -1135392452, -1138601606, 991053648)
+ W(9, 1020043526, 1010374724, -1121583660, -1142174380)
+ W(10, 997185888, -1155288808, -1135761830, 1018728192)
+ W(11, 1024878156, 1002597928, -1131188096, -1132781834);
WS(1018288640, 1027735986);
sum1 =
W(0, 1012158232, -1178449286, 1044498160, -1128542910) + W(1, -1115962871, 1002517720, 1006778572, -1114624234)
+ W(2, 1032943202, 1027108853, 1017365062, 964628492) + W(3, 1025063871, -1104570115, 1059928494, -1088743921)
+ W(4, 1032615126, -1134936888, -1156175041, 1028919475)
+ W(5, -1097612337, -1106124541, 1026836706, -1146238776)
+ W(6, 1010747802, 1034856692, -1085331503, 1059914122)
+ W(7, -1114177498, 1020458158, -1140348884, -1127457566)
+ W(8, 1031833306, 1032056909, -1122073627, 1016604174) + W(9, 1020162890, -1122825993, -1119592595, 1033999672)
+ W(10, 1022377282, 998219705, -1172026051, -1115773453)
+ W(11, 1038136595, 1027508251, -1129465364, 1023799671);
sum2 =
W(0, -1126840972, -1130460798, 1019075916, 1017322604) + W(1, -1131054760, -1131047996, -1145399745, 985194115)
+ W(2, -1120812206, -1129997452, 1006903064, -1143360737)
+ W(3, -1139273136, -1112997847, -1139625904, 1042717692)
+ W(4, -1114175000, -1130986946, 991527106, -1120456092) + W(5, 1043975251, 1051048254, -1113881740, 1007107280)
+ W(6, -1135317632, 1001121889, -1150833602, -1121880440)
+ W(7, 978663174, -1143215153, -1139461992, 1017866680)
+ W(8, -1128878392, -1112673669, 1026044394, -1125685806)
+ W(9, -1129486378, 1006765920, -1133504840, -1126929736)
+ W(10, 1014584312, -1144361281, 995542402, 1000306721)
+ W(11, -1142139489, -1114488494, 1007041936, -1134951296);
WS(1042433344, -1111851638);
sum1 = W(0, -1128612156, -1112658226, -1119638967, 1043958886)
+ W(1, -1120465263, -1128976934, -1139940268, -1123380939)
+ W(2, -1126908022, 1033805831, -1115346894, -1142120768)
+ W(3, -1122042583, -1128727592, -1097703246, 1057665642)
+ W(4, -1104545545, 1005565040, 984858240, -1107767030)
+ W(5, 1052387104, 1046318672, -1108167869, -1148354296)
+ W(6, 999630836, -1114896432, 1054789077, -1095395475)
+ W(7, 1029397739, -1133849404, -1146630760, -1115281716)
+ W(8, 1030603948, -1117224401, -1163176544, -1117808895)
+ W(9, -1126512698, -1129996802, 1028419819, -1123618471)
+ W(10, -1117439993, 1013349902, 996431920, -1123547845)
+ W(11, 1026334318, -1113258842, -1134051464, -1120421311);
sum2 =
W(0, 1022431497, -1109389142, 1004613154, 1028727631) + W(1, 1029503922, -1132574761, -1132240188, -1119299282)
+ W(2, -1139248009, -1129989652, -1140046689, -1114039002)
+ W(3, 1024165374, -1107432916, 1041447926, 1047487962) + W(4, 1017218352, -1135952741, -1114822837, 1044244351)
+ W(5, -1108646182, -1100679909, 1040665470, -1123756570)
+ W(6, -1120729932, 1031006195, 1047688354, -1126089152)
+ W(7, -1120804126, -1148002498, -1124855948, 983982854) + W(8, 1009435309, 1033956847, -1107003694, 1028342876)
+ W(9, -1126342960, -1158996358, -1122846542, -1123334894)
+ W(10, -1140927562, -1117057946, -1128289576, -1121099750)
+ W(11, 1036127241, 1039673953, -1102421772, 1026336008);
WS(1015433728, 1058400049);
sum1 =
W(0, -1139873791, 1031161269, -1113693508, 1033801204) + W(1, -1119172737, -1143910182, -1133909491, 1032977294)
+ W(2, -1112917766, -1131731326, 989007258, 1019358132) + W(3, 1023506921, -1116372870, -1116140698, 1045725159)
+ W(4, -1122523445, 1008313039, -1230944644, 1035249566)
+ W(5, -1103376612, -1102794347, 1044071755, -1115540344)
+ W(6, -1118840528, -1120831281, 1044830734, -1116748777)
+ W(7, 1030473357, -1126204226, 1028378783, -1114963068)
+ W(8, -1141442286, 1032646513, 1018738506, -1118552369)
+ W(9, -1121050287, 1032892305, 1023234585, -1112562780)
+ W(10, 1021910870, 1016154651, 1033465034, -1105610222)
+ W(11, 1034039600, 1030129285, -1122899972, -1124368226);
sum2 = W(0, -1138428449, -1158711528, -1124467432, -1140697417)
+ W(1, 1030243467, 1012442941, 992976916, 1013039401) + W(2, -1130455464, -1123518198, 1033499227, 975746961)
+ W(3, -1142924106, -1128734961, -1113146735, -1099387353)
+ W(4, 1051222006, -1122081826, 976851025, 1036130613)
+ W(5, -1097860430, -1077268149, 1072898808, -1117904739)
+ W(6, 989093448, 1010050489, -1108810723, -1091225653) + W(7, 1056060393, -1131990027, 997652548, -1137359275)
+ W(8, -1122996798, 1032494444, 1025590581, 951236744) + W(9, -1153131756, 990210276, -1140348735, -1115493835)
+ W(10, 1025171621, 1006284898, -1134977059, -1138876101)
+ W(11, -1127238416, 1018469149, 1026307569, -1146863422);
WS(-1143089152, 1030017260);
sum1 =
W(0, 1012276081, -1116644609, 1019444907, -1124688427) + W(1, 1029853709, -1130860131, 1001605962, -1127223379)
+ W(2, -1119160665, 1035777366, -1136557285, -1130309965)
+ W(3, 1024406997, -1109637089, 1048989101, -1098625404)
+ W(4, 1038057505, -1130883561, -1155861797, -1115433381)
+ W(5, 1044433671, 1006101820, -1111190908, 1009046005) + W(6, -1155627981, 1036571679, -1098184025, 1048780603)
+ W(7, -1112291813, 1025361773, -1122534699, 1028189701)
+ W(8, 1039597237, -1104960796, -1130076067, 1018788475)
+ W(9, 1018348791, -1126280255, -1117935161, 1029641477)
+ W(10, 1012573277, -1125993892, -1120990241, 1036379833)
+ W(11, -1136463217, -1111599465, -1154886405, 1020397819);
sum2 = W(0, -1153319600, 1008405084, -1118973116, -1140784820)
+ W(1, 1012585128, 1010769460, -1147284080, 985822624) + W(2, 1010505984, -1129308604, 1021293048, 1001814848)
+ W(3, 1008968960, -1142311064, -1101248908, 1037448945) + W(4, 1024969278, -1160749952, 995456320, 1022276922)
+ W(5, -1089187936, 1057794596, 1033366347, -1123619202)
+ W(6, -1140178660, -1140411728, -1109859050, 1029773785)
+ W(7, 1024400778, -1136545168, -1146954776, 1005012008) + W(8, 1017518401, 1015531414, 1007802556, 1000322872)
+ W(9, -1142030464, 1003782736, 982409184, 974134143) + W(10, 1003482728, -1152799248, -1170856127, 1006946188)
+ W(11, 995727232, 960534268, 1009923956, 985284128);
WS(1064472528, -1121594920);
sum1 =
W(0, -1142654991, 1027230343, -1112807213, 1027061019) + W(1, -1128825126, -1164359388, -1143599223, 1032290711)
+ W(2, -1113392623, 1016010466, 991342574, 1014490160) + W(3, 1014568428, -1136037408, -1115590690, 1034098395)
+ W(4, 1008695068, -1148094031, 1010500896, 1002050167)
+ W(5, -1113734161, -1112872467, 1027642302, -1127829894)
+ W(6, -1124387333, -1122938499, 1038834309, -1130883382)
+ W(7, 1013984188, -1138058188, 1020884834, -1120250507) + W(8, 1029912912, 1015162858, 1015817710, -1124941766)
+ W(9, -1131205634, 1025589157, 1019867389, -1123484555)
+ W(10, 1015459258, 1008886302, 1026841191, -1110863224)
+ W(11, 1031947569, 1019435182, -1129521612, -1130075526);
sum2 =
W(0, 1003807591, -1154115373, 1000124719, 1017182228) + W(1, -1126980607, -1130234859, -1147429191, -1139843175)
+ W(2, 1001833687, 1024488826, -1116401990, 987658746) + W(3, 1002635095, 1018649088, 1008095031, 1040714709)
+ W(4, -1105844805, 1013729967, -1132089351, 1016729308)
+ W(5, -1105992985, 1063780536, -1085442794, 1024604622) + W(6, -1147602519, 1024344696, 1014141127, 1047200342)
+ W(7, -1101306502, 995366957, -1151072125, -1155997437)
+ W(8, -1132427785, 1020609216, -1122913939, -1147894927) + W(9, 964968041, 1001714367, -1141957575, 1023684454)
+ W(10, -1125194898, -1146690231, 1011860423, -1141691791)
+ W(11, -1139390003, 1017456200, -1128761080, -1146063807);
WS(1061878800, -1131153991);
sum1 =
W(0, -1123872727, 1015115512, -1099302516, 1041224340) + W(1, -1144166978, -1171049230, 1018625288, 1031144036)
+ W(2, -1102371221, 1009910425, 1014687697, 1022902338) + W(3, -1127640224, 1036357847, -1085394744, 1052022073)
+ W(4, -1115552350, -1132534141, 1026350045, -1108974562)
+ W(5, 1059569738, 1058525661, -1125187302, 1016189168) + W(6, 1013916191, -1107191102, 1050617832, -1088226291)
+ W(7, 1037730450, -1123531112, 1018183052, 1006433282) + W(8, 1032504563, -1097316565, 1040234099, -1127405808)
+ W(9, -1145362866, 1014427177, 1031877738, -1109508096) + W(10, 1015825508, 1018548825, 1016048056, 1026198990)
+ W(11, 1033421596, -1098228398, 1035235966, -1137247201);
sum2 = W(0, -1131301730, 1031269327, -1127010401, -1109842974)
+ W(1, -1181736700, -1180777340, 973798558, -1131640108)
+ W(2, 1028981651, -1125259759, -1167651134, -1160957999)
+ W(3, -1127780866, 1013454096, -1149526184, -1113692773)
+ W(4, -1123287814, 993986728, 1013478572, -1109509101) + W(5, 1051779317, 1047088883, -1109788940, 1020962386)
+ W(6, -1160424319, -1117315078, 1028380081, -1134194124)
+ W(7, -1115287133, -1136947718, -1135840779, -1131160392)
+ W(8, -1137527992, 1028175261, -1121515979, -1138138790)
+ W(9, -1164912671, -1145619912, 998238336, 1018886164)
+ W(10, -1125209194, -1152989064, -1138738786, -1127332243)
+ W(11, -1148504424, 1027237057, -1142455024, -1123011340);
WS(-1146021888, 1053974589);
sum1 =
W(0, 1029642476, -1119368753, 1042969521, -1095098901) + W(1, 1046685039, 984849429, 1013890275, -1134074211)
+ W(2, 1042359026, -1107285127, 1031018217, -1135393367)
+ W(3, -1176939092, 1007708103, 1045769551, -1096985546) + W(4, 1036262392, -1139413615, 1022266947, 1017736689)
+ W(5, -1101301107, 1034918881, 1003810877, 1024875117) + W(6, -1146466657, 1027345005, -1094644679, 1050538529)
+ W(7, -1120828825, -1172526890, 1004183253, 1032510570) + W(8, -1091538585, 1051699648, 1011534979, 1017671961)
+ W(9, -1160650069, 1019378973, -1107179580, 1036824506)
+ W(10, -1133351451, -1160823333, -1127783457, 1031489314)
+ W(11, -1095508207, 1048776768, 1035618600, 1006585957);
sum2 = W(0, 1031363252, -1091101506, 1048232756, 1057852755) + W(1, -1095952784, 1016290300, 1030774484, 1001500224)
+ W(2, -1110436898, -1132290932, -1131305343, -1126601761)
+ W(3, 1015165558, -1110787951, 1016237906, 1043794074) + W(4, -1113356328, 1003743696, 1007437656, 965388167)
+ W(5, 1014973676, 1047525730, -1152923833, 1022650220) + W(6, 1020087968, 1003188992, -1123006886, 1011818344)
+ W(7, -1111245491, 1021501454, -1158035650, 1041338676)
+ W(8, -1105090874, -1129296549, -1131940021, 1017537464)
+ W(9, -1137051446, -1134903850, -1123217223, 1034851396)
+ W(10, -1117639196, -1133259176, 1018262350, 1033269727)
+ W(11, -1104724635, -1106365430, 1024945328, 1019937714);
WS(-1077057896, -1083600334);
sum1 = W(0, 1017420011, 1011471785, 1029223422, -1116040414) + W(1, 1017123181, 1016511669, 1014201033, 1019976613)
+ W(2, -1126437509, 1015478313, 1024110818, -1167731667)
+ W(3, 1017846781, -1138042285, 1049638570, -1103217262) + W(4, 1023111893, 1009386661, 999765850, 1040273597)
+ W(5, -1090770241, -1087230893, 1030676769, 1023090125)
+ W(6, -1162024122, 1016487629, 1029091694, 1046437488) + W(7, -1112046985, 1020460717, 985808522, 1027730222)
+ W(8, 1037672698, 1024768280, -1120839802, 1025489318) + W(9, 1019153993, 1010855969, 1027546578, 1028909230)
+ W(10, 1023955584, -1134545259, 1011766057, 1025127228)
+ W(11, 1025680213, 1017109109, -1128064723, 1027741830);
sum2 =
W(0, 1023774756, -1107003878, 1020767940, -1118294055) + W(1, -1113997093, 1021408408, -1152708847, 1013240776)
+ W(2, -1108605887, -1128830540, -1139588328, -1119578529)
+ W(3, 1005727232, -1108761818, 1050907301, -1097736561)
+ W(4, 1032528025, -1135972104, -1128030280, 1032847770) + W(5, 1058054639, 1008347200, 1039669350, -1131826954)
+ W(6, 1004577664, 1024878510, -1106188814, 1049418167) + W(7, -1108856812, 999382680, -1116453887, -1129071264)
+ W(8, 1040942692, -1105809360, -1104688291, 1019392776)
+ W(9, 1020705336, -1124253692, -1115446820, 1014050712)
+ W(10, 1018266740, -1117167612, -1127775332, -1114566712)
+ W(11, 1042743894, -1132221182, -1103534695, 1022204104);
WS(1034686080, -1080904524);
sum1 =
W(0, -1139332721, 1025190657, -1143163562, 1041601261) + W(1, 1024768205, -1137907141, -1156631187, 1024127465)
+ W(2, 1040892278, 1028605547, -1129308018, 1012089369) + W(3, 1023562901, 1006799241, -1104914606, 1052908885)
+ W(4, -1117860929, 1019594656, 1011454089, -1145135178)
+ W(5, -1089193318, -1091833281, 1036300940, -1143330794)
+ W(6, 1009225011, -1129417722, 1043909393, -1103073573) + W(7, 1040987970, 992909011, 1012327853, 1017495114)
+ W(8, -1119873834, 1025246703, 1033652713, -1123933213)
+ W(9, 1010687981, 1027561839, -1136185891, -1124345098) + W(10, 1024209623, 1018355139, 1010798725, 1010795083)
+ W(11, -1118482716, 1032670633, 1027144528, -1123266333);
sum2 =
W(0, 998154484, -1124228589, -1132108902, -1115676434) + W(1, -1123985162, 1004957466, -1136847690, 1028193069)
+ W(2, -1123281782, -1123302060, -1132306691, 1011392625)
+ W(3, -1120010648, 1043298286, -1097765474, 1027211577)
+ W(4, -1114822183, -1127542967, -1145824866, -1115567961)
+ W(5, 1059221182, 1034703777, -1131429597, 1022587458) + W(6, 1015307650, -1106126812, 1048600788, -1099334080)
+ W(7, 1029215805, -1127163397, 994166396, -1111174068) + W(8, -1130476352, 1015056080, 1023836215, -1122559367)
+ W(9, 1000606426, -1128437454, 1026255089, -1137618020)
+ W(10, -1127893362, -1171736302, 1010815409, -1110538383)
+ W(11, -1118584150, 1028199647, 1025007180, -1124423270);
WS(-1097173920, -1100403112);
sum1 = W(0, -1133792968, -1126599342, 1026626987, -1109988694)
+ W(1, -1128510918, -1124691470, -1124511038, -1134319356)
+ W(2, -1112479512, -1122054529, -1138055228, -1131431128)
+ W(3, -1133667884, -1113753548, 1051379210, -1097159959)
+ W(4, 1031366423, -1128464692, -1126404688, -1113718896)
+ W(5, 1058852431, 1058630415, -1108453759, -1122909907)
+ W(6, -1129657589, 1034489098, -1097104011, 1049904553)
+ W(7, -1111244112, 1006087192, -1123548289, 1017816566)
+ W(8, 1007326848, -1104990865, -1129654222, -1138955724)
+ W(9, -1134226372, -1122628437, -1112737379, 983139170)
+ W(10, -1143321192, -1123473736, -1120375479, 1029275393)
+ W(11, -1116837058, -1110311540, -1132471000, -1149064600);
sum2 = W(0, -1133003813, -1145103116, -1105221269, 1033080040)
+ W(1, 1016862101, -1129731365, -1170659932, 1024883426)
+ W(2, -1117429423, 1028547885, -1128891234, -1147341896)
+ W(3, 1006656308, -1122208183, -1098340061, 1042272545)
+ W(4, -1121562483, -1121650606, 1031055883, -1101651786)
+ W(5, 1055658740, 1058321046, -1100689547, 1031708925)
+ W(6, -1122785076, -1107240567, 1035604404, -1112738821)
+ W(7, -1115182870, -1123396988, -1138148825, -1137951645)
+ W(8, -1131811521, 1003752088, 1026865631, -1133076983)
+ W(9, -1134424500, -1131665157, -1130287800, 1015669581)
+ W(10, -1129373191, -1131162259, -1131089901, -1116779622)
+ W(11, -1123356625, 1033205575, -1134576021, -1127933595);
WS(1049422752, 1064394145);
sum1 = W(0, 1016583527, -1106085006, 995307718, 1042273115) + W(1, -1113049442, 1025810280, 997641734, -1123841888)
+ W(2, 1031369872, 1021597381, -1122854832, 1006187755)
+ W(3, -1129211865, 1041111742, -1088517333, 1058826428)
+ W(4, -1113933244, 1019889767, -1131677043, 1032245856)
+ W(5, -1098988005, -1105331685, 1032610296, -1131685097)
+ W(6, 1021172552, -1110939130, 1058612208, -1090507155)
+ W(7, 1037338632, -1155049030, 1021691141, -1105269375)
+ W(8, 1030057089, 1043687978, -1122591528, -1134096210)
+ W(9, -1133007562, -1137128282, 1036830720, -1120823228)
+ W(10, -1116248270, 1025994697, 1026669144, -1106745812)
+ W(11, 1034516890, 1038691348, -1117945591, -1126546729);
sum2 = W(0, 1015668141, -1138201662, -1111996311, -1127284815)
+ W(1, -1125087482, 1020174885, -1124041461, -1140877219)
+ W(2, -1116450062, -1123578506, 1024732308, -1139064970)
+ W(3, 1005775275, 1027346708, -1125910350, -1106280325)
+ W(4, 1034158307, -1133423524, 1015274173, 1016303395)
+ W(5, -1108948194, 1052974100, 1032925063, -1161498797)
+ W(6, -1138139200, -1106503093, -1104963655, 1053021197)
+ W(7, -1107449032, -1134898868, 992639399, -1117618841) + W(8, 1031763952, 957951850, 994113735, 1013272790)
+ W(9, -1132053353, -1115775134, 1015724405, 1016609913)
+ W(10, -1132927280, -1132485274, -1129319398, -1122071744)
+ W(11, 1034411590, -1140595900, -1140186580, -1164791981);
WS(-1101497152, -1084603877);
sum1 =
W(0, -1136425045, 1016522037, 967194407, 1019848413) + W(1, -1129523533, -1142614610, -1140218249, -1157845066)
+ W(2, 1029505522, -1119357636, -1140249161, -1135395837)
+ W(3, -1121565262, 1035402982, 1022903246, 1027088345)
+ W(4, -1121932442, -1148904362, -1122160667, 1027884002)
+ W(5, -1107598171, 1024422013, -1127296803, 1002411186) + W(6, 1006883159, 1025282390, 1025270942, -1117602990)
+ W(7, 1030372258, -1130529549, -1132497425, 1022271101)
+ W(8, -1120772739, 1030415880, -1129818261, 1018540973)
+ W(9, 1004502690, -1138792353, -1154700189, -1171556244)
+ W(10, -1138666305, -1138856043, -1128604789, 995143101)
+ W(11, -1128284203, 1025955498, -1121511513, 1011955033);
sum2 = W(0, -1126668299, -1131366283, 1024971228, 1000957181)
+ W(1, -1151515419, 1005199725, -1137964827, -1117612139)
+ W(2, 1034620123, -1119890411, -1145021381, -1136862175)
+ W(3, 1015963121, -1097765254, 1049249869, 1026062254) + W(4, 1001872029, 1007955643, 1030757650, -1083955387)
+ W(5, 1064229708, -1107214224, 1026637176, -1125717658)
+ W(6, -1137547503, -1103492737, 1047078464, -1122275403)
+ W(7, 1027173860, -1169614250, 997720155, -1118797430) + W(8, 1017921725, 1016072153, -1135832789, 923654805)
+ W(9, -1132279825, -1131387718, 1024786888, -1133941049)
+ W(10, -1148432117, 1002011725, -1152589275, -1140632131)
+ W(11, -1144191965, 996433547, -1140699475, 1005736109);
WS(1059552336, -1136539026);
sum1 =
W(0, 990367896, 1041343484, -1096612504, 1033353841) + W(1, -1125599349, 1028944863, 1010957914, 1036710283)
+ W(2, -1107358947, 1029016441, -1132821402, 1024290996)
+ W(3, -1154541352, 1045269292, -1087221074, 1042554433)
+ W(4, -1154580200, 1023892422, 1017372383, -1112141659) + W(5, 1058232297, 1029783110, -1114120867, 1023410731)
+ W(6, 1026284586, -1116984235, 1051438086, -1087458720) + W(7, 1033522371, -1144215764, 1015461809, 1018013925)
+ W(8, 1047713030, -1095293300, 1032365167, -1144750420) + W(9, 1014364322, 1006339428, 1032067931, -1114380761)
+ W(10, 1004597796, 1001346936, 1021777309, 1032228520)
+ W(11, 1045851190, -1099415088, 1030006574, -1130073781);
sum2 = W(0, -1153914788, -1101809160, 1052877341, 1046574229)
+ W(1, -1095334336, 1023520281, -1126180245, -1115520194)
+ W(2, 1022007580, 1000424166, -1113807813, 1021218858) + W(3, 995844276, -1114410922, 1055965696, 1034680258)
+ W(4, -1109583292, 1008634443, -1141303142, 1033573989)
+ W(5, -1098900400, -1098051352, 1033797491, -1115608949) + W(6, 1026951758, 998799030, 1023481081, 1045079279)
+ W(7, 1032986287, 1032307290, 990856044, -1110191966) + W(8, 1023185808, -1106708743, 1025876178, -1128938562)
+ W(9, 1004850742, -1129252703, 1031073312, 984863273) + W(10, -1137844345, 1017335440, 1015235936, 1016759632)
+ W(11, -1104219784, -1103050031, 1038371038, 1020607644);
WS(-1080660584, -1085825159);
sum1 =
W(0, 1013708199, -1123370319, -1145658646, -1118786339) + W(1, 1028171867, -1144908790, 998525366, -1131079022)
+ W(2, -1111041043, 1035331132, 1017605134, -1131113128)
+ W(3, 1026247587, -1110742584, 1047524760, -1095527502)
+ W(4, 1042485668, -1130744068, 1009982783, -1113918027) + W(5, 1038280501, 1041941518, -1110999603, 992723116)
+ W(6, -1136883881, 1032009669, -1096311074, 1051037928)
+ W(7, -1106204846, 1025830203, -1128223794, 1025751155)
+ W(8, 1042402294, -1106649743, -1132447358, 1017749654) + W(9, 999596614, -1126831290, -1118872454, 1032615945)
+ W(10, 1002160934, -1127230527, -1126850910, 1033490448)
+ W(11, 1023947050, -1111971999, 971034337, 1018668086);
sum2 = W(0, 988660617, 1017543700, 1015794522, -1133704409) + W(1, 1003471274, -1140119133, -1145776834, 1002138986)
+ W(2, 1001599498, 1024621822, -1135257421, -1136500105)
+ W(3, -1133422913, 1031822055, 1041494739, -1102581932)
+ W(4, 970658596, -1163479081, -1126488793, 1032911160) + W(5, 1056510750, -1089051586, 1026713544, 1009057465)
+ W(6, 999416722, 1018658069, 1023998101, -1111744235) + W(7, 945757471, 1000517690, 999055930, 1007351961)
+ W(8, -1138508317, 1009295285, 998080468, -1137960905) + W(9, 987033481, -1162261577, 991201876, -1140892226)
+ W(10, -1156050276, -1186683976, -1179419172, 999395634)
+ W(11, -1141702058, -1147317506, 1007988669, -1146609818);
WS(1064784784, -1120346387);
sum1 = W(0, -1150678408, 1015721531, 1049255678, -1099108228)
+ W(1, -1149551256, -1136953142, 1000581420, -1110077251)
+ W(2, 1043607805, -1107416484, 1017163947, -1140022794)
+ W(3, 1006062348, -1107299655, 1059242626, -1089544734)
+ W(4, 1023526494, -1139533474, 1015088861, -1132691862)
+ W(5, -1123916922, -1130977491, 1022505321, 1012221798)
+ W(6, -1136518116, -1148196556, -1096371932, 1057929313)
+ W(7, -1104456865, 1014035238, -1126533711, 1013224070)
+ W(8, -1100407642, 1048500643, -1111675367, 1026165050)
+ W(9, 1012432222, -1124886999, -1132580564, 1035479729)
+ W(10, -1127245287, -1136458552, -1122704190, 1014270588)
+ W(11, -1102354822, 1044504531, 1007459698, 1017479699);
sum2 = W(0, -1140771860, 1031694512, -1104948969, -1115570202)
+ W(1, 1040745971, -1127298441, -1125513054, -1122230843) + W(2, 993388690, 1042093481, -1111499166, 995262946)
+ W(3, -1131667695, 979286214, 1026183534, 1042830623) + W(4, -1119680402, 1002124441, -1131288705, 1025077104)
+ W(5, -1111209187, -1112764939, 982469091, -1123012516) + W(6, 978159878, -1108853537, 1041617383, 1043422569)
+ W(7, -1120447085, -1129740789, 1012596136, -1102087836)
+ W(8, 1045410736, 1034771561, -1109907689, -1125016939)
+ W(9, 1011933560, -1117751010, 1030126174, 1014235016)
+ W(10, -1127258987, 1004566649, -1121534607, -1113389694)
+ W(11, 1044425994, 1025820984, -1115100280, -1119639931);
WS(-1088649680, 1067112300);
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
}
// Work-group tile of single-channel samples used by pass 1. Pass1 fills it with
// index x * 13 + y (x in 0..38, y in 0..12), i.e. 39 * 13 = 507 entries.
shared float inp[507];
// Pass selector; Pass1 compares it against LAST_PASS (defined outside this section).
#define CURRENT_PASS 1
// Collapses a colour sample to one value: dot product of its rgb with rgb2y
// (rgb2y is defined outside this section; presumably luma weights - confirm).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// Redirects imageStore calls to imageStoreOverride: the image argument is dropped
// and only the first component of val is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Pass-1 store hook (target of the imageStore macro): writes the scalar result
// into the intermediate resource `temp` at texel `pos`.
void imageStoreOverride(uint2 pos, float value) {
	temp[pos] = value;
}
// Samples INPUT at the normalised coordinate pos and reduces it with GET_SAMPLE.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Size and texel step of INPUT as reported by GetInputSize() / GetInputPt().
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());
// In pass 1 the "hooked" texture is INPUT itself, so the HOOKED_* names are plain aliases.
#define HOOKED_tex(pos) INPUT_tex(pos)
#define HOOKED_size INPUT_size
#define HOOKED_pt INPUT_pt
// Pass 1 entry point: for every invocation, stores the unmodified centre sample
// and one nnedi3-predicted sample into two vertically adjacent output texels
// (rows 2*y and 2*y + 1 of the invocation's global position).
//
//   blockStart - start of the block handled by this work group; only read in the
//                CURRENT_PASS == LAST_PASS branch below.
//   threadId   - invocation id inside the work group; only read in that same branch.
//
// The gl_* identifiers are provided outside this section.
void Pass1(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	// Top-left corner of this invocation's window inside the shared tile
	// (tile layout: index = x * 13 + y).
	int local_pos = int(gl_LocalInvocationID.x) * 13 + int(gl_LocalInvocationID.y);

	// Cooperative load: the work group's invocations stride over the 507 tile
	// entries; the tile starts 3 texels left of and 2 texels above group_base.
	for (int id = int(gl_LocalInvocationIndex); id < 507; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		// id is never negative, so the unsigned division is exact. The results are
		// converted back to int on purpose: adding a uint to group_base would make
		// the whole expression unsigned, and "- 3" / "- 2" would wrap to ~4.29e9
		// for the texels left of / above the first work group instead of producing
		// a small negative coordinate.
		int x = int((uint)id / 13), y = int((uint)id % 13);
		inp[id] =
			HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (3)) + 0.5, float(group_base.y + y - (2)) + 0.5)).x;
	}
	// Every tile entry must be written before any invocation reads its window.
	barrier();

	vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
	vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);

	// Gather the window: 8 tile columns (stride 13) x 6 consecutive entries each,
	// packed in that order into 12 vec4s.
	vec4 samples[12];
	// column 0
	samples[0][0] = inp[local_pos + 0];
	samples[0][1] = inp[local_pos + 1];
	samples[0][2] = inp[local_pos + 2];
	samples[0][3] = inp[local_pos + 3];
	samples[1][0] = inp[local_pos + 4];
	samples[1][1] = inp[local_pos + 5];
	// column 1
	samples[1][2] = inp[local_pos + 13];
	samples[1][3] = inp[local_pos + 14];
	samples[2][0] = inp[local_pos + 15];
	samples[2][1] = inp[local_pos + 16];
	samples[2][2] = inp[local_pos + 17];
	samples[2][3] = inp[local_pos + 18];
	// column 2
	samples[3][0] = inp[local_pos + 26];
	samples[3][1] = inp[local_pos + 27];
	samples[3][2] = inp[local_pos + 28];
	samples[3][3] = inp[local_pos + 29];
	samples[4][0] = inp[local_pos + 30];
	samples[4][1] = inp[local_pos + 31];
	// column 3
	samples[4][2] = inp[local_pos + 39];
	samples[4][3] = inp[local_pos + 40];
	samples[5][0] = inp[local_pos + 41];
	samples[5][1] = inp[local_pos + 42];
	samples[5][2] = inp[local_pos + 43];
	samples[5][3] = inp[local_pos + 44];
	// column 4
	samples[6][0] = inp[local_pos + 52];
	samples[6][1] = inp[local_pos + 53];
	samples[6][2] = inp[local_pos + 54];
	samples[6][3] = inp[local_pos + 55];
	samples[7][0] = inp[local_pos + 56];
	samples[7][1] = inp[local_pos + 57];
	// column 5
	samples[7][2] = inp[local_pos + 65];
	samples[7][3] = inp[local_pos + 66];
	samples[8][0] = inp[local_pos + 67];
	samples[8][1] = inp[local_pos + 68];
	samples[8][2] = inp[local_pos + 69];
	samples[8][3] = inp[local_pos + 70];
	// column 6
	samples[9][0] = inp[local_pos + 78];
	samples[9][1] = inp[local_pos + 79];
	samples[9][2] = inp[local_pos + 80];
	samples[9][3] = inp[local_pos + 81];
	samples[10][0] = inp[local_pos + 82];
	samples[10][1] = inp[local_pos + 83];
	// column 7
	samples[10][2] = inp[local_pos + 91];
	samples[10][3] = inp[local_pos + 92];
	samples[11][0] = inp[local_pos + 93];
	samples[11][1] = inp[local_pos + 94];
	samples[11][2] = inp[local_pos + 95];
	samples[11][3] = inp[local_pos + 96];

	// Predicted in-between sample.
	ret[0] = nnedi3(samples);
	// Pass-through sample: offset 41 = 3 * 13 + 2, i.e. the texel at this
	// invocation's own position (the tile starts at -3 / -2).
	ret0[0] = inp[local_pos + 41];

#if CURRENT_PASS == LAST_PASS
	// Only compiled when this pass is the last one: skip invocations whose
	// destination lies outside the output.
	uint2 destPos = blockStart + threadId.xy * 2;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// imageStore is a macro forwarding to imageStoreOverride(pos, val.x).
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2), ret0);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2) + ivec2(0, 1), ret);
}
//!PASS 2
//!DESC NNEDI3 (double_x, nns16, win8x6)
//!IN INPUT, temp
//!OUT OUTPUT
//!BLOCK_SIZE 64, 8
//!NUM_THREADS 32, 8
#pragma optionNV(inline none)
// Pass-2 predictor: returns one interpolated value for a 48-sample neighbourhood.
//   samples - 12 vec4s (48 scalars) gathered by the caller.
// Steps, as written below:
//   1. mstd0 = mean of the 48 samples; mstd1 = their variance, which is then
//      multiplied by mstd2 = inversesqrt(variance), giving the standard deviation.
//      NOTE(review): mix(0.0, a, cond) presumably yields 0 when the variance is
//      below 1.192092896e-7, so flat neighbourhoods get mstd2 = 0 - confirm
//      against the mix() definition used by this file.
//   2. Sixteen times: two 48-tap dot products (sum1, sum2) followed by WS, which
//      accumulates into wsum / vsum.
//   3. Result = clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0).
// The integer literals are float weights stored as bit patterns (see T below).
float nnedi3(vec4 samples[12]) {
float sum = 0.0, sumsq = 0.0;
// Sum and sum of squares over all 48 scalars.
[unroll] for (int i = 0; i < 12; i++) {
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
sumsq += dot(samples[i], samples[i]);
}
// Mean, then variance as E[x^2] - E[x]^2.
float mstd0 = sum / 48.0;
float mstd1 = sumsq / 48.0 - mstd0 * mstd0;
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
// variance * (1 / sqrt(variance)): mstd1 now holds the standard deviation (or 0 when mstd2 is 0).
mstd1 *= mstd2;
float vsum = 0.0, wsum = 0.0, sum1, sum2;
// T: turns an integer bit pattern into a float via intBitsToFloat.
#define T(x) intBitsToFloat(x)
// W: dot product of samples[i] with four bit-pattern weights.
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
// WS: finishes one weight set. sum1 is scaled by mstd2, offset by w0 and
// exponentiated; sum2 is scaled by mstd2, offset by w1 and squashed with
// s / (1 + |s|). wsum accumulates sum1, vsum accumulates sum1 * squashed sum2.
#define WS(w0, w1) \
sum1 = exp(sum1 * mstd2 + T(w0)); \
sum2 = sum2 * mstd2 + T(w1); \
wsum += sum1; \
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
// --- weight set 1 of 16 ---
sum1 = W(0, -1126897990, -1130469888, -1113607518, -1116173177)
+ W(1, 1015526727, -1133977224, 990390561, -1122292152) + W(2, 1027745880, -1121396864, 1041026790, 1042195560)
+ W(3, 1018714920, 1026239260, -1131068140, 1015308851) + W(4, 1024250604, 1039079928, 1022159130, -1098313415)
+ W(5, 1042189511, -1106606352, 1013770942, -1122039043) + W(6, 1024642508, -1107295041, 1044630722, 999141354)
+ W(7, -1106681307, 1038936227, -1122507740, 1031978820)
+ W(8, -1121959908, -1147395201, -1107136294, 1019497054)
+ W(9, 1035880216, -1124106064, -1136584888, -1116330759)
+ W(10, -1149906049, -1126556538, 1005058137, 1007702352)
+ W(11, -1121374916, 1025050132, -1135809122, 1018900008);
sum2 = W(0, 1017133506, 1023321012, -1119553894, -1112177411)
+ W(1, -1165256880, -1133574805, 1020043526, -1135761830)
+ W(2, 1011515348, 1029416248, 1057587887, 1071604647) + W(3, 1051025857, 1035052104, 1010374724, 1018728192)
+ W(4, -1139818306, -1115999672, -1090489276, -1079392139)
+ W(5, -1098617840, -1139515542, -1121583660, 1024878156)
+ W(6, -1123730089, 1020129658, -1109933138, -1097028615)
+ W(7, -1105405946, -1135392452, -1142174380, 1002597928) + W(8, 996184056, 1015618084, 1016266760, 1028448562)
+ W(9, -1155286464, -1138601606, 997185888, -1131188096)
+ W(10, -1138856554, 1007066512, -1145378916, 1008681896)
+ W(11, 1000343320, 991053648, -1155288808, -1132781834);
WS(1018288640, 1027735986);
// --- weight set 2 of 16 ---
sum1 =
W(0, 1012158232, 1006778572, 1025063871, -1156175041) + W(1, 1010747802, -1140348884, 1020162890, -1172026051)
+ W(2, -1178449286, -1114624234, -1104570115, 1028919475)
+ W(3, 1034856692, -1127457566, -1122825993, -1115773453)
+ W(4, 1044498160, 1032943202, 1059928494, -1097612337) + W(5, -1085331503, 1031833306, -1119592595, 1038136595)
+ W(6, -1128542910, 1027108853, -1088743921, -1106124541) + W(7, 1059914122, 1032056909, 1033999672, 1027508251)
+ W(8, -1115962871, 1017365062, 1032615126, 1026836706)
+ W(9, -1114177498, -1122073627, 1022377282, -1129465364)
+ W(10, 1002517720, 964628492, -1134936888, -1146238776) + W(11, 1020458158, 1016604174, 998219705, 1023799671);
sum2 = W(0, -1126840972, -1145399745, -1139273136, 991527106)
+ W(1, -1135317632, -1139461992, -1129486378, 995542402)
+ W(2, -1130460798, 985194115, -1112997847, -1120456092) + W(3, 1001121889, 1017866680, 1006765920, 1000306721)
+ W(4, 1019075916, -1120812206, -1139625904, 1043975251)
+ W(5, -1150833602, -1128878392, -1133504840, -1142139489)
+ W(6, 1017322604, -1129997452, 1042717692, 1051048254)
+ W(7, -1121880440, -1112673669, -1126929736, -1114488494)
+ W(8, -1131054760, 1006903064, -1114175000, -1113881740) + W(9, 978663174, 1026044394, 1014584312, 1007041936)
+ W(10, -1131047996, -1143360737, -1130986946, 1007107280)
+ W(11, -1143215153, -1125685806, -1144361281, -1134951296);
WS(1042433344, -1111851638);
// --- weight set 3 of 16 ---
sum1 = W(0, -1128612156, -1139940268, -1122042583, 984858240) + W(1, 999630836, -1146630760, -1126512698, 996431920)
+ W(2, -1112658226, -1123380939, -1128727592, -1107767030)
+ W(3, -1114896432, -1115281716, -1129996802, -1123547845)
+ W(4, -1119638967, -1126908022, -1097703246, 1052387104)
+ W(5, 1054789077, 1030603948, 1028419819, 1026334318) + W(6, 1043958886, 1033805831, 1057665642, 1046318672)
+ W(7, -1095395475, -1117224401, -1123618471, -1113258842)
+ W(8, -1120465263, -1115346894, -1104545545, -1108167869)
+ W(9, 1029397739, -1163176544, -1117439993, -1134051464)
+ W(10, -1128976934, -1142120768, 1005565040, -1148354296)
+ W(11, -1133849404, -1117808895, 1013349902, -1120421311);
sum2 =
W(0, 1022431497, -1132240188, 1024165374, -1114822837)
+ W(1, -1120729932, -1124855948, -1126342960, -1128289576)
+ W(2, -1109389142, -1119299282, -1107432916, 1044244351)
+ W(3, 1031006195, 983982854, -1158996358, -1121099750) + W(4, 1004613154, -1139248009, 1041447926, -1108646182)
+ W(5, 1047688354, 1009435309, -1122846542, 1036127241) + W(6, 1028727631, -1129989652, 1047487962, -1100679909)
+ W(7, -1126089152, 1033956847, -1123334894, 1039673953) + W(8, 1029503922, -1140046689, 1017218352, 1040665470)
+ W(9, -1120804126, -1107003694, -1140927562, -1102421772)
+ W(10, -1132574761, -1114039002, -1135952741, -1123756570)
+ W(11, -1148002498, 1028342876, -1117057946, 1026336008);
WS(1015433728, 1058400049);
// --- weight set 4 of 16 ---
sum1 =
W(0, -1139873791, -1133909491, 1023506921, -1230944644) + W(1, -1118840528, 1028378783, -1121050287, 1033465034)
+ W(2, 1031161269, 1032977294, -1116372870, 1035249566)
+ W(3, -1120831281, -1114963068, 1032892305, -1105610222)
+ W(4, -1113693508, -1112917766, -1116140698, -1103376612)
+ W(5, 1044830734, -1141442286, 1023234585, 1034039600) + W(6, 1033801204, -1131731326, 1045725159, -1102794347)
+ W(7, -1116748777, 1032646513, -1112562780, 1030129285) + W(8, -1119172737, 989007258, -1122523445, 1044071755)
+ W(9, 1030473357, 1018738506, 1021910870, -1122899972)
+ W(10, -1143910182, 1019358132, 1008313039, -1115540344)
+ W(11, -1126204226, -1118552369, 1016154651, -1124368226);
sum2 = W(0, -1138428449, 992976916, -1142924106, 976851025) + W(1, 989093448, 997652548, -1153131756, -1134977059)
+ W(2, -1158711528, 1013039401, -1128734961, 1036130613)
+ W(3, 1010050489, -1137359275, 990210276, -1138876101)
+ W(4, -1124467432, -1130455464, -1113146735, -1097860430)
+ W(5, -1108810723, -1122996798, -1140348735, -1127238416)
+ W(6, -1140697417, -1123518198, -1099387353, -1077268149)
+ W(7, -1091225653, 1032494444, -1115493835, 1018469149) + W(8, 1030243467, 1033499227, 1051222006, 1072898808)
+ W(9, 1056060393, 1025590581, 1025171621, 1026307569) + W(10, 1012442941, 975746961, -1122081826, -1117904739)
+ W(11, -1131990027, 951236744, 1006284898, -1146863422);
WS(-1143089152, 1030017260);
// --- weight set 5 of 16 ---
sum1 = W(0, 1012276081, 1001605962, 1024406997, -1155861797)
+ W(1, -1155627981, -1122534699, 1018348791, -1120990241)
+ W(2, -1116644609, -1127223379, -1109637089, -1115433381)
+ W(3, 1036571679, 1028189701, -1126280255, 1036379833) + W(4, 1019444907, -1119160665, 1048989101, 1044433671)
+ W(5, -1098184025, 1039597237, -1117935161, -1136463217)
+ W(6, -1124688427, 1035777366, -1098625404, 1006101820)
+ W(7, 1048780603, -1104960796, 1029641477, -1111599465)
+ W(8, 1029853709, -1136557285, 1038057505, -1111190908)
+ W(9, -1112291813, -1130076067, 1012573277, -1154886405)
+ W(10, -1130860131, -1130309965, -1130883561, 1009046005)
+ W(11, 1025361773, 1018788475, -1125993892, 1020397819);
sum2 = W(0, -1153319600, -1147284080, 1008968960, 995456320)
+ W(1, -1140178660, -1146954776, -1142030464, -1170856127)
+ W(2, 1008405084, 985822624, -1142311064, 1022276922) + W(3, -1140411728, 1005012008, 1003782736, 1006946188)
+ W(4, -1118973116, 1010505984, -1101248908, -1089187936) + W(5, -1109859050, 1017518401, 982409184, 995727232)
+ W(6, -1140784820, -1129308604, 1037448945, 1057794596) + W(7, 1029773785, 1015531414, 974134143, 960534268)
+ W(8, 1012585128, 1021293048, 1024969278, 1033366347) + W(9, 1024400778, 1007802556, 1003482728, 1009923956)
+ W(10, 1010769460, 1001814848, -1160749952, -1123619202)
+ W(11, -1136545168, 1000322872, -1152799248, 985284128);
WS(1064472528, -1121594920);
// --- weight set 6 of 16 ---
sum1 = W(0, -1142654991, -1143599223, 1014568428, 1010500896)
+ W(1, -1124387333, 1020884834, -1131205634, 1026841191)
+ W(2, 1027230343, 1032290711, -1136037408, 1002050167)
+ W(3, -1122938499, -1120250507, 1025589157, -1110863224)
+ W(4, -1112807213, -1113392623, -1115590690, -1113734161)
+ W(5, 1038834309, 1029912912, 1019867389, 1031947569) + W(6, 1027061019, 1016010466, 1034098395, -1112872467)
+ W(7, -1130883382, 1015162858, -1123484555, 1019435182) + W(8, -1128825126, 991342574, 1008695068, 1027642302)
+ W(9, 1013984188, 1015817710, 1015459258, -1129521612)
+ W(10, -1164359388, 1014490160, -1148094031, -1127829894)
+ W(11, -1138058188, -1124941766, 1008886302, -1130075526);
sum2 =
W(0, 1003807591, -1147429191, 1002635095, -1132089351) + W(1, -1147602519, -1151072125, 964968041, 1011860423)
+ W(2, -1154115373, -1139843175, 1018649088, 1016729308)
+ W(3, 1024344696, -1155997437, 1001714367, -1141691791) + W(4, 1000124719, 1001833687, 1008095031, -1105992985)
+ W(5, 1014141127, -1132427785, -1141957575, -1139390003) + W(6, 1017182228, 1024488826, 1040714709, 1063780536)
+ W(7, 1047200342, 1020609216, 1023684454, 1017456200)
+ W(8, -1126980607, -1116401990, -1105844805, -1085442794)
+ W(9, -1101306502, -1122913939, -1125194898, -1128761080)
+ W(10, -1130234859, 987658746, 1013729967, 1024604622)
+ W(11, 995366957, -1147894927, -1146690231, -1146063807);
WS(1061878800, -1131153991);
// --- weight set 7 of 16 ---
sum1 =
W(0, -1123872727, 1018625288, -1127640224, 1026350045) + W(1, 1013916191, 1018183052, -1145362866, 1016048056)
+ W(2, 1015115512, 1031144036, 1036357847, -1108974562) + W(3, -1107191102, 1006433282, 1014427177, 1026198990)
+ W(4, -1099302516, -1102371221, -1085394744, 1059569738) + W(5, 1050617832, 1032504563, 1031877738, 1033421596)
+ W(6, 1041224340, 1009910425, 1052022073, 1058525661)
+ W(7, -1088226291, -1097316565, -1109508096, -1098228398)
+ W(8, -1144166978, 1014687697, -1115552350, -1125187302) + W(9, 1037730450, 1040234099, 1015825508, 1035235966)
+ W(10, -1171049230, 1022902338, -1132534141, 1016189168)
+ W(11, -1123531112, -1127405808, 1018548825, -1137247201);
sum2 =
W(0, -1131301730, 973798558, -1127780866, 1013478572) + W(1, -1160424319, -1135840779, -1164912671, -1138738786)
+ W(2, 1031269327, -1131640108, 1013454096, -1109509101)
+ W(3, -1117315078, -1131160392, -1145619912, -1127332243)
+ W(4, -1127010401, 1028981651, -1149526184, 1051779317) + W(5, 1028380081, -1137527992, 998238336, -1148504424)
+ W(6, -1109842974, -1125259759, -1113692773, 1047088883)
+ W(7, -1134194124, 1028175261, 1018886164, 1027237057)
+ W(8, -1181736700, -1167651134, -1123287814, -1109788940)
+ W(9, -1115287133, -1121515979, -1125209194, -1142455024)
+ W(10, -1180777340, -1160957999, 993986728, 1020962386)
+ W(11, -1136947718, -1138138790, -1152989064, -1123011340);
WS(-1146021888, 1053974589);
// --- weight set 8 of 16 ---
sum1 =
W(0, 1029642476, 1013890275, -1176939092, 1022266947) + W(1, -1146466657, 1004183253, -1160650069, -1127783457)
+ W(2, -1119368753, -1134074211, 1007708103, 1017736689) + W(3, 1027345005, 1032510570, 1019378973, 1031489314)
+ W(4, 1042969521, 1042359026, 1045769551, -1101301107)
+ W(5, -1094644679, -1091538585, -1107179580, -1095508207)
+ W(6, -1095098901, -1107285127, -1096985546, 1034918881) + W(7, 1050538529, 1051699648, 1036824506, 1048776768)
+ W(8, 1046685039, 1031018217, 1036262392, 1003810877) + W(9, -1120828825, 1011534979, -1133351451, 1035618600)
+ W(10, 984849429, -1135393367, -1139413615, 1024875117)
+ W(11, -1172526890, 1017671961, -1160823333, 1006585957);
sum2 = W(0, 1031363252, 1030774484, 1015165558, 1007437656) + W(1, 1020087968, -1158035650, -1137051446, 1018262350)
+ W(2, -1091101506, 1001500224, -1110787951, 965388167) + W(3, 1003188992, 1041338676, -1134903850, 1033269727)
+ W(4, 1048232756, -1110436898, 1016237906, 1014973676)
+ W(5, -1123006886, -1105090874, -1123217223, -1104724635)
+ W(6, 1057852755, -1132290932, 1043794074, 1047525730)
+ W(7, 1011818344, -1129296549, 1034851396, -1106365430)
+ W(8, -1095952784, -1131305343, -1113356328, -1152923833)
+ W(9, -1111245491, -1131940021, -1117639196, 1024945328)
+ W(10, 1016290300, -1126601761, 1003743696, 1022650220)
+ W(11, 1021501454, 1017537464, -1133259176, 1019937714);
WS(-1077057896, -1083600334);
// --- weight set 9 of 16 ---
sum1 = W(0, 1017420011, 1014201033, 1017846781, 999765850) + W(1, -1162024122, 985808522, 1019153993, 1011766057)
+ W(2, 1011471785, 1019976613, -1138042285, 1040273597) + W(3, 1016487629, 1027730222, 1010855969, 1025127228)
+ W(4, 1029223422, -1126437509, 1049638570, -1090770241) + W(5, 1029091694, 1037672698, 1027546578, 1025680213)
+ W(6, -1116040414, 1015478313, -1103217262, -1087230893)
+ W(7, 1046437488, 1024768280, 1028909230, 1017109109) + W(8, 1017123181, 1024110818, 1023111893, 1030676769)
+ W(9, -1112046985, -1120839802, 1023955584, -1128064723)
+ W(10, 1016511669, -1167731667, 1009386661, 1023090125)
+ W(11, 1020460717, 1025489318, -1134545259, 1027741830);
sum2 =
W(0, 1023774756, -1152708847, 1005727232, -1128030280) + W(1, 1004577664, -1116453887, 1020705336, -1127775332)
+ W(2, -1107003878, 1013240776, -1108761818, 1032847770)
+ W(3, 1024878510, -1129071264, -1124253692, -1114566712)
+ W(4, 1020767940, -1108605887, 1050907301, 1058054639) + W(5, -1106188814, 1040942692, -1115446820, 1042743894)
+ W(6, -1118294055, -1128830540, -1097736561, 1008347200)
+ W(7, 1049418167, -1105809360, 1014050712, -1132221182)
+ W(8, -1113997093, -1139588328, 1032528025, 1039669350)
+ W(9, -1108856812, -1104688291, 1018266740, -1103534695)
+ W(10, 1021408408, -1119578529, -1135972104, -1131826954)
+ W(11, 999382680, 1019392776, -1117167612, 1022204104);
WS(1034686080, -1080904524);
// --- weight set 10 of 16 ---
sum1 = W(0, -1139332721, -1156631187, 1023562901, 1011454089) + W(1, 1009225011, 1012327853, 1010687981, 1010798725)
+ W(2, 1025190657, 1024127465, 1006799241, -1145135178) + W(3, -1129417722, 1017495114, 1027561839, 1010795083)
+ W(4, -1143163562, 1040892278, -1104914606, -1089193318)
+ W(5, 1043909393, -1119873834, -1136185891, -1118482716)
+ W(6, 1041601261, 1028605547, 1052908885, -1091833281)
+ W(7, -1103073573, 1025246703, -1124345098, 1032670633)
+ W(8, 1024768205, -1129308018, -1117860929, 1036300940) + W(9, 1040987970, 1033652713, 1024209623, 1027144528)
+ W(10, -1137907141, 1012089369, 1019594656, -1143330794)
+ W(11, 992909011, -1123933213, 1018355139, -1123266333);
sum2 = W(0, 998154484, -1136847690, -1120010648, -1145824866) + W(1, 1015307650, 994166396, 1000606426, 1010815409)
+ W(2, -1124228589, 1028193069, 1043298286, -1115567961)
+ W(3, -1106126812, -1111174068, -1128437454, -1110538383)
+ W(4, -1132108902, -1123281782, -1097765474, 1059221182)
+ W(5, 1048600788, -1130476352, 1026255089, -1118584150)
+ W(6, -1115676434, -1123302060, 1027211577, 1034703777)
+ W(7, -1099334080, 1015056080, -1137618020, 1028199647)
+ W(8, -1123985162, -1132306691, -1114822183, -1131429597)
+ W(9, 1029215805, 1023836215, -1127893362, 1025007180)
+ W(10, 1004957466, 1011392625, -1127542967, 1022587458)
+ W(11, -1127163397, -1122559367, -1171736302, -1124423270);
WS(-1097173920, -1100403112);
// --- weight set 11 of 16 ---
sum1 = W(0, -1133792968, -1124511038, -1133667884, -1126404688)
+ W(1, -1129657589, -1123548289, -1134226372, -1120375479)
+ W(2, -1126599342, -1134319356, -1113753548, -1113718896)
+ W(3, 1034489098, 1017816566, -1122628437, 1029275393) + W(4, 1026626987, -1112479512, 1051379210, 1058852431)
+ W(5, -1097104011, 1007326848, -1112737379, -1116837058)
+ W(6, -1109988694, -1122054529, -1097159959, 1058630415)
+ W(7, 1049904553, -1104990865, 983139170, -1110311540)
+ W(8, -1128510918, -1138055228, 1031366423, -1108453759)
+ W(9, -1111244112, -1129654222, -1143321192, -1132471000)
+ W(10, -1124691470, -1131431128, -1128464692, -1122909907)
+ W(11, 1006087192, -1138955724, -1123473736, -1149064600);
sum2 = W(0, -1133003813, -1170659932, 1006656308, 1031055883)
+ W(1, -1122785076, -1138148825, -1134424500, -1131089901)
+ W(2, -1145103116, 1024883426, -1122208183, -1101651786)
+ W(3, -1107240567, -1137951645, -1131665157, -1116779622)
+ W(4, -1105221269, -1117429423, -1098340061, 1055658740)
+ W(5, 1035604404, -1131811521, -1130287800, -1123356625)
+ W(6, 1033080040, 1028547885, 1042272545, 1058321046) + W(7, -1112738821, 1003752088, 1015669581, 1033205575)
+ W(8, 1016862101, -1128891234, -1121562483, -1100689547)
+ W(9, -1115182870, 1026865631, -1129373191, -1134576021)
+ W(10, -1129731365, -1147341896, -1121650606, 1031708925)
+ W(11, -1123396988, -1133076983, -1131162259, -1127933595);
WS(1049422752, 1064394145);
// --- weight set 12 of 16 ---
sum1 = W(0, 1016583527, 997641734, -1129211865, -1131677043) + W(1, 1021172552, 1021691141, -1133007562, 1026669144)
+ W(2, -1106085006, -1123841888, 1041111742, 1032245856)
+ W(3, -1110939130, -1105269375, -1137128282, -1106745812)
+ W(4, 995307718, 1031369872, -1088517333, -1098988005) + W(5, 1058612208, 1030057089, 1036830720, 1034516890)
+ W(6, 1042273115, 1021597381, 1058826428, -1105331685)
+ W(7, -1090507155, 1043687978, -1120823228, 1038691348)
+ W(8, -1113049442, -1122854832, -1113933244, 1032610296)
+ W(9, 1037338632, -1122591528, -1116248270, -1117945591)
+ W(10, 1025810280, 1006187755, 1019889767, -1131685097)
+ W(11, -1155049030, -1134096210, 1025994697, -1126546729);
sum2 =
W(0, 1015668141, -1124041461, 1005775275, 1015274173) + W(1, -1138139200, 992639399, -1132053353, -1129319398)
+ W(2, -1138201662, -1140877219, 1027346708, 1016303395)
+ W(3, -1106503093, -1117618841, -1115775134, -1122071744)
+ W(4, -1111996311, -1116450062, -1125910350, -1108948194)
+ W(5, -1104963655, 1031763952, 1015724405, 1034411590)
+ W(6, -1127284815, -1123578506, -1106280325, 1052974100) + W(7, 1053021197, 957951850, 1016609913, -1140595900)
+ W(8, -1125087482, 1024732308, 1034158307, 1032925063) + W(9, -1107449032, 994113735, -1132927280, -1140186580)
+ W(10, 1020174885, -1139064970, -1133423524, -1161498797)
+ W(11, -1134898868, 1013272790, -1132485274, -1164791981);
WS(-1101497152, -1084603877);
// --- weight set 13 of 16 ---
sum1 =
W(0, -1136425045, -1140218249, -1121565262, -1122160667)
+ W(1, 1006883159, -1132497425, 1004502690, -1128604789) + W(2, 1016522037, -1157845066, 1035402982, 1027884002)
+ W(3, 1025282390, 1022271101, -1138792353, 995143101) + W(4, 967194407, 1029505522, 1022903246, -1107598171)
+ W(5, 1025270942, -1120772739, -1154700189, -1128284203)
+ W(6, 1019848413, -1119357636, 1027088345, 1024422013) + W(7, -1117602990, 1030415880, -1171556244, 1025955498)
+ W(8, -1129523533, -1140249161, -1121932442, -1127296803)
+ W(9, 1030372258, -1129818261, -1138666305, -1121511513)
+ W(10, -1142614610, -1135395837, -1148904362, 1002411186)
+ W(11, -1130529549, 1018540973, -1138856043, 1011955033);
sum2 =
W(0, -1126668299, -1137964827, 1015963121, 1030757650) + W(1, -1137547503, 997720155, -1132279825, -1152589275)
+ W(2, -1131366283, -1117612139, -1097765254, -1083955387)
+ W(3, -1103492737, -1118797430, -1131387718, -1140632131)
+ W(4, 1024971228, 1034620123, 1049249869, 1064229708) + W(5, 1047078464, 1017921725, 1024786888, -1144191965)
+ W(6, 1000957181, -1119890411, 1026062254, -1107214224) + W(7, -1122275403, 1016072153, -1133941049, 996433547)
+ W(8, -1151515419, -1145021381, 1001872029, 1026637176)
+ W(9, 1027173860, -1135832789, -1148432117, -1140699475)
+ W(10, 1005199725, -1136862175, 1007955643, -1125717658)
+ W(11, -1169614250, 923654805, 1002011725, 1005736109);
WS(1059552336, -1136539026);
// --- weight set 14 of 16 ---
sum1 = W(0, 990367896, 1010957914, -1154541352, 1017372383) + W(1, 1026284586, 1015461809, 1014364322, 1021777309)
+ W(2, 1041343484, 1036710283, 1045269292, -1112141659) + W(3, -1116984235, 1018013925, 1006339428, 1032228520)
+ W(4, -1096612504, -1107358947, -1087221074, 1058232297)
+ W(5, 1051438086, 1047713030, 1032067931, 1045851190) + W(6, 1033353841, 1029016441, 1042554433, 1029783110)
+ W(7, -1087458720, -1095293300, -1114380761, -1099415088)
+ W(8, -1125599349, -1132821402, -1154580200, -1114120867)
+ W(9, 1033522371, 1032365167, 1004597796, 1030006574) + W(10, 1028944863, 1024290996, 1023892422, 1023410731)
+ W(11, -1144215764, -1144750420, 1001346936, -1130073781);
sum2 = W(0, -1153914788, -1126180245, 995844276, -1141303142) + W(1, 1026951758, 990856044, 1004850742, 1015235936)
+ W(2, -1101809160, -1115520194, -1114410922, 1033573989)
+ W(3, 998799030, -1110191966, -1129252703, 1016759632) + W(4, 1052877341, 1022007580, 1055965696, -1098900400)
+ W(5, 1023481081, 1023185808, 1031073312, -1104219784) + W(6, 1046574229, 1000424166, 1034680258, -1098051352)
+ W(7, 1045079279, -1106708743, 984863273, -1103050031)
+ W(8, -1095334336, -1113807813, -1109583292, 1033797491)
+ W(9, 1032986287, 1025876178, -1137844345, 1038371038)
+ W(10, 1023520281, 1021218858, 1008634443, -1115608949)
+ W(11, 1032307290, -1128938562, 1017335440, 1020607644);
WS(-1080660584, -1085825159);
// --- weight set 15 of 16 ---
sum1 = W(0, 1013708199, 998525366, 1026247587, 1009982783) + W(1, -1136883881, -1128223794, 999596614, -1126850910)
+ W(2, -1123370319, -1131079022, -1110742584, -1113918027)
+ W(3, 1032009669, 1025751155, -1126831290, 1033490448)
+ W(4, -1145658646, -1111041043, 1047524760, 1038280501)
+ W(5, -1096311074, 1042402294, -1118872454, 1023947050)
+ W(6, -1118786339, 1035331132, -1095527502, 1041941518)
+ W(7, 1051037928, -1106649743, 1032615945, -1111971999)
+ W(8, 1028171867, 1017605134, 1042485668, -1110999603) + W(9, -1106204846, -1132447358, 1002160934, 971034337)
+ W(10, -1144908790, -1131113128, -1130744068, 992723116)
+ W(11, 1025830203, 1017749654, -1127230527, 1018668086);
sum2 = W(0, 988660617, -1145776834, -1133422913, -1126488793) + W(1, 999416722, 999055930, 987033481, -1179419172)
+ W(2, 1017543700, 1002138986, 1031822055, 1032911160) + W(3, 1018658069, 1007351961, -1162261577, 999395634)
+ W(4, 1015794522, 1001599498, 1041494739, 1056510750) + W(5, 1023998101, -1138508317, 991201876, -1141702058)
+ W(6, -1133704409, 1024621822, -1102581932, -1089051586)
+ W(7, -1111744235, 1009295285, -1140892226, -1147317506)
+ W(8, 1003471274, -1135257421, 970658596, 1026713544) + W(9, 945757471, 998080468, -1156050276, 1007988669)
+ W(10, -1140119133, -1136500105, -1163479081, 1009057465)
+ W(11, 1000517690, -1137960905, -1186683976, -1146609818);
WS(1064784784, -1120346387);
// --- weight set 16 of 16 ---
sum1 =
W(0, -1150678408, 1000581420, 1006062348, 1015088861) + W(1, -1136518116, -1126533711, 1012432222, -1122704190)
+ W(2, 1015721531, -1110077251, -1107299655, -1132691862)
+ W(3, -1148196556, 1013224070, -1124886999, 1014270588) + W(4, 1049255678, 1043607805, 1059242626, -1123916922)
+ W(5, -1096371932, -1100407642, -1132580564, -1102354822)
+ W(6, -1099108228, -1107416484, -1089544734, -1130977491)
+ W(7, 1057929313, 1048500643, 1035479729, 1044504531) + W(8, -1149551256, 1017163947, 1023526494, 1022505321)
+ W(9, -1104456865, -1111675367, -1127245287, 1007459698)
+ W(10, -1136953142, -1140022794, -1139533474, 1012221798)
+ W(11, 1014035238, 1026165050, -1136458552, 1017479699);
sum2 = W(0, -1140771860, -1125513054, -1131667695, -1131288705)
+ W(1, 978159878, 1012596136, 1011933560, -1121534607) + W(2, 1031694512, -1122230843, 979286214, 1025077104)
+ W(3, -1108853537, -1102087836, -1117751010, -1113389694)
+ W(4, -1104948969, 993388690, 1026183534, -1111209187) + W(5, 1041617383, 1045410736, 1030126174, 1044425994)
+ W(6, -1115570202, 1042093481, 1042830623, -1112764939) + W(7, 1043422569, 1034771561, 1014235016, 1025820984)
+ W(8, 1040745971, -1111499166, -1119680402, 982469091)
+ W(9, -1120447085, -1109907689, -1127258987, -1115100280)
+ W(10, -1127298441, 995262946, 1002124441, -1123012516)
+ W(11, -1129740789, -1125016939, 1004566649, -1119639931);
WS(-1088649680, 1067112300);
// Mean plus the accumulated response, scaled by 5 and by the standard deviation, limited to [0, 1].
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
}
// Work-group tile of single-channel samples used by pass 2. Pass2 fills it with
// index x * 15 + y (x in 0..36, y in 0..14), i.e. 37 * 15 = 555 entries.
shared float inp[555];
// Pass selector for this section of the file.
#define CURRENT_PASS 2
// Collapses a colour sample to one value: dot product of its rgb with rgb2y
// (rgb2y is defined outside this section; presumably luma weights - confirm).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
// Redirects imageStore calls to imageStoreOverride: the image argument is dropped
// and only the first component of val is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
// Writes one pixel of OUTPUT at `pos`.
// `value` supplies the first (luma) component; the two chroma components are derived
// from INPUT, sampled with sam_INPUT_LINEAR at the location HOOKED_map(pos) yields.
// The combined triple is converted to RGB with yuv2rgb and stored with alpha 1.
void imageStoreOverride(uint2 pos, float value) {
	// Chroma is taken from the source image, not from the value being stored.
	const float3 srcRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	const float2 chroma = mul(rgb2uv, srcRgb);
	const float3 yuv = float3(value, chroma);
	OUTPUT[pos] = float4(mul(yuv2rgb, yuv), 1.0);
}
// Samples INPUT at `pos` and reduces the result to a single value via GET_SAMPLE.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
// Size of the effect input and the matching per-texel step, as reported by the host helpers.
static const float2 INPUT_size = float2(GetInputSize());
static const float2 INPUT_pt = float2(GetInputPt());
// Samples the intermediate texture `temp` at `pos` and keeps only its x component.
#define temp_tex(pos) (float(texture(temp, pos).x))
// `temp` is declared here as input width x 1 and input height x 2.
static const float2 temp_size = float2(GetInputSize().x * 1, GetInputSize().y * 2);
// Reciprocal of temp_size: the size of one `temp` texel in normalized coordinates.
static const float2 temp_pt = float2(1.0 / (temp_size.x), 1.0 / (temp_size.y));
// For this pass the "hooked" texture of the original GLSL is `temp`.
#define HOOKED_tex(pos) temp_tex(pos)
#define HOOKED_size temp_size
#define HOOKED_pt temp_pt
// Second NNEDI3 pass: doubles the hooked texture horizontally.
// For every source texel it stores two output pixels: the untouched source value at
// x * 2 and the nnedi3() prediction at x * 2 + 1.
//   blockStart - origin of the block handled by this thread group
//   threadId   - thread index inside the group
// The gl_* identifiers are presumably macros over blockStart/threadId supplied by the
// GLSL compatibility header - confirm.
void Pass2(uint2 blockStart, uint3 threadId) {
ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
// Offset of this thread's window inside inp: x is the major index (stride 15), y the minor.
int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);
// Cooperative load: the group's threads stride over the 555 entries together.
for (int id = int(gl_LocalInvocationIndex); id < 555; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
uint x = (uint)id / 15, y = (uint)id % 15;
// The tile starts 2 texels left of and 3 texels above the group's base; +0.5 targets texel centres.
inp[id] =
HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (2)) + 0.5, float(group_base.y + y - (3)) + 0.5)).x;
}
// All loads must be finished before any thread reads its window from inp.
barrier();
vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);
// Gather the sample window: 6 steps along x (15 entries apart in inp), each contributing
// 8 consecutive entries along y, packed as two vec4 per step.
vec4 samples[12];
samples[0][0] = inp[local_pos + 0];
samples[0][1] = inp[local_pos + 1];
samples[0][2] = inp[local_pos + 2];
samples[0][3] = inp[local_pos + 3];
samples[1][0] = inp[local_pos + 4];
samples[1][1] = inp[local_pos + 5];
samples[1][2] = inp[local_pos + 6];
samples[1][3] = inp[local_pos + 7];
samples[2][0] = inp[local_pos + 15];
samples[2][1] = inp[local_pos + 16];
samples[2][2] = inp[local_pos + 17];
samples[2][3] = inp[local_pos + 18];
samples[3][0] = inp[local_pos + 19];
samples[3][1] = inp[local_pos + 20];
samples[3][2] = inp[local_pos + 21];
samples[3][3] = inp[local_pos + 22];
samples[4][0] = inp[local_pos + 30];
samples[4][1] = inp[local_pos + 31];
samples[4][2] = inp[local_pos + 32];
samples[4][3] = inp[local_pos + 33];
samples[5][0] = inp[local_pos + 34];
samples[5][1] = inp[local_pos + 35];
samples[5][2] = inp[local_pos + 36];
samples[5][3] = inp[local_pos + 37];
samples[6][0] = inp[local_pos + 45];
samples[6][1] = inp[local_pos + 46];
samples[6][2] = inp[local_pos + 47];
samples[6][3] = inp[local_pos + 48];
samples[7][0] = inp[local_pos + 49];
samples[7][1] = inp[local_pos + 50];
samples[7][2] = inp[local_pos + 51];
samples[7][3] = inp[local_pos + 52];
samples[8][0] = inp[local_pos + 60];
samples[8][1] = inp[local_pos + 61];
samples[8][2] = inp[local_pos + 62];
samples[8][3] = inp[local_pos + 63];
samples[9][0] = inp[local_pos + 64];
samples[9][1] = inp[local_pos + 65];
samples[9][2] = inp[local_pos + 66];
samples[9][3] = inp[local_pos + 67];
samples[10][0] = inp[local_pos + 75];
samples[10][1] = inp[local_pos + 76];
samples[10][2] = inp[local_pos + 77];
samples[10][3] = inp[local_pos + 78];
samples[11][0] = inp[local_pos + 79];
samples[11][1] = inp[local_pos + 80];
samples[11][2] = inp[local_pos + 81];
samples[11][3] = inp[local_pos + 82];
// Predicted value for the new pixel.
ret[0] = nnedi3(samples);
// 33 = 2 * 15 + 3 undoes the (-2, -3) load offset, i.e. this thread's own source texel.
ret0[0] = inp[local_pos + 33];
#if CURRENT_PASS == LAST_PASS
// NOTE(review): this check scales threadId by 2 on both axes while the stores below scale
// gl_GlobalInvocationID by (2, 1) - confirm the two agree for this pass's block size.
uint2 destPos = blockStart + threadId.xy * 2;
uint2 outputSize = GetOutputSize();
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
return;
}
#endif
// imageStore is a macro here: it forwards (pos, val.x) to imageStoreOverride.
// Even x gets the original value, odd x the prediction.
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1), ret0);
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1) + ivec2(1, 0), ret);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,73 @@
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
// Conversion from GLSL to HLSL is done through defines as much as possible to ease synchronization and comparison with upstream
// GLSL vector/matrix type names mapped onto their HLSL counterparts.
#define ivec2 int2
#define vec2 float2
#define vec3 float3
#define vec4 float4
#define mat4x3 float4x3
// GLSL's component-wise matrix product is written with plain `*` here.
#define matrixCompMult(mtx1, mtx2) (mtx1 * mtx2)
// GLSL storage qualifier for workgroup-shared variables.
#define shared groupshared
// NOTE(review): HLSL atan2 takes two arguments, so this alias only suits two-argument
// atan(y, x) calls - confirm no single-argument atan is used by the ported shaders.
#define atan atan2
// Group-wide execution and memory barrier.
#define barrier GroupMemoryBarrierWithGroupSync
#define fract frac
#define intBitsToFloat asfloat
#define inversesqrt rsqrt
// mod deals only with positive numbers here and it could be substituted by fmod
#define mod fmod
// lerp handles bools as the third argument differently from mix
// Boolean-selector overload of mix: yields b when c is true and a otherwise.
// No interpolation takes place; exactly one of the two operands is returned.
float mix(float a, float b, bool c) {
	if (c) {
		return b;
	}
	return a;
}
// Generates a mix(a, b, c) overload that forwards to lerp.
// type1 is the type of both operands and of the result; type3 is the type of the weight.
#define MIX_LERP(type1, type3) type1 mix(type1 a, type1 b, type3 c) { return lerp(a, b, c); }
// Overloads provided: scalar, float2 with a per-component weight, and float3/float4
// with a single scalar weight.
MIX_LERP(float, float)
MIX_LERP(float2, float2)
MIX_LERP(float3, float)
MIX_LERP(float4, float)
// GLSL texture(): samples mip level 0 of `tex`. The sampler name is built by pasting the
// texture name onto "sam_", so a sampler called sam_<tex> must be declared for each texture.
#define texture(tex, pos) tex.SampleLevel(sam_##tex, pos, 0.0)
// Size of one output texel as reported by GetOutputPt().
#define OUTPUT_pt float2(GetOutputPt())
// Centre of the pixel with integer coordinates `id`.
#define frag_pos(id) (vec2(id) + vec2(0.5, 0.5))
// Pixel centre scaled by the output texel size.
#define frag_map(id) (OUTPUT_pt * frag_pos(id))
#define HOOKED_map(id) frag_map(id)
// GLSL compute built-ins rebuilt from the pass parameters blockStart/threadId and the
// MP_NUM_THREADS_* / MP_BLOCK_* constants, which must be defined wherever these are used.
#define gl_LocalInvocationIndex (threadId.y*MP_NUM_THREADS_X + threadId.x)
#define gl_LocalInvocationID threadId
#define gl_WorkGroupSize (uint2(MP_NUM_THREADS_X, MP_NUM_THREADS_Y))
#define gl_WorkGroupID (blockStart / uint2(MP_BLOCK_WIDTH, MP_BLOCK_HEIGHT))
#define gl_GlobalInvocationID (gl_WorkGroupID*gl_WorkGroupSize + threadId.xy)
// disable warning about unknown pragma
#pragma warning(disable: 3568)
// disable warning about too many threads (ravu-r4-rgb triggers it)
#pragma warning(disable: 4714)
// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.709-6-201506-I!!PDF-E.pdf
// Luma weights for R, G and B; dot(rgb, rgb2y) gives Y.
static const float3 rgb2y = float3(0.2126, 0.7152, 0.0722);
// RGB -> (U, V). Row 0 is (B - Y) / 1.8556 and row 1 is (R - Y) / 1.5748,
// each expanded with the rgb2y weights above.
static const float2x3 rgb2uv = {
-0.2126/1.8556, -0.7152/1.8556, 0.9278/1.8556,
0.7874/1.5748, -0.7152/1.5748, -0.0722/1.5748
};
// (Y, U, V) -> RGB, the inverse of rgb2y/rgb2uv:
//   R = Y + 1.5748 * V
//   G = Y - 0.187324 * U - 0.468124 * V
//   B = Y + 1.8556 * U
static const float3x3 yuv2rgb = {
1, 0, 1.5748,
1, -0.187324, -0.468124,
1, 1.8556, 0
};

View file

@ -1,20 +1,20 @@
//!MAGPIE EFFECT
//!VERSION 3
//!GENERIC_DOWNSCALER
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
// Pixel-shader pass: returns INPUT sampled at `pos` (mip level 0) through `sam`,
// which the effect header declares with the POINT filter.
float4 Pass1(float2 pos) {
	const float4 texel = INPUT.SampleLevel(sam, pos, 0);
	return texel;
}

View file

@ -2,14 +2,17 @@
// 移植自 https://casual-effects.com/research/McGuire2021PixelArt/index.html
//!MAGPIE EFFECT
//!VERSION 3
//!OUTPUT_WIDTH INPUT_WIDTH * 2
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -17,118 +20,113 @@ SamplerState sam;
//!PASS 1
//!IN INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64
// Fetches the RGB of the input at (x, y), given in input-texel units; multiplying by
// GetInputPt() converts to the normalized coordinates SampleLevel expects.
#define src(x, y) INPUT.SampleLevel(sam, float2(x, y) * GetInputPt(), 0).rgb
// Brightness proxy used to order colours: the unweighted sum of the three channels.
float luma(float3 C) {
	return C.r + C.g + C.b;
}
// True when B is exactly equal (all channels) to both A0 and A1.
bool all_eq2(float3 B, float3 A0, float3 A1) {
	return all(B == A0) && all(B == A1);
}
// True when B is exactly equal (all channels) to each of A0, A1 and A2.
bool all_eq3(float3 B, float3 A0, float3 A1, float3 A2) {
	return all(B == A0) && all(B == A1) && all(B == A2);
}
// True when B is exactly equal (all channels) to each of A0, A1, A2 and A3.
bool all_eq4(float3 B, float3 A0, float3 A1, float3 A2, float3 A3) {
	return all(B == A0) && all(B == A1) && all(B == A2) && all(B == A3);
}
// True when B is exactly equal (all channels) to at least one of A0, A1 and A2.
bool any_eq3(float3 B, float3 A0, float3 A1, float3 A2) {
	return all(B == A0) || all(B == A1) || all(B == A2);
}
// True when B differs (in at least one channel) from both A0 and A1.
bool none_eq2(float3 B, float3 A0, float3 A1) {
	return any(B != A0) && any(B != A1);
}
// True when B differs (in at least one channel) from every one of A0, A1, A2 and A3.
bool none_eq4(float3 B, float3 A0, float3 A1, float3 A2, float3 A3) {
	return any(B != A0) && any(B != A1) && any(B != A2) && any(B != A3);
}
// MMPX 2x pixel-art upscaling pass.
// Each thread reads the 3x3 neighbourhood (plus a few farther taps) around one source
// texel E and writes the 2x2 block of output pixels J K / L M that replaces it.
//   blockStart - top-left output coordinate of the block handled by this thread group
//   threadId   - thread index inside the group; only .x is used
// With BLOCK_SIZE 16 and NUM_THREADS 64, the 64 threads of a group cover the block with
// one 2x2 quad each.
void Pass1(uint2 blockStart, uint3 threadId) {
	// Rmp8x8 is a host-provided helper; presumably it remaps the linear thread index to a
	// 2D position in an 8x8 tile - confirm. The shift doubles it into output coordinates.
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;

	// Quads whose top-left pixel lies outside the output are skipped entirely.
	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	// Centre of the source texel covered by this quad, in input-texel units.
	float srcX = (gxy.x >> 1) + 0.5f;
	float srcY = (gxy.y >> 1) + 0.5f;

	// 3x3 neighbourhood, row by row:
	//   A B C
	//   D E F
	//   G H I
	float3 A = src(srcX - 1, srcY - 1), B = src(srcX, srcY - 1), C = src(srcX + 1, srcY - 1);
	float3 D = src(srcX - 1, srcY + 0), E = src(srcX, srcY + 0), F = src(srcX + 1, srcY + 0);
	float3 G = src(srcX - 1, srcY + 1), H = src(srcX, srcY + 1), I = src(srcX + 1, srcY + 1);

	// Output quad (J K / L M) defaults to plain nearest-neighbour replication of E.
	float3 J = E, K = E, L = E, M = E;

	// The rules only need evaluating when the neighbourhood is not one flat colour.
	if (any(A != E) || any(B != E) || any(C != E) || any(D != E) || any(F != E) || any(G != E) || any(H != E) || any(I != E)) {
		// Taps two texels away: P above, S below, Q left, R right.
		float3 P = src(srcX, srcY - 2), S = src(srcX, srcY + 2);
		float3 Q = src(srcX - 2, srcY), R = src(srcX + 2, srcY);
		float Bl = luma(B), Dl = luma(D), El = luma(E), Fl = luma(F), Hl = luma(H);

		// 1:1 slope rules
		if ((all(D == B) && any(D != H) && any(D != F)) && (El >= Dl || all(E == A)) && any_eq3(E, A, C, G) && ((El < Dl) || any(A != D) || any(E != P) || any(E != Q))) J = D;
		if ((all(B == F) && any(B != D) && any(B != H)) && (El >= Bl || all(E == C)) && any_eq3(E, A, C, I) && ((El < Bl) || any(C != B) || any(E != P) || any(E != R))) K = B;
		if ((all(H == D) && any(H != F) && any(H != B)) && (El >= Hl || all(E == G)) && any_eq3(E, A, G, I) && ((El < Hl) || any(G != H) || any(E != S) || any(E != Q))) L = H;
		if ((all(F == H) && any(F != B) && any(F != D)) && (El >= Fl || all(E == I)) && any_eq3(E, C, G, I) && ((El < Fl) || any(I != H) || any(E != R) || any(E != S))) M = F;

		// Intersection rules
		if ((any(E != F) && all_eq4(E, C, I, D, Q) && all_eq2(F, B, H)) && (any(F != src(srcX + 3, srcY)))) K = M = F;
		if ((any(E != D) && all_eq4(E, A, G, F, R) && all_eq2(D, B, H)) && (any(D != src(srcX - 3, srcY)))) J = L = D;
		if ((any(E != H) && all_eq4(E, G, I, B, P) && all_eq2(H, D, F)) && (any(H != src(srcX, srcY + 3)))) L = M = H;
		if ((any(E != B) && all_eq4(E, A, C, H, S) && all_eq2(B, D, F)) && (any(B != src(srcX, srcY - 3)))) J = K = B;
		if (Bl < El && all_eq4(E, G, H, I, S) && none_eq4(E, A, D, C, F)) J = K = B;
		if (Hl < El && all_eq4(E, A, B, C, P) && none_eq4(E, D, G, I, F)) L = M = H;
		if (Fl < El && all_eq4(E, A, D, G, Q) && none_eq4(E, B, C, I, H)) K = M = F;
		if (Dl < El && all_eq4(E, C, F, I, R) && none_eq4(E, B, A, G, H)) J = L = D;

		// 2:1 slope rules
		if (any(H != B)) {
			if (any(H != A) && any(H != E) && any(H != C)) {
				if (all_eq3(H, G, F, R) && none_eq2(H, D, src(srcX + 2, srcY - 1))) L = M;
				if (all_eq3(H, I, D, Q) && none_eq2(H, F, src(srcX - 2, srcY - 1))) M = L;
			}

			if (any(B != I) && any(B != G) && any(B != E)) {
				if (all_eq3(B, A, F, R) && none_eq2(B, D, src(srcX + 2, srcY + 1))) J = K;
				if (all_eq3(B, C, D, Q) && none_eq2(B, F, src(srcX - 2, srcY + 1))) K = J;
			}
		} // H !== B

		if (any(F != D)) {
			if (any(D != I) && any(D != E) && any(D != C)) {
				if (all_eq3(D, A, H, S) && none_eq2(D, B, src(srcX + 1, srcY + 2))) J = L;
				if (all_eq3(D, G, B, P) && none_eq2(D, H, src(srcX + 1, srcY - 2))) L = J;
			}

			if (any(F != E) && any(F != A) && any(F != G)) {
				if (all_eq3(F, C, H, S) && none_eq2(F, B, src(srcX - 1, srcY + 2))) K = M;
				if (all_eq3(F, I, B, P) && none_eq2(F, H, src(srcX - 1, srcY - 2))) M = K;
			}
		} // F !== D
	} // not constant

	// Write four pixels at once, walking the quad clockwise: J (top-left), K (top-right),
	// M (bottom-right), L (bottom-left).
	OUTPUT[gxy] = float4(J, 1);

	++gxy.x;
	OUTPUT[gxy] = float4(K, 1);

	++gxy.y;
	OUTPUT[gxy] = float4(M, 1);

	--gxy.x;
	OUTPUT[gxy] = float4(L, 1);
}

View file

@ -1,12 +1,15 @@
// 移植自 https://github.com/libretro/common-shaders/blob/master/interpolation/shaders/pixellate.cg
//!MAGPIE EFFECT
//!VERSION 3
//!VERSION 4
//!TEXTURE
Texture2D INPUT;
//!TEXTURE
Texture2D OUTPUT;
//!SAMPLER
//!FILTER POINT
SamplerState sam;
@ -15,6 +18,7 @@ SamplerState sam;
//!PASS 1
//!STYLE PS
//!IN INPUT
//!OUT OUTPUT
float4 Pass1(float2 pos) {
float2 texelSize = GetInputPt();

Some files were not shown because too many files have changed in this diff Show more