mirror of
https://github.com/Blinue/Magpie.git
synced 2026-06-24 02:04:10 +00:00
commit
65e5bd8331
367 changed files with 104653 additions and 12644 deletions
11
.github/workflows/build.yml
vendored
11
.github/workflows/build.yml
vendored
|
|
@ -9,6 +9,9 @@ on:
|
|||
jobs:
|
||||
build:
|
||||
runs-on: windows-latest
|
||||
strategy:
|
||||
matrix:
|
||||
platform: ["x64", "ARM64"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -24,10 +27,10 @@ jobs:
|
|||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.conan2/p
|
||||
key: ${{ runner.os }}-conan-${{ hashFiles('src/**/conanfile.txt') }}
|
||||
key: Conan-${{ hashFiles('src/**/conanfile.txt') }}-${{ matrix.platform }}
|
||||
|
||||
- name: Build
|
||||
run: python publish.py
|
||||
run: python publish.py ${{ matrix.platform }}
|
||||
|
||||
- name: Save hash
|
||||
id: hash
|
||||
|
|
@ -36,5 +39,5 @@ jobs:
|
|||
- name: Store build
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Magpie-dev-${{ steps.hash.outputs.sha_short }}
|
||||
path: ./publish
|
||||
name: Magpie-dev-${{ steps.hash.outputs.sha_short }}-${{ matrix.platform }}
|
||||
path: ./publish/${{ matrix.platform }}
|
||||
|
|
|
|||
55
.github/workflows/release.yml
vendored
55
.github/workflows/release.yml
vendored
|
|
@ -24,9 +24,13 @@ on:
|
|||
required: true
|
||||
type: boolean
|
||||
jobs:
|
||||
release:
|
||||
build:
|
||||
runs-on: windows-latest
|
||||
|
||||
outputs:
|
||||
tag: ${{ steps.tag.outputs.tag }}
|
||||
strategy:
|
||||
matrix:
|
||||
platform: ["x64", "ARM64"]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
|
@ -42,13 +46,50 @@ jobs:
|
|||
with:
|
||||
path: ~/.conan2/p
|
||||
key: ${{ runner.os }}-conan-${{ hashFiles('src/**/conanfile.txt') }}
|
||||
|
||||
- name: Generate tag
|
||||
id: tag
|
||||
run: |
|
||||
$tag = "${{ inputs.tag }}" -eq "" ? "v${{ inputs.major }}.${{ inputs.minor }}.${{ inputs.patch }}" : "${{ inputs.tag }}"
|
||||
echo "tag=$tag" >> $env:GITHUB_OUTPUT
|
||||
|
||||
- name: Publish release
|
||||
run: python publish.py
|
||||
- name: Build
|
||||
run: python publish.py ${{ matrix.platform }}
|
||||
env:
|
||||
MAJOR: ${{ inputs.major }}
|
||||
MINOR: ${{ inputs.minor }}
|
||||
PATCH: ${{ inputs.patch }}
|
||||
TAG: ${{ inputs.tag }}
|
||||
PRERELEASE: ${{ inputs.prerelease }}
|
||||
ACCESS_TOKEN: ${{ secrets.CONTENTS_ACCESS_TOKEN }}
|
||||
TAG: ${{ steps.tag.outputs.tag }}
|
||||
|
||||
- name: Store artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Magpie-${{ steps.tag.outputs.tag }}-${{ matrix.platform }}
|
||||
path: publish/${{ matrix.platform }}
|
||||
release:
|
||||
runs-on: windows-latest
|
||||
needs: build
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Setup Requests
|
||||
run: pip install requests
|
||||
|
||||
- name: Restore artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: publish
|
||||
|
||||
- name: Publish release
|
||||
run: python ci/release.py
|
||||
env:
|
||||
MAJOR: ${{ inputs.major }}
|
||||
MINOR: ${{ inputs.minor }}
|
||||
PATCH: ${{ inputs.patch }}
|
||||
TAG: ${{ needs.build.outputs.tag }}
|
||||
PRERELEASE: ${{ inputs.prerelease }}
|
||||
ACCESS_TOKEN: ${{ secrets.CONTENTS_ACCESS_TOKEN }}
|
||||
|
|
|
|||
|
|
@ -24,7 +24,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
|
|||
src\Common.Pre.props = src\Common.Pre.props
|
||||
Directory.Build.props = Directory.Build.props
|
||||
src\extract_winui_runtime.py = src\extract_winui_runtime.py
|
||||
src\fix_resfiles.py = src\fix_resfiles.py
|
||||
src\HybridCRT.props = src\HybridCRT.props
|
||||
src\WinUI.props = src\WinUI.props
|
||||
EndProjectSection
|
||||
|
|
|
|||
158
ci/release.py
Normal file
158
ci/release.py
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
import requests
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
try:
|
||||
# https://docs.github.com/en/actions/learn-github-actions/variables
|
||||
if os.environ["GITHUB_ACTIONS"].lower() == "true":
|
||||
# 不知为何在 Github Actions 中运行时默认编码为 ANSI,并且 print 需刷新流才能正常显示
|
||||
for stream in [sys.stdout, sys.stderr]:
|
||||
stream.reconfigure(encoding="utf-8")
|
||||
except:
|
||||
pass
|
||||
|
||||
majorVersion = os.environ["MAJOR"]
|
||||
minorVersion = os.environ["MINOR"]
|
||||
patchVersion = os.environ["PATCH"]
|
||||
tag = os.environ["TAG"]
|
||||
isPrerelease = os.environ["PRERELEASE"].lower() == "true"
|
||||
githubAccessToken = os.environ["ACCESS_TOKEN"]
|
||||
repo = os.environ["GITHUB_REPOSITORY"]
|
||||
actor = os.environ["GITHUB_ACTOR"]
|
||||
|
||||
subprocess.run("git config user.name " + actor)
|
||||
subprocess.run(f"git config user.email {actor}@users.noreply.github.com")
|
||||
|
||||
subprocess.run(
|
||||
f"git remote set-url origin https://{githubAccessToken}@github.com/{repo}.git"
|
||||
)
|
||||
|
||||
# 打标签
|
||||
if subprocess.run(f"git tag -a {tag} -m {tag}").returncode != 0:
|
||||
raise Exception("打标签失败")
|
||||
|
||||
if subprocess.run("git push origin " + tag).returncode != 0:
|
||||
raise Exception("推送标签失败")
|
||||
|
||||
print("已创建标签 " + tag, flush=True)
|
||||
|
||||
headers = {
|
||||
"Accept": "application/vnd.github+json",
|
||||
"Authorization": "Bearer " + githubAccessToken,
|
||||
"X-GitHub-Api-Version": "2022-11-28",
|
||||
}
|
||||
|
||||
# 获取前一个发布版本来生成默认发行说明
|
||||
prevReleaseTag = None
|
||||
try:
|
||||
if isPrerelease:
|
||||
# 发布预发行版与最新的版本(无论是正式版还是预发行版)对比
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{repo}/releases",
|
||||
json={"per_page": 1},
|
||||
headers=headers,
|
||||
)
|
||||
if response.ok:
|
||||
prevReleaseTag = response.json()[0]["tag_name"]
|
||||
else:
|
||||
# 发布正式版则与最新的正式版对比
|
||||
# 由于可以自己选择最新版本,此接口可能不会返回时间上最新发布的版本,不是大问题
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{repo}/releases/latest", headers=headers
|
||||
)
|
||||
if response.ok:
|
||||
prevReleaseTag = response.json()["tag_name"]
|
||||
except:
|
||||
# 忽略错误
|
||||
pass
|
||||
|
||||
# 发布 release
|
||||
if prevReleaseTag == None:
|
||||
body = ""
|
||||
else:
|
||||
# 默认发行说明为比较两个 tag
|
||||
body = f"https://github.com/{repo}/compare/{prevReleaseTag}...{tag}"
|
||||
|
||||
response = requests.post(
|
||||
f"https://api.github.com/repos/{repo}/releases",
|
||||
json={
|
||||
"tag_name": tag,
|
||||
"name": tag,
|
||||
"prerelease": isPrerelease,
|
||||
"body": body,
|
||||
"discussion_category_name": "Announcements",
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
if not response.ok:
|
||||
raise Exception("发布失败")
|
||||
|
||||
uploadUrl = response.json()["upload_url"]
|
||||
uploadUrl = uploadUrl[: uploadUrl.find("{")] + "?name="
|
||||
|
||||
os.chdir(os.path.dirname(__file__) + "\\..\\publish")
|
||||
|
||||
pkgInfos = {}
|
||||
for platform in ["x64", "ARM64"]:
|
||||
# 打包成 zip
|
||||
pkgName = "Magpie-" + tag + "-" + platform
|
||||
shutil.make_archive(pkgName, "zip", pkgName)
|
||||
pkgName += ".zip"
|
||||
|
||||
# 上传资产
|
||||
with open(pkgName, "rb") as f:
|
||||
# 流式上传
|
||||
# https://requests.readthedocs.io/en/latest/user/advanced/#streaming-uploads
|
||||
response = requests.post(
|
||||
uploadUrl + pkgName,
|
||||
data=f,
|
||||
headers={**headers, "Content-Type": "application/zip"},
|
||||
)
|
||||
|
||||
if not response.ok:
|
||||
raise Exception("上传失败")
|
||||
|
||||
# 计算哈希
|
||||
f.seek(0, os.SEEK_SET)
|
||||
md5 = hashlib.file_digest(f, hashlib.md5).hexdigest()
|
||||
|
||||
pkgInfos[platform] = (pkgName, md5)
|
||||
|
||||
print("已发布 " + tag, flush=True)
|
||||
|
||||
# 更新 version.json
|
||||
# 此步应在发布版本之后,因为程序使用 version.json 检查更新
|
||||
os.chdir("..")
|
||||
with open("version.json", "w", encoding="utf-8") as f:
|
||||
json.dump(
|
||||
{
|
||||
"version": f"{majorVersion}.{minorVersion}.{patchVersion}",
|
||||
"tag": tag,
|
||||
"binary": {
|
||||
"x64": {
|
||||
"url": f"https://github.com/{repo}/releases/download/{tag}/{pkgInfos['x64'][0]}",
|
||||
"hash": pkgInfos["x64"][1],
|
||||
},
|
||||
"ARM64": {
|
||||
"url": f"https://github.com/{repo}/releases/download/{tag}/{pkgInfos['ARM64'][0]}",
|
||||
"hash": pkgInfos["ARM64"][1],
|
||||
},
|
||||
},
|
||||
},
|
||||
f,
|
||||
indent=4,
|
||||
)
|
||||
|
||||
# 提交对 version.json 的更改
|
||||
if subprocess.run("git add version.json").returncode != 0:
|
||||
raise Exception("git add 失败")
|
||||
|
||||
if subprocess.run('git commit -m "Update version.json"').returncode != 0:
|
||||
raise Exception("git commit 失败")
|
||||
|
||||
if subprocess.run("git push").returncode != 0:
|
||||
raise Exception("git push 失败")
|
||||
|
|
@ -18,7 +18,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
|
|||
* Parameter:
|
||||
* Strength: Denoise magnitude
|
||||
|
||||
* Anime4K_Restore_S, Anime4K_Restore_M, Anime4K_Restore_L, Anime4K_Restore_VL, Anime4K_Restore_UL, Anime4K_Restore_Soft_S, Anime4K_Restore_Soft_M, Anime4K_Restore_Soft_L, Anime4K_Restore_Soft_VL, Anime4K_Restore_Soft_UL: Algorithms to restore the lines in animations. In increasing order of demand for computing power. The Soft variants are more conservative in sharpening.
|
||||
* Anime4K_Restore family: Algorithms to restore the lines in animations. In increasing order of demand for computing power. The Soft variants are more conservative in sharpening.
|
||||
* Output size: the same as the input
|
||||
|
||||
* Anime4K_Thin_HQ: Algorithm to clarify lines in animations provided by Anime4K.
|
||||
|
|
@ -27,7 +27,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
|
|||
* Strength: The strength in each iteration.
|
||||
* Iterations: The number of iterations. Decreasing strength and increasing iterations improves the quality of the images, but will lower the processing speed.
|
||||
|
||||
* Anime4K_Upscale_S, Anime4K_Upscale_L, Anime4K_Upscale_Denoise_S, Anime4K_Upscale_Denoise_L, and Anime4K_Upscale_GAN_x2_S: Anime-style scaling algorithms provided by Anime4K. The denoise variant includes denoise functionality. The GAN variant, which keeps more details, is still under experiment.
|
||||
* Anime4K_Upscale family: Anime-style scaling algorithms provided by Anime4K. The denoise variant includes denoise functionality. The GAN variant, which keeps more details, is still under experiment.
|
||||
* Output size: twice that of the input
|
||||
|
||||
* Bicubic: Interpolation algorithms. The lite variant is fast, but at the cost of quality degradation. Suitable for users with weak graphics cards.
|
||||
|
|
@ -124,6 +124,9 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
|
|||
* Bloom Amount
|
||||
* Filter Kernel Shape
|
||||
|
||||
* CuNNy family: Suitable for visual novel-style images. The DS variants offer a subtle denoise effect. Provided by [CuNNy](https://github.com/cunnyplapper/CuNNy).
|
||||
* Output size: twice that of the input
|
||||
|
||||
* Deband
|
||||
* Output size: the same as the input
|
||||
* Parameters
|
||||
|
|
@ -221,7 +224,7 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
|
|||
* Sharpness
|
||||
* Note: Only supports upscaling.
|
||||
|
||||
* NNEDI3_nns16_win8x4 and NNEDI3_nns64_win8x6:These shaders originally designed for deinterlacing and are also high-quality interpolation algorithms. NNEDI3_nns64_win8x6 produces higher quality results, but slower.
|
||||
* NNEDI3 family: These shaders were originally designed for deinterlacing and are also high-quality interpolation algorithms. NNEDI3_nns64_win8x6 produces higher quality results, but is slower.
|
||||
* Output size: twice that of the input
|
||||
|
||||
* NVSharpen: Port of NVSharpen that was published along with NIS.
|
||||
|
|
@ -232,10 +235,10 @@ Magpie ships with a handful of effects that can be used in combinations. Most of
|
|||
* Pixellate: Scale with the Pixellate algorithm. Suitable for upscaling pixel arts.
|
||||
* Output size: determined by scale configuration
|
||||
|
||||
* RAVU_Lite_R3: Port of ravu-lite-r3
|
||||
* RAVU family: Ported from https://github.com/bjin/mpv-prescalers
|
||||
* Output size: twice that of the input
|
||||
|
||||
* RAVU_Zoom_R3: Port of ravu-zoom-r3
|
||||
* RAVU_Zoom family: Ported from https://github.com/bjin/mpv-prescalers
|
||||
* Output size: determined by scale configuration
|
||||
* Note: Only supports upscaling.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
Magpie provides several capture methods. They have their pros and cons in different scenarios.
|
||||
Magpie provides several capture methods. They have their pros and cons in different scenarios. For general purposes, it's recommended to use Graphics Capture, as it provides the best compatibility and smoothness.
|
||||
|
||||
| | Graphics Capture | Desktop Duplication | GDI | DwmSharedSurface |
|
||||
| :---: | :---: | :---: | :---: |:---: |
|
||||
|
|
@ -6,11 +6,9 @@ Magpie provides several capture methods. They have their pros and cons in differ
|
|||
| Supports recording/streaming | No under extreme conditions<sup>[1]</sup> | No | Yes | Yes |
|
||||
| Support the source window to span multiple screens | No under extreme conditions<sup>[1]</sup> | No | Yes | Yes |
|
||||
| Ignores DPI virtualization<sup>[2]</sup> | No | No | Yes| Yes |
|
||||
| Notes | The most recommended capture method | Requires Win10 v2004, suitable for games with more static frames<sup>[3]</sup>, could capture pop-ups | | Low VRAM usage |
|
||||
| Notes | The most recommended capture method | Requires Win10 v2004 | | Low VRAM usage |
|
||||
|
||||
|
||||
[1]: (1) The source window does not support regular window capture. (2) The operating system is Windows 11.
|
||||
|
||||
[2]: The system will perform bicubic interpolation upscaling on windows that do not support DPI scaling. The capture methods supporting this option capture the images before such scaling.
|
||||
|
||||
[3]: The Desktop Duplication mode effectively reduces the power consumption if there are many static frames.
|
||||
|
|
|
|||
|
|
@ -2,23 +2,12 @@ MagpieFX is based on DirectX 11 compute shader
|
|||
|
||||
``` hlsl
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
// Specify "USE_DYNAMIC" to use GetFrameCount or GetCursorPos.
|
||||
//!VERSION 4
|
||||
// Specify "USE_DYNAMIC" to use GetFrameCount.
|
||||
//!USE_DYNAMIC
|
||||
// Specifying "GENERIC_DOWNSCALER" indicates that this effect can be used as the "default downscaling effect".
|
||||
//!GENERIC_DOWNSCALER
|
||||
// Use "SORT_NAME" to specify the name used for sorting, otherwise the files will be sorted by their file names.
|
||||
//!SORT_NAME test1
|
||||
|
||||
// Not specifying "OUTPUT_WIDTH" and "OUTPUT_HEIGHT" indicates that this effect supports outputting to any size.
|
||||
// You can use some pre-defined constants when calculating texture size.
|
||||
// INPUT_WIDTH
|
||||
// INPUT_HEIGHT
|
||||
// OUTPUT_WIDTH
|
||||
// OUTPUT_HEIGHT
|
||||
|
||||
|
||||
// Definition of parameters
|
||||
//!PARAMETER
|
||||
|
|
@ -33,13 +22,25 @@ float sharpness;
|
|||
|
||||
|
||||
// Definition of textures
|
||||
// "INPUT" is a special keyword.
|
||||
// "INPUT" cannot be used as the output of a pass.
|
||||
// Defining INPUT is optional, but it is recommended to define it explicitly for the sake of semantic completeness.
|
||||
// "INPUT" and "OUTPUT" are special keywords.
|
||||
// "INPUT" cannot be used as the output of a pass; "OUTPUT" cannot be used as the input of a pass.
|
||||
// Defining INPUT/OUTPUT is optional, but it is recommended to define them explicitly for the sake of semantic completeness.
|
||||
// The size of the OUTPUT represents the output size of this effect. Not specifying it indicates support for output of any size.
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
// You can use some pre-defined constants to calculate texture size.
|
||||
// INPUT_WIDTH
|
||||
// INPUT_HEIGHT
|
||||
// OUTPUT_WIDTH
|
||||
// OUTPUT_HEIGHT
|
||||
|
||||
// Supported texture formats:
|
||||
// R32G32B32A32_FLOAT
|
||||
// R16G16B16A16_FLOAT
|
||||
|
|
@ -110,11 +111,10 @@ float4 Pass1(float2 pos) {
|
|||
return float4(1, 1, 1, 1);
|
||||
}
|
||||
|
||||
// The last pass does not support "OUT".
|
||||
// If you are using the CS style, you must use "WriteToOutput" to output the result.
|
||||
|
||||
//!PASS 2
|
||||
//!IN INPUT, tex1
|
||||
// The output of the last pass must be "OUTPUT".
|
||||
//!OUT OUTPUT
|
||||
// "BLOCK_SIZE" specifies how large an area is processed in one dispatch.
|
||||
// "BLOCK_SIZE" can be given a single value, in which case it specifies both the width and the height.
|
||||
//!BLOCK_SIZE 16, 16
|
||||
|
|
@ -123,18 +123,13 @@ float4 Pass1(float2 pos) {
|
|||
//!NUM_THREADS 64, 1, 1
|
||||
|
||||
void Pass2(uint2 blockStart, uint3 threadId) {
|
||||
// Render the cursor and then output.
|
||||
// Available only in the last pass.
|
||||
WriteToOutput(blockStart, float3(1,1,1));
|
||||
// Write to OUTPUT
|
||||
OUTPUT[blockStart] = float4(1,1,1,1);
|
||||
}
|
||||
```
|
||||
|
||||
### Predefined functions
|
||||
|
||||
**void WriteToOutput(uint2 pos, float3 color)**: Only available in the last pass and is used to write results to the output texture.
|
||||
|
||||
**bool CheckViewport(uint2 pos)**: Only available in the last pass and is used to check whether the output coordinates are inside the viewport.
|
||||
|
||||
**uint2 GetInputSize()**: Retrieves the size of the input texture.
|
||||
|
||||
**float2 GetInputPt()**: Retrieves the size of a pixel in the input texture.
|
||||
|
|
@ -147,8 +142,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
**uint GetFrameCount()**: Retrieves the total number of frames rendered so far. When using this function, you must specify USE_DYNAMIC.
|
||||
|
||||
**uint2 GetCursorPos()**: Retrieves the current cursor position. When using this function, you must specify USE_DYNAMIC.
|
||||
|
||||
**uint2 Rmp8x8(uint id)**: Maps the values of 0 to 63 to coordinates in an 8x8 square in swizzle order, which can improve texture cache hit rate.
|
||||
|
||||
|
||||
|
|
@ -164,10 +157,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
**MP_DEBUG**: Whether the shader is being compiled in debug mode (when compiling shaders in debug mode, they are not optimized and contain debug information).
|
||||
|
||||
**MP_LAST_PASS**: Whether the current pass is the last pass of the effect.
|
||||
|
||||
**MP_LAST_EFFECT**: Whether the effect is the last effect for the current scaling mode (the last effect needs to handle viewport and cursor rendering).
|
||||
|
||||
**MP_FP16**: Whether to use half-precision floating-point numbers (specified by the user).
|
||||
|
||||
**MF, MF1, MF2, ..., MF4x4**: Floating-point data types that conform to MP_FP16. When half-precision is not specified, they are aliases for float..., otherwise they are aliases for min16float...
|
||||
|
|
|
|||
|
|
@ -2,23 +2,12 @@ MagpieFX 基于 DirectX 11 计算着色器
|
|||
|
||||
``` hlsl
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
// 若要使用 GetFrameCount 或 GetCursorPos 需指定 USE_DYNAMIC
|
||||
//!VERSION 4
|
||||
// 若要使用 GetFrameCount 需指定 USE_DYNAMIC
|
||||
//!USE_DYNAMIC
|
||||
// GENERIC_DOWNSCALER 表示此效果可以作为“默认降采样效果”
|
||||
//!GENERIC_DOWNSCALER
|
||||
// 使用 SORT_NAME 指定排序时使用的名字,否则按照文件名排序
|
||||
//!SORT_NAME test1
|
||||
|
||||
// 不指定 OUTPUT_WIDTH 和 OUTPUT_HEIGHT 表示此效果支持输出任意尺寸
|
||||
// 计算纹理尺寸时可以使用一些预定义常量
|
||||
// INPUT_WIDTH
|
||||
// INPUT_HEIGHT
|
||||
// OUTPUT_WIDTH
|
||||
// OUTPUT_HEIGHT
|
||||
|
||||
|
||||
// 参数定义
|
||||
//!PARAMETER
|
||||
|
|
@ -33,13 +22,25 @@ float sharpness;
|
|||
|
||||
|
||||
// 纹理定义
|
||||
// INPUT 是特殊关键字
|
||||
// INPUT 不能作为通道的输出
|
||||
// 定义 INPUT 是可选的,但为了保持语义的完整性,建议显式定义
|
||||
// INPUT、OUTPUT 是特殊关键字
|
||||
// INPUT 不能作为通道的输出,OUTPUT 不能作为通道的输入
|
||||
// 定义 INPUT 和 OUTPUT 是可选的,但为了保持语义的完整性,建议显式定义
|
||||
// OUTPUT 的尺寸即为此效果的输出尺寸,不指定则表示支持任意尺寸的输出
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
// 计算纹理尺寸时可以使用一些预定义常量
|
||||
// INPUT_WIDTH
|
||||
// INPUT_HEIGHT
|
||||
// OUTPUT_WIDTH
|
||||
// OUTPUT_HEIGHT
|
||||
|
||||
// 支持的纹理格式:
|
||||
// R32G32B32A32_FLOAT
|
||||
// R16G16B16A16_FLOAT
|
||||
|
|
@ -109,11 +110,10 @@ float4 Pass1(float2 pos) {
|
|||
return float4(1, 1, 1, 1);
|
||||
}
|
||||
|
||||
// 最后一个通道不能指定 OUT
|
||||
// 如果是 CS 风格必须使用 WriteToOutput 输出结果
|
||||
|
||||
//!PASS 2
|
||||
//!IN INPUT, tex1
|
||||
// 最后一个通道的输出只能是 OUTPUT
|
||||
//!OUT OUTPUT
|
||||
// BLOCK_SIZE 指定一次 dispatch 处理多大的区域
|
||||
// 可以只有一维,即同时指定长和高
|
||||
//!BLOCK_SIZE 16, 16
|
||||
|
|
@ -122,18 +122,13 @@ float4 Pass1(float2 pos) {
|
|||
//!NUM_THREADS 64, 1, 1
|
||||
|
||||
void Pass2(uint2 blockStart, uint3 threadId) {
|
||||
// 渲染光标并写入 OUPUT
|
||||
// 只在最后一个通道中可用
|
||||
WriteToOutput(blockStart, float3(1,1,1));
|
||||
// 写入 OUTPUT
|
||||
OUTPUT[blockStart] = float4(1,1,1,1);
|
||||
}
|
||||
```
|
||||
|
||||
### 预定义函数
|
||||
|
||||
**void WriteToOutput(uint2 pos, float3 color)**:只在最后一个通道(Pass)中可用,用于将结果写入到输出纹理。
|
||||
|
||||
**bool CheckViewport(uint2 pos)**:只在最后一个通道中可用,检查输出坐标是否位于视口内。
|
||||
|
||||
**uint2 GetInputSize()**:获取输入纹理尺寸。
|
||||
|
||||
**float2 GetInputPt()**:获取输入纹理每个像素的尺寸。
|
||||
|
|
@ -146,8 +141,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
**uint GetFrameCount()**:获取当前总计帧数。使用此函数时必须指定 "USE_DYNAMIC"。
|
||||
|
||||
**uint2 GetCursorPos()**:获取当前光标位置。使用此函数时必须指定 "USE_DYNAMIC"。
|
||||
|
||||
**uint2 Rmp8x8(uint id)**:将 0~63 的值以 swizzle 顺序映射到 8x8 的正方形内的坐标,用以提高纹理缓存的命中率。
|
||||
|
||||
|
||||
|
|
@ -163,10 +156,6 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
**MP_DEBUG**:当前是否为调试模式(调试模式下编译的着色器不进行优化且含有调试信息)
|
||||
|
||||
**MP_LAST_PASS**:当前通道是否是当前效果的最后一个通道
|
||||
|
||||
**MP_LAST_EFFECT**:当前效果是否是当前缩放模式的最后一个效果(最后一个效果要处理视口和光标渲染)
|
||||
|
||||
**MP_FP16**:当前是否使用半精度浮点数(由用户指定)
|
||||
|
||||
**MF、MF1、MF2、...、MF4x4**:遵守 fp16 参数的浮点数类型。当未指定 fp16,它们为 float... 的别名,否则为 min16float... 的别名
|
||||
|
|
|
|||
|
|
@ -8,8 +8,6 @@ If you cannot run some effects with high computing power requirements (e.g. Anim
|
|||
|
||||
1. Change to the variants with lower requirements. For example, Anime4K_Upscale_S is much faster than Anime4K_Upscale_L. CAS is much faster than AdaptiveSharpen. They can effectively improve the smoothness of the effects at the cost of some quality degradation.
|
||||
2. Change the capture mode. We recommend you to try each of them.
|
||||
3. Set the frame rate to "unlimited." This will turn off Vsync. It usually increases the frame rate substantially, but may causes the screen to tear.
|
||||
4. Turn on "allow additional latency to improve performance" when Vsync is on. This will not lead to screen tearing and it also raises the frame rate. However, it will cause an extra 1-frame latency.
|
||||
|
||||
## Intermittent lagging
|
||||
|
||||
|
|
@ -25,6 +23,5 @@ If your graphics card is powerful enough, but you are still experiencing lagging
|
|||
|
||||
When you need to save electricity or reduce the heat generated, try the following:
|
||||
|
||||
1. Change the capture more. The Desktop Duplication capture mode effectively reduces the power consumption if there are a lot of static frames in the game.
|
||||
2. Change the effects to their variants with lower requirements.
|
||||
3. Limit the frame rate, which may cause screen tearing.
|
||||
1. Limit the frame rate.
|
||||
2. Opt for effects that require lower performance.
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
|
|||
* 参数
|
||||
* Strength:降噪强度
|
||||
|
||||
* Anime4K_Restore_S、Anime4K_Restore_M、Anime4K_Restore_L、Anime4K_Restore_VL、Anime4K_Restore_UL、Anime4K_Restore_Soft_S、Anime4K_Restore_Soft_M、Anime4K_Restore_Soft_L、Anime4K_Restore_Soft_VL 和 Anime4K_Restore_Soft_UL:Anime4K 提供的用于还原动漫画面线条的算法,S->M->L->VL->UL 对性能的需求依次提高,Soft 变体效果稍弱
|
||||
* Anime4K_Restore 族:Anime4K 提供的用于还原动漫画面线条的算法,S->M->L->VL->UL 对性能的需求依次提高,Soft 变体效果稍弱
|
||||
* 输出尺寸:和输入相同
|
||||
|
||||
* Anime4K_Thin_HQ:Anime4K 提供的用于细化动漫画面线条的算法
|
||||
|
|
@ -27,7 +27,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
|
|||
* Strength:每次迭代的强度
|
||||
* Iterations:迭代次数。降低 Strength 并提高 Iterations 可以提高画面质量,但会降低速度。
|
||||
|
||||
* Anime4K_Upscale_S、Anime4K_Upscale_L、Anime4K_Upscale_VL、Anime4K_Upscale_UL、Anime4K_Upscale_Denoise_S、Anime4K_Upscale_Denoise_L、Anime4K_Upscale_Denoise_VL、Anime4K_Upscale_Denoise_UL 和 Anime4K_Upscale_GAN_x2_S:Anime4K 提供的动画风格图像缩放算法。Denoise 变体包含降噪效果,GAN 变体处于实验阶段,可以保留更多细节。S、L、VL、UL 对性能的要求依次提高
|
||||
* Anime4K_Upscale 族:Anime4K 提供的动画风格图像缩放算法。Denoise 变体包含降噪效果,GAN 变体处于实验阶段,可以保留更多细节。S、L、VL、UL 对性能的要求依次提高
|
||||
* 输出尺寸:输入的两倍
|
||||
|
||||
* Bicubic:双立方(双三次)插值算法
|
||||
|
|
@ -124,6 +124,9 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
|
|||
* Bloom Amount
|
||||
* Filter Kernel Shape
|
||||
|
||||
* CuNNy 族:适合视觉小说风格图像的缩放,由 [CuNNy](https://github.com/cunnyplapper/CuNNy) 提供。DS 变体有轻微降噪效果
|
||||
* 输出尺寸:输入的两倍
|
||||
|
||||
* Deband:去除色带
|
||||
* 输出尺寸:和输入相同
|
||||
* 参数
|
||||
|
|
@ -221,7 +224,7 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
|
|||
* Sharpness:锐化强度
|
||||
* 备注:只支持放大
|
||||
|
||||
* NNEDI3_nns16_win8x4 和 NNEDI3_nns64_win8x6:原本用于去隔行,也是高质量的插值算法。NNEDI3_nns64_win8x6 质量更高,速度更慢
|
||||
* NNEDI3 族:原本用于去隔行,也是高质量的插值算法。移植自 https://github.com/bjin/mpv-prescalers
|
||||
* 输出尺寸:输入的两倍
|
||||
|
||||
* NVSharpen:随 NIS 发布的 NVSharpen 的移植
|
||||
|
|
@ -232,10 +235,10 @@ Magpie 内置了大量效果供组合使用,大部分提供了参数选项以
|
|||
* Pixellate:使用 Pixellate 算法缩放输入。适合放大像素画
|
||||
* 输出尺寸:取决于缩放选项
|
||||
|
||||
* RAVU_Lite_R3:ravu-lite-r3的移植
|
||||
* RAVU 族:移植自 https://github.com/bjin/mpv-prescalers
|
||||
* 输出尺寸:输入的两倍
|
||||
|
||||
* RAVU_Zoom_R3:ravu-zoom-r3的移植
|
||||
* RAVU-Zoom 族:移植自 https://github.com/bjin/mpv-prescalers
|
||||
* 输出尺寸:取决于缩放选项
|
||||
* 备注:只支持放大
|
||||
|
||||
|
|
|
|||
|
|
@ -8,8 +8,6 @@
|
|||
|
||||
1. 更换为性能需求更低的效果。如 Anime4K_Upscale_S 比 Anime4K_Upscale_L 快的多,CAS 比 AdaptiveSharpen 快的多,它们可以有效提高流畅度,代价是一定程度的画面质量损失。
|
||||
2. 尝试更换捕获模式。建议你每种模式都尝试一下。
|
||||
3. 关闭垂直同步。这通常可以大幅提高帧率,但可能造成画面撕裂。
|
||||
4. 开启“垂直同步”并“允许额外的延迟以提高性能”。这个配置不会造成画面撕裂,同时也可以有效提高帧率。缺点是会引入一帧的延迟。
|
||||
|
||||
## 间歇性卡顿
|
||||
|
||||
|
|
@ -25,5 +23,5 @@
|
|||
|
||||
在需要节省电量或降低发热时,请尝试下面的操作:
|
||||
|
||||
1. 更换捕获模式。如果游戏的静止画面较多,Desktop Duplication 捕获模式可以有效降低功耗。
|
||||
1. 限制帧率。
|
||||
2. 更换为性能需求更低的效果。
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。
|
||||
Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。无特殊需求应使用 Graphics Capture,它提供最好的兼容性和流畅度。
|
||||
|
||||
| | Graphics Capture | Desktop Duplication | GDI | DwmSharedSurface |
|
||||
| :---: | :---: | :---: | :---: |:---: |
|
||||
|
|
@ -6,11 +6,9 @@ Magpie 提供数种捕获方式,根据使用场景,它们各有优劣。
|
|||
| 支持录制/串流 | 特殊情况下不支持<sup>[1]</sup> | 否 | 是 | 是 |
|
||||
| 支持源窗口跨越多个屏幕 | 特殊情况下不支持<sup>[1]</sup> | 否 | 是 | 是 |
|
||||
| 无视 DPI 虚拟化<sup>[2]</sup> | 否 | 否 | 是| 是 |
|
||||
| 备注 | 首选捕获方式 | 要求 Win10 v2004;适合静止帧较多的游戏<sup>[3]</sup>;可以捕获到弹窗 | | 占用的显存较少 |
|
||||
| 备注 | 首选捕获方式 | 要求 Win10 v2004 | | 占用的显存较少 |
|
||||
|
||||
|
||||
[1]: (1) 源窗口不支持常规的窗口捕获 (2) 操作系统为 Windows 11
|
||||
|
||||
[2]: 系统会对不支持 DPI 缩放的窗口进行双三次插值放大,支持此项的捕获方式可以捕获到放大前的图像
|
||||
|
||||
[3]: 如果窗口的静止帧较多,使用 Desktop Duplication 可以有效降低功耗
|
||||
|
|
|
|||
171
publish.py
171
publish.py
|
|
@ -18,31 +18,18 @@ try:
|
|||
except:
|
||||
pass
|
||||
|
||||
platform = "x64"
|
||||
if len(sys.argv) == 2:
|
||||
platform = sys.argv[1]
|
||||
if not platform in ["x64", "ARM64"]:
|
||||
raise Exception("非法参数")
|
||||
|
||||
if majorVersion != None:
|
||||
import re
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
# 使用第三方库 requests 发送 HTTP 请求,它是 Conan 的依赖项,无需单独安装
|
||||
import requests
|
||||
|
||||
minorVersion = os.environ["MINOR"]
|
||||
patchVersion = os.environ["PATCH"]
|
||||
|
||||
tag = ""
|
||||
try:
|
||||
tag = os.environ["TAG"]
|
||||
except:
|
||||
pass
|
||||
|
||||
if tag == "":
|
||||
tag = f"v{majorVersion}.{minorVersion}.{patchVersion}"
|
||||
|
||||
isPrerelease = os.environ["PRERELEASE"].lower() == "true"
|
||||
|
||||
githubAccessToken = os.environ["ACCESS_TOKEN"]
|
||||
repo = os.environ["GITHUB_REPOSITORY"]
|
||||
actor = os.environ["GITHUB_ACTOR"]
|
||||
tag = os.environ["TAG"]
|
||||
|
||||
#####################################################################
|
||||
#
|
||||
|
|
@ -111,7 +98,7 @@ else:
|
|||
version_props = ""
|
||||
|
||||
p = subprocess.run(
|
||||
f'"{msbuildPath}" -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform=x64;OutDir={os.getcwd()}\\publish\\;CommitId={commit_id}{version_props} Magpie.sln'
|
||||
f'"{msbuildPath}" -restore -p:RestorePackagesConfig=true;Configuration=Release;Platform={platform};OutDir={os.getcwd()}\\publish\\{platform}\\;CommitId={commit_id}{version_props} Magpie.sln'
|
||||
)
|
||||
if p.returncode != 0:
|
||||
raise Exception("编译失败")
|
||||
|
|
@ -122,7 +109,7 @@ if p.returncode != 0:
|
|||
#
|
||||
#####################################################################
|
||||
|
||||
os.chdir("publish")
|
||||
os.chdir("publish\\" + platform)
|
||||
|
||||
|
||||
# 删除文件,忽略错误
|
||||
|
|
@ -133,10 +120,9 @@ def remove_file(file):
|
|||
pass
|
||||
|
||||
|
||||
for folder in ["Microsoft.UI.Xaml", "Magpie.App"]:
|
||||
shutil.rmtree(folder, ignore_errors=True)
|
||||
shutil.rmtree("Microsoft.UI.Xaml", ignore_errors=True)
|
||||
|
||||
for pattern in ["*.pdb", "*.lib", "*.exp", "*.winmd", "*.xml", "*.xbf", "dummy.*"]:
|
||||
for pattern in ["*.pdb", "*.lib", "*.exp", "*.winmd", "*.xml", "*.xbf"]:
|
||||
for file in glob.glob(pattern):
|
||||
remove_file(file)
|
||||
|
||||
|
|
@ -225,138 +211,3 @@ os.remove("resources.pri.xml")
|
|||
os.remove("priconfig.xml")
|
||||
|
||||
print("已修剪 resources.pri", flush=True)
|
||||
|
||||
#####################################################################
|
||||
#
|
||||
# 发布
|
||||
#
|
||||
#####################################################################
|
||||
|
||||
if majorVersion != None:
|
||||
os.chdir("..")
|
||||
|
||||
subprocess.run("git config user.name " + actor)
|
||||
subprocess.run(f"git config user.email {actor}@users.noreply.github.com")
|
||||
|
||||
subprocess.run(
|
||||
f"git remote set-url origin https://{githubAccessToken}@github.com/{repo}.git"
|
||||
)
|
||||
|
||||
# 打标签
|
||||
if subprocess.run(f"git tag -a {tag} -m {tag}").returncode != 0:
|
||||
raise Exception("打标签失败")
|
||||
|
||||
if subprocess.run("git push origin " + tag).returncode != 0:
|
||||
raise Exception("推送标签失败")
|
||||
|
||||
print("已创建标签 " + tag, flush=True)
|
||||
|
||||
# 打包成 zip
|
||||
pkgName = "Magpie-" + tag + "-x64"
|
||||
shutil.make_archive(pkgName, "zip", "publish")
|
||||
pkgName += ".zip"
|
||||
|
||||
headers = {
|
||||
"Accept": "application/vnd.github+json",
|
||||
"Authorization": "Bearer " + githubAccessToken,
|
||||
"X-GitHub-Api-Version": "2022-11-28",
|
||||
}
|
||||
|
||||
# 获取前一个发布版本来生成默认发行说明
|
||||
prevReleaseTag = None
|
||||
try:
|
||||
if isPrerelease:
|
||||
# 发布预发行版与最新的版本(无论是正式版还是预发行版)对比
|
||||
response = requests.get(
|
||||
f"https://api.github.com/repos/{repo}/releases",
|
||||
json={
|
||||
"per_page": 1
|
||||
},
|
||||
headers=headers
|
||||
)
|
||||
if response.ok:
|
||||
prevReleaseTag = response.json()[0]["tag_name"]
|
||||
else:
|
||||
# 发布正式版则与最新的正式版对比
|
||||
# 由于可以自己选择最新版本,此接口可能不会返回时间上最新发布的版本,不是大问题
|
||||
response = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", headers=headers)
|
||||
if response.ok:
|
||||
prevReleaseTag = response.json()["tag_name"]
|
||||
except:
|
||||
# 忽略错误
|
||||
pass
|
||||
|
||||
# 发布 release
|
||||
if prevReleaseTag == None:
|
||||
body = ""
|
||||
else:
|
||||
# 默认发行说明为比较两个 tag
|
||||
body = f"https://github.com/{repo}/compare/{prevReleaseTag}...{tag}"
|
||||
|
||||
response = requests.post(
|
||||
f"https://api.github.com/repos/{repo}/releases",
|
||||
json={
|
||||
"tag_name": tag,
|
||||
"name": tag,
|
||||
"prerelease": isPrerelease,
|
||||
"body": body,
|
||||
"discussion_category_name": "Announcements",
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
if not response.ok:
|
||||
raise Exception("发布失败")
|
||||
|
||||
upload_url = response.json()["upload_url"]
|
||||
upload_url = upload_url[: upload_url.find("{")] + "?name=" + pkgName
|
||||
|
||||
# 上传资产
|
||||
with open(pkgName, "rb") as f:
|
||||
# 流式上传
|
||||
# https://requests.readthedocs.io/en/latest/user/advanced/#streaming-uploads
|
||||
response = requests.post(
|
||||
upload_url,
|
||||
data=f,
|
||||
headers={**headers, "Content-Type": "application/zip"},
|
||||
)
|
||||
|
||||
if not response.ok:
|
||||
raise Exception("上传失败")
|
||||
|
||||
# 计算哈希
|
||||
f.seek(0, os.SEEK_SET)
|
||||
md5 = hashlib.file_digest(f, hashlib.md5).hexdigest()
|
||||
|
||||
print("已发布 " + tag, flush=True)
|
||||
|
||||
# 丢弃当前修改并更新到最新,防止编译时有新的提交
|
||||
subprocess.run("git checkout -f")
|
||||
subprocess.run("git pull")
|
||||
|
||||
# 更新 version.json
|
||||
# 此步应在发布版本之后,因为程序使用 version.json 检查更新
|
||||
with open("version.json", "w", encoding="utf-8") as f:
|
||||
json.dump(
|
||||
{
|
||||
"version": f"{majorVersion}.{minorVersion}.{patchVersion}",
|
||||
"tag": tag,
|
||||
"binary": {
|
||||
"x64": {
|
||||
"url": f"https://github.com/{repo}/releases/download/{tag}/{pkgName}",
|
||||
"hash": md5,
|
||||
}
|
||||
},
|
||||
},
|
||||
f,
|
||||
indent=4,
|
||||
)
|
||||
|
||||
# 提交对 version.json 的更改
|
||||
if subprocess.run("git add version.json").returncode != 0:
|
||||
raise Exception("git add 失败")
|
||||
|
||||
if subprocess.run('git commit -m "Update version.json"').returncode != 0:
|
||||
raise Exception("git commit 失败")
|
||||
|
||||
if subprocess.run("git push").returncode != 0:
|
||||
raise Exception("git push 失败")
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<PropertyGroup>
|
||||
<DefaultLanguage>en-US</DefaultLanguage>
|
||||
<CppWinRTFastAbi>true</CppWinRTFastAbi>
|
||||
<CppWinRTOptimized>true</CppWinRTOptimized>
|
||||
<CppWinRTRootNamespaceAutoMerge>true</CppWinRTRootNamespaceAutoMerge>
|
||||
<CppWinRTVerbosity>low</CppWinRTVerbosity>
|
||||
|
|
|
|||
|
|
@ -3,14 +3,17 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -3741,6 +3744,7 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 9
|
||||
//!DESC L9, L10
|
||||
//!IN INPUT, tex3, tex4
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -3978,8 +3982,9 @@ const static float3x3 yuv2rgb = {
|
|||
|
||||
void Pass9(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -4277,12 +4282,6 @@ void Pass9(uint2 blockStart, uint3 threadId) {
|
|||
for (uint j = 0; j <= 1; ++j) {
|
||||
uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(destPos)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
uint index = j * 2 + i;
|
||||
float luma = clamp(
|
||||
target1.x * kernelsL10[0 + index] +
|
||||
|
|
@ -4295,7 +4294,7 @@ void Pass9(uint2 blockStart, uint3 threadId) {
|
|||
target2.w * kernelsL10[28 + index], 0.0f, 1.0f);
|
||||
|
||||
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||
WriteToOutput(destPos, mul(yuv2rgb, float3(luma, originUV)));
|
||||
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(luma, originUV)), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_3DGraphics_AA_Upscale_x2_US.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_3D_Upscale_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -175,13 +178,15 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 3
|
||||
//!DESC Conv-4x3x3x4, Depth-to-Space
|
||||
//!IN INPUT, tex2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass3(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -221,24 +226,19 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
result += float4(-3.1127936e-05, 3.3726166e-05, 4.8580805e-05, -9.541029e-06);
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, result.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(result.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_3DGraphics_Upscale_x2_US.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_3D_Upscale_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -176,13 +179,15 @@ void Pass2(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 3
|
||||
//!DESC Conv-4x3x3x4, Depth-to-Space
|
||||
//!IN INPUT, tex2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass3(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -222,23 +227,18 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
result += float4(-0.00016697648, -0.00015957489, 0.00017437353, -0.00019393339);
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, result.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(result.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, result.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(result.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,9 +3,7 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -19,6 +17,11 @@ float intensitySigma;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -26,6 +29,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -52,7 +56,9 @@ float gaussian(float x, float rcpS, float m) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -93,12 +99,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
for (j = 0; j <= 1; ++j) {
|
||||
uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(gxy)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float3 sum = 0;
|
||||
float3 n = 0;
|
||||
|
||||
|
|
@ -118,7 +118,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
}
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, sum / n);
|
||||
OUTPUT[destPos] = float4(sum / n, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,7 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -18,14 +16,19 @@ float intensitySigma;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -77,7 +80,9 @@ float3 getMedian(float3 v[KERNELLEN], float w[KERNELLEN], float n) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -126,9 +131,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
n += histogram_wn[i];
|
||||
}
|
||||
|
||||
WriteToOutput(gxy, getMedian(histogram_v, histogram_wn, n));
|
||||
OUTPUT[gxy] = float4(getMedian(histogram_v, histogram_wn, n), 1);
|
||||
return;
|
||||
}
|
||||
|
||||
WriteToOutput(gxy, getMedian(histogram_v, histogram_w, n));
|
||||
OUTPUT[gxy] = float4(getMedian(histogram_v, histogram_w, n), 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,14 +3,9 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Strength
|
||||
//!DEFAULT 0.1
|
||||
|
|
@ -19,6 +14,14 @@ Texture2D INPUT;
|
|||
//!STEP 0.01
|
||||
float intensitySigma;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -26,6 +29,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -52,7 +56,9 @@ float gaussian(float x, float s, float m) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -84,12 +90,6 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
for (j = 0; j <= 1; ++j) {
|
||||
const uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(gxy)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float3 histogram_v[KERNELLEN];
|
||||
float histogram_l[KERNELLEN];
|
||||
float histogram_w[KERNELLEN];
|
||||
|
|
@ -132,7 +132,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
}
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, maxv);
|
||||
OUTPUT[destPos] = float4(maxv, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_L.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_2
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -38,6 +37,10 @@ Texture2D tex3;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex4;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -602,13 +605,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 5
|
||||
//!DESC Conv-3x3x3x16
|
||||
//!IN INPUT, tex3, tex4
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass5(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -638,45 +643,45 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
|
||||
float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);
|
||||
|
||||
float4 result = mul(max(a1, 0), float4x4(0.012102164, 0.01385959, 0.018815203, 0.0, -0.017435113, -0.04530735, -0.051318135, 0.0, 0.01267727, 0.01400136, 0.017735276, 0.0, 0.012681183, 0.035241637, 0.03990959, 0.0));
|
||||
result += mul(max(b1, 0), float4x4(0.16069227, 0.098007366, 0.076831706, 0.0, 0.081593364, 0.017831434, 0.010174303, 0.0, 0.014732323, 0.02229113, 0.029828338, 0.0, 0.0048171813, 0.051809076, 0.055740006, 0.0));
|
||||
result += mul(max(c1, 0), float4x4(0.0347963, -0.014327445, -0.024176419, 0.0, 0.003463003, -0.050532356, -0.06565927, 0.0, 0.082851514, 0.10950989, 0.12022889, 0.0, -0.038950548, -0.015094648, -0.0119305095, 0.0));
|
||||
result += mul(max(d1, 0), float4x4(-0.11845135, -0.08067485, -0.06981454, 0.0, 0.00058037776, 0.01160575, 0.014900963, 0.0, -0.0374349, -0.052966926, -0.044557698, 0.0, 0.017439643, 0.005496974, -0.0024181441, 0.0));
|
||||
result += mul(max(e1, 0), float4x4(-0.1084345, -0.18271221, -0.18795776, 0.0, 0.110637866, 0.08913364, 0.09161146, 0.0, -0.19889367, -0.17172937, -0.1600661, 0.0, -0.03789556, -0.028977778, -0.029903485, 0.0));
|
||||
result += mul(max(f1, 0), float4x4(0.017774954, -0.048732057, -0.061161697, 0.0, 0.022389695, -0.013317256, -0.019972157, 0.0, 0.051979035, 0.08774837, 0.09633588, 0.0, -0.047462203, -0.033091765, -0.028352588, 0.0));
|
||||
result += mul(max(g1, 0), float4x4(0.022178177, 0.05031684, 0.05802219, 0.0, -0.027539665, -0.020904189, -0.01800042, 0.0, 0.0019531948, 0.00019749763, -0.0013961957, 0.0, 0.024253767, -0.00058503833, 0.0006474611, 0.0));
|
||||
result += mul(max(h1, 0), float4x4(0.06707921, 0.0817431, 0.07561426, 0.0, -0.04157211, -0.006174012, -0.003754037, 0.0, 0.0031168605, 0.02320992, 0.026471246, 0.0, 0.0029530525, -0.004939263, -0.0070194793, 0.0));
|
||||
result += mul(max(i1, 0), float4x4(0.03383418, 0.042321067, 0.04266926, 0.0, -0.043634403, -0.0182769, -0.011314871, 0.0, -0.050008457, -0.003527757, 0.0035165092, 0.0, -0.00016610099, 0.019936454, 0.022199173, 0.0));
|
||||
result += mul(max(a2, 0), float4x4(-0.055203374, -0.03910439, -0.03778927, 0.0, 0.027640847, 0.019469904, 0.0277834, 0.0, -0.026225597, 0.04481541, 0.047454204, 0.0, 0.031545334, 0.019874612, 0.011878432, 0.0));
|
||||
result += mul(max(b2, 0), float4x4(0.016088601, -0.045959134, -0.048793618, 0.0, -0.009834776, 0.0077799167, 0.00873151, 0.0, 0.031265914, 0.09698676, 0.10005417, 0.0, 0.039120086, 0.0005542848, -0.0049420255, 0.0));
|
||||
result += mul(max(c2, 0), float4x4(0.028432969, -0.014792921, -0.026881924, 0.0, -0.00586326, 0.013427183, 0.018215714, 0.0, -0.013559131, 0.017704675, 0.024854776, 0.0, -0.09087544, -0.104627624, -0.0921747, 0.0));
|
||||
result += mul(max(d2, 0), float4x4(-0.022899037, 0.026374351, 0.03145993, 0.0, -0.008008749, -0.0013132087, -0.003957525, 0.0, -0.02490554, 0.0020362549, 0.006453752, 0.0, 0.031494617, 0.049864545, 0.04702567, 0.0));
|
||||
result += mul(max(e2, 0), float4x4(-0.12318068, -0.121377476, -0.11615006, 0.0, -0.1321696, -0.078085914, -0.07868927, 0.0, -0.072339885, 0.0012095685, 0.010923645, 0.0, 0.10844834, 0.10038668, 0.09919817, 0.0));
|
||||
result += mul(max(f2, 0), float4x4(0.058991943, 0.018824834, 0.01659209, 0.0, -0.041878223, 0.013176531, 0.023566704, 0.0, -0.010507848, 0.02042605, 0.028884022, 0.0, -0.1193022, -0.10676289, -0.096668206, 0.0));
|
||||
result += mul(max(g2, 0), float4x4(0.023510003, 0.06057355, 0.052194174, 0.0, 0.02304783, 0.031745855, 0.025863871, 0.0, -0.01060811, -0.043136407, -0.03569961, 0.0, -0.022243036, 0.014206766, 0.0032128936, 0.0));
|
||||
result += mul(max(h2, 0), float4x4(0.025120225, 0.07386707, 0.07916389, 0.0, -0.020202598, 0.010854587, 0.009825397, 0.0, -0.043466344, -0.049230598, -0.038344223, 0.0, 0.006438127, 0.041072655, 0.036958262, 0.0));
|
||||
result += mul(max(i2, 0), float4x4(0.027640026, 0.04239058, 0.055017423, 0.0, -0.002110394, 0.040088017, 0.045239322, 0.0, -0.020238828, -0.01711292, -0.014726791, 0.0, -0.029621653, -0.007380026, -0.002073584, 0.0));
|
||||
result += mul(max(-a1, 0), float4x4(0.008071638, 0.0034274645, -0.0016181463, 0.0, 0.044838928, 0.06936641, 0.072150804, 0.0, 0.0006324625, -0.02223834, -0.021122342, 0.0, 0.043963037, 0.047561962, 0.026419055, 0.0));
|
||||
result += mul(max(-b1, 0), float4x4(-0.06605246, -0.011649812, -0.0022502556, 0.0, -0.09256232, -0.06281528, -0.055003755, 0.0, 0.032296494, -0.011113339, -0.015790787, 0.0, 0.05214882, 0.022887057, 0.013746634, 0.0));
|
||||
result += mul(max(-c1, 0), float4x4(-0.03587372, 0.018986767, 0.03229596, 0.0, 0.008917248, 0.050303612, 0.06147115, 0.0, 0.01872278, -0.011048741, -0.017369485, 0.0, 0.030770298, 0.0063107815, 0.003187433, 0.0));
|
||||
result += mul(max(-d1, 0), float4x4(0.087662674, 0.048391398, 0.042332277, 0.0, 0.0043635606, 0.02438183, 0.020213395, 0.0, -0.023863237, -0.0051179314, -0.0060627074, 0.0, 0.06292237, 0.05821987, 0.051667042, 0.0));
|
||||
result += mul(max(-e1, 0), float4x4(-0.048478693, 0.008368922, 0.016874269, 0.0, -0.19261299, -0.1848583, -0.18258469, 0.0, 0.112302095, 0.061518673, 0.058282077, 0.0, 0.024626324, 0.0058449907, 0.006936535, 0.0));
|
||||
result += mul(max(-f1, 0), float4x4(-0.04468695, 0.0099176075, 0.025094027, 0.0, 0.05447911, 0.08220857, 0.08161316, 0.0, -0.0007933787, -0.03090106, -0.040217776, 0.0, -0.028044306, -0.050590593, -0.05027328, 0.0));
|
||||
result += mul(max(-g1, 0), float4x4(0.029733973, -0.0129855955, -0.019776886, 0.0, 0.01860655, 0.017793713, 0.020113358, 0.0, -0.023667783, -0.0013290358, -0.004159268, 0.0, -0.01960303, -0.012806444, -0.016549494, 0.0));
|
||||
result += mul(max(-h1, 0), float4x4(-0.00952229, -0.007181503, -0.0061082463, 0.0, 0.04292393, 0.01510459, 0.0062862537, 0.0, -0.016540393, -0.023619318, -0.02633423, 0.0, -0.06652295, -0.06933143, -0.063913494, 0.0));
|
||||
result += mul(max(-i1, 0), float4x4(-0.015281855, -0.012470513, -0.008184894, 0.0, 0.045862548, 0.023707546, 0.014719574, 0.0, 0.032412887, -0.0038218168, -0.0065955487, 0.0, -0.027728679, -0.04009727, -0.018856067, 0.0));
|
||||
result += mul(max(-a2, 0), float4x4(0.042844415, 0.00673587, 0.0038338478, 0.0, -0.031152235, -0.06649269, -0.065986395, 0.0, 0.005666899, -0.015819343, -0.012795757, 0.0, -0.0007617308, 0.021531299, 0.026071105, 0.0));
|
||||
result += mul(max(-b2, 0), float4x4(-0.118266046, -0.07211513, -0.058381762, 0.0, 0.02361942, 0.012819485, 0.010511434, 0.0, 0.077196896, 0.003424893, 0.001927401, 0.0, -0.03160996, -0.0034473129, -0.00444674, 0.0));
|
||||
result += mul(max(-c2, 0), float4x4(-0.06548674, -0.018152835, 0.0034779215, 0.0, -0.006173449, 0.008357867, -0.0033986098, 0.0, 0.021622533, -0.03722321, -0.045832597, 0.0, -0.011835129, 0.0109178, 0.010480887, 0.0));
|
||||
result += mul(max(-d2, 0), float4x4(0.041682176, -0.008985459, -0.018538723, 0.0, -0.054624356, -0.09495616, -0.090484254, 0.0, -0.0060466817, -0.017551763, -0.014151624, 0.0, -0.015683241, -0.012590141, -0.014278323, 0.0));
|
||||
result += mul(max(-e2, 0), float4x4(0.073194094, 0.055347454, 0.060976587, 0.0, 0.18175459, 0.13776664, 0.13139476, 0.0, 0.14047755, 0.061971992, 0.056503728, 0.0, 0.0068531767, -0.011873265, -0.016871026, 0.0));
|
||||
result += mul(max(-f2, 0), float4x4(-0.041848205, -0.009582, -0.0076929387, 0.0, 0.044274334, 0.04011985, 0.03085897, 0.0, 0.009403278, -0.03346772, -0.04463548, 0.0, 0.04548978, 0.014613167, 0.0055232802, 0.0));
|
||||
result += mul(max(-g2, 0), float4x4(0.019901669, -0.0011372451, -0.007423424, 0.0, -0.053240675, -0.07105105, -0.07122227, 0.0, -0.01892976, -0.019795185, -0.019204788, 0.0, 0.01228504, -0.005040437, -0.0010069044, 0.0));
|
||||
result += mul(max(-h2, 0), float4x4(0.032843515, 0.014947385, 0.007550199, 0.0, -0.0006476342, -0.020907652, -0.030297596, 0.0, -0.015617971, -0.029182931, -0.038677275, 0.0, 0.037908908, -0.018132487, -0.020226713, 0.0));
|
||||
result += mul(max(-i2, 0), float4x4(0.03232915, 0.02915194, 0.014929652, 0.0, 0.016676396, 0.004807404, -0.0008906752, 0.0, 0.0076904814, 0.00541351, -0.0048240838, 0.0, 0.03459369, -0.012969539, -0.024712864, 0.0));
|
||||
result += float4(-0.0096404655, 0.0022038757, 0.0035988842, 0.0);
|
||||
float3 result = mul(max(a1, 0), float4x3(0.012102164, 0.01385959, 0.018815203, -0.017435113, -0.04530735, -0.051318135, 0.01267727, 0.01400136, 0.017735276, 0.012681183, 0.035241637, 0.03990959));
|
||||
result += mul(max(b1, 0), float4x3(0.16069227, 0.098007366, 0.076831706, 0.081593364, 0.017831434, 0.010174303, 0.014732323, 0.02229113, 0.029828338, 0.0048171813, 0.051809076, 0.055740006));
|
||||
result += mul(max(c1, 0), float4x3(0.0347963, -0.014327445, -0.024176419, 0.003463003, -0.050532356, -0.06565927, 0.082851514, 0.10950989, 0.12022889, -0.038950548, -0.015094648, -0.0119305095));
|
||||
result += mul(max(d1, 0), float4x3(-0.11845135, -0.08067485, -0.06981454, 0.00058037776, 0.01160575, 0.014900963, -0.0374349, -0.052966926, -0.044557698, 0.017439643, 0.005496974, -0.0024181441));
|
||||
result += mul(max(e1, 0), float4x3(-0.1084345, -0.18271221, -0.18795776, 0.110637866, 0.08913364, 0.09161146, -0.19889367, -0.17172937, -0.1600661, -0.03789556, -0.028977778, -0.029903485));
|
||||
result += mul(max(f1, 0), float4x3(0.017774954, -0.048732057, -0.061161697, 0.022389695, -0.013317256, -0.019972157, 0.051979035, 0.08774837, 0.09633588, -0.047462203, -0.033091765, -0.028352588));
|
||||
result += mul(max(g1, 0), float4x3(0.022178177, 0.05031684, 0.05802219, -0.027539665, -0.020904189, -0.01800042, 0.0019531948, 0.00019749763, -0.0013961957, 0.024253767, -0.00058503833, 0.0006474611));
|
||||
result += mul(max(h1, 0), float4x3(0.06707921, 0.0817431, 0.07561426, -0.04157211, -0.006174012, -0.003754037, 0.0031168605, 0.02320992, 0.026471246, 0.0029530525, -0.004939263, -0.0070194793));
|
||||
result += mul(max(i1, 0), float4x3(0.03383418, 0.042321067, 0.04266926, -0.043634403, -0.0182769, -0.011314871, -0.050008457, -0.003527757, 0.0035165092, -0.00016610099, 0.019936454, 0.022199173));
|
||||
result += mul(max(a2, 0), float4x3(-0.055203374, -0.03910439, -0.03778927, 0.027640847, 0.019469904, 0.0277834, -0.026225597, 0.04481541, 0.047454204, 0.031545334, 0.019874612, 0.011878432));
|
||||
result += mul(max(b2, 0), float4x3(0.016088601, -0.045959134, -0.048793618, -0.009834776, 0.0077799167, 0.00873151, 0.031265914, 0.09698676, 0.10005417, 0.039120086, 0.0005542848, -0.0049420255));
|
||||
result += mul(max(c2, 0), float4x3(0.028432969, -0.014792921, -0.026881924, -0.00586326, 0.013427183, 0.018215714, -0.013559131, 0.017704675, 0.024854776, -0.09087544, -0.104627624, -0.0921747));
|
||||
result += mul(max(d2, 0), float4x3(-0.022899037, 0.026374351, 0.03145993, -0.008008749, -0.0013132087, -0.003957525, -0.02490554, 0.0020362549, 0.006453752, 0.031494617, 0.049864545, 0.04702567));
|
||||
result += mul(max(e2, 0), float4x3(-0.12318068, -0.121377476, -0.11615006, -0.1321696, -0.078085914, -0.07868927, -0.072339885, 0.0012095685, 0.010923645, 0.10844834, 0.10038668, 0.09919817));
|
||||
result += mul(max(f2, 0), float4x3(0.058991943, 0.018824834, 0.01659209, -0.041878223, 0.013176531, 0.023566704, -0.010507848, 0.02042605, 0.028884022, -0.1193022, -0.10676289, -0.096668206));
|
||||
result += mul(max(g2, 0), float4x3(0.023510003, 0.06057355, 0.052194174, 0.02304783, 0.031745855, 0.025863871, -0.01060811, -0.043136407, -0.03569961, -0.022243036, 0.014206766, 0.0032128936));
|
||||
result += mul(max(h2, 0), float4x3(0.025120225, 0.07386707, 0.07916389, -0.020202598, 0.010854587, 0.009825397, -0.043466344, -0.049230598, -0.038344223, 0.006438127, 0.041072655, 0.036958262));
|
||||
result += mul(max(i2, 0), float4x3(0.027640026, 0.04239058, 0.055017423, -0.002110394, 0.040088017, 0.045239322, -0.020238828, -0.01711292, -0.014726791, -0.029621653, -0.007380026, -0.002073584));
|
||||
result += mul(max(-a1, 0), float4x3(0.008071638, 0.0034274645, -0.0016181463, 0.044838928, 0.06936641, 0.072150804, 0.0006324625, -0.02223834, -0.021122342, 0.043963037, 0.047561962, 0.026419055));
|
||||
result += mul(max(-b1, 0), float4x3(-0.06605246, -0.011649812, -0.0022502556, -0.09256232, -0.06281528, -0.055003755, 0.032296494, -0.011113339, -0.015790787, 0.05214882, 0.022887057, 0.013746634));
|
||||
result += mul(max(-c1, 0), float4x3(-0.03587372, 0.018986767, 0.03229596, 0.008917248, 0.050303612, 0.06147115, 0.01872278, -0.011048741, -0.017369485, 0.030770298, 0.0063107815, 0.003187433));
|
||||
result += mul(max(-d1, 0), float4x3(0.087662674, 0.048391398, 0.042332277, 0.0043635606, 0.02438183, 0.020213395, -0.023863237, -0.0051179314, -0.0060627074, 0.06292237, 0.05821987, 0.051667042));
|
||||
result += mul(max(-e1, 0), float4x3(-0.048478693, 0.008368922, 0.016874269, -0.19261299, -0.1848583, -0.18258469, 0.112302095, 0.061518673, 0.058282077, 0.024626324, 0.0058449907, 0.006936535));
|
||||
result += mul(max(-f1, 0), float4x3(-0.04468695, 0.0099176075, 0.025094027, 0.05447911, 0.08220857, 0.08161316, -0.0007933787, -0.03090106, -0.040217776, -0.028044306, -0.050590593, -0.05027328));
|
||||
result += mul(max(-g1, 0), float4x3(0.029733973, -0.0129855955, -0.019776886, 0.01860655, 0.017793713, 0.020113358, -0.023667783, -0.0013290358, -0.004159268, -0.01960303, -0.012806444, -0.016549494));
|
||||
result += mul(max(-h1, 0), float4x3(-0.00952229, -0.007181503, -0.0061082463, 0.04292393, 0.01510459, 0.0062862537, -0.016540393, -0.023619318, -0.02633423, -0.06652295, -0.06933143, -0.063913494));
|
||||
result += mul(max(-i1, 0), float4x3(-0.015281855, -0.012470513, -0.008184894, 0.045862548, 0.023707546, 0.014719574, 0.032412887, -0.0038218168, -0.0065955487, -0.027728679, -0.04009727, -0.018856067));
|
||||
result += mul(max(-a2, 0), float4x3(0.042844415, 0.00673587, 0.0038338478, -0.031152235, -0.06649269, -0.065986395, 0.005666899, -0.015819343, -0.012795757, -0.0007617308, 0.021531299, 0.026071105));
|
||||
result += mul(max(-b2, 0), float4x3(-0.118266046, -0.07211513, -0.058381762, 0.02361942, 0.012819485, 0.010511434, 0.077196896, 0.003424893, 0.001927401, -0.03160996, -0.0034473129, -0.00444674));
|
||||
result += mul(max(-c2, 0), float4x3(-0.06548674, -0.018152835, 0.0034779215, -0.006173449, 0.008357867, -0.0033986098, 0.021622533, -0.03722321, -0.045832597, -0.011835129, 0.0109178, 0.010480887));
|
||||
result += mul(max(-d2, 0), float4x3(0.041682176, -0.008985459, -0.018538723, -0.054624356, -0.09495616, -0.090484254, -0.0060466817, -0.017551763, -0.014151624, -0.015683241, -0.012590141, -0.014278323));
|
||||
result += mul(max(-e2, 0), float4x3(0.073194094, 0.055347454, 0.060976587, 0.18175459, 0.13776664, 0.13139476, 0.14047755, 0.061971992, 0.056503728, 0.0068531767, -0.011873265, -0.016871026));
|
||||
result += mul(max(-f2, 0), float4x3(-0.041848205, -0.009582, -0.0076929387, 0.044274334, 0.04011985, 0.03085897, 0.009403278, -0.03346772, -0.04463548, 0.04548978, 0.014613167, 0.0055232802));
|
||||
result += mul(max(-g2, 0), float4x3(0.019901669, -0.0011372451, -0.007423424, -0.053240675, -0.07105105, -0.07122227, -0.01892976, -0.019795185, -0.019204788, 0.01228504, -0.005040437, -0.0010069044));
|
||||
result += mul(max(-h2, 0), float4x3(0.032843515, 0.014947385, 0.007550199, -0.0006476342, -0.020907652, -0.030297596, -0.015617971, -0.029182931, -0.038677275, 0.037908908, -0.018132487, -0.020226713));
|
||||
result += mul(max(-i2, 0), float4x3(0.03232915, 0.02915194, 0.014929652, 0.016676396, 0.004807404, -0.0008906752, 0.0076904814, 0.00541351, -0.0048240838, 0.03459369, -0.012969539, -0.024712864));
|
||||
result += float3(-0.0096404655, 0.0022038757, 0.0035988842);
|
||||
|
||||
result += INPUT.SampleLevel(sam, pos, 0);
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
|
||||
WriteToOutput(gxy, result.rgb);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_M.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -51,6 +50,10 @@ Texture2D tex5;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex6;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -495,15 +498,18 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 7
|
||||
//!DESC Conv-4x3x3x8, Conv-3x1x1x56
|
||||
//!IN INPUT, tex1, tex2, tex3, tex4, tex5, tex6
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass7(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float2 inputPt = GetInputPt();
|
||||
float2 pos = (gxy + 0.5f) * inputPt;
|
||||
|
||||
|
|
@ -564,5 +570,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
result += mul(max(-src7, 0), float4x3(0.10676299, 0.118409514, 0.10618478, -0.05880252, -0.06488367, -0.06432695, 0.019221924, 0.017602798, 0.017413978, -0.07512528, -0.080483615, -0.066218294));
|
||||
result += float3(-0.010478934, -0.008364784, -0.010246552);
|
||||
|
||||
WriteToOutput(gxy, result + origin);
|
||||
OUTPUT[gxy] = float4(result + origin, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_S.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -246,13 +249,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-3x3x3x8
|
||||
//!IN INPUT, tex1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -294,5 +299,5 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
|
||||
WriteToOutput(gxy, result);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_L.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_Soft_2
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -38,6 +37,10 @@ Texture2D tex3;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex4;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -602,13 +605,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 5
|
||||
//!DESC Conv-3x3x3x16
|
||||
//!IN INPUT, tex3, tex4
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass5(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -638,45 +643,45 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
|
||||
float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);
|
||||
|
||||
float4 result = mul(max(a1, 0), float4x4(-0.01858372, 0.017144108, 0.02794388, 0.0, 0.0129101565, -0.0073674284, -0.011766938, 0.0, 0.01970984, 0.01209068, 0.009530311, 0.0, -0.009190449, -0.006996753, -0.0038750458, 0.0));
|
||||
result += mul(max(b1, 0), float4x4(0.15856947, 0.10162126, 0.08489005, 0.0, 0.038381726, -0.017771017, -0.03226132, 0.0, -0.011787879, -0.0152445, -0.007564454, 0.0, 0.055921376, 0.08389841, 0.08452836, 0.0));
|
||||
result += mul(max(c1, 0), float4x4(0.026705442, -0.0070655374, -0.018199183, 0.0, 0.016254421, -0.025398912, -0.03461042, 0.0, 0.03950644, 0.06586101, 0.0707467, 0.0, -0.03793455, -0.04957139, -0.04777402, 0.0));
|
||||
result += mul(max(d1, 0), float4x4(-0.115341224, -0.04463122, -0.016549354, 0.0, -0.059433736, -0.04303295, -0.042805545, 0.0, 0.010830498, -0.011057443, -0.0141014, 0.0, 0.067396216, 0.06553637, 0.06705378, 0.0));
|
||||
result += mul(max(e1, 0), float4x4(-0.12767975, -0.19935511, -0.20109995, 0.0, 0.11554901, 0.11426503, 0.11161185, 0.0, -0.22092125, -0.22041021, -0.2142712, 0.0, -0.06326996, -0.061314825, -0.059039716, 0.0));
|
||||
result += mul(max(f1, 0), float4x4(0.007717391, -0.046238754, -0.056983955, 0.0, 0.021419598, 0.0036924274, -0.00033630748, 0.0, 0.053556852, 0.0824714, 0.08295022, 0.0, -0.09881205, -0.043157153, -0.040801782, 0.0));
|
||||
result += mul(max(g1, 0), float4x4(0.0052828738, 0.049702674, 0.056108, 0.0, 0.009478552, 0.010345037, 0.0094180945, 0.0, -0.010412882, 0.0006965096, 0.0021917222, 0.0, -0.010701383, -0.023212843, -0.024252625, 0.0));
|
||||
result += mul(max(h1, 0), float4x4(0.07542127, 0.0739301, 0.06642962, 0.0, -0.08054489, -0.037553925, -0.026762033, 0.0, 0.09727509, 0.102272816, 0.097533874, 0.0, 0.01325714, -0.004582272, -0.006647532, 0.0));
|
||||
result += mul(max(i1, 0), float4x4(0.03005975, 0.017012767, 0.007840201, 0.0, -0.028650383, -0.0019064787, 0.01083078, 0.0, -0.071352504, -0.019919744, -0.008299795, 0.0, 0.023253804, 0.042413715, 0.04681489, 0.0));
|
||||
result += mul(max(a2, 0), float4x4(-0.052201163, -0.021727808, -0.020888992, 0.0, 0.008365179, -0.016546093, -0.0111018475, 0.0, -0.06236095, -0.019278256, -0.021443967, 0.0, 0.0029381379, -0.0033039588, -0.006425339, 0.0));
|
||||
result += mul(max(b2, 0), float4x4(0.02397296, -0.041659098, -0.050882675, 0.0, -0.013487, 0.0067506596, 0.005435185, 0.0, 0.066447854, 0.13331215, 0.13754861, 0.0, 0.028300207, -0.0048033795, -0.010058485, 0.0));
|
||||
result += mul(max(c2, 0), float4x4(0.08140248, 0.018564016, 0.0036607496, 0.0, -0.0112075955, 0.0022339798, 0.0045722146, 0.0, -0.045716517, -0.0076076477, -0.0016939791, 0.0, -0.030486025, -0.07539711, -0.07185734, 0.0));
|
||||
result += mul(max(d2, 0), float4x4(-0.0155724995, 0.048904862, 0.059412133, 0.0, -0.013894624, -0.0061430936, -0.011662488, 0.0, -0.0052947477, -0.0176474, -0.018611705, 0.0, 0.022075793, 0.031703226, 0.026735537, 0.0));
|
||||
result += mul(max(e2, 0), float4x4(-0.18287502, -0.18703277, -0.18331653, 0.0, -0.08616293, -0.011741755, -0.009296464, 0.0, -0.054274965, 0.016794622, 0.022522328, 0.0, 0.06965258, 0.08260611, 0.08285337, 0.0));
|
||||
result += mul(max(f2, 0), float4x4(0.08107809, 0.0336241, 0.025449684, 0.0, -0.031931, 0.01179566, 0.019694995, 0.0, 0.025930194, 0.042288166, 0.04673656, 0.0, -0.14357394, -0.11003491, -0.094090074, 0.0));
|
||||
result += mul(max(g2, 0), float4x4(0.007188181, 0.050626095, 0.050705966, 0.0, -0.008030409, -0.018670242, -0.019766346, 0.0, 0.014874803, -0.03657919, -0.034044486, 0.0, -0.011178416, -0.004358302, -0.013611815, 0.0));
|
||||
result += mul(max(h2, 0), float4x4(0.07987872, 0.11399873, 0.12089382, 0.0, -0.01514355, 0.0068139364, 0.010206274, 0.0, -0.0005701044, -0.011158322, 0.006484812, 0.0, 0.002018227, 0.043359682, 0.042987905, 0.0));
|
||||
result += mul(max(i2, 0), float4x4(0.0017806455, -0.0015697709, -0.0018252691, 0.0, 0.0058658062, 0.021681193, 0.028615465, 0.0, -0.054827355, -0.04541651, -0.027485048, 0.0, -0.017649114, 0.017717479, 0.027309911, 0.0));
|
||||
result += mul(max(-a1, 0), float4x4(0.02555098, -0.0028983613, -0.005134733, 0.0, -0.0029332284, 0.015552135, 0.022189403, 0.0, -0.019786593, -0.0031676649, -0.0014604586, 0.0, 0.06648065, 0.0672302, 0.04586375, 0.0));
|
||||
result += mul(max(-b1, 0), float4x4(-0.06674696, 0.002328631, 0.014039355, 0.0, -0.03636718, 0.014560653, 0.028076636, 0.0, 0.042305287, 0.015249338, 0.0136925895, 0.0, 0.033586804, 0.00701501, -0.011588751, 0.0));
|
||||
result += mul(max(-c1, 0), float4x4(-0.039022632, 0.015240631, 0.02699061, 0.0, -0.02614261, 0.0051843156, 0.012590042, 0.0, 0.015304643, -0.022641543, -0.030434309, 0.0, 0.016862666, 0.020819275, 0.022333218, 0.0));
|
||||
result += mul(max(-d1, 0), float4x4(0.08056982, 0.026592938, 0.009744146, 0.0, 0.08762212, 0.10150359, 0.09662005, 0.0, -0.044551965, -0.016349116, -0.014629014, 0.0, -0.014341297, -0.030914815, -0.038747486, 0.0));
|
||||
result += mul(max(-e1, 0), float4x4(-0.048734166, 0.019775594, 0.03124684, 0.0, -0.2345022, -0.23639877, -0.22958128, 0.0, 0.12412277, 0.10245112, 0.10389806, 0.0, -0.0030797734, -0.01989389, -0.02020691, 0.0));
|
||||
result += mul(max(-f1, 0), float4x4(-0.0133485105, 0.029644802, 0.041630358, 0.0, 0.041081797, 0.059993293, 0.060033485, 0.0, -0.02155099, -0.035306025, -0.03838472, 0.0, 0.017466968, -0.01866363, -0.004764589, 0.0));
|
||||
result += mul(max(-g1, 0), float4x4(0.0030783121, -0.04064586, -0.04504904, 0.0, -0.023528632, -0.029308239, -0.022441925, 0.0, 0.020095564, 0.018979732, 0.015117934, 0.0, 0.008429918, 0.021180628, 0.020137152, 0.0));
|
||||
result += mul(max(-h1, 0), float4x4(0.0012200709, 0.013313984, 0.014122978, 0.0, 0.08750284, 0.038747437, 0.027102578, 0.0, -0.09627132, -0.09706183, -0.09405641, 0.0, -0.05180081, -0.03555434, -0.021694236, 0.0));
|
||||
result += mul(max(-i1, 0), float4x4(-0.022396728, -0.018316073, -0.01250564, 0.0, 0.045423746, 0.025315331, 0.010639915, 0.0, 0.05618814, 0.022210265, 0.014195103, 0.0, -0.014828652, -0.010245087, 0.0020570823, 0.0));
|
||||
result += mul(max(-a2, 0), float4x4(0.046651457, 0.001333767, -0.003572458, 0.0, -0.0077845114, -0.012861641, -0.015116351, 0.0, 0.01338984, 0.029198132, 0.026183384, 0.0, 0.0014878022, 0.020025207, 0.024829973, 0.0));
|
||||
result += mul(max(-b2, 0), float4x4(-0.09506711, -0.06541528, -0.051106647, 0.0, 0.02552611, 0.01181497, 0.0020236392, 0.0, 0.03234602, -0.03153924, -0.035502207, 0.0, -0.034516744, 0.00018784113, 0.0085376045, 0.0));
|
||||
result += mul(max(-c2, 0), float4x4(-0.05945615, -0.0046793907, 0.011128929, 0.0, -0.0061961384, -0.0040663416, -0.010319631, 0.0, 0.044197917, -0.033448357, -0.04109943, 0.0, -0.04109929, 0.006773195, 0.016976412, 0.0));
|
||||
result += mul(max(-d2, 0), float4x4(0.02855516, -0.033051047, -0.04864978, 0.0, -0.06393814, -0.082921155, -0.0730681, 0.0, -0.058905125, -0.038639963, -0.027698845, 0.0, -0.013616608, -0.007876684, -0.006182652, 0.0));
|
||||
result += mul(max(-e2, 0), float4x4(0.15423118, 0.14667909, 0.14534634, 0.0, 0.1485341, 0.096721016, 0.0820024, 0.0, 0.1263968, 0.088775866, 0.083860956, 0.0, 0.04213644, 0.020989005, 0.010447147, 0.0));
|
||||
result += mul(max(-f2, 0), float4x4(-0.068275765, -0.018390667, -0.011452603, 0.0, 0.03738383, 0.019398715, 0.005998161, 0.0, -0.0011161854, -0.039955888, -0.04444185, 0.0, 0.052985556, 0.017621813, 0.009551621, 0.0));
|
||||
result += mul(max(-g2, 0), float4x4(0.01387326, -0.0033411914, -0.009420935, 0.0, -0.034494568, -0.019219222, -0.009562797, 0.0, 0.0074023325, 0.022065453, 0.027121471, 0.0, 0.00019609048, -0.0042242454, 2.0403608e-05, 0.0));
|
||||
result += mul(max(-h2, 0), float4x4(-0.015793918, -0.024342488, -0.037188973, 0.0, 0.004534637, -0.025236975, -0.028567247, 0.0, -0.055682972, -0.054670315, -0.06584981, 0.0, 0.043045517, -0.0075941198, -0.014196169, 0.0));
|
||||
result += mul(max(-i2, 0), float4x4(0.0132598495, 0.01775289, 0.017206183, 0.0, 0.010604703, -0.007352816, -0.017301153, 0.0, 0.030967329, 0.027615465, 0.0145311365, 0.0, 0.008636854, -0.033379406, -0.042725433, 0.0));
|
||||
result += float4(-0.0056639817, -0.0017339308, -0.0011913306, 0.0);
|
||||
float3 result = mul(max(a1, 0), float4x3(-0.01858372, 0.017144108, 0.02794388, 0.0129101565, -0.0073674284, -0.011766938, 0.01970984, 0.01209068, 0.009530311, -0.009190449, -0.006996753, -0.0038750458));
|
||||
result += mul(max(b1, 0), float4x3(0.15856947, 0.10162126, 0.08489005, 0.038381726, -0.017771017, -0.03226132, -0.011787879, -0.0152445, -0.007564454, 0.055921376, 0.08389841, 0.08452836));
|
||||
result += mul(max(c1, 0), float4x3(0.026705442, -0.0070655374, -0.018199183, 0.016254421, -0.025398912, -0.03461042, 0.03950644, 0.06586101, 0.0707467, -0.03793455, -0.04957139, -0.04777402));
|
||||
result += mul(max(d1, 0), float4x3(-0.115341224, -0.04463122, -0.016549354, -0.059433736, -0.04303295, -0.042805545, 0.010830498, -0.011057443, -0.0141014, 0.067396216, 0.06553637, 0.06705378));
|
||||
result += mul(max(e1, 0), float4x3(-0.12767975, -0.19935511, -0.20109995, 0.11554901, 0.11426503, 0.11161185, -0.22092125, -0.22041021, -0.2142712, -0.06326996, -0.061314825, -0.059039716));
|
||||
result += mul(max(f1, 0), float4x3(0.007717391, -0.046238754, -0.056983955, 0.021419598, 0.0036924274, -0.00033630748, 0.053556852, 0.0824714, 0.08295022, -0.09881205, -0.043157153, -0.040801782));
|
||||
result += mul(max(g1, 0), float4x3(0.0052828738, 0.049702674, 0.056108, 0.009478552, 0.010345037, 0.0094180945, -0.010412882, 0.0006965096, 0.0021917222, -0.010701383, -0.023212843, -0.024252625));
|
||||
result += mul(max(h1, 0), float4x3(0.07542127, 0.0739301, 0.06642962, -0.08054489, -0.037553925, -0.026762033, 0.09727509, 0.102272816, 0.097533874, 0.01325714, -0.004582272, -0.006647532));
|
||||
result += mul(max(i1, 0), float4x3(0.03005975, 0.017012767, 0.007840201, -0.028650383, -0.0019064787, 0.01083078, -0.071352504, -0.019919744, -0.008299795, 0.023253804, 0.042413715, 0.04681489));
|
||||
result += mul(max(a2, 0), float4x3(-0.052201163, -0.021727808, -0.020888992, 0.008365179, -0.016546093, -0.0111018475, -0.06236095, -0.019278256, -0.021443967, 0.0029381379, -0.0033039588, -0.006425339));
|
||||
result += mul(max(b2, 0), float4x3(0.02397296, -0.041659098, -0.050882675, -0.013487, 0.0067506596, 0.005435185, 0.066447854, 0.13331215, 0.13754861, 0.028300207, -0.0048033795, -0.010058485));
|
||||
result += mul(max(c2, 0), float4x3(0.08140248, 0.018564016, 0.0036607496, -0.0112075955, 0.0022339798, 0.0045722146, -0.045716517, -0.0076076477, -0.0016939791, -0.030486025, -0.07539711, -0.07185734));
|
||||
result += mul(max(d2, 0), float4x3(-0.0155724995, 0.048904862, 0.059412133, -0.013894624, -0.0061430936, -0.011662488, -0.0052947477, -0.0176474, -0.018611705, 0.022075793, 0.031703226, 0.026735537));
|
||||
result += mul(max(e2, 0), float4x3(-0.18287502, -0.18703277, -0.18331653, -0.08616293, -0.011741755, -0.009296464, -0.054274965, 0.016794622, 0.022522328, 0.06965258, 0.08260611, 0.08285337));
|
||||
result += mul(max(f2, 0), float4x3(0.08107809, 0.0336241, 0.025449684, -0.031931, 0.01179566, 0.019694995, 0.025930194, 0.042288166, 0.04673656, -0.14357394, -0.11003491, -0.094090074));
|
||||
result += mul(max(g2, 0), float4x3(0.007188181, 0.050626095, 0.050705966, -0.008030409, -0.018670242, -0.019766346, 0.014874803, -0.03657919, -0.034044486, -0.011178416, -0.004358302, -0.013611815));
|
||||
result += mul(max(h2, 0), float4x3(0.07987872, 0.11399873, 0.12089382, -0.01514355, 0.0068139364, 0.010206274, -0.0005701044, -0.011158322, 0.006484812, 0.002018227, 0.043359682, 0.042987905));
|
||||
result += mul(max(i2, 0), float4x3(0.0017806455, -0.0015697709, -0.0018252691, 0.0058658062, 0.021681193, 0.028615465, -0.054827355, -0.04541651, -0.027485048, -0.017649114, 0.017717479, 0.027309911));
|
||||
result += mul(max(-a1, 0), float4x3(0.02555098, -0.0028983613, -0.005134733, -0.0029332284, 0.015552135, 0.022189403, -0.019786593, -0.0031676649, -0.0014604586, 0.06648065, 0.0672302, 0.04586375));
|
||||
result += mul(max(-b1, 0), float4x3(-0.06674696, 0.002328631, 0.014039355, -0.03636718, 0.014560653, 0.028076636, 0.042305287, 0.015249338, 0.0136925895, 0.033586804, 0.00701501, -0.011588751));
|
||||
result += mul(max(-c1, 0), float4x3(-0.039022632, 0.015240631, 0.02699061, -0.02614261, 0.0051843156, 0.012590042, 0.015304643, -0.022641543, -0.030434309, 0.016862666, 0.020819275, 0.022333218));
|
||||
result += mul(max(-d1, 0), float4x3(0.08056982, 0.026592938, 0.009744146, 0.08762212, 0.10150359, 0.09662005, -0.044551965, -0.016349116, -0.014629014, -0.014341297, -0.030914815, -0.038747486));
|
||||
result += mul(max(-e1, 0), float4x3(-0.048734166, 0.019775594, 0.03124684, -0.2345022, -0.23639877, -0.22958128, 0.12412277, 0.10245112, 0.10389806, -0.0030797734, -0.01989389, -0.02020691));
|
||||
result += mul(max(-f1, 0), float4x3(-0.0133485105, 0.029644802, 0.041630358, 0.041081797, 0.059993293, 0.060033485, -0.02155099, -0.035306025, -0.03838472, 0.017466968, -0.01866363, -0.004764589));
|
||||
result += mul(max(-g1, 0), float4x3(0.0030783121, -0.04064586, -0.04504904, -0.023528632, -0.029308239, -0.022441925, 0.020095564, 0.018979732, 0.015117934, 0.008429918, 0.021180628, 0.020137152));
|
||||
result += mul(max(-h1, 0), float4x3(0.0012200709, 0.013313984, 0.014122978, 0.08750284, 0.038747437, 0.027102578, -0.09627132, -0.09706183, -0.09405641, -0.05180081, -0.03555434, -0.021694236));
|
||||
result += mul(max(-i1, 0), float4x3(-0.022396728, -0.018316073, -0.01250564, 0.045423746, 0.025315331, 0.010639915, 0.05618814, 0.022210265, 0.014195103, -0.014828652, -0.010245087, 0.0020570823));
|
||||
result += mul(max(-a2, 0), float4x3(0.046651457, 0.001333767, -0.003572458, -0.0077845114, -0.012861641, -0.015116351, 0.01338984, 0.029198132, 0.026183384, 0.0014878022, 0.020025207, 0.024829973));
|
||||
result += mul(max(-b2, 0), float4x3(-0.09506711, -0.06541528, -0.051106647, 0.02552611, 0.01181497, 0.0020236392, 0.03234602, -0.03153924, -0.035502207, -0.034516744, 0.00018784113, 0.0085376045));
|
||||
result += mul(max(-c2, 0), float4x3(-0.05945615, -0.0046793907, 0.011128929, -0.0061961384, -0.0040663416, -0.010319631, 0.044197917, -0.033448357, -0.04109943, -0.04109929, 0.006773195, 0.016976412));
|
||||
result += mul(max(-d2, 0), float4x3(0.02855516, -0.033051047, -0.04864978, -0.06393814, -0.082921155, -0.0730681, -0.058905125, -0.038639963, -0.027698845, -0.013616608, -0.007876684, -0.006182652));
|
||||
result += mul(max(-e2, 0), float4x3(0.15423118, 0.14667909, 0.14534634, 0.1485341, 0.096721016, 0.0820024, 0.1263968, 0.088775866, 0.083860956, 0.04213644, 0.020989005, 0.010447147));
|
||||
result += mul(max(-f2, 0), float4x3(-0.068275765, -0.018390667, -0.011452603, 0.03738383, 0.019398715, 0.005998161, -0.0011161854, -0.039955888, -0.04444185, 0.052985556, 0.017621813, 0.009551621));
|
||||
result += mul(max(-g2, 0), float4x3(0.01387326, -0.0033411914, -0.009420935, -0.034494568, -0.019219222, -0.009562797, 0.0074023325, 0.022065453, 0.027121471, 0.00019609048, -0.0042242454, 2.0403608e-05));
|
||||
result += mul(max(-h2, 0), float4x3(-0.015793918, -0.024342488, -0.037188973, 0.004534637, -0.025236975, -0.028567247, -0.055682972, -0.054670315, -0.06584981, 0.043045517, -0.0075941198, -0.014196169));
|
||||
result += mul(max(-i2, 0), float4x3(0.0132598495, 0.01775289, 0.017206183, 0.010604703, -0.007352816, -0.017301153, 0.030967329, 0.027615465, 0.0145311365, 0.008636854, -0.033379406, -0.042725433));
|
||||
result += float3(-0.0056639817, -0.0017339308, -0.0011913306);
|
||||
|
||||
result += INPUT.SampleLevel(sam, pos, 0);
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
|
||||
WriteToOutput(gxy, result.rgb);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_M.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_Soft_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -51,6 +50,10 @@ Texture2D tex5;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex6;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -495,15 +498,18 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 7
|
||||
//!DESC Conv-4x3x3x8, Conv-3x1x1x56
|
||||
//!IN INPUT, tex1, tex2, tex3, tex4, tex5, tex6
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass7(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float2 inputPt = GetInputPt();
|
||||
float2 pos = (gxy + 0.5f) * inputPt;
|
||||
|
||||
|
|
@ -564,5 +570,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
result += mul(max(-src7, 0), float4x3(0.09681486, 0.113604136, 0.10416855, -0.08199983, -0.09013433, -0.08562243, 0.041304465, 0.048315883, 0.042945288, -0.09863276, -0.117853515, -0.09870226));
|
||||
result += float3(-0.0039074384, -0.0085585555, -0.0132283475);
|
||||
|
||||
WriteToOutput(gxy, result + origin);
|
||||
OUTPUT[gxy] = float4(result + origin, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_S.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_Soft_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -246,13 +249,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-3x3x3x8
|
||||
//!IN INPUT, tex1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -294,5 +299,5 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
|
||||
WriteToOutput(gxy, result);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_Soft_UL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_Soft_4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -63,6 +62,11 @@ Texture2D tex7;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex8;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1879,13 +1883,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x3x3x24, Conv-3x1x1x120
|
||||
//!IN INPUT, tex1, tex2, tex3, tex7
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -2169,5 +2175,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
result += float3(-0.0036656514, 0.006677459, 0.007698717);
|
||||
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
WriteToOutput(gxy, result.rgb);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_Soft_VL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_Soft_3
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -51,6 +50,10 @@ Texture2D tex5;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex6;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1125,13 +1128,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x3x3x16, Conv-3x1x1x112
|
||||
//!IN INPUT, tex1, tex2, tex5
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1289,5 +1294,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
result += float3(0.018580848, -0.022256816, -0.0266178);
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
|
||||
WriteToOutput(gxy, result);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// Ported from https://github.com/bloc97/Anime4K/blob/4ba94b179a144200cb6b3052e690fe2ca5c6914c/glsl/Restore/Anime4K_Restore_CNN_UL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -63,6 +62,11 @@ Texture2D tex7;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex8;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1879,13 +1883,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x3x3x24, Conv-3x1x1x120
|
||||
//!IN INPUT, tex1, tex2, tex3, tex7
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -2169,5 +2175,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
result += float3(-0.0071146404, 0.005606682, 0.010180816);
|
||||
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
WriteToOutput(gxy, result.rgb);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,18 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_VL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Restore_3
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -51,6 +50,10 @@ Texture2D tex5;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex6;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -1132,13 +1135,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x3x3x16, Conv-3x1x1x112
|
||||
//!IN INPUT, tex1, tex2, tex5
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 inputSize = GetInputSize();
|
||||
if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1296,5 +1301,5 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
result += float3(0.047567394, -0.02504617, -0.028163986);
|
||||
|
||||
result += INPUT.SampleLevel(sam, pos, 0).rgb;
|
||||
WriteToOutput(gxy, result);
|
||||
OUTPUT[gxy] = float4(result, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,7 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Experimental-Effects/Anime4K_Thin_HQ.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -30,6 +28,11 @@ int iterations;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -280,13 +283,15 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 5
|
||||
//!DESC Warp
|
||||
//!IN tex1, INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass5(uint2 blockStart, uint3 threadId) {
|
||||
const uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
const uint2 inputSize = GetInputSize();
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -299,12 +304,6 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
for (uint j = 0; j <= 1; ++j) {
|
||||
const uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(destPos)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float2 pos = (destPos + 0.5f) * inputPt;
|
||||
|
||||
for (int i = 0; i < iterations; ++i) {
|
||||
|
|
@ -313,7 +312,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
pos -= dd;
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[destPos] = INPUT.SampleLevel(sam1, pos, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_L.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_Denoise_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -43,6 +38,14 @@ Texture2D tex3;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex4;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -446,12 +449,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-4x3x3x16, Depth-to-Space
|
||||
//!IN INPUT, tex1, tex2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -638,23 +644,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
float2 outputPt = GetOutputPt();
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_S.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_Denoise_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -238,6 +241,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-4x3x3x8, Depth-to-Space
|
||||
//!IN INPUT, tex1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -282,7 +286,8 @@ float4 A4KS4(float2 pos) {
|
|||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -291,25 +296,19 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
float2 pos = ((gxy >> 1) + 0.5f) * inputPt;
|
||||
float4 c = A4KS4(pos);
|
||||
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, c.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(c.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(c.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(c.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(c.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_UL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_Denoise_3
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -145,6 +140,14 @@ Texture2D conv2d_6_tf1;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D conv2d_6_tf2;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1929,12 +1932,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x1x1x120, Depth-to-Space
|
||||
//!IN INPUT, conv2d_2_tf, conv2d_2_tf1, conv2d_2_tf2, conv2d_3_tf, conv2d_3_tf1, conv2d_3_tf2, conv2d_4_tf, conv2d_4_tf1, conv2d_4_tf2, conv2d_5_tf, conv2d_5_tf1, conv2d_5_tf2, conv2d_6_tf, conv2d_6_tf1, conv2d_6_tf2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -2086,25 +2092,19 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
target3 += float4(0.00428531, -0.011541925, 0.00898425, -0.01374321);
|
||||
|
||||
float2 outputPt = GetOutputPt();
|
||||
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_VL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_Denoise_2
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -103,6 +98,15 @@ Texture2D conv2d_6_tf;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D conv2d_6_tf1;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1143,12 +1147,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x1x1x112, Depth-to-Space
|
||||
//!IN INPUT, conv2d_tf, conv2d_tf1, conv2d_1_tf, conv2d_1_tf1, conv2d_2_tf, conv2d_2_tf1, conv2d_3_tf, conv2d_3_tf1, conv2d_4_tf, conv2d_4_tf1, conv2d_5_tf, conv2d_5_tf1, conv2d_6_tf, conv2d_6_tf1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1293,23 +1300,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
float2 outputPt = GetOutputPt();
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
1324
src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl
Normal file
1324
src/Effects/Anime4K/Anime4K_Upscale_GAN_x2_M.hlsl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,15 +1,18 @@
|
|||
// Anime4K_Upscale_GAN_x2_S
|
||||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_GAN_x2_S.glsl
|
||||
// 移植自 https://github.com/bloc97/Anime4K/blob/8e39551ce96ed172605c89b7dd8be855b5502cc9/glsl/Upscale/Anime4K_Upscale_GAN_x2_S.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_GAN_x2_1
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -696,12 +699,14 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 7
|
||||
//!DESC Conv-3x3x3x16
|
||||
//!IN tex6, tex8, INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass7(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
uint2 outputSize = GetOutputSize();
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -810,5 +815,5 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
result += mul(ni2, float4x3(0.068098865, 0.07742245, 0.04117883, -0.07239023, -0.0048315763, -0.0029638975, -0.053049978, 0.121163346, 0.048760712, -0.033619802, -0.010043663, -0.012648383));
|
||||
result += float3(0.00016753975, -0.00019302216, -0.0001663917);
|
||||
|
||||
WriteToOutput(gxy, result + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(result + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
1932
src/Effects/Anime4K/Anime4K_Upscale_GAN_x3_L.hlsl
Normal file
1932
src/Effects/Anime4K/Anime4K_Upscale_GAN_x3_L.hlsl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_L.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -43,6 +38,14 @@ Texture2D tex3;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D tex4;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
|
|
@ -446,12 +449,15 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-4x3x3x16, Depth-to-Space
|
||||
//!IN INPUT, tex1, tex2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -638,23 +644,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
float2 outputPt = GetOutputPt();
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.y += 1u;
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,18 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_S.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -238,6 +241,7 @@ void Pass3(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 4
|
||||
//!DESC Conv-4x3x3x8, Depth-to-Space
|
||||
//!IN INPUT, tex1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -282,7 +286,8 @@ float4 A4KS4(float2 pos) {
|
|||
void Pass4(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -293,23 +298,17 @@ void Pass4(uint2 blockStart, uint3 threadId) {
|
|||
float4 c = A4KS4(pos);
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, c.x + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
|
||||
gxy.x += 1u;
|
||||
OUTPUT[gxy] = float4(c.x + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.y + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(c.y + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.y += 1u;
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.w + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.x -= 1u;
|
||||
OUTPUT[gxy] = float4(c.w + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, c.z + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(c.z + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale/Anime4K_Upscale_CNN_x2_UL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_3
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -145,6 +140,15 @@ Texture2D conv2d_6_tf1;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D conv2d_6_tf2;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1929,12 +1933,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x1x1x120, Depth-to-Space
|
||||
//!IN INPUT, conv2d_2_tf, conv2d_2_tf1, conv2d_2_tf2, conv2d_3_tf, conv2d_3_tf1, conv2d_3_tf2, conv2d_4_tf, conv2d_4_tf1, conv2d_4_tf2, conv2d_5_tf, conv2d_5_tf1, conv2d_5_tf2, conv2d_6_tf, conv2d_6_tf1, conv2d_6_tf2
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -2088,23 +2095,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
float2 outputPt = GetOutputPt();
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,22 +2,17 @@
|
|||
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale/Anime4K_Upscale_CNN_x2_VL.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
//!SORT_NAME Anime4K_Upscale_2
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
|
|
@ -103,6 +98,15 @@ Texture2D conv2d_6_tf;
|
|||
//!FORMAT R16G16B16A16_FLOAT
|
||||
Texture2D conv2d_6_tf1;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!DESC Conv-4x3x3x3
|
||||
//!IN INPUT
|
||||
|
|
@ -1143,12 +1147,15 @@ void Pass7(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 8
|
||||
//!DESC Conv-4x1x1x112, Depth-to-Space
|
||||
//!IN INPUT, conv2d_tf, conv2d_tf1, conv2d_1_tf, conv2d_1_tf1, conv2d_2_tf, conv2d_2_tf1, conv2d_3_tf, conv2d_3_tf1, conv2d_4_tf, conv2d_4_tf1, conv2d_5_tf, conv2d_5_tf1, conv2d_6_tf, conv2d_6_tf1
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
void Pass8(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1293,23 +1300,17 @@ void Pass8(uint2 blockStart, uint3 threadId) {
|
|||
float2 outputPt = GetOutputPt();
|
||||
|
||||
pos -= 0.5f * outputPt;
|
||||
WriteToOutput(gxy, float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x += 1u;
|
||||
++gxy.x;
|
||||
pos.x += outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
|
||||
gxy.y += 1u;
|
||||
OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
++gxy.y;
|
||||
pos.y += outputPt.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
|
||||
gxy.x -= 1u;
|
||||
--gxy.x;
|
||||
pos.x -= outputPt.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb);
|
||||
}
|
||||
OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@
|
|||
// 移植自 https://github.com/ActualMandM/cemu_graphic_packs/blob/468d165cf27dae13a06e8bdc3d588d0af775ad91/Filters/Bicubic/output.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!GENERIC_DOWNSCALER
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -27,6 +26,9 @@ float paramC;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
|
@ -35,7 +37,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
|
||||
//!OUT OUTPUT
|
||||
|
||||
float weight(float x) {
|
||||
const float B = paramB;
|
||||
|
|
@ -93,20 +95,20 @@ float4 Pass1(float2 pos) {
|
|||
int2 coord_top_left = int2(max(uv0 * inputSize, 0.5));
|
||||
int2 coord_bottom_right = int2(min(uv3 * inputSize, inputSize - 0.5));
|
||||
|
||||
float4 top = INPUT.Load(int3(coord_top_left, 0)) * rowtaps.x;
|
||||
top += INPUT.SampleLevel(sam, float2(u_middle, uv0.y), 0) * u_weight_sum;
|
||||
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)) * rowtaps.w;
|
||||
float4 total = top * coltaps.x;
|
||||
float3 top = INPUT.Load(int3(coord_top_left, 0)).rgb * rowtaps.x;
|
||||
top += INPUT.SampleLevel(sam, float2(u_middle, uv0.y), 0).rgb * u_weight_sum;
|
||||
top += INPUT.Load(int3(coord_bottom_right.x, coord_top_left.y, 0)).rgb * rowtaps.w;
|
||||
float3 total = top * coltaps.x;
|
||||
|
||||
float4 middle = INPUT.SampleLevel(sam, float2(uv0.x, v_middle), 0) * rowtaps.x;
|
||||
middle += INPUT.SampleLevel(sam, float2(u_middle, v_middle), 0) * u_weight_sum;
|
||||
middle += INPUT.SampleLevel(sam, float2(uv3.x, v_middle), 0) * rowtaps.w;
|
||||
float3 middle = INPUT.SampleLevel(sam, float2(uv0.x, v_middle), 0).rgb * rowtaps.x;
|
||||
middle += INPUT.SampleLevel(sam, float2(u_middle, v_middle), 0).rgb * u_weight_sum;
|
||||
middle += INPUT.SampleLevel(sam, float2(uv3.x, v_middle), 0).rgb * rowtaps.w;
|
||||
total += middle * v_weight_sum;
|
||||
|
||||
float4 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)) * rowtaps.x;
|
||||
bottom += INPUT.SampleLevel(sam, float2(u_middle, uv3.y), 0) * u_weight_sum;
|
||||
bottom += INPUT.Load(int3(coord_bottom_right, 0)) * rowtaps.w;
|
||||
float3 bottom = INPUT.Load(int3(coord_top_left.x, coord_bottom_right.y, 0)).rgb * rowtaps.x;
|
||||
bottom += INPUT.SampleLevel(sam, float2(u_middle, uv3.y), 0).rgb * u_weight_sum;
|
||||
bottom += INPUT.Load(int3(coord_bottom_right, 0)).rgb * rowtaps.w;
|
||||
total += bottom * coltaps.w;
|
||||
|
||||
return total;
|
||||
return float4(total, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,20 +1,20 @@
|
|||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!GENERIC_DOWNSCALER
|
||||
|
||||
//!VERSION 4
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
|
||||
//!OUT OUTPUT
|
||||
float4 Pass1(float2 pos) {
|
||||
return INPUT.SampleLevel(sam, pos, 0);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,8 @@
|
|||
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_cas.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Sharpness
|
||||
|
|
@ -16,6 +15,11 @@ float sharpness;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -23,6 +27,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -35,254 +40,244 @@ SamplerState sam;
|
|||
#ifdef MP_FP16
|
||||
|
||||
void CasFilterH(
|
||||
MF3 src[4][4],
|
||||
uint pos,
|
||||
MF peak,
|
||||
// Output values are for 2 8x8 tiles in a 16x8 region.
|
||||
// pix<R,G,B>.x = right 8x8 tile
|
||||
// pix<R,G,B>.y = left 8x8 tile
|
||||
// This enables later processing to easily be packed as well.
|
||||
out MF2 pixR,
|
||||
out MF2 pixG,
|
||||
out MF2 pixB
|
||||
MF3 src[4][4],
|
||||
uint pos,
|
||||
MF peak,
|
||||
// Output values are for 2 8x8 tiles in a 16x8 region.
|
||||
// pix<R,G,B>.x = right 8x8 tile
|
||||
// pix<R,G,B>.y = left 8x8 tile
|
||||
// This enables later processing to easily be packed as well.
|
||||
out MF2 pixR,
|
||||
out MF2 pixG,
|
||||
out MF2 pixB
|
||||
) {
|
||||
// AOS to SOA conversion.
|
||||
MF2 aR = MF2(src[0][pos + 0].r, src[1][pos + 0].r);
|
||||
MF2 aG = MF2(src[0][pos + 0].g, src[1][pos + 0].g);
|
||||
MF2 aB = MF2(src[0][pos + 0].b, src[1][pos + 0].b);
|
||||
MF2 bR = MF2(src[1][pos + 0].r, src[2][pos + 0].r);
|
||||
MF2 bG = MF2(src[1][pos + 0].g, src[2][pos + 0].g);
|
||||
MF2 bB = MF2(src[1][pos + 0].b, src[2][pos + 0].b);
|
||||
MF2 cR = MF2(src[2][pos + 0].r, src[3][pos + 0].r);
|
||||
MF2 cG = MF2(src[2][pos + 0].g, src[3][pos + 0].g);
|
||||
MF2 cB = MF2(src[2][pos + 0].b, src[3][pos + 0].b);
|
||||
MF2 dR = MF2(src[0][pos + 1].r, src[1][pos + 1].r);
|
||||
MF2 dG = MF2(src[0][pos + 1].g, src[1][pos + 1].g);
|
||||
MF2 dB = MF2(src[0][pos + 1].b, src[1][pos + 1].b);
|
||||
MF2 eR = MF2(src[1][pos + 1].r, src[2][pos + 1].r);
|
||||
MF2 eG = MF2(src[1][pos + 1].g, src[2][pos + 1].g);
|
||||
MF2 eB = MF2(src[1][pos + 1].b, src[2][pos + 1].b);
|
||||
MF2 fR = MF2(src[2][pos + 1].r, src[3][pos + 1].r);
|
||||
MF2 fG = MF2(src[2][pos + 1].g, src[3][pos + 1].g);
|
||||
MF2 fB = MF2(src[2][pos + 1].b, src[3][pos + 1].b);
|
||||
MF2 gR = MF2(src[0][pos + 2].r, src[1][pos + 2].r);
|
||||
MF2 gG = MF2(src[0][pos + 2].g, src[1][pos + 2].g);
|
||||
MF2 gB = MF2(src[0][pos + 2].b, src[1][pos + 2].b);
|
||||
MF2 hR = MF2(src[1][pos + 2].r, src[2][pos + 2].r);
|
||||
MF2 hG = MF2(src[1][pos + 2].g, src[2][pos + 2].g);
|
||||
MF2 hB = MF2(src[1][pos + 2].b, src[2][pos + 2].b);
|
||||
MF2 iR = MF2(src[2][pos + 2].r, src[3][pos + 2].r);
|
||||
MF2 iG = MF2(src[2][pos + 2].g, src[3][pos + 2].g);
|
||||
MF2 iB = MF2(src[2][pos + 2].b, src[3][pos + 2].b);
|
||||
// AOS to SOA conversion.
|
||||
MF2 aR = MF2(src[0][pos + 0].r, src[1][pos + 0].r);
|
||||
MF2 aG = MF2(src[0][pos + 0].g, src[1][pos + 0].g);
|
||||
MF2 aB = MF2(src[0][pos + 0].b, src[1][pos + 0].b);
|
||||
MF2 bR = MF2(src[1][pos + 0].r, src[2][pos + 0].r);
|
||||
MF2 bG = MF2(src[1][pos + 0].g, src[2][pos + 0].g);
|
||||
MF2 bB = MF2(src[1][pos + 0].b, src[2][pos + 0].b);
|
||||
MF2 cR = MF2(src[2][pos + 0].r, src[3][pos + 0].r);
|
||||
MF2 cG = MF2(src[2][pos + 0].g, src[3][pos + 0].g);
|
||||
MF2 cB = MF2(src[2][pos + 0].b, src[3][pos + 0].b);
|
||||
MF2 dR = MF2(src[0][pos + 1].r, src[1][pos + 1].r);
|
||||
MF2 dG = MF2(src[0][pos + 1].g, src[1][pos + 1].g);
|
||||
MF2 dB = MF2(src[0][pos + 1].b, src[1][pos + 1].b);
|
||||
MF2 eR = MF2(src[1][pos + 1].r, src[2][pos + 1].r);
|
||||
MF2 eG = MF2(src[1][pos + 1].g, src[2][pos + 1].g);
|
||||
MF2 eB = MF2(src[1][pos + 1].b, src[2][pos + 1].b);
|
||||
MF2 fR = MF2(src[2][pos + 1].r, src[3][pos + 1].r);
|
||||
MF2 fG = MF2(src[2][pos + 1].g, src[3][pos + 1].g);
|
||||
MF2 fB = MF2(src[2][pos + 1].b, src[3][pos + 1].b);
|
||||
MF2 gR = MF2(src[0][pos + 2].r, src[1][pos + 2].r);
|
||||
MF2 gG = MF2(src[0][pos + 2].g, src[1][pos + 2].g);
|
||||
MF2 gB = MF2(src[0][pos + 2].b, src[1][pos + 2].b);
|
||||
MF2 hR = MF2(src[1][pos + 2].r, src[2][pos + 2].r);
|
||||
MF2 hG = MF2(src[1][pos + 2].g, src[2][pos + 2].g);
|
||||
MF2 hB = MF2(src[1][pos + 2].b, src[2][pos + 2].b);
|
||||
MF2 iR = MF2(src[2][pos + 2].r, src[3][pos + 2].r);
|
||||
MF2 iG = MF2(src[2][pos + 2].g, src[3][pos + 2].g);
|
||||
MF2 iB = MF2(src[2][pos + 2].b, src[3][pos + 2].b);
|
||||
|
||||
// Soft min and max.
|
||||
MF2 mnR = min(min(fR, hR), min(min(bR, dR), eR));
|
||||
MF2 mnG = min(min(fG, hG), min(min(bG, dG), eG));
|
||||
MF2 mnB = min(min(fB, hB), min(min(bB, dB), eB));
|
||||
// Soft min and max.
|
||||
MF2 mnR = min(min(fR, hR), min(min(bR, dR), eR));
|
||||
MF2 mnG = min(min(fG, hG), min(min(bG, dG), eG));
|
||||
MF2 mnB = min(min(fB, hB), min(min(bB, dB), eB));
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF2 mnR2 = min(min(gR, iR), min(min(aR, cR), mnR));
|
||||
MF2 mnG2 = min(min(gG, iG), min(min(aG, cG), mnG));
|
||||
MF2 mnB2 = min(min(gB, iB), min(min(aB, cB), mnB));
|
||||
mnR = mnR + mnR2;
|
||||
mnG = mnG + mnG2;
|
||||
mnB = mnB + mnB2;
|
||||
MF2 mnR2 = min(min(gR, iR), min(min(aR, cR), mnR));
|
||||
MF2 mnG2 = min(min(gG, iG), min(min(aG, cG), mnG));
|
||||
MF2 mnB2 = min(min(gB, iB), min(min(aB, cB), mnB));
|
||||
mnR = mnR + mnR2;
|
||||
mnG = mnG + mnG2;
|
||||
mnB = mnB + mnB2;
|
||||
#endif
|
||||
MF2 mxR = max(max(fR, hR), max(max(bR, dR), eR));
|
||||
MF2 mxG = max(max(fG, hG), max(max(bG, dG), eG));
|
||||
MF2 mxB = max(max(fB, hB), max(max(bB, dB), eB));
|
||||
MF2 mxR = max(max(fR, hR), max(max(bR, dR), eR));
|
||||
MF2 mxG = max(max(fG, hG), max(max(bG, dG), eG));
|
||||
MF2 mxB = max(max(fB, hB), max(max(bB, dB), eB));
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF2 mxR2 = max(max(gR, iR), max(max(aR, cR), mxR));
|
||||
MF2 mxG2 = max(max(gG, iG), max(max(aG, cG), mxG));
|
||||
MF2 mxB2 = max(max(gB, iB), max(max(aB, cB), mxB));
|
||||
mxR = mxR + mxR2;
|
||||
mxG = mxG + mxG2;
|
||||
mxB = mxB + mxB2;
|
||||
MF2 mxR2 = max(max(gR, iR), max(max(aR, cR), mxR));
|
||||
MF2 mxG2 = max(max(gG, iG), max(max(aG, cG), mxG));
|
||||
MF2 mxB2 = max(max(gB, iB), max(max(aB, cB), mxB));
|
||||
mxR = mxR + mxR2;
|
||||
mxG = mxG + mxG2;
|
||||
mxB = mxB + mxB2;
|
||||
#endif
|
||||
// Smooth minimum distance to signal limit divided by smooth max.
|
||||
MF2 rcpMR = rcp(mxR);
|
||||
MF2 rcpMG = rcp(mxG);
|
||||
MF2 rcpMB = rcp(mxB);
|
||||
// Smooth minimum distance to signal limit divided by smooth max.
|
||||
MF2 rcpMR = rcp(mxR);
|
||||
MF2 rcpMG = rcp(mxG);
|
||||
MF2 rcpMB = rcp(mxB);
|
||||
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF2 ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
|
||||
MF2 ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
|
||||
MF2 ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
|
||||
MF2 ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
|
||||
MF2 ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
|
||||
MF2 ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
|
||||
#else
|
||||
MF2 ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
|
||||
MF2 ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
|
||||
MF2 ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
|
||||
MF2 ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
|
||||
MF2 ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
|
||||
MF2 ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
|
||||
#endif
|
||||
// Shaping amount of sharpening.
|
||||
// Shaping amount of sharpening.
|
||||
|
||||
ampR = sqrt(ampR);
|
||||
ampG = sqrt(ampG);
|
||||
ampB = sqrt(ampB);
|
||||
ampR = sqrt(ampR);
|
||||
ampG = sqrt(ampG);
|
||||
ampB = sqrt(ampB);
|
||||
|
||||
// Filter shape.
|
||||
MF2 wR = ampR * peak;
|
||||
MF2 wG = ampG * peak;
|
||||
MF2 wB = ampB * peak;
|
||||
// Filter.
|
||||
// Filter shape.
|
||||
MF2 wR = ampR * peak;
|
||||
MF2 wG = ampG * peak;
|
||||
MF2 wB = ampB * peak;
|
||||
// Filter.
|
||||
|
||||
MF2 rcpWeight = rcp(1.0 + 4.0 * wG);
|
||||
MF2 rcpWeight = rcp(1.0 + 4.0 * wG);
|
||||
|
||||
pixR = saturate((bR * wG + dR * wG + fR * wG + hR * wG + eR) * rcpWeight);
|
||||
pixG = saturate((bG * wG + dG * wG + fG * wG + hG * wG + eG) * rcpWeight);
|
||||
pixB = saturate((bB * wG + dB * wG + fB * wG + hB * wG + eB) * rcpWeight);
|
||||
pixR = saturate((bR * wG + dR * wG + fR * wG + hR * wG + eR) * rcpWeight);
|
||||
pixG = saturate((bG * wG + dG * wG + fG * wG + hG * wG + eG) * rcpWeight);
|
||||
pixB = saturate((bB * wG + dB * wG + fB * wG + hB * wG + eB) * rcpWeight);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
MF3 CasFilter(MF3 src[4][4], uint2 pos, MF peak) {
|
||||
// a b c
|
||||
// d e f
|
||||
// g h i
|
||||
MF3 a = src[pos.x - 1][pos.y - 1];
|
||||
MF3 b = src[pos.x][pos.y - 1];
|
||||
MF3 c = src[pos.x + 1][pos.y - 1];
|
||||
MF3 d = src[pos.x - 1][pos.y];
|
||||
MF3 e = src[pos.x][pos.y];
|
||||
MF3 f = src[pos.x + 1][pos.y];
|
||||
MF3 g = src[pos.x - 1][pos.y + 1];
|
||||
MF3 h = src[pos.x][pos.y + 1];
|
||||
MF3 i = src[pos.x + 1][pos.y + 1];
|
||||
// a b c
|
||||
// d e f
|
||||
// g h i
|
||||
MF3 a = src[pos.x - 1][pos.y - 1];
|
||||
MF3 b = src[pos.x][pos.y - 1];
|
||||
MF3 c = src[pos.x + 1][pos.y - 1];
|
||||
MF3 d = src[pos.x - 1][pos.y];
|
||||
MF3 e = src[pos.x][pos.y];
|
||||
MF3 f = src[pos.x + 1][pos.y];
|
||||
MF3 g = src[pos.x - 1][pos.y + 1];
|
||||
MF3 h = src[pos.x][pos.y + 1];
|
||||
MF3 i = src[pos.x + 1][pos.y + 1];
|
||||
|
||||
// Soft min and max.
|
||||
// a b c b
|
||||
// d e f * 0.5 + d e f * 0.5
|
||||
// g h i h
|
||||
// These are 2.0x bigger (factored out the extra multiply).
|
||||
MF mnR = min3(min3(d.r, e.r, f.r), b.r, h.r);
|
||||
MF mnG = min3(min3(d.g, e.g, f.g), b.g, h.g);
|
||||
MF mnB = min3(min3(d.b, e.b, f.b), b.b, h.b);
|
||||
// Soft min and max.
|
||||
// a b c b
|
||||
// d e f * 0.5 + d e f * 0.5
|
||||
// g h i h
|
||||
// These are 2.0x bigger (factored out the extra multiply).
|
||||
MF mnR = min3(min3(d.r, e.r, f.r), b.r, h.r);
|
||||
MF mnG = min3(min3(d.g, e.g, f.g), b.g, h.g);
|
||||
MF mnB = min3(min3(d.b, e.b, f.b), b.b, h.b);
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF mnR2 = min3(min3(mnR, a.r, c.r), g.r, i.r);
|
||||
MF mnG2 = min3(min3(mnG, a.g, c.g), g.g, i.g);
|
||||
MF mnB2 = min3(min3(mnB, a.b, c.b), g.b, i.b);
|
||||
mnR = mnR + mnR2;
|
||||
mnG = mnG + mnG2;
|
||||
mnB = mnB + mnB2;
|
||||
MF mnR2 = min3(min3(mnR, a.r, c.r), g.r, i.r);
|
||||
MF mnG2 = min3(min3(mnG, a.g, c.g), g.g, i.g);
|
||||
MF mnB2 = min3(min3(mnB, a.b, c.b), g.b, i.b);
|
||||
mnR = mnR + mnR2;
|
||||
mnG = mnG + mnG2;
|
||||
mnB = mnB + mnB2;
|
||||
#endif
|
||||
MF mxR = max3(max3(d.r, e.r, f.r), b.r, h.r);
|
||||
MF mxG = max3(max3(d.g, e.g, f.g), b.g, h.g);
|
||||
MF mxB = max3(max3(d.b, e.b, f.b), b.b, h.b);
|
||||
MF mxR = max3(max3(d.r, e.r, f.r), b.r, h.r);
|
||||
MF mxG = max3(max3(d.g, e.g, f.g), b.g, h.g);
|
||||
MF mxB = max3(max3(d.b, e.b, f.b), b.b, h.b);
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF mxR2 = max3(max3(mxR, a.r, c.r), g.r, i.r);
|
||||
MF mxG2 = max3(max3(mxG, a.g, c.g), g.g, i.g);
|
||||
MF mxB2 = max3(max3(mxB, a.b, c.b), g.b, i.b);
|
||||
mxR = mxR + mxR2;
|
||||
mxG = mxG + mxG2;
|
||||
mxB = mxB + mxB2;
|
||||
MF mxR2 = max3(max3(mxR, a.r, c.r), g.r, i.r);
|
||||
MF mxG2 = max3(max3(mxG, a.g, c.g), g.g, i.g);
|
||||
MF mxB2 = max3(max3(mxB, a.b, c.b), g.b, i.b);
|
||||
mxR = mxR + mxR2;
|
||||
mxG = mxG + mxG2;
|
||||
mxB = mxB + mxB2;
|
||||
#endif
|
||||
// Smooth minimum distance to signal limit divided by smooth max.
|
||||
// Smooth minimum distance to signal limit divided by smooth max.
|
||||
|
||||
MF rcpMR = rcp(mxR);
|
||||
MF rcpMG = rcp(mxG);
|
||||
MF rcpMB = rcp(mxB);
|
||||
MF rcpMR = rcp(mxR);
|
||||
MF rcpMG = rcp(mxG);
|
||||
MF rcpMB = rcp(mxB);
|
||||
|
||||
#ifdef CAS_BETTER_DIAGONALS
|
||||
MF ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
|
||||
MF ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
|
||||
MF ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
|
||||
MF ampR = saturate(min(mnR, 2.0 - mxR) * rcpMR);
|
||||
MF ampG = saturate(min(mnG, 2.0 - mxG) * rcpMG);
|
||||
MF ampB = saturate(min(mnB, 2.0 - mxB) * rcpMB);
|
||||
#else
|
||||
MF ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
|
||||
MF ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
|
||||
MF ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
|
||||
MF ampR = saturate(min(mnR, 1.0 - mxR) * rcpMR);
|
||||
MF ampG = saturate(min(mnG, 1.0 - mxG) * rcpMG);
|
||||
MF ampB = saturate(min(mnB, 1.0 - mxB) * rcpMB);
|
||||
#endif
|
||||
// Shaping amount of sharpening.
|
||||
ampR = sqrt(ampR);
|
||||
ampG = sqrt(ampG);
|
||||
ampB = sqrt(ampB);
|
||||
// Shaping amount of sharpening.
|
||||
ampR = sqrt(ampR);
|
||||
ampG = sqrt(ampG);
|
||||
ampB = sqrt(ampB);
|
||||
|
||||
// Filter shape.
|
||||
// 0 w 0
|
||||
// w 1 w
|
||||
// 0 w 0
|
||||
MF wR = ampR * peak;
|
||||
MF wG = ampG * peak;
|
||||
MF wB = ampB * peak;
|
||||
// Filter.
|
||||
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||||
MF rcpWeight = rcp(1.0 + 4.0 * wG);
|
||||
// Filter shape.
|
||||
// 0 w 0
|
||||
// w 1 w
|
||||
// 0 w 0
|
||||
MF wR = ampR * peak;
|
||||
MF wG = ampG * peak;
|
||||
MF wB = ampB * peak;
|
||||
// Filter.
|
||||
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||||
MF rcpWeight = rcp(1.0 + 4.0 * wG);
|
||||
|
||||
return MF3(
|
||||
saturate((b.r * wG + d.r * wG + f.r * wG + h.r * wG + e.r) * rcpWeight),
|
||||
saturate((b.g * wG + d.g * wG + f.g * wG + h.g * wG + e.g) * rcpWeight),
|
||||
saturate((b.b * wG + d.b * wG + f.b * wG + h.b * wG + e.b) * rcpWeight)
|
||||
);
|
||||
return MF3(
|
||||
saturate((b.r * wG + d.r * wG + f.r * wG + h.r * wG + e.r) * rcpWeight),
|
||||
saturate((b.g * wG + d.g * wG + f.g * wG + h.g * wG + e.g) * rcpWeight),
|
||||
saturate((b.b * wG + d.b * wG + f.b * wG + h.b * wG + e.b) * rcpWeight)
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
|
||||
if (!CheckViewport(gxy)) {
|
||||
return;
|
||||
}
|
||||
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float2 inputPt = GetInputPt();
|
||||
uint i, j;
|
||||
float2 inputPt = GetInputPt();
|
||||
uint i, j;
|
||||
|
||||
MF3 src[4][4];
|
||||
[unroll]
|
||||
for (i = 0; i < 3; i += 2) {
|
||||
[unroll]
|
||||
for (j = 0; j < 3; j += 2) {
|
||||
float2 tpos = (gxy + uint2(i, j)) * inputPt;
|
||||
const MF4 sr = (MF4)INPUT.GatherRed(sam, tpos);
|
||||
const MF4 sg = (MF4)INPUT.GatherGreen(sam, tpos);
|
||||
const MF4 sb = (MF4)INPUT.GatherBlue(sam, tpos);
|
||||
MF3 src[4][4];
|
||||
[unroll]
|
||||
for (i = 0; i < 3; i += 2) {
|
||||
[unroll]
|
||||
for (j = 0; j < 3; j += 2) {
|
||||
float2 tpos = (gxy + uint2(i, j)) * inputPt;
|
||||
const MF4 sr = (MF4)INPUT.GatherRed(sam, tpos);
|
||||
const MF4 sg = (MF4)INPUT.GatherGreen(sam, tpos);
|
||||
const MF4 sb = (MF4)INPUT.GatherBlue(sam, tpos);
|
||||
|
||||
// w z
|
||||
// x y
|
||||
src[i][j] = MF3(sr.w, sg.w, sb.w);
|
||||
src[i][j + 1] = MF3(sr.x, sg.x, sb.x);
|
||||
src[i + 1][j] = MF3(sr.z, sg.z, sb.z);
|
||||
src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y);
|
||||
}
|
||||
}
|
||||
// w z
|
||||
// x y
|
||||
src[i][j] = MF3(sr.w, sg.w, sb.w);
|
||||
src[i][j + 1] = MF3(sr.x, sg.x, sb.x);
|
||||
src[i + 1][j] = MF3(sr.z, sg.z, sb.z);
|
||||
src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y);
|
||||
}
|
||||
}
|
||||
|
||||
const MF peak = -rcp(lerp(8.0, 5.0, (MF)sharpness));
|
||||
const MF peak = -rcp(lerp(8.0, 5.0, (MF)sharpness));
|
||||
|
||||
#ifdef MP_FP16
|
||||
MF2 pixR, pixG, pixB;
|
||||
CasFilterH(src, 0, peak, pixR, pixG, pixB);
|
||||
MF2 pixR, pixG, pixB;
|
||||
CasFilterH(src, 0, peak, pixR, pixG, pixB);
|
||||
|
||||
WriteToOutput(gxy, float3(pixR.x, pixG.x, pixB.x));
|
||||
OUTPUT[gxy] = float4(float3(pixR.x, pixG.x, pixB.x), 1);
|
||||
|
||||
++gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(pixR.y, pixG.y, pixB.y));
|
||||
}
|
||||
++gxy.x;
|
||||
OUTPUT[gxy] = float4(float3(pixR.y, pixG.y, pixB.y), 1);
|
||||
|
||||
CasFilterH(src, 1, peak, pixR, pixG, pixB);
|
||||
CasFilterH(src, 1, peak, pixR, pixG, pixB);
|
||||
|
||||
++gxy.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(pixR.y, pixG.y, pixB.y));
|
||||
}
|
||||
++gxy.y;
|
||||
OUTPUT[gxy] = float4(float3(pixR.y, pixG.y, pixB.y), 1);
|
||||
|
||||
--gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, float3(pixR.x, pixG.x, pixB.x));
|
||||
}
|
||||
--gxy.x;
|
||||
OUTPUT[gxy] = float4(float3(pixR.x, pixG.x, pixB.x), 1);
|
||||
#else
|
||||
WriteToOutput(gxy, CasFilter(src, uint2(1, 1), peak));
|
||||
OUTPUT[gxy] = float4(CasFilter(src, uint2(1, 1), peak), 1);
|
||||
|
||||
++gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(src, uint2(2, 1), peak));
|
||||
}
|
||||
++gxy.x;
|
||||
OUTPUT[gxy] = float4(CasFilter(src, uint2(2, 1), peak), 1);
|
||||
|
||||
++gxy.y;
|
||||
OUTPUT[gxy] = float4(CasFilter(src, uint2(2, 2), peak), 1);
|
||||
|
||||
++gxy.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(src, uint2(2, 2), peak));
|
||||
}
|
||||
|
||||
--gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(src, uint2(1, 2), peak));
|
||||
}
|
||||
--gxy.x;
|
||||
OUTPUT[gxy] = float4(CasFilter(src, uint2(1, 2), peak), 1);
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
// Ported from https://github.com/GPUOpen-Effects/FidelityFX-CAS/blob/master/ffx-cas/ffx_cas.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Sharpness
|
||||
|
|
@ -14,9 +14,13 @@ float sharpness;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -307,7 +311,9 @@ float3 CasFilter(uint2 ip, float4 const0, float peak) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = blockStart + Rmp8x8(threadId.x);
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -317,20 +323,14 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
const float peak = -rcp(lerp(8.0, 5.0, sharpness));
|
||||
|
||||
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
|
||||
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
|
||||
|
||||
gxy.x += 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
|
||||
}
|
||||
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
|
||||
|
||||
gxy.y += 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
|
||||
}
|
||||
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
|
||||
|
||||
gxy.x -= 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, CasFilter(gxy, const0, peak));
|
||||
}
|
||||
OUTPUT[gxy] = float4(CasFilter(gxy, const0, peak), 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@
|
|||
*/
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -173,6 +173,9 @@ int dilation;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -181,6 +184,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
*/
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
//!USE_DYNAMIC
|
||||
|
||||
|
||||
|
|
@ -160,6 +160,9 @@ int interlace;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -168,6 +171,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them
|
||||
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@
|
|||
*/
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -138,6 +138,9 @@ float crtAntiRinging;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -146,6 +149,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
#pragma warning(disable: 3571) // X3571: pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Scanline Hardness
|
||||
|
|
@ -119,6 +119,9 @@ float shape;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -126,6 +129,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -303,7 +307,9 @@ float3 Mask(float2 pos) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -318,8 +324,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
outColor.rgb += Bloom(pos1, inputSize) * bloomAmount;
|
||||
#endif
|
||||
|
||||
if (shadowMask)
|
||||
if (shadowMask) {
|
||||
outColor.rgb *= Mask(gxy + 0.5f);
|
||||
}
|
||||
|
||||
WriteToOutput(gxy, pow(outColor.rgb, 1.0f / 2.2f));
|
||||
OUTPUT[gxy] = float4(pow(outColor.rgb, 1.0f / 2.2f), 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -80,6 +80,9 @@ float contrast;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH OUTPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -153,6 +156,7 @@ float4 Pass1(float2 pos) {
|
|||
//!PASS 2
|
||||
//!STYLE PS
|
||||
//!IN tex1
|
||||
//!OUT OUTPUT
|
||||
|
||||
#define pi 3.14159265358
|
||||
#define normalGauss(x) ((exp(-(x)*(x)*0.5))/sqrt(2.0*pi))
|
||||
|
|
|
|||
7635
src/Effects/CuNNy/CuNNy-16x16C-NVL-DN.hlsl
Normal file
7635
src/Effects/CuNNy/CuNNy-16x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
7635
src/Effects/CuNNy/CuNNy-16x16C-NVL.hlsl
Normal file
7635
src/Effects/CuNNy/CuNNy-16x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
340
src/Effects/CuNNy/CuNNy-2x4C-NVL-DN.hlsl
Normal file
340
src/Effects/CuNNy/CuNNy-2x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
// CuNNy 2x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-DN-D04N02
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) min16float((dot(float3(-3.725e-01, -7.046e-01, -1.734e-01), O(INPUT, float2(x, y)).rgb) + 1.169e-01))
|
||||
|
||||
// Pass 1 ("in") kernel: maps nine scalar samples to one 4-channel feature vector.
// Each input s0_k is multiplied by its own constant V4 weight vector and the
// products are summed; a constant V4 bias is added last.
// Parameters: s0_0..s0_8 - scalar samples; the Pass1 caller in this file passes
//             the 3x3 neighbourhood in row-major order, (-1,-1) first, (1,1) last.
// Returns:    the accumulated V4 (min16float4) value.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
// One weight vector per tap; the constants are baked into the shader.
r += V4(-2.745e-03, -2.925e-03, 1.135e-01, 3.162e-02) * s0_0;
|
||||
r += V4(4.049e-03, -3.428e-01, -7.641e-02, 2.484e-02) * s0_1;
|
||||
r += V4(-8.372e-03, 3.398e-01, 1.072e-01, -5.449e-02) * s0_2;
|
||||
r += V4(1.592e-02, 1.884e-02, -3.160e-02, -7.727e-02) * s0_3;
|
||||
r += V4(4.429e-01, -3.936e-01, -4.134e-01, -4.287e-01) * s0_4;
|
||||
r += V4(4.556e-02, 3.754e-01, -2.300e-02, 4.971e-01) * s0_5;
|
||||
r += V4(-2.031e-02, -6.662e-03, 8.906e-02, 4.602e-02) * s0_6;
|
||||
r += V4(-4.365e-01, 2.183e-03, 8.609e-02, 9.402e-03) * s0_7;
|
||||
r += V4(-3.845e-02, 5.695e-03, 9.645e-02, -5.310e-02) * s0_8;
|
||||
// Bias term (not multiplied by any input).
r += V4(1.492e-02, -1.961e-02, -7.539e-03, -3.574e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 entry point: computes one texel of t0 from a 3x3 neighbourhood of INPUT.
// Parameters: blockStart - origin of the block this thread belongs to.
//             tid        - thread id; only tid.x is used, remapped via Rmp8x8.
// Writes:     t0[gxy] with the result of f0; returns without writing when gxy
//             lies outside the size reported by GetInputSize().
void Pass1(uint2 blockStart, uint3 tid) {
|
||||
// NOTE: `pt` and `pos` are referenced by name inside the O() macro used by l0,
// so these two locals must keep exactly these names.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads that fall outside the input extent do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// +0.5 addresses the texel centre before scaling by pt.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// l0(x, y) samples INPUT at pos + (x, y) * pt and reduces rgb to one scalar
// (dot product with constant weights plus a constant offset).
min16float s0_0 = l0(-1.0, -1.0);
|
||||
min16float s0_1 = l0(0.0, -1.0);
|
||||
min16float s0_2 = l0(1.0, -1.0);
|
||||
min16float s0_3 = l0(-1.0, 0.0);
|
||||
min16float s0_4 = l0(0.0, 0.0);
|
||||
min16float s0_5 = l0(1.0, 0.0);
|
||||
min16float s0_6 = l0(-1.0, 1.0);
|
||||
min16float s0_7 = l0(0.0, 1.0);
|
||||
min16float s0_8 = l0(1.0, 1.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
|
||||
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 2 ("conv1") kernel: combines eighteen 4-channel inputs into one V4.
// Each input is multiplied by its own constant 4x4 matrix (mul(vector, matrix)
// form) and the products are summed; a constant V4 bias is added last.
// Parameters: s0_0..s0_8 - first set of nine taps; the Pass2 caller in this
//                          file passes the values clamped to >= 0.
//             s1_0..s1_8 - second set of nine taps; the Pass2 caller passes
//                          the matching values clamped to <= 0.
// Returns:    the accumulated V4 (min16float4) value.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
// Weights applied to the s0_* inputs.
r += mul(s0_0, M4(1.218e-02, -1.208e-01, -1.955e-01, -1.217e-01, 3.123e-02, -2.317e-02, 1.961e-01, -9.984e-02, 3.038e-03, 2.863e-02, -1.042e-01, -5.529e-02, 1.266e-01, -3.877e-01, 2.315e-01, -1.334e-01));
|
||||
r += mul(s0_1, M4(-1.774e-02, 1.636e-01, 1.379e-01, 7.499e-03, -7.890e-02, -3.970e-02, -6.053e-02, -1.431e-02, 4.167e-02, 9.728e-02, 3.825e-02, -2.704e-02, -2.303e-01, -3.348e-01, 2.940e-01, 4.825e-02));
|
||||
r += mul(s0_2, M4(1.239e-02, 1.613e-02, -2.280e-01, 8.985e-02, 2.106e-03, 3.847e-02, -2.539e-02, -3.326e-02, -6.327e-02, -1.427e-01, 4.218e-02, 8.995e-02, -6.045e-02, -1.073e-01, -1.329e-01, -2.085e-02));
|
||||
r += mul(s0_3, M4(-1.601e-01, -2.448e-01, -3.950e-01, 9.169e-03, -3.694e-02, 2.018e-01, -2.524e-01, 1.719e+00, 3.009e-02, 4.927e-02, 1.564e-01, 3.509e-02, -2.630e-02, -3.986e-01, 1.326e-01, -1.037e-02));
|
||||
r += mul(s0_4, M4(-1.074e+00, -1.654e-01, 4.163e-01, 3.816e-02, 4.580e-01, 4.350e-01, -3.490e-01, -1.257e-02, 1.159e-02, -2.083e-01, -2.744e-01, -2.667e-02, 2.826e-03, 1.986e-01, -2.723e-01, 9.612e-02));
|
||||
r += mul(s0_5, M4(-3.195e-01, -1.450e-01, -1.523e-01, -2.999e-03, 1.166e-01, 1.304e-01, 1.475e-01, 7.286e-02, -4.077e-02, -3.477e-02, 1.496e-01, -1.199e-02, 7.881e-02, 8.911e-02, -1.082e-01, -6.762e-02));
|
||||
r += mul(s0_6, M4(2.020e-02, 1.556e-01, -9.837e-03, 1.537e-02, -1.047e-01, 2.095e-01, 2.025e-01, -3.522e-02, -3.407e-02, -8.949e-02, -7.721e-02, -8.910e-03, 9.305e-02, 2.231e-01, 2.178e-01, 1.502e-02));
|
||||
r += mul(s0_7, M4(-7.936e-02, 3.096e-01, 1.869e-01, -1.950e-03, -2.452e-01, -5.098e-01, 5.304e-01, -4.921e-02, -1.073e-01, 1.062e-01, 2.527e-01, 5.909e-04, 3.797e-02, 3.291e-01, -2.395e-01, 2.768e-02));
|
||||
r += mul(s0_8, M4(-5.559e-02, 1.090e-01, -1.757e-01, 1.261e-02, -1.632e-01, -2.476e-01, -5.674e-02, -4.843e-03, 1.064e-02, 1.023e-01, 2.540e-02, -1.336e-02, 1.362e-01, 1.833e-01, 3.772e-03, 5.118e-04));
|
||||
// Weights applied to the s1_* inputs.
r += mul(s1_0, M4(1.383e-01, 3.469e-01, 3.568e-02, -1.958e-01, -3.170e-02, -1.076e-02, -2.012e-02, -2.104e-04, 2.046e-02, -1.268e-02, -1.618e-01, -6.370e-02, 2.615e-02, 1.494e-01, -1.523e-01, 3.702e-02));
|
||||
r += mul(s1_1, M4(-1.140e-02, 6.811e-01, 5.722e-02, 1.514e-01, -6.311e-02, -3.541e-02, -1.150e-01, 3.625e-02, 1.146e-01, -1.395e-03, 5.059e-01, -7.835e-02, -3.907e-01, 6.172e-02, -9.656e-02, -2.727e-02));
|
||||
r += mul(s1_2, M4(1.239e-01, 1.206e-01, 7.519e-01, 2.106e-02, 8.647e-03, 1.082e-02, 5.931e-02, -4.215e-02, -2.216e-02, -4.829e-02, -1.927e-01, 1.159e-01, -1.789e-01, -9.596e-02, 1.395e-01, -6.395e-02));
|
||||
r += mul(s1_3, M4(1.194e-01, -5.786e-01, -1.761e-03, -1.126e-02, -5.311e-02, -2.325e-01, 1.733e-01, 2.842e-01, -1.080e-01, -1.012e-01, 1.851e-01, 4.253e-02, 1.212e-01, 2.435e-02, -3.061e-01, -9.579e-02));
|
||||
r += mul(s1_4, M4(-4.651e-02, -1.299e+00, -5.020e-01, 5.830e-02, 5.098e-01, 7.344e-02, -1.358e-01, 1.725e-02, -2.980e-01, -6.077e-01, 6.308e-01, -4.014e-02, 3.497e-01, 3.700e-01, -6.035e-01, 8.026e-02));
|
||||
r += mul(s1_5, M4(-1.851e-02, -2.057e-01, 5.081e-01, -5.262e-02, 1.715e-01, 1.387e-01, -1.123e-01, 9.022e-02, -1.532e-01, -3.749e-02, -1.930e-01, 6.423e-02, 2.763e-02, 5.993e-02, 4.141e-01, -8.825e-02));
|
||||
r += mul(s1_6, M4(-6.324e-03, -9.461e-02, 3.044e-02, -4.139e-03, -2.925e-02, 3.975e-01, 1.161e-01, 9.726e-03, 1.353e-01, 2.762e-01, 3.297e-03, 1.076e-02, -8.503e-02, -7.010e-01, -1.967e-01, -1.360e-03));
|
||||
r += mul(s1_7, M4(1.873e-02, 1.099e-01, 1.229e-01, -1.232e-02, -5.723e-01, -4.599e-02, -1.236e-01, -2.003e-02, -4.268e-01, 5.929e-01, 2.942e-01, 3.485e-02, 4.326e-01, -9.250e-02, 3.736e-01, -2.393e-02));
|
||||
r += mul(s1_8, M4(-5.991e-02, 1.199e-03, -1.349e-02, -1.321e-03, -2.036e-01, -1.937e-01, -7.888e-02, -9.144e-03, 1.557e-01, 7.018e-02, -2.646e-01, -3.360e-06, 1.742e-01, 1.814e-01, 1.385e-01, -1.030e-02));
|
||||
// Bias term (not multiplied by any input).
r += V4(4.789e-02, 4.713e-03, -2.854e-02, 9.967e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point: computes one texel of t1 from a 3x3 neighbourhood of t0.
// Every sample is split into its non-negative and non-positive parts, and both
// sets are passed to f0.
// Parameters: blockStart - origin of the block this thread belongs to.
//             tid        - thread id; only tid.x is used, remapped via Rmp8x8.
// Writes:     t1[gxy] with the result of f0; returns without writing when gxy
//             lies outside the size reported by GetInputSize().
void Pass2(uint2 blockStart, uint3 tid) {
|
||||
// NOTE: `pt` and `pos` are referenced by name inside the O() macro used by l0,
// so these two locals must keep exactly these names.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads that fall outside the input extent do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// +0.5 addresses the texel centre before scaling by pt.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// l0(x, y) samples t0 at pos + (x, y) * pt; taps are read in row-major order.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// -max(-x, 0) equals min(x, 0): keep only the non-positive part of each tap.
// This must run before s0_* is clamped below, since it reads the raw samples.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// Clamp s0_* in place to the non-negative part.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 3 ("conv2") kernel: combines eighteen 4-channel inputs into one V4.
// Each input is multiplied by its own constant 4x4 matrix (mul(vector, matrix)
// form) and the products are summed; a constant V4 bias is added last.
// Parameters: s0_0..s0_8 - first set of nine taps; the Pass3 caller in this
//                          file passes the values clamped to >= 0.
//             s1_0..s1_8 - second set of nine taps; the Pass3 caller passes
//                          the matching values clamped to <= 0.
// Returns:    the accumulated V4 (min16float4) value.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
// Weights applied to the s0_* inputs.
r += mul(s0_0, M4(-1.565e-01, 1.307e-02, -5.269e-02, 5.465e-02, 2.936e-01, 1.626e-01, 4.589e-02, 2.478e-02, 3.520e-01, -5.445e-02, -2.480e-01, 2.838e-02, 1.841e-04, 1.264e-02, -1.370e-02, 2.588e-02));
|
||||
r += mul(s0_1, M4(2.350e-01, 2.116e-01, 2.167e-02, -1.559e-01, 2.502e-01, 4.320e-01, -7.152e-01, 2.270e-01, -2.668e-01, -2.117e-01, 5.598e-01, 2.261e-01, 4.101e-02, -4.860e-02, 3.530e-02, 8.932e-02));
|
||||
r += mul(s0_2, M4(-4.398e-02, -4.486e-02, -5.040e-02, 9.803e-02, 7.515e-02, 1.203e-01, -5.357e-02, -2.803e-01, -1.435e-01, 7.150e-03, -3.118e-02, -2.636e-01, -2.969e-02, -2.011e-02, 2.658e-02, -2.572e-02));
|
||||
r += mul(s0_3, M4(9.140e-02, -1.875e-01, 9.757e-02, 2.976e-02, -8.325e-02, 6.109e-02, -4.304e-02, 7.057e-02, 7.324e-01, -1.528e-01, 2.930e-01, 7.503e-02, -3.901e-02, 1.109e-03, -2.693e-02, -3.330e-02));
|
||||
r += mul(s0_4, M4(-9.944e-02, 1.858e-01, -2.436e-01, 3.822e-02, 6.685e-02, -1.758e-01, 1.382e-01, -1.715e-01, 3.252e-01, 5.176e-01, -2.939e-01, 4.311e-01, -6.125e-02, 1.905e-01, 8.140e-02, 2.095e-01));
|
||||
r += mul(s0_5, M4(3.193e-02, 6.029e-02, 1.869e-03, 8.627e-04, -1.402e-02, 4.288e-02, -5.756e-02, 8.813e-02, -2.758e-02, -5.267e-02, 1.702e-03, -6.676e-01, 6.373e-02, 5.766e-02, -6.325e-02, -2.744e-01));
|
||||
r += mul(s0_6, M4(4.918e-02, 5.420e-04, 3.692e-02, 7.796e-03, -1.163e-02, -4.074e-02, 2.057e-02, -2.837e-02, 1.083e-01, 1.958e-01, -5.078e-02, 2.750e-02, 5.323e-02, 5.953e-03, 4.766e-02, -2.265e-03));
|
||||
r += mul(s0_7, M4(-3.968e-02, -1.535e-01, 6.564e-02, -2.620e-02, 3.742e-02, 8.659e-02, -4.440e-02, 6.007e-03, -9.585e-02, -9.425e-02, -1.517e-01, 3.701e-01, -1.332e-01, -1.860e-01, -5.436e-02, 3.781e-01));
|
||||
r += mul(s0_8, M4(-1.145e-02, 6.045e-02, -4.676e-02, -5.604e-02, -1.576e-02, -3.528e-02, 2.252e-02, 1.997e-02, -2.546e-02, -6.894e-02, 7.238e-02, -3.495e-01, -6.323e-02, -1.042e-01, 1.091e-01, -4.170e-01));
|
||||
// Weights applied to the s1_* inputs.
r += mul(s1_0, M4(-5.215e-01, 6.255e-01, 5.587e-02, -5.362e-02, 9.895e-02, -8.743e-03, 1.058e-01, -3.585e-02, -1.594e-02, -1.034e-01, 3.848e-02, -5.432e-02, -1.796e-02, 5.838e-02, 1.304e-01, -2.122e-02));
|
||||
r += mul(s1_1, M4(-6.987e-02, 8.696e-01, -1.130e+00, 5.558e-03, -1.080e-01, 4.195e-02, -1.323e-01, 2.270e-01, 3.451e-02, -1.616e-02, 4.251e-03, 1.470e-01, 2.442e-01, -5.904e-02, -3.467e-01, -2.056e-02));
|
||||
r += mul(s1_2, M4(4.884e-02, -1.034e-01, 5.823e-02, 1.131e-01, -4.126e-02, 6.519e-02, -1.532e-02, -2.420e-01, 1.092e-02, 1.869e-02, 1.913e-03, -1.787e-02, 1.122e-01, -1.481e-01, 1.843e-01, 3.454e-01));
|
||||
r += mul(s1_3, M4(-2.906e-01, -9.847e-01, 4.092e-01, 1.655e-01, 4.092e-02, 2.913e-01, 1.306e-01, -4.682e-02, 2.568e-01, -4.528e-02, 3.207e-02, 9.888e-02, -3.928e-01, -3.546e-01, -2.367e-01, -3.239e-01));
|
||||
r += mul(s1_4, M4(4.463e-01, -1.594e-01, 8.418e-01, -3.525e-01, 5.957e-01, 1.082e+00, -9.245e-01, 2.726e-01, 1.210e-01, 2.024e-01, -8.063e-03, -2.433e-01, -1.512e+00, 9.316e-01, 2.305e-01, -5.109e-01));
|
||||
r += mul(s1_5, M4(-2.393e-02, 1.286e-02, -9.453e-02, 3.071e-01, -1.402e-01, -2.436e-01, 1.202e-01, -1.409e-01, -1.857e-02, 2.421e-02, -2.642e-02, -7.415e-02, 8.786e-01, 5.260e-04, -9.212e-02, 1.849e-01));
|
||||
r += mul(s1_6, M4(8.958e-02, 9.057e-02, 1.712e-02, -2.838e-02, -1.405e-01, -6.455e-02, -2.695e-02, -1.110e-02, 8.731e-03, 6.531e-02, -3.752e-02, 1.194e-01, 4.585e-01, 6.270e-01, -1.367e-01, -2.529e-01));
|
||||
r += mul(s1_7, M4(-4.381e-02, -1.595e-02, -4.601e-02, 7.257e-02, -8.036e-02, -1.360e-01, 1.154e-01, -7.942e-02, -4.653e-02, -7.121e-02, 2.720e-02, 8.346e-02, -1.871e+00, -8.300e-01, -6.760e-01, 7.402e-01));
|
||||
r += mul(s1_8, M4(1.359e-02, -2.489e-02, 3.529e-02, -1.121e-01, -6.190e-02, -2.628e-02, -2.090e-03, 2.359e-01, -2.412e-02, -2.463e-02, 8.317e-03, -5.330e-02, 2.105e+00, 1.550e-01, 1.457e+00, -1.129e+00));
|
||||
// Bias term (not multiplied by any input).
r += V4(7.359e-03, -1.132e-02, 1.248e-02, 7.243e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point. Each thread handles one texel of the input-sized grid:
// it reads a 3x3 neighbourhood through this pass's l0() macro (defined outside
// this excerpt), splits every tap into its non-negative and non-positive
// parts, and stores f0() of those eighteen vectors into t0.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are presumably referenced by name inside the
	// sampling macros, so these two locals keep their original names.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread falls outside the input-sized area.
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	V4 p0 = l0(-1.0, -1.0), p1 = l0(0.0, -1.0), p2 = l0(1.0, -1.0);
	V4 p3 = l0(-1.0, 0.0), p4 = l0(0.0, 0.0), p5 = l0(1.0, 0.0);
	V4 p6 = l0(-1.0, 1.0), p7 = l0(0.0, 1.0), p8 = l0(1.0, 1.0);

	// Non-positive part of every tap, taken before the taps are clamped below.
	V4 n0 = -max(-p0, 0.0), n1 = -max(-p1, 0.0), n2 = -max(-p2, 0.0);
	V4 n3 = -max(-p3, 0.0), n4 = -max(-p4, 0.0), n5 = -max(-p5, 0.0);
	V4 n6 = -max(-p6, 0.0), n7 = -max(-p7, 0.0), n8 = -max(-p8, 0.0);

	// Reduce the taps themselves to their non-negative part.
	p0 = max(p0, 0.0); p1 = max(p1, 0.0); p2 = max(p2, 0.0);
	p3 = max(p3, 0.0); p4 = max(p4, 0.0); p5 = max(p5, 0.0);
	p6 = max(p6, 0.0); p7 = max(p7, 0.0); p8 = max(p8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||
|
||||
// Pass 4 ("out-shuffle") declaration: inputs INPUT and t0, output OUTPUT.
//!PASS 4
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
// l0(x, y): result of O(t0, float2(x, y)) converted to V4. O and V4 are
// defined outside this excerpt.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 4 kernel. Accumulates, in a fixed order, mul(v, W) for each of the
// eighteen input vectors with its own constant 4x4 matrix, adds a constant
// bias vector and returns tanh() of the sum.
//   s0_0..s0_8 - first set of nine 3x3 taps (Pass4 passes the non-negative parts)
//   s1_0..s1_8 - second set of nine taps (Pass4 passes the non-positive parts)
// The order of the additions is kept exactly as generated, because a
// min16float sum is sensitive to evaluation order.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the first tap set.
	acc += mul(s0_0, M4(8.642e-03, -1.295e-02, 1.998e-02, -1.289e-03, -4.147e-02, -4.021e-03, 1.491e-04, -7.275e-03, 1.574e-02, -4.122e-03, 1.126e-02, 8.962e-03, 5.174e-02, 3.405e-02, 4.993e-02, 4.529e-02));
	acc += mul(s0_1, M4(-1.028e-01, -2.764e-02, -2.777e-02, -7.170e-03, -8.365e-02, 3.550e-02, 1.288e-01, 2.475e-02, 5.017e-02, 5.917e-02, 3.473e-02, 8.510e-03, 2.332e-02, 8.047e-02, 9.838e-02, 4.234e-02));
	acc += mul(s0_2, M4(-2.319e-02, -4.432e-02, -1.679e-02, 8.855e-03, 3.259e-02, -1.974e-01, 5.938e-02, 1.616e-01, -5.605e-04, 3.183e-02, -3.356e-03, 3.138e-02, 9.572e-03, -3.887e-02, -2.632e-02, -1.161e-02));
	acc += mul(s0_3, M4(-2.947e-02, -4.358e-02, 1.208e-03, -2.705e-02, -1.037e-02, -6.812e-02, -5.436e-02, -3.840e-02, 3.684e-02, 2.560e-02, 1.715e-02, -3.670e-02, -5.930e-02, -2.310e-02, -6.163e-02, -3.562e-02));
	acc += mul(s0_4, M4(5.520e-01, 1.213e-01, 1.753e-01, 5.436e-02, 5.879e-01, 2.281e-01, -2.703e-01, 1.519e-01, 5.739e-01, 2.959e-01, 9.449e-02, 2.473e-02, -5.998e-01, -9.548e-02, -6.035e-01, -9.663e-02));
	acc += mul(s0_5, M4(-9.740e-02, 2.744e-01, -1.522e-01, -7.204e-02, 1.178e-01, 6.112e-01, -4.801e-02, -5.176e-01, 1.480e-02, 8.323e-02, -6.764e-02, 4.138e-02, 1.121e-01, -8.141e-02, 1.211e-01, -8.737e-02));
	acc += mul(s0_6, M4(6.315e-02, 6.323e-02, 1.146e-02, 3.378e-02, -9.598e-02, -1.089e-01, 2.780e-02, -6.091e-02, -1.194e-01, -1.038e-01, -2.147e-02, -4.236e-02, -2.300e-02, -3.184e-02, -1.560e-02, -2.206e-02));
	acc += mul(s0_7, M4(-1.772e-01, -1.304e-01, 1.265e-01, -7.871e-02, 1.978e-01, 1.074e-01, 1.240e-02, 4.600e-02, 1.558e-02, -3.196e-02, 2.018e-01, 1.496e-01, 1.421e-01, 8.472e-02, 7.432e-02, 9.935e-02));
	acc += mul(s0_8, M4(1.132e-02, -2.296e-03, 1.274e-01, 3.428e-01, -5.796e-02, -6.156e-02, -2.549e-01, -2.231e-01, -8.762e-02, -9.318e-02, -2.378e-01, -3.018e-01, 5.601e-03, -2.670e-02, 2.896e-02, -3.910e-02));

	// Contributions of the second tap set.
	acc += mul(s1_0, M4(4.603e-02, -2.582e-02, -9.045e-03, 1.446e-02, -1.835e-02, -2.533e-02, 3.681e-03, -9.420e-03, -5.802e-02, 2.310e-02, 3.059e-02, 1.313e-03, 9.639e-02, 8.284e-02, 1.071e-01, -3.287e-02));
	acc += mul(s1_1, M4(-2.480e-02, 2.321e-03, -3.594e-02, -1.101e-01, 2.850e-02, 2.912e-02, 2.597e-02, 2.777e-02, 5.701e-02, 9.536e-04, 2.533e-02, 1.102e-02, -3.714e-03, 7.838e-02, -1.716e-02, 1.723e-01));
	acc += mul(s1_2, M4(-4.473e-03, 1.521e-02, -1.887e-02, 6.731e-03, 2.199e-03, 2.965e-02, -3.709e-03, 1.671e-02, 1.376e-02, -4.819e-02, -8.832e-04, 3.531e-02, -8.453e-03, -1.276e-02, -1.461e-02, 4.460e-03));
	acc += mul(s1_3, M4(6.139e-02, -1.511e-01, 1.102e-01, -1.428e-01, -5.114e-02, -6.594e-02, -1.693e-02, -4.651e-02, 2.440e-01, 2.010e-02, -1.900e-01, -1.243e-03, -2.397e-01, 2.002e-01, -3.506e-01, 2.171e-01));
	acc += mul(s1_4, M4(-6.189e-02, 5.137e-01, -8.132e-02, 4.526e-01, 3.263e-01, 2.134e-01, 1.027e-01, 2.067e-02, 2.407e-01, 2.591e-01, 4.489e-01, 2.042e-01, 1.932e-02, -4.463e-01, -1.479e-01, -6.843e-01));
	acc += mul(s1_5, M4(-7.571e-03, -7.787e-02, 9.918e-03, -8.469e-02, 4.056e-02, -1.926e-02, -4.968e-02, 2.416e-02, 2.699e-02, 2.783e-01, -7.854e-02, -6.549e-02, 6.835e-03, 2.288e-02, 1.048e-02, -3.273e-02));
	acc += mul(s1_6, M4(7.034e-02, 4.236e-02, 7.905e-02, -2.283e-03, -8.423e-02, -7.784e-02, -7.540e-03, -3.373e-02, -1.019e-01, -1.421e-01, 6.713e-02, -8.716e-02, -6.980e-02, -4.731e-02, -3.086e-02, -6.210e-03));
	acc += mul(s1_7, M4(-1.597e-01, -2.036e-01, 5.194e-02, 8.457e-02, 1.387e-01, 7.910e-02, 2.030e-02, 5.848e-02, 2.154e-01, 1.382e-01, -8.617e-02, 7.552e-02, 3.127e-02, 5.899e-02, 1.733e-01, 1.657e-01));
	acc += mul(s1_8, M4(3.595e-02, 3.243e-02, 1.450e-01, 2.046e-01, -2.939e-02, -1.306e-02, -1.587e-01, -2.607e-01, -8.980e-02, -5.350e-02, -2.627e-01, -2.861e-01, -1.585e-02, -2.032e-02, -1.662e-02, 1.560e-02));

	// Bias, then the output non-linearity.
	acc += V4(-7.528e-04, -8.388e-04, -1.247e-03, -1.205e-03);
	return tanh(acc);
}
|
||||
|
||||
// Pass 4 ("out-shuffle") entry point. Each thread produces a 2x2 quad of
// OUTPUT pixels: it evaluates f0() once on the 3x3 neighbourhood read through
// l0(), then for each of the four pixels samples INPUT with the SL sampler,
// converts it with rgb2yuv, adds one component of the f0() result to the
// first channel (saturated), and converts back with yuv2rgb.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` are presumably referenced by name inside the
	// sampling macros (defined outside this excerpt), so they keep their names.
	float2 pt = float2(GetInputPt());
	// First pixel of the 2x2 quad this thread writes.
	uint2 quad = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 outExtent = GetOutputSize();
	if (any(quad >= outExtent)) {
		return;
	}
	// Sampling position derived from the quad index halved.
	float2 pos = ((quad >> 1) + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	V4 p0 = l0(-1.0, -1.0), p1 = l0(0.0, -1.0), p2 = l0(1.0, -1.0);
	V4 p3 = l0(-1.0, 0.0), p4 = l0(0.0, 0.0), p5 = l0(1.0, 0.0);
	V4 p6 = l0(-1.0, 1.0), p7 = l0(0.0, 1.0), p8 = l0(1.0, 1.0);

	// Non-positive part of every tap, taken before the taps are clamped below.
	V4 n0 = -max(-p0, 0.0), n1 = -max(-p1, 0.0), n2 = -max(-p2, 0.0);
	V4 n3 = -max(-p3, 0.0), n4 = -max(-p4, 0.0), n5 = -max(-p5, 0.0);
	V4 n6 = -max(-p6, 0.0), n7 = -max(-p7, 0.0), n8 = -max(-p8, 0.0);

	// Reduce the taps themselves to their non-negative part.
	p0 = max(p0, 0.0); p1 = max(p1, 0.0); p2 = max(p2, 0.0);
	p3 = max(p3, 0.0); p4 = max(p4, 0.0); p5 = max(p5, 0.0);
	p6 = max(p6, 0.0); p7 = max(p7, 0.0); p8 = max(p8, 0.0);

	// One value per pixel of the quad: x, y, z, w.
	const V4 residual = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 outPt = float2(GetOutputPt());

	// The quad is walked (0,0) -> (+1,0) -> (+1,+1) -> (0,+1). `pos` is updated
	// incrementally, exactly as generated, so every sample position is
	// bit-identical to the original sequence.

	// Pixel (0, 0): residual.x.
	pos -= 0.5f * outPt;
	float3 ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.x), ycc.yz)), 1);

	// Pixel (+1, 0): residual.y.
	quad.x += 1;
	pos.x += outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.y), ycc.yz)), 1);

	// Pixel (+1, +1): residual.w.
	quad.y += 1;
	pos.y += outPt.y;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.w), ycc.yz)), 1);

	// Pixel (0, +1): residual.z.
	quad.x -= 1;
	pos.x -= outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.z), ycc.yz)), 1);
}
|
||||
340
src/Effects/CuNNy/CuNNy-2x4C-NVL.hlsl
Normal file
340
src/Effects/CuNNy/CuNNy-2x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
// CuNNy 2x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Effect header: Magpie effect, format version 4, sort name CuNNy-D04N02.
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-D04N02
|
||||
|
||||
// INPUT: texture declared without size or format directives.
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
// OUTPUT: declared at twice the input width and twice the input height.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
// SP: sampler with POINT filtering (used by the O() macro below).
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
// SL: sampler with LINEAR filtering.
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
// Common macros.
// O(t, p): samples texture t with SP at `pos + p * pt`, mip level 0. It relies
//          on locals named `pos` and `pt` existing at the expansion site.
// V4 / M4: aliases for min16float4 / min16float4x4.
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
// t0: input-sized texture, format R8G8B8A8_SNORM.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
// t1: second input-sized texture, format R8G8B8A8_SNORM.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
// Pass 1 ("in") declaration: input INPUT, output t0.
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): scalar feature for one tap. It is the dot product of the RGB
// value sampled from INPUT via O() with fixed weights, plus a constant offset,
// converted to min16float.
#define l0(x, y) min16float((dot(float3(-6.049e-01, -1.145e+00, -2.540e-01), O(INPUT, float2(x, y)).rgb) + 1.794e+00))
|
||||
|
||||
// Pass 1 ("in") kernel. Each of the nine scalar taps scales its own constant
// 4-vector; the scaled vectors are summed in a fixed order and a constant
// bias vector is added last.
//   s0_0..s0_8 - scalar taps of the 3x3 window (Pass1 passes l0() results)
// The order of the additions is kept exactly as generated, because a
// min16float sum is sensitive to evaluation order.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 acc = 0.0;

	acc += V4(1.411e-01, -9.763e-03, -1.361e-01, -9.610e-04) * s0_0;
	acc += V4(6.068e-02, 7.238e-03, -1.182e-01, -1.535e-02) * s0_1;
	acc += V4(-8.549e-02, -2.876e-03, -8.740e-03, 1.652e-02) * s0_2;
	acc += V4(-3.249e-01, 5.392e-02, -8.518e-02, -7.437e-03) * s0_3;
	acc += V4(2.435e-02, -6.191e-01, 7.147e-01, 5.862e-01) * s0_4;
	acc += V4(1.968e-01, 1.868e-02, -1.723e-01, -5.801e-01) * s0_5;
	acc += V4(1.528e-01, -4.489e-02, 5.871e-03, 4.528e-03) * s0_6;
	acc += V4(-4.619e-01, 6.152e-01, -1.313e-01, -5.326e-02) * s0_7;
	acc += V4(2.902e-01, -1.801e-02, -6.907e-02, 5.105e-02) * s0_8;

	// Bias.
	acc += V4(4.440e-03, -1.956e-04, 1.215e-03, 1.790e-03);
	return acc;
}
|
||||
|
||||
// Pass 1 ("in") entry point. Each thread handles one texel of the input-sized
// grid: it evaluates the scalar l0() feature at the nine positions of the 3x3
// window and stores f0() of those nine scalars into t0.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro expands to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread falls outside the input-sized area.
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	min16float a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	min16float a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	min16float a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	t0[texel] = f0(a0, a1, a2, a3, a4, a5, a6, a7, a8);
}
|
||||
|
||||
// Pass 2 ("conv1") declaration: input t0, output t1.
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): t0 sampled via O() at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 2 ("conv1") kernel. Accumulates, in a fixed order, mul(v, W) for each
// of the eighteen input vectors with its own constant 4x4 matrix, then adds a
// constant bias vector. No non-linearity is applied to the result here.
//   s0_0..s0_8 - first set of nine 3x3 taps (Pass2 passes the non-negative parts)
//   s1_0..s1_8 - second set of nine taps (Pass2 passes the non-positive parts)
// The order of the additions is kept exactly as generated, because a
// min16float sum is sensitive to evaluation order.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the first tap set.
	acc += mul(s0_0, M4(1.120e-01, 8.150e-03, 7.146e-02, -4.942e-02, 3.623e-01, -1.678e-01, 1.189e-01, 1.372e-01, 1.225e-01, -2.568e-02, 6.959e-02, 1.788e-02, 1.962e-01, -1.870e-01, -6.548e-03, -4.334e-02));
	acc += mul(s0_1, M4(1.805e-01, 4.881e-02, -2.342e-03, 2.035e-02, -2.427e-01, -2.197e-02, -2.036e-02, 3.919e-01, -3.037e-01, 7.047e-02, 3.426e-02, -8.694e-02, 2.144e-01, 1.431e-01, -7.851e-02, 2.247e-01));
	acc += mul(s0_2, M4(6.328e-02, -4.140e-02, 3.362e-02, 5.204e-02, -1.052e-01, 1.698e-01, -2.727e-03, 1.110e-01, 7.156e-02, -1.108e-02, -2.717e-02, 5.680e-02, -6.118e-02, 2.435e-02, 1.743e-02, 8.179e-02));
	acc += mul(s0_3, M4(1.557e-01, 1.189e-01, 8.836e-02, 2.178e-02, -3.954e-01, 2.466e-01, -2.166e-01, -7.051e-02, -2.857e-01, -1.611e-02, -8.667e-02, 1.895e-04, 2.744e-01, 1.499e-01, 8.228e-02, 2.938e-02));
	acc += mul(s0_4, M4(2.441e-01, -3.694e-01, 1.751e-01, 6.833e-01, -1.087e-01, -2.065e-01, -1.557e-01, -6.945e-02, -1.403e-02, 2.171e-02, 3.748e-02, 2.646e-01, -3.718e-01, -1.188e-01, 1.569e-01, 8.554e-02));
	acc += mul(s0_5, M4(-5.069e-02, 2.646e-01, -5.754e-02, -3.545e-01, 1.404e-01, 1.123e-01, 4.577e-02, -1.465e-01, -2.119e-02, -1.115e-02, 1.661e-01, -4.029e-01, -2.123e-01, 2.774e-01, -1.905e-02, -1.093e-02));
	acc += mul(s0_6, M4(2.593e-02, -1.801e-02, 9.053e-02, -2.721e-02, 6.658e-03, 3.802e-02, -3.282e-02, -1.116e-01, 1.201e-01, 2.095e-02, -2.061e-02, 2.498e-03, -1.831e-01, -1.743e-01, 1.062e-01, -6.113e-01));
	acc += mul(s0_7, M4(-1.172e-01, -1.130e-02, -6.727e-02, 7.753e-02, -3.958e-03, -9.790e-02, -1.635e-01, 1.049e-01, 2.862e-01, -2.733e-02, -1.566e-01, -2.900e-01, -1.050e-01, -3.441e-01, -8.690e-02, 8.659e-02));
	acc += mul(s0_8, M4(2.145e-01, 4.613e-02, 1.590e-02, -4.749e-02, 3.291e-01, 1.012e-01, 8.647e-03, -2.282e-01, 2.215e-01, 1.713e-01, 1.414e-01, -3.916e-01, -2.488e-01, 1.458e-01, 2.518e-02, -9.979e-02));

	// Contributions of the second tap set.
	acc += mul(s1_0, M4(-2.127e-02, 3.575e-02, 9.372e-02, -2.662e-02, 4.467e-02, 1.304e-02, 3.849e-02, 5.186e-02, 7.417e-02, 3.647e-02, 4.960e-02, -3.988e-02, -3.998e-02, 1.173e-01, 7.752e-03, -2.263e-02));
	acc += mul(s1_1, M4(-1.283e-01, -1.460e-01, 1.963e-02, -1.108e-01, -4.171e-01, 2.397e-01, -5.886e-02, 7.788e-02, -2.820e-02, -1.719e-01, 9.334e-03, -1.255e-01, 1.392e-01, 9.532e-03, -5.163e-02, 8.641e-02));
	acc += mul(s1_2, M4(-1.889e-01, 1.933e-01, 5.574e-02, 6.723e-02, -1.015e-01, -3.316e-01, -1.460e-02, -1.606e-01, 1.052e-01, 1.027e-02, -4.626e-02, 5.368e-02, -9.160e-03, -9.514e-02, 2.577e-02, 7.122e-02));
	acc += mul(s1_3, M4(-1.958e-01, 1.276e-01, 7.303e-02, -1.135e-01, -2.277e-01, 2.017e-01, -5.223e-02, 1.379e-01, -1.737e-01, 4.871e-02, -8.142e-02, 1.392e-01, 8.113e-02, 4.415e-01, -1.174e-01, 1.910e-02));
	acc += mul(s1_4, M4(-3.233e-01, -4.158e-01, 8.391e-02, 2.017e-01, 9.790e-02, -4.865e-02, -2.172e-01, 2.607e-01, -2.458e-01, -4.931e-01, 3.016e-01, 2.198e-01, -7.173e-02, -5.683e-01, -7.447e-02, -1.264e-01));
	acc += mul(s1_5, M4(-4.189e-01, 3.271e-01, 8.844e-02, -5.295e-01, 6.365e-02, -1.513e-01, 1.246e-02, -2.005e-01, 1.764e-01, 5.796e-01, 7.286e-02, -1.428e-01, -1.130e-01, -6.883e-02, -1.303e-02, -1.091e-01));
	acc += mul(s1_6, M4(-6.621e-02, 9.901e-03, 9.472e-02, -3.568e-02, 1.067e-01, -3.318e-02, 3.152e-01, -5.261e-02, 1.108e-01, 7.081e-02, -1.289e-01, 6.477e-03, 1.036e-01, -1.477e-03, 1.035e+00, -9.204e-02));
	acc += mul(s1_7, M4(-2.721e-01, -5.458e-02, -1.707e-01, -1.096e-02, -1.302e-01, -9.074e-02, 1.694e-01, 6.307e-02, 4.233e-01, -5.112e-02, -3.545e-01, -2.589e-01, 8.276e-02, -3.975e-01, 7.705e-02, 4.482e-01));
	acc += mul(s1_8, M4(1.175e-01, 2.212e-03, 5.751e-02, -8.666e-02, 2.532e-01, 1.303e-01, 7.291e-02, -2.126e-01, 4.815e-01, 1.649e-01, -4.748e-02, -3.330e-01, -1.252e-01, -8.987e-03, -4.285e-03, -1.106e-01));

	// Bias.
	acc += V4(3.566e-03, 2.403e-03, -1.451e-03, 4.304e-03);
	return acc;
}
|
||||
|
||||
// Pass 2 ("conv1") entry point. Each thread handles one texel of the
// input-sized grid: it reads the 3x3 neighbourhood of t0 through l0(), splits
// every tap into its non-negative and non-positive parts, and stores f0() of
// those eighteen vectors into t1.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro expands to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread falls outside the input-sized area.
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	V4 p0 = l0(-1.0, -1.0), p1 = l0(0.0, -1.0), p2 = l0(1.0, -1.0);
	V4 p3 = l0(-1.0, 0.0), p4 = l0(0.0, 0.0), p5 = l0(1.0, 0.0);
	V4 p6 = l0(-1.0, 1.0), p7 = l0(0.0, 1.0), p8 = l0(1.0, 1.0);

	// Non-positive part of every tap, taken before the taps are clamped below.
	V4 n0 = -max(-p0, 0.0), n1 = -max(-p1, 0.0), n2 = -max(-p2, 0.0);
	V4 n3 = -max(-p3, 0.0), n4 = -max(-p4, 0.0), n5 = -max(-p5, 0.0);
	V4 n6 = -max(-p6, 0.0), n7 = -max(-p7, 0.0), n8 = -max(-p8, 0.0);

	// Reduce the taps themselves to their non-negative part.
	p0 = max(p0, 0.0); p1 = max(p1, 0.0); p2 = max(p2, 0.0);
	p3 = max(p3, 0.0); p4 = max(p4, 0.0); p5 = max(p5, 0.0);
	p6 = max(p6, 0.0); p7 = max(p7, 0.0); p8 = max(p8, 0.0);

	t1[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||
|
||||
// Pass 3 ("conv2") declaration: input t1, output t0.
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): t1 sampled via O() at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 3 ("conv2") kernel. Accumulates, in a fixed order, mul(v, W) for each
// of the eighteen input vectors with its own constant 4x4 matrix, then adds a
// constant bias vector. No non-linearity is applied to the result here.
//   s0_0..s0_8 - first set of nine 3x3 taps (Pass3 passes the non-negative parts)
//   s1_0..s1_8 - second set of nine taps (Pass3 passes the non-positive parts)
// The order of the additions is kept exactly as generated, because a
// min16float sum is sensitive to evaluation order.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the first tap set.
	acc += mul(s0_0, M4(-1.173e-02, 2.762e-03, -2.225e-03, -6.814e-03, 8.328e-02, -1.275e-02, 6.091e-02, -6.470e-02, -6.067e-02, -1.086e-01, 7.501e-02, 1.227e-01, -1.551e-02, -1.728e-02, -2.694e-02, 7.490e-02));
	acc += mul(s0_1, M4(5.326e-02, 1.003e-02, 3.989e-02, -1.908e-03, -4.580e-02, -4.303e-03, 4.333e-02, 8.324e-02, 8.170e-01, 8.040e-01, -3.975e-01, -1.034e+00, 1.362e-01, 3.776e-04, -1.102e-02, -5.030e-02));
	acc += mul(s0_2, M4(-6.068e-02, 6.212e-02, -4.979e-02, 9.626e-03, 1.301e-02, -2.045e-02, 1.798e-02, 2.091e-02, -2.290e-01, 3.612e-01, -7.014e-02, 1.669e-01, -5.191e-03, 1.304e-02, 9.444e-05, -2.137e-02));
	acc += mul(s0_3, M4(-3.235e-02, -6.238e-02, 3.894e-02, 5.893e-02, -3.530e-02, -1.063e-01, 8.668e-02, 1.232e-02, -3.851e-02, 2.952e-02, 6.132e-02, -5.755e-02, 8.317e-02, 8.340e-02, -8.227e-02, 6.481e-03));
	acc += mul(s0_4, M4(2.118e-02, 2.725e-01, -1.393e-01, -2.377e-01, 4.872e-01, 2.235e-01, -1.746e-02, -3.662e-01, -3.945e-01, -1.862e-01, -9.132e-02, 8.777e-02, -5.084e-01, -3.300e-01, -3.443e-02, 4.203e-01));
	acc += mul(s0_5, M4(1.165e-01, -1.743e-01, 4.169e-03, -1.518e-01, 1.174e-01, -3.314e-02, 2.295e-02, -9.160e-02, -1.854e-01, -6.999e-02, -6.985e-02, 4.875e-04, -1.147e-01, 1.722e-01, -2.588e-02, 1.185e-01));
	acc += mul(s0_6, M4(-8.881e-03, 1.907e-03, 9.002e-03, 8.085e-03, -8.728e-03, -1.074e-01, 7.035e-02, 6.519e-02, 4.323e-02, -4.675e-02, 4.382e-02, 1.091e-02, 3.357e-02, 4.384e-02, -8.031e-03, -1.945e-02));
	acc += mul(s0_7, M4(-7.981e-02, 1.492e-02, -9.399e-02, -3.750e-02, -1.274e-01, -3.235e-02, -3.169e-02, 6.420e-02, 4.304e-02, 9.302e-02, 1.250e-02, 3.906e-03, 1.752e-01, -1.211e-02, 9.058e-02, -6.273e-02));
	acc += mul(s0_8, M4(-1.290e-02, -4.309e-02, 3.384e-02, 3.819e-02, -3.309e-02, 3.986e-02, 3.783e-03, 5.361e-02, 5.473e-02, 1.574e-02, -2.385e-02, -7.630e-02, -1.778e-02, 1.375e-02, -2.936e-02, -1.778e-02));

	// Contributions of the second tap set.
	acc += mul(s1_0, M4(1.219e-01, 1.166e-02, -5.932e-02, 1.191e-02, -2.487e-03, -5.945e-02, 6.637e-02, 5.775e-02, -1.705e-02, 5.538e-02, -5.130e-02, -3.602e-02, 5.461e-02, -1.253e-01, 6.953e-02, 1.066e-01));
	acc += mul(s1_1, M4(6.504e-01, -9.638e-01, 1.371e+00, 5.682e-02, 1.583e-02, -2.371e-02, 5.201e-02, 3.845e-02, 3.478e-02, -1.477e-01, 1.763e-01, 5.129e-02, 2.992e-01, -3.335e-01, 2.490e-02, 4.873e-01));
	acc += mul(s1_2, M4(2.415e-02, 8.838e-02, -1.519e-01, 9.012e-02, -6.676e-02, 3.422e-02, -2.380e-02, 5.608e-02, -1.744e-01, -9.595e-02, -7.627e-02, -5.823e-02, -9.466e-02, 5.554e-02, -1.024e-01, -1.763e-01));
	acc += mul(s1_3, M4(8.380e-02, -7.972e-02, 8.813e-02, 3.371e-02, 5.392e-03, 4.385e-02, 1.207e-02, -5.728e-02, -3.427e-03, -2.027e-03, 1.211e-03, -7.897e-03, 3.360e-02, 4.603e-02, -1.240e-02, -2.219e-02));
	acc += mul(s1_4, M4(-6.699e-01, -3.512e-01, -2.153e-01, 3.218e-01, -5.100e-01, 4.324e-03, 2.713e-01, -2.073e-01, 1.547e-01, -2.123e-03, 7.928e-02, -5.698e-02, 2.450e-02, -4.866e-02, 9.436e-02, 7.900e-02));
	acc += mul(s1_5, M4(1.609e-01, -7.910e-02, 1.112e-01, -2.959e-02, -3.877e-01, -2.803e-01, -1.071e-01, -6.881e-03, 1.922e-02, 2.433e-02, -3.581e-02, -5.264e-02, -3.287e-01, -1.037e-02, -6.159e-02, 8.219e-02));
	acc += mul(s1_6, M4(-4.263e-02, -6.372e-02, 2.607e-02, 5.285e-02, -6.156e-02, -7.837e-02, 7.299e-03, 8.959e-02, -8.706e-03, -1.642e-02, 1.825e-02, 1.850e-02, 2.735e-02, 2.413e-02, -3.236e-02, -9.612e-03));
	acc += mul(s1_7, M4(-5.849e-02, 1.530e-01, -6.767e-02, -1.392e-02, -3.430e-01, -1.851e-01, -1.013e-01, 2.465e-01, -1.715e-02, 4.970e-03, -1.850e-02, -4.214e-03, 1.889e-02, -5.787e-02, 7.154e-02, 9.237e-02));
	acc += mul(s1_8, M4(-2.084e-02, -2.484e-01, 5.767e-02, -2.550e-02, -9.126e-02, 4.292e-01, 1.983e-02, 2.979e-01, -3.807e-03, -3.367e-03, 1.835e-03, 8.694e-03, -9.074e-02, 4.820e-02, -2.886e-02, 5.975e-02));

	// Bias.
	acc += V4(5.508e-03, 4.690e-03, -5.708e-04, -7.674e-03);
	return acc;
}
|
||||
|
||||
// Pass 3 ("conv2") entry point. Each thread handles one texel of the
// input-sized grid: it reads the 3x3 neighbourhood of t1 through l0(), splits
// every tap into its non-negative and non-positive parts, and stores f0() of
// those eighteen vectors into t0.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro expands to them.
	float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		// This thread falls outside the input-sized area.
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	V4 p0 = l0(-1.0, -1.0), p1 = l0(0.0, -1.0), p2 = l0(1.0, -1.0);
	V4 p3 = l0(-1.0, 0.0), p4 = l0(0.0, 0.0), p5 = l0(1.0, 0.0);
	V4 p6 = l0(-1.0, 1.0), p7 = l0(0.0, 1.0), p8 = l0(1.0, 1.0);

	// Non-positive part of every tap, taken before the taps are clamped below.
	V4 n0 = -max(-p0, 0.0), n1 = -max(-p1, 0.0), n2 = -max(-p2, 0.0);
	V4 n3 = -max(-p3, 0.0), n4 = -max(-p4, 0.0), n5 = -max(-p5, 0.0);
	V4 n6 = -max(-p6, 0.0), n7 = -max(-p7, 0.0), n8 = -max(-p8, 0.0);

	// Reduce the taps themselves to their non-negative part.
	p0 = max(p0, 0.0); p1 = max(p1, 0.0); p2 = max(p2, 0.0);
	p3 = max(p3, 0.0); p4 = max(p4, 0.0); p5 = max(p5, 0.0);
	p6 = max(p6, 0.0); p7 = max(p7, 0.0); p8 = max(p8, 0.0);

	t0[texel] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);
}
|
||||
|
||||
// Pass 4 ("out-shuffle") declaration: inputs INPUT and t0, output OUTPUT.
//!PASS 4
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
// l0(x, y): t0 sampled via O() at offset float2(x, y), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 4 ("out-shuffle") kernel. Accumulates, in a fixed order, mul(v, W) for
// each of the eighteen input vectors with its own constant 4x4 matrix, adds a
// constant bias vector and returns tanh() of the sum.
//   s0_0..s0_8 - first set of nine 3x3 taps (Pass4 passes the non-negative parts)
//   s1_0..s1_8 - second set of nine taps (Pass4 passes the non-positive parts)
// The order of the additions is kept exactly as generated, because a
// min16float sum is sensitive to evaluation order.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the first tap set.
	acc += mul(s0_0, M4(-1.841e-04, -5.677e-02, 9.249e-03, -8.726e-03, 4.041e-02, -1.295e-01, 1.154e-01, 2.765e-02, 1.833e-01, -8.427e-02, 1.078e-01, -1.432e-01, 1.068e-01, -1.222e-01, 2.535e-02, 5.316e-02));
	acc += mul(s0_1, M4(-3.609e-03, 5.812e-02, -4.650e-02, -2.093e-02, -3.442e-02, 7.643e-02, 1.424e-02, 7.195e-02, 1.552e-01, -8.291e-01, 1.547e-01, 4.354e-01, -2.851e-02, 1.023e-01, -8.481e-03, -6.567e-02));
	acc += mul(s0_2, M4(1.724e-02, -1.165e-02, 1.007e-02, -3.008e-02, -9.814e-04, -2.007e-02, -5.905e-03, 6.714e-03, -1.736e-01, 2.035e-01, -1.333e-01, 1.250e-01, -9.118e-03, -4.989e-02, 2.142e-02, -4.038e-03));
	acc += mul(s0_3, M4(7.885e-02, -8.350e-02, -6.025e-03, -1.139e-01, -8.380e-02, -6.836e-02, -5.589e-01, -4.614e-01, -6.742e-01, 2.118e-01, -4.442e-01, 2.197e-01, -5.873e-02, 1.902e-01, -4.687e-01, -4.712e-01));
	acc += mul(s0_4, M4(-4.506e-01, 2.396e-01, -1.350e-02, 4.072e-01, 3.249e-01, 9.930e-02, 1.576e-02, -2.456e-01, 1.506e+00, 6.047e-02, 8.841e-01, -1.927e+00, -4.337e-01, -5.801e-01, 3.334e-01, 8.276e-02));
	acc += mul(s0_5, M4(5.049e-02, -1.870e-01, 7.413e-02, -2.569e-02, -2.152e-02, 1.139e-01, -3.874e-02, 1.634e-02, -1.325e-01, 4.002e-02, -1.874e-01, 1.204e-01, 2.267e-02, 1.380e-02, -1.055e-02, 5.504e-02));
	acc += mul(s0_6, M4(-2.855e-02, 1.255e-02, 3.941e-02, 4.466e-03, 4.814e-05, -9.003e-03, 1.231e-01, 5.676e-02, 5.020e-02, -5.407e-02, -1.951e-01, 4.240e-02, 3.525e-02, -1.021e-01, 4.517e-01, 2.399e-01));
	acc += mul(s0_7, M4(-5.781e-02, -4.964e-02, -3.981e-01, -1.716e-01, 3.430e-02, -1.644e-02, 2.352e-01, 1.938e-01, 1.266e-01, -1.061e-01, 7.754e-01, 5.337e-01, 2.664e-01, 3.669e-01, -1.113e+00, -1.742e-01));
	acc += mul(s0_8, M4(2.948e-02, 3.723e-02, 2.739e-02, -5.215e-02, -1.542e-02, -2.173e-02, -1.944e-02, 1.856e-02, -4.535e-02, 1.163e-02, -5.014e-02, 8.660e-02, 1.421e-01, 2.314e-01, 1.171e-02, -4.975e-01));

	// Contributions of the second tap set.
	acc += mul(s1_0, M4(-4.408e-02, -3.573e-02, 3.842e-02, 2.571e-02, 2.872e-01, -4.960e-01, 2.569e-01, -6.254e-02, 2.158e-02, -6.452e-02, 7.495e-02, 1.997e-02, 4.094e-02, -9.741e-02, 3.542e-02, -8.115e-03));
	acc += mul(s1_1, M4(3.480e-02, 1.949e-04, 1.780e-02, 4.483e-02, -2.814e-01, 4.229e-01, -5.482e-02, 1.512e-02, -3.120e-02, 3.945e-02, 4.626e-02, 7.013e-02, -6.686e-03, 5.832e-02, -4.408e-02, -1.262e-02));
	acc += mul(s1_2, M4(-9.847e-03, 1.973e-03, 1.457e-02, 2.290e-02, 4.741e-02, 2.270e-02, 8.902e-04, 1.152e-02, -2.473e-02, -1.948e-02, -3.475e-03, 4.431e-02, 2.044e-02, 1.571e-04, 9.470e-03, -2.825e-02));
	acc += mul(s1_3, M4(5.918e-02, -1.939e-02, -4.628e-02, -7.774e-02, -3.040e-01, 8.634e-02, -5.254e-01, -6.906e-01, -1.218e-01, -6.178e-02, -3.115e-01, -2.697e-01, -2.402e-02, -2.149e-02, -3.878e-01, -3.453e-01));
	acc += mul(s1_4, M4(2.920e-01, 3.711e-01, -2.753e-01, -4.654e-02, 1.379e-01, 3.908e-01, -4.798e-01, 6.668e-01, 4.870e-01, -1.634e-01, -7.790e-02, -2.683e-01, -4.834e-01, -1.822e-02, -8.492e-03, 7.620e-02));
	acc += mul(s1_5, M4(-4.786e-02, 2.412e-02, 4.992e-02, -1.913e-01, 9.058e-02, -4.485e-02, 8.249e-02, -9.418e-02, 3.555e-02, 3.543e-01, -1.140e-01, -1.358e-01, 5.079e-02, -2.007e-01, 6.132e-02, -2.373e-03));
	acc += mul(s1_6, M4(6.553e-03, -7.804e-03, 8.569e-02, 4.875e-02, 5.085e-02, 1.728e-02, 6.949e-02, 1.313e-01, 1.825e-02, -5.557e-02, -7.548e-03, -5.534e-02, 7.059e-02, 4.382e-02, 2.807e-01, 1.919e-01));
	acc += mul(s1_7, M4(-1.071e-01, -3.709e-02, -4.757e-01, -1.943e-01, 8.182e-02, -3.334e-02, 4.170e-01, 6.716e-02, 1.563e-01, 1.382e-01, 7.441e-01, 4.082e-01, -9.101e-02, -3.943e-02, -5.142e-01, -1.910e-01));
	acc += mul(s1_8, M4(4.255e-03, 4.204e-02, 5.834e-02, -6.508e-02, -3.675e-02, 1.165e-02, -2.694e-02, -2.212e-02, -3.036e-02, -4.393e-02, 1.855e-03, 1.909e-01, 3.812e-02, 3.309e-02, 3.942e-02, -7.422e-02));

	// Bias, then the output non-linearity.
	acc += V4(-1.734e-03, -1.825e-03, -1.635e-03, -1.665e-03);
	return tanh(acc);
}
|
||||
|
||||
// Pass 4 ("out-shuffle") entry point. Each thread produces a 2x2 quad of
// OUTPUT pixels: it evaluates f0() once on the 3x3 neighbourhood of t0, then
// for each of the four pixels samples INPUT with the linear sampler SL,
// converts it with rgb2yuv, adds one component of the f0() result to the
// first channel (saturated), and converts back with yuv2rgb.
//   blockStart - origin of the block covered by this thread group
//   tid        - thread id; only tid.x is used, converted to a 2D offset by Rmp8x8()
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the O() macro expands to them.
	float2 pt = float2(GetInputPt());
	// First pixel of the 2x2 quad this thread writes.
	uint2 quad = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 outExtent = GetOutputSize();
	if (any(quad >= outExtent)) {
		return;
	}
	// Centre of the input-sized texel that the quad index halves to.
	float2 pos = ((quad >> 1) + 0.5) * pt;

	// 3x3 window, one row of offsets per line: y = -1, 0, +1.
	V4 p0 = l0(-1.0, -1.0), p1 = l0(0.0, -1.0), p2 = l0(1.0, -1.0);
	V4 p3 = l0(-1.0, 0.0), p4 = l0(0.0, 0.0), p5 = l0(1.0, 0.0);
	V4 p6 = l0(-1.0, 1.0), p7 = l0(0.0, 1.0), p8 = l0(1.0, 1.0);

	// Non-positive part of every tap, taken before the taps are clamped below.
	V4 n0 = -max(-p0, 0.0), n1 = -max(-p1, 0.0), n2 = -max(-p2, 0.0);
	V4 n3 = -max(-p3, 0.0), n4 = -max(-p4, 0.0), n5 = -max(-p5, 0.0);
	V4 n6 = -max(-p6, 0.0), n7 = -max(-p7, 0.0), n8 = -max(-p8, 0.0);

	// Reduce the taps themselves to their non-negative part.
	p0 = max(p0, 0.0); p1 = max(p1, 0.0); p2 = max(p2, 0.0);
	p3 = max(p3, 0.0); p4 = max(p4, 0.0); p5 = max(p5, 0.0);
	p6 = max(p6, 0.0); p7 = max(p7, 0.0); p8 = max(p8, 0.0);

	// One value per pixel of the quad: x, y, z, w.
	const V4 residual = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 outPt = float2(GetOutputPt());

	// The quad is walked (0,0) -> (+1,0) -> (+1,+1) -> (0,+1). `pos` is updated
	// incrementally, exactly as generated, so every sample position is
	// bit-identical to the original sequence.

	// Pixel (0, 0): residual.x. Step back half an output texel from the centre.
	pos -= 0.5f * outPt;
	float3 ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.x), ycc.yz)), 1);

	// Pixel (+1, 0): residual.y.
	quad.x += 1;
	pos.x += outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.y), ycc.yz)), 1);

	// Pixel (+1, +1): residual.w.
	quad.y += 1;
	pos.y += outPt.y;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.w), ycc.yz)), 1);

	// Pixel (0, +1): residual.z.
	quad.x -= 1;
	pos.x -= outPt.x;
	ycc = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[quad] = float4(mul(yuv2rgb, float3(saturate(ycc.r + residual.z), ycc.yz)), 1);
}
|
||||
413
src/Effects/CuNNy/CuNNy-3x4C-NVL-DN.hlsl
Normal file
413
src/Effects/CuNNy/CuNNy-3x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,413 @@
|
|||
// CuNNy 3x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Effect header: Magpie effect, format version 4, sort name CuNNy-DN-D04N03.
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-DN-D04N03
|
||||
|
||||
// INPUT: texture declared without size or format directives.
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
// OUTPUT: declared at twice the input width and twice the input height.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
// SP: sampler with POINT filtering (used by the O() macro below).
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
// SL: sampler with LINEAR filtering.
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
// Common macros.
// O(t, p): samples texture t with SP at `pos + p * pt`, mip level 0. It relies
//          on locals named `pos` and `pt` existing at the expansion site.
// V4 / M4: aliases for min16float4 / min16float4x4.
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
// t0: input-sized texture, format R8G8B8A8_SNORM.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
// t1: second input-sized texture, format R8G8B8A8_SNORM.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
// Pass-1 tap: samples INPUT through O at offset float2(x, y), reduces the RGB
// value to one scalar with a fixed dot product plus a constant, and converts
// the result to min16float.
#define l0(x, y) min16float((dot(float3(-2.683e-01, -5.217e-01, -1.382e-01), O(INPUT, float2(x, y)).rgb) + 7.973e-01))
|
||||
|
||||
// Pass-1 kernel: each of the nine scalar taps scales its own 4-component weight
// vector; the products are summed in the order written and a constant vector is
// added last. The caller in this pass supplies the taps row-major over a 3x3
// window (s0_0 = offset (-1,-1) ... s0_8 = offset (1,1)).
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(1.850e-01, -2.860e-02, -5.321e-01, 2.390e-03) * s0_0;
|
||||
r += V4(-4.299e-01, -2.946e-02, -1.180e-01, -5.652e-02) * s0_1;
|
||||
r += V4(-4.798e-01, -2.276e-02, 3.201e-02, 4.870e-02) * s0_2;
|
||||
r += V4(2.783e-01, -2.262e-03, -1.864e-01, 1.793e-01) * s0_3;
|
||||
r += V4(9.435e-04, 8.115e-01, 7.806e-01, -7.793e-01) * s0_4;
|
||||
r += V4(2.180e-01, -2.564e-05, 2.774e-03, -7.015e-02) * s0_5;
|
||||
r += V4(1.479e-03, -4.675e-02, 3.323e-02, 3.392e-01) * s0_6;
|
||||
r += V4(1.203e-01, 1.509e-02, 5.239e-02, 3.194e-01) * s0_7;
|
||||
r += V4(7.680e-02, -4.310e-02, -7.203e-02, 1.255e-02) * s0_8;
|
||||
// Constant (bias) term, added after all taps.
r += V4(3.156e-02, 7.379e-02, 1.078e-02, -5.510e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 ("in"): handles one pixel of the input-sized grid. Evaluates the l0 tap
// at the nine offsets of a 3x3 window around the pixel and stores the
// four-channel result of f0 in t0.
//   blockStart - added to the per-thread offset to form the pixel coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block
void Pass1(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;

    // Nothing to write for coordinates outside the input.
    uint2 dims = GetInputSize();
    if (any(gxy >= dims)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // Row-major 3x3 window: tap[k] sits at offset (k % 3 - 1, k / 3 - 1).
    min16float tap[9];
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        tap[k] = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
    }

    t0[gxy] = f0(tap[0], tap[1], tap[2],
                 tap[3], tap[4], tap[5],
                 tap[6], tap[7], tap[8]);
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// Pass-2 tap: samples t0 through O at offset float2(x, y) and converts it to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// conv1 kernel: every s0_k and s1_k vector is multiplied by its own 4x4 weight
// matrix (mul with the vector as first argument), the products are summed in
// the order written, and a constant vector is added last.
// The caller in this pass passes s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0),
// with k running row-major over the 3x3 window of taps.
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.949e-01, -1.247e-01, -7.307e-02, 8.783e-02, -4.773e-02, 6.012e-02, 8.043e-02, -8.489e-02, 6.760e-02, -7.809e-02, -4.745e-02, -1.304e-02, -1.402e-01, -1.248e-01, 3.334e-01, -1.498e-01));
|
||||
r += mul(s0_1, M4(7.053e-02, 9.895e-02, 1.655e-01, 2.251e-01, 3.511e-02, -1.010e-01, -2.736e-01, 1.174e-01, -2.551e-01, 1.100e-01, 1.518e-01, -4.343e-02, -9.293e-01, 5.327e-01, -2.723e-01, 4.006e-01));
|
||||
r += mul(s0_2, M4(-2.390e-02, 8.154e-03, -2.332e-02, -3.708e-02, 2.814e-02, 5.506e-02, -2.627e-01, -8.081e-02, -1.062e-01, -6.819e-02, -9.498e-02, -2.749e-01, -2.457e-01, 6.868e-01, 6.527e-03, 7.676e-01));
|
||||
r += mul(s0_3, M4(2.704e-01, 4.055e-02, -4.756e-01, 2.506e-01, -9.498e-02, 5.838e-02, 1.733e-01, 3.420e-03, -7.051e-02, -8.233e-02, -3.006e-01, 6.824e-02, -1.308e-01, 1.196e-01, 2.560e-01, 8.304e-02));
|
||||
r += mul(s0_4, M4(4.190e-01, -1.207e-01, 2.708e-01, -6.375e-01, 1.740e-01, 1.955e-03, -1.816e-01, -7.933e-02, -9.308e-01, 1.333e-01, -1.335e-01, -1.401e-01, 3.447e-01, 3.389e-01, 6.660e-01, -3.387e-01));
|
||||
r += mul(s0_5, M4(7.310e-02, 1.403e-02, 8.114e-02, 7.400e-02, -2.552e-02, -1.607e-01, -1.208e-01, -3.943e-02, -2.743e-02, -7.229e-03, -1.749e-03, 3.062e-01, 1.429e-01, 8.105e-01, 3.562e-01, 4.580e-01));
|
||||
r += mul(s0_6, M4(2.115e-01, -1.686e-01, -1.948e-01, -1.191e-01, -5.798e-02, 3.493e-02, 8.264e-02, 1.579e-01, -1.081e-01, -1.775e-01, -8.196e-02, -2.085e-01, 6.791e-02, 1.652e-02, -4.933e-03, 2.833e-02));
|
||||
r += mul(s0_7, M4(-2.160e-01, -3.858e-01, -8.407e-01, -1.091e-01, 8.415e-03, 8.626e-02, 2.340e-01, 9.177e-02, -4.697e-01, -6.623e-02, -5.176e-01, 6.762e-02, -3.437e-03, 6.570e-02, 7.630e-02, 8.988e-02));
|
||||
r += mul(s0_8, M4(6.527e-02, -6.320e-02, 1.192e-02, -1.196e-01, -1.605e-02, -9.294e-03, 1.955e-01, -2.356e-02, -3.582e-02, 1.377e-02, 9.253e-02, -2.362e-02, 3.578e-02, 1.822e-01, 3.329e-01, 1.489e-01));
|
||||
r += mul(s1_0, M4(1.154e-01, -1.822e-01, -2.122e-01, 3.031e-02, 6.550e-01, -4.855e-02, 6.554e-02, 4.432e-02, 1.671e-02, -4.477e-02, -9.428e-03, 4.413e-03, -3.185e-02, -1.529e-01, -1.222e-01, 6.523e-02));
|
||||
r += mul(s1_1, M4(-4.920e-02, -1.697e-02, 4.141e-02, 1.997e-01, 6.972e-01, -5.157e-01, 2.031e-01, 2.829e-02, -5.005e-02, 2.335e-01, 2.985e-01, 6.871e-02, -5.232e-01, 2.146e-02, -1.418e+00, 2.193e-01));
|
||||
r += mul(s1_2, M4(-6.472e-02, 2.595e-02, -2.610e-02, -2.279e-02, 4.165e-01, -7.745e-01, 1.261e-01, -3.845e-01, 3.279e-02, 2.445e-02, 1.796e-01, -2.581e-01, -3.838e-01, 6.280e-02, -4.893e-01, -1.475e-01));
|
||||
r += mul(s1_3, M4(9.330e-02, 1.742e-01, -1.685e-01, 2.376e-02, -9.586e-01, -1.236e+00, -7.271e-01, -7.674e-01, 2.500e-01, -3.709e-02, -1.303e-01, 1.490e-01, -2.746e-01, -1.376e-01, -2.321e-02, -1.967e-02));
|
||||
r += mul(s1_4, M4(3.660e-01, 4.772e-02, 5.524e-01, -2.804e-01, -2.756e+00, -1.336e+00, 2.038e-01, 2.593e+00, 2.156e-01, 3.281e-01, 3.152e-01, 8.064e-01, 3.970e-01, -1.379e-01, -7.518e-02, -2.723e-01));
|
||||
r += mul(s1_5, M4(5.214e-03, 1.695e-02, 1.024e-01, 1.333e-01, -2.250e-01, -1.298e+00, 4.673e-01, 1.317e+00, 3.036e-01, -1.273e-01, 2.900e-01, 2.249e-02, -1.870e-01, -1.124e-01, -5.879e-01, 6.314e-02));
|
||||
r += mul(s1_6, M4(-8.225e-02, -1.149e-01, 1.598e-04, -3.662e-01, -8.572e-02, -8.909e-01, 9.891e-02, 1.818e-01, 1.715e-01, -2.348e-01, 1.178e-01, -6.289e-02, 1.522e-02, 1.973e-02, 3.707e-02, 2.911e-02));
|
||||
r += mul(s1_7, M4(-6.380e-02, 8.661e-02, -2.666e-01, 9.586e-02, -1.257e+00, -2.231e+00, -1.232e+00, 5.642e-01, 5.730e-02, -3.294e-01, -1.151e-01, 2.382e-01, 4.529e-02, 4.927e-02, 9.893e-02, 8.365e-02));
|
||||
r += mul(s1_8, M4(1.906e-02, -8.920e-02, 8.931e-02, -6.752e-02, -3.680e-01, -1.282e+00, -1.388e-01, -7.545e-02, 6.262e-02, -1.695e-01, 2.278e-01, -3.066e-01, -7.412e-02, 1.145e-02, 4.667e-02, -4.205e-04));
|
||||
// Constant (bias) term, added after all taps.
r += V4(1.427e-02, -1.982e-02, 4.114e-03, -2.883e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 ("conv1"): handles one pixel of the input-sized grid. Reads the 3x3
// window of t0 taps around the pixel, splits every tap into a non-negative and
// a non-positive part, and stores f0's result in t1.
//   blockStart - added to the per-thread offset to form the pixel coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block
void Pass2(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;

    // Nothing to write for coordinates outside the input.
    uint2 dims = GetInputSize();
    if (any(gxy >= dims)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // Row-major 3x3 window: index k sits at offset (k % 3 - 1, k / 3 - 1).
    V4 posPart[9];  // max(tap, 0)
    V4 negPart[9];  // -max(-tap, 0)
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        V4 tap = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
        negPart[k] = -max(-tap, 0.0);
        posPart[k] = max(tap, 0.0);
    }

    t1[gxy] = f0(posPart[0], posPart[1], posPart[2],
                 posPart[3], posPart[4], posPart[5],
                 posPart[6], posPart[7], posPart[8],
                 negPart[0], negPart[1], negPart[2],
                 negPart[3], negPart[4], negPart[5],
                 negPart[6], negPart[7], negPart[8]);
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// Pass-3 tap: samples t1 through O at offset float2(x, y) and converts it to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// conv2 kernel: every s0_k and s1_k vector is multiplied by its own 4x4 weight
// matrix (mul with the vector as first argument), the products are summed in
// the order written, and a constant vector is added last.
// The caller in this pass passes s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0),
// with k running row-major over the 3x3 window of taps.
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.965e-01, -1.919e-01, 9.202e-02, 8.775e-03, -4.948e-02, 1.061e-01, -3.754e-02, -1.900e-01, -2.114e-01, 1.267e-01, 1.989e-02, 2.570e-02, 4.634e-03, -2.718e-01, 2.171e-01, 1.512e-01));
|
||||
r += mul(s0_1, M4(-5.527e-01, -4.825e-01, 4.325e-01, 4.447e-01, -6.577e-02, 5.161e-01, 3.286e-02, -3.800e-01, 2.625e-02, 3.835e-01, -7.794e-02, -5.489e-02, -2.647e-01, -4.952e-01, 1.587e-01, 1.471e-01));
|
||||
r += mul(s0_2, M4(-3.687e-01, -1.096e-01, 1.849e-01, -6.915e-02, 2.257e-01, 2.760e-01, -8.875e-02, -8.871e-02, -8.394e-02, -6.714e-02, 5.322e-03, -3.252e-01, -7.885e-02, -2.723e-01, 6.149e-02, 2.998e-01));
|
||||
r += mul(s0_3, M4(1.606e-01, -1.199e-01, 3.573e-01, 2.833e-02, 6.514e-03, -2.242e-02, -6.231e-02, 6.702e-02, -8.717e-02, -2.227e-01, -1.626e-01, 5.313e-02, -1.411e-01, -2.445e-02, 1.194e-01, -1.101e-01));
|
||||
r += mul(s0_4, M4(-1.127e+00, 1.823e-01, 1.358e-01, -1.618e-01, -4.171e-04, -7.771e-02, 2.147e-01, 6.493e-01, 4.989e-01, 3.955e-01, -1.017e-01, -2.861e-01, 3.878e-01, -6.653e-01, -4.968e-01, -5.063e-01));
|
||||
r += mul(s0_5, M4(-2.270e-01, -3.965e-01, -2.794e-02, 1.487e-01, -2.667e-01, -1.410e-02, 1.475e-01, -4.992e-01, -1.071e-01, 2.096e-01, 1.159e-01, -6.073e-02, -7.157e-02, -2.446e-01, -4.807e-02, 1.968e-01));
|
||||
r += mul(s0_6, M4(8.199e-02, 8.336e-02, -3.090e-02, -1.287e-02, -6.954e-02, -7.544e-02, 1.272e-01, 7.930e-02, -3.647e-02, -2.685e-02, -4.235e-02, 3.214e-02, -4.526e-02, 1.479e-01, -4.963e-02, -3.035e-02));
|
||||
r += mul(s0_7, M4(-2.012e-02, -1.497e-02, -2.952e-01, -6.026e-02, 2.135e-03, 2.979e-02, -2.713e-02, 7.951e-03, -8.069e-02, -2.374e-01, 1.865e-01, 1.048e-01, -9.076e-02, 6.683e-02, 9.576e-02, -2.432e-02));
|
||||
r += mul(s0_8, M4(1.455e-01, 2.613e-01, -1.616e-01, -3.564e-01, 1.229e-01, -3.778e-02, 3.316e-02, 5.927e-02, -1.831e-01, -1.388e-01, 5.986e-02, 2.083e-02, -1.368e-03, 2.394e-01, -1.623e-01, -2.768e-02));
|
||||
r += mul(s1_0, M4(7.711e-03, -6.696e-04, -3.229e-02, 1.549e-02, -1.596e-01, 2.068e-01, -6.162e-02, -9.571e-02, -1.500e-01, 1.743e-01, 2.746e-02, -5.845e-02, -7.649e-03, -4.265e-03, 4.154e-03, 3.950e-03));
|
||||
r += mul(s1_1, M4(2.764e-01, -4.505e-02, 4.280e-02, 6.044e-02, 3.396e-02, 2.750e-01, -1.910e-01, -2.153e-01, 9.633e-02, -2.194e-02, -2.131e-01, -1.181e-01, -1.343e-01, 6.123e-02, 1.904e-02, -6.568e-02));
|
||||
r += mul(s1_2, M4(-3.643e-01, -1.709e-02, 1.528e-01, -1.405e-01, 3.307e-01, -1.979e-03, -1.819e-01, 7.635e-02, 1.266e-01, 2.162e-01, -7.492e-02, -9.075e-02, 4.120e-02, 1.521e-01, -2.790e-03, -4.330e-02));
|
||||
r += mul(s1_3, M4(1.913e-02, -5.373e-02, 5.748e-02, -1.443e-02, -2.776e-01, -1.162e-01, -1.994e-01, 1.430e-01, 9.058e-02, -3.720e-02, -3.585e-02, -8.516e-02, -2.228e-02, 7.507e-02, -9.620e-02, -1.013e-01));
|
||||
r += mul(s1_4, M4(-3.592e-01, 1.415e-01, 1.018e+00, -1.555e-01, 5.378e-01, 8.818e-02, 2.190e-01, 1.997e-01, -1.128e-01, 3.331e-02, -1.410e-01, 2.844e-01, 4.756e-01, -5.850e-02, -3.757e-01, -1.716e-01));
|
||||
r += mul(s1_5, M4(2.636e-02, -3.596e-01, -3.280e-01, 2.027e-01, 3.000e-01, -2.297e-01, 4.282e-02, 1.776e-01, 5.222e-02, 1.751e-01, 4.529e-02, -8.347e-02, -3.409e-01, -2.640e-01, 1.753e-01, -5.672e-01));
|
||||
r += mul(s1_6, M4(-1.699e-02, 4.941e-02, -2.642e-02, -1.406e-04, -1.655e-01, -1.464e-02, -4.353e-02, 1.946e-01, 6.067e-02, -1.429e-01, 1.170e-01, -4.644e-02, -6.567e-02, -2.264e-02, 6.666e-02, 9.009e-02));
|
||||
r += mul(s1_7, M4(7.805e-02, 2.173e-02, -3.276e-01, 2.004e-03, -7.789e-02, -1.466e-02, -1.560e-01, -1.126e-01, -3.823e-02, -2.446e-03, 1.465e-01, -2.744e-01, -2.129e-01, -2.141e-02, 4.456e-01, 1.240e-01));
|
||||
r += mul(s1_8, M4(1.315e-02, 2.686e-01, -1.987e-01, -2.093e-01, 3.184e-02, -8.723e-02, 3.012e-01, 3.580e-01, 1.198e-02, -2.655e-01, 1.455e-01, 7.602e-02, -4.605e-02, 3.276e-01, -2.036e-01, -2.590e-01));
|
||||
// Constant (bias) term, added after all taps.
r += V4(-1.292e-02, 8.156e-04, -2.055e-03, -3.100e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 ("conv2"): handles one pixel of the input-sized grid. Reads the 3x3
// window of t1 taps around the pixel, splits every tap into a non-negative and
// a non-positive part, and stores f0's result in t0.
//   blockStart - added to the per-thread offset to form the pixel coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block
void Pass3(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;

    // Nothing to write for coordinates outside the input.
    uint2 dims = GetInputSize();
    if (any(gxy >= dims)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // Row-major 3x3 window: index k sits at offset (k % 3 - 1, k / 3 - 1).
    V4 posPart[9];  // max(tap, 0)
    V4 negPart[9];  // -max(-tap, 0)
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        V4 tap = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
        negPart[k] = -max(-tap, 0.0);
        posPart[k] = max(tap, 0.0);
    }

    t0[gxy] = f0(posPart[0], posPart[1], posPart[2],
                 posPart[3], posPart[4], posPart[5],
                 posPart[6], posPart[7], posPart[8],
                 negPart[0], negPart[1], negPart[2],
                 negPart[3], negPart[4], negPart[5],
                 negPart[6], negPart[7], negPart[8]);
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// Pass-4 tap: samples t0 through O at offset float2(x, y) and converts it to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// conv3 kernel: every s0_k and s1_k vector is multiplied by its own 4x4 weight
// matrix (mul with the vector as first argument), the products are summed in
// the order written, and a constant vector is added last.
// The caller in this pass passes s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0),
// with k running row-major over the 3x3 window of taps.
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.151e-02, -4.754e-02, 3.454e-02, -1.338e-03, -4.337e-02, 4.608e-02, -1.116e-01, -2.296e-02, -2.839e-02, -3.878e-01, -2.317e-02, 5.774e-02, 4.317e-03, 6.680e-02, 6.325e-02, -1.449e-01));
|
||||
r += mul(s0_1, M4(-1.173e-01, -8.942e-02, -1.017e-01, 6.496e-02, 5.558e-02, 2.788e-02, 2.184e-02, -2.837e-03, -1.057e-01, -2.075e-01, -3.255e-02, -1.297e-02, -2.643e-02, -1.695e-02, -9.425e-02, 3.942e-02));
|
||||
r += mul(s0_2, M4(-1.773e-02, -4.118e-02, -2.141e-02, 4.282e-02, 4.234e-02, -1.221e-02, -3.375e-03, 4.469e-02, -2.586e-01, -1.112e-01, -7.688e-02, 3.426e-02, 8.170e-02, -2.355e-02, -3.737e-02, 3.004e-02));
|
||||
r += mul(s0_3, M4(2.192e-01, 1.955e+00, 2.012e-01, -2.598e-02, -7.453e-02, 5.510e-02, -1.517e-01, -2.571e-01, -2.182e-02, -2.345e-02, -5.767e-02, -5.534e-02, -1.996e-02, 2.329e-01, 4.447e-04, -1.111e-01));
|
||||
r += mul(s0_4, M4(3.476e-01, -4.368e-01, -1.180e-01, 5.371e-01, 5.294e-01, 1.509e-01, 2.456e-01, -7.875e-02, 2.055e-01, 9.732e-02, 1.285e-01, 5.178e-01, 3.256e-01, -2.842e-01, 4.421e-02, 3.426e-01));
|
||||
r += mul(s0_5, M4(6.119e-01, -1.393e-01, -1.144e-02, 2.438e-01, -5.126e-02, -1.049e-01, -7.847e-02, 9.942e-02, 5.371e-01, 9.985e-02, 9.193e-02, -3.067e-02, -1.962e-01, -4.272e-02, -7.821e-03, 2.557e-02));
|
||||
r += mul(s0_6, M4(1.224e-02, -5.098e-01, 3.052e-01, 5.332e-01, 2.249e-01, 4.201e-02, 5.423e-01, 1.106e-01, -1.056e-02, -4.091e-03, -1.267e-02, -5.280e-02, 1.898e-02, 9.430e-03, 1.470e-02, 7.235e-02));
|
||||
r += mul(s0_7, M4(-4.342e-01, 2.385e-01, -3.834e-02, -7.654e-02, -9.043e-01, -3.139e-01, -1.511e-01, 3.800e-01, -8.848e-02, -3.911e-02, -7.025e-03, -1.196e-02, -3.322e-03, -1.455e-01, 2.084e-02, 1.106e-01));
|
||||
r += mul(s0_8, M4(1.382e-01, -1.894e-01, -8.814e-02, 1.373e-01, 1.362e-01, -1.298e-01, -1.007e-01, 1.166e-01, -1.553e-02, 8.530e-02, 2.744e-02, -1.083e-01, -5.606e-02, 5.965e-02, 1.406e-02, -4.496e-02));
|
||||
r += mul(s1_0, M4(-4.828e-03, -1.035e-01, -5.021e-02, 1.972e-02, -9.942e-03, -3.057e-01, -7.373e-03, 4.274e-02, -3.475e-03, 4.653e-02, 9.115e-03, -5.794e-02, 1.170e-02, 1.322e-01, 1.195e-01, -2.535e-02));
|
||||
r += mul(s1_1, M4(-5.424e-02, -1.541e-01, -9.945e-02, 8.862e-02, -1.198e-01, -3.591e-05, 4.305e-02, -1.079e-01, 1.605e-02, -3.377e-02, -5.398e-02, 1.201e-02, 3.432e-02, 1.090e-02, 8.871e-02, 3.186e-02));
|
||||
r += mul(s1_2, M4(-1.108e-01, -3.481e-02, -1.616e-02, -4.136e-03, -3.382e-02, 1.836e-02, -3.071e-02, -3.186e-02, -1.014e-01, -1.412e-01, -7.790e-02, 9.763e-02, -1.624e-02, -2.520e-02, -2.152e-02, 2.524e-02));
|
||||
r += mul(s1_3, M4(3.337e-03, -1.439e-02, 2.317e-03, 2.097e-01, 5.091e-03, 4.138e-02, -5.988e-02, -2.348e-02, -5.626e-03, 1.695e-02, 2.371e-02, -1.652e-02, 8.541e-02, -1.851e-01, 1.130e+00, -1.181e-01));
|
||||
r += mul(s1_4, M4(1.184e-01, -3.385e-02, 2.659e-02, 3.233e-01, 2.333e-01, 1.694e-01, 1.915e-01, 1.162e-01, 4.309e-02, -3.793e-02, 1.412e-01, -1.345e-02, -6.074e-01, -2.408e-01, -1.306e-01, 1.033e-01));
|
||||
r += mul(s1_5, M4(3.452e-01, 1.401e-01, 3.650e-02, -4.950e-02, 1.755e-01, -1.210e-01, -1.041e-02, 1.281e-01, 4.262e-01, 2.166e-02, 3.851e-02, 1.295e-01, -1.910e-01, -2.029e-02, -2.151e-02, -1.537e-02));
|
||||
r += mul(s1_6, M4(4.989e-03, -5.730e-02, 5.803e-02, 2.946e-02, 1.825e-02, 2.660e-02, -4.900e-03, 3.848e-03, 1.078e-02, 1.823e-02, -4.751e-03, 4.219e-02, -1.024e-01, 7.721e-02, -6.709e-01, 8.423e-02));
|
||||
r += mul(s1_7, M4(-1.567e-01, 4.125e-02, -2.721e-02, -1.831e-01, 9.470e-03, -1.205e-01, 1.793e-02, 1.160e-01, -4.874e-02, -4.902e-02, -1.479e-01, 7.102e-02, 6.699e-01, -1.383e-01, 1.314e-01, 2.999e-01));
|
||||
r += mul(s1_8, M4(-2.625e-01, -9.735e-02, -6.038e-02, 3.588e-03, 2.247e-02, 4.993e-02, 1.171e-02, -2.071e-02, 2.066e-01, 2.852e-01, -5.781e-02, -3.231e-01, 6.922e-02, 8.960e-02, 9.107e-02, -2.880e-02));
|
||||
// Constant (bias) term, added after all taps.
r += V4(3.045e-03, 3.707e-03, -6.011e-03, -5.162e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 4 ("conv3"): handles one pixel of the input-sized grid. Reads the 3x3
// window of t0 taps around the pixel, splits every tap into a non-negative and
// a non-positive part, and stores f0's result in t1.
//   blockStart - added to the per-thread offset to form the pixel coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block
void Pass4(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;

    // Nothing to write for coordinates outside the input.
    uint2 dims = GetInputSize();
    if (any(gxy >= dims)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // Row-major 3x3 window: index k sits at offset (k % 3 - 1, k / 3 - 1).
    V4 posPart[9];  // max(tap, 0)
    V4 negPart[9];  // -max(-tap, 0)
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        V4 tap = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
        negPart[k] = -max(-tap, 0.0);
        posPart[k] = max(tap, 0.0);
    }

    t1[gxy] = f0(posPart[0], posPart[1], posPart[2],
                 posPart[3], posPart[4], posPart[5],
                 posPart[6], posPart[7], posPart[8],
                 negPart[0], negPart[1], negPart[2],
                 negPart[3], negPart[4], negPart[5],
                 negPart[6], negPart[7], negPart[8]);
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t1
|
||||
//!OUT OUTPUT
|
||||
|
||||
// Pass-5 tap: samples t1 through O at offset float2(x, y) and converts it to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Output kernel: every s0_k and s1_k vector is multiplied by its own 4x4 weight
// matrix (mul with the vector as first argument), the products are summed in
// the order written, a constant vector is added, and the sum is passed through
// tanh before being returned.
// The caller in this pass passes s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0),
// with k running row-major over the 3x3 window of taps, and uses the four
// returned components as per-pixel adjustments for a 2x2 block of OUTPUT.
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.116e-01, 1.402e-01, 1.439e-02, 5.091e-02, -1.526e-02, -2.562e-02, -1.193e-02, -1.365e-02, -6.156e-02, -3.463e-02, 2.155e-02, -2.192e-02, -2.937e-02, -1.072e-01, -4.538e-02, -3.302e-02));
|
||||
r += mul(s0_1, M4(-1.192e-02, -1.724e-02, 9.899e-03, -5.861e-03, -1.552e-02, 2.422e-02, 4.929e-03, 7.339e-03, 4.700e-02, 1.993e-01, -6.323e-02, 5.778e-02, 1.499e-01, 3.916e-01, -4.578e-02, -2.026e-02));
|
||||
r += mul(s0_2, M4(5.431e-03, 1.916e-03, -2.064e-03, -6.545e-04, -1.731e-02, -8.081e-02, 1.391e-02, -7.036e-03, 7.739e-02, -1.588e-01, 2.970e-02, 3.357e-02, 3.869e-02, -7.824e-02, 1.813e-02, -6.252e-02));
|
||||
r += mul(s0_3, M4(5.283e-01, 8.076e-02, 3.430e-01, 2.332e-01, -3.540e-02, 1.903e-02, -1.354e-02, -1.415e-02, -1.644e-01, -1.319e-02, -9.781e-02, -3.256e-02, 2.768e-02, -3.914e-02, 1.596e-01, -1.067e-01));
|
||||
r += mul(s0_4, M4(-1.638e-02, 4.385e-01, -1.479e-01, -1.789e-02, -1.399e-01, -5.884e-02, -7.306e-02, -2.036e-03, 5.196e-01, -1.849e-01, 8.771e-01, 3.595e-01, -7.094e-01, 2.485e-02, -3.977e-02, 7.246e-01));
|
||||
r += mul(s0_5, M4(-1.647e-03, -6.027e-03, -3.787e-03, -1.975e-02, -4.810e-02, -4.557e-01, 4.921e-02, -1.313e-01, -2.044e-02, 3.533e-01, -7.591e-02, 1.249e-02, 2.648e-02, -5.215e-01, 1.204e-01, -2.254e-01));
|
||||
r += mul(s0_6, M4(-2.852e-02, -1.630e-02, 1.249e-01, -1.758e-02, 4.285e-02, 1.425e-02, -1.595e-02, 2.618e-02, 4.460e-03, 1.266e-02, -3.914e-02, 1.111e-02, 5.378e-02, 2.199e-02, 2.561e-03, 2.125e-02));
|
||||
r += mul(s0_7, M4(-6.567e-02, -4.333e-02, -4.153e-03, 1.692e-01, 5.376e-02, 5.736e-02, -1.860e-01, -9.094e-02, 3.357e-02, -3.186e-02, 1.244e-01, -9.606e-02, 6.227e-02, 6.827e-02, -2.086e-01, -6.625e-02));
|
||||
r += mul(s0_8, M4(4.553e-05, -3.116e-02, 1.023e-02, 2.322e-02, 8.623e-02, 1.125e-01, 2.802e-02, -2.768e-01, -1.003e-01, -2.143e-02, -2.413e-02, 1.460e-01, 5.421e-02, 5.798e-02, 3.478e-03, -1.421e-01));
|
||||
r += mul(s1_0, M4(2.165e-01, 1.123e-01, -3.653e-02, -6.070e-03, -1.021e-01, -6.901e-04, 6.256e-03, -3.182e-03, -4.285e-02, -6.763e-02, 2.278e-02, -1.860e-02, -2.689e-02, 2.567e-02, 2.634e-03, 3.600e-02));
|
||||
r += mul(s1_1, M4(-1.159e-01, -1.198e-01, 2.991e-02, -6.143e-02, 1.038e-01, -5.076e-02, -1.785e-02, -3.611e-02, 6.860e-02, 9.302e-02, -1.125e-02, 3.332e-02, 6.457e-02, -3.919e-02, 4.158e-03, -1.201e-02));
|
||||
r += mul(s1_2, M4(-6.554e-03, 3.359e-02, -2.003e-02, -2.227e-04, 3.354e-02, -3.700e-02, -9.588e-03, -3.740e-02, -1.336e-02, -2.556e-04, -4.733e-03, -1.636e-02, 1.127e-02, 1.421e-02, -1.019e-02, -2.731e-02));
|
||||
r += mul(s1_3, M4(3.642e-01, -3.756e-03, 6.584e-01, 1.773e-01, -1.638e-02, 1.109e-02, -7.427e-02, -1.572e-02, -1.869e-01, -3.059e-02, -8.088e-02, -5.092e-02, -5.794e-02, -4.431e-02, -7.912e-02, -9.767e-02));
|
||||
r += mul(s1_4, M4(-3.255e-02, 3.115e-01, -2.109e-01, 2.804e-01, -6.504e-01, -1.342e-02, 1.355e-01, 3.623e-01, 5.142e-01, 2.124e-01, 1.866e-01, 2.268e-01, -2.470e-02, 1.629e-01, 1.163e-01, 1.663e-01));
|
||||
r += mul(s1_5, M4(-1.093e-02, -1.640e-04, -3.502e-02, -3.746e-02, 1.836e-02, -5.959e-01, 1.323e-01, -2.388e-01, 3.482e-02, 1.823e-01, -3.895e-02, 5.164e-03, -7.314e-02, -3.897e-01, 6.275e-02, -3.974e-02));
|
||||
r += mul(s1_6, M4(7.922e-03, -3.284e-02, 1.274e-01, -2.930e-02, 6.307e-02, 2.548e-02, -4.094e-02, 2.130e-02, -1.123e-02, 1.824e-03, -9.595e-02, 1.808e-02, 7.955e-02, 3.285e-02, 4.592e-02, 7.153e-02));
|
||||
r += mul(s1_7, M4(-6.410e-02, -1.423e-02, -4.912e-02, 1.461e-01, 6.612e-02, 9.838e-02, -2.153e-01, -1.067e-01, -1.108e-02, -1.048e-01, 2.778e-01, -1.116e-01, 4.569e-02, 2.955e-02, -1.440e-01, -3.364e-02));
|
||||
r += mul(s1_8, M4(1.721e-02, 1.171e-02, 1.096e-02, -2.832e-02, 7.446e-02, 4.785e-02, 8.270e-03, -1.640e-01, -8.912e-02, -6.617e-02, 3.225e-03, 9.894e-02, 4.367e-02, 8.102e-02, -1.779e-02, -2.410e-01));
|
||||
// Constant (bias) term, added after all taps.
r += V4(1.708e-05, 2.435e-04, 1.267e-03, 1.926e-03);
|
||||
// tanh keeps every returned component inside (-1, 1).
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass 5 ("out-shuffle"): each invocation produces a 2x2 block of OUTPUT pixels.
// It evaluates f0 on the 3x3 window of t1 taps that belongs to the matching
// input pixel, then, for each of the four output pixels, samples INPUT with the
// LINEAR sampler, converts to YUV, adds one component of f0's result to the
// first (luma) channel, clamps it with saturate, and converts back to RGB.
// Component-to-pixel mapping: x -> (0,0), y -> (1,0), w -> (1,1), z -> (0,1).
//   blockStart - added to the per-thread offset to form the output coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block before it is doubled
void Pass5(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());

    // gxy is the top-left pixel of this invocation's 2x2 output block.
    uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
    uint2 outDims = GetOutputSize();
    if (any(gxy >= outDims)) {
        return;
    }

    // Sampling position derived from the input-grid coordinate gxy / 2.
    float2 pos = ((gxy >> 1) + 0.5) * pt;

    // Row-major 3x3 window: index k sits at offset (k % 3 - 1, k / 3 - 1).
    V4 posPart[9];  // max(tap, 0)
    V4 negPart[9];  // -max(-tap, 0)
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        V4 tap = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
        negPart[k] = -max(-tap, 0.0);
        posPart[k] = max(tap, 0.0);
    }

    V4 delta = f0(posPart[0], posPart[1], posPart[2],
                  posPart[3], posPart[4], posPart[5],
                  posPart[6], posPart[7], posPart[8],
                  negPart[0], negPart[1], negPart[2],
                  negPart[3], negPart[4], negPart[5],
                  negPart[6], negPart[7], negPart[8]);

    static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
    static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
    float2 opt = float2(GetOutputPt());

    // The position is walked around the block in place (right, down, left) so
    // the coordinate arithmetic stays in the same order for every sample.

    // (0, 0): shift back by half an output step, adjust luma by delta.x.
    pos -= 0.5f * opt;
    float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.x = saturate(yuv.x + delta.x);
    OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

    // (1, 0): one output step right, adjust luma by delta.y.
    gxy.x += 1;
    pos.x += opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.x = saturate(yuv.x + delta.y);
    OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

    // (1, 1): one output step down, adjust luma by delta.w.
    gxy.y += 1;
    pos.y += opt.y;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.x = saturate(yuv.x + delta.w);
    OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);

    // (0, 1): one output step back left, adjust luma by delta.z.
    gxy.x -= 1;
    pos.x -= opt.x;
    yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
    yuv.x = saturate(yuv.x + delta.z);
    OUTPUT[gxy] = float4(mul(yuv2rgb, yuv), 1);
}
|
||||
413
src/Effects/CuNNy/CuNNy-3x4C-NVL.hlsl
Normal file
413
src/Effects/CuNNy/CuNNy-3x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,413 @@
|
|||
// CuNNy 3x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-D04N03
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
// O(t, p): samples texture `t` at mip level 0 with the POINT-filter sampler SP
// at coordinate `pos + p * pt`. The expansion reads `pos` and `pt` from the
// scope in which the macro is used, so every pass that uses O (directly or via
// its l0 macro) must declare locals with exactly those names.
// NOTE(review): `p` is not parenthesised in the expansion; the call sites visible
// here pass a single float2(...) term - confirm before passing a compound expression.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
// Shorthand for the minimum-precision 4-vector and 4x4 matrix types used by the passes.
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
// Pass-1 tap: samples INPUT through O at offset float2(x, y), reduces the RGB
// value to one scalar with a fixed dot product plus a constant, and converts
// the result to min16float.
#define l0(x, y) min16float((dot(float3(6.094e-01, 1.148e+00, 2.568e-01), O(INPUT, float2(x, y)).rgb) + -1.542e+00))
|
||||
|
||||
// Pass-1 kernel: each of the nine scalar taps scales its own 4-component weight
// vector; the products are summed in the order written and a constant vector is
// added last. The caller in this pass supplies the taps row-major over a 3x3
// window (s0_0 = offset (-1,-1) ... s0_8 = offset (1,1)).
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(-6.372e-02, 1.685e-01, -2.573e-02, -2.185e-02) * s0_0;
|
||||
r += V4(-3.502e-02, -2.984e-03, 5.048e-02, -2.445e-01) * s0_1;
|
||||
r += V4(9.644e-02, -7.557e-03, -1.770e-02, 3.162e-02) * s0_2;
|
||||
r += V4(7.199e-02, -6.233e-01, -4.180e-01, 1.392e-01) * s0_3;
|
||||
r += V4(-5.683e-01, 1.451e-01, -8.148e-02, 9.768e-02) * s0_4;
|
||||
r += V4(4.702e-01, -1.319e-03, 3.745e-03, -4.204e-02) * s0_5;
|
||||
r += V4(9.855e-03, 3.213e-01, 5.098e-01, 4.001e-02) * s0_6;
|
||||
r += V4(8.216e-02, -1.219e-02, -3.347e-02, 5.017e-02) * s0_7;
|
||||
r += V4(-6.691e-02, 5.417e-03, 1.235e-02, -9.640e-03) * s0_8;
|
||||
// Constant (bias) term, added after all taps.
r += V4(-4.952e-03, -2.750e-03, -9.137e-04, 6.736e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 ("in"): handles one pixel of the input-sized grid. Evaluates the l0 tap
// at the nine offsets of a 3x3 window around the pixel and stores the
// four-channel result of f0 in t0.
//   blockStart - added to the per-thread offset to form the pixel coordinate
//   tid        - only tid.x is used; Rmp8x8 (defined outside this file) turns it
//                into the offset inside the block
void Pass1(uint2 blockStart, uint3 tid) {
    // `pt` and `pos` must keep these exact names: the O/l0 macros expand to them.
    float2 pt = float2(GetInputPt());
    uint2 gxy = Rmp8x8(tid.x) + blockStart;

    // Nothing to write for coordinates outside the input.
    uint2 dims = GetInputSize();
    if (any(gxy >= dims)) {
        return;
    }

    float2 pos = (gxy + 0.5) * pt;

    // Row-major 3x3 window: tap[k] sits at offset (k % 3 - 1, k / 3 - 1).
    min16float tap[9];
    [unroll]
    for (uint k = 0; k < 9; ++k) {
        tap[k] = l0(float(k % 3) - 1.0, float(k / 3) - 1.0);
    }

    t0[gxy] = f0(tap[0], tap[1], tap[2],
                 tap[3], tap[4], tap[5],
                 tap[6], tap[7], tap[8]);
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// Pass-2 tap: samples t0 through O at offset float2(x, y) and converts it to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// conv1 kernel: every s0_k and s1_k vector is multiplied by its own 4x4 weight
// matrix (mul with the vector as first argument), the products are summed in
// the order written, and a constant vector is added last.
// The caller in this pass passes s0_k = max(tap_k, 0) and s1_k = -max(-tap_k, 0),
// with k running row-major over the 3x3 window of taps.
// The literals are presumably generated network weights - keep the values and
// the accumulation order exactly as they are.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(3.169e-01, 3.467e-01, -2.365e-01, 2.253e-01, 6.307e-02, 1.727e-01, -1.053e-01, 9.324e-02, -4.901e-02, -2.112e-01, 8.983e-02, -1.851e-01, -1.987e-01, 6.645e-02, 2.188e-02, 1.988e-02));
|
||||
r += mul(s0_1, M4(4.393e-02, 2.078e-01, -1.967e-01, 4.673e-02, -7.991e-02, 2.461e-01, -6.028e-02, 9.252e-02, 3.871e-01, 6.138e-02, -3.603e-01, -1.485e-01, 2.466e-01, 5.251e-02, -6.181e-02, 8.932e-02));
|
||||
r += mul(s0_2, M4(-1.707e-02, 2.598e-02, 1.641e-02, 2.780e-02, 2.425e-02, 1.769e-01, -8.461e-02, 1.067e-01, -2.503e-01, 6.051e-01, -2.782e-01, 1.311e-01, -8.456e-03, -1.370e-02, -6.391e-02, 6.935e-02));
|
||||
r += mul(s0_3, M4(-8.251e-01, -4.981e-01, -1.726e-01, -1.815e-01, 1.411e-01, 2.889e-02, -3.115e-01, -3.255e-01, 1.812e-03, -4.529e-02, 2.350e-01, 1.999e-01, -1.993e-01, -1.868e-02, 4.249e-02, -1.117e-01));
|
||||
r += mul(s0_4, M4(-4.732e-02, -5.673e-02, 1.274e-01, 4.894e-02, 9.126e-02, 1.717e-01, -3.294e-01, -2.378e-01, -7.089e-02, -8.116e-02, 2.510e-01, 7.381e-02, 1.275e-01, 8.030e-02, -1.671e-01, -1.824e-02));
|
||||
r += mul(s0_5, M4(3.373e-02, -4.163e-02, -4.077e-02, -2.085e-02, 1.265e-01, -4.133e-01, 7.433e-02, 7.763e-02, -1.466e-01, 3.291e-01, -7.784e-02, 9.472e-02, 2.725e-01, -2.393e-01, -6.913e-02, -9.445e-02));
|
||||
r += mul(s0_6, M4(3.043e-02, -9.985e-02, 1.538e-01, -2.529e-01, 2.379e-01, 1.079e-01, -1.517e-01, -9.289e-02, -1.396e-01, -4.354e-02, 8.463e-02, 7.052e-02, 5.629e-02, 3.293e-03, 5.342e-02, -1.606e-01));
|
||||
r += mul(s0_7, M4(3.626e-02, -1.421e-01, 4.017e-02, -3.963e-02, 2.148e-03, 5.522e-02, 3.174e-01, 2.270e-02, -5.590e-02, -9.875e-02, -1.683e-01, 5.415e-02, 1.509e-01, 7.709e-02, -1.161e-01, 1.440e-01));
|
||||
r += mul(s0_8, M4(-1.132e-02, 2.337e-02, 1.264e-02, 2.638e-03, -6.582e-02, -1.965e-01, 2.803e-01, 1.333e-01, 9.171e-02, 1.567e-01, -2.419e-01, -1.602e-01, -2.271e-01, 3.614e-02, 2.179e-01, 4.826e-02));
|
||||
r += mul(s1_0, M4(1.452e-01, 1.313e-01, -6.140e-02, 2.412e-01, -3.691e-02, 7.355e-02, -4.209e-02, 1.343e-01, -2.509e-02, -1.266e-01, 9.017e-02, -1.854e-02, -4.280e-01, -1.004e-01, 2.319e-01, 4.211e-02));
|
||||
r += mul(s1_1, M4(4.894e-02, 7.564e-02, -9.350e-02, 5.422e-02, -6.111e-02, 6.969e-02, -4.398e-02, 6.622e-02, 7.113e-01, 3.461e-01, -5.254e-01, -8.808e-02, 4.481e-01, 3.171e-01, -2.198e-01, 1.048e-01));
|
||||
r += mul(s1_2, M4(-3.483e-02, 3.150e-03, 2.215e-02, 2.616e-02, 1.468e-01, -1.295e-01, -1.470e-01, 3.371e-02, -4.514e-02, 4.677e-02, -1.313e-01, -1.176e-01, 1.507e-03, 2.290e-01, -2.163e-01, 3.895e-02));
|
||||
r += mul(s1_3, M4(-2.258e-01, -1.353e-01, -4.873e-01, -1.236e+00, 1.660e-01, -1.803e-02, -2.797e-01, -4.092e-01, -1.525e-01, -8.178e-02, 2.665e-01, 3.652e-01, -1.853e-01, -3.819e-02, 1.627e-01, -3.896e-01));
|
||||
r += mul(s1_4, M4(-1.005e-01, -3.821e-02, 9.917e-02, -1.324e-01, -2.040e-01, -3.586e-01, 9.776e-02, -1.376e-01, 2.065e-01, 2.017e-01, -1.320e-01, -2.225e-02, 2.944e-01, 5.393e-02, -4.301e-01, -7.240e-02));
|
||||
r += mul(s1_5, M4(5.353e-02, -4.257e-02, -4.131e-02, -3.943e-02, -6.151e-02, 3.059e-01, -1.481e-02, 3.662e-01, 3.098e-02, -8.774e-02, 1.790e-02, -1.332e-01, 8.670e-02, -6.985e-02, -1.359e-01, 2.063e-01));
|
||||
r += mul(s1_6, M4(-9.271e-02, 2.259e-01, 2.200e-02, -2.390e-01, 3.258e-01, 1.082e-01, -1.499e-01, -3.063e-02, -2.775e-01, -9.008e-02, 1.294e-01, 3.533e-02, 1.011e-02, 4.294e-02, 4.935e-02, -1.005e-01));
|
||||
r += mul(s1_7, M4(1.321e-02, -7.160e-02, 7.229e-02, -3.050e-02, 4.303e-02, -1.518e-01, 5.137e-01, 4.029e-02, 4.896e-02, 5.334e-02, -3.545e-01, 2.370e-02, 1.645e-01, 3.433e-02, -9.552e-03, 1.032e-01));
|
||||
r += mul(s1_8, M4(8.370e-03, -2.408e-02, 2.693e-02, -8.183e-03, -2.375e-02, -2.973e-01, 1.889e-01, 1.096e-01, 1.093e-02, 2.310e-01, -1.613e-01, -1.343e-01, -1.718e-01, -2.165e-02, 1.384e-01, 9.956e-02));
|
||||
// Constant (bias) term, added after all taps.
r += V4(-1.511e-02, -2.848e-03, 7.160e-03, -2.555e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point: each thread evaluates f0 on a 3x3 neighbourhood read through l0
// and stores the result in t1 at its own texel coordinate.
//   blockStart - added to the Rmp8x8(tid.x) offset to form this thread's coordinate
//   tid        - only tid.x is read here
void Pass2(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside GetInputSize() have nothing to write.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// NOTE(review): pos/pt are not used directly below; presumably the l0/O macros
// (defined outside this block) read them by name - confirm before renaming.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* keeps the non-positive part of each sample: -max(-s, 0).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is then clamped in place to its non-negative part (must happen after s1_* is derived).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// f0 combines the 18 vectors into the value stored for this texel.
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 3 (conv2) kernel: returns the sum of mul(s, M4(...)) over the 18 input vectors
// plus a constant V4 offset. s0_0..s0_8 and s1_0..s1_8 are supplied by Pass3.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(6.983e-02, 8.935e-03, -1.644e-01, -4.232e-04, -1.981e-01, 9.265e-02, 1.769e-01, 1.705e-01, -2.300e-02, -7.408e-03, -4.221e-02, -1.617e-02, -6.026e-02, -9.185e-03, -7.420e-02, -4.238e-02));
|
||||
r += mul(s0_1, M4(1.832e-01, -1.117e-01, 1.784e-02, 6.345e-02, -9.651e-02, 5.753e-02, 1.480e-01, 1.284e-01, 3.957e-01, -2.684e-01, 2.853e-02, -5.823e-02, -8.184e-02, 1.062e-01, -2.604e-02, -7.579e-02));
|
||||
r += mul(s0_2, M4(-1.753e-01, 5.019e-03, -1.285e-01, 8.470e-02, -2.566e-01, 6.556e-02, -9.751e-02, 7.653e-03, -9.466e-02, 3.098e-02, -9.617e-02, -4.826e-02, 3.951e-02, -5.446e-02, 1.297e-01, 1.076e-01));
|
||||
r += mul(s0_3, M4(-7.377e-02, -2.183e-01, 9.806e-02, 1.735e-01, 2.795e-01, 3.730e-01, 1.906e-01, 1.313e-01, 2.115e-01, 2.222e-01, 1.880e-01, 2.427e-01, -1.177e-01, 2.587e-02, -1.928e-01, -1.489e-01));
|
||||
r += mul(s0_4, M4(-3.487e-01, -3.194e-01, 7.963e-01, -1.044e-01, 3.136e-01, -5.467e-02, 5.059e-01, -4.801e-02, -4.943e-01, -1.466e-01, -5.938e-02, -9.473e-01, 2.661e-01, -1.545e-01, 1.986e-01, -2.172e-02));
|
||||
r += mul(s0_5, M4(-3.450e-01, 1.931e-01, -2.303e-01, -1.880e-01, -1.323e-01, 1.839e-01, -1.130e-01, -5.181e-02, 3.049e-02, 9.834e-02, -1.342e-01, -1.072e-01, 1.925e-02, -9.652e-02, 1.169e-01, 2.084e-01));
|
||||
r += mul(s0_6, M4(1.543e-02, 2.202e-01, 4.809e-02, 1.085e-01, 3.076e-02, -4.127e-01, 4.606e-02, 9.444e-02, 7.886e-02, -1.314e-01, -1.638e-02, 4.353e-02, 9.790e-02, -6.783e-02, -1.008e-01, -1.558e-01));
|
||||
r += mul(s0_7, M4(-4.453e-02, 3.133e-01, -2.217e-01, -5.271e-02, -2.055e-01, -1.000e-01, 8.374e-02, 6.141e-02, 2.147e-02, -3.844e-01, -2.203e-01, -1.105e-01, -3.596e-02, 2.026e-01, 3.174e-01, 1.519e-01));
|
||||
r += mul(s0_8, M4(-5.107e-03, 2.380e-01, 2.147e-02, -8.032e-02, -9.743e-02, 6.943e-02, 9.403e-02, 3.742e-02, -1.822e-02, -4.950e-02, 7.963e-02, -1.338e-01, -1.491e-01, 1.655e-02, -5.817e-02, 1.164e-01));
|
||||
r += mul(s1_0, M4(8.679e-02, -7.335e-02, -5.999e-02, -4.504e-02, -3.329e-02, 4.349e-03, -4.883e-02, 3.159e-02, -7.948e-02, 3.308e-02, 6.579e-02, 1.607e-01, 1.336e-01, -1.042e-01, -2.368e-01, -1.546e-01));
|
||||
r += mul(s1_1, M4(2.764e-01, -6.665e-02, 1.661e-02, -4.103e-02, 1.095e-01, -1.159e-01, -1.142e-01, -1.412e-01, 4.033e-01, -8.697e-02, 2.387e-01, 1.762e-01, 4.948e-01, -1.533e-01, 7.816e-02, 5.700e-02));
|
||||
r += mul(s1_2, M4(1.187e-01, -6.571e-02, 4.698e-02, 4.931e-02, -5.523e-02, 3.925e-02, -7.453e-02, -8.429e-02, -2.202e-01, 6.090e-02, -1.460e-01, 2.777e-02, 4.405e-01, 6.445e-03, 3.494e-01, 3.311e-01));
|
||||
r += mul(s1_3, M4(-4.333e-02, -8.517e-02, 1.372e-01, 2.066e-01, 4.728e-01, 1.195e-01, -2.627e-01, -2.280e-01, 1.606e-01, 2.216e-01, 2.269e-01, 3.505e-01, -2.499e-01, -3.977e-01, -3.659e-02, 1.460e-02));
|
||||
r += mul(s1_4, M4(-4.640e-01, -7.221e-01, -2.524e-01, -6.513e-01, 6.699e-01, -1.727e-01, 4.444e-01, -3.115e-01, -6.748e-01, 1.063e-01, 6.487e-01, -3.195e-01, -5.136e-01, -8.272e-01, 4.014e-01, 4.914e-01));
|
||||
r += mul(s1_5, M4(-1.112e-03, -1.293e-02, 1.567e-02, -1.266e-01, 1.185e-01, 4.940e-02, -9.925e-02, -1.034e-01, -1.041e-01, 1.822e-01, -4.277e-02, 1.313e-01, -6.459e-01, -1.562e-01, -3.961e-01, -7.262e-02));
|
||||
r += mul(s1_6, M4(1.499e-02, 3.135e-01, 2.187e-01, 2.386e-01, 1.171e-01, -4.899e-01, -1.987e-01, -1.717e-01, 5.232e-02, -1.984e-01, 9.338e-04, 1.092e-01, 1.545e-01, 4.183e-01, 1.180e-01, 1.102e-01));
|
||||
r += mul(s1_7, M4(-1.411e-01, 2.619e-01, -2.549e-01, -2.113e-01, -1.109e-01, -3.038e-01, 7.579e-02, -3.585e-02, -1.373e-03, -2.713e-01, -5.527e-02, 7.052e-02, -1.648e-01, 7.324e-01, 3.974e-01, 2.306e-01));
|
||||
r += mul(s1_8, M4(-1.861e-02, 9.414e-02, -6.739e-02, -8.921e-02, -2.337e-02, -2.657e-02, -3.376e-03, -7.209e-02, -1.042e-01, -2.504e-02, 1.287e-01, -1.459e-02, -1.617e-01, 2.384e-01, -6.969e-01, -3.760e-01));
|
||||
// Constant per-channel offset added after all 18 products.
r += V4(-3.514e-03, 2.350e-03, 2.221e-03, 1.089e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point: each thread evaluates f0 on a 3x3 neighbourhood of t1 (read through l0)
// and stores the result in t0 at its own texel coordinate.
//   blockStart - added to the Rmp8x8(tid.x) offset to form this thread's coordinate
//   tid        - only tid.x is read here
void Pass3(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside GetInputSize() have nothing to write.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// NOTE(review): pos/pt are not used directly below; presumably the O macro behind l0
// (defined outside this block) reads them by name - confirm before renaming.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* keeps the non-positive part of each sample: -max(-s, 0).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is then clamped in place to its non-negative part (must happen after s1_* is derived).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// f0 combines the 18 vectors into the value stored for this texel.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 4 (conv3) kernel: returns the sum of mul(s, M4(...)) over the 18 input vectors
// plus a constant V4 offset. s0_0..s0_8 and s1_0..s1_8 are supplied by Pass4.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.869e-01, 8.774e-02, -6.451e-02, 6.682e-02, 8.374e-02, 1.313e-02, -2.649e-02, 2.741e-02, -3.609e-02, -9.330e-02, -8.233e-02, 1.117e-01, -1.203e-01, 1.719e-02, 1.288e-01, -9.851e-02));
|
||||
r += mul(s0_1, M4(3.100e-01, 5.063e-02, 1.169e-01, -3.828e-02, 3.428e-01, 4.869e-02, -1.232e-02, -1.003e-02, 2.756e-01, 3.916e-01, 1.450e-01, 1.078e-01, -2.568e-01, -2.157e-01, -1.057e-01, -1.338e-01));
|
||||
r += mul(s0_2, M4(1.199e-01, -1.890e-01, 5.870e-03, -5.995e-03, 2.255e-01, -2.325e-03, 7.916e-03, -2.038e-02, 1.353e-01, -9.590e-02, -2.119e-02, -5.860e-02, -7.698e-02, -3.608e-02, -3.571e-02, 2.010e-02));
|
||||
r += mul(s0_3, M4(9.889e-02, -2.665e-02, -2.627e-01, 3.583e-01, 7.891e-02, 8.737e-02, 5.322e-02, 5.246e-04, -5.188e-02, -8.491e-02, -4.991e-02, -3.735e-02, 5.711e-02, 4.482e-02, 5.660e-02, -1.322e-01));
|
||||
r += mul(s0_4, M4(-5.488e-01, 2.898e-01, 1.046e+00, 6.036e-01, -3.180e-01, -6.309e-01, -2.627e-01, 1.734e-01, -2.067e-01, 3.775e-02, -2.881e-01, -9.242e-02, 3.369e-01, 2.554e-02, -1.645e-01, 4.973e-01));
|
||||
r += mul(s0_5, M4(6.976e-03, -1.830e-01, 2.842e-01, 2.570e-02, -2.902e-01, 5.059e-01, 1.944e-01, 1.794e-02, -1.333e-01, 2.341e-01, 4.161e-01, -5.179e-02, 8.176e-02, -2.435e-02, -1.598e-02, 6.211e-02));
|
||||
r += mul(s0_6, M4(-2.668e-02, -6.958e-02, -5.015e-02, 8.035e-02, 4.451e-02, -1.290e-03, -7.688e-02, 1.708e-01, -5.133e-02, -2.768e-02, -1.780e-02, -6.317e-02, -9.692e-03, -2.748e-03, 9.070e-03, -1.314e-01));
|
||||
r += mul(s0_7, M4(1.402e-01, 4.997e-02, -4.973e-02, 6.839e-01, 2.079e-02, -2.511e-02, 3.403e-01, -3.077e-01, -2.831e-02, 4.816e-02, -9.142e-02, -8.176e-02, -2.999e-02, -5.749e-03, -5.579e-02, -2.355e-01));
|
||||
r += mul(s0_8, M4(-1.783e-02, -2.882e-02, 9.841e-02, 4.473e-02, 4.128e-02, -3.071e-02, -2.378e-01, 1.347e-01, -2.285e-02, 1.317e-02, -1.632e-02, 1.058e-01, -3.696e-02, -6.864e-03, -8.989e-02, -7.315e-02));
|
||||
r += mul(s1_0, M4(8.857e-02, 3.169e-02, -1.896e-02, 1.258e-02, 7.086e-02, 5.699e-02, 1.550e-02, -1.836e-02, 1.209e-01, 5.334e-02, -1.557e-02, -2.374e-02, -1.411e-02, 1.543e-02, 1.769e-02, -4.332e-02));
|
||||
r += mul(s1_1, M4(1.199e-01, -8.203e-03, -1.695e-02, -3.214e-02, 5.918e-01, 3.458e-01, 7.684e-02, -5.137e-01, 2.827e-01, -2.008e-02, -1.848e-01, 2.147e-01, 7.212e-02, -3.906e-03, -2.220e-01, -1.918e-01));
|
||||
r += mul(s1_2, M4(4.464e-02, 4.035e-02, 4.265e-03, 1.350e-02, -4.623e-01, -1.882e-01, 9.929e-02, -2.295e-01, 2.010e-01, 6.059e-01, 3.648e-01, -1.670e-02, -6.763e-02, -2.588e-01, -1.741e-01, 3.358e-02));
|
||||
r += mul(s1_3, M4(1.003e-01, -2.961e-02, -1.715e-01, 1.057e-01, 3.275e-03, 1.877e-02, -4.995e-02, 1.181e-01, 3.600e-02, 2.101e-02, -1.050e-01, 8.035e-02, -8.107e-02, -1.067e-01, -5.457e-02, 5.339e-02));
|
||||
r += mul(s1_4, M4(3.875e-01, 3.638e-01, 1.178e-01, -4.404e-02, 6.128e-02, -1.193e-01, -3.161e-01, 3.510e-01, -3.482e-02, -2.842e-01, -3.917e-01, 4.525e-01, 1.969e-01, 5.299e-01, 4.720e-01, -2.266e-01));
|
||||
r += mul(s1_5, M4(-1.420e-02, 2.325e-02, -8.697e-02, -4.296e-03, 8.697e-02, 7.490e-02, 1.773e-01, 4.010e-01, 2.380e-01, -1.182e-01, 9.121e-01, 2.252e-01, 1.348e-01, -7.448e-02, -8.496e-01, -3.335e-01));
|
||||
r += mul(s1_6, M4(-7.923e-02, -2.533e-02, -4.896e-02, -5.473e-02, -5.329e-03, 1.285e-02, -1.763e-02, 7.009e-02, 9.670e-04, -1.889e-02, -1.008e-01, 1.149e-01, 7.259e-03, 4.080e-02, 1.042e-01, -2.627e-01));
|
||||
r += mul(s1_7, M4(-9.746e-02, 6.679e-02, -1.421e-01, -2.202e-01, -9.918e-03, -2.413e-02, -1.554e-02, 7.011e-03, -3.226e-02, -3.024e-02, -5.431e-02, 7.446e-02, 5.860e-02, 2.851e-02, -2.367e-01, 2.562e-02));
|
||||
r += mul(s1_8, M4(-4.627e-02, 4.226e-02, -8.654e-02, -3.312e-02, 1.600e-02, 2.983e-02, 8.834e-03, -3.871e-02, -4.137e-03, 1.767e-02, 2.492e-02, -5.391e-02, 8.133e-03, 1.430e-02, -2.428e-02, -1.132e-01));
|
||||
// Constant per-channel offset added after all 18 products.
r += V4(-4.349e-03, -3.760e-03, 4.684e-03, 4.745e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 4 entry point: each thread evaluates f0 on a 3x3 neighbourhood of t0 (read through l0)
// and stores the result in t1 at its own texel coordinate.
//   blockStart - added to the Rmp8x8(tid.x) offset to form this thread's coordinate
//   tid        - only tid.x is read here
void Pass4(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside GetInputSize() have nothing to write.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// NOTE(review): pos/pt are not used directly below; presumably the O macro behind l0
// (defined outside this block) reads them by name - confirm before renaming.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* keeps the non-positive part of each sample: -max(-s, 0).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is then clamped in place to its non-negative part (must happen after s1_* is derived).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// f0 combines the 18 vectors into the value stored for this texel.
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t1
|
||||
//!OUT OUTPUT
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 5 (out-shuffle) kernel: sums mul(s, M4(...)) over the 18 input vectors, adds a
// constant V4 offset and returns tanh of the result. Pass5 uses the four components as
// per-pixel additions to the first YUV channel of a 2x2 output quad.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-4.783e-03, 7.235e-03, -7.275e-03, -2.802e-03, 4.921e-02, 7.543e-02, -3.357e-02, 1.213e-02, 2.900e-02, 2.380e-03, -9.028e-03, -2.594e-02, 1.576e-03, 3.334e-04, -2.460e-02, -1.285e-02));
|
||||
r += mul(s0_1, M4(4.582e-02, 9.378e-04, 2.217e-02, 5.083e-02, -1.054e-02, 8.518e-02, -1.884e-02, -5.149e-02, 1.983e-02, -1.106e-02, -4.317e-03, 5.384e-02, -5.193e-02, 1.089e-02, -9.384e-03, 3.137e-02));
|
||||
r += mul(s0_2, M4(-5.241e-03, 3.821e-02, -1.136e-02, -3.033e-02, 3.186e-02, -3.270e-03, 1.422e-02, 2.401e-02, -1.360e-02, 1.024e-01, -6.042e-02, -2.325e-02, -1.248e-01, -1.377e-01, 1.654e-02, -1.347e-02));
|
||||
r += mul(s0_3, M4(-3.552e-02, -3.211e-02, -2.282e-03, 1.775e-02, 1.360e-01, 2.808e-02, 1.082e-01, -1.311e-02, -1.699e-02, -2.628e-02, 3.430e-02, -3.880e-03, 2.514e-02, -3.171e-02, 4.675e-02, -2.711e-02));
|
||||
r += mul(s0_4, M4(4.756e-01, 2.686e-01, 4.514e-02, -8.813e-02, 2.636e-01, -4.893e-01, 1.301e-01, 1.304e-01, 3.778e-01, 2.765e-01, 3.369e-01, 8.811e-02, 5.080e-02, 2.783e-01, -1.131e-01, 2.487e-01));
|
||||
r += mul(s0_5, M4(-2.961e-02, 7.757e-02, -8.471e-02, -4.636e-02, -6.862e-02, 1.733e-01, -7.301e-02, -1.408e-02, 1.636e-02, 9.982e-02, 5.704e-02, 2.568e-01, -2.224e-02, -2.588e-01, -2.202e-01, -4.898e-01));
|
||||
r += mul(s0_6, M4(1.058e-01, -2.810e-02, -2.960e-02, -8.398e-02, -9.106e-02, 6.642e-02, -2.574e-02, 7.841e-02, -1.978e-02, -3.700e-02, -1.504e-02, -3.186e-02, 2.438e-03, 6.191e-03, -1.155e-02, -1.161e-02));
|
||||
r += mul(s0_7, M4(-6.316e-01, -7.748e-02, 8.006e-01, 3.936e-01, 1.300e-01, -1.999e-01, 2.351e-01, -7.485e-01, -7.151e-02, -4.285e-02, -2.277e-02, 2.849e-02, -2.207e-02, -2.585e-02, -2.498e-02, -3.308e-02));
|
||||
r += mul(s0_8, M4(-2.002e-01, -6.934e-01, -1.093e-01, 3.325e-01, -5.778e-02, 2.138e-02, -2.930e-02, 1.794e-01, -3.028e-03, 2.300e-03, 5.845e-03, -1.959e-02, 1.403e-02, 1.565e-02, 1.840e-02, -6.027e-04));
|
||||
r += mul(s1_0, M4(2.228e-02, -8.352e-03, -1.007e-02, -1.911e-02, -1.489e-02, 2.785e-03, -9.190e-03, 5.858e-03, 2.420e-02, -7.701e-03, -2.327e-02, -2.494e-02, -8.526e-03, -2.384e-02, -2.601e-02, -4.833e-02));
|
||||
r += mul(s1_1, M4(5.671e-02, 3.666e-02, 3.309e-02, 1.011e-02, -8.053e-03, 4.673e-02, -5.358e-02, -2.451e-02, 3.779e-01, 5.642e-02, -2.324e-01, -3.499e-02, -3.479e-01, 1.179e-01, -4.630e-02, 1.118e-01));
|
||||
r += mul(s1_2, M4(-1.650e-02, 6.203e-04, -1.322e-02, -1.996e-02, 2.118e-02, -9.244e-03, 2.813e-02, 9.773e-03, -2.654e-02, -8.373e-02, 6.663e-04, -6.860e-02, -3.436e-02, -7.207e-01, 2.389e-01, 1.903e-01));
|
||||
r += mul(s1_3, M4(-8.045e-02, -2.073e-02, 3.380e-02, 1.327e-02, 1.247e-01, 1.129e-02, 6.421e-02, -8.326e-03, -4.675e-02, 4.920e-02, -3.699e-02, 4.601e-02, 3.389e-02, -4.151e-02, 3.012e-02, -2.241e-02));
|
||||
r += mul(s1_4, M4(5.223e-01, 1.394e-01, 1.222e-01, -7.687e-03, -3.115e-01, 3.989e-02, -1.679e-01, 2.607e-01, 4.393e-01, -1.821e-01, 1.006e+00, -2.920e-01, 8.062e-02, 2.231e-01, -1.282e-02, 2.495e-01));
|
||||
r += mul(s1_5, M4(-1.146e-01, 6.738e-02, -1.655e-02, 1.178e-02, -3.058e-02, 1.093e-01, 9.367e-03, 1.382e-02, -7.397e-02, 2.300e-01, -4.202e-02, 1.765e-01, -4.671e-02, -1.375e-02, -3.662e-01, -5.254e-01));
|
||||
r += mul(s1_6, M4(5.090e-02, 8.633e-03, -1.128e-02, -3.186e-02, -6.263e-02, 4.143e-02, -2.214e-02, 5.270e-02, -1.370e-02, -1.692e-02, -2.644e-02, -9.847e-03, -2.147e-03, -7.941e-03, -1.323e-04, -5.173e-03));
|
||||
r += mul(s1_7, M4(-9.353e-02, 6.696e-02, 2.744e-01, 2.743e-01, 9.809e-02, -1.439e-01, -2.583e-02, -3.717e-01, -5.135e-02, -1.889e-02, -1.775e-02, 9.383e-03, -2.496e-02, -2.936e-02, -2.578e-02, -1.586e-02));
|
||||
r += mul(s1_8, M4(-1.565e-02, -1.635e-01, -1.800e-01, -2.607e-01, 1.975e-02, 1.594e-02, -4.568e-02, 1.218e-01, -6.668e-03, 7.923e-03, -4.625e-02, 1.324e-02, -6.838e-03, 2.045e-02, 1.141e-02, 2.717e-02));
|
||||
// Constant per-channel offset added after all 18 products.
r += V4(7.204e-05, -6.226e-05, 2.867e-04, -3.251e-05);
|
||||
// Unlike the earlier conv kernels, the final result is squashed with tanh.
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass 5 entry point: each thread produces a 2x2 quad of OUTPUT pixels.
// It evaluates f0 once on a 3x3 neighbourhood of t1 (read through l0), then for each of the
// four pixels samples INPUT with the SL sampler, converts to YUV, adds one component of the
// f0 result to the first channel (saturated) and converts back to RGB.
//   blockStart - added to the doubled Rmp8x8(tid.x) offset to form the quad's top-left coordinate
//   tid        - only tid.x is read here
void Pass5(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
// << 1: the per-thread offset is doubled, so consecutive threads start two pixels apart.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
|
||||
uint2 size = GetOutputSize();
|
||||
// Only the quad's top-left coordinate is tested against the output size.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// gxy >> 1 maps the output coordinate back to the half-resolution grid before scaling by pt.
// NOTE(review): pos/pt are presumably read by name inside the O macro behind l0 - confirm before renaming.
float2 pos = ((gxy >> 1) + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* keeps the non-positive part of each sample: -max(-s, 0).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is then clamped in place to its non-negative part (must happen after s1_* is derived).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// r holds one value per pixel of the 2x2 quad: x, y, w, z are consumed in that order below.
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
|
||||
// Colour conversion matrices applied with mul(matrix, colour) in both directions.
static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
|
||||
static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
|
||||
float2 opt = float2(GetOutputPt());
|
||||
|
||||
// Shift the sample position back by half an output step, then walk the quad in steps of opt.
pos -= 0.5f * opt;
|
||||
// Pixel (0,0) of the quad uses r.x; the second and third YUV channels pass through unchanged.
float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);
|
||||
|
||||
// Pixel (1,0) uses r.y.
++gxy.x;
|
||||
pos.x += opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);
|
||||
|
||||
// Pixel (1,1) uses r.w (the walk goes right, down, then left, hence w before z).
++gxy.y;
|
||||
pos.y += opt.y;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);
|
||||
|
||||
// Pixel (0,1) uses r.z.
--gxy.x;
|
||||
pos.x -= opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
|
||||
}
|
||||
2223
src/Effects/CuNNy/CuNNy-4x16C-NVL-DN.hlsl
Normal file
2223
src/Effects/CuNNy/CuNNy-4x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
2223
src/Effects/CuNNy/CuNNy-4x16C-NVL.hlsl
Normal file
2223
src/Effects/CuNNy/CuNNy-4x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
486
src/Effects/CuNNy/CuNNy-4x4C-NVL-DN.hlsl
Normal file
486
src/Effects/CuNNy/CuNNy-4x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
// CuNNy 4x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-DN-D04N04
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) min16float((dot(float3(2.428e-01, 4.714e-01, 1.229e-01), O(INPUT, float2(x, y)).rgb) + -7.696e-02))
|
||||
|
||||
// Pass 1 (in) kernel: takes nine scalar samples (a 3x3 neighbourhood supplied by Pass1),
// scales a constant V4 by each one, sums the products and adds a constant V4 offset.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(9.154e-02, 3.758e-01, 2.353e-02, -5.798e-02) * s0_0;
|
||||
r += V4(-5.382e-01, 1.688e-01, -1.190e-01, 4.082e-02) * s0_1;
|
||||
r += V4(2.460e-02, -5.810e-02, 7.788e-02, 3.018e-02) * s0_2;
|
||||
r += V4(1.211e-01, -1.552e-01, -9.990e-02, 3.963e-02) * s0_3;
|
||||
r += V4(-2.611e-01, -4.835e-01, -6.965e-01, -4.893e-01) * s0_4;
|
||||
r += V4(-3.017e-01, -4.435e-02, 1.836e-01, 4.600e-01) * s0_5;
|
||||
r += V4(1.275e-01, 2.485e-01, 7.354e-02, -4.648e-02) * s0_6;
|
||||
r += V4(2.527e-01, 1.279e-01, 3.053e-01, 3.957e-02) * s0_7;
|
||||
r += V4(1.003e-02, 1.193e-01, 2.476e-01, -2.051e-02) * s0_8;
|
||||
// Constant per-channel offset added after the nine products.
r += V4(1.690e-02, 8.856e-03, -9.136e-04, 2.267e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 entry point: each thread reads a 3x3 neighbourhood of INPUT through l0 (which
// reduces each RGB sample to one scalar via a dot product plus offset), evaluates f0 and
// stores the V4 result in t0 at its own texel coordinate.
//   blockStart - added to the Rmp8x8(tid.x) offset to form this thread's coordinate
//   tid        - only tid.x is read here
void Pass1(uint2 blockStart, uint3 tid) {
|
||||
// pt and pos are read by name inside the O() macro that l0 expands to; do not rename them.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside GetInputSize() have nothing to write.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Texel centre scaled by pt; O() adds p * pt to this to address neighbours.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
min16float s0_0 = l0(-1.0, -1.0);
|
||||
min16float s0_1 = l0(0.0, -1.0);
|
||||
min16float s0_2 = l0(1.0, -1.0);
|
||||
min16float s0_3 = l0(-1.0, 0.0);
|
||||
min16float s0_4 = l0(0.0, 0.0);
|
||||
min16float s0_5 = l0(1.0, 0.0);
|
||||
min16float s0_6 = l0(-1.0, 1.0);
|
||||
min16float s0_7 = l0(0.0, 1.0);
|
||||
min16float s0_8 = l0(1.0, 1.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
|
||||
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 2 (conv1) kernel: returns the sum of mul(s, M4(...)) over the 18 input vectors
// plus a constant V4 offset. s0_0..s0_8 and s1_0..s1_8 are supplied by Pass2.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-4.540e-03, -2.499e-01, 4.202e-02, 1.132e-02, 2.910e-02, -3.788e-02, 3.330e-02, -2.254e-02, -1.953e-01, 1.226e-01, -1.907e-01, -1.378e-01, 9.555e-02, -2.443e-01, 6.124e-02, -7.256e-03));
|
||||
r += mul(s0_1, M4(-1.225e-01, -1.812e-01, -1.238e-02, 4.088e-01, -9.977e-02, 4.395e-02, -2.394e-02, -5.584e-03, 2.939e-01, 4.102e-01, 6.228e-02, 3.822e-01, 8.618e-02, -1.109e-01, 1.776e-01, -7.505e-02));
|
||||
r += mul(s0_2, M4(2.047e-01, -6.853e-02, 1.880e-02, -9.030e-03, 1.505e-01, 7.782e-02, 1.347e-02, 5.566e-01, -6.951e-02, -1.352e-01, 1.941e-03, 3.975e-02, 1.637e-01, 6.708e-02, 1.501e-02, 1.373e-01));
|
||||
r += mul(s0_3, M4(-1.974e-01, 1.068e-01, -1.102e-01, 5.909e-02, 2.355e-03, 1.275e-01, -5.986e-02, -5.288e-02, 8.785e-04, -1.440e-01, -3.369e-01, -9.128e-02, 2.030e-01, 4.937e-01, -1.637e-01, 4.814e-02));
|
||||
r += mul(s0_4, M4(-3.954e-01, 4.772e-01, -5.841e-01, -8.070e-02, -2.056e-01, -2.335e-01, -2.091e-01, 1.223e-01, -2.686e-01, 1.240e+00, 7.095e-02, 6.502e-01, 1.044e-01, -3.071e-01, -2.892e-01, 4.861e-01));
|
||||
r += mul(s0_5, M4(5.943e-02, 2.245e-01, 4.014e-01, -1.063e-01, -1.869e-01, 1.384e-01, 2.996e-01, -1.928e-01, 1.212e-01, 2.849e-01, 2.093e-01, -3.821e-01, -8.705e-02, 1.976e-01, 5.176e-01, -7.461e-02));
|
||||
r += mul(s0_6, M4(1.048e-01, 2.374e-02, 2.730e-01, 1.446e-01, -5.406e-02, -1.587e-02, -2.014e-01, -3.422e-02, -2.114e-01, -5.198e-01, 2.674e-02, -6.078e-02, -2.293e-01, -9.914e-02, -2.110e-01, 7.008e-02));
|
||||
r += mul(s0_7, M4(5.799e-02, 4.932e-01, 4.559e-01, -3.118e-02, 4.706e-02, -2.242e-01, -3.165e-01, -9.912e-02, 4.041e-01, 7.241e-01, -1.696e-01, 1.990e-01, 4.697e-01, 9.965e-03, -1.141e-02, -1.365e-02));
|
||||
r += mul(s0_8, M4(-1.744e-01, -7.119e-02, 3.632e-01, -2.802e-01, -3.155e-01, 4.455e-01, -1.866e-02, -2.667e-02, 1.255e-01, -5.762e-01, -2.226e-02, 2.812e-02, -2.349e-01, 1.552e-01, -6.424e-03, 7.450e-02));
|
||||
r += mul(s1_0, M4(6.159e-02, -4.426e-02, 2.277e-02, 1.040e-01, -6.306e-04, -1.704e-01, 3.807e-02, -8.670e-02, -1.403e-01, 1.644e-01, -9.679e-02, -1.055e-01, 2.394e-01, -5.504e-02, 8.006e-02, 6.312e-02));
|
||||
r += mul(s1_1, M4(-1.134e-01, -1.030e-01, -2.777e-02, 2.955e-01, -1.225e-01, -4.096e-02, -2.748e-02, 9.404e-02, 2.890e-01, -2.441e-01, 1.560e-01, 1.694e-01, 1.853e-01, 3.311e-01, 3.408e-01, -8.678e-02));
|
||||
r += mul(s1_2, M4(1.821e-01, 3.898e-02, -2.560e-02, 1.160e-01, 2.382e-01, -1.638e-01, -1.345e-01, 3.193e-01, -1.839e-01, -2.638e-01, 5.265e-02, 2.415e-01, 2.803e-01, 1.919e-01, -7.340e-02, 1.762e-02));
|
||||
r += mul(s1_3, M4(-2.606e-01, -1.263e-01, -3.067e-02, -1.695e-02, 4.665e-03, 2.947e-02, -1.965e-02, -2.658e-02, -7.935e-02, -1.566e-01, -3.246e-01, -1.075e-03, 1.896e-01, -2.937e-01, -1.020e-01, -1.513e-01));
|
||||
r += mul(s1_4, M4(-3.696e-01, 8.901e-02, -1.890e-01, -2.804e-02, -2.998e-01, -6.597e-02, -2.613e-01, 3.877e-01, -1.032e+00, -2.328e-01, 7.941e-02, 5.733e-01, 8.618e-02, 4.213e-02, -1.242e+00, 5.861e-01));
|
||||
r += mul(s1_5, M4(1.919e-02, -5.609e-02, 3.295e-01, -2.364e-01, -4.238e-01, -6.041e-01, 3.389e-01, -4.460e-01, 4.482e-02, 1.077e-03, 8.990e-02, -2.725e-01, -4.829e-02, 1.184e-01, 1.941e-01, -3.646e-01));
|
||||
r += mul(s1_6, M4(2.968e-01, 2.018e-01, 2.695e-01, 8.891e-02, -5.857e-02, 6.005e-02, -2.440e-01, -1.349e-02, -7.572e-02, -3.213e-01, 6.274e-02, -1.229e-02, -7.589e-01, -2.313e-01, -1.627e-01, 2.538e-01));
|
||||
r += mul(s1_7, M4(-5.728e-02, 1.333e-01, 2.492e-01, -3.609e-02, 1.936e-01, -1.276e-01, -3.034e-01, -1.091e-01, 1.390e-01, 3.356e-01, -1.183e-01, 2.047e-01, 3.779e-01, -3.353e-01, 2.019e-01, 4.337e-02));
|
||||
r += mul(s1_8, M4(-1.386e-01, 1.179e-01, 2.340e-01, -1.604e-01, -4.890e-01, -5.407e-01, -1.546e-01, -1.826e-01, 1.596e-01, -1.784e-01, 5.777e-02, 3.961e-02, -2.290e-01, 2.752e-01, -4.260e-02, 9.649e-02));
|
||||
// Constant per-channel offset added after all 18 products.
r += V4(-4.697e-03, -2.213e-02, 3.898e-01, -1.481e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point: each thread evaluates f0 on a 3x3 neighbourhood of t0 (read through l0)
// and stores the result in t1 at its own texel coordinate.
//   blockStart - added to the Rmp8x8(tid.x) offset to form this thread's coordinate
//   tid        - only tid.x is read here
void Pass2(uint2 blockStart, uint3 tid) {
|
||||
// pt and pos are read by name inside the O() macro that l0 expands to; do not rename them.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside GetInputSize() have nothing to write.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Texel centre scaled by pt; O() adds p * pt to this to address neighbours.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// 3x3 neighbourhood, row by row from (-1,-1) to (1,1).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* keeps the non-positive part of each sample: -max(-s, 0).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is then clamped in place to its non-negative part (must happen after s1_* is derived).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// f0 combines the 18 vectors into the value stored for this texel.
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 3 (conv2) kernel: returns the sum of mul(s, M4(...)) over the 18 input vectors
// plus a constant V4 offset. s0_0..s0_8 and s1_0..s1_8 are supplied by Pass3.
// NOTE(review): the numeric constants are presumably generated network weights - do not hand-edit.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-1.362e-01, -5.847e-02, 2.766e-02, 2.969e-02, 9.796e-02, 6.555e-02, -3.067e-02, -5.139e-02, 1.512e-01, 1.401e-01, -3.820e-03, 2.649e-02, -1.802e-01, -2.099e-02, -6.604e-02, 4.042e-02));
|
||||
r += mul(s0_1, M4(-2.144e-01, -1.437e-01, 4.670e-02, -2.348e-01, 9.990e-02, -5.186e-02, 1.658e-01, 9.557e-02, -1.353e-01, -1.146e-01, -9.837e-02, -8.956e-02, 1.229e-01, 2.354e-01, -2.342e-01, -1.343e-01));
|
||||
r += mul(s0_2, M4(5.918e-01, 2.130e-02, 5.753e-01, -6.941e-02, -3.156e-02, -4.438e-02, -6.348e-02, 2.682e-02, -1.078e-02, 9.727e-03, 8.472e-02, 1.460e-01, -1.921e-01, 1.872e-01, 6.067e-02, 3.762e-02));
|
||||
r += mul(s0_3, M4(1.341e-01, 1.082e-01, -4.460e-02, -1.008e-02, -1.262e-01, -7.942e-02, 5.610e-02, 4.418e-02, -1.725e-01, -1.158e-01, 6.377e-03, -1.171e-01, -3.447e-02, 4.459e-02, 2.822e-04, -7.623e-02));
|
||||
r += mul(s0_4, M4(1.994e-01, -2.251e-01, -2.432e-01, 2.467e-02, 3.717e-02, 3.275e-01, 2.005e-01, 1.427e-01, 1.122e-01, 2.864e-01, 1.478e-01, 3.701e-01, 3.111e-01, -1.704e-01, -1.410e-01, -7.490e-01));
|
||||
r += mul(s0_5, M4(-1.392e-01, -2.284e-02, 2.819e-01, -5.560e-02, -2.624e-01, 7.282e-02, -2.417e-01, -5.534e-02, -6.351e-03, -1.714e-01, -1.505e-01, -3.035e-01, -3.580e-02, 4.429e-02, 1.628e-01, -1.101e-01));
|
||||
r += mul(s0_6, M4(8.306e-04, 3.258e-02, -2.746e-02, -3.143e-02, -1.301e-02, -5.828e-02, 2.411e-03, 1.395e-02, 3.728e-02, -8.319e-02, 3.326e-02, 1.294e-01, -6.226e-02, 5.103e-02, -1.218e-02, 2.411e-01));
|
||||
r += mul(s0_7, M4(-6.323e-02, -1.343e-02, 3.400e-02, -1.727e-02, 3.683e-02, 6.325e-02, 4.834e-04, 3.849e-02, 9.424e-03, -2.010e-02, -3.447e-02, -1.330e-01, -4.107e-01, -7.682e-02, 4.138e-01, 5.994e-02));
|
||||
r += mul(s0_8, M4(7.556e-02, 1.846e-02, 1.847e-02, 1.057e-01, -1.140e-01, -2.834e-02, -3.141e-02, -1.045e-01, -2.025e-02, 4.729e-02, -2.822e-02, -4.072e-02, 3.368e-01, 6.871e-02, 1.184e-01, 1.536e-01));
|
||||
r += mul(s1_0, M4(-6.688e-02, 2.483e-02, 1.598e-01, -4.834e-02, 2.141e-01, -4.911e-02, -4.452e-02, -4.879e-02, -9.473e-01, 6.527e-01, -6.118e-01, -2.436e-01, -3.017e-02, -3.402e-01, 1.343e-01, 9.397e-02));
|
||||
r += mul(s1_1, M4(-1.330e-01, 2.557e-01, 6.838e-02, -3.936e-01, 4.806e-01, 1.828e-01, 5.073e-01, 4.502e-01, -1.404e+00, -2.954e-01, -6.745e-02, 5.594e-02, 2.640e-01, 2.330e-02, 1.331e-02, -2.700e-02));
|
||||
r += mul(s1_2, M4(2.695e-01, -1.004e-01, 9.104e-02, -4.919e-01, 3.357e-01, 4.895e-02, 4.062e-01, -3.494e-02, -4.352e-01, -1.232e-01, 8.889e-03, 3.472e-01, -1.174e-01, 7.690e-02, 6.341e-02, 9.255e-02));
|
||||
r += mul(s1_3, M4(1.805e-01, 2.494e-01, 3.474e-02, 3.930e-02, 2.671e-02, -1.438e-02, 7.294e-02, 4.854e-02, -2.864e+00, -5.832e-01, 4.350e-01, -4.265e-01, -2.643e-02, -6.234e-01, 1.283e-01, 5.168e-02));
|
||||
r += mul(s1_4, M4(-2.192e-01, 2.982e-01, -2.860e-01, -4.050e-01, 8.612e-02, 5.008e-02, 5.366e-01, 5.256e-01, -6.222e-01, 1.169e+00, 1.897e+00, 3.009e+00, 9.105e-02, -2.369e-01, -4.718e-01, -2.725e-01));
|
||||
r += mul(s1_5, M4(-7.441e-01, -1.820e-01, -5.828e-02, -6.348e-01, 5.721e-01, 1.143e-01, 2.871e-01, 3.254e-01, -1.446e-01, 1.446e-01, -8.526e-02, 7.228e-01, -9.749e-02, -1.665e-01, -1.116e-01, -2.705e-01));
|
||||
r += mul(s1_6, M4(-6.357e-02, -2.576e-02, 1.277e-02, -3.956e-02, 2.724e-02, -2.141e-02, 9.778e-02, 7.199e-03, -1.153e+00, -6.945e-01, -4.788e-01, -1.246e+00, 1.909e-01, 1.315e-01, 4.454e-02, 2.678e-01));
|
||||
r += mul(s1_7, M4(-1.022e-01, 1.572e-01, 9.404e-02, 6.768e-02, 2.191e-01, -3.163e-02, 1.257e-01, 1.058e-01, -6.394e-01, 7.223e-03, -6.930e-01, -2.963e-01, -2.666e-01, 3.461e-03, 2.203e-01, -1.212e-01));
|
||||
r += mul(s1_8, M4(-1.179e-01, 7.311e-02, 1.371e-01, -4.039e-02, 2.171e-01, 3.131e-02, 2.219e-01, 1.564e-02, -4.895e-01, -5.067e-03, -4.528e-01, 5.694e-02, 6.858e-02, 6.808e-03, -1.017e-01, 6.675e-03));
|
||||
// Constant per-channel offset added after all 18 products.
r += V4(-8.341e-03, 1.434e-02, 5.791e-03, -1.033e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point: evaluates f0 on a 3x3 neighbourhood fetched through the
// l0 macro and stores the result in t0 at this thread's pixel.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
// NOTE(review): pt and pos are never named again in this body; they are
// presumably captured by the l0 sampling macro defined for this pass - confirm.
void Pass3(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel falls outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the centre).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* holds -max(-tap, 0). It must be derived before s0_* is clamped in
// place below, because the clamp overwrites the raw tap.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is reduced to max(tap, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 4: each of the 18 V4 inputs is multiplied (mul) by its
// own 16-element M4 constant, the products are accumulated into r in the order
// written, and a constant V4 is added last. Returns r unchanged.
// Pass4 passes max(tap, 0) values as s0_* and -max(-tap, 0) values as s1_*.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-6.123e-02, 9.666e-03, 4.969e-02, 3.030e-02, 1.714e-02, -3.117e-02, -9.470e-02, 2.078e-03, 4.109e-02, -5.560e-02, 3.757e-02, -3.667e-03, -3.500e-02, -8.151e-02, 1.104e-01, -1.219e-01));
|
||||
r += mul(s0_1, M4(9.596e-02, -6.361e-02, 1.162e-02, -3.138e-02, -1.277e-02, -4.005e-02, 1.805e-02, -1.459e-02, -7.903e-03, 1.138e-02, 1.542e-02, -2.357e-02, -1.421e-01, -2.953e-01, 1.322e-01, 6.480e-03));
|
||||
r += mul(s0_2, M4(1.571e-01, -1.081e-01, 1.345e-01, -5.616e-02, -1.211e-02, 4.515e-02, 1.797e-02, 6.143e-02, -9.605e-02, 7.782e-02, -1.421e-01, 3.195e-02, 1.841e-01, -7.735e-02, 1.082e-01, 1.785e-02));
|
||||
r += mul(s0_3, M4(1.739e-03, -4.187e-02, 1.093e-01, 1.042e-01, -6.538e-03, 5.025e-02, -7.052e-03, -1.033e-01, -1.394e-01, -4.638e-01, 4.354e-02, -1.188e-02, 7.809e-04, 2.484e-01, -8.330e-01, -2.787e-01));
|
||||
r += mul(s0_4, M4(-6.489e-03, -6.309e-01, 7.169e-01, 1.557e-01, 1.478e-01, 2.977e-01, -2.818e-01, 5.129e-02, 7.598e-01, 8.124e-01, -1.262e-02, -1.325e-01, -2.764e-01, 3.485e-01, 4.717e-01, -2.467e-01));
|
||||
r += mul(s0_5, M4(2.022e-02, -1.396e-01, 1.865e-01, 1.568e-02, 3.924e-01, -2.466e-01, 4.990e-01, 3.971e-02, -1.176e-01, 1.792e-01, -2.861e-01, 3.555e-02, -1.428e-01, 2.528e-01, -2.085e-01, -1.311e-01));
|
||||
r += mul(s0_6, M4(3.340e-02, -1.203e-01, 1.014e-01, 1.154e-01, -9.031e-03, -5.586e-02, -5.700e-03, 2.391e-02, -3.509e-01, 6.729e-02, 1.004e-01, -3.277e-01, 1.026e-01, 3.286e-03, -6.603e-02, -3.238e-03));
|
||||
r += mul(s0_7, M4(-6.854e-01, 1.013e-01, -6.298e-02, -5.464e-01, 2.486e-01, -2.186e-01, 3.986e-02, 3.800e-01, -1.267e-01, 1.037e-01, 1.538e-01, -2.069e-01, 9.431e-02, 5.337e-02, -8.507e-02, 2.015e-01));
|
||||
r += mul(s0_8, M4(-5.009e-03, 1.493e-01, -3.010e-02, -2.429e-02, -3.137e-01, -2.276e-01, 1.556e-01, 1.452e-02, 2.063e-01, 3.699e-02, -1.675e-03, 8.221e-02, -6.732e-02, 8.296e-02, -8.474e-02, -1.458e-01));
|
||||
r += mul(s1_0, M4(-3.003e-02, -9.777e-03, 1.239e-02, -3.907e-02, 1.841e-01, -8.959e-02, 9.257e-02, 1.333e-01, 5.703e-04, -1.367e-01, -1.026e-01, 6.398e-02, 1.262e-02, 1.101e-02, 4.291e-02, -4.238e-02));
|
||||
r += mul(s1_1, M4(5.516e-02, 9.884e-04, -5.383e-02, -1.048e-02, 2.529e-01, 9.819e-02, 1.255e-01, 3.149e-02, -8.249e-02, -1.386e-02, 6.214e-02, 2.957e-02, 1.001e-01, 1.590e-01, 1.159e-02, 5.273e-02));
|
||||
r += mul(s1_2, M4(4.571e-02, -6.277e-03, 1.496e-01, -4.044e-02, 4.089e-02, -3.801e-02, -3.690e-02, -1.037e-01, -6.031e-02, 2.117e-03, -9.644e-02, 6.392e-02, 5.093e-02, -2.512e-02, 1.131e-01, 1.304e-01));
|
||||
r += mul(s1_3, M4(-3.118e-02, 2.185e-02, 1.763e-01, 8.327e-02, 6.337e-02, 8.724e-02, 6.808e-02, -4.070e-01, -6.922e-02, -2.417e-01, -1.175e-01, -1.845e-01, -3.773e-03, -1.869e-01, -9.345e-02, -2.340e-01));
|
||||
r += mul(s1_4, M4(-1.159e-01, -4.476e-01, 2.989e-01, 2.794e-01, 5.756e-01, -4.803e-01, -5.979e-02, -1.959e-01, 5.261e-02, -2.399e-01, -6.616e-02, -9.243e-01, 4.622e-01, 1.139e-01, 2.482e-01, 2.254e-01));
|
||||
r += mul(s1_5, M4(1.064e-01, -1.989e-02, 8.581e-02, 3.218e-02, 3.344e-01, -5.684e-01, 4.009e-01, 4.482e-01, 7.737e-02, 8.716e-02, -1.382e-01, -7.145e-02, -1.225e-01, 1.471e-01, -1.866e-01, 3.674e-02));
|
||||
r += mul(s1_6, M4(5.376e-02, -6.192e-03, -1.760e-01, 7.590e-02, -3.279e-02, -1.888e-01, 2.057e-01, 2.114e-01, -3.941e-01, 5.584e-03, 9.400e-03, -4.289e-01, -2.289e-01, 1.880e-01, 3.184e-02, -4.442e-01));
|
||||
r += mul(s1_7, M4(-4.174e-01, -1.344e-01, 3.866e-02, 4.521e-02, -4.215e-01, 1.479e-01, 2.476e-01, -7.051e-01, -4.153e-01, 3.373e-01, 8.098e-02, -6.680e-01, 3.920e-01, -1.023e-01, -2.166e-02, 3.816e-01));
|
||||
r += mul(s1_8, M4(-3.441e-02, 3.404e-03, -4.958e-02, 9.652e-03, -1.930e-02, -2.470e-01, 1.610e-01, 1.112e-01, 2.574e-02, 2.310e-01, 3.643e-02, -5.044e-02, 7.788e-02, 1.923e-03, -7.115e-02, -6.575e-03));
|
||||
// Constant offset added after all 18 products.
r += V4(1.370e-02, 1.151e-02, 2.567e-03, -1.881e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 4 entry point: fetches a 3x3 neighbourhood of t0 through the l0 macro,
// evaluates f0 on it, and stores the result in t1 at this thread's pixel.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
// NOTE(review): pt and pos are never named again in this body; they are
// presumably captured by the O macro that l0 expands to - confirm.
void Pass4(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel falls outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the centre).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* holds -max(-tap, 0). It must be derived before s0_* is clamped in
// place below, because the clamp overwrites the raw tap.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is reduced to max(tap, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC conv4
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 5: each of the 18 V4 inputs is multiplied (mul) by its
// own 16-element M4 constant, the products are accumulated into r in the order
// written, and a constant V4 is added last. Returns r unchanged.
// Pass5 passes max(tap, 0) values as s0_* and -max(-tap, 0) values as s1_*.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.376e-02, 2.931e-02, 7.304e-02, -5.238e-02, -6.500e-03, -3.887e-02, 2.506e-02, 5.201e-03, 5.599e-02, -1.951e-01, -3.847e-01, 8.685e-02, -1.106e-01, -3.954e-02, 1.571e-01, 2.293e-02));
|
||||
r += mul(s0_1, M4(-2.738e-02, 1.554e-01, 1.120e-01, 1.856e-02, 9.513e-03, -2.222e-01, -2.174e-01, -1.065e-02, 3.001e-02, 7.638e-02, -7.497e-02, -2.727e-02, -1.521e-02, 1.843e-01, 3.547e-01, -1.642e-02));
|
||||
r += mul(s0_2, M4(-2.533e-02, -1.959e-02, -6.274e-02, 8.121e-03, -8.703e-03, 5.091e-02, 6.548e-02, 1.988e-02, 4.089e-02, -4.827e-02, -4.089e-02, -4.361e-02, -1.112e-02, -1.101e-02, 2.968e-02, -2.196e-03));
|
||||
r += mul(s0_3, M4(1.813e-02, -2.087e-01, -2.474e-01, -1.066e-01, 2.549e-01, 6.466e-01, 3.169e-01, -1.109e-01, -1.551e-02, -3.119e-01, -3.959e-01, 2.141e-01, 1.121e-01, 3.268e-01, 1.038e-01, -5.818e-02));
|
||||
r += mul(s0_4, M4(-3.147e-01, 2.716e-01, 1.304e-01, 3.887e-01, 9.396e-02, -9.787e-02, -1.596e-01, -7.138e-02, -2.462e-01, -3.027e-01, 6.980e-01, -1.546e-01, 3.730e-02, -7.502e-02, -4.408e-02, 3.814e-02));
|
||||
r += mul(s0_5, M4(-4.177e-02, -1.326e-02, -7.497e-02, 1.168e-03, 5.595e-03, 3.603e-02, 2.589e-02, -2.179e-02, 1.998e-02, -3.544e-03, 1.125e-01, 2.648e-03, -2.417e-02, -1.876e-02, 4.009e-02, 5.481e-02));
|
||||
r += mul(s0_6, M4(-7.181e-02, -2.968e-02, -3.169e-02, -1.899e-02, -3.692e-02, -2.156e-02, 9.595e-02, 1.055e-01, -1.274e-01, -2.576e-02, 8.706e-02, 1.895e-01, 6.316e-04, -4.574e-02, 2.201e-02, 1.199e-01));
|
||||
r += mul(s0_7, M4(-2.193e-01, 1.563e-02, 1.287e-01, 2.403e-01, 2.222e-01, -1.748e-02, 1.486e-02, -7.685e-02, 4.971e-01, 2.920e-01, -2.253e-01, -8.145e-01, 3.018e-01, -4.559e-02, -1.509e-01, -3.003e-01));
|
||||
r += mul(s0_8, M4(1.685e-02, -1.082e-02, 3.539e-03, -2.765e-02, -5.968e-03, -4.628e-03, 3.847e-02, 6.426e-02, -6.284e-02, 5.455e-02, -3.291e-02, 1.636e-01, 5.828e-02, -5.613e-02, -4.404e-02, -1.715e-02));
|
||||
r += mul(s1_0, M4(1.875e-02, 7.150e-02, 3.015e-02, -4.917e-02, 9.333e-03, -1.519e-01, -1.153e-01, 4.344e-02, -1.603e-02, -4.775e-02, -4.484e-02, 6.567e-02, -6.714e-02, 2.569e-01, 4.638e-01, 3.038e-02));
|
||||
r += mul(s1_1, M4(-4.046e-02, 1.372e-01, 2.476e-01, 6.565e-02, 6.481e-04, -1.529e-02, 1.376e-02, 1.367e-02, 2.941e-04, 1.423e-01, 2.311e-01, 7.538e-03, -6.762e-02, -3.992e-01, -1.160e-02, 3.123e-02));
|
||||
r += mul(s1_2, M4(-3.926e-02, 1.709e-04, -4.761e-02, -8.731e-03, 5.123e-03, 7.039e-02, 1.061e-01, -1.322e-03, 4.069e-02, -1.182e-01, -3.698e-04, -7.746e-02, -3.827e-02, 9.957e-02, 9.991e-02, 5.215e-02));
|
||||
r += mul(s1_3, M4(-1.865e-01, -9.784e-01, -5.871e-01, 1.384e-01, 2.097e-01, -1.229e-01, -4.912e-01, -4.254e-02, 3.395e-04, -8.968e-02, -6.923e-02, -4.916e-02, 2.424e-01, 7.730e-01, 2.573e-01, -2.380e-01));
|
||||
r += mul(s1_4, M4(-9.293e-01, 6.176e-01, 1.970e-01, 3.467e-01, 4.341e-01, 9.866e-01, 3.035e-01, -1.062e-01, -1.501e-01, 2.709e-01, 1.991e-01, -2.164e-01, 2.881e-01, -1.696e-01, -4.141e-01, -1.004e+00));
|
||||
r += mul(s1_5, M4(-8.323e-02, -1.285e-02, -3.468e-02, 1.551e-01, 1.330e-01, -1.238e-01, -1.675e-03, 5.588e-02, 2.128e-01, -2.327e-01, -2.891e-02, 1.567e-01, -1.448e-01, 8.781e-02, 3.254e-02, 7.142e-02));
|
||||
r += mul(s1_6, M4(1.231e-01, 5.139e-02, -9.426e-02, -2.822e-01, 1.761e-03, 6.853e-03, 1.165e-01, 7.861e-02, -9.715e-03, 5.489e-03, -1.066e-02, -8.332e-03, -9.111e-02, 3.911e-02, 1.757e-01, 2.222e-01));
|
||||
r += mul(s1_7, M4(2.275e-02, 1.199e-01, 5.904e-02, -2.051e-01, 6.950e-01, 1.592e-02, -9.888e-02, -6.701e-01, -9.096e-02, 3.203e-02, 1.204e-01, 2.153e-01, 1.448e-01, -5.225e-03, 6.786e-02, 2.005e-02));
|
||||
r += mul(s1_8, M4(-3.290e-02, -3.758e-02, -3.158e-02, 8.713e-02, 3.917e-02, 4.275e-02, -2.450e-02, 3.970e-02, 1.928e-01, 5.498e-02, -5.673e-02, -3.743e-01, 4.981e-02, -1.785e-02, 1.958e-02, 3.487e-02));
|
||||
// Constant offset added after all 18 products.
r += V4(7.249e-03, 2.949e-03, 5.297e-03, 3.693e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 5 entry point: fetches a 3x3 neighbourhood of t1 through the l0 macro,
// evaluates f0 on it, and stores the result in t0 at this thread's pixel.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
// NOTE(review): pt and pos are never named again in this body; they are
// presumably captured by the O macro that l0 expands to - confirm.
void Pass5(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel falls outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the centre).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* holds -max(-tap, 0). It must be derived before s0_* is clamped in
// place below, because the clamp overwrites the raw tap.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is reduced to max(tap, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 6
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 6: each of the 18 V4 inputs is multiplied (mul) by its
// own 16-element M4 constant, the products are accumulated into r in the order
// written, and a constant V4 is added last. Unlike the earlier passes, the
// result is passed through tanh before being returned.
// Pass6 passes max(tap, 0) values as s0_* and -max(-tap, 0) values as s1_*.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.340e-02, 8.171e-02, -1.124e-01, -5.065e-02, -5.505e-02, -5.540e-02, -3.000e-03, -1.346e-02, 3.800e-02, 4.944e-02, -2.084e-02, 6.388e-03, 8.566e-02, 2.480e-02, 1.184e-01, -1.075e-04));
|
||||
r += mul(s0_1, M4(-2.188e-02, -2.056e-01, 1.480e-02, -7.451e-02, 5.240e-02, 4.098e-02, -4.668e-03, 1.810e-02, -2.533e-02, -6.403e-02, 1.984e-02, -5.716e-02, -3.356e-03, -2.173e-01, 1.218e-01, 1.179e-01));
|
||||
r += mul(s0_2, M4(7.330e-03, 2.521e-02, 1.372e-02, 3.411e-02, -1.438e-02, -1.009e-02, 7.676e-03, -1.712e-02, 5.980e-03, 2.040e-02, -8.766e-03, 3.442e-02, -1.623e-02, -2.557e-02, -6.086e-03, 5.413e-04));
|
||||
r += mul(s0_3, M4(1.754e-01, 6.364e-02, 2.842e-01, 2.378e-01, -1.684e-01, -1.911e-02, -3.838e-01, -2.622e-02, 2.065e-01, 3.951e-02, 4.217e-01, 4.374e-02, -1.028e-02, 2.417e-02, -1.595e-02, 6.305e-02));
|
||||
r += mul(s0_4, M4(-5.620e-02, -8.609e-02, -1.256e-01, -3.166e-01, -1.712e-01, -1.602e-01, -1.577e-01, -4.901e-01, -5.012e-02, 1.082e-01, -7.271e-02, 4.072e-01, -7.789e-02, -1.725e-01, -1.397e-01, -4.507e-01));
|
||||
r += mul(s0_5, M4(1.401e-02, 4.716e-02, 1.486e-02, 4.642e-02, 1.131e-02, 3.865e-02, -9.865e-03, 9.301e-02, 3.441e-03, -8.098e-03, -6.012e-03, -1.549e-01, 1.486e-02, 1.872e-02, -2.469e-03, 1.294e-02));
|
||||
r += mul(s0_6, M4(-3.894e-02, -4.136e-05, -3.022e-02, 1.045e-03, -3.730e-02, -1.838e-02, -5.573e-02, -2.760e-02, 3.516e-02, 1.602e-02, 6.358e-02, 3.111e-02, -3.045e-02, -7.728e-03, -4.189e-02, -1.102e-02));
|
||||
r += mul(s0_7, M4(-1.184e-02, 1.728e-02, 7.925e-03, 6.763e-02, 2.590e-03, -9.456e-03, -4.407e-02, -2.044e-02, 4.472e-02, 2.228e-02, 7.233e-02, 4.863e-02, -1.814e-02, -2.034e-03, -4.994e-02, -2.460e-02));
|
||||
r += mul(s0_8, M4(-3.292e-03, -9.015e-03, -3.171e-03, -2.504e-02, 2.120e-03, 3.064e-02, 2.108e-02, 4.592e-02, 2.258e-03, -2.192e-04, -3.576e-03, 3.733e-02, -1.931e-03, -5.083e-03, 5.877e-03, -1.764e-02));
|
||||
r += mul(s1_0, M4(4.321e-02, -8.135e-02, -1.567e-01, -6.888e-03, -6.542e-02, -1.656e-02, 1.236e-02, -7.563e-03, 4.657e-02, 9.222e-03, -6.696e-03, -3.545e-03, -6.401e-01, 1.189e-01, 1.509e-01, 2.417e-01));
|
||||
r += mul(s1_1, M4(-2.058e-02, 1.174e-01, -2.482e-02, -8.423e-02, -1.692e-02, -1.094e-02, 3.530e-02, 1.780e-02, -9.937e-02, -9.030e-02, 2.304e-02, 1.294e-02, 7.976e-02, -3.096e-01, 1.382e-01, 2.456e-01));
|
||||
r += mul(s1_2, M4(4.491e-02, -1.336e-02, 3.593e-02, -3.503e-02, -8.630e-03, -4.295e-03, -1.356e-02, 3.843e-02, 9.887e-03, 1.913e-03, 2.247e-03, 1.113e-02, -7.234e-04, -3.058e-02, 2.833e-03, -1.707e-02));
|
||||
r += mul(s1_3, M4(2.007e-01, 6.756e-02, 9.393e-01, 9.057e-02, -3.701e-01, -1.729e-02, -4.136e-01, 2.233e-02, 2.783e-01, 3.590e-02, 3.564e-01, 8.342e-03, 1.333e-01, 7.944e-02, -2.312e-01, 8.354e-02));
|
||||
r += mul(s1_4, M4(-3.334e-01, -2.705e-01, -4.072e-01, 3.946e-01, 5.159e-03, -5.860e-01, 1.578e-01, -3.614e-01, 5.366e-01, 4.699e-01, -3.700e-01, 9.463e-02, -4.090e-02, -9.767e-02, -7.999e-02, -4.859e-01));
|
||||
r += mul(s1_5, M4(5.700e-02, 6.092e-02, 4.114e-02, -1.564e-02, -1.345e-02, 9.692e-02, 1.456e-03, 9.371e-02, -3.845e-02, -4.751e-02, -2.509e-02, -2.842e-01, 2.938e-03, 2.387e-02, -6.191e-04, -3.120e-04));
|
||||
r += mul(s1_6, M4(3.888e-02, 4.969e-02, -1.851e-01, -9.866e-03, -3.527e-02, -1.377e-02, -7.594e-02, -2.619e-02, 3.259e-02, 9.636e-03, 8.622e-03, 1.788e-02, -3.505e-02, -1.048e-03, -1.329e-02, 1.425e-02));
|
||||
r += mul(s1_7, M4(6.891e-03, 8.118e-02, -6.443e-02, -1.487e-01, 2.183e-02, 1.106e-03, 6.656e-02, -9.506e-02, 7.418e-04, -6.015e-02, 3.594e-01, 1.039e-02, -3.600e-02, -7.771e-03, -3.406e-02, 2.935e-02));
|
||||
r += mul(s1_8, M4(-4.598e-03, -4.678e-03, 1.595e-02, -8.273e-03, 6.740e-03, 1.175e-02, -2.997e-02, -6.116e-03, -3.788e-02, -9.471e-02, -2.149e-02, 4.139e-02, -9.614e-03, -5.573e-03, -1.643e-02, -1.712e-02));
|
||||
// Constant offset added after all 18 products.
r += V4(2.510e-03, 4.409e-03, 2.891e-03, 4.977e-03);
|
||||
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass 6 entry point ("out-shuffle"): evaluates f0 once on a 3x3 neighbourhood
// of t0, then writes a 2x2 quad of OUTPUT pixels, using one component of the
// f0 result per pixel.
//   blockStart - added to (Rmp8x8(tid.x) << 1) to form gxy, the quad's first pixel.
//   tid        - only tid.x is read.
// For each of the four pixels, INPUT is sampled with SL, multiplied by rgb2yuv,
// the first component is offset by one component of r and saturated, and the
// result is multiplied by yuv2rgb; the other two components pass through unchanged.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
// NOTE(review): pt is not named again in this body and pos is otherwise only
// used for the INPUT samples; both are presumably also captured by the O macro
// that l0 expands to - confirm.
void Pass6(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
// The per-thread offset is doubled: each thread owns a 2x2 block of output pixels.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
|
||||
// Bounds are checked against the output size here, not the input size.
uint2 size = GetOutputSize();
|
||||
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// gxy is halved before being scaled by pt.
float2 pos = ((gxy >> 1) + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the centre).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* holds -max(-tap, 0). It must be derived before s0_* is clamped in
// place below, because the clamp overwrites the raw tap.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is reduced to max(tap, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
|
||||
// NOTE(review): going by their names these are RGB<->YUV conversion matrices;
// which YUV standard the coefficients follow is not stated here - confirm.
static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
|
||||
static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
|
||||
float2 opt = float2(GetOutputPt());
|
||||
|
||||
// Pixel (gxy.x, gxy.y): pos is shifted back by half of opt; uses r.x.
pos -= 0.5f * opt;
|
||||
float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);
|
||||
|
||||
// Pixel (gxy.x + 1, gxy.y): uses r.y.
++gxy.x;
|
||||
pos.x += opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);
|
||||
|
||||
// Pixel (gxy.x + 1, gxy.y + 1): uses r.w, not r.z - the quad is walked
// right, down, then left.
++gxy.y;
|
||||
pos.y += opt.y;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);
|
||||
|
||||
// Pixel (gxy.x, gxy.y + 1): uses r.z.
--gxy.x;
|
||||
pos.x -= opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
|
||||
}
|
||||
486
src/Effects/CuNNy/CuNNy-4x4C-NVL.hlsl
Normal file
486
src/Effects/CuNNy/CuNNy-4x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
// CuNNy 4x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-D04N04
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) min16float((dot(float3(-4.174e-01, -7.873e-01, -1.763e-01), O(INPUT, float2(x, y)).rgb) + 1.011e+00))
|
||||
|
||||
// Weighted sum for pass 1: each of the nine scalar inputs scales its own
// constant V4, the products are accumulated into r in the order written, and a
// constant V4 is added last. Returns r unchanged.
// Pass1 supplies the nine values produced by this pass's l0 macro.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(1.222e-01, 7.038e-03, 1.179e-01, 1.876e-01) * s0_0;
|
||||
r += V4(1.025e-01, -2.993e-01, 3.154e-01, -1.050e-01) * s0_1;
|
||||
r += V4(5.656e-02, -3.117e-03, -6.665e-02, -2.044e-01) * s0_2;
|
||||
r += V4(-5.045e-01, -4.189e-01, -3.076e-01, -3.691e-01) * s0_3;
|
||||
r += V4(1.365e-01, 6.699e-01, 3.389e-01, 4.561e-01) * s0_4;
|
||||
r += V4(-7.690e-02, 2.655e-02, -1.044e-02, 7.271e-02) * s0_5;
|
||||
r += V4(1.358e-02, 3.378e-03, -1.802e-01, -1.936e-01) * s0_6;
|
||||
r += V4(8.227e-02, 1.550e-02, -1.820e-01, -1.670e-01) * s0_7;
|
||||
r += V4(9.988e-03, 1.413e-03, -2.486e-02, 3.258e-01) * s0_8;
|
||||
// Constant offset added after all nine products.
r += V4(3.566e-02, -1.308e-03, -5.595e-03, -5.246e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 entry point: reduces a 3x3 neighbourhood of INPUT to nine scalars via
// the l0 macro, evaluates f0 on them, and stores the result in t0 at this
// thread's pixel.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// pt and pos are not named again in this body, but they are required: the O
// macro used by l0 samples at `pos + p * pt` with the point sampler SP.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
void Pass1(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel falls outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the
// centre). Each is the dot product of the sampled rgb with a constant float3,
// plus a constant, as defined by this pass's l0 macro.
min16float s0_0 = l0(-1.0, -1.0);
|
||||
min16float s0_1 = l0(0.0, -1.0);
|
||||
min16float s0_2 = l0(1.0, -1.0);
|
||||
min16float s0_3 = l0(-1.0, 0.0);
|
||||
min16float s0_4 = l0(0.0, 0.0);
|
||||
min16float s0_5 = l0(1.0, 0.0);
|
||||
min16float s0_6 = l0(-1.0, 1.0);
|
||||
min16float s0_7 = l0(0.0, 1.0);
|
||||
min16float s0_8 = l0(1.0, 1.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
|
||||
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 2: each of the 18 V4 inputs is multiplied (mul) by its
// own constant M4 (min16float4x4), the products are accumulated into r in the
// order written, and a constant V4 is added last. Returns r unchanged.
// Pass2 passes max(tap, 0) values as s0_* and -max(-tap, 0) values as s1_*.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.282e-01, 1.199e-01, 1.156e-01, -4.091e-02, -1.771e-02, -1.431e-01, -1.478e-02, 4.041e-02, -1.559e-01, 1.231e-02, -8.571e-02, 2.159e-02, -6.484e-02, 3.819e-02, -3.386e-02, -3.344e-02));
|
||||
r += mul(s0_1, M4(6.131e-02, 1.493e-01, 1.954e-01, -2.565e-01, 1.570e-01, -3.852e-01, -2.313e-01, 9.262e-02, 1.038e-01, -4.169e-01, -2.446e-01, 9.953e-02, -1.830e-01, -9.774e-02, -1.498e-01, 8.626e-02));
|
||||
r += mul(s0_2, M4(9.908e-02, 1.372e-01, -1.254e-02, 4.486e-03, 1.023e-01, 6.484e-02, 1.645e-01, -4.932e-02, -4.221e-02, -1.919e-01, -2.135e-02, 6.955e-02, -1.406e-01, 8.082e-02, -7.935e-02, 3.010e-02));
|
||||
r += mul(s0_3, M4(-7.203e-02, -1.210e-01, 1.084e-01, -6.958e-03, 1.303e-01, 1.030e-01, -2.392e-01, -1.084e-01, 2.173e-01, -7.864e-02, -2.983e-01, -3.510e-01, -3.076e-01, 4.533e-02, 1.940e-01, 4.051e-01));
|
||||
r += mul(s0_4, M4(9.270e-02, -4.072e-01, 2.338e-01, 4.098e-01, -1.440e-01, 6.971e-01, 5.515e-01, 2.682e-01, -1.401e-01, 3.504e-02, 1.366e-01, 6.149e-01, -3.330e-01, 1.880e-01, -4.170e-01, 3.244e-01));
|
||||
r += mul(s0_5, M4(-5.380e-01, -7.843e-02, -1.293e-01, -9.225e-02, 1.393e-01, -2.588e-01, 4.618e-01, -2.264e-02, -5.369e-02, 1.321e-01, -3.029e-02, 7.983e-02, -1.048e-01, 3.279e-02, -5.969e-02, -3.766e-03));
|
||||
r += mul(s0_6, M4(3.432e-02, 1.518e-02, 1.940e-02, -1.086e-01, 1.052e-01, -5.430e-02, -3.343e-02, 1.824e-01, -9.831e-02, 1.097e-02, 6.281e-02, 1.194e-01, 3.253e-02, 4.046e-02, -2.183e-02, -1.328e-01));
|
||||
r += mul(s0_7, M4(1.538e-01, 6.796e-02, -4.870e-01, 7.139e-02, -2.497e-01, 2.916e-02, 6.191e-01, -2.650e-01, -4.194e-02, 1.782e-01, -3.431e-01, -9.707e-02, 2.173e-02, -1.150e-01, -8.162e-03, 4.551e-02));
|
||||
r += mul(s0_8, M4(5.804e-02, 5.436e-02, -1.604e-01, 8.077e-02, 2.685e-01, 4.741e-02, 1.225e-01, -1.033e-01, -4.358e-02, -1.091e-01, 8.815e-02, -3.121e-02, -2.569e-02, -1.093e-02, -2.550e-02, -1.571e-02));
|
||||
r += mul(s1_0, M4(8.760e-02, 1.254e-01, 9.299e-02, -1.140e-02, 4.179e-02, -1.333e-01, 3.048e-03, -3.111e-02, -6.091e-02, 6.563e-03, 4.609e-03, -4.717e-02, -6.470e-02, -5.791e-02, -5.529e-03, 8.697e-02));
|
||||
r += mul(s1_1, M4(6.935e-02, 9.805e-02, 1.851e-01, -2.726e-01, 1.731e-01, -2.863e-01, -2.267e-01, -3.813e-02, 1.104e-01, -3.193e-01, -1.958e-01, 9.567e-02, 1.819e-01, -2.054e-01, 1.228e-01, 3.906e-02));
|
||||
r += mul(s1_2, M4(-1.957e-01, 7.733e-02, -2.023e-01, 1.297e-01, -1.646e-01, 1.304e-01, -1.728e-02, -4.396e-02, 7.828e-02, -2.639e-01, 3.389e-02, 1.101e-01, 1.388e-01, -4.075e-03, 1.023e-01, -7.785e-03));
|
||||
r += mul(s1_3, M4(-2.828e-02, -7.018e-02, 4.269e-02, -1.386e-01, 2.143e-02, 2.504e-01, -2.134e-01, -2.483e-01, 1.075e-01, -2.671e-02, -2.588e-01, -3.271e-01, 1.173e-01, -6.103e-02, 5.539e-01, 5.341e-01));
|
||||
r += mul(s1_4, M4(-2.415e-01, -2.975e-01, -6.622e-02, 4.027e-01, -5.871e-01, 7.506e-01, 1.939e-02, -1.680e-01, 4.796e-01, -2.840e-01, 5.077e-01, 9.122e-02, 1.463e-01, 2.124e-01, 6.358e-02, 2.993e-01));
|
||||
r += mul(s1_5, M4(4.298e-01, -1.754e-01, 5.357e-01, -1.440e-01, -4.439e-01, -3.819e-01, -1.009e-01, 2.113e-02, -2.275e-02, -1.842e-02, 1.441e-01, 6.590e-03, 2.627e-02, 3.381e-02, 9.956e-02, -1.935e-02));
|
||||
r += mul(s1_6, M4(-5.557e-02, 3.378e-02, -2.451e-02, -1.718e-01, -2.037e-01, 1.631e-02, -2.822e-01, -7.724e-02, -6.657e-02, -2.282e-02, 2.673e-02, 8.716e-02, 1.291e-01, 9.472e-03, 3.810e-02, -1.134e-01));
|
||||
r += mul(s1_7, M4(1.441e-01, 4.331e-02, -4.741e-01, 2.165e-01, -5.974e-01, -2.669e-02, -4.949e-02, -3.179e-01, 1.007e-01, 1.512e-01, -4.138e-02, -7.470e-02, 8.828e-02, -1.400e-01, 5.797e-02, -4.988e-03));
|
||||
r += mul(s1_8, M4(-2.478e-01, 1.392e-01, -8.663e-02, -3.629e-02, 1.823e-01, 7.573e-03, -2.445e-01, -1.641e-02, -5.197e-02, -8.804e-02, 1.244e-01, 2.095e-02, 1.683e-02, -4.073e-02, -5.207e-03, -3.854e-03));
|
||||
// Constant offset added after all 18 products.
r += V4(-4.317e-03, 2.687e-03, -1.530e-03, 4.681e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point: fetches a 3x3 neighbourhood of t0 through the l0 macro,
// evaluates f0 on it, and stores the result in t1 at this thread's pixel.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// pt and pos are not named again in this body, but they are required: the O
// macro used by l0 samples at `pos + p * pt` with the point sampler SP.
// NOTE(review): Rmp8x8 presumably maps the flat thread index to an (x, y)
// offset inside the block - confirm against the Magpie built-ins.
void Pass2(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel falls outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps at offsets -1..1 in x and y, numbered row by row (s0_4 is the centre).
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_* holds -max(-tap, 0). It must be derived before s0_* is clamped in
// place below, because the clamp overwrites the raw tap.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* is reduced to max(tap, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 3: each of the 18 V4 inputs is multiplied (mul) by its
// own constant M4 (min16float4x4), the products are accumulated into r in the
// order written, and a constant V4 is added last. Returns r unchanged.
// Pass3 passes max(tap, 0) values as s0_* and -max(-tap, 0) values as s1_*.
// NOTE(review): the constants look like machine-generated network weights;
// presumably they should be regenerated rather than hand-edited - confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.921e-01, -2.132e-02, -5.460e-03, -6.681e-02, 9.988e-02, -2.228e-02, 4.719e-02, 9.124e-03, -1.072e-01, 1.506e-01, 2.070e-02, -4.671e-02, 2.244e-01, -4.895e-02, -8.150e-03, -9.520e-02));
|
||||
r += mul(s0_1, M4(8.226e-02, 4.651e-02, -1.842e-01, -3.376e-02, 1.349e-01, 2.148e-02, -1.746e-01, 1.671e-02, 9.761e-02, 7.581e-02, 1.470e-01, -8.582e-02, -1.149e-01, 2.143e-02, -1.597e-01, 1.626e-01));
|
||||
r += mul(s0_2, M4(-5.810e-04, -3.566e-02, 4.708e-02, -3.068e-02, 1.578e-02, 5.503e-03, 3.081e-02, -4.174e-02, 3.394e-01, 7.398e-02, -9.467e-02, -1.127e-01, -1.314e-01, 1.511e-02, 1.538e-01, -5.695e-03));
|
||||
r += mul(s0_3, M4(2.959e-01, 3.316e-02, -5.716e-02, -2.233e-01, 5.020e-01, -1.416e-01, -6.082e-02, -3.393e-01, 3.292e-01, -6.813e-02, 9.009e-02, -1.638e-01, 1.190e-01, -2.728e-02, -6.042e-02, -1.360e-01));
|
||||
r += mul(s0_4, M4(5.902e-01, 3.040e-01, -2.870e-01, 2.228e-02, -1.646e-01, 2.078e-02, -1.480e-01, 2.083e-01, -4.397e-01, -2.549e-01, -1.168e-01, -4.199e-01, 2.199e-01, 2.596e-02, 2.598e-02, -1.313e-01));
|
||||
r += mul(s0_5, M4(1.043e-01, 1.050e-02, -5.654e-02, -1.265e-01, -1.978e-01, 3.772e-02, 2.474e-01, 1.395e-01, 2.041e-01, 6.617e-02, -2.602e-01, -1.601e-01, -5.577e-02, -1.591e-02, 2.096e-01, 2.594e-02));
|
||||
r += mul(s0_6, M4(7.245e-02, 6.156e-02, 5.317e-02, -3.912e-01, 1.871e-01, -2.079e-02, -2.552e-02, -6.961e-02, 2.686e-01, 8.518e-02, -1.026e-01, -4.040e-01, -6.324e-02, 7.999e-03, 1.317e-02, 1.619e-02));
|
||||
r += mul(s0_7, M4(1.240e-01, -8.349e-02, -1.258e-01, -3.269e-01, 6.624e-01, -1.357e-01, -6.738e-01, -5.998e-01, -8.375e-04, 2.226e-01, -1.880e-01, 5.678e-02, -8.383e-02, -3.455e-02, -1.399e-02, 4.540e-02));
|
||||
r += mul(s0_8, M4(-3.130e-02, 9.691e-02, 1.763e-01, -1.847e-02, -1.193e-01, -7.494e-03, 1.485e-02, 1.244e-02, 9.559e-02, 3.116e-02, 8.046e-03, -1.264e-01, -2.403e-01, 6.389e-02, 2.999e-01, 1.484e-01));
|
||||
r += mul(s1_0, M4(2.569e-01, -8.689e-03, -1.806e-02, -3.993e-02, 9.155e-02, -2.022e-02, 1.034e-02, -3.455e-02, -1.534e-01, 1.836e-02, -1.176e-03, 3.593e-03, 2.642e-01, -6.587e-02, -4.169e-02, -2.237e-01));
|
||||
r += mul(s1_1, M4(1.398e-01, 1.020e-02, -2.478e-01, 2.747e-02, 7.152e-02, 1.835e-02, -2.013e-01, 1.151e-02, -2.586e-01, -3.622e-02, 2.529e-01, 1.465e-01, -3.973e-01, 5.907e-02, -9.450e-02, 3.761e-02));
|
||||
r += mul(s1_2, M4(3.157e-02, 7.847e-03, 8.109e-03, -3.333e-02, -3.333e-02, -6.401e-03, -6.632e-03, 3.296e-02, -1.433e-02, 2.167e-02, 1.194e-01, -1.028e-01, -2.104e-01, 1.352e-02, -6.835e-02, 1.901e-01));
|
||||
r += mul(s1_3, M4(3.443e-01, -1.004e-01, -6.176e-02, -3.047e-01, 4.779e-01, -7.928e-02, -8.134e-02, -4.873e-01, -1.421e-01, 3.972e-02, 7.459e-02, 2.099e-01, 1.118e-01, -1.022e-02, -8.584e-02, -1.657e-01));
|
||||
r += mul(s1_4, M4(-1.721e-01, 2.625e-02, -7.292e-03, 2.646e-01, 2.505e-02, 1.479e-01, -3.357e-01, 1.088e-01, 1.016e-01, -1.902e-01, -1.622e-01, -6.326e-02, -4.305e-01, 4.763e-01, -1.357e-03, -5.685e-01));
|
||||
r += mul(s1_5, M4(3.324e-03, 1.692e-02, -5.726e-02, 2.853e-02, -3.135e-01, -4.534e-03, 2.549e-01, 1.183e-01, -1.277e-01, -5.030e-02, 9.190e-02, 1.145e-01, 3.445e-01, 6.425e-02, -2.707e-01, -1.701e-01));
|
||||
r += mul(s1_6, M4(2.164e-02, 1.998e-02, 1.667e-02, -6.126e-02, 2.400e-01, -9.253e-02, -4.525e-02, 8.615e-03, 5.148e-02, -1.803e-02, -7.495e-02, -7.102e-02, -2.646e-02, 6.819e-02, 1.465e-01, 1.904e-01));
|
||||
r += mul(s1_7, M4(-2.339e-02, 3.350e-02, -1.274e-01, 5.525e-02, 9.120e-01, -9.074e-01, -6.856e-01, -7.422e-02, 4.849e-02, -1.377e-02, -1.409e-01, -5.792e-02, -1.044e-01, 9.079e-02, 2.520e-01, 2.053e-01));
|
||||
r += mul(s1_8, M4(1.891e-02, -1.562e-02, -1.024e-02, -2.686e-02, -1.038e-01, -3.210e-02, 4.222e-01, -2.084e-01, -1.841e-01, 3.231e-02, 7.320e-02, 1.727e-01, 2.861e-01, 2.506e-02, -2.266e-01, -3.940e-01));
|
||||
// Constant offset added after all 18 products.
r += V4(-1.043e-03, 3.601e-03, 5.622e-03, -7.848e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass3: each thread handles one texel (gxy). It reads nine samples through
// l0() at offsets -1..1 in x and y, splits every sample into a non-positive
// part (s1_*) and a non-negative part (s0_*), and writes f0() of those 18
// vectors to t0[gxy].
// NOTE(review): l0 for this pass is defined before this view; presumably it
// samples the pass input at a texel offset using `pos` and `pt` - confirm.
void Pass3(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads mapped outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Presumably the normalized centre of texel gxy (assumes pt is one texel step) - confirm.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine samples, ordered row by row: y = -1, 0, 1 and x = -1, 0, 1 within each row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_i = -max(-s0_i, 0): the non-positive part of each sample (zero where the sample is positive).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_i is then clamped in place to its non-negative part; this must come after s1_i is derived.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// f0 (pass 4, "conv3"): multiplies each of the 18 input vectors by its own
// constant M4 matrix (16 coefficients) with mul(), sums the products, adds a
// constant bias vector and returns the sum.
// Pass4 passes s0_* as the non-negative parts and s1_* as the non-positive
// parts of its nine samples.
// NOTE(review): the constants are presumably trained network weights emitted
// by a generator - do not edit by hand; confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-7.801e-03, 7.517e-03, 5.348e-02, 7.686e-02, -8.770e-03, 1.144e-02, -2.398e-02, 1.355e-02, -4.642e-02, 5.880e-02, 3.263e-02, 1.860e-01, -4.443e-02, -2.732e-02, -2.133e-02, -1.166e-01));
|
||||
r += mul(s0_1, M4(-1.751e-02, -1.230e-02, -1.218e-01, -1.231e-01, 4.092e-03, -8.769e-03, -2.251e-03, 5.142e-02, 4.354e-03, -4.445e-02, -2.369e-01, -1.616e-01, 4.495e-03, -1.326e-01, -5.371e-01, -5.119e-01));
|
||||
r += mul(s0_2, M4(3.143e-02, 2.366e-02, 8.884e-02, -1.819e-02, 2.358e-03, 3.812e-04, -4.972e-02, -5.311e-02, 1.729e-02, 1.523e-02, 7.798e-02, -1.705e-05, -2.295e-02, 6.567e-02, 1.422e-01, 1.890e-01));
|
||||
r += mul(s0_3, M4(2.363e-02, 1.555e-02, -1.307e-01, -8.190e-02, 1.026e-02, 9.724e-03, 5.358e-02, -2.783e-01, 7.268e-03, 1.659e-01, -5.801e-02, 3.076e-01, -1.575e-01, -9.567e-02, 3.294e-02, -7.694e-01));
|
||||
r += mul(s0_4, M4(1.677e-02, -1.324e-01, 4.019e-01, -2.902e-01, -6.051e-02, -4.625e-02, 8.409e-01, 4.756e-01, -1.135e-01, -3.213e-01, 6.389e-02, -2.083e-01, -1.219e+00, 2.280e-01, 9.667e-01, -3.604e-01));
|
||||
r += mul(s0_5, M4(-5.948e-02, 1.567e-01, 3.883e-02, -4.843e-03, -2.153e-02, 3.439e-02, -1.160e-01, -1.325e-02, -5.312e-02, 1.136e-01, -5.260e-02, -3.524e-02, 7.315e-02, 3.527e-01, 6.186e-01, -7.505e-02));
|
||||
r += mul(s0_6, M4(-3.841e-02, 1.620e-03, 9.449e-02, -8.648e-02, -2.656e-02, -1.676e-03, 2.364e-03, -7.221e-02, -9.590e-02, 4.160e-02, -1.278e-02, -3.171e-02, 6.213e-02, 2.673e-02, -7.931e-02, 2.588e-01));
|
||||
r += mul(s0_7, M4(-3.636e-02, -1.558e-01, 2.151e-01, 1.188e-01, 1.275e-01, -8.114e-02, -8.376e-02, -3.690e-02, -1.968e-02, -1.038e-01, 8.994e-02, 3.846e-02, -1.499e-01, 6.457e-01, -8.201e-02, -3.935e-01));
|
||||
r += mul(s0_8, M4(-2.833e-03, 2.529e-01, -3.350e-03, -3.433e-02, 1.943e-02, -2.796e-02, 3.313e-02, 1.582e-02, 1.702e-02, 5.663e-02, -1.647e-02, -2.229e-02, -4.865e-01, 3.285e-01, -4.462e-01, -4.307e-01));
|
||||
// Contributions of the non-positive sample parts.
r += mul(s1_0, M4(-6.004e-02, 4.898e-03, 3.591e-02, 1.900e-01, -3.816e-02, -3.269e-02, 1.459e-01, -3.464e-03, -1.235e-02, -3.737e-02, 1.569e-02, 2.559e-01, -3.173e-04, 1.268e-02, 8.886e-03, 2.960e-02));
|
||||
r += mul(s1_1, M4(-1.582e-02, -7.507e-02, -2.026e-01, 2.027e-01, -6.107e-02, 2.055e-02, -5.811e-02, 5.420e-03, 1.028e-02, -1.374e-02, -6.152e-01, -2.259e-01, -3.408e-03, -1.800e-02, 4.574e-02, -9.590e-02));
|
||||
r += mul(s1_2, M4(4.210e-02, 2.126e-02, 8.277e-02, 2.079e-02, -1.733e-01, -2.483e-02, 2.686e-01, 1.498e-01, 7.352e-02, -2.511e-02, 3.159e-02, 5.775e-02, 5.942e-02, 3.383e-02, 1.274e-01, -5.928e-02));
|
||||
r += mul(s1_3, M4(5.614e-02, 7.561e-02, -8.328e-02, 2.427e-01, 7.214e-02, -1.122e-01, 9.434e-02, -2.602e-01, -1.052e-02, -6.944e-02, -3.023e-02, -1.655e-01, 1.236e-03, 4.025e-03, -3.082e-02, -1.533e-01));
|
||||
r += mul(s1_4, M4(6.675e-01, -2.254e-01, 1.173e+00, -8.261e-02, 5.655e-01, -2.000e-01, 8.301e-01, 1.458e+00, -2.497e-01, -1.091e+00, -4.698e-01, -1.876e-01, -3.358e-02, -2.854e-01, 5.032e-01, -1.558e-01));
|
||||
r += mul(s1_5, M4(-1.444e-02, 1.502e-01, -4.221e-02, -4.864e-02, 3.236e-01, -2.572e-01, 1.344e-01, 8.562e-02, -1.030e-01, 2.690e-01, 1.238e-01, 3.309e-02, -3.849e-02, 1.860e-01, 6.528e-03, 2.840e-02));
|
||||
r += mul(s1_6, M4(-1.161e-01, 5.405e-02, -3.101e-02, -1.009e-01, -9.594e-02, -1.207e-02, -3.836e-02, -6.894e-02, -1.770e-02, -2.958e-02, 8.484e-02, -2.284e-02, 2.585e-04, -2.764e-02, 4.972e-02, -5.968e-02));
|
||||
r += mul(s1_7, M4(-4.113e-02, -1.948e-01, -2.728e-02, -3.142e-02, -2.894e-01, -1.111e-01, 7.492e-02, -2.892e-02, 9.054e-02, 4.350e-02, 2.183e-01, 1.489e-01, 1.167e-02, -6.678e-02, 3.696e-02, -1.315e-02));
|
||||
r += mul(s1_8, M4(2.532e-02, 4.585e-02, -3.694e-02, -6.244e-02, -1.673e-01, 6.180e-02, -4.475e-02, 1.028e-02, -1.658e-02, 8.923e-02, 1.711e-02, 3.037e-03, 4.651e-02, 1.652e-01, 7.863e-03, -3.387e-02));
|
||||
// Constant bias term.
r += V4(-6.562e-04, 7.371e-04, -4.319e-03, -8.757e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass4: each thread handles one texel (gxy). It reads nine samples of t0
// through l0() at offsets -1..1 in x and y, splits every sample into a
// non-positive part (s1_*) and a non-negative part (s0_*), and writes f0() of
// those 18 vectors to t1[gxy].
// NOTE(review): l0 wraps O(), which is defined outside this view; presumably
// it samples at `pos` plus the offset scaled by `pt` - confirm.
void Pass4(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads mapped outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Presumably the normalized centre of texel gxy (assumes pt is one texel step) - confirm.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine samples, ordered row by row: y = -1, 0, 1 and x = -1, 0, 1 within each row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_i = -max(-s0_i, 0): the non-positive part of each sample (zero where the sample is positive).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_i is then clamped in place to its non-negative part; this must come after s1_i is derived.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC conv4
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// f0 (pass 5, "conv4"): multiplies each of the 18 input vectors by its own
// constant M4 matrix (16 coefficients) with mul(), sums the products, adds a
// constant bias vector and returns the sum.
// Pass5 passes s0_* as the non-negative parts and s1_* as the non-positive
// parts of its nine samples.
// NOTE(review): the constants are presumably trained network weights emitted
// by a generator - do not edit by hand; confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-1.087e-01, -5.083e-02, 3.146e-01, -4.241e-02, 4.462e-02, -4.358e-02, -1.562e-01, -2.609e-03, 5.918e-02, -2.526e-02, -3.132e-02, -1.150e-02, -8.799e-03, 3.070e-02, -1.680e-02, -1.046e-02));
|
||||
r += mul(s0_1, M4(1.762e-01, 8.784e-01, -2.704e+00, -1.565e+00, -1.473e-01, -5.723e-01, 7.838e-02, -7.420e-03, -1.769e-01, -2.041e-01, -1.783e-03, -4.944e-03, 1.304e-02, 2.646e-01, -1.708e-01, 7.483e-03));
|
||||
r += mul(s0_2, M4(-1.907e-01, 1.514e-01, -3.657e-01, -5.840e-01, -4.943e-02, -1.014e-02, -2.869e-03, 6.488e-03, 2.266e-02, -3.850e-02, 6.125e-03, 1.899e-02, -3.541e-02, -2.011e-01, 1.567e-01, 1.008e-02));
|
||||
r += mul(s0_3, M4(-3.061e-01, -1.768e-01, 9.163e-02, -2.243e-01, 4.945e-02, 1.106e-01, -1.137e-01, 1.755e-02, 2.640e-01, -9.298e-02, -1.704e-01, 3.935e-02, 1.506e-01, -3.284e-02, 4.719e-02, 5.543e-02));
|
||||
r += mul(s0_4, M4(-4.579e-01, -6.198e-02, -9.889e-01, -4.446e-01, -1.612e-01, 1.518e-01, 2.588e-01, 1.075e-02, -1.527e+00, -7.923e-01, 8.120e-02, -1.116e-01, -2.079e-01, -1.206e-01, -4.422e-01, -1.951e-01));
|
||||
r += mul(s0_5, M4(1.064e-01, -1.684e-01, 2.316e-01, 4.211e-01, -9.153e-02, 9.155e-02, -7.649e-02, -1.385e-01, 9.422e-02, -1.631e-01, 8.278e-02, 3.318e-01, 7.284e-02, 3.489e-01, -2.303e-02, -6.554e-01));
|
||||
r += mul(s0_6, M4(-6.320e-02, -4.390e-02, 1.453e-02, 3.187e-02, 2.166e-02, 2.423e-03, 1.573e-03, -2.226e-02, 1.401e-01, 2.026e-01, -2.249e-01, 6.471e-02, 3.593e-02, -1.575e-02, -3.186e-02, 1.339e-02));
|
||||
r += mul(s0_7, M4(2.778e-02, 7.495e-02, -1.086e-01, 8.862e-02, -2.352e-02, 1.477e-02, 2.741e-02, 4.345e-02, -2.865e-01, 9.405e-02, 1.880e-01, -3.610e-01, -7.797e-02, -5.710e-03, 3.386e-02, 2.830e-02));
|
||||
r += mul(s0_8, M4(-3.734e-02, 3.357e-02, 5.657e-03, -1.596e-01, -7.661e-03, 1.603e-02, -3.137e-02, -7.023e-03, 6.522e-03, -2.715e-02, 2.765e-02, 4.724e-02, 1.922e-02, 3.944e-02, -8.276e-02, -1.915e-02));
|
||||
// Contributions of the non-positive sample parts.
r += mul(s1_0, M4(-7.121e-02, -2.276e-02, 7.266e-02, -4.411e-03, -5.600e-01, 4.502e-01, -1.817e-01, -2.906e-01, -5.675e-02, 2.653e-02, 3.284e-02, -1.925e-03, -4.729e-03, -1.554e-03, -6.081e-03, -2.195e-02));
|
||||
r += mul(s1_1, M4(2.212e-01, 3.154e-01, -2.765e-01, 4.432e-02, 1.402e+00, 2.159e-01, 4.402e-01, 2.537e-01, 6.697e-02, 1.207e-01, -5.192e-02, 2.638e-02, 5.366e-02, 5.855e-02, -3.687e-02, 4.389e-03));
|
||||
r += mul(s1_2, M4(3.137e-02, -1.157e-01, 9.497e-02, -3.724e-02, 5.241e-02, 7.793e-02, 2.277e-04, -4.033e-01, 1.432e-02, 4.622e-02, -1.636e-02, -5.840e-03, -1.593e-02, -7.447e-02, 3.943e-02, -3.517e-03));
|
||||
r += mul(s1_3, M4(-1.209e-02, -1.350e-01, 3.018e-01, 1.233e-01, -1.262e-03, 2.194e-01, -2.919e-01, -8.031e-03, 4.620e-03, 5.318e-02, 1.247e-02, -4.260e-02, 7.155e-02, 3.256e-02, -9.839e-02, -6.741e-04));
|
||||
r += mul(s1_4, M4(3.291e-01, 2.397e-01, -2.820e-01, 5.703e-01, 7.831e-03, 5.816e-02, -1.696e-02, -1.957e-01, -1.851e-01, 3.696e-02, -2.611e-01, 7.039e-03, -1.562e-01, -7.676e-01, 9.080e-01, 7.823e-02));
|
||||
r += mul(s1_5, M4(9.918e-03, 6.364e-02, 3.364e-02, -3.291e-01, 1.393e-02, 3.139e-02, 1.701e-02, -5.675e-02, 5.085e-02, -2.050e-01, 1.160e-01, 4.875e-02, -1.189e-01, 2.310e-01, -1.353e-01, 2.046e-02));
|
||||
r += mul(s1_6, M4(-5.477e-03, -1.704e-02, 9.510e-03, -1.701e-02, 1.391e-02, -8.760e-03, -3.355e-02, -6.898e-03, -9.203e-03, -2.442e-02, 7.547e-03, 1.817e-02, 1.871e-02, -1.149e-02, 6.458e-02, 1.403e-02));
|
||||
r += mul(s1_7, M4(-5.073e-03, -5.454e-02, -2.710e-02, 1.292e-02, 2.458e-02, 1.739e-02, -2.319e-03, 3.865e-02, 5.399e-02, -1.176e-02, -1.315e-01, 1.489e-01, -7.903e-02, 8.120e-02, 4.749e-02, 1.961e-01));
|
||||
r += mul(s1_8, M4(4.163e-02, -1.603e-02, 8.659e-03, 1.023e-01, 5.233e-03, -2.900e-03, -5.293e-03, -5.829e-03, -1.453e-02, 2.467e-02, 7.198e-02, -2.407e-01, -4.023e-02, 1.009e-01, -1.560e-01, -1.567e-01));
|
||||
// Constant bias term.
r += V4(-3.709e-04, 2.029e-04, -3.042e-03, -2.970e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass5: each thread handles one texel (gxy). It reads nine samples of t1
// through l0() at offsets -1..1 in x and y, splits every sample into a
// non-positive part (s1_*) and a non-negative part (s0_*), and writes f0() of
// those 18 vectors to t0[gxy].
// NOTE(review): l0 wraps O(), which is defined outside this view; presumably
// it samples at `pos` plus the offset scaled by `pt` - confirm.
void Pass5(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads mapped outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Presumably the normalized centre of texel gxy (assumes pt is one texel step) - confirm.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine samples, ordered row by row: y = -1, 0, 1 and x = -1, 0, 1 within each row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_i = -max(-s0_i, 0): the non-positive part of each sample (zero where the sample is positive).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_i is then clamped in place to its non-negative part; this must come after s1_i is derived.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
}
|
||||
|
||||
//!PASS 6
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// f0 (pass 6, "out-shuffle"): multiplies each of the 18 input vectors by its
// own constant M4 matrix (16 coefficients) with mul(), sums the products, adds
// a constant bias vector and returns tanh() of the sum.
// Pass6 uses the four returned components as offsets for the four pixels of
// one 2x2 output quad.
// NOTE(review): the constants are presumably trained network weights emitted
// by a generator - do not edit by hand; confirm.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-6.857e-02, -6.042e-02, 3.293e-03, -2.389e-03, -1.606e-01, -1.556e-02, -5.115e-02, -4.602e-02, -3.762e-02, 1.994e-02, -2.370e-02, 3.558e-02, -7.142e-01, 8.184e-01, -1.361e-01, 1.228e-01));
|
||||
r += mul(s0_1, M4(-1.887e-01, -2.260e-01, 1.293e-02, -1.757e-02, 1.257e-01, 1.304e-01, -4.525e-02, 4.471e-02, 6.895e-01, -4.096e-01, 4.096e-02, 1.817e-02, -1.343e-01, -4.170e-01, 3.991e-03, 1.516e-03));
|
||||
r += mul(s0_2, M4(-2.667e-01, -8.692e-02, 1.481e-01, -1.466e-01, 6.142e-02, -2.084e-02, 1.942e-02, 6.700e-04, 3.942e-02, 3.109e-01, -1.323e-02, 2.240e-02, -2.306e-02, -4.749e-02, -1.155e-02, 1.843e-03));
|
||||
r += mul(s0_3, M4(-1.004e-01, -1.184e-02, -8.590e-02, -1.018e-01, 6.862e-02, -4.700e-02, -1.537e-01, -1.096e-01, -1.228e-01, 1.462e-02, -1.715e-01, 1.862e-02, 3.668e-01, -1.138e-01, 8.494e-04, 6.113e-01));
|
||||
r += mul(s0_4, M4(4.389e-01, -5.527e-01, -4.972e-01, -7.620e-01, 1.684e-01, 5.375e-02, 1.032e+00, 5.723e-01, 4.427e-02, -2.447e-01, 1.132e+00, -5.297e-01, 1.150e-01, 3.877e-01, 1.224e-01, 1.294e-01));
|
||||
r += mul(s0_5, M4(-1.023e+00, 1.567e+00, -9.747e-01, 1.051e+00, 1.537e-02, 1.993e-01, -1.679e-01, 1.139e-01, -7.358e-02, -1.782e-01, -1.938e-01, 4.419e-02, 2.001e-02, 5.881e-02, 8.971e-03, 3.368e-03));
|
||||
r += mul(s0_6, M4(-5.126e-03, 1.449e-02, -7.018e-02, 2.929e-02, 4.748e-02, -4.443e-03, -5.791e-02, -3.490e-02, 3.817e-02, 1.007e-02, -5.501e-02, -1.488e-02, -8.848e-03, 4.884e-02, -6.548e-02, 3.392e-02));
|
||||
r += mul(s0_7, M4(-4.449e-02, 7.313e-02, 3.311e-01, 3.138e-02, -6.466e-02, 5.666e-02, 1.929e-01, 8.274e-02, 3.994e-02, 2.105e-02, -1.821e-01, -1.539e-02, -9.333e-03, -4.728e-02, 6.975e-03, -3.292e-03));
|
||||
r += mul(s0_8, M4(2.038e-01, -2.356e-01, -1.987e-01, -3.746e-02, -1.499e-02, -7.007e-02, -9.546e-02, 1.905e-02, -9.802e-03, 1.990e-02, 2.140e-02, -8.164e-03, 5.109e-03, -2.081e-02, -2.386e-02, 1.183e-02));
|
||||
// Contributions of the non-positive sample parts.
r += mul(s1_0, M4(-7.067e-02, -4.613e-02, -5.433e-04, -2.191e-02, -1.125e-01, -3.650e-02, -1.298e-02, -3.479e-02, -1.118e-01, -1.521e-02, -4.731e-03, -7.478e-03, 1.802e-01, 4.872e-02, -1.599e-03, -1.452e-02));
|
||||
r += mul(s1_1, M4(-2.920e-01, -1.831e-01, -1.305e-02, 4.031e-02, 1.989e-01, 3.120e-03, 2.025e-02, 5.432e-02, 2.607e-01, 2.403e-02, 1.863e-02, 8.423e-02, -3.372e-01, -1.327e-01, -1.248e-01, -1.247e-01));
|
||||
r += mul(s1_2, M4(-9.286e-02, -1.948e-01, -8.532e-03, 7.416e-03, 4.578e-02, 1.581e-01, 1.473e-03, -3.796e-02, 1.011e-01, 2.393e-01, 2.742e-02, -4.224e-02, -9.579e-03, -9.888e-02, -2.065e-03, 7.685e-03));
|
||||
r += mul(s1_3, M4(-2.056e-01, -3.479e-02, -2.666e-01, -5.344e-02, 1.579e-01, -6.091e-02, -1.655e-01, -1.575e-01, -8.230e-02, -4.748e-02, -1.304e-01, -7.186e-02, 2.953e-01, 6.950e-02, 1.865e-01, 7.567e-02));
|
||||
r += mul(s1_4, M4(3.408e-01, -1.054e-01, -2.613e-01, -6.084e-01, 3.193e-01, 6.366e-01, 4.251e-01, 4.066e-01, -3.742e-01, -8.521e-02, 5.906e-01, 1.870e-01, 2.044e-02, 2.495e-01, 1.046e-01, 3.018e-01));
|
||||
r += mul(s1_5, M4(4.748e-03, 2.086e-01, 4.231e-03, -7.764e-03, 3.933e-02, 3.446e-03, -3.431e-02, 8.415e-02, -3.798e-02, -3.428e-01, -7.206e-02, 2.392e-01, 2.157e-02, 2.692e-02, 3.313e-02, 1.841e-02));
|
||||
r += mul(s1_6, M4(1.813e-02, 2.306e-03, -3.402e-02, 1.009e-03, 4.408e-02, -2.307e-02, -3.394e-02, -3.912e-02, 3.822e-02, -1.051e-02, -1.023e-01, -4.626e-02, -4.871e-02, 6.250e-03, 1.367e-01, 3.674e-02));
|
||||
r += mul(s1_7, M4(-1.170e-02, 3.747e-02, 1.548e-01, 1.243e-01, -1.074e-01, -9.848e-03, 2.627e-01, 1.132e-01, 4.550e-02, 5.050e-02, -1.194e-01, -6.091e-02, -2.180e-02, -6.381e-02, -5.949e-02, 1.580e-02));
|
||||
r += mul(s1_8, M4(-1.146e-04, -1.852e-02, -1.515e-02, 2.488e-02, -1.877e-02, -7.739e-02, -6.812e-02, 7.656e-03, 2.688e-02, 5.650e-02, 4.285e-02, -3.270e-02, 1.163e-03, 8.328e-04, -1.998e-02, -2.282e-02));
|
||||
// Constant bias term.
r += V4(-3.259e-04, -3.197e-04, 4.954e-04, 4.568e-04);
|
||||
// tanh bounds every component of the result to (-1, 1).
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass6 ("out-shuffle"): each thread writes a 2x2 quad of OUTPUT pixels.
// It reads nine samples of t0 around the quad's source texel, evaluates f0()
// once, and adds one component of the result to the first component of the
// rgb2yuv-converted INPUT sample for each quad pixel before converting back.
// NOTE(review): l0 wraps O(), which is defined outside this view; presumably
// it samples at `pos` plus the offset scaled by `pt` - confirm.
void Pass6(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
// Top-left output pixel of this thread's 2x2 quad.
uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
|
||||
uint2 size = GetOutputSize();
|
||||
// Only the quad's top-left pixel is tested against the output size.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// gxy >> 1 maps the output quad back to the texel index it was derived from.
float2 pos = ((gxy >> 1) + 0.5) * pt;
|
||||
|
||||
// Nine samples, ordered row by row: y = -1, 0, 1 and x = -1, 0, 1 within each row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// s1_i = -max(-s0_i, 0): the non-positive part of each sample (zero where the sample is positive).
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_i is then clamped in place to its non-negative part; this must come after s1_i is derived.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// One f0 evaluation yields the offsets for all four quad pixels.
V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
|
||||
|
||||
static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
|
||||
static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
|
||||
float2 opt = float2(GetOutputPt());
|
||||
|
||||
// Presumably moves pos to the centre of the quad's top-left output pixel
// (assumes opt is one output texel step) - confirm.
pos -= 0.5f * opt;
|
||||
// Pixel (x, y): sample INPUT with the SL sampler, add r.x to the first yuv
// component, saturate it, convert back and store with alpha 1.
float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);
|
||||
|
||||
// Pixel (x + 1, y): uses r.y.
++gxy.x;
|
||||
pos.x += opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);
|
||||
|
||||
// Pixel (x + 1, y + 1): uses r.w.
++gxy.y;
|
||||
pos.y += opt.y;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);
|
||||
|
||||
// Pixel (x, y + 1): uses r.z.
--gxy.x;
|
||||
pos.x -= opt.x;
|
||||
yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
|
||||
OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
|
||||
|
||||
}
|
||||
921
src/Effects/CuNNy/CuNNy-4x8C-NVL-DN.hlsl
Normal file
921
src/Effects/CuNNy/CuNNy-4x8C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,921 @@
|
|||
// CuNNy 4x8C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-DN-D08N04
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t2;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t3;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0, t1
|
||||
|
||||
#define l0(x, y) min16float((dot(float3(2.214e-01, 4.385e-01, 1.006e-01), O(INPUT, float2(x, y)).rgb) + -6.858e-01))
|
||||
|
||||
// f0 (pass 1, "in"): scales nine constant 4-component vectors by the nine
// scalar samples s0_0..s0_8, sums them, adds a constant bias vector and
// returns the sum. Pass1 stores the result in t0.
// NOTE(review): the constants are presumably trained network weights emitted
// by a generator - do not edit by hand; confirm.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(-2.401e-02, 1.817e-03, -1.218e-01, 2.796e-02) * s0_0;
|
||||
r += V4(3.256e-02, 3.929e-03, -5.850e-02, -5.602e-02) * s0_1;
|
||||
r += V4(4.497e-04, -1.812e-02, 5.241e-02, 3.698e-02) * s0_2;
|
||||
r += V4(5.371e-01, -2.302e-01, -1.373e-01, -4.038e-03) * s0_3;
|
||||
r += V4(1.565e-01, -6.067e-02, 3.397e-01, -3.741e-01) * s0_4;
|
||||
r += V4(-2.095e-03, 4.044e-02, -3.770e-02, 5.665e-02) * s0_5;
|
||||
r += V4(-1.993e-01, -2.645e-01, -8.892e-02, 1.948e-02) * s0_6;
|
||||
r += V4(-4.865e-01, 5.400e-01, -1.396e-01, 1.270e-01) * s0_7;
|
||||
r += V4(-1.667e-02, -9.433e-03, -1.324e-02, -1.803e-03) * s0_8;
|
||||
// Constant bias term.
r += V4(2.880e-04, 1.418e-02, 1.413e-02, -1.036e-01);
|
||||
return r;
|
||||
}
|
||||
|
||||
// f1 (pass 1, "in"): same form as f0 with a second set of constants. It scales
// nine constant 4-component vectors by the nine scalar samples s0_0..s0_8,
// sums them, adds a constant bias vector and returns the sum. Pass1 stores the
// result in t1.
// NOTE(review): the constants are presumably trained network weights emitted
// by a generator - do not edit by hand; confirm.
V4 f1(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(-4.610e-02, -6.199e-01, 8.493e-03, -1.532e-02) * s0_0;
|
||||
r += V4(-7.178e-02, 5.957e-01, 1.575e-03, 1.807e-02) * s0_1;
|
||||
r += V4(1.106e-01, 3.625e-03, 3.713e-02, -4.124e-03) * s0_2;
|
||||
r += V4(1.288e-01, -5.582e-02, 5.082e-02, 1.674e-02) * s0_3;
|
||||
r += V4(-6.074e-01, 8.818e-02, -3.371e-01, -6.663e-01) * s0_4;
|
||||
r += V4(-8.030e-02, -4.780e-03, -3.421e-01, 5.358e-02) * s0_5;
|
||||
r += V4(4.990e-01, 7.623e-03, 1.778e-03, 2.401e-02) * s0_6;
|
||||
r += V4(9.546e-02, -1.656e-02, 6.935e-04, 6.387e-01) * s0_7;
|
||||
r += V4(-2.302e-02, 5.209e-03, 5.835e-02, -6.361e-02) * s0_8;
|
||||
// Constant bias term.
r += V4(-4.485e-04, -2.620e-04, 2.449e-02, -7.403e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 ("in"): each thread handles one input texel. It gathers the 3x3
// neighbourhood around that texel as nine scalars through l0() and stores
// f0() of them in t0 and f1() of them in t1 at the same texel.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O() macro that l0()
	// expands to refers to both of them.
	float2 pt = float2(GetInputPt());

	// Texel owned by this thread; threads that land outside the input do nothing.
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 bounds = GetInputSize();
	if (any(texel >= bounds)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// 3x3 window, one row per line: y = -1, then y = 0, then y = +1.
	min16float nw = l0(-1.0, -1.0), n = l0(0.0, -1.0), ne = l0(1.0, -1.0);
	min16float w  = l0(-1.0, 0.0),  c = l0(0.0, 0.0),  e  = l0(1.0, 0.0);
	min16float sw = l0(-1.0, 1.0),  s = l0(0.0, 1.0),  se = l0(1.0, 1.0);

	// Both output textures are derived from the same nine samples.
	t0[texel] = f0(nw, n, ne, w, c, e, sw, s, se);
	t1[texel] = f1(nw, n, ne, w, c, e, sw, s, se);
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0, t1
|
||||
//!OUT t2, t3
|
||||
|
||||
// Tap helpers for pass 2: l0/l1 read t0/t1 at offset (x, y) through the O()
// macro (defined elsewhere in this file) and convert the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 2 (conv1), first output group: sums mul(input, weights) over all 36
// inputs and then adds a constant offset vector (presumably the layer bias).
// Inputs, as prepared by Pass2:
//   s0_k = max(t0 tap k, 0)      s1_k = -max(-(t0 tap k), 0)
//   s2_k = max(t1 tap k, 0)      s3_k = -max(-(t1 tap k), 0)
// with k = 0..8 walking the 3x3 neighbourhood row by row from (-1,-1) to (1,1).
// The summation order is kept exactly as generated.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
    V4 acc = 0.0;

    // Group s0: positive part of the t0 taps.
    acc += mul(s0_0, M4(7.103e-02, 1.495e-01, -1.731e-02, -9.952e-02, -1.539e-01, -1.103e-01, 7.099e-02, 2.023e-01, 2.681e-02, 5.202e-03, 1.954e-02, -6.822e-02, -1.650e-01, 3.710e-01, -6.020e-01, 4.879e-01));
    acc += mul(s0_1, M4(-1.168e-02, 2.587e-01, -4.670e-01, -3.986e-02, -1.268e-01, 3.619e-02, 5.712e-02, 1.722e-01, 4.473e-02, -1.224e-01, 8.228e-02, -3.981e-02, 4.044e-01, -3.039e-01, -3.390e-01, 5.925e-02));
    acc += mul(s0_2, M4(4.083e-02, 7.140e-02, -5.864e-01, 1.188e-01, 2.214e-01, -2.826e-01, 2.294e-01, -2.199e-01, -9.048e-02, 1.787e-01, -6.887e-02, -6.645e-02, -1.285e-01, -8.261e-02, -1.975e-01, 2.428e-01));
    acc += mul(s0_3, M4(-5.801e-02, -3.381e-02, -2.285e-01, 9.377e-02, 1.878e-01, 9.285e-02, -1.001e-01, -5.059e-02, -2.155e-02, -9.098e-02, -1.279e-02, 9.801e-02, 1.178e-01, -1.967e-01, -4.792e-02, -1.106e-01));
    acc += mul(s0_4, M4(3.048e-01, 2.731e-01, -2.351e-01, -1.516e-01, -1.382e-02, 1.296e-01, -9.530e-02, 2.975e-02, 2.411e-01, 2.343e-02, 1.731e-02, -2.331e-01, -2.161e-01, 4.114e-01, 4.417e-01, 1.225e+00));
    acc += mul(s0_5, M4(3.337e-01, 2.844e-01, 1.065e-01, -2.391e-01, -1.265e-01, -3.625e-02, -7.062e-02, 3.529e-02, 2.208e-02, -8.459e-03, -1.366e-01, -1.563e-02, -1.648e-01, -5.919e-01, 4.061e-01, -4.975e-02));
    acc += mul(s0_6, M4(6.213e-03, -2.020e-02, 2.520e-03, 2.167e-02, -2.361e-01, -1.421e-01, -4.579e-02, -1.353e-01, -2.883e-01, -5.900e-04, 2.720e-02, 1.591e-01, -5.120e-01, -4.253e-01, -3.397e-02, -4.633e-01));
    acc += mul(s0_7, M4(2.456e-01, -6.978e-02, 5.668e-02, -9.795e-03, -1.925e-01, -4.841e-02, -1.273e-02, 1.282e-02, -1.223e-01, -4.080e-02, 2.975e-02, 1.595e-01, -3.345e-01, -1.504e-01, 1.080e-01, 8.549e-01));
    acc += mul(s0_8, M4(8.700e-02, 1.611e-02, 8.589e-02, -3.284e-02, -1.637e-01, 2.627e-01, 1.851e-02, 2.843e-02, 1.224e-01, 6.163e-02, 4.991e-02, -1.510e-01, 1.885e-01, -5.951e-02, -3.463e-02, 2.172e-01));

    // Group s1: negative part of the t0 taps.
    acc += mul(s1_0, M4(1.856e-01, -1.041e-01, 1.900e-01, 8.420e-02, -3.223e-01, 6.258e-02, -9.766e-02, -6.517e-01, 3.066e-02, -7.562e-02, 1.015e-02, -1.139e-01, 1.569e-02, -3.684e-02, -2.813e-02, 8.835e-02));
    acc += mul(s1_1, M4(-7.107e-02, -1.146e-01, 5.488e-01, -2.960e-01, 3.743e-01, -5.368e-01, -2.219e-01, -3.122e-01, 2.468e-02, -7.477e-01, 1.858e-01, 3.498e-01, 1.771e-03, 4.215e-03, 8.478e-02, 9.318e-02));
    acc += mul(s1_2, M4(-2.350e-03, -3.382e-01, 5.964e-01, -2.321e-01, 2.011e-01, 1.890e-01, -2.062e-01, -3.725e-02, -1.003e-01, -1.464e-01, 1.040e-01, 9.994e-02, -7.113e-02, -3.827e-02, -1.258e-01, -1.584e-01));
    acc += mul(s1_3, M4(-1.609e-01, -1.460e-01, -4.804e-03, 5.503e-02, 2.784e-01, -1.475e-02, 9.395e-02, -1.128e-01, 1.032e-02, -1.969e-01, 2.170e-01, 2.335e-01, -1.371e-01, 4.853e-02, 8.945e-03, -2.698e-01));
    acc += mul(s1_4, M4(7.739e-02, -1.105e-01, 3.348e-01, 1.093e-01, -7.745e-02, -1.642e-01, -2.191e-01, -2.674e-02, 4.199e-01, -3.302e-01, 1.445e-01, -2.815e-01, -3.154e-01, 6.646e-02, 8.520e-02, -1.053e-01));
    acc += mul(s1_5, M4(-4.165e-01, -8.545e-02, 2.291e-01, -1.042e-01, 3.791e-01, -7.209e-02, -6.332e-02, -3.174e-01, 1.038e-01, 8.122e-03, -9.715e-02, 6.808e-01, -9.362e-02, -4.634e-02, 5.184e-03, 1.295e-01));
    acc += mul(s1_6, M4(-8.179e-02, -8.513e-02, 4.470e-02, -7.799e-02, -1.092e-01, -1.851e-01, -1.025e-01, -4.220e-02, -3.853e-01, 3.040e-02, -9.081e-02, 1.439e-01, -2.730e-02, -5.086e-02, 5.352e-03, -5.102e-03));
    acc += mul(s1_7, M4(7.601e-02, -1.423e-01, 3.421e-01, 2.574e-03, 1.165e-01, 6.863e-03, 1.250e-02, -4.862e-02, -3.859e-01, -1.108e-01, 2.515e-02, 5.564e-01, 2.485e-01, 2.230e-01, -3.839e-02, 3.605e-02));
    acc += mul(s1_8, M4(-9.424e-02, 1.248e-01, 1.980e-01, -1.671e-01, 1.098e-01, 6.555e-02, -7.194e-02, -1.626e-01, -1.439e-01, -2.086e-01, -1.925e-02, 1.520e-01, 2.139e-01, -7.764e-02, 6.469e-02, 7.875e-03));

    // Group s2: positive part of the t1 taps.
    acc += mul(s2_0, M4(4.572e-02, 3.661e-02, -3.845e-01, -1.383e-01, 1.729e-02, 1.780e-02, 3.664e-02, -6.961e-02, -9.001e-03, -1.853e-02, -6.735e-02, -1.864e-02, 1.695e-01, -1.420e-01, 2.679e-01, -1.525e-01));
    acc += mul(s2_1, M4(9.967e-02, -2.869e-01, -2.251e-01, 8.470e-02, 3.178e-02, -9.701e-03, 9.260e-02, 4.087e-04, -8.081e-02, 1.341e-01, 5.882e-03, 1.043e-02, 8.559e-03, 6.534e-02, -4.619e-01, -3.010e-01));
    acc += mul(s2_2, M4(-1.676e-02, -3.339e-01, 1.848e-01, -2.562e-01, -8.563e-02, 2.487e-02, 2.495e-01, 9.448e-02, 2.189e-02, -3.018e-02, 5.698e-02, 6.041e-02, -4.869e-02, -2.627e-02, 1.602e-01, 1.092e-01));
    acc += mul(s2_3, M4(6.867e-02, -1.693e-01, -1.614e-01, -1.944e-01, 1.992e-01, 1.720e-01, 2.393e-01, 1.219e-02, 4.866e-02, -1.165e-01, -1.285e-01, 2.929e-01, 2.043e-01, -1.399e-02, 1.595e-02, -2.746e-01));
    acc += mul(s2_4, M4(-4.477e-01, -5.696e-01, -1.760e-02, 1.362e-01, 1.472e-01, 3.113e-01, -2.419e-01, 8.650e-02, -8.358e-02, 1.081e-01, 3.881e-02, -1.400e-01, -2.071e-01, 3.977e-02, -3.149e-01, 2.525e-01));
    acc += mul(s2_5, M4(5.496e-02, -9.963e-02, -1.227e-01, -1.892e-01, 4.361e-02, -3.776e-01, -6.576e-01, 2.628e-01, -8.215e-02, -8.123e-02, 2.248e-03, 1.261e-01, 1.193e-01, 2.608e-01, 2.567e-01, 8.120e-02));
    acc += mul(s2_6, M4(-1.587e-01, -9.849e-02, 1.122e-01, -5.963e-02, -9.176e-02, 7.341e-03, 1.164e-03, -5.660e-02, 1.567e-01, -6.958e-02, -3.780e-02, 4.238e-04, -6.186e-02, 1.777e-01, 2.398e-01, 6.853e-03));
    acc += mul(s2_7, M4(1.062e-01, -1.498e-01, 5.492e-02, 1.108e-01, -3.248e-01, -2.901e-01, -4.360e-01, 1.128e-01, 7.346e-02, 8.659e-02, 9.740e-02, -1.434e-01, 1.538e-01, 1.349e-01, 1.408e-01, -1.367e-01));
    acc += mul(s2_8, M4(1.412e-01, -8.889e-02, 2.029e-02, -1.523e-01, 4.847e-01, -7.432e-01, -1.181e-01, 4.132e-01, 3.119e-02, -5.840e-02, -2.292e-02, -3.125e-02, 2.440e-02, 2.815e-02, 2.759e-01, -8.781e-02));

    // Group s3: negative part of the t1 taps.
    acc += mul(s3_0, M4(2.147e-02, 2.192e-01, 2.489e-01, -3.436e-02, 1.086e-02, -2.680e-02, -9.925e-02, 3.978e-02, 1.239e-01, 3.645e-02, 5.463e-01, 5.005e-01, 1.039e-01, -1.694e-01, -3.816e-02, 3.834e-01));
    acc += mul(s3_1, M4(1.418e-01, 5.806e-02, 1.317e-01, 2.227e-01, 1.486e-02, -4.235e-03, -5.750e-02, -1.548e-01, -7.700e-01, 3.263e-01, -1.193e-02, 3.537e-01, -2.841e-01, 4.657e-01, -1.576e-01, -9.526e-02));
    acc += mul(s3_2, M4(7.641e-02, 8.195e-01, 1.080e-01, 1.814e-01, -5.471e-02, 2.211e-02, -4.212e-02, -1.249e-02, 2.469e-02, 5.436e-01, 3.805e-01, -9.622e-02, -6.358e-02, -3.739e-01, -3.504e-01, -2.627e-01));
    acc += mul(s3_3, M4(-9.359e-02, -1.830e-02, -7.015e-02, -7.774e-02, 2.286e-01, -6.321e-02, -5.124e-02, -2.799e-03, -5.063e-01, -1.835e-01, 3.716e-01, 1.130e+00, 3.259e-01, -2.045e-01, -1.792e-01, 4.892e-01));
    acc += mul(s3_4, M4(-7.478e-01, -1.192e-01, 1.022e-01, 8.111e-01, 7.253e-02, 2.280e-01, -1.116e-01, -2.828e-01, -2.364e-01, -1.233e+00, -1.125e+00, 1.750e+00, -1.215e+00, 4.973e-02, 2.070e-01, 6.996e-01));
    acc += mul(s3_5, M4(-4.115e-02, 3.613e-01, 2.694e-01, 4.126e-02, 7.046e-02, 6.242e-02, 9.300e-02, -1.965e-01, -3.211e-01, 8.504e-01, 2.518e-01, -5.622e-01, 5.663e-02, -1.139e-01, 1.150e-01, -1.954e-01));
    acc += mul(s3_6, M4(-1.870e-01, -9.168e-02, -8.947e-02, 6.127e-03, 1.163e-02, 3.733e-04, -3.330e-01, 1.935e-01, 3.424e-01, 1.313e-01, -6.732e-01, 8.256e-02, 6.713e-02, 2.980e-02, -6.912e-02, 1.715e-01));
    acc += mul(s3_7, M4(1.636e-01, 1.212e-01, 2.280e-02, 1.552e-01, -4.955e-01, 8.376e-01, 1.476e-01, 2.192e-01, 9.746e-01, -3.148e-01, 8.206e-01, -8.104e-01, -7.918e-02, -1.604e-01, 5.505e-02, 7.640e-02));
    acc += mul(s3_8, M4(1.248e-01, 2.878e-01, -4.182e-02, -9.214e-02, -1.210e-01, 4.382e-01, 8.062e-02, -3.051e-01, -1.803e-01, -3.041e-01, 1.368e-01, -1.030e-01, 2.941e-02, -2.724e-01, 3.480e-02, 1.396e-02));

    // Constant term, added last.
    acc += V4(-3.046e-02, 3.515e-02, 4.880e-02, 4.740e-03);
    return acc;
}
|
||||
|
||||
// Pass 2 (conv1), second output group: same structure as the pass-2 f0 with a
// different weight set. Sums mul(input, weights) over all 36 inputs and then
// adds a constant offset vector (presumably the layer bias).
// Inputs, as prepared by Pass2:
//   s0_k = max(t0 tap k, 0)      s1_k = -max(-(t0 tap k), 0)
//   s2_k = max(t1 tap k, 0)      s3_k = -max(-(t1 tap k), 0)
// with k = 0..8 walking the 3x3 neighbourhood row by row from (-1,-1) to (1,1).
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
    V4 acc = 0.0;

    // Group s0: positive part of the t0 taps.
    acc += mul(s0_0, M4(1.574e-01, -6.104e-03, -2.288e-01, 5.024e-03, -2.149e-03, -8.674e-02, 1.209e-01, 7.107e-02, 1.242e-01, 2.312e-03, -5.300e-02, -2.285e-01, -8.824e-02, 7.402e-02, -4.447e-01, 1.117e+00));
    acc += mul(s0_1, M4(-5.617e-02, 3.613e-01, -4.666e-01, 1.795e-01, 1.718e-01, -1.005e-01, -2.593e-01, 4.103e-01, 2.477e-01, 1.883e-01, 3.928e-02, -3.635e-01, -7.353e-01, 3.209e-01, 2.171e-01, 3.924e-01));
    acc += mul(s0_2, M4(-3.304e-01, 6.332e-01, -3.898e-01, 2.704e-01, 4.110e-02, -2.786e-01, -2.513e-01, 1.800e-01, 9.402e-03, -1.975e-01, -4.040e-02, -2.047e-01, -3.239e-01, -1.623e-01, 1.001e-01, 3.053e-02));
    acc += mul(s0_3, M4(3.935e-01, 5.218e-02, 4.630e-02, 2.202e-02, -2.172e-01, -1.530e-02, -1.782e-01, -9.327e-02, -6.425e-02, -2.402e-02, -2.919e-02, -4.034e-02, 6.589e-01, -4.900e-02, 7.783e-02, 6.334e-01));
    acc += mul(s0_4, M4(-5.475e-02, 1.543e-01, -1.597e-01, -2.500e-01, 4.990e-02, 5.780e-02, 1.162e-01, 1.140e-01, -2.980e-01, -2.524e-02, -2.103e-01, 4.297e-01, 4.528e-01, -3.098e-01, -1.415e-01, 7.565e-01));
    acc += mul(s0_5, M4(-1.097e-01, 3.376e-01, -5.685e-01, 1.347e-01, 1.155e-01, -1.396e-01, -2.840e-01, -1.373e-01, 1.442e-01, 8.711e-02, 1.357e-01, -1.110e-01, 2.095e-01, -2.901e-01, -1.007e-01, -2.473e-01));
    acc += mul(s0_6, M4(-7.405e-02, 9.320e-02, -5.870e-02, -2.569e-01, 6.017e-03, -8.078e-02, -3.798e-02, 2.334e-01, 1.440e-01, -1.852e-01, -6.627e-03, 3.514e-03, -1.499e-02, -6.237e-02, 3.665e-01, 3.270e-01));
    acc += mul(s0_7, M4(2.443e-01, 8.076e-02, -2.143e-01, 1.120e-01, 1.187e-01, 1.317e-01, 1.811e-01, 1.918e-01, -2.164e-02, -1.829e-01, 2.105e-01, 3.085e-01, 3.155e-01, 2.801e-01, -6.834e-01, 2.861e-01));
    acc += mul(s0_8, M4(9.974e-03, 9.704e-02, -2.363e-01, 1.829e-01, 1.844e-02, 9.298e-02, -5.319e-02, -5.899e-02, -2.154e-01, 2.555e-02, -8.374e-02, 1.254e-01, -2.736e-01, -4.065e-02, 4.838e-02, 3.338e-02));

    // Group s1: negative part of the t0 taps.
    acc += mul(s1_0, M4(-1.239e-02, -1.316e-01, 8.694e-02, -8.443e-02, -1.143e-01, -6.018e-02, -9.054e-02, 7.381e-02, 2.722e-01, 1.030e-01, -8.583e-02, -4.433e-01, -1.339e-01, 1.264e-01, 8.581e-02, -1.947e-01));
    acc += mul(s1_1, M4(3.030e-01, -3.527e-02, 4.665e-01, -3.372e-02, -2.301e-02, 7.308e-01, 5.938e-01, -5.901e-01, 4.766e-01, 1.081e-01, 8.809e-02, 3.482e-01, -1.938e-01, -8.091e-02, 3.649e-02, 9.321e-02));
    acc += mul(s1_2, M4(1.376e-01, -4.460e-01, 4.298e-01, -4.809e-02, -3.819e-01, 5.216e-01, 2.687e-01, 1.359e-01, 2.936e-01, 1.222e-02, 3.706e-01, 2.481e-01, -4.716e-02, -1.798e-02, 2.731e-02, -7.140e-02));
    acc += mul(s1_3, M4(1.657e-01, -3.624e-02, 1.541e-01, -5.006e-03, -4.051e-01, -9.782e-02, 3.008e-02, 1.962e-01, -6.146e-02, 1.866e-03, -3.052e-01, -2.202e-01, 1.057e-01, -1.151e-01, -6.310e-02, 3.914e-01));
    acc += mul(s1_4, M4(-2.629e-01, 1.029e-01, 1.812e-02, -2.950e-01, -1.191e-01, 2.580e-01, -4.833e-01, 1.095e-01, 2.309e-02, 4.519e-02, 1.086e-01, 5.362e-01, -1.349e-01, -1.278e-01, 7.109e-02, -1.992e-01));
    acc += mul(s1_5, M4(-1.815e-01, 2.898e-01, 3.446e-01, -1.587e-01, -6.360e-02, 1.662e-01, 5.187e-01, 1.701e-01, -2.770e-02, -5.932e-01, 2.467e-01, 3.940e-01, 1.022e-01, 1.033e-01, -5.084e-02, -6.520e-02));
    acc += mul(s1_6, M4(-1.494e-01, 3.180e-02, 9.864e-02, -3.409e-01, 1.397e-02, 9.932e-03, -2.110e-01, 2.636e-01, 1.353e-01, -8.495e-02, -2.680e-03, -2.287e-01, 1.136e-01, -1.047e-01, 2.910e-02, 9.922e-02));
    acc += mul(s1_7, M4(1.533e-01, 4.819e-04, 1.735e-01, 2.027e-01, 1.316e-01, 1.029e-01, 1.446e-01, 1.737e-01, 4.855e-02, 4.781e-02, 2.025e-01, 1.587e-01, 1.661e-01, 7.134e-02, 5.853e-02, -1.530e-01));
    acc += mul(s1_8, M4(-1.476e-01, -4.916e-02, 1.989e-01, 1.159e-01, 4.753e-02, 1.694e-01, 4.343e-02, -6.974e-03, 3.382e-02, 2.275e-01, 3.466e-01, -7.178e-03, -1.104e-01, 2.059e-03, -7.101e-02, 8.934e-02));

    // Group s2: positive part of the t1 taps.
    acc += mul(s2_0, M4(-3.467e-01, 8.471e-04, 1.580e-01, 2.685e-01, -2.680e-02, -6.444e-02, 8.843e-02, 5.232e-03, 2.576e-02, -3.756e-02, -7.913e-03, -3.871e-02, -5.374e-02, -6.060e-02, -7.688e-02, 6.738e-01));
    acc += mul(s2_1, M4(-3.963e-01, 1.295e-01, 2.623e-01, 2.565e-01, -1.831e-01, -6.054e-02, 1.817e-01, -8.944e-02, 1.974e-01, -2.800e-04, -3.964e-02, 1.232e-01, -3.477e-01, 3.791e-01, 1.438e-01, -7.862e-02));
    acc += mul(s2_2, M4(2.540e-02, 1.123e-01, 6.461e-01, -3.856e-03, 3.373e-02, -5.719e-02, 1.556e-01, -1.100e-01, -3.499e-02, 9.146e-02, -4.624e-02, 9.774e-02, -1.148e-01, -2.280e-01, 4.977e-01, -1.568e-01));
    acc += mul(s2_3, M4(5.352e-02, -1.293e-01, -6.991e-03, 4.190e-01, -2.334e-03, -4.433e-02, -8.470e-02, 1.162e-01, -1.045e-01, -7.444e-02, 8.951e-02, -1.124e-01, 4.295e-01, 1.086e-01, 1.336e-01, 2.645e-01));
    acc += mul(s2_4, M4(-4.062e-01, -6.781e-02, 4.629e-01, -4.931e-01, -1.875e-01, 1.958e-01, -4.560e-01, -2.286e-02, -2.066e-01, 1.151e-01, -5.924e-02, 1.350e-01, -1.752e-01, 2.244e-01, -3.564e-02, -6.129e-01));
    acc += mul(s2_5, M4(6.644e-02, 4.611e-01, 9.200e-02, 6.845e-03, -1.628e-02, 8.352e-02, -1.119e-01, -4.386e-02, -5.822e-02, -4.769e-02, -3.224e-02, -1.235e-01, -3.296e-01, 5.835e-03, 2.231e-01, 5.535e-02));
    acc += mul(s2_6, M4(-2.961e-02, -5.230e-02, 5.124e-02, 6.542e-02, 2.004e-01, 1.189e-01, -1.797e-01, -1.535e-02, 6.469e-02, 1.134e-01, -1.204e-04, -7.606e-02, 2.436e-02, -1.630e-02, 1.841e-01, -2.529e-01));
    acc += mul(s2_7, M4(-1.147e-02, 3.246e-02, 7.626e-02, -1.013e-01, 1.075e-01, 5.871e-01, -5.227e-01, -3.076e-01, 1.609e-01, 5.768e-02, -1.912e-02, 5.898e-02, -7.530e-02, -1.307e-01, 5.828e-02, -1.456e-02));
    acc += mul(s2_8, M4(-7.053e-02, 8.728e-02, 1.211e-01, 1.410e-01, -2.160e-01, 9.970e-02, -5.345e-01, 1.141e-01, 8.112e-04, -4.348e-02, 9.858e-02, 2.780e-02, -1.116e-01, -2.331e-01, 1.545e-01, 7.984e-02));

    // Group s3: negative part of the t1 taps.
    acc += mul(s3_0, M4(-5.412e-02, 6.012e-03, -2.395e-01, -1.209e-02, -5.734e-02, 3.058e-02, -7.202e-02, -7.514e-02, 7.241e-03, -1.702e-01, 1.020e+00, 2.997e-01, -2.173e-01, 4.518e-02, -2.703e-02, -4.087e-02));
    acc += mul(s3_1, M4(-5.670e-02, -9.713e-03, -2.091e-01, -1.621e-01, -5.370e-03, -5.579e-02, 1.042e-01, 2.220e-02, 4.788e-01, -6.623e-01, 5.548e-01, 8.186e-01, 2.462e-01, -7.624e-01, -9.065e-02, -1.105e-02));
    acc += mul(s3_2, M4(4.043e-02, -1.577e-01, -3.166e-01, -1.256e-01, -9.515e-02, -8.852e-02, -4.960e-02, 1.129e-01, 1.690e-01, 2.314e-01, -5.134e-01, 9.584e-02, -3.085e-02, 2.399e-01, -3.381e-01, -7.233e-02));
    acc += mul(s3_3, M4(1.750e-01, -9.450e-02, -2.230e-01, 4.190e-01, 8.900e-02, 2.306e-02, 2.783e-01, -3.295e-01, 2.697e+00, 8.855e-02, 5.728e-01, -8.682e-01, 6.085e-02, 5.010e-02, 1.343e-01, 1.137e-01));
    acc += mul(s3_4, M4(9.857e-02, 3.310e-01, -3.584e-01, -5.586e-01, 5.751e-01, -4.023e-01, 3.838e-01, 1.240e-01, -1.482e-01, -1.233e-01, -5.953e-01, 1.534e+00, 3.390e-01, -2.022e-02, 1.619e-01, -2.959e-01));
    acc += mul(s3_5, M4(1.528e-01, 1.593e-01, -1.886e-01, 2.281e-02, 2.174e-01, -8.846e-01, 5.726e-02, 7.369e-03, -1.490e-01, 3.377e-01, -4.669e-02, 1.206e-01, -1.251e-01, 2.600e-01, -2.439e-01, 2.067e-01));
    acc += mul(s3_6, M4(4.090e-02, -2.118e-02, -9.012e-02, -8.624e-03, 1.464e-01, 6.929e-02, 1.492e-01, -4.039e-01, 6.123e-01, 2.679e-01, -2.284e-01, -3.609e-01, -6.598e-02, 1.341e-01, -2.371e-02, -2.899e-01));
    acc += mul(s3_7, M4(5.189e-02, -3.928e-02, 1.670e-01, -1.536e-01, 5.066e-01, -3.768e-01, 6.577e-01, 1.140e-01, -1.537e-01, -1.941e-01, -9.152e-02, -3.571e-02, 1.068e-01, 4.803e-02, -3.180e-01, 4.361e-02));
    acc += mul(s3_8, M4(-8.453e-02, -1.454e-02, 3.613e-02, 8.974e-03, -1.258e-01, -5.842e-01, 3.264e-01, 2.910e-01, 1.306e-01, 4.552e-01, 4.524e-01, 1.065e-02, -1.792e-02, 1.875e-02, -2.206e-01, 2.028e-01));

    // Constant term, added last.
    acc += V4(3.015e-03, -4.690e-02, 3.573e-02, -1.486e-02);
    return acc;
}
|
||||
|
||||
// Entry point of pass 2 (conv1). Each thread handles one texel: it reads the
// 3x3 neighbourhoods of t0 and t1, splits every sample into its positive and
// negative parts, and writes the two 4-channel results of f0/f1 to t2 and t3.
//   blockStart - origin of the block this thread group works on
//   tid        - thread id; only tid.x is used, remapped by Rmp8x8()
void Pass2(uint2 blockStart, uint3 tid) {
    // NOTE(review): `pt` and `pos` are not referenced by name below; they are
    // presumably consumed inside the l0()/l1()/O() macros, so their names must stay.
    float2 pt = float2(GetInputPt());
    uint2 gxy = blockStart + Rmp8x8(tid.x);
    uint2 size = GetInputSize();

    // Threads that fall outside the input have nothing to write.
    if (any(gxy >= size)) {
        return;
    }

    // Texel centre scaled by pt.
    float2 pos = (gxy + 0.5) * pt;

    // Raw 3x3 neighbourhood of t0, row by row from (-1,-1) to (1,1).
    V4 u0 = l0(-1.0, -1.0);
    V4 u1 = l0(0.0, -1.0);
    V4 u2 = l0(1.0, -1.0);
    V4 u3 = l0(-1.0, 0.0);
    V4 u4 = l0(0.0, 0.0);
    V4 u5 = l0(1.0, 0.0);
    V4 u6 = l0(-1.0, 1.0);
    V4 u7 = l0(0.0, 1.0);
    V4 u8 = l0(1.0, 1.0);

    // Negative part of each t0 sample (values <= 0).
    V4 uNeg0 = -max(-u0, 0.0);
    V4 uNeg1 = -max(-u1, 0.0);
    V4 uNeg2 = -max(-u2, 0.0);
    V4 uNeg3 = -max(-u3, 0.0);
    V4 uNeg4 = -max(-u4, 0.0);
    V4 uNeg5 = -max(-u5, 0.0);
    V4 uNeg6 = -max(-u6, 0.0);
    V4 uNeg7 = -max(-u7, 0.0);
    V4 uNeg8 = -max(-u8, 0.0);

    // Positive part of each t0 sample (values >= 0).
    V4 uPos0 = max(u0, 0.0);
    V4 uPos1 = max(u1, 0.0);
    V4 uPos2 = max(u2, 0.0);
    V4 uPos3 = max(u3, 0.0);
    V4 uPos4 = max(u4, 0.0);
    V4 uPos5 = max(u5, 0.0);
    V4 uPos6 = max(u6, 0.0);
    V4 uPos7 = max(u7, 0.0);
    V4 uPos8 = max(u8, 0.0);

    // Raw 3x3 neighbourhood of t1, same tap order.
    V4 v0 = l1(-1.0, -1.0);
    V4 v1 = l1(0.0, -1.0);
    V4 v2 = l1(1.0, -1.0);
    V4 v3 = l1(-1.0, 0.0);
    V4 v4 = l1(0.0, 0.0);
    V4 v5 = l1(1.0, 0.0);
    V4 v6 = l1(-1.0, 1.0);
    V4 v7 = l1(0.0, 1.0);
    V4 v8 = l1(1.0, 1.0);

    // Negative part of each t1 sample.
    V4 vNeg0 = -max(-v0, 0.0);
    V4 vNeg1 = -max(-v1, 0.0);
    V4 vNeg2 = -max(-v2, 0.0);
    V4 vNeg3 = -max(-v3, 0.0);
    V4 vNeg4 = -max(-v4, 0.0);
    V4 vNeg5 = -max(-v5, 0.0);
    V4 vNeg6 = -max(-v6, 0.0);
    V4 vNeg7 = -max(-v7, 0.0);
    V4 vNeg8 = -max(-v8, 0.0);

    // Positive part of each t1 sample.
    V4 vPos0 = max(v0, 0.0);
    V4 vPos1 = max(v1, 0.0);
    V4 vPos2 = max(v2, 0.0);
    V4 vPos3 = max(v3, 0.0);
    V4 vPos4 = max(v4, 0.0);
    V4 vPos5 = max(v5, 0.0);
    V4 vPos6 = max(v6, 0.0);
    V4 vPos7 = max(v7, 0.0);
    V4 vPos8 = max(v8, 0.0);

    // Argument order expected by f0/f1: t0 positive, t0 negative,
    // t1 positive, t1 negative (nine taps each).
    t2[gxy] = f0(
        uPos0, uPos1, uPos2, uPos3, uPos4, uPos5, uPos6, uPos7, uPos8,
        uNeg0, uNeg1, uNeg2, uNeg3, uNeg4, uNeg5, uNeg6, uNeg7, uNeg8,
        vPos0, vPos1, vPos2, vPos3, vPos4, vPos5, vPos6, vPos7, vPos8,
        vNeg0, vNeg1, vNeg2, vNeg3, vNeg4, vNeg5, vNeg6, vNeg7, vNeg8);
    t3[gxy] = f1(
        uPos0, uPos1, uPos2, uPos3, uPos4, uPos5, uPos6, uPos7, uPos8,
        uNeg0, uNeg1, uNeg2, uNeg3, uNeg4, uNeg5, uNeg6, uNeg7, uNeg8,
        vPos0, vPos1, vPos2, vPos3, vPos4, vPos5, vPos6, vPos7, vPos8,
        vNeg0, vNeg1, vNeg2, vNeg3, vNeg4, vNeg5, vNeg6, vNeg7, vNeg8);
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t2, t3
|
||||
//!OUT t0, t1
|
||||
|
||||
// Tap helpers for pass 3: l0/l1 read t2/t3 at offset (x, y) through the O()
// macro (defined elsewhere in this file) and convert the result to V4.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||
|
||||
// Pass 3 (conv2), first output group: sums mul(input, weights) over all 36
// inputs and then adds a constant offset vector (presumably the layer bias).
// Inputs, as prepared by Pass3:
//   s0_k = max(t2 tap k, 0)      s1_k = -max(-(t2 tap k), 0)
//   s2_k = max(t3 tap k, 0)      s3_k = -max(-(t3 tap k), 0)
// with k = 0..8 walking the 3x3 neighbourhood row by row from (-1,-1) to (1,1).
// The summation order is kept exactly as generated.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
    V4 acc = 0.0;

    // Group s0: positive part of the t2 taps.
    acc += mul(s0_0, M4(8.130e-02, -1.243e-01, -7.648e-02, -2.424e-01, -4.742e-02, -5.420e-02, 4.117e-02, 1.568e-01, -3.621e-02, 2.032e-01, 4.484e-02, 1.249e-02, -1.505e-01, 7.294e-02, 4.943e-02, -6.336e-02));
    acc += mul(s0_1, M4(-1.474e-01, -3.366e-01, -5.670e-01, 4.113e-02, -1.260e-01, -1.539e-01, -5.421e-02, 1.779e-01, -1.072e-01, 1.209e-01, 4.423e-02, 2.454e-01, -5.430e-02, -1.442e-01, -1.501e-02, -4.731e-02));
    acc += mul(s0_2, M4(6.444e-02, 1.509e-01, 1.452e-01, -2.840e-02, 9.365e-02, 2.016e-01, 1.002e-01, -3.226e-02, -1.186e-01, 1.535e-01, -1.652e-01, -1.104e-02, 4.170e-02, -4.404e-02, 1.189e-01, 1.007e-02));
    acc += mul(s0_3, M4(-4.618e-02, 1.024e-01, -1.723e-01, -1.354e-01, 1.981e-01, -1.992e-01, 1.670e-01, 3.857e-01, -6.927e-03, 9.087e-02, 1.176e-01, 3.314e-01, 9.860e-02, 4.009e-04, 1.061e-01, -6.930e-02));
    acc += mul(s0_4, M4(4.923e-01, -9.248e-02, 8.616e-03, 4.541e-02, -1.148e-01, 3.990e-03, -3.218e-02, 8.942e-02, 3.219e-02, -9.786e-02, 6.813e-02, 2.492e-01, -3.165e-01, 6.925e-02, -9.826e-02, 4.518e-01));
    acc += mul(s0_5, M4(2.222e-02, 1.046e-01, -2.327e-02, -7.823e-02, 3.540e-01, 3.363e-01, -4.089e-02, 1.292e-02, -2.530e-01, 4.606e-01, -6.191e-02, -3.673e-02, -2.764e-01, -1.360e-01, -2.947e-03, 2.534e-02));
    acc += mul(s0_6, M4(-2.067e-02, -1.566e-01, -8.968e-02, 1.386e-03, -8.841e-02, -1.077e-01, 1.646e-01, 1.987e-01, -3.098e-01, 2.764e-01, 1.935e-01, 1.847e-01, 1.116e-01, -1.514e-01, -5.175e-02, 8.710e-02));
    acc += mul(s0_7, M4(1.266e-02, -2.119e-01, -1.610e-01, -6.512e-02, -1.679e-01, 2.247e-01, -5.854e-02, 1.200e-02, -1.406e-01, 4.393e-01, -8.517e-02, 3.281e-02, -1.177e-01, -1.861e-01, -3.241e-01, -2.918e-02));
    acc += mul(s0_8, M4(-3.015e-02, -1.605e-01, -1.001e-01, 7.795e-03, -5.873e-02, -7.686e-02, -1.448e-01, -1.851e-02, -2.172e-01, 1.977e-01, -1.333e-01, -8.894e-02, -8.939e-03, 1.675e-01, -7.976e-03, 4.020e-02));

    // Group s1: negative part of the t2 taps.
    acc += mul(s1_0, M4(1.165e-01, -4.833e-02, 4.750e-02, -4.032e-02, -2.287e-02, -4.825e-02, 9.058e-02, 2.136e-01, 1.009e-01, -2.133e-02, 4.162e-02, -6.816e-02, -9.863e-02, -4.160e-03, -2.467e-02, -9.096e-02));
    acc += mul(s1_1, M4(8.597e-02, -2.205e-01, 1.515e-01, -2.918e-02, -1.099e-01, -4.171e-02, 3.893e-04, -5.273e-03, -2.046e-02, -3.905e-03, 7.793e-04, 5.930e-02, 2.653e-02, -2.546e-01, -8.456e-02, -6.554e-02));
    acc += mul(s1_2, M4(-1.058e-01, 3.302e-01, 1.812e-01, 6.427e-02, -4.601e-02, -1.589e-02, 4.405e-02, -1.366e-02, -5.996e-03, -5.402e-04, 3.237e-02, -5.725e-02, -7.486e-02, 1.358e-01, 4.739e-02, -2.432e-02));
    acc += mul(s1_3, M4(3.333e-02, 5.179e-01, -1.939e-03, 7.798e-02, 2.011e-02, -2.959e-01, 1.135e-01, 3.122e-01, 8.651e-02, -2.708e-02, 7.183e-03, 4.554e-02, -3.342e-02, 9.136e-03, -7.067e-02, -1.867e-01));
    acc += mul(s1_4, M4(6.231e-01, 9.512e-01, 3.523e-01, 3.744e-01, 2.388e-01, -2.827e-01, 9.968e-02, -5.306e-02, -4.498e-02, -2.222e-01, -5.865e-02, 2.967e-02, -3.029e-01, -2.137e-01, -5.363e-01, 8.872e-02));
    acc += mul(s1_5, M4(-4.862e-02, 7.326e-01, 1.354e-01, 5.607e-02, 1.667e-01, -1.184e-01, -1.304e-01, 6.817e-02, 3.287e-02, 3.310e-01, 1.521e-01, -3.212e-02, -8.947e-02, 4.250e-02, -9.770e-02, -8.344e-02));
    acc += mul(s1_6, M4(-9.242e-04, 4.835e-03, 1.322e-01, 3.745e-02, 9.613e-02, -8.310e-03, 4.718e-02, 2.763e-02, -1.616e-02, 6.167e-02, -3.382e-02, 3.624e-02, 1.213e-02, -2.014e-01, -2.776e-03, 4.360e-02));
    acc += mul(s1_7, M4(-6.861e-02, 4.772e-02, -3.779e-02, 7.567e-02, -8.548e-02, -1.028e-02, 1.881e-02, 2.421e-03, 1.378e-01, 1.305e-01, 2.177e-02, -1.118e-03, 5.861e-02, -1.416e-01, -3.140e-01, -9.031e-02));
    acc += mul(s1_8, M4(-4.147e-02, 1.546e-01, 5.650e-02, 4.098e-02, -1.460e-01, -5.779e-02, -1.959e-02, -2.318e-02, 3.538e-02, -5.044e-02, 3.304e-02, -3.517e-03, -1.176e-01, -3.185e-01, -1.738e-01, -4.349e-02));

    // Group s2: positive part of the t3 taps.
    acc += mul(s2_0, M4(-3.428e-03, 6.059e-02, 7.024e-02, 2.739e-02, 1.313e-02, -5.748e-02, 9.005e-03, -7.139e-03, 1.165e-01, -1.541e-01, 1.493e-01, 2.725e-01, 3.254e-02, -2.934e-02, 1.115e-02, -2.844e-02));
    acc += mul(s2_1, M4(-8.601e-03, -3.177e-03, 1.878e-01, 1.106e-01, 1.951e-02, 8.194e-02, 4.971e-02, 5.805e-02, 2.515e-02, -2.529e-01, -2.250e-01, 3.498e-02, 7.183e-02, -8.617e-02, -8.616e-02, 1.623e-01));
    acc += mul(s2_2, M4(-8.072e-02, -1.234e-01, 3.482e-02, -2.873e-02, -4.049e-02, 4.828e-03, 1.940e-02, 3.828e-02, -5.156e-03, 4.585e-03, 2.326e-02, 2.346e-02, -8.908e-02, -1.384e-03, -2.366e-02, 1.290e-02));
    acc += mul(s2_3, M4(4.921e-02, 1.726e-01, 3.832e-02, -2.490e-01, -1.152e-01, -1.722e-01, -1.705e-01, 4.228e-01, -8.215e-02, -1.478e-02, 1.554e-01, 3.701e-01, -8.863e-02, 1.068e-01, 8.890e-03, 6.324e-02));
    acc += mul(s2_4, M4(1.307e-01, 2.312e-01, -1.734e-01, 2.083e-02, -1.966e-01, -3.991e-01, -8.681e-02, 1.976e-03, -3.177e-01, 1.528e-01, -2.329e-01, 2.569e-01, -6.230e-03, 6.020e-02, 4.969e-02, -2.039e-01));
    acc += mul(s2_5, M4(1.660e-01, 1.642e-02, 7.203e-02, -1.613e-01, 6.225e-02, 6.470e-02, 3.305e-03, 2.230e-02, -2.455e-02, 6.599e-02, -1.740e-01, 7.887e-02, 3.463e-03, 1.003e-01, -1.850e-01, 7.885e-02));
    acc += mul(s2_6, M4(-2.170e-02, 1.372e-01, 7.445e-02, -9.419e-02, -1.851e-01, 4.957e-02, -2.454e-01, 5.879e-02, -5.800e-02, -1.122e-01, 7.445e-02, 1.190e-01, 2.695e-02, -5.701e-02, -5.166e-02, -5.058e-02));
    acc += mul(s2_7, M4(5.390e-01, 1.674e-01, 1.213e-01, -1.147e-01, -6.939e-02, -1.218e-01, -2.891e-01, 2.682e-02, -2.636e-01, -1.104e-01, -1.556e-01, 3.774e-02, -4.121e-02, -2.431e-01, -1.248e-01, 1.275e-01));
    acc += mul(s2_8, M4(1.053e-01, 2.238e-01, -1.104e-01, 5.372e-02, 6.179e-02, -2.431e-03, -4.843e-02, 3.820e-02, -7.539e-02, 7.898e-02, 7.562e-03, 1.596e-02, 7.298e-02, -1.553e-01, -3.545e-01, 1.990e-02));

    // Group s3: negative part of the t3 taps.
    acc += mul(s3_0, M4(8.232e-02, -6.815e-02, -7.421e-02, -3.191e-02, -1.592e-01, 2.814e-01, 5.009e-02, 3.669e-02, -5.908e-02, -5.445e-02, 4.873e-02, 1.538e-01, 1.065e-01, -2.194e-01, -2.612e-02, -2.297e-02));
    acc += mul(s3_1, M4(1.431e-02, -7.835e-02, -2.790e-03, 9.305e-02, -2.975e-01, 1.527e-01, 1.888e-01, -1.279e-02, -1.938e-02, -1.022e-01, -2.197e-02, -2.919e-02, 2.192e-01, -8.056e-02, 1.328e-03, 3.478e-02));
    acc += mul(s3_2, M4(4.920e-03, -6.286e-02, -7.779e-02, 1.075e-01, -1.092e-01, 2.909e-01, 3.056e-01, -9.017e-02, -3.625e-02, 1.079e-01, 1.107e-01, 6.613e-02, 1.696e-01, -1.852e-01, -1.253e-01, -9.675e-02));
    acc += mul(s3_3, M4(-6.350e-02, 1.137e-01, -3.559e-02, -1.684e-01, -2.044e-01, -9.368e-02, 2.283e-01, 8.052e-01, 4.476e-03, -1.599e-01, 2.594e-02, 1.582e-01, -2.483e-02, 9.216e-02, 5.719e-02, 2.237e-01));
    acc += mul(s3_4, M4(-1.694e-01, 1.597e-01, -3.311e-01, 1.880e-01, 2.614e-01, -2.584e-01, 5.296e-02, 9.726e-02, -3.932e-02, -7.518e-02, -1.749e-01, 1.604e-01, 1.008e-01, 2.920e-01, 5.358e-01, -6.383e-01));
    acc += mul(s3_5, M4(-2.706e-01, -2.716e-01, -4.196e-01, 1.023e-01, 2.201e-01, -1.412e-01, 1.003e-01, -6.972e-02, 3.727e-02, -8.424e-02, -7.870e-02, 2.294e-02, 2.836e-01, -4.165e-01, -2.974e-01, -3.567e-02));
    acc += mul(s3_6, M4(-3.434e-02, 6.420e-02, -8.729e-02, -8.600e-02, -2.041e-01, 1.646e-02, 9.025e-02, 1.724e-01, -4.951e-02, -3.894e-02, -7.985e-02, 1.580e-02, 2.554e-01, -3.100e-01, -2.769e-01, 8.336e-05));
    acc += mul(s3_7, M4(-6.557e-02, 3.865e-02, -3.263e-02, 4.621e-02, -2.077e-01, 2.705e-02, -3.354e-01, 1.480e-01, 4.155e-02, -2.143e-01, -2.626e-01, 1.091e-02, 1.382e-01, -1.706e-01, -1.355e-01, -7.700e-02));
    acc += mul(s3_8, M4(-2.004e-01, 4.575e-01, -1.812e-01, 6.102e-02, 3.469e-01, -6.634e-02, 1.302e-01, -9.621e-02, 4.023e-02, 1.048e-01, -9.194e-02, 5.130e-03, 4.272e-01, -5.971e-01, -2.025e-01, -1.364e-01));

    // Constant term, added last.
    acc += V4(3.575e-03, 3.041e-03, 1.241e-02, -2.230e-03);
    return acc;
}
|
||||
|
||||
// Pass 3 (conv2), second output group: same structure as the pass-3 f0 with a
// different weight set. Sums mul(input, weights) over all 36 inputs and then
// adds a constant offset vector (presumably the layer bias).
// Inputs, as prepared by Pass3:
//   s0_k = max(t2 tap k, 0)      s1_k = -max(-(t2 tap k), 0)
//   s2_k = max(t3 tap k, 0)      s3_k = -max(-(t3 tap k), 0)
// with k = 0..8 walking the 3x3 neighbourhood row by row from (-1,-1) to (1,1).
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
    V4 acc = 0.0;

    // Group s0: positive part of the t2 taps.
    acc += mul(s0_0, M4(-1.437e-01, -9.784e-02, 2.649e-01, -8.638e-02, -1.746e-01, 2.031e-01, 1.203e-01, -8.812e-02, -2.317e-01, 2.311e-01, 3.171e-02, -3.619e-02, -7.798e-02, -2.507e-02, 1.902e-01, 5.780e-02));
    acc += mul(s0_1, M4(2.504e-01, 1.577e-01, -5.397e-02, 4.599e-01, -1.392e-01, 2.560e-01, 1.018e-01, 7.968e-02, 2.247e-01, -2.962e-03, 1.421e-03, -1.201e-01, -3.622e-01, 1.378e-01, 1.392e-01, 1.641e-01));
    acc += mul(s0_2, M4(-6.143e-02, -6.336e-02, 1.131e-01, 6.811e-02, -5.817e-02, 7.362e-02, 1.407e-01, 1.823e-02, 4.880e-01, -2.282e-01, -2.704e-01, -4.287e-01, -2.741e-01, 3.163e-02, 1.098e-01, 1.514e-01));
    acc += mul(s0_3, M4(1.794e-01, 1.720e-01, -4.092e-01, 1.277e-01, -1.938e-01, 3.107e-01, 2.915e-01, 2.279e-01, 2.259e-01, 2.136e-01, 5.867e-02, 2.359e-01, -1.589e-01, 1.132e-01, 6.871e-02, 2.837e-01));
    acc += mul(s0_4, M4(-3.070e-01, -4.494e-01, 5.817e-02, 5.153e-01, 5.215e-01, 5.410e-01, 1.286e-01, -5.596e-01, 4.287e-01, 1.821e-01, 1.542e-01, 3.755e-01, 3.820e-01, 2.953e-01, -2.768e-01, -6.977e-02));
    acc += mul(s0_5, M4(-4.881e-02, 2.327e-02, 9.209e-02, -2.102e-02, -1.394e-01, -8.093e-03, 2.263e-01, -4.307e-01, 1.998e-01, -8.793e-02, -1.057e-01, -1.899e-01, 1.577e-01, 3.435e-01, 6.721e-02, 3.093e-01));
    acc += mul(s0_6, M4(7.516e-02, -1.224e-01, 1.257e-02, -6.769e-02, -8.618e-02, 1.283e-01, 2.060e-01, -1.966e-01, 8.166e-02, -1.263e-01, -2.269e-01, -3.272e-01, -3.439e-02, -2.849e-01, 2.105e-01, -3.015e-03));
    acc += mul(s0_7, M4(7.447e-02, -8.731e-02, 2.804e-02, -4.819e-02, -3.311e-01, 3.824e-01, 7.766e-02, 5.672e-02, 4.014e-01, -4.037e-03, 2.287e-01, 5.626e-02, 3.481e-01, -1.010e-01, -1.156e-01, -2.865e-01));
    acc += mul(s0_8, M4(5.454e-02, -5.590e-02, 3.408e-02, 3.551e-03, 1.262e-02, 8.638e-02, 1.222e-01, 3.418e-01, -2.154e-01, 1.868e-01, 1.210e-01, -2.330e-01, -4.810e-02, -5.190e-02, -8.587e-02, -2.145e-01));

    // Group s1: negative part of the t2 taps.
    acc += mul(s1_0, M4(-3.063e-01, -1.830e-02, 5.167e-01, 4.813e-02, -7.310e-02, 1.443e-01, 1.654e-01, 1.158e-01, 4.789e-02, -3.030e-02, -1.358e-01, 2.986e-02, -4.855e-02, -7.736e-02, 4.514e-01, -1.797e-02));
    acc += mul(s1_1, M4(4.322e-01, -1.369e-01, 9.431e-02, 3.921e-01, 2.708e-02, -1.218e-02, -9.091e-02, 1.871e-01, 3.763e-02, -9.213e-02, -1.209e-01, -1.587e-01, 3.014e-03, 1.816e-01, 3.099e-01, 3.210e-01));
    acc += mul(s1_2, M4(-7.234e-02, 1.685e-02, 4.444e-01, -1.886e-01, -9.543e-03, 3.966e-02, 1.105e-01, 4.870e-02, 9.471e-02, -5.263e-02, -1.085e-01, 4.226e-02, -1.565e-01, -3.812e-02, 1.708e-01, 1.457e-01));
    acc += mul(s1_3, M4(2.370e-01, -3.354e-02, -9.648e-02, 1.531e-01, -3.468e-01, -3.957e-02, 3.152e-01, 3.402e-02, 3.762e-02, 9.507e-02, 7.836e-02, 9.088e-03, -1.614e-01, 4.377e-02, 4.748e-02, 1.055e-01));
    acc += mul(s1_4, M4(2.342e-01, -5.059e-01, 2.781e-01, 2.906e-01, 1.656e-01, 1.268e-01, 1.183e-01, -2.458e-02, 2.290e-01, 1.779e-01, -8.310e-02, 1.389e-01, 7.282e-02, 1.050e-01, -3.525e-01, 6.810e-02));
    acc += mul(s1_5, M4(1.078e-01, -4.451e-02, 7.031e-02, -2.977e-01, 3.596e-02, 3.359e-02, 9.589e-03, 9.070e-02, -1.862e-01, -1.863e-01, -9.652e-02, -5.039e-02, 1.004e-01, 1.598e-01, 1.466e-01, 2.349e-01));
    acc += mul(s1_6, M4(1.109e-02, -1.607e-01, 1.578e-02, -1.971e-01, 5.020e-02, -7.597e-02, 7.238e-02, 7.241e-02, 2.025e-02, -2.246e-02, 4.652e-02, -8.760e-02, -1.111e-02, 1.890e-02, 1.046e-01, -2.233e-03));
    acc += mul(s1_7, M4(1.252e-01, -8.046e-02, -1.321e-01, -3.724e-01, -1.383e-01, 1.151e-01, 5.397e-02, -1.422e-01, 8.319e-02, 9.089e-02, -2.620e-02, 1.662e-01, 2.847e-02, -1.255e-01, 6.933e-02, -1.636e-01));
    acc += mul(s1_8, M4(-1.517e-01, 3.661e-02, -3.135e-01, -3.395e-01, -1.139e-01, 1.973e-01, 8.547e-03, -3.118e-02, -8.869e-02, -1.209e-01, 1.867e-02, -4.531e-02, 1.016e-01, -6.909e-02, 1.436e-01, 1.663e-01));

    // Group s2: positive part of the t3 taps.
    acc += mul(s2_0, M4(-9.314e-02, 1.395e-02, -1.741e-02, -7.208e-02, -5.164e-02, -5.743e-02, 5.702e-02, 1.342e-01, 6.011e-03, 1.626e-01, 1.101e-01, -1.130e-01, 6.127e-02, -8.956e-03, -7.149e-02, -6.488e-03));
    acc += mul(s2_1, M4(-2.534e-01, 1.086e-01, -1.007e-01, -3.067e-02, -1.074e-01, 7.219e-03, 6.768e-02, -1.012e-01, 2.019e-01, 4.263e-03, -7.411e-02, -1.173e-01, 1.961e-01, -5.619e-02, -2.390e-01, -1.323e-01));
    acc += mul(s2_2, M4(-1.039e-01, -9.899e-02, -2.206e-01, -2.187e-01, -8.739e-03, 6.607e-02, 4.125e-02, 5.363e-02, -6.572e-03, 3.014e-02, 1.314e-01, -9.560e-02, 2.106e-01, 1.237e-02, -8.354e-02, -4.939e-03));
    acc += mul(s2_3, M4(-4.682e-02, -1.357e-01, 3.481e-02, -2.187e-01, 1.113e-01, 8.812e-02, -1.211e-01, -2.011e-02, 1.567e-01, -2.216e-02, -4.920e-03, -2.458e-01, 2.263e-02, 6.741e-02, -1.234e-02, 2.338e-02));
    acc += mul(s2_4, M4(5.105e-02, -3.845e-01, 1.812e-01, -1.927e-01, 2.840e-01, -2.094e-01, 5.673e-02, 4.405e-02, 5.957e-01, 1.734e-02, -1.158e-01, -6.956e-01, -2.077e-01, 5.130e-03, 4.744e-01, -1.540e-02));
    acc += mul(s2_5, M4(1.601e-01, -2.680e-01, -1.678e-01, -1.207e-01, -4.648e-02, -6.454e-02, 1.122e-01, -6.567e-02, 1.638e-01, -1.259e-01, -2.470e-02, -3.547e-01, -1.333e-01, -1.219e-02, -7.710e-02, -3.881e-01));
    acc += mul(s2_6, M4(-6.060e-02, 1.662e-01, -2.082e-01, 3.193e-01, -1.317e-01, 1.395e-04, 2.436e-01, -1.480e-01, 6.104e-03, -2.009e-01, -6.729e-02, -2.207e-01, -7.784e-02, -7.589e-02, 7.569e-02, 3.261e-03));
    acc += mul(s2_7, M4(-2.951e-01, -2.050e-01, 2.827e-02, 3.739e-01, 1.947e-01, 5.411e-01, -2.262e-01, -8.808e-03, 2.262e-01, -9.010e-02, -1.476e-01, -3.582e-01, -1.718e-01, 2.844e-02, 7.832e-02, 1.414e-03));
    acc += mul(s2_8, M4(3.534e-01, 1.695e-01, -1.247e-01, 4.750e-01, 4.171e-02, 2.338e-02, -4.525e-02, -4.955e-02, 2.934e-01, -3.865e-02, -1.125e-01, -2.127e-01, 1.326e-01, 5.967e-02, 6.215e-02, 1.048e-01));

    // Group s3: negative part of the t3 taps.
    acc += mul(s3_0, M4(4.186e-02, -5.378e-02, 7.641e-02, -3.524e-02, -2.447e-01, -5.374e-02, -1.380e-01, -4.221e-01, -3.797e-02, -7.623e-03, -4.826e-02, 1.791e-02, -1.390e-01, 1.115e-01, 2.252e-01, -9.103e-03));
    acc += mul(s3_1, M4(1.339e-01, 3.093e-01, -3.615e-02, 8.684e-02, -4.098e-01, -1.216e-01, 2.372e-01, -1.247e-01, -5.358e-02, -1.660e-01, -8.435e-02, 3.871e-02, 2.722e-01, -1.145e-01, -3.944e-01, -5.003e-02));
    acc += mul(s3_2, M4(-4.430e-02, -3.135e-02, 1.019e-01, -1.129e-01, -2.647e-01, -1.317e-01, 8.715e-02, -5.466e-02, -3.946e-02, 7.216e-02, 1.677e-01, 9.349e-02, 8.069e-02, -1.097e-01, -9.659e-03, -8.460e-02));
    acc += mul(s3_3, M4(-5.036e-03, 4.992e-02, 1.086e-01, -1.339e-02, 2.792e-01, 3.294e-01, -1.578e-01, 4.592e-01, -7.749e-02, 4.384e-02, -4.212e-02, 2.287e-02, 1.456e-01, 4.774e-02, -1.264e-01, 7.437e-02));
    acc += mul(s3_4, M4(3.022e-01, -2.197e-01, -4.347e-02, -2.198e-01, 3.922e-02, 8.609e-02, 8.862e-02, 3.418e-01, 8.117e-02, -2.026e-02, -3.236e-01, -2.539e-01, -6.030e-02, -2.409e-01, 7.879e-02, -8.457e-02));
    acc += mul(s3_5, M4(3.525e-01, 2.622e-01, -4.994e-02, -1.932e-01, -1.508e-01, 1.229e-01, 1.359e-01, 1.613e-01, 1.830e-01, -4.473e-02, -5.438e-02, -1.041e-01, 4.534e-01, -4.660e-01, -7.405e-02, -1.001e-01));
    acc += mul(s3_6, M4(-1.224e-02, -5.840e-03, 8.031e-02, -2.279e-02, -2.128e-01, 1.477e-01, -9.937e-03, 4.142e-02, -3.726e-02, -1.013e-01, -2.940e-03, -1.333e-01, 1.353e-01, -2.192e-01, -3.858e-01, -1.100e-01));
    acc += mul(s3_7, M4(-8.882e-02, 1.341e-01, 2.707e-01, 2.212e-01, 2.628e-01, 3.454e-01, -3.703e-01, 4.902e-01, 1.527e-01, 8.567e-03, -1.742e-01, -1.884e-01, -7.710e-01, 1.028e-01, 3.233e-01, -3.897e-01));
    acc += mul(s3_8, M4(3.715e-02, 2.936e-01, -1.195e-01, -1.295e-01, -1.313e-01, -1.222e-01, -2.876e-01, 5.694e-02, 6.813e-02, -1.738e-02, -1.154e-01, 1.649e-02, 1.755e-01, -1.639e-01, 3.212e-02, 3.504e-01));

    // Constant term, added last.
    acc += V4(-8.611e-03, -6.529e-03, -1.098e-03, 4.669e-03);
    return acc;
}
|
||||
|
||||
// Pass3: computes one texel of this pass's two outputs.
// Reads a 3x3 neighbourhood from each of the two inputs through the l0/l1
// macros (defined above this excerpt for this pass), splits every tap into a
// non-negative part and a non-positive part, and stores f0(...) to t0[gxy]
// and f1(...) to t1[gxy].
//   blockStart - added to Rmp8x8(tid.x) to form the texel coordinate gxy.
//   tid        - only tid.x is read here.
void Pass3(uint2 blockStart, uint3 tid) {
|
||||
// NOTE(review): pt and pos are never referenced by name below; presumably the
// O macro behind l0/l1 picks them up textually - confirm before renaming or removing.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside the GetInputSize() extent do no work and write nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps of the first input: offsets (-1..1, -1..1), row by row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// -max(-v, 0) equals min(v, 0): the non-positive part of each tap.
// This must run before s0_* is clamped in place below.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* now keeps only the non-negative part of each tap.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// Same treatment for the second input: s2_* = non-negative part, s3_* = non-positive part.
V4 s2_0 = l1(-1.0, -1.0);
|
||||
V4 s2_1 = l1(0.0, -1.0);
|
||||
V4 s2_2 = l1(1.0, -1.0);
|
||||
V4 s2_3 = l1(-1.0, 0.0);
|
||||
V4 s2_4 = l1(0.0, 0.0);
|
||||
V4 s2_5 = l1(1.0, 0.0);
|
||||
V4 s2_6 = l1(-1.0, 1.0);
|
||||
V4 s2_7 = l1(0.0, 1.0);
|
||||
V4 s2_8 = l1(1.0, 1.0);
|
||||
V4 s3_0 = -max(-s2_0, 0.0);
|
||||
V4 s3_1 = -max(-s2_1, 0.0);
|
||||
V4 s3_2 = -max(-s2_2, 0.0);
|
||||
V4 s3_3 = -max(-s2_3, 0.0);
|
||||
V4 s3_4 = -max(-s2_4, 0.0);
|
||||
V4 s3_5 = -max(-s2_5, 0.0);
|
||||
V4 s3_6 = -max(-s2_6, 0.0);
|
||||
V4 s3_7 = -max(-s2_7, 0.0);
|
||||
V4 s3_8 = -max(-s2_8, 0.0);
|
||||
s2_0 = max(s2_0, 0.0);
|
||||
s2_1 = max(s2_1, 0.0);
|
||||
s2_2 = max(s2_2, 0.0);
|
||||
s2_3 = max(s2_3, 0.0);
|
||||
s2_4 = max(s2_4, 0.0);
|
||||
s2_5 = max(s2_5, 0.0);
|
||||
s2_6 = max(s2_6, 0.0);
|
||||
s2_7 = max(s2_7, 0.0);
|
||||
s2_8 = max(s2_8, 0.0);
|
||||
|
||||
// Both outputs receive the same 36 input vectors; f0 and f1 differ only in their constants.
t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0, t1
|
||||
//!OUT t2, t3
|
||||
|
||||
// Tap accessors for this pass: l0 reads from t0 and l1 from t1 at the offset (x, y),
// going through the O macro and converting the result to V4.
// NOTE(review): O is defined outside this excerpt; presumably it also uses the
// caller's local variables (e.g. pos/pt) - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// f0 (pass 4): returns the sum of mul(s, M4(...)) over all 36 input vectors
// (s0_*, s1_*, s2_*, s3_*, nine taps each) plus a constant V4 offset.
// Pass4 stores the result to t2[gxy].
// The M4/V4 literals are fixed constants, presumably trained network weights
// and a bias - not verifiable from this file alone. Keep the accumulation order:
// reordering floating-point additions can change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(7.356e-02, 8.402e-03, 1.287e-01, 6.762e-02, 2.134e-01, -6.620e-02, -2.788e-01, -5.744e-02, -3.896e-02, -3.993e-02, -7.161e-02, -1.982e-01, -6.734e-02, 8.804e-03, -4.739e-02, 6.502e-02));
|
||||
r += mul(s0_1, M4(2.249e-01, 4.958e-02, 1.138e-01, 3.152e-01, 2.008e-01, 1.703e-01, 5.817e-02, -9.482e-02, -2.371e-01, 3.975e-02, -1.755e-01, -2.666e-01, 2.819e-01, -2.640e-02, 1.405e-01, -6.009e-02));
|
||||
r += mul(s0_2, M4(2.065e-01, -3.027e-02, -3.447e-02, 3.226e-03, -1.252e-02, -7.589e-03, 2.344e-03, -1.704e-02, -8.894e-02, 3.136e-02, -1.517e-01, -2.176e-02, 8.920e-02, -5.322e-02, -9.529e-02, 8.355e-02));
|
||||
r += mul(s0_3, M4(1.136e-01, 1.015e-01, -2.730e-02, -2.144e-01, -9.526e-02, -2.857e-01, 2.711e-01, -1.991e-01, 2.596e-01, 1.602e-01, -2.169e-01, -1.097e-01, 3.353e-02, 6.231e-02, 8.753e-03, 3.707e-01));
|
||||
r += mul(s0_4, M4(-1.945e-01, 3.081e-01, -2.270e-01, -5.963e-02, -1.666e-01, -3.408e-01, 1.161e-01, -6.384e-02, -6.823e-01, -4.014e-01, -6.276e-01, -1.672e-01, 2.986e-03, -1.351e-01, 1.668e-01, -3.133e-01));
|
||||
r += mul(s0_5, M4(4.802e-02, -4.275e-02, 1.978e-03, -7.602e-02, -4.082e-03, 5.572e-02, -3.341e-02, 9.101e-03, -1.038e-01, 1.622e-01, 2.334e-02, 1.768e-01, 9.416e-03, -2.287e-01, 1.048e-01, -2.926e-01));
|
||||
r += mul(s0_6, M4(5.333e-04, 3.089e-02, 2.721e-02, -3.601e-02, -5.081e-02, -1.152e-01, 6.752e-02, 1.701e-01, -2.951e-02, 2.450e-01, -1.684e-01, -4.702e-02, -1.580e-02, 1.200e-02, -1.266e-02, 4.937e-02));
|
||||
r += mul(s0_7, M4(-1.351e-02, -6.248e-02, -3.060e-03, 4.140e-02, -2.090e-01, -6.831e-01, -8.857e-02, 2.536e-01, -2.333e-02, 1.521e-01, -8.033e-02, 2.124e-01, -6.615e-02, 1.317e-01, 1.847e-01, -2.150e-01));
|
||||
r += mul(s0_8, M4(4.605e-02, 1.013e-01, 6.834e-03, -6.411e-02, -1.476e-02, -2.845e-01, -4.312e-02, -1.171e-02, 6.985e-02, -6.859e-02, -2.785e-02, -3.226e-02, 5.186e-02, 1.102e-01, -2.071e-02, -1.250e-01));
|
||||
r += mul(s1_0, M4(1.952e-01, -3.342e-02, -3.770e-02, -2.026e-01, 4.850e-02, -3.174e-02, -1.987e-01, -2.886e-02, -1.298e-01, 1.994e-02, 1.131e-01, 2.950e-02, -1.791e-02, -4.533e-02, 4.695e-02, -6.907e-02));
|
||||
r += mul(s1_1, M4(2.401e-01, 1.809e-01, -5.151e-02, -6.271e-02, -1.409e-01, 9.215e-03, 1.176e-01, 2.717e-02, 1.130e-01, -3.228e-02, -9.086e-02, -1.202e-03, 1.642e-03, -7.943e-03, 1.097e-01, 1.842e-01));
|
||||
r += mul(s1_2, M4(8.774e-02, -1.486e-02, -4.808e-02, 4.089e-02, 6.244e-02, -7.645e-02, 5.614e-02, -5.706e-02, -2.386e-02, 4.407e-02, -1.378e-01, -5.880e-02, 2.936e-02, 2.285e-02, -3.924e-02, 5.724e-02));
|
||||
r += mul(s1_3, M4(2.603e-01, -1.455e-01, 1.429e-01, -2.992e-02, -6.288e-02, -5.216e-02, -1.802e-01, 1.060e-01, -2.473e-02, -6.795e-03, 2.843e-02, 7.745e-02, -4.868e-03, -9.998e-02, -7.961e-02, 5.068e-02));
|
||||
r += mul(s1_4, M4(2.018e-01, -1.293e-01, -5.291e-02, -4.763e-02, 3.484e-02, -1.648e-01, 8.786e-02, -6.101e-02, -1.083e-01, 5.522e-02, -1.814e-01, -2.392e-01, 6.427e-02, -1.908e-02, 2.643e-01, 1.294e-01));
|
||||
r += mul(s1_5, M4(-7.897e-02, -5.967e-02, -2.620e-01, 1.274e-02, -2.583e-02, 5.654e-02, -7.639e-02, -7.534e-03, -5.812e-02, -7.887e-02, -3.738e-03, 7.664e-02, 1.753e-02, -2.842e-01, -3.237e-01, 2.077e-02));
|
||||
r += mul(s1_6, M4(6.558e-02, -9.890e-02, 1.849e-02, 3.242e-04, 1.021e-02, 1.234e-01, 1.224e-02, -4.322e-02, -2.778e-02, 3.860e-02, -5.257e-02, -1.466e-02, -1.001e-02, -1.291e-03, 1.724e-01, -9.167e-02));
|
||||
r += mul(s1_7, M4(-5.291e-02, -2.764e-01, -6.402e-02, 4.327e-02, 1.921e-02, -1.484e-01, 3.286e-02, 4.051e-02, 1.636e-02, 3.932e-01, -5.432e-02, 4.540e-02, 3.947e-02, -1.385e-01, -1.065e-01, 1.569e-01));
|
||||
r += mul(s1_8, M4(-1.729e-02, 8.177e-02, -4.479e-02, -1.275e-01, -3.302e-03, -1.265e-01, -2.922e-02, 3.720e-02, 1.560e-02, 5.266e-02, -1.572e-02, -4.840e-02, 3.991e-03, 1.003e-01, -1.423e-01, 7.414e-02));
|
||||
r += mul(s2_0, M4(-1.207e-02, -2.418e-02, -7.769e-03, -1.401e-01, 1.660e-01, -6.347e-03, -1.092e-02, -1.830e-02, -1.252e-01, -5.217e-02, 9.898e-03, 1.461e-02, 2.654e-02, 1.219e-02, -3.769e-02, 1.897e-02));
|
||||
r += mul(s2_1, M4(-3.650e-02, 1.317e-01, 1.299e-02, -5.512e-02, -1.287e-01, 2.438e-02, -1.609e-03, 1.759e-01, 1.824e-02, 6.477e-03, 2.905e-02, -8.644e-02, 7.496e-02, -9.920e-02, 1.147e-02, 1.889e-01));
|
||||
r += mul(s2_2, M4(-7.005e-03, -4.482e-02, -1.853e-02, 3.441e-02, 1.251e-01, -3.162e-02, -1.701e-01, -5.231e-02, -1.647e-01, 2.261e-02, 8.255e-02, -3.730e-02, 1.811e-01, -9.052e-02, 1.728e-02, 1.911e-02));
|
||||
r += mul(s2_3, M4(2.359e-02, -1.334e-01, 2.761e-02, -1.251e-01, 1.455e-01, 4.076e-02, -3.260e-02, -1.782e-01, -3.575e-02, 1.411e-02, 1.322e-01, -9.592e-02, 5.423e-02, 7.989e-03, -1.460e-01, 8.895e-02));
|
||||
r += mul(s2_4, M4(1.304e-01, 1.296e-01, -7.250e-02, -6.647e-02, 8.382e-02, 1.111e-01, 8.976e-02, -5.914e-02, -2.228e-01, -4.772e-02, -1.931e-03, 8.499e-02, 4.483e-01, 1.327e-01, 5.086e-02, -4.795e-01));
|
||||
r += mul(s2_5, M4(4.674e-02, 7.104e-02, -5.312e-02, -7.730e-02, 2.647e-03, 8.893e-03, -8.889e-02, -5.714e-02, -4.546e-02, -4.002e-02, -1.514e-01, -2.989e-02, -8.669e-02, -5.441e-03, 1.460e-02, -2.327e-02));
|
||||
r += mul(s2_6, M4(1.146e-01, -1.154e-01, 8.289e-03, 7.655e-02, -2.194e-02, -3.908e-02, -2.191e-02, 2.363e-03, 4.527e-02, -7.852e-02, -4.728e-02, 1.066e-01, 4.023e-02, -5.192e-02, -4.180e-02, -3.879e-02));
|
||||
r += mul(s2_7, M4(2.446e-01, -2.295e-01, -5.819e-02, -2.646e-02, 8.106e-02, -8.799e-02, -3.455e-02, 6.900e-02, 5.579e-02, -1.551e-01, 1.609e-01, 9.954e-02, -1.499e-01, 8.628e-02, 1.114e-01, 1.313e-02));
|
||||
r += mul(s2_8, M4(1.028e-02, 9.150e-02, -6.161e-02, 5.124e-03, 3.822e-02, 1.533e-02, 2.329e-02, -1.106e-01, -1.541e-03, -1.818e-01, -9.577e-02, -3.402e-02, 1.784e-02, -1.152e-01, 6.896e-02, -1.111e-01));
|
||||
r += mul(s3_0, M4(-7.349e-02, -4.782e-02, 3.080e-02, -1.668e-01, 9.572e-02, 5.307e-02, 5.573e-03, 6.483e-02, 1.104e-01, -5.707e-02, -8.579e-02, -1.754e-02, 1.038e-01, 1.706e-02, -1.185e-01, 5.863e-02));
|
||||
r += mul(s3_1, M4(-1.639e-01, -6.808e-03, 1.836e-02, -1.482e-01, 1.032e-01, 2.612e-02, -1.751e-01, -1.527e-01, 3.169e-03, 5.272e-02, 7.983e-02, 5.066e-02, 1.191e-01, 3.658e-02, 3.275e-02, -1.122e-01));
|
||||
r += mul(s3_2, M4(-8.279e-02, -1.068e-02, 3.848e-02, -8.857e-03, -3.783e-02, 9.934e-02, -7.181e-02, 2.801e-02, -1.524e-01, -7.166e-02, 1.038e-01, -9.840e-04, -7.254e-03, -3.252e-02, -1.435e-02, 6.052e-03));
|
||||
r += mul(s3_3, M4(-3.534e-02, -2.891e-02, 3.778e-01, -2.472e-01, -4.015e-02, -5.651e-02, 2.006e-01, 1.249e-02, -8.408e-02, -1.160e-02, 2.881e-01, -6.805e-03, 1.340e-02, -1.237e-01, -1.617e-01, 1.894e-02));
|
||||
r += mul(s3_4, M4(-1.512e-02, 3.232e-01, -1.441e-01, -3.778e-01, -1.475e-01, -2.644e-03, -3.149e-01, 3.225e-02, 1.227e-01, -3.620e-02, -1.175e-01, -3.857e-01, 4.834e-02, -1.567e-01, 1.632e-01, -1.292e-01));
|
||||
r += mul(s3_5, M4(-1.592e-01, 3.426e-02, -1.506e-01, 1.215e-01, 1.314e-01, -7.432e-02, -8.767e-02, 1.685e-01, 6.875e-02, 2.804e-01, -3.279e-02, -1.870e-01, 1.049e-01, -9.061e-02, 8.573e-02, -9.407e-02));
|
||||
r += mul(s3_6, M4(5.310e-02, -1.089e-01, -1.496e-01, 2.134e-01, 5.599e-02, -1.565e-01, -6.842e-02, -1.362e-02, 6.861e-02, -2.548e-02, -1.614e-01, -3.698e-02, -2.731e-02, 1.138e-02, 1.288e-02, -1.789e-02));
|
||||
r += mul(s3_7, M4(-7.967e-02, -2.461e-01, -2.139e-01, 3.193e-01, 1.377e-01, -1.213e-01, 8.415e-02, 1.224e-02, 1.192e-01, 1.785e-01, 1.978e-01, 1.008e-01, 3.016e-02, 9.868e-02, 3.118e-03, -3.294e-02));
|
||||
r += mul(s3_8, M4(1.121e-01, -4.625e-02, 3.331e-02, -7.687e-02, 5.520e-02, 6.326e-02, 1.369e-02, 1.850e-02, 4.062e-02, -1.561e-01, -8.640e-02, 1.105e-01, 8.446e-03, -1.746e-03, 4.572e-02, -1.015e-01));
|
||||
// Constant offset added once after all taps have been accumulated.
r += V4(-1.057e-02, -1.114e-02, 1.597e-04, 1.132e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// f1 (pass 4): same structure as this pass's f0 with a different set of constants.
// Returns the sum of mul(s, M4(...)) over all 36 input vectors plus a constant
// V4 offset; Pass4 stores the result to t3[gxy].
// The M4/V4 literals are fixed constants, presumably trained network weights
// and a bias - not verifiable from this file alone. Keep the accumulation order:
// reordering floating-point additions can change the rounded result.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.399e-01, 1.190e-01, 9.941e-02, -5.908e-03, 2.176e-01, -3.861e-02, -4.997e-02, -3.036e-02, -6.079e-02, 2.294e-02, -1.260e-01, 6.001e-02, -7.690e-02, -4.805e-02, 6.117e-03, 4.358e-02));
|
||||
r += mul(s0_1, M4(-4.669e-02, -1.150e-01, 9.700e-03, 2.351e-02, 3.215e-01, -1.737e-03, 2.091e-01, -1.245e-01, -8.592e-02, 1.866e-01, 2.826e-01, -6.728e-01, 1.528e-01, 5.511e-02, -4.930e-02, -1.959e-02));
|
||||
r += mul(s0_2, M4(-2.182e-02, -4.512e-02, 6.864e-02, 8.299e-02, 8.483e-03, -4.855e-02, -1.500e-01, 1.325e-02, 6.098e-02, -1.867e-02, 1.276e-02, 1.721e-02, 5.918e-03, -1.130e-01, 7.066e-04, -1.824e-03));
|
||||
r += mul(s0_3, M4(4.594e-02, 1.518e-01, -2.067e-01, 1.546e-02, -1.548e-02, 1.126e-01, -4.502e-03, -2.014e-02, 2.417e-01, -1.530e-01, -1.095e-01, -4.966e-02, 2.291e-01, -4.598e-03, 2.836e-01, 5.562e-02));
|
||||
r += mul(s0_4, M4(5.432e-02, -3.003e-01, 7.389e-01, -1.497e-01, -2.439e-01, -3.298e-01, 4.081e-01, -2.105e-01, -4.267e-01, 3.913e-01, 5.470e-01, 5.594e-01, -1.221e-01, -5.444e-02, -4.180e-01, 1.515e-01));
|
||||
r += mul(s0_5, M4(2.205e-01, -5.813e-03, 7.451e-03, 8.130e-02, -3.312e-02, -9.387e-02, -9.824e-02, 4.493e-02, 8.187e-02, -2.042e-01, 1.644e-01, 1.562e-01, -8.427e-02, 2.057e-01, -1.668e-02, -2.356e-01));
|
||||
r += mul(s0_6, M4(1.150e-02, 1.442e-02, -1.973e-02, -4.599e-02, -9.680e-02, 3.962e-02, 1.731e-02, -2.402e-02, 3.936e-02, 6.512e-03, 2.103e-02, 2.025e-03, -1.308e-02, -5.259e-02, 5.631e-02, 3.037e-02));
|
||||
r += mul(s0_7, M4(-1.306e-02, -3.164e-02, 1.196e-01, 2.798e-02, -2.533e-01, -1.204e-01, 1.860e-01, 1.564e-01, -4.731e-02, -7.323e-02, 1.441e-03, -9.049e-02, -3.371e-02, -2.801e-04, 2.952e-02, -2.632e-02));
|
||||
r += mul(s0_8, M4(3.024e-02, -1.034e-02, -7.595e-02, -7.550e-02, 3.562e-02, -4.589e-02, -3.066e-02, 7.995e-02, -1.866e-02, 1.022e-01, -2.624e-02, -1.074e-01, 2.176e-02, 1.434e-01, -5.664e-02, -3.473e-02));
|
||||
r += mul(s1_0, M4(2.252e-01, 9.801e-02, -5.786e-02, -6.661e-02, 7.599e-02, -9.244e-02, 4.437e-02, -1.203e-01, -1.577e-01, -3.797e-02, -1.335e-02, 4.540e-02, -3.540e-03, -9.094e-03, -4.076e-02, -8.099e-02));
|
||||
r += mul(s1_1, M4(2.557e-01, -2.549e-01, 2.306e-01, -4.389e-02, -3.677e-02, 5.796e-02, 4.505e-02, -1.209e-01, -4.484e-02, 1.229e-01, -5.686e-02, 2.778e-02, 9.876e-02, -6.893e-04, 9.771e-02, 1.264e-01));
|
||||
r += mul(s1_2, M4(-5.324e-02, -9.632e-02, -1.092e-02, -1.426e-02, 3.082e-02, 9.196e-02, -1.381e-01, -1.013e-01, 7.758e-03, -3.290e-02, 1.630e-02, -4.979e-03, -7.297e-02, -7.534e-02, 2.040e-02, -1.983e-01));
|
||||
r += mul(s1_3, M4(1.951e-01, 3.566e-02, 4.220e-02, 8.086e-02, -5.114e-02, -5.626e-02, -6.912e-02, 1.462e-01, 2.268e-03, -2.592e-02, 3.527e-02, -3.832e-02, 4.756e-02, 1.234e-01, -5.494e-03, 4.695e-02));
|
||||
r += mul(s1_4, M4(4.147e-01, -2.431e-01, 2.372e-01, 2.574e-04, -4.485e-02, 5.014e-02, 3.928e-02, -2.817e-02, 3.512e-01, 2.983e-01, -1.260e-01, 4.326e-01, -2.366e-01, -6.912e-02, 2.259e-01, -4.534e-01));
|
||||
r += mul(s1_5, M4(1.323e-01, 5.260e-03, 2.693e-02, 1.841e-01, -1.105e-01, 6.002e-02, -1.233e-01, 1.012e-02, -9.410e-02, -1.260e-01, 1.264e-02, -3.910e-02, 3.656e-01, -1.103e-01, 5.059e-01, 4.280e-01));
|
||||
r += mul(s1_6, M4(-7.537e-02, -2.153e-02, -4.511e-02, -5.184e-02, -1.745e-02, -1.165e-02, 1.352e-02, -1.951e-02, -4.888e-02, 2.249e-02, -3.915e-02, -4.557e-03, -9.946e-03, -1.633e-04, -3.200e-02, -1.356e-02));
|
||||
r += mul(s1_7, M4(-1.509e-01, -2.227e-02, 1.640e-01, 2.693e-02, 4.846e-02, 3.303e-02, -5.390e-02, 3.607e-02, -2.818e-02, -7.170e-02, 3.311e-02, -9.203e-02, -1.946e-03, -8.577e-02, -2.925e-02, 1.238e-01));
|
||||
r += mul(s1_8, M4(-3.295e-02, 1.995e-02, -1.689e-01, -4.353e-02, -4.138e-02, -7.439e-03, -2.343e-02, 6.997e-02, 8.031e-02, 1.117e-01, 4.894e-02, -6.214e-02, -1.960e-01, -1.630e-01, 8.586e-02, -8.213e-02));
|
||||
r += mul(s2_0, M4(-9.883e-02, -1.168e-02, -1.110e-01, -2.148e-01, 1.452e-01, 3.417e-03, -4.513e-02, 8.845e-02, -7.791e-02, 2.326e-02, -4.188e-02, -3.659e-02, 3.105e-02, -1.318e-02, -4.552e-03, 7.109e-02));
|
||||
r += mul(s2_1, M4(1.958e-02, -6.995e-02, 2.588e-01, -6.431e-02, -2.211e-01, 5.281e-02, 5.399e-02, 8.884e-02, -5.135e-02, -4.768e-02, 1.363e-01, -2.064e-01, -1.391e-01, 1.106e-01, -2.611e-01, 2.038e-01));
|
||||
r += mul(s2_2, M4(-6.883e-02, -1.360e-03, -1.628e-01, 7.301e-02, 1.213e-01, -5.159e-03, 1.194e-01, -1.148e-02, -1.285e-01, -1.448e-01, 1.776e-02, -1.414e-01, -3.022e-02, 1.382e-01, 6.695e-02, -4.201e-02));
|
||||
r += mul(s2_3, M4(-1.194e-01, 1.524e-03, -1.945e-01, -1.496e-01, 1.413e-03, -8.697e-04, -1.542e-01, -1.798e-03, -4.991e-02, -7.944e-03, -1.094e-01, -5.578e-02, 1.526e-01, -6.170e-02, 1.598e-01, 1.306e-01));
|
||||
r += mul(s2_4, M4(3.583e-02, -1.213e-01, 2.087e-01, -4.616e-02, 2.125e-01, -1.242e-01, 2.776e-01, -8.100e-02, -1.733e-01, 1.016e-01, 2.949e-01, 1.489e-01, 5.059e-01, 3.526e-01, -4.764e-01, -1.105e-02));
|
||||
r += mul(s2_5, M4(7.240e-02, 1.034e-01, -1.103e-01, 2.351e-02, -2.711e-02, 1.506e-02, -1.534e-01, 1.093e-01, 5.065e-02, -2.686e-01, 1.423e-01, -4.993e-02, 7.167e-02, 1.084e-01, -8.139e-03, 4.460e-02));
|
||||
r += mul(s2_6, M4(1.243e-01, 1.281e-02, 7.048e-02, 1.117e-01, -1.145e-01, -1.703e-02, -1.470e-02, -3.647e-02, 3.796e-03, 2.441e-02, -8.422e-02, 1.955e-02, -2.861e-02, -6.963e-02, 6.894e-02, -4.071e-02));
|
||||
r += mul(s2_7, M4(2.315e-01, 7.446e-02, -7.632e-02, 1.319e-01, -2.392e-02, 2.525e-02, 4.687e-02, 7.645e-02, 4.250e-02, -4.733e-02, 2.179e-01, -3.843e-02, -3.526e-01, 9.675e-02, -1.837e-01, -1.563e-01));
|
||||
r += mul(s2_8, M4(5.933e-02, 1.490e-01, -5.844e-02, 9.363e-02, 7.616e-04, -1.075e-02, -1.365e-01, -6.094e-02, 7.094e-03, -1.218e-01, 7.021e-02, 3.101e-02, -4.184e-02, 3.989e-02, -7.167e-02, -1.179e-01));
|
||||
r += mul(s3_0, M4(-7.835e-02, 6.392e-02, -5.802e-02, -1.483e-01, 1.374e-01, 3.699e-02, 2.043e-03, 1.554e-01, -6.873e-02, -1.174e-02, -1.518e-01, -1.405e-02, 4.783e-03, -1.131e-01, 4.121e-02, -8.849e-02));
|
||||
r += mul(s3_1, M4(-1.463e-01, 5.240e-02, -1.651e-02, -2.410e-01, 1.092e-01, -3.146e-02, -1.629e-02, -2.974e-02, -7.838e-02, -7.374e-03, 2.745e-01, -1.408e-01, 1.335e-01, 8.634e-02, 1.073e-02, -1.407e-02));
|
||||
r += mul(s3_2, M4(-7.340e-02, 2.321e-02, 1.922e-02, -1.112e-01, 2.932e-02, -2.587e-02, 1.333e-01, 4.721e-02, -1.514e-01, -3.395e-02, -1.264e-01, 1.777e-02, -8.692e-02, 1.186e-02, -7.424e-02, -2.402e-02));
|
||||
r += mul(s3_3, M4(5.052e-02, 2.790e-03, 3.121e-02, -1.839e-01, 3.910e-02, 2.279e-02, 6.041e-02, -8.205e-03, -5.819e-02, 5.701e-04, 5.763e-02, -1.835e-02, -7.273e-02, -1.017e-01, -4.708e-02, 3.331e-02));
|
||||
r += mul(s3_4, M4(-4.521e-02, -3.700e-02, -1.199e-01, -3.863e-01, -4.641e-01, -2.451e-01, 1.512e-03, -3.424e-01, -1.194e-01, 1.119e-01, -1.183e-01, 1.918e-01, 8.865e-02, 1.866e-01, -4.503e-02, 2.355e-03));
|
||||
r += mul(s3_5, M4(5.461e-02, -1.461e-01, 2.827e-01, 2.041e-01, -8.786e-03, 1.079e-02, 1.593e-01, 2.173e-01, 4.916e-01, -1.773e-01, 2.149e-02, -1.461e-01, -5.435e-02, 1.909e-01, -2.171e-01, -7.547e-02));
|
||||
r += mul(s3_6, M4(2.543e-02, 5.455e-02, -6.107e-02, 5.194e-03, 9.984e-02, 8.664e-02, -7.757e-04, 3.957e-02, 1.432e-01, 3.805e-02, -1.005e-03, 7.600e-02, -4.304e-02, -6.326e-02, 3.996e-02, 3.872e-03));
|
||||
r += mul(s3_7, M4(-1.234e-01, -1.276e-01, 1.312e-01, 8.454e-02, 1.539e-01, 6.822e-02, -1.455e-02, 1.223e-01, -1.060e-01, -3.708e-02, -1.480e-01, -7.922e-02, 6.503e-02, 1.105e-01, -1.249e-01, -3.210e-02));
|
||||
r += mul(s3_8, M4(1.676e-01, -7.072e-03, 4.581e-02, -1.006e-01, -1.056e-02, -8.209e-02, -4.804e-02, 2.427e-02, -1.165e-01, -8.224e-02, 2.940e-01, -9.220e-03, 5.420e-02, 1.802e-01, -1.190e-01, 1.433e-02));
|
||||
// Constant offset added once after all taps have been accumulated.
r += V4(-4.712e-03, -1.187e-02, 1.287e-02, -6.625e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass4: computes one texel of this pass's two outputs.
// Reads a 3x3 neighbourhood from t0 (via l0) and from t1 (via l1), splits
// every tap into a non-negative part and a non-positive part, and stores
// f0(...) to t2[gxy] and f1(...) to t3[gxy].
//   blockStart - added to Rmp8x8(tid.x) to form the texel coordinate gxy.
//   tid        - only tid.x is read here.
void Pass4(uint2 blockStart, uint3 tid) {
|
||||
// NOTE(review): pt and pos are never referenced by name below; presumably the
// O macro behind l0/l1 picks them up textually - confirm before renaming or removing.
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Coordinates outside the GetInputSize() extent do no work and write nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine taps of t0: offsets (-1..1, -1..1), row by row.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// -max(-v, 0) equals min(v, 0): the non-positive part of each tap.
// This must run before s0_* is clamped in place below.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// s0_* now keeps only the non-negative part of each tap.
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// Same treatment for t1: s2_* = non-negative part, s3_* = non-positive part.
V4 s2_0 = l1(-1.0, -1.0);
|
||||
V4 s2_1 = l1(0.0, -1.0);
|
||||
V4 s2_2 = l1(1.0, -1.0);
|
||||
V4 s2_3 = l1(-1.0, 0.0);
|
||||
V4 s2_4 = l1(0.0, 0.0);
|
||||
V4 s2_5 = l1(1.0, 0.0);
|
||||
V4 s2_6 = l1(-1.0, 1.0);
|
||||
V4 s2_7 = l1(0.0, 1.0);
|
||||
V4 s2_8 = l1(1.0, 1.0);
|
||||
V4 s3_0 = -max(-s2_0, 0.0);
|
||||
V4 s3_1 = -max(-s2_1, 0.0);
|
||||
V4 s3_2 = -max(-s2_2, 0.0);
|
||||
V4 s3_3 = -max(-s2_3, 0.0);
|
||||
V4 s3_4 = -max(-s2_4, 0.0);
|
||||
V4 s3_5 = -max(-s2_5, 0.0);
|
||||
V4 s3_6 = -max(-s2_6, 0.0);
|
||||
V4 s3_7 = -max(-s2_7, 0.0);
|
||||
V4 s3_8 = -max(-s2_8, 0.0);
|
||||
s2_0 = max(s2_0, 0.0);
|
||||
s2_1 = max(s2_1, 0.0);
|
||||
s2_2 = max(s2_2, 0.0);
|
||||
s2_3 = max(s2_3, 0.0);
|
||||
s2_4 = max(s2_4, 0.0);
|
||||
s2_5 = max(s2_5, 0.0);
|
||||
s2_6 = max(s2_6, 0.0);
|
||||
s2_7 = max(s2_7, 0.0);
|
||||
s2_8 = max(s2_8, 0.0);
|
||||
|
||||
// Both outputs receive the same 36 input vectors; f0 and f1 differ only in their constants.
t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC conv4
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t2, t3
|
||||
//!OUT t0, t1
|
||||
|
||||
// Tap accessors for this pass: l0 reads from t2 and l1 from t3 at the offset (x, y),
// going through the O macro and converting the result to V4.
// NOTE(review): O is defined outside this excerpt; presumably it also uses the
// caller's local variables (e.g. pos/pt) - confirm against its definition.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||
|
||||
// f0 (pass 5): returns the sum of mul(s, M4(...)) over all 36 input vectors
// (s0_*, s1_*, s2_*, s3_*, nine taps each) plus a constant V4 offset.
// The M4/V4 literals are fixed constants, presumably trained network weights
// and a bias - not verifiable from this file alone. Keep the accumulation order:
// reordering floating-point additions can change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(1.060e-02, 9.173e-03, 9.548e-04, -7.886e-02, -1.324e-02, 4.660e-02, -4.997e-02, -5.676e-02, -3.290e-02, 6.253e-02, -5.777e-02, 1.265e-02, 6.136e-03, 7.179e-02, 3.102e-02, 4.961e-02));
|
||||
r += mul(s0_1, M4(5.787e-03, -2.090e-03, -1.489e-01, 4.380e-02, 1.259e-01, 5.508e-01, 1.211e-01, 3.385e-01, 2.399e-02, -1.436e-01, 2.987e-03, -2.839e-02, -3.021e-02, -8.641e-03, 1.716e-01, -1.328e-02));
|
||||
r += mul(s0_2, M4(-3.284e-02, -5.196e-02, -2.983e-02, -2.858e-02, 1.729e-02, 7.665e-02, 1.387e-01, 1.037e-01, 4.289e-02, 1.274e-01, 3.348e-02, 1.911e-02, -1.786e-02, -4.888e-02, 6.323e-02, -2.989e-02));
|
||||
r += mul(s0_3, M4(2.473e-02, -6.550e-02, -1.373e-01, 3.680e-02, 1.575e-01, -8.270e-02, 3.186e-02, -3.836e-02, 4.508e-02, 4.254e-02, 5.656e-03, -9.132e-02, 1.334e-01, -5.076e-02, -2.445e-02, -4.735e-02));
|
||||
r += mul(s0_4, M4(-5.346e-01, 1.950e-01, 2.121e-01, -3.694e-01, 5.004e-02, 1.610e-02, 2.249e-01, -5.962e-02, -6.243e-02, -3.270e-01, 1.851e-01, 4.051e-02, -2.310e-01, -2.300e-01, -1.314e-01, 3.374e-01));
|
||||
r += mul(s0_5, M4(-4.686e-02, -3.968e-01, 2.772e-02, 2.495e-02, 4.541e-02, 8.724e-02, 4.401e-02, -1.515e-02, -6.453e-02, -7.210e-02, -1.250e-02, 4.044e-02, 3.057e-02, 2.485e-01, 2.228e-02, 6.774e-02));
|
||||
r += mul(s0_6, M4(-1.518e-01, -6.862e-02, -8.148e-02, -2.030e-01, -4.453e-02, -2.133e-03, -6.081e-02, -8.941e-02, -5.417e-02, 1.564e-02, -5.425e-02, 5.875e-02, -8.805e-02, -1.910e-02, 2.099e-02, -1.402e-02));
|
||||
r += mul(s0_7, M4(-1.730e-02, -6.152e-02, -2.764e-01, -8.728e-02, 9.519e-03, -2.799e-02, -5.662e-02, 3.249e-02, 8.716e-02, 2.809e-02, -7.241e-02, 3.046e-02, 1.368e-01, 2.723e-02, 1.130e-01, -4.615e-02));
|
||||
r += mul(s0_8, M4(-5.021e-02, -3.352e-02, 5.072e-02, -1.434e-02, 6.511e-02, 6.519e-02, -8.987e-02, 2.193e-02, 1.583e-04, 2.714e-02, -2.315e-02, -3.077e-02, 7.792e-03, 2.782e-02, 9.282e-02, 5.011e-02));
|
||||
r += mul(s1_0, M4(-2.541e-02, -9.530e-03, -2.089e-01, -2.421e-02, 1.340e-02, 1.228e-01, 8.861e-02, -1.063e-02, -7.461e-02, 5.226e-02, -7.276e-02, 3.544e-02, -1.591e-02, 1.851e-02, 9.562e-03, 4.559e-02));
|
||||
r += mul(s1_1, M4(2.747e-02, -7.982e-02, -1.475e-01, 4.885e-02, -1.175e-02, -9.209e-02, -9.273e-02, -7.428e-02, 3.696e-02, -2.012e-01, 4.627e-02, 3.609e-02, 1.096e-01, -5.087e-02, 2.170e-01, 5.311e-02));
|
||||
r += mul(s1_2, M4(2.410e-02, 6.970e-02, 2.315e-02, 2.908e-02, 2.961e-05, 1.661e-02, 8.374e-02, 5.064e-02, 2.637e-02, 1.330e-01, 5.175e-02, -5.518e-02, -4.871e-03, 1.162e-01, 8.451e-02, 1.741e-02));
|
||||
r += mul(s1_3, M4(4.863e-02, -7.095e-02, 3.927e-03, -9.085e-02, 2.639e-02, -8.297e-02, -1.865e-01, -9.647e-02, 6.967e-02, 1.376e-02, 1.222e-01, -2.819e-01, 1.563e-01, -1.399e-02, -4.367e-02, -5.187e-02));
|
||||
r += mul(s1_4, M4(9.322e-02, 9.848e-02, 1.680e-01, -2.298e-01, -6.183e-02, -4.167e-02, -1.103e-02, -9.856e-03, -2.983e-03, -3.805e-01, -3.115e-01, -4.107e-01, -1.341e-01, -3.703e-01, -3.661e-01, -4.633e-01));
|
||||
r += mul(s1_5, M4(-2.785e-03, -2.188e-02, -2.790e-03, -4.276e-04, 7.082e-02, 1.004e-01, -3.532e-03, 1.740e-03, 6.693e-03, -5.230e-01, 2.119e-01, 2.878e-02, 3.915e-03, 1.842e-01, -1.630e-02, -3.874e-02));
|
||||
r += mul(s1_6, M4(2.313e-02, -6.545e-02, 1.631e-02, -1.278e-01, -4.216e-02, -4.147e-02, 6.827e-02, -1.725e-02, -5.254e-02, -3.942e-02, -2.400e-02, -8.124e-02, -3.250e-02, -1.806e-03, -3.947e-02, -7.056e-02));
|
||||
r += mul(s1_7, M4(8.445e-03, 1.147e-01, -7.772e-02, 1.091e-01, 1.842e-02, -6.040e-03, -7.053e-02, 1.824e-02, 2.212e-01, -8.777e-02, -1.003e-01, 6.533e-03, 2.090e-01, 4.588e-02, 9.886e-02, 6.176e-02));
|
||||
r += mul(s1_8, M4(4.046e-02, 1.872e-02, -5.723e-02, -4.997e-02, 5.232e-03, 1.795e-02, -2.747e-02, -1.507e-02, -1.704e-01, 7.849e-02, -1.475e-01, -4.255e-02, 7.807e-02, 4.185e-02, 3.849e-02, 3.137e-02));
|
||||
r += mul(s2_0, M4(-8.062e-03, 6.677e-02, 6.217e-02, 1.833e-01, -1.475e-01, 2.782e-01, 3.524e-02, -6.275e-02, 4.315e-02, 1.484e-02, 3.820e-02, -3.304e-02, 1.659e-03, -9.567e-03, -3.360e-02, -2.623e-02));
|
||||
r += mul(s2_1, M4(9.928e-02, -2.526e-01, -2.613e-02, 2.043e-01, 1.710e-02, -1.137e-01, 1.798e-01, -1.427e-01, 4.676e-03, 1.728e-01, 8.082e-02, -5.413e-02, -1.710e-02, -3.169e-02, -6.860e-02, 1.496e-02));
|
||||
r += mul(s2_2, M4(1.785e-02, 1.092e-01, -7.685e-02, 7.691e-02, 5.271e-03, -5.168e-02, 3.395e-02, 1.726e-02, 2.936e-02, -1.321e-02, 5.364e-02, -6.785e-03, 2.429e-02, -4.442e-02, -6.348e-02, 3.035e-02));
|
||||
r += mul(s2_3, M4(2.676e-01, 4.022e-03, -5.435e-02, -2.723e-01, -1.412e-01, -6.091e-01, 1.576e-02, 6.829e-02, -1.410e-01, 5.578e-03, 3.833e-03, 1.863e-01, -2.274e-02, 6.034e-03, 1.518e-01, -5.434e-02));
|
||||
r += mul(s2_4, M4(7.884e-02, 5.377e-01, -4.655e-02, -3.752e-01, 1.490e-01, -4.235e-02, -5.390e-02, 2.610e-01, 1.979e-01, -5.718e-02, 1.773e-02, 5.727e-02, 1.703e-02, 7.533e-01, -3.023e-02, 5.456e-02));
|
||||
r += mul(s2_5, M4(-4.898e-02, 4.237e-02, 6.311e-02, -4.635e-02, 3.660e-03, 2.139e-01, -3.722e-02, -6.738e-02, -3.009e-02, -6.140e-02, 2.777e-02, 3.917e-02, -1.421e-01, -4.041e-01, -1.524e-01, -9.837e-02));
|
||||
r += mul(s2_6, M4(6.071e-02, 1.084e-01, -6.370e-02, 1.323e-01, -7.251e-02, -1.079e-01, 1.208e-01, -4.495e-02, -2.115e-03, -4.107e-02, 2.465e-02, -1.230e-01, -6.064e-02, -4.263e-02, -1.388e-01, 6.519e-02));
|
||||
r += mul(s2_7, M4(9.042e-02, -8.032e-02, 1.186e-01, -1.537e-02, -6.566e-03, -3.216e-02, 3.412e-02, -3.207e-02, -1.586e-01, -2.988e-03, -2.358e-03, 2.172e-02, 6.775e-02, -3.590e-01, -4.123e-01, -3.506e-01));
|
||||
r += mul(s2_8, M4(8.486e-02, 4.731e-02, 5.779e-02, 1.000e-01, 9.121e-03, -3.421e-02, 4.891e-02, 4.916e-02, 3.343e-03, 4.437e-03, -2.002e-02, -3.856e-02, -1.319e-01, -4.022e-02, -1.752e-01, -9.250e-02));
|
||||
r += mul(s3_0, M4(-6.652e-03, 6.416e-02, 9.292e-03, 6.520e-02, 1.213e-02, 4.177e-02, 7.038e-02, -3.160e-02, 2.146e-02, -9.523e-02, -1.436e-01, -8.325e-02, -1.234e-02, -1.222e-02, -3.877e-02, -4.175e-02));
|
||||
r += mul(s3_1, M4(-5.171e-02, 1.011e-01, 7.998e-02, -8.804e-02, 1.067e-02, 1.516e-01, 6.508e-02, -7.724e-02, 2.717e-02, -4.901e-02, -6.059e-03, 4.013e-02, -3.833e-02, 1.538e-01, 5.948e-02, -4.945e-02));
|
||||
r += mul(s3_2, M4(1.023e-02, -1.230e-01, -1.861e-02, -2.570e-02, 2.512e-02, -4.630e-02, 6.354e-02, 3.897e-02, -2.146e-02, 2.446e-01, 1.906e-03, -9.068e-03, 1.754e-02, -7.082e-02, 1.107e-04, -9.604e-03));
|
||||
r += mul(s3_3, M4(7.380e-02, 2.216e-02, -2.608e-02, -6.491e-02, 4.018e-02, -6.657e-02, 1.116e-01, 9.405e-02, -7.168e-02, -3.646e-01, 8.387e-02, 1.352e-02, -4.589e-02, 2.235e-02, 1.881e-01, 1.759e-01));
|
||||
r += mul(s3_4, M4(-7.735e-02, -8.574e-02, -6.380e-02, 1.221e-01, 5.556e-02, -1.281e-01, 1.461e-01, 2.757e-01, 8.144e-01, -1.075e-01, 3.165e-03, -2.036e-01, 1.814e-01, 1.744e-01, -1.745e-01, 3.724e-02));
|
||||
r += mul(s3_5, M4(-6.864e-02, 1.273e-02, 7.502e-02, 4.164e-02, 1.301e-02, 1.407e-01, -9.985e-02, -8.079e-02, 1.428e-01, 3.034e-01, -1.564e-02, 6.091e-02, -1.271e-02, -2.153e-01, -7.843e-02, -4.063e-02));
|
||||
r += mul(s3_6, M4(-5.115e-02, 6.016e-02, -2.719e-02, 4.668e-02, -3.214e-02, -2.274e-02, -6.954e-03, -9.099e-03, 4.861e-02, 1.007e-01, -2.150e-01, -1.607e-01, -3.578e-02, 1.230e-02, -5.095e-02, 1.622e-02));
|
||||
r += mul(s3_7, M4(9.498e-02, -6.763e-02, 1.451e-01, 3.408e-03, -3.253e-02, 1.145e-01, 8.122e-03, -9.192e-02, 5.071e-02, -6.317e-02, 1.097e-01, 5.913e-02, 8.494e-02, 2.731e-04, -3.736e-01, -6.110e-03));
|
||||
r += mul(s3_8, M4(1.881e-02, 1.750e-02, 5.956e-02, 4.179e-02, -4.554e-02, -9.824e-02, 8.917e-03, 3.348e-02, 4.160e-02, 6.525e-02, 1.484e-02, -2.331e-02, -8.092e-02, -2.834e-02, -1.284e-01, -7.521e-02));
|
||||
// Constant offset added once after all taps have been accumulated.
r += V4(-5.270e-03, 1.390e-02, 8.622e-03, 1.255e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-4.868e-02, -7.333e-02, -1.029e-02, -7.011e-04, 2.404e-02, -9.301e-02, 1.457e-01, 2.242e-02, 6.850e-02, -1.328e-03, -2.557e-02, -4.854e-04, 1.071e-01, 3.788e-04, 1.408e-01, 5.354e-03));
|
||||
r += mul(s0_1, M4(7.892e-03, 5.832e-02, -1.077e-01, -6.140e-02, -1.003e-02, -4.887e-01, 8.263e-01, 2.416e-01, -3.434e-02, 1.089e-02, -1.984e-02, -3.615e-02, -5.692e-03, 1.615e-02, -5.680e-02, 2.041e-02));
|
||||
r += mul(s0_2, M4(2.136e-02, -2.731e-02, -1.742e-02, 2.592e-02, -4.319e-02, 6.426e-03, 2.110e-02, -6.338e-02, -6.921e-03, -1.288e-03, 4.579e-02, -2.155e-03, 3.041e-02, 1.946e-02, 1.238e-02, 6.906e-02));
|
||||
r += mul(s0_3, M4(-2.058e-02, 3.187e-02, -1.057e-01, 2.407e-01, -3.813e-02, -2.640e-02, 3.941e-02, 1.362e-01, 2.406e-02, 1.518e-02, -4.224e-02, 3.455e-02, 5.443e-02, -6.617e-02, -8.858e-02, -1.949e-02));
|
||||
r += mul(s0_4, M4(3.205e-01, -6.490e-01, -3.962e-01, -1.142e-01, -3.091e-02, 4.755e-01, -2.822e-01, -1.328e-01, -5.487e-01, 5.932e-02, -2.439e-02, -1.689e-01, -3.681e-02, -8.227e-02, 3.967e-02, -8.989e-02));
|
||||
r += mul(s0_5, M4(-5.668e-02, 3.658e-02, 1.227e-02, 8.117e-02, 1.161e-01, 9.350e-02, 9.971e-02, -1.220e-01, 7.876e-02, 5.186e-02, -4.261e-02, 1.436e-01, -2.114e-02, 6.113e-02, 2.251e-02, 2.534e-02));
|
||||
r += mul(s0_6, M4(-5.365e-02, -2.678e-02, -2.565e-02, 7.923e-02, -2.138e-02, -4.932e-02, -6.107e-03, 1.685e-02, 5.425e-02, -1.012e-02, -9.037e-03, 8.218e-04, -1.210e-02, 5.623e-02, -2.094e-02, -2.325e-02));
|
||||
r += mul(s0_7, M4(2.031e-02, -3.187e-02, 8.229e-02, 1.457e-01, 1.044e-01, -4.475e-02, 2.858e-02, -7.345e-02, -3.919e-02, -5.753e-02, 1.684e-02, -1.669e-01, 9.680e-03, 1.254e-01, 2.022e-03, -9.900e-02));
|
||||
r += mul(s0_8, M4(-1.164e-02, 5.171e-02, -5.704e-02, -1.643e-01, 2.554e-02, -9.988e-02, 3.699e-02, -3.752e-02, -8.076e-04, -2.527e-02, -2.081e-02, 3.110e-02, 1.484e-03, 4.064e-02, 2.481e-02, 2.225e-01));
|
||||
r += mul(s1_0, M4(4.384e-02, -1.401e-01, -4.071e-02, -1.137e-02, -4.979e-03, 6.159e-02, 1.275e-01, 6.544e-02, 1.288e-01, -4.421e-02, -4.471e-02, 2.682e-02, 5.621e-02, -4.062e-02, 1.034e-01, 6.606e-02));
|
||||
r += mul(s1_1, M4(2.799e-02, -1.333e-01, 1.521e-01, -5.025e-02, -1.895e-01, -7.913e-02, -2.321e-01, -6.526e-02, -1.330e-02, 1.499e-02, 1.620e-01, 6.936e-02, -6.816e-02, 1.353e-01, 1.107e-01, 5.514e-02));
|
||||
r += mul(s1_2, M4(5.826e-03, -5.941e-03, -2.338e-02, -2.826e-02, 9.265e-02, 3.608e-02, 1.114e-01, 1.274e-01, -1.291e-01, 1.284e-02, -7.540e-02, -3.458e-02, -1.006e-02, 2.083e-02, -8.393e-02, 8.186e-02));
|
||||
r += mul(s1_3, M4(-3.893e-02, -1.137e-02, 1.243e-01, 1.118e-01, 7.397e-02, -1.316e-01, -1.303e-01, -7.808e-05, 1.468e-02, -4.172e-03, -5.014e-02, -2.610e-02, 8.366e-02, -2.755e-02, 1.646e-04, -2.938e-02));
|
||||
r += mul(s1_4, M4(-1.114e-01, -2.017e-01, 2.898e-03, -7.984e-02, -8.403e-02, 2.626e-02, 1.563e-01, -1.397e-02, 2.724e-02, -6.698e-01, 2.358e-01, -6.466e-01, -9.650e-03, -6.742e-01, 1.411e-01, -3.343e-01));
|
||||
r += mul(s1_5, M4(7.380e-02, 6.420e-02, 7.990e-02, 6.014e-02, 5.950e-02, 6.212e-02, -7.881e-02, -2.782e-02, 1.087e-01, -3.347e-02, 3.819e-01, 1.988e-01, 5.813e-02, 2.239e-02, 3.012e-01, 1.275e-01));
|
||||
r += mul(s1_6, M4(-9.473e-02, -2.417e-02, -2.870e-02, 7.718e-02, -2.223e-02, 2.306e-02, 8.255e-03, -1.818e-02, -2.983e-02, -3.495e-02, 1.540e-02, 8.013e-02, 1.651e-02, -1.298e-02, 2.377e-02, 5.523e-02));
|
||||
r += mul(s1_7, M4(1.414e-01, 1.346e-01, 4.336e-03, -7.594e-02, -2.044e-02, -9.596e-03, -1.087e-03, 5.324e-02, -2.041e-02, -6.328e-02, 7.533e-02, -3.971e-01, 5.408e-04, 1.087e-01, 9.749e-03, -2.047e-01));
|
||||
r += mul(s1_8, M4(4.656e-02, -4.771e-02, -2.210e-02, -2.060e-02, -6.953e-03, -3.366e-02, -7.290e-03, -3.300e-02, -1.354e-01, -5.015e-02, -2.887e-02, 2.802e-01, 2.605e-02, -1.972e-02, 1.168e-03, 1.422e-01));
|
||||
r += mul(s2_0, M4(8.826e-02, -4.751e-02, 2.493e-01, 4.446e-02, 1.752e-01, 5.741e-03, -1.820e-01, 1.371e-02, -6.855e-02, 1.164e-02, -5.215e-02, -7.373e-04, -1.491e-02, 7.033e-03, -5.440e-02, -9.302e-05));
|
||||
r += mul(s2_1, M4(-6.871e-02, -9.419e-03, 3.276e-01, 2.826e-02, 5.675e-02, -3.974e-03, 1.104e-01, -2.975e-02, 3.281e-02, 8.429e-03, 1.129e-01, -4.830e-02, -4.374e-02, -6.905e-02, 8.143e-02, 3.180e-03));
|
||||
r += mul(s2_2, M4(-7.197e-02, -1.804e-02, -9.024e-02, -1.527e-03, 2.403e-02, 6.062e-02, 3.346e-02, 4.784e-02, -1.462e-02, 4.216e-02, 2.800e-02, 4.034e-04, -4.216e-02, -4.431e-03, -3.496e-02, -3.005e-02));
|
||||
r += mul(s2_3, M4(2.710e-02, -6.523e-02, 1.559e-01, -6.059e-02, 1.965e-01, -1.608e-01, -9.293e-03, -2.404e-01, -4.061e-02, 8.819e-02, 2.112e-02, 2.398e-01, -1.463e-01, 5.373e-02, -1.346e-02, 3.025e-02));
|
||||
r += mul(s2_4, M4(1.846e-02, 3.857e-01, -4.128e-01, -2.530e-01, -2.312e-01, 3.354e-02, -3.948e-01, -1.465e-01, 1.072e-01, -8.544e-02, -7.428e-02, 4.751e-02, 2.139e-01, 3.097e-01, -3.761e-01, 5.621e-02));
|
||||
r += mul(s2_5, M4(-1.203e-02, -9.598e-02, 4.101e-01, 1.578e-01, -5.394e-02, -6.714e-02, -6.320e-02, 8.249e-03, 5.620e-02, -3.219e-02, 9.398e-03, 6.809e-02, -7.400e-02, -1.431e-01, -1.425e-01, -2.358e-02));
|
||||
r += mul(s2_6, M4(4.035e-02, 5.655e-02, -3.307e-03, -3.497e-02, 6.522e-02, 1.103e-01, -9.802e-02, -2.655e-01, -5.802e-02, -4.359e-02, 3.459e-03, 1.592e-01, -2.566e-02, -1.156e-01, 2.646e-02, 3.806e-02));
|
||||
r += mul(s2_7, M4(-3.211e-02, -1.509e-01, 2.028e-03, -1.702e-01, 4.576e-02, -5.340e-02, 5.503e-02, 1.257e-02, -5.581e-02, 9.818e-02, -2.745e-02, 1.486e-01, 1.063e-01, -3.707e-01, 1.116e-01, -6.709e-02));
|
||||
r += mul(s2_8, M4(9.062e-04, -7.371e-03, 8.420e-02, 1.629e-01, -2.707e-02, -5.219e-03, 6.567e-02, 1.766e-01, 4.554e-04, 1.178e-02, -1.124e-02, 3.477e-02, -5.473e-02, -7.643e-02, 9.083e-03, -4.250e-02));
|
||||
r += mul(s3_0, M4(8.537e-02, 7.246e-02, 5.043e-02, 3.850e-02, -3.951e-02, 9.224e-03, 1.640e-02, 1.906e-02, -1.333e-01, -8.517e-02, -1.410e-01, 4.781e-02, -1.641e-02, 2.463e-03, -7.445e-02, -4.602e-02));
|
||||
r += mul(s3_1, M4(7.934e-02, 7.380e-03, -1.062e-01, -4.154e-03, -2.611e-02, -3.119e-02, 9.679e-02, -1.394e-02, -1.108e-01, 1.158e-02, 1.850e-01, -6.765e-02, 5.765e-02, 3.392e-02, -1.560e-02, -6.052e-02));
|
||||
r += mul(s3_2, M4(-3.056e-02, -8.450e-03, 1.524e-02, -1.007e-02, 1.030e-02, 4.032e-02, 9.837e-02, 9.371e-03, 4.740e-02, -4.795e-02, -3.356e-02, 8.602e-03, 2.484e-02, -9.889e-03, 6.734e-02, -2.287e-02));
|
||||
r += mul(s3_3, M4(6.303e-02, -7.328e-02, -5.082e-02, -5.070e-02, -8.129e-03, 7.948e-03, 7.351e-02, 5.601e-02, -4.169e-01, -8.219e-03, 3.373e-01, 1.781e-01, -5.139e-02, 1.471e-01, 3.415e-02, 7.690e-02));
|
||||
r += mul(s3_4, M4(1.455e-01, 6.664e-02, 5.792e-02, -1.276e-01, 2.360e-01, 5.978e-02, 5.147e-02, 2.707e-01, -7.581e-01, -2.740e-01, -3.000e-01, -2.017e-02, 2.005e-01, 6.934e-02, 2.996e-02, -3.036e-01));
|
||||
r += mul(s3_5, M4(-1.708e-01, -6.628e-02, 9.170e-02, 3.461e-02, -1.028e-01, -4.244e-02, -1.955e-01, -1.875e-01, 8.215e-02, 2.976e-02, -4.637e-03, 1.512e-01, -4.626e-02, 3.819e-03, -2.222e-01, -1.078e-01));
|
||||
r += mul(s3_6, M4(9.585e-02, 3.354e-02, -5.140e-03, -2.883e-02, -9.057e-02, 3.950e-02, -5.781e-02, -2.509e-02, -7.106e-02, -1.351e-01, -4.933e-02, 8.332e-02, -2.922e-02, -3.890e-02, 3.291e-03, 7.054e-02));
|
||||
r += mul(s3_7, M4(-8.208e-02, 1.377e-02, -2.475e-02, -1.353e-01, 9.728e-02, 7.136e-02, -3.984e-02, 1.374e-01, -1.160e-01, 4.362e-02, 6.714e-02, 5.038e-03, 1.866e-01, -2.349e-01, 8.599e-02, -1.510e-01));
|
||||
r += mul(s3_8, M4(-3.279e-02, 4.634e-02, 1.698e-02, 1.410e-01, -2.615e-02, 1.178e-02, 5.268e-02, 5.209e-02, 1.624e-02, 2.431e-02, -3.280e-02, 7.160e-02, -6.704e-02, -1.195e-01, -4.136e-02, -2.048e-01));
|
||||
r += V4(-7.279e-03, 1.016e-02, -7.400e-03, 4.979e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 5 entry point: each thread handles one texel of the input-sized grid.
// It gathers a 3x3 neighbourhood from the two sources behind the l0/l1
// macros, splits every sample into its positive and negative part, and writes
// the two convolution results (f0, f1) to t0 and t1.
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` must keep these exact names. Nothing below reads
	// `pos` directly, so the l0/l1 sampling macros (defined outside this
	// function) presumably pick both up by name.
	float2 pt = float2(GetInputPt());
	uint2 dst = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input grid have nothing to do.
	if (any(dst >= bounds)) {
		return;
	}
	float2 pos = (dst + 0.5) * pt;

	// Raw 3x3 taps of the first source, row by row (y = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Raw 3x3 taps of the second source, same ordering.
	V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Split each tap: xPosN keeps the non-negative part, xNegN the
	// non-positive part, so the convolution can weight the two halves
	// separately.
	V4 aPos0 = max(a0, 0.0), aNeg0 = -max(-a0, 0.0);
	V4 aPos1 = max(a1, 0.0), aNeg1 = -max(-a1, 0.0);
	V4 aPos2 = max(a2, 0.0), aNeg2 = -max(-a2, 0.0);
	V4 aPos3 = max(a3, 0.0), aNeg3 = -max(-a3, 0.0);
	V4 aPos4 = max(a4, 0.0), aNeg4 = -max(-a4, 0.0);
	V4 aPos5 = max(a5, 0.0), aNeg5 = -max(-a5, 0.0);
	V4 aPos6 = max(a6, 0.0), aNeg6 = -max(-a6, 0.0);
	V4 aPos7 = max(a7, 0.0), aNeg7 = -max(-a7, 0.0);
	V4 aPos8 = max(a8, 0.0), aNeg8 = -max(-a8, 0.0);

	V4 bPos0 = max(b0, 0.0), bNeg0 = -max(-b0, 0.0);
	V4 bPos1 = max(b1, 0.0), bNeg1 = -max(-b1, 0.0);
	V4 bPos2 = max(b2, 0.0), bNeg2 = -max(-b2, 0.0);
	V4 bPos3 = max(b3, 0.0), bNeg3 = -max(-b3, 0.0);
	V4 bPos4 = max(b4, 0.0), bNeg4 = -max(-b4, 0.0);
	V4 bPos5 = max(b5, 0.0), bNeg5 = -max(-b5, 0.0);
	V4 bPos6 = max(b6, 0.0), bNeg6 = -max(-b6, 0.0);
	V4 bPos7 = max(b7, 0.0), bNeg7 = -max(-b7, 0.0);
	V4 bPos8 = max(b8, 0.0), bNeg8 = -max(-b8, 0.0);

	// Argument order expected by f0/f1: first-source positive parts,
	// first-source negative parts, second-source positive parts,
	// second-source negative parts.
	t0[dst] = f0(aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
		aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
		bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
		bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
	t1[dst] = f1(aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
		aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
		bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
		bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);
}
|
||||
|
||||
//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT

// Sampling shorthands for this pass: fetch t0 (l0) or t1 (l1) at the offset
// (dx, dy) and convert the result to V4. How the offset is applied is decided
// by the O() helper, which is defined outside this block.
#define l0(dx, dy) V4(O(t0, float2(dx, dy)))
#define l1(dx, dy) V4(O(t1, float2(dx, dy)))
|
||||
|
||||
// Output convolution of pass 6.
// Takes four groups of nine V4 taps (s0_*, s1_*, s2_*, s3_*; index 0..8 walks
// the 3x3 window in the order the caller sampled it), multiplies every tap by
// its own 4x4 weight matrix, adds the bias and squashes the sum with tanh.
// The weights are trained constants: neither their values nor the order in
// which the terms are accumulated may be changed.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 acc = 0.0;

	// Group s0_*
	acc += mul(s0_0, M4(1.838e-01, -1.901e-02, 9.627e-03, -5.113e-02, -2.616e-02, -2.850e-02, -1.739e-02, 9.125e-03, 1.563e-02, -1.253e-02, 1.902e-02, -1.512e-02, 3.495e-03, -1.497e-02, 4.974e-03, -1.115e-02));
	acc += mul(s0_1, M4(-4.288e-01, 2.549e-01, -1.017e-01, -1.945e-01, 4.749e-02, 5.258e-02, -8.284e-03, -2.265e-02, -7.010e-02, -1.542e-02, 6.889e-03, 4.028e-02, -1.355e-02, 4.321e-03, 3.330e-02, 8.256e-02));
	acc += mul(s0_2, M4(-1.045e-03, -2.140e-01, -3.928e-02, 1.364e-02, 1.787e-03, -9.427e-03, 1.927e-03, -2.145e-03, -4.617e-03, 2.814e-02, 2.045e-02, 1.776e-02, 3.188e-02, 3.187e-02, -1.498e-02, -2.180e-02));
	acc += mul(s0_3, M4(1.059e-02, -5.522e-02, 6.447e-02, -4.395e-02, -4.846e-02, -2.209e-02, 1.866e-02, -5.920e-02, -2.419e-02, 1.032e-02, 7.736e-04, 1.092e-03, 1.854e-02, -4.388e-03, 3.893e-02, 9.549e-03));
	acc += mul(s0_4, M4(3.358e-02, 7.431e-03, 1.780e-01, 5.353e-01, 2.674e-01, 2.227e-01, 1.450e-01, 2.085e-01, -8.104e-03, -3.561e-02, -1.231e-01, -1.932e-01, -6.538e-02, 3.378e-02, -1.314e-01, -2.862e-02));
	acc += mul(s0_5, M4(-7.887e-02, 4.783e-02, -1.380e-01, -3.877e-01, 3.436e-03, 4.712e-02, -1.250e-02, 2.247e-02, 2.920e-02, 7.190e-02, -2.005e-02, 6.169e-02, 5.594e-03, -4.630e-02, 9.661e-02, 3.625e-02));
	acc += mul(s0_6, M4(1.963e-02, -1.873e-02, 1.489e-02, -1.253e-02, -9.356e-03, 1.334e-02, -3.747e-02, -1.115e-02, 7.741e-04, -6.463e-03, 3.707e-03, -3.598e-03, 1.783e-02, -5.539e-03, 9.899e-03, -9.354e-03));
	acc += mul(s0_7, M4(-2.952e-03, 3.132e-02, -6.679e-02, 2.883e-02, -3.721e-02, -3.573e-02, 1.204e-01, -6.785e-02, -1.208e-02, -1.355e-04, 2.872e-02, 2.196e-02, -1.655e-02, 3.784e-02, -5.921e-03, 2.494e-02));
	acc += mul(s0_8, M4(-1.215e-02, -2.947e-02, -2.454e-03, -6.326e-02, 2.248e-03, 2.302e-02, -2.863e-03, 5.834e-02, 2.187e-02, 9.973e-03, 2.158e-02, 4.902e-02, -2.207e-02, -3.485e-02, -5.118e-02, -6.696e-02));

	// Group s1_*
	acc += mul(s1_0, M4(8.377e-02, -3.093e-02, 2.280e-02, -2.664e-02, -4.333e-02, -3.292e-02, -8.109e-03, 1.105e-02, 1.507e-02, 9.138e-03, 2.597e-02, -1.926e-02, 4.537e-02, -9.080e-03, -1.629e-02, -1.180e-02));
	acc += mul(s1_1, M4(-7.478e-02, 1.238e-01, -5.092e-02, -3.473e-02, 4.269e-02, 4.444e-02, 7.295e-03, 1.274e-04, -1.646e-01, -1.551e-03, 3.424e-02, 4.906e-02, -2.056e-01, -5.847e-02, 5.262e-02, 1.049e-01));
	acc += mul(s1_2, M4(-6.323e-02, -1.179e-01, -1.982e-02, -4.065e-02, 3.089e-03, -9.469e-03, 2.850e-03, 3.314e-03, -1.819e-03, -1.065e-01, 1.882e-02, 9.349e-03, 1.624e-02, -2.906e-02, -2.029e-02, -5.020e-02));
	acc += mul(s1_3, M4(1.101e-01, -3.490e-02, 1.327e-01, -2.853e-02, -5.027e-03, -5.703e-02, 6.484e-03, -6.473e-02, -4.310e-02, 3.882e-02, -3.100e-02, -7.837e-04, -5.501e-02, -1.261e-02, 7.285e-02, 4.648e-02));
	acc += mul(s1_4, M4(-6.214e-02, 1.841e-01, -9.546e-02, 3.700e-01, 2.824e-01, 3.400e-01, 2.309e-01, 2.237e-01, 3.482e-01, -1.294e-01, -4.546e-01, -3.556e-01, -4.730e-01, -1.392e-01, 4.776e-01, 1.210e-01));
	acc += mul(s1_5, M4(-5.408e-02, -1.286e-01, -6.571e-02, -1.230e-01, 9.991e-03, 6.421e-02, 4.305e-03, 1.780e-02, 2.254e-02, 3.661e-01, 6.275e-02, 8.004e-02, -1.834e-02, -3.465e-01, 9.274e-02, 3.935e-01));
	acc += mul(s1_6, M4(-1.620e-03, -1.423e-02, 2.785e-02, -1.252e-02, -1.218e-02, 2.842e-03, -3.496e-02, -2.927e-02, -2.106e-02, -7.099e-03, 2.545e-02, 2.484e-02, 2.973e-02, 4.563e-04, 2.010e-04, -1.839e-02));
	acc += mul(s1_7, M4(1.400e-04, 3.080e-02, -6.992e-03, 8.032e-02, -2.280e-02, -4.436e-02, 7.600e-02, 1.165e-02, -9.494e-02, -2.207e-02, 2.783e-01, 2.095e-01, 2.645e-02, 5.203e-02, -7.492e-02, -1.303e-02));
	acc += mul(s1_8, M4(-5.360e-03, -2.277e-02, -2.252e-02, -6.191e-02, 1.263e-02, 1.540e-02, 9.566e-03, 3.637e-02, -1.265e-02, -3.092e-02, -1.298e-02, -1.187e-02, 1.286e-02, 1.181e-02, -5.675e-02, -5.487e-02));

	// Group s2_*
	acc += mul(s2_0, M4(6.275e-02, 3.332e-02, 2.458e-02, -1.910e-02, -1.764e-02, 2.292e-02, -3.220e-02, -1.127e-02, 4.114e-02, 4.303e-02, -3.355e-02, -8.882e-03, 1.881e-02, 1.788e-02, -3.354e-03, -1.345e-02));
	acc += mul(s2_1, M4(-1.562e-01, -7.407e-02, -5.684e-02, -3.194e-03, 1.150e-01, -2.700e-02, -9.666e-03, -3.629e-02, 5.862e-02, 6.747e-02, -1.085e-02, -3.454e-02, 7.263e-05, 2.167e-02, 5.491e-03, -6.472e-02));
	acc += mul(s2_2, M4(5.068e-03, 4.899e-02, 1.480e-02, -2.153e-02, 1.102e-02, 2.831e-02, -5.931e-03, 1.021e-02, -1.267e-02, -1.569e-02, 5.418e-04, 1.030e-02, -3.280e-02, -3.072e-02, -2.688e-02, -2.208e-02));
	acc += mul(s2_3, M4(-7.105e-02, 1.664e-03, -3.108e-02, 6.985e-02, 3.176e-02, 2.312e-02, -3.835e-02, 3.884e-02, -1.038e-01, 6.660e-02, -1.372e-01, 2.432e-02, -2.888e-04, -2.049e-02, 2.271e-02, 9.383e-03));
	acc += mul(s2_4, M4(4.697e-01, -3.721e-01, 1.705e-01, -2.767e-01, -1.791e-02, -7.276e-02, 2.503e-01, 1.040e-01, 1.180e-02, -5.212e-01, 4.014e-01, 1.946e-01, -2.547e-02, -1.567e-02, -5.652e-02, 9.687e-02));
	acc += mul(s2_5, M4(3.024e-02, 1.618e-02, 2.619e-02, 8.868e-02, -5.217e-02, -7.642e-02, -3.704e-02, -2.374e-02, -4.639e-02, 5.743e-02, -3.967e-02, -2.450e-02, 2.091e-02, -1.108e-02, 6.949e-03, -1.502e-02));
	acc += mul(s2_6, M4(5.298e-03, -6.810e-03, -1.982e-02, 1.960e-04, 3.645e-03, 5.483e-03, 3.357e-03, 3.697e-02, -1.339e-02, 3.253e-02, 3.649e-02, 4.492e-03, 2.076e-02, -9.046e-03, 2.043e-02, -7.803e-03));
	acc += mul(s2_7, M4(-2.707e-02, 7.878e-02, 1.816e-01, -1.506e-03, 9.060e-03, -1.418e-02, -6.983e-02, -5.833e-02, 3.309e-02, -2.537e-02, -3.298e-01, -1.735e-01, -2.132e-03, 5.241e-02, 1.155e-02, 4.817e-02));
	acc += mul(s2_8, M4(-1.781e-02, 1.652e-02, -7.188e-03, 2.114e-03, -1.105e-02, -1.137e-02, -9.037e-03, -5.600e-02, -1.220e-02, 8.292e-03, -3.404e-03, -4.211e-02, -1.018e-02, -1.004e-02, 1.505e-03, -4.591e-03));

	// Group s3_*
	acc += mul(s3_0, M4(7.349e-02, 2.926e-02, 2.398e-02, -1.821e-02, -1.290e-02, 1.201e-02, 5.000e-03, 1.316e-02, -1.567e-02, 2.025e-02, -2.171e-02, -3.941e-04, -7.948e-03, 6.116e-02, -9.445e-03, 1.911e-02));
	acc += mul(s3_1, M4(-1.294e-01, -6.121e-02, -4.576e-02, 9.211e-03, 1.371e-02, -1.964e-02, -3.133e-03, 4.701e-03, 9.544e-02, 6.692e-03, 4.665e-04, -2.056e-02, 3.455e-01, -2.495e-01, 1.027e-02, -7.393e-02));
	acc += mul(s3_2, M4(1.532e-02, 9.402e-03, 6.812e-04, -3.241e-02, 1.245e-03, 6.504e-03, 3.970e-03, 7.168e-03, 9.435e-03, 1.574e-02, -5.118e-03, 5.232e-03, -2.659e-02, -6.011e-02, -2.446e-02, 8.062e-04));
	acc += mul(s3_3, M4(-6.714e-02, 3.454e-03, -1.486e-02, 5.921e-02, 5.177e-02, 3.766e-02, -1.473e-01, 4.371e-02, -7.118e-02, 2.462e-02, -1.810e-02, 3.430e-02, -5.552e-02, 3.047e-02, -5.066e-02, 5.769e-02));
	acc += mul(s3_4, M4(3.191e-02, -1.387e-01, -5.992e-02, -1.554e-01, 4.660e-01, 3.655e-01, 2.406e-02, -3.902e-01, 3.973e-02, -1.333e-01, 1.792e-01, 8.854e-02, 2.477e-01, -3.115e-01, 6.035e-01, -4.717e-01));
	acc += mul(s3_5, M4(7.241e-02, 1.273e-01, 6.810e-02, 1.118e-01, -5.454e-02, -1.728e-02, -1.007e-01, -2.265e-02, -4.534e-02, -5.171e-02, -2.524e-02, -3.337e-02, -2.366e-03, -1.723e-02, 2.300e-02, -9.889e-02));
	acc += mul(s3_6, M4(3.383e-02, -7.898e-03, 1.681e-02, -5.131e-03, 3.687e-02, 1.929e-02, -7.695e-03, 2.145e-03, -2.814e-02, 3.366e-02, -8.788e-02, 3.614e-02, 2.951e-02, -6.964e-03, 2.272e-02, 1.581e-02));
	acc += mul(s3_7, M4(7.020e-03, 6.046e-02, 2.975e-02, 5.663e-02, 2.155e-02, -1.786e-02, -2.588e-01, -1.310e-01, -6.372e-02, -1.218e-01, -7.160e-02, -3.058e-01, 2.297e-03, 3.050e-02, -2.346e-02, 4.674e-02));
	acc += mul(s3_8, M4(-1.071e-02, -1.089e-02, 9.286e-03, 5.202e-02, -2.291e-02, -2.655e-02, 2.386e-02, -3.231e-02, 4.599e-03, 1.114e-02, -1.630e-02, -1.693e-02, -2.194e-03, 1.842e-02, -2.522e-02, 5.265e-02));

	// Bias, then tanh keeps each output component inside (-1, 1).
	acc += V4(-1.667e-03, -2.914e-03, -1.783e-03, -1.113e-03);
	return tanh(acc);
}
|
||||
|
||||
// Pass 6 entry point ("out-shuffle"): each thread produces a 2x2 quad of
// OUTPUT pixels. It runs the final convolution f0 on the 3x3 neighbourhoods of
// t0 and t1, then adds the four components of the result to the luma of four
// INPUT samples taken at the quad's pixel positions. Chroma is passed through
// unchanged.
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` must keep these names; the sampling macros reach
	// them through O(), whose definition is outside this function.
	float2 pt = float2(GetInputPt());
	// The shift doubles the per-thread coordinate: one thread per 2x2 quad.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 bounds = GetOutputSize();
	if (any(dst >= bounds)) {
		return;
	}
	// Halving the output coordinate gives the source texel the quad belongs to.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw 3x3 taps of t0, row by row (y = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Raw 3x3 taps of t1, same ordering.
	V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Split each tap into its non-negative part (xPosN) and its non-positive
	// part (xNegN); f0 weights the two halves separately.
	V4 aPos0 = max(a0, 0.0), aNeg0 = -max(-a0, 0.0);
	V4 aPos1 = max(a1, 0.0), aNeg1 = -max(-a1, 0.0);
	V4 aPos2 = max(a2, 0.0), aNeg2 = -max(-a2, 0.0);
	V4 aPos3 = max(a3, 0.0), aNeg3 = -max(-a3, 0.0);
	V4 aPos4 = max(a4, 0.0), aNeg4 = -max(-a4, 0.0);
	V4 aPos5 = max(a5, 0.0), aNeg5 = -max(-a5, 0.0);
	V4 aPos6 = max(a6, 0.0), aNeg6 = -max(-a6, 0.0);
	V4 aPos7 = max(a7, 0.0), aNeg7 = -max(-a7, 0.0);
	V4 aPos8 = max(a8, 0.0), aNeg8 = -max(-a8, 0.0);

	V4 bPos0 = max(b0, 0.0), bNeg0 = -max(-b0, 0.0);
	V4 bPos1 = max(b1, 0.0), bNeg1 = -max(-b1, 0.0);
	V4 bPos2 = max(b2, 0.0), bNeg2 = -max(-b2, 0.0);
	V4 bPos3 = max(b3, 0.0), bNeg3 = -max(-b3, 0.0);
	V4 bPos4 = max(b4, 0.0), bNeg4 = -max(-b4, 0.0);
	V4 bPos5 = max(b5, 0.0), bNeg5 = -max(-b5, 0.0);
	V4 bPos6 = max(b6, 0.0), bNeg6 = -max(-b6, 0.0);
	V4 bPos7 = max(b7, 0.0), bNeg7 = -max(-b7, 0.0);
	V4 bPos8 = max(b8, 0.0), bNeg8 = -max(-b8, 0.0);

	// One luma correction per pixel of the quad:
	// .x = (x, y), .y = (x+1, y), .z = (x, y+1), .w = (x+1, y+1).
	V4 delta = f0(aPos0, aPos1, aPos2, aPos3, aPos4, aPos5, aPos6, aPos7, aPos8,
		aNeg0, aNeg1, aNeg2, aNeg3, aNeg4, aNeg5, aNeg6, aNeg7, aNeg8,
		bPos0, bPos1, bPos2, bPos3, bPos4, bPos5, bPos6, bPos7, bPos8,
		bNeg0, bNeg1, bNeg2, bNeg3, bNeg4, bNeg5, bNeg6, bNeg7, bNeg8);

	// Colour-space matrices, used as mul(matrix, column vector).
	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 outPt = float2(GetOutputPt());

	// Step back half an output pixel from the source-texel centre.
	// Assumes the output is exactly 2x the input, so this lands on the centre
	// of the quad's first pixel - TODO confirm against the effect header.
	pos -= 0.5f * outPt;
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// The quad is walked (x, y) -> (x+1, y) -> (x+1, y+1) -> (x, y+1), moving
	// the write coordinate and the sample position in lock-step.
	++dst.x;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	++dst.y;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	--dst.x;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||
921
src/Effects/CuNNy/CuNNy-4x8C-NVL.hlsl
Normal file
921
src/Effects/CuNNy/CuNNy-4x8C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,921 @@
|
|||
// CuNNy 4x8C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D08N04

// Source image.
//!TEXTURE
Texture2D INPUT;

// Final image: twice the input size in each dimension.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point sampler, used by the O() fetch helper below.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear sampler.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(tex, ofs): point-sample `tex` at `pos + ofs * pt`. `pos` and `pt` are not
// parameters; they must be locals with exactly those names at the call site.
#define O(tex, ofs) tex.SampleLevel(SP, pos + ofs * pt, 0)
// Shorthands for the minimum-precision vector and matrix types.
#define V4 min16float4
#define M4 min16float4x4

// Intermediate feature textures t0..t3: input-sized, signed-normalized 8-bit
// RGBA.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;
|
||||
|
||||
//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1

// l0(dx, dy): fetch INPUT at offset (dx, dy) through O(), reduce its RGB to a
// single scalar with a fixed weighted sum, add a fixed offset and narrow the
// result to min16float.
#define l0(dx, dy) min16float((dot(float3(2.329e-01, 4.438e-01, 9.598e-02), O(INPUT, float2(dx, dy)).rgb) + -5.664e-01))
|
||||
|
||||
// First input-layer convolution of pass 1: turns nine scalar taps (s0_0 ..
// s0_8, in the order the caller sampled the 3x3 window) into four output
// channels. Each tap scales its own fixed weight vector; the bias is added
// last. The weights are trained constants and the accumulation order is kept
// as generated.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 acc = 0.0;
	acc += V4(-1.368e-01, -5.123e-02, -2.270e-01, -9.888e-02) * s0_0;
	acc += V4(3.682e-01, 4.625e-02, 1.372e-01, 3.834e-01) * s0_1;
	acc += V4(-9.245e-02, 7.555e-03, 3.923e-02, 1.252e-02) * s0_2;
	acc += V4(-2.312e-01, 2.012e-02, 1.660e-01, 4.386e-01) * s0_3;
	acc += V4(-3.965e-02, -4.834e-01, 3.729e-01, -7.207e-01) * s0_4;
	acc += V4(2.190e-01, -9.021e-02, -1.087e-01, -9.632e-03) * s0_5;
	acc += V4(4.088e-02, 1.183e-01, 8.976e-02, -1.710e-03) * s0_6;
	acc += V4(-5.188e-03, 5.274e-01, -8.856e-02, -6.446e-03) * s0_7;
	acc += V4(-7.160e-02, -9.349e-02, -3.823e-01, 1.947e-03) * s0_8;
	// Bias.
	acc += V4(3.244e-02, 2.492e-04, 8.562e-04, 1.261e-04);
	return acc;
}
|
||||
|
||||
// Second input-layer convolution of pass 1: same shape as f0 (nine scalar
// taps in, four channels out) with its own set of trained weights and bias.
// The accumulation order is kept as generated.
V4 f1(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 acc = 0.0;
	acc += V4(2.403e-02, 8.569e-03, -8.618e-02, 2.022e-02) * s0_0;
	acc += V4(4.893e-01, 2.383e-02, 2.423e-02, -3.486e-01) * s0_1;
	acc += V4(-3.682e-02, 2.437e-03, 1.872e-01, 1.135e-01) * s0_2;
	acc += V4(-2.361e-02, 2.588e-02, 7.348e-02, -8.229e-03) * s0_3;
	acc += V4(-4.433e-01, -5.131e-01, -3.778e-01, 6.107e-02) * s0_4;
	acc += V4(-4.423e-02, 2.098e-02, 9.260e-03, 4.444e-02) * s0_5;
	acc += V4(-1.370e-02, 1.009e-02, 3.020e-01, 1.159e-02) * s0_6;
	acc += V4(-3.030e-03, 8.145e-03, -2.789e-02, -7.085e-03) * s0_7;
	acc += V4(2.648e-02, 4.731e-03, -1.067e-01, -4.477e-03) * s0_8;
	// Bias.
	acc += V4(-1.971e-02, 8.202e-02, 4.706e-03, -6.665e-02);
	return acc;
}
|
||||
|
||||
// Pass 1 entry point ("in"): each thread handles one input texel. It reads
// the 3x3 neighbourhood of INPUT as scalars via l0 and stores the two
// four-channel results of the input layer (f0, f1) in t0 and t1.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are consumed by name inside the O() macro that l0
	// expands to, so they must not be renamed.
	float2 pt = float2(GetInputPt());
	uint2 dst = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads past the edge of the input have nothing to write.
	if (any(dst >= bounds)) {
		return;
	}
	// Centre of this thread's texel, scaled by the texel size.
	float2 pos = (dst + 0.5) * pt;

	// 3x3 window, row by row (y = -1, 0, +1), x running -1..+1 in each row.
	min16float tap0 = l0(-1.0, -1.0), tap1 = l0(0.0, -1.0), tap2 = l0(1.0, -1.0);
	min16float tap3 = l0(-1.0, 0.0), tap4 = l0(0.0, 0.0), tap5 = l0(1.0, 0.0);
	min16float tap6 = l0(-1.0, 1.0), tap7 = l0(0.0, 1.0), tap8 = l0(1.0, 1.0);

	t0[dst] = f0(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
	t1[dst] = f1(tap0, tap1, tap2, tap3, tap4, tap5, tap6, tap7, tap8);
}
|
||||
|
||||
//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Sampling shorthands for this pass: point-sample t0 (l0) or t1 (l1) at the
// offset (dx, dy) through O() and convert the texel to V4.
#define l0(dx, dy) V4(O(t0, float2(dx, dy)))
#define l1(dx, dy) V4(O(t1, float2(dx, dy)))
|
||||
|
||||
// Pass 2 kernel for the t2 output: accumulates mul(sample, M4(...)) over 36
// V4 inputs and adds a constant V4.
// As called from Pass2: s0_* / s1_* are the positive / negative parts of the
// nine t0 samples, s2_* / s3_* the positive / negative parts of the nine t1
// samples; index 0..8 walks offsets (-1,-1)..(1,1) row by row.
// NOTE(review): the coefficients look machine-generated (presumably trained
// weights) - do not hand-edit; V4 and M4 are declared outside this excerpt.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
// Group s0_*: positive part of the t0 samples (per Pass2).
r += mul(s0_0, M4(1.205e-01, 8.504e-02, -7.328e-02, 1.539e-01, -9.103e-03, -2.708e-02, -1.401e-01, -2.159e-01, -2.552e-01, 7.462e-02, 5.919e-02, 8.905e-02, 1.169e-01, -4.383e-03, -1.997e-01, -1.379e-01));
|
||||
r += mul(s0_1, M4(2.844e-02, 2.238e-02, 2.143e-01, -1.624e-01, 1.885e-01, 1.316e-01, -1.276e-01, -1.713e-01, 2.553e-03, -1.343e-01, 4.700e-02, 4.762e-01, -2.676e-01, 1.784e-01, -4.065e-02, 1.015e-01));
|
||||
r += mul(s0_2, M4(-4.442e-03, 3.253e-01, 2.650e-02, -2.907e-01, 2.749e-01, -3.510e-01, 8.545e-02, -2.446e-01, -1.579e-01, 9.398e-02, -4.544e-02, -9.123e-02, -2.529e-01, -2.538e-01, -2.686e-01, 2.607e-01));
|
||||
r += mul(s0_3, M4(1.518e-01, -1.515e-01, -1.597e-01, 2.163e-01, -6.933e-02, 7.220e-02, 2.114e-01, -2.227e-01, -3.743e-01, 9.056e-02, 2.612e-02, 3.036e-01, -1.583e-02, -8.293e-02, -1.068e-01, 6.201e-02));
|
||||
r += mul(s0_4, M4(-2.305e-02, 9.029e-02, -1.003e-01, -2.375e-01, -1.891e-01, 3.623e-01, -2.999e-01, -4.511e-01, 1.460e-01, -3.825e-01, 1.231e-01, 6.391e-01, -6.041e-01, 5.588e-01, -3.508e-01, -3.131e-01));
|
||||
r += mul(s0_5, M4(8.812e-02, 2.197e-01, -8.630e-03, 2.287e-02, -1.918e-01, -6.428e-01, 1.496e-01, 2.272e-01, 3.445e-02, -7.188e-03, -8.518e-02, 1.948e-01, 1.606e-01, -8.707e-01, 2.092e-02, -4.993e-01));
|
||||
r += mul(s0_6, M4(9.718e-03, 8.373e-03, 7.436e-02, -1.552e-01, 8.410e-02, -1.728e-02, -1.971e-01, 2.255e-02, -8.645e-02, 1.863e-02, -9.399e-02, -8.424e-02, -1.533e-03, 1.223e-01, 2.715e-01, -1.268e-01));
|
||||
r += mul(s0_7, M4(-4.246e-01, -1.034e-01, 3.236e-01, 5.680e-01, -1.213e-02, 1.577e-01, -9.408e-02, -7.294e-02, -6.410e-02, 4.264e-02, -8.392e-03, 2.192e-01, 1.656e-01, 4.681e-02, 9.146e-01, -6.311e-02));
|
||||
r += mul(s0_8, M4(-1.847e-01, -9.105e-02, -3.260e-02, 2.506e-01, -6.470e-02, 4.430e-02, -1.242e-02, -1.097e-01, 5.488e-02, 9.106e-02, 3.144e-02, -3.367e-05, 2.468e-01, -2.535e-01, 1.409e-01, -5.311e-01));
|
||||
// Group s1_*: negative part of the t0 samples (per Pass2).
r += mul(s1_0, M4(1.294e-01, 1.098e-01, 7.497e-03, 1.016e-01, 1.377e-02, -1.480e-02, -2.694e-02, -3.417e-02, -1.083e-01, -2.575e-03, 1.137e-01, -2.616e-01, -1.260e-01, -2.567e-02, -1.958e-01, 6.103e-02));
|
||||
r += mul(s1_1, M4(-1.355e-01, 1.168e-01, 2.368e-01, -2.379e-01, 8.556e-01, 1.401e-01, 3.238e-01, 2.737e-01, 8.041e-02, -1.662e-01, 9.181e-02, -3.488e-01, -1.586e-01, 1.407e-01, -1.126e-01, 1.825e-01));
|
||||
r += mul(s1_2, M4(-1.881e-02, 4.604e-01, -1.712e-02, 3.453e-02, 3.171e-01, -1.126e-01, 6.510e-02, 2.908e-01, -9.125e-02, 7.793e-02, -5.580e-02, -3.603e-01, 9.996e-02, -2.647e-01, -2.114e-01, 2.330e-01));
|
||||
r += mul(s1_3, M4(2.957e-01, -1.252e-01, -2.840e-01, 1.815e-01, -2.900e-01, 1.027e-01, 1.404e-01, -1.123e-01, -1.767e-01, 1.535e-03, -3.568e-03, -2.824e-01, 2.015e-01, -7.712e-02, -6.140e-02, 6.517e-02));
|
||||
r += mul(s1_4, M4(-2.439e-01, 7.096e-02, -2.116e-01, -1.980e-01, -3.221e-01, 2.007e-01, -4.243e-01, -5.013e-01, 1.181e-01, -3.735e-01, 1.812e-01, -5.095e-01, 3.646e-01, 4.013e-01, -8.028e-02, 1.287e-01));
|
||||
r += mul(s1_5, M4(-8.389e-02, -1.091e-01, 6.962e-02, 2.605e-01, -3.435e-03, -5.146e-01, 4.125e-01, 5.487e-01, -1.481e-01, 6.810e-02, -1.450e-01, -9.583e-02, 3.305e-01, -1.238e+00, 2.036e-01, 1.879e-01));
|
||||
r += mul(s1_6, M4(-8.033e-02, 5.944e-03, 2.453e-01, -2.971e-01, -5.652e-02, -1.251e-02, -1.449e-01, -5.344e-02, -1.377e-01, 9.383e-03, -1.862e-01, -2.528e-01, -3.825e-02, 7.296e-02, 2.373e-01, -1.935e-01));
|
||||
r += mul(s1_7, M4(-1.795e-01, 1.597e-01, 2.709e-01, -3.738e-01, 2.604e-02, 1.678e-01, -8.718e-02, -9.483e-03, -3.844e-02, 6.235e-02, -1.344e-01, 1.837e-02, -3.074e-02, 2.568e-02, 1.030e+00, 1.831e-01));
|
||||
r += mul(s1_8, M4(4.299e-02, 6.530e-03, -2.571e-02, 3.382e-01, -1.327e-01, 2.975e-02, -2.861e-02, 1.963e-01, 8.130e-04, 9.743e-02, -1.177e-02, -1.273e-01, -1.265e-01, -3.003e-01, 2.635e-01, 5.426e-02));
|
||||
// Group s2_*: positive part of the t1 samples (per Pass2).
r += mul(s2_0, M4(-1.538e-01, 1.580e-01, 1.392e-01, -1.077e-01, -1.228e-01, 1.853e-01, -1.010e-01, 3.144e-02, 2.203e-01, -3.309e-02, 6.819e-02, 2.708e-01, 1.720e-01, 2.635e-01, -1.290e-01, -2.932e-01));
|
||||
r += mul(s2_1, M4(1.615e-01, -1.424e-01, -2.346e-01, -1.008e-01, 1.386e-01, -2.281e-01, -1.313e-01, -5.902e-02, -3.376e-02, 1.925e-01, -1.172e-01, 7.865e-02, 2.112e-01, -7.280e-02, -1.953e-01, -1.198e-02));
|
||||
r += mul(s2_2, M4(1.280e-01, -1.353e-01, 1.251e-01, 3.212e-02, -1.144e-01, -1.492e-01, -1.499e-01, 2.211e-01, 1.307e-01, 1.336e-01, 1.977e-01, -1.429e-02, -5.395e-02, -2.772e-02, -3.214e-01, -1.907e-01));
|
||||
r += mul(s2_3, M4(-2.703e-01, 3.122e-01, 1.951e-01, -2.005e-01, 1.463e-01, 3.000e-01, 1.058e-01, 8.352e-02, 1.567e-01, -1.256e-01, -1.854e-01, -2.018e-01, 3.248e-01, 8.780e-02, 1.586e-01, -9.757e-03));
|
||||
r += mul(s2_4, M4(3.941e-02, -1.430e-01, 1.023e-01, 2.878e-01, 8.414e-02, 1.385e-01, 8.032e-02, -6.330e-02, -1.020e-01, 2.731e-01, -6.877e-02, -3.492e-01, 3.758e-01, -7.526e-02, 4.955e-01, -5.595e-01));
|
||||
r += mul(s2_5, M4(2.684e-01, -1.924e-02, -2.975e-02, 7.205e-01, 6.611e-02, -1.645e-01, 1.267e-01, 6.066e-02, 1.695e-01, -4.367e-01, -1.450e-01, -4.074e-02, 4.469e-01, -7.176e-03, 4.177e-01, -4.565e-01));
|
||||
r += mul(s2_6, M4(-1.843e-01, 2.522e-01, 3.324e-01, -1.821e-01, -1.327e-01, 1.182e-01, 1.158e-01, -2.494e-01, -6.459e-03, -6.606e-03, 1.333e-01, 2.229e-01, 2.481e-01, -2.018e-01, 2.456e-01, 2.351e-01));
|
||||
r += mul(s2_7, M4(-6.894e-03, -2.822e-01, -1.863e-01, -2.252e-01, 6.755e-02, -1.766e-01, 8.884e-02, -2.720e-03, -4.431e-02, -2.119e-02, 2.876e-01, -5.268e-01, -3.635e-01, -1.001e-01, -8.433e-01, 5.160e-01));
|
||||
r += mul(s2_8, M4(-1.786e-01, 2.208e-01, 4.289e-01, 1.663e-01, -2.341e-01, 8.148e-03, -7.557e-02, 7.817e-02, -1.340e-01, -2.341e-01, 3.123e-02, 1.120e-01, -7.753e-01, 2.056e-01, -2.926e-01, -1.222e-01));
|
||||
// Group s3_*: negative part of the t1 samples (per Pass2).
r += mul(s3_0, M4(-4.903e-02, 1.377e-01, 6.984e-02, -1.053e-02, -5.115e-01, 2.891e-01, -4.612e-01, -6.693e-01, 4.752e-02, -5.287e-02, -2.183e-02, 4.134e-01, 1.073e-02, 2.383e-01, -2.142e-01, 1.384e-01));
|
||||
r += mul(s3_1, M4(1.680e-01, -1.307e-01, -1.038e-01, -2.130e-02, -1.231e+00, -2.602e-01, -5.456e-01, 3.295e-01, -5.588e-02, 1.505e-01, -4.784e-02, -1.493e-01, 1.202e-01, -2.349e-01, -1.452e-01, -5.111e-02));
|
||||
r += mul(s3_2, M4(-8.858e-02, -1.293e-01, 9.441e-02, -1.295e-01, -3.373e-01, -1.841e-01, -1.818e-01, 1.570e+00, -8.336e-02, 2.012e-01, 1.362e-01, 1.830e-01, -6.053e-02, -1.725e-03, -2.011e-01, -1.021e-01));
|
||||
r += mul(s3_3, M4(-2.017e-01, 3.505e-01, 3.541e-02, 2.044e-01, -3.839e-01, 5.124e-01, 1.104e-01, 1.311e-01, 1.022e-01, -1.111e-01, -2.883e-01, 1.086e-01, 9.932e-02, 1.308e-01, 2.954e-01, -1.416e-02));
|
||||
r += mul(s3_4, M4(6.088e-02, -4.532e-02, -1.302e-01, -1.067e-01, -4.196e+00, 7.383e-01, -2.786e-01, -2.053e+00, -3.758e-01, 2.955e-01, -1.898e-01, 1.875e-01, 1.263e-01, 9.931e-03, 1.016e-01, 5.201e-02));
|
||||
r += mul(s3_5, M4(9.722e-03, -5.478e-02, -1.823e-01, -3.983e-02, -2.434e+00, -4.700e-01, 4.168e-01, 3.938e-01, 1.251e-01, -2.933e-01, -2.054e-02, 8.827e-02, 2.048e-02, 6.212e-02, 1.448e-01, 1.042e-01));
|
||||
r += mul(s3_6, M4(-1.605e-02, 1.851e-01, 2.427e-01, 4.894e-02, -6.032e-01, -3.413e-02, 4.158e-01, 6.903e-01, -1.865e-02, -1.318e-02, 1.003e-01, 3.193e-01, 4.503e-02, 1.880e-01, -4.608e-02, -3.137e-01));
|
||||
r += mul(s3_7, M4(-4.125e-02, -1.494e-01, 8.853e-01, -1.540e-01, -2.445e-01, 2.292e-01, 1.684e+00, 1.098e+00, 5.576e-02, -8.241e-02, 2.507e-01, -1.086e-01, 1.392e-01, -2.115e-01, -2.600e-01, 9.268e-02));
|
||||
r += mul(s3_8, M4(5.677e-02, 9.206e-02, 5.863e-02, 5.663e-02, -2.019e+00, -1.006e-01, -1.769e-01, -3.617e-01, 1.293e-02, -2.766e-01, 2.843e-02, 3.331e-01, -2.316e-01, -1.762e-01, -6.013e-03, -2.482e-02));
|
||||
// Constant term added once, independent of the samples.
r += V4(3.430e-02, -1.031e-02, -1.631e-02, -3.189e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 kernel for the t3 output: same structure as this pass's f0 (36
// mul(sample, M4(...)) terms plus a constant V4) with its own coefficients.
// As called from Pass2: s0_* / s1_* are the positive / negative parts of the
// nine t0 samples, s2_* / s3_* the positive / negative parts of the nine t1
// samples; index 0..8 walks offsets (-1,-1)..(1,1) row by row.
// NOTE(review): the coefficients look machine-generated (presumably trained
// weights) - do not hand-edit; V4 and M4 are declared outside this excerpt.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
// Group s0_*: positive part of the t0 samples (per Pass2).
r += mul(s0_0, M4(3.260e-02, 1.675e-01, 8.130e-02, -2.153e-01, -1.987e-01, -9.443e-02, 3.512e-01, 2.289e-02, 9.481e-02, -1.921e-01, -3.818e-01, 1.373e-01, -9.032e-02, 7.892e-02, 1.392e-01, -6.033e-02));
|
||||
r += mul(s0_1, M4(-8.203e-02, -1.015e-01, -1.313e-02, -5.337e-02, -2.948e-01, -2.678e-01, -2.321e-01, -5.995e-01, 1.364e-01, 1.030e-01, 1.546e-01, -1.179e-02, 1.996e-01, 2.244e-01, -2.304e-01, -1.304e-02));
|
||||
r += mul(s0_2, M4(-2.319e-02, -2.236e-02, 3.976e-02, 1.804e-01, 6.474e-02, 1.315e-01, -1.456e-02, -1.538e-01, 3.061e-02, -1.998e-02, -1.918e-02, -8.662e-02, -1.980e-01, -1.596e-01, -4.624e-01, -3.728e-01));
|
||||
r += mul(s0_3, M4(-3.171e-03, -2.887e-02, 3.107e-01, -8.532e-02, 1.489e-02, -2.798e-01, -2.458e-02, 2.922e-01, 5.196e-02, 2.333e-02, -4.100e-01, 3.851e-01, 8.566e-02, 1.655e-01, 3.680e-01, -3.572e-01));
|
||||
r += mul(s0_4, M4(4.618e-02, -3.100e-02, -1.849e-01, 2.228e-02, -2.182e-01, -5.806e-01, -6.298e-02, 2.421e-01, 4.266e-01, 7.738e-02, 4.856e-03, -1.191e-01, 3.469e-01, -8.683e-02, -2.397e-01, 6.512e-02));
|
||||
r += mul(s0_5, M4(8.363e-02, -9.745e-02, 2.398e-01, -1.335e-01, -1.585e-01, -1.161e-02, 2.482e-02, 1.319e-03, -4.696e-02, -6.675e-02, -7.519e-02, 1.125e-01, -1.199e-01, -9.094e-03, -2.590e-01, -8.812e-01));
|
||||
r += mul(s0_6, M4(7.745e-02, 3.414e-02, 6.378e-02, -8.388e-02, 4.456e-02, 1.354e-02, -1.138e-02, 1.131e-01, 2.361e-01, 1.828e-01, -2.135e-01, -1.100e-02, 1.683e-01, 2.134e-01, 1.832e-01, 8.420e-02));
|
||||
r += mul(s0_7, M4(-3.223e-01, -4.870e-02, -1.457e-01, 1.996e-01, -1.632e-01, -1.811e-01, -1.625e-01, 4.046e-02, -8.959e-02, 1.432e-01, -2.360e-02, -9.415e-02, -1.547e-01, 1.379e-01, 5.098e-01, -4.069e-01));
|
||||
r += mul(s0_8, M4(1.568e-01, -2.510e-02, -9.894e-02, 1.124e-01, -1.372e-01, 5.952e-03, 4.501e-02, 9.591e-03, 1.430e-01, 6.422e-02, -1.412e-03, 1.042e-02, 4.601e-02, -5.133e-02, -7.936e-02, -1.621e-01));
|
||||
// Group s1_*: negative part of the t0 samples (per Pass2).
r += mul(s1_0, M4(1.380e-01, 1.774e-01, 2.958e-01, -2.044e-01, -2.085e-01, 7.192e-03, -7.903e-02, 6.119e-02, -3.542e-02, -1.060e-01, -1.832e-01, 3.603e-01, -3.854e-02, 5.092e-02, -1.092e-01, -2.074e-01));
|
||||
r += mul(s1_1, M4(-5.638e-02, -1.659e-01, -1.006e-02, 5.355e-02, -2.243e-01, 3.533e-01, -2.130e-01, 6.480e-02, 4.462e-02, 1.065e-01, 1.598e-01, 5.025e-03, -3.810e-02, 1.012e-01, 2.123e-02, 2.124e-01));
|
||||
r += mul(s1_2, M4(5.207e-02, -1.428e-01, 1.745e-01, 2.563e-01, 4.058e-01, 5.320e-02, 3.527e-03, -4.664e-02, -1.641e-03, -2.830e-02, 1.453e-02, 1.169e-01, -5.840e-01, -1.545e-01, 3.880e-01, 1.250e-01));
|
||||
r += mul(s1_3, M4(-2.089e-01, 3.070e-02, 3.770e-01, -2.868e-01, -1.965e-01, -2.499e-01, -2.145e-01, 5.348e-02, -1.201e-01, -3.454e-01, -5.723e-01, 4.313e-01, -7.068e-02, -6.358e-02, -2.426e-02, -2.841e-01));
|
||||
r += mul(s1_4, M4(1.315e-01, 2.464e-01, -2.505e-01, -1.589e-01, 4.124e-01, 4.860e-01, -2.493e-01, 1.201e-01, -1.304e-01, -1.620e-01, 2.228e-01, 4.485e-02, 6.945e-02, -2.261e-01, -8.190e-04, 5.678e-01));
|
||||
r += mul(s1_5, M4(3.529e-01, 1.800e-02, -9.794e-02, -1.160e-01, 7.052e-01, 4.176e-01, 5.822e-02, -5.300e-02, -1.144e-01, -1.890e-01, 1.337e-01, 1.163e-01, -5.024e-01, 9.977e-01, 1.831e-01, 2.166e-02));
|
||||
r += mul(s1_6, M4(-1.239e-01, 1.465e-01, 3.700e-01, -1.638e-01, -1.022e-01, -3.216e-02, -2.412e-02, -2.505e-02, 5.450e-02, -1.325e-02, -2.760e-01, 5.219e-02, -5.604e-02, 3.602e-02, -1.026e-01, 4.063e-02));
|
||||
r += mul(s1_7, M4(1.669e-01, 2.580e-01, -2.923e-01, -2.497e-01, 1.135e-01, -1.599e-01, -2.419e-01, -1.202e-01, -3.903e-01, -2.141e-01, 9.642e-02, -6.096e-02, -6.762e-01, 5.614e-01, 3.076e-01, -4.187e-01));
|
||||
r += mul(s1_8, M4(5.456e-02, -6.641e-02, -3.839e-01, 8.629e-02, 1.149e-01, 1.204e-02, -2.509e-02, -1.413e-03, -1.329e-02, -5.670e-02, -6.186e-02, 5.108e-02, 3.592e-02, 4.563e-01, -7.450e-02, -2.259e-01));
|
||||
// Group s2_*: positive part of the t1 samples (per Pass2).
r += mul(s2_0, M4(1.013e-01, -2.126e-02, -1.260e-01, 8.480e-03, -3.292e-02, 6.069e-04, 4.154e-02, 5.578e-02, 1.586e-02, 8.252e-02, 1.237e-01, -1.312e-01, 1.489e-01, 2.561e-01, -9.917e-02, -1.060e-01));
|
||||
r += mul(s2_1, M4(-1.285e-01, -8.314e-02, 1.521e-02, 1.037e-01, -1.021e-02, 7.112e-02, -2.319e-02, 7.051e-04, -1.101e-01, -1.896e-01, -2.458e-01, -7.399e-02, -4.133e-02, 1.606e-01, -1.511e-01, -2.425e-01));
|
||||
r += mul(s2_2, M4(7.543e-02, 9.235e-02, 2.139e-01, 2.879e-01, 9.583e-02, 4.372e-02, -8.231e-02, 2.498e-01, 1.241e-01, 1.377e-02, 2.380e-01, 2.586e-02, -1.926e-01, -1.406e-01, -3.627e-01, -8.414e-02));
|
||||
r += mul(s2_3, M4(9.655e-03, -9.581e-02, -6.071e-02, 2.231e-01, -1.148e-01, -3.513e-02, -2.013e-02, -1.094e-01, -1.606e-01, 9.180e-02, 3.498e-01, -2.726e-01, -7.696e-03, -4.007e-01, -8.497e-02, -6.989e-01));
|
||||
r += mul(s2_4, M4(4.965e-03, -1.346e-01, -4.517e-02, 2.043e-01, -1.348e-01, 1.451e-01, 8.113e-02, -8.530e-02, -1.414e-01, 7.261e-02, -2.368e-01, 1.601e-01, -2.438e-02, -2.554e-01, 4.057e-01, -2.224e-01));
|
||||
r += mul(s2_5, M4(-8.716e-02, 1.496e-01, -4.429e-02, 6.451e-01, -9.547e-03, -3.189e-02, -1.096e-01, -5.416e-02, -5.032e-01, 1.331e-01, 2.389e-02, 1.028e-01, -3.186e-01, -2.524e-01, 2.663e-02, -9.995e-03));
|
||||
r += mul(s2_6, M4(-2.465e-01, 1.585e-01, 3.196e-01, -9.098e-02, 2.765e-02, -1.793e-01, 1.519e-01, -9.565e-04, -1.160e-01, -3.035e-02, -1.082e-01, 3.172e-02, 5.502e-01, -6.251e-01, -4.487e-01, 1.932e-01));
|
||||
r += mul(s2_7, M4(-5.017e-01, -5.180e-01, -2.682e-01, -4.715e-01, 1.958e-02, -7.007e-02, -3.332e-02, -8.389e-02, -1.135e-01, -2.956e-02, 1.994e-01, 2.315e-02, -2.553e-01, -3.153e-03, 4.275e-01, 1.669e+00));
|
||||
r += mul(s2_8, M4(1.400e-01, 6.775e-01, 5.287e-02, 2.007e-02, 1.213e-01, -1.460e-03, -2.313e-02, 1.282e-01, -8.355e-02, 2.399e-01, -5.277e-02, -1.499e-01, 7.246e-02, -2.553e-02, 2.185e-01, 8.662e-01));
|
||||
// Group s3_*: negative part of the t1 samples (per Pass2).
r += mul(s3_0, M4(3.069e-02, -3.668e-02, -3.646e-02, 1.140e-01, -7.882e-02, 2.759e-01, 9.170e-01, 2.779e-01, 1.459e-01, 3.766e-02, -1.214e-01, 5.718e-03, -3.323e-02, 9.705e-02, -1.282e-02, -1.401e-01));
|
||||
r += mul(s3_1, M4(-1.405e-02, 2.809e-02, 1.466e-01, -1.286e-01, 4.754e-01, 8.076e-01, 5.775e-02, -5.403e-01, 1.919e-01, -2.015e-01, -1.976e-01, -8.544e-02, -8.431e-02, 9.302e-02, 6.560e-02, 2.011e-02));
|
||||
r += mul(s3_2, M4(2.107e-01, 2.334e-02, -2.591e-01, -1.023e-01, 6.461e-01, 1.138e+00, 3.917e-01, 2.270e-01, 4.023e-01, 6.135e-02, 4.125e-02, -5.551e-02, 1.871e-02, -1.344e-01, -1.534e-01, 1.216e-01));
|
||||
r += mul(s3_3, M4(8.077e-02, -1.149e-01, 6.733e-02, -9.044e-03, -6.431e-02, -1.755e-02, 2.617e+00, 5.203e-01, 8.910e-02, 9.642e-02, 3.720e-01, -2.326e-01, -1.142e-01, -4.017e-02, 2.351e-01, -1.062e-01));
|
||||
r += mul(s3_4, M4(-2.427e-01, -4.425e-03, 4.260e-01, -6.273e-02, 4.224e+00, -2.047e+00, -1.911e+00, 2.329e+00, 2.987e-01, -3.286e-01, -1.115e-01, 2.053e-01, -5.309e-02, -8.751e-02, -1.275e-02, -2.105e-01));
|
||||
r += mul(s3_5, M4(-1.413e-02, -4.404e-01, -1.525e-01, -1.703e-01, -9.999e-01, 5.276e-01, 4.779e-01, -5.145e-01, 4.772e-01, 2.730e-02, -7.651e-02, -2.235e-01, -1.122e-01, -1.686e-01, 9.595e-02, -1.169e-01));
|
||||
r += mul(s3_6, M4(-1.162e-01, 3.109e-01, -2.686e-01, -1.492e-01, 2.122e-01, 6.911e-01, 7.412e-01, 3.675e-02, 1.420e-01, -3.979e-02, -3.526e-02, -1.170e-01, 2.192e-01, 6.369e-02, 2.568e-01, 1.606e-02));
|
||||
r += mul(s3_7, M4(-2.482e-02, 6.355e-01, 4.230e-01, -4.331e-01, -1.462e+00, -9.944e-01, 1.154e+00, 8.760e-01, 3.625e-01, 2.127e-01, 3.382e-01, 6.009e-02, 1.431e-01, 9.892e-02, -2.409e-01, 4.223e-02));
|
||||
r += mul(s3_8, M4(-1.832e-02, 7.811e-02, -1.928e-02, 1.448e-01, -1.288e+00, 1.805e-01, 6.324e-01, -2.704e-02, 6.456e-02, -6.364e-02, 4.971e-02, -6.535e-03, 1.766e-01, 5.142e-02, -1.375e-01, 2.532e-01));
|
||||
// Constant term added once, independent of the samples.
r += V4(8.007e-03, 2.570e-02, 2.487e-03, -2.496e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point ("conv1" in the pass header): samples the 3x3
// neighbourhood of t0 (via l0) and t1 (via l1) around this thread's pixel,
// splits every sample into its positive and negative parts, and stores the
// f0()/f1() results in t2 and t3.
//   blockStart - added to Rmp8x8(tid.x) to form the pixel coordinate gxy.
//   tid        - only tid.x is read.
// NOTE(review): pt and pos are never read by name in this body; presumably
// the O() macro behind l0/l1 (defined outside this excerpt) captures them,
// so keep these exact local names - confirm against the macro definition.
void Pass2(uint2 blockStart, uint3 tid) {
|
||||
float2 pt = float2(GetInputPt());
|
||||
uint2 gxy = Rmp8x8(tid.x) + blockStart;
|
||||
uint2 size = GetInputSize();
|
||||
// Threads whose pixel lies outside the input size do nothing.
if (gxy.x >= size.x || gxy.y >= size.y) {
|
||||
return;
|
||||
}
|
||||
// Pixel centre (gxy + 0.5) scaled by pt.
float2 pos = (gxy + 0.5) * pt;
|
||||
|
||||
// Nine t0 samples at offsets (-1,-1)..(1,1), row by row; s0_4 is the centre.
V4 s0_0 = l0(-1.0, -1.0);
|
||||
V4 s0_1 = l0(0.0, -1.0);
|
||||
V4 s0_2 = l0(1.0, -1.0);
|
||||
V4 s0_3 = l0(-1.0, 0.0);
|
||||
V4 s0_4 = l0(0.0, 0.0);
|
||||
V4 s0_5 = l0(1.0, 0.0);
|
||||
V4 s0_6 = l0(-1.0, 1.0);
|
||||
V4 s0_7 = l0(0.0, 1.0);
|
||||
V4 s0_8 = l0(1.0, 1.0);
|
||||
// Negative part of each t0 sample: -max(-x, 0) keeps x where x < 0 and
// yields 0 elsewhere. Must run before s0_* is clamped in place below.
V4 s1_0 = -max(-s0_0, 0.0);
|
||||
V4 s1_1 = -max(-s0_1, 0.0);
|
||||
V4 s1_2 = -max(-s0_2, 0.0);
|
||||
V4 s1_3 = -max(-s0_3, 0.0);
|
||||
V4 s1_4 = -max(-s0_4, 0.0);
|
||||
V4 s1_5 = -max(-s0_5, 0.0);
|
||||
V4 s1_6 = -max(-s0_6, 0.0);
|
||||
V4 s1_7 = -max(-s0_7, 0.0);
|
||||
V4 s1_8 = -max(-s0_8, 0.0);
|
||||
// Positive part: s0_* is overwritten with max(x, 0).
s0_0 = max(s0_0, 0.0);
|
||||
s0_1 = max(s0_1, 0.0);
|
||||
s0_2 = max(s0_2, 0.0);
|
||||
s0_3 = max(s0_3, 0.0);
|
||||
s0_4 = max(s0_4, 0.0);
|
||||
s0_5 = max(s0_5, 0.0);
|
||||
s0_6 = max(s0_6, 0.0);
|
||||
s0_7 = max(s0_7, 0.0);
|
||||
s0_8 = max(s0_8, 0.0);
|
||||
|
||||
// Same sampling and positive/negative split for t1 (s2_* / s3_*).
V4 s2_0 = l1(-1.0, -1.0);
|
||||
V4 s2_1 = l1(0.0, -1.0);
|
||||
V4 s2_2 = l1(1.0, -1.0);
|
||||
V4 s2_3 = l1(-1.0, 0.0);
|
||||
V4 s2_4 = l1(0.0, 0.0);
|
||||
V4 s2_5 = l1(1.0, 0.0);
|
||||
V4 s2_6 = l1(-1.0, 1.0);
|
||||
V4 s2_7 = l1(0.0, 1.0);
|
||||
V4 s2_8 = l1(1.0, 1.0);
|
||||
V4 s3_0 = -max(-s2_0, 0.0);
|
||||
V4 s3_1 = -max(-s2_1, 0.0);
|
||||
V4 s3_2 = -max(-s2_2, 0.0);
|
||||
V4 s3_3 = -max(-s2_3, 0.0);
|
||||
V4 s3_4 = -max(-s2_4, 0.0);
|
||||
V4 s3_5 = -max(-s2_5, 0.0);
|
||||
V4 s3_6 = -max(-s2_6, 0.0);
|
||||
V4 s3_7 = -max(-s2_7, 0.0);
|
||||
V4 s3_8 = -max(-s2_8, 0.0);
|
||||
s2_0 = max(s2_0, 0.0);
|
||||
s2_1 = max(s2_1, 0.0);
|
||||
s2_2 = max(s2_2, 0.0);
|
||||
s2_3 = max(s2_3, 0.0);
|
||||
s2_4 = max(s2_4, 0.0);
|
||||
s2_5 = max(s2_5, 0.0);
|
||||
s2_6 = max(s2_6, 0.0);
|
||||
s2_7 = max(s2_7, 0.0);
|
||||
s2_8 = max(s2_8, 0.0);
|
||||
|
||||
// Both outputs are computed from the same 36 inputs.
t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
|
||||
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t2, t3
|
||||
//!OUT t0, t1
|
||||
|
||||
// Pass 3 sampling helpers: l0 reads from t2 and l1 from t3 at the offset
// (x, y), converting the result to V4.
// NOTE(review): O() is defined outside this excerpt; presumably it resolves
// the offset relative to the caller's pos/pt locals - confirm.
#define l0(x, y) V4(O(t2, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||
|
||||
// Pass 3 kernel: accumulates mul(sample, M4(...)) over 36 V4 inputs and adds
// a constant V4.
// NOTE(review): the Pass3 call site lies beyond this excerpt. Judging by the
// visible part of Pass3, s0_* / s1_* are the positive / negative parts of
// the nine t2 samples and s2_* / s3_* those of the nine t3 samples, and this
// function presumably feeds the t0 output - confirm.
// NOTE(review): the coefficients look machine-generated (presumably trained
// weights) - do not hand-edit; V4 and M4 are declared outside this excerpt.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
// Group s0_*.
r += mul(s0_0, M4(2.802e-01, -3.301e-02, -1.047e-01, 6.427e-02, 1.357e-02, -8.015e-02, 7.763e-02, -9.646e-02, 1.136e-01, -1.443e-01, -3.950e-02, 2.744e-01, 8.414e-03, -1.005e-01, -1.683e-01, -5.766e-02));
|
||||
r += mul(s0_1, M4(2.907e-01, 1.339e-01, -7.005e-02, 9.074e-02, -2.491e-03, 6.498e-02, 1.121e-01, -9.272e-02, 3.415e-01, 1.949e-01, -2.613e-01, -2.328e-01, 1.311e-01, 1.285e-01, 1.685e-02, -4.780e-02));
|
||||
r += mul(s0_2, M4(1.671e-01, -2.228e-02, -5.777e-02, -5.853e-02, 1.243e-02, -3.269e-02, 8.757e-03, -1.478e-01, -4.190e-02, 3.164e-02, 2.922e-01, -3.017e-01, -6.631e-02, 5.380e-02, -2.750e-02, -7.771e-02));
|
||||
r += mul(s0_3, M4(-2.454e-02, 2.148e-01, -1.116e-01, -1.125e-01, -1.792e-01, -7.021e-01, -2.183e-01, 2.920e-01, -1.698e-01, 1.827e-01, -6.779e-02, 9.333e-02, -2.153e-01, 2.441e-01, 9.794e-02, -2.729e-01));
|
||||
r += mul(s0_4, M4(-6.750e-02, 1.324e-01, -5.087e-02, 2.746e-01, 1.579e-01, -1.909e-01, -7.631e-01, -4.744e-01, -1.732e-01, -2.741e-01, 4.145e-02, -2.124e-01, 7.946e-02, -1.579e-01, 2.856e-01, 5.090e-02));
|
||||
r += mul(s0_5, M4(8.392e-02, -1.504e-01, 2.815e-01, -1.174e-01, 3.942e-02, 1.918e-02, 1.561e-01, -1.457e-01, -5.976e-02, 1.230e-01, -2.539e-01, -1.965e-01, 1.869e-01, -1.795e-01, -1.283e-01, -3.447e-02));
|
||||
r += mul(s0_6, M4(-3.547e-03, -6.576e-03, -5.087e-02, 3.466e-02, -3.130e-03, -3.176e-01, 8.737e-02, 4.018e-02, -6.489e-02, -1.580e-03, -8.784e-03, -4.500e-02, 2.343e-03, 5.945e-02, -5.201e-02, -3.127e-02));
|
||||
r += mul(s0_7, M4(-3.546e-02, 1.145e-01, -4.773e-02, 8.280e-02, 6.746e-03, -1.036e-01, -6.616e-02, -1.224e-01, 7.156e-02, -1.941e-01, 9.307e-02, -3.567e-02, -2.215e-01, 2.437e-01, -5.542e-04, 1.208e-01));
|
||||
r += mul(s0_8, M4(-1.115e-02, -4.687e-02, -3.210e-02, -1.470e-01, -4.609e-02, 4.657e-02, -6.476e-02, -1.372e-01, -4.956e-03, 1.024e-01, -2.349e-01, -8.472e-02, -2.757e-02, -1.707e-02, 2.065e-01, 1.863e-02));
|
||||
// Group s1_*.
r += mul(s1_0, M4(3.728e-02, -7.100e-02, -4.937e-02, 6.239e-02, -7.377e-03, -3.033e-02, 1.675e-01, -1.863e-02, -2.631e-02, -9.633e-02, -1.130e-01, -1.201e-01, 1.414e-01, -1.737e-01, -8.031e-02, -6.951e-02));
|
||||
r += mul(s1_1, M4(-3.703e-02, 4.012e-02, -2.289e-02, 3.332e-02, 2.161e-02, 8.828e-02, 5.544e-02, 1.017e-01, 3.684e-01, 3.149e-01, 3.662e-01, 4.298e-02, 1.966e-01, -2.697e-02, 2.216e-02, 7.540e-02));
|
||||
r += mul(s1_2, M4(-4.974e-02, -3.826e-02, -2.810e-02, -8.318e-02, 3.356e-02, -7.605e-02, -1.087e-01, 1.987e-02, -1.153e-01, -1.039e-01, -5.868e-02, -3.313e-02, -1.750e-02, 3.884e-03, -9.170e-02, -1.011e-01));
|
||||
r += mul(s1_3, M4(2.119e-01, -1.340e-01, -3.650e-02, 2.219e-01, 3.634e-01, 3.474e-01, 2.302e-01, 7.494e-02, -2.253e-01, 1.239e-01, -6.032e-02, 1.293e-01, 9.583e-02, 4.424e-02, -3.920e-02, -1.870e-01));
|
||||
r += mul(s1_4, M4(-2.664e-01, 8.462e-02, -4.745e-01, 1.985e-01, 2.803e-01, 7.429e-02, 7.814e-01, 4.658e-01, 3.661e-01, -2.319e-02, 3.324e-01, 2.860e-01, 3.178e-01, 9.301e-02, 1.316e-01, 4.547e-02));
|
||||
r += mul(s1_5, M4(5.369e-02, 6.912e-02, 2.659e-01, -1.491e-01, 4.462e-02, -4.823e-02, 1.130e-01, 1.710e-02, -7.604e-02, -7.003e-02, 3.093e-01, 2.537e-01, 2.466e-01, -1.039e-01, 2.413e-02, -1.256e-01));
|
||||
r += mul(s1_6, M4(-1.188e-01, 1.026e-01, 4.215e-02, -9.677e-02, 2.443e-03, 1.957e-01, 2.961e-02, -5.553e-02, -3.488e-02, 2.515e-02, -4.840e-03, 1.814e-02, 9.644e-02, -8.802e-02, 3.516e-03, -2.940e-03));
|
||||
r += mul(s1_7, M4(-1.792e-01, 1.391e-01, 1.322e-02, -1.514e-02, -2.173e-01, 1.743e-01, 1.530e-01, 5.286e-02, -8.655e-02, 2.541e-01, 6.282e-02, 1.167e-01, 9.664e-02, 2.304e-01, -1.538e-01, -1.298e-01));
|
||||
r += mul(s1_8, M4(-1.720e-01, 4.693e-02, 2.790e-01, 2.187e-02, -4.386e-02, 7.714e-03, 9.800e-02, 6.484e-03, -5.497e-02, 1.216e-01, 3.924e-02, 5.162e-02, 1.403e-01, -5.364e-03, -6.795e-03, -6.163e-02));
|
||||
// Group s2_*.
r += mul(s2_0, M4(2.905e-01, -3.799e-02, 1.332e-01, 2.496e-02, 7.202e-02, -3.659e-01, -2.940e-02, -1.028e-03, -1.221e-01, 1.147e-01, 3.613e-02, 9.125e-02, -8.760e-03, 1.489e-02, -9.652e-02, 4.452e-03));
|
||||
r += mul(s2_1, M4(4.027e-01, -2.178e-01, -8.478e-02, 2.903e-01, 2.463e-02, 9.527e-03, -2.835e-01, 2.066e-01, -6.698e-02, -2.653e-01, -6.667e-02, 4.320e-02, -2.610e-01, -1.351e-01, 7.826e-02, -5.429e-02));
|
||||
r += mul(s2_2, M4(-1.249e-01, 4.376e-02, -6.245e-02, 1.702e-01, -5.731e-02, 8.022e-02, -1.335e-01, 1.528e-01, -2.969e-02, 1.062e-01, -1.303e-01, 1.226e-01, 2.030e-02, 5.205e-02, -1.877e-01, 4.309e-02));
|
||||
r += mul(s2_3, M4(-6.329e-02, -1.286e-01, -7.222e-02, 5.592e-03, -3.023e-02, 9.502e-02, -4.077e-02, -2.299e-01, -1.038e-01, -5.742e-02, -5.106e-04, 5.143e-02, 3.098e-02, -1.235e-01, 1.987e-02, 1.477e-02));
|
||||
r += mul(s2_4, M4(1.113e-01, -1.761e-01, 5.038e-02, -1.304e-01, 3.668e-01, -3.430e-01, 2.169e-01, 3.877e-01, -3.750e-02, 2.473e-01, 3.416e-02, 2.184e-01, 5.168e-01, -7.132e-02, 3.818e-01, -1.508e-01));
|
||||
r += mul(s2_5, M4(1.479e-01, -8.656e-02, -1.700e-01, 3.874e-01, 2.286e-02, -8.854e-02, 3.305e-02, -4.668e-03, -1.481e-01, 5.115e-02, 2.686e-01, 4.113e-01, -3.740e-01, -2.013e-01, 9.838e-04, 3.008e-01));
|
||||
r += mul(s2_6, M4(3.428e-01, -3.200e-01, 7.593e-02, 1.911e-01, 1.219e-01, 1.211e-02, -5.694e-02, -5.767e-02, 3.119e-02, -7.609e-02, 6.471e-02, 1.215e-01, -2.793e-04, 1.650e-02, 7.190e-03, -4.468e-02));
|
||||
r += mul(s2_7, M4(3.970e-01, -3.192e-01, -5.639e-02, 8.182e-02, -2.831e-02, 4.036e-02, 7.004e-02, 1.095e-01, -3.655e-02, 2.443e-01, 5.606e-02, -4.974e-02, 9.825e-02, 1.158e-01, -5.104e-02, -2.986e-02));
|
||||
r += mul(s2_8, M4(1.440e-01, 5.504e-02, -2.020e-01, 2.618e-03, -1.098e-02, -3.678e-02, 7.661e-02, 5.652e-02, -7.426e-02, 5.461e-02, 4.239e-01, 2.093e-01, 9.316e-03, -3.679e-02, 6.108e-02, 2.036e-01));
|
||||
// Group s3_*.
r += mul(s3_0, M4(1.806e-02, 2.233e-02, 5.056e-02, 1.758e-01, 3.566e-02, -1.383e-01, 5.349e-02, 1.066e-01, 3.314e-02, -1.258e-01, -2.885e-02, -6.648e-02, -6.860e-03, -2.283e-02, -1.052e-01, -1.623e-02));
|
||||
r += mul(s3_1, M4(7.369e-02, -3.141e-02, 3.877e-03, 8.113e-03, -1.773e-01, 5.122e-03, -3.198e-01, 9.005e-02, 7.291e-02, -1.519e-01, -1.501e-01, -8.202e-02, -4.729e-02, -2.877e-02, -4.056e-02, 7.599e-02));
|
||||
r += mul(s3_2, M4(1.282e-01, 2.477e-03, 6.185e-02, 3.967e-02, -1.343e-01, 8.884e-02, 5.299e-02, -7.324e-02, 1.842e-01, -3.053e-02, -1.335e-01, -6.790e-03, -8.128e-02, 6.665e-02, 1.583e-03, -5.358e-02));
|
||||
r += mul(s3_3, M4(1.135e-01, 9.360e-03, 1.646e-01, 1.844e-01, 1.104e-02, 7.072e-02, -9.632e-02, -1.169e-01, -1.458e-01, 2.540e-02, -5.132e-02, -1.627e-01, -1.066e-01, -4.819e-02, -4.340e-02, -5.074e-02));
|
||||
r += mul(s3_4, M4(-1.198e-01, -7.965e-02, -2.989e-01, -4.946e-01, -1.666e-02, -2.136e-01, -3.575e-02, 1.351e-01, -8.546e-02, 2.553e-02, -7.878e-02, -3.233e-01, -2.955e-01, -7.765e-02, 1.450e-01, -2.114e-01));
|
||||
r += mul(s3_5, M4(-7.593e-02, -1.849e-03, -1.688e-01, 3.626e-02, 4.408e-03, 4.014e-02, -1.401e-01, -2.239e-01, 9.538e-02, -2.310e-01, 2.831e-02, 5.065e-02, 1.135e-01, 2.542e-02, -4.365e-01, 4.393e-02));
|
||||
r += mul(s3_6, M4(-5.217e-02, -1.327e-02, -1.851e-02, 2.806e-02, 4.648e-02, -9.047e-04, 2.961e-02, -2.922e-02, 6.360e-02, -3.494e-02, 2.573e-02, 1.309e-02, -2.512e-03, -4.086e-02, -2.086e-03, -6.018e-02));
|
||||
r += mul(s3_7, M4(9.887e-02, -9.515e-03, 1.306e-01, 5.290e-02, 1.832e-01, -2.549e-01, -4.640e-02, -1.256e-01, 4.915e-02, -5.163e-02, 3.044e-02, -9.871e-02, 8.168e-03, -7.112e-02, -5.743e-02, 3.687e-02));
|
||||
r += mul(s3_8, M4(-6.440e-02, 2.530e-02, -2.166e-03, -4.680e-02, 8.009e-02, -6.634e-02, -1.390e-01, -2.524e-02, 6.524e-02, -1.120e-01, -4.252e-02, -8.413e-03, -2.017e-02, 1.444e-02, -4.483e-02, 4.690e-02));
|
||||
// Constant term added once, independent of the samples.
r += V4(3.009e-03, -1.445e-03, 8.191e-03, -7.852e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 kernel: same structure as this pass's f0 (36 mul(sample, M4(...))
// terms plus a constant V4) with its own coefficients.
// NOTE(review): the Pass3 call site lies beyond this excerpt. Judging by the
// visible part of Pass3, s0_* / s1_* are the positive / negative parts of
// the nine t2 samples and s2_* / s3_* those of the nine t3 samples, and this
// function presumably feeds the t1 output - confirm.
// NOTE(review): the coefficients look machine-generated (presumably trained
// weights) - do not hand-edit; V4 and M4 are declared outside this excerpt.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
// Group s0_*.
r += mul(s0_0, M4(-1.005e-01, -6.367e-02, 4.428e-02, 1.687e-02, -9.639e-02, -1.209e-01, -1.374e-02, 4.932e-02, -9.949e-02, -2.569e-01, 1.199e-01, 1.077e-02, 5.110e-02, -1.129e-01, 6.104e-02, -4.656e-03));
|
||||
r += mul(s0_1, M4(-2.156e-01, 8.505e-02, 4.815e-04, -1.042e-01, -2.724e-01, -1.870e-01, 3.876e-02, 7.840e-02, -4.018e-01, -8.239e-01, 2.611e-01, -3.623e-01, -6.999e-03, 1.848e-02, 6.095e-02, -2.318e-02));
|
||||
r += mul(s0_2, M4(-2.195e-01, -6.727e-02, 7.111e-02, 5.119e-02, 7.396e-02, 1.116e-02, -1.261e-02, 9.531e-02, -3.892e-01, 1.430e-01, -9.840e-02, -2.423e-01, 2.669e-01, 3.009e-02, -2.478e-02, 1.168e-01));
|
||||
r += mul(s0_3, M4(-4.344e-01, 8.202e-02, 9.272e-03, -8.384e-02, -8.136e-02, -4.359e-01, 2.361e-01, -2.183e-01, 4.609e-02, -2.144e-02, 9.525e-03, -7.197e-02, -9.339e-02, 1.927e-01, -1.687e-02, 3.193e-02));
|
||||
r += mul(s0_4, M4(4.702e-01, 1.415e-04, 1.097e-01, 2.415e-01, 1.899e-01, -7.324e-01, -4.745e-03, -1.237e-01, -2.043e-01, 2.674e-02, 6.899e-01, 8.700e-02, 5.083e-02, 2.271e-01, 4.884e-02, 3.767e-01));
|
||||
r += mul(s0_5, M4(6.758e-02, -4.638e-02, 9.477e-02, -8.290e-02, -1.994e-01, 1.090e-01, -5.148e-02, -1.470e-01, 7.433e-02, 3.404e-01, 1.020e-01, -8.353e-02, 1.793e-01, -1.368e-01, 6.375e-02, 5.993e-02));
|
||||
r += mul(s0_6, M4(1.596e-02, 3.589e-02, 1.177e-02, 1.541e-01, -1.159e-01, -1.621e-02, 2.451e-01, 2.767e-01, -3.754e-04, 4.995e-02, -6.760e-02, -9.945e-02, 4.017e-01, 4.413e-02, 2.189e-02, 4.126e-02));
|
||||
r += mul(s0_7, M4(1.635e-01, -1.853e-01, -1.823e-01, -1.003e-01, -4.884e-02, 1.686e-01, 7.826e-02, 5.419e-01, -1.017e-01, 7.007e-02, 2.084e-01, 2.030e-01, 5.150e-01, -1.861e-01, -3.037e-01, -3.846e-01));
|
||||
r += mul(s0_8, M4(1.162e-01, 9.675e-02, -9.807e-02, 7.794e-02, 1.154e-01, 7.680e-02, 7.823e-02, 1.665e-01, 1.414e-01, 4.509e-02, -1.327e-02, 1.752e-01, -2.721e-01, -9.636e-04, 2.198e-02, -9.405e-02));
|
||||
// Group s1_*.
r += mul(s1_0, M4(-3.554e-02, 7.673e-02, -1.735e-02, 3.910e-02, -9.934e-02, 1.798e-01, -4.244e-02, -2.008e-02, -1.586e-01, 7.918e-02, 6.812e-02, 1.784e-01, -2.173e-01, 8.736e-02, -3.130e-02, -1.487e-02));
|
||||
r += mul(s1_1, M4(1.142e-01, 2.330e-02, -7.096e-03, 5.291e-02, -3.702e-01, 2.102e-01, 7.156e-02, -1.416e-01, 1.017e-01, 3.888e-01, -5.335e-02, 9.686e-02, -1.093e-01, -1.631e-02, -2.884e-03, -4.091e-02));
|
||||
r += mul(s1_2, M4(4.795e-02, 4.423e-03, 1.494e-02, 2.666e-02, 1.261e-01, -7.251e-02, 2.103e-02, 1.095e-01, 2.166e-01, -1.249e-01, 8.981e-03, 1.792e-01, -3.697e-02, 6.864e-03, -1.141e-02, 2.430e-02));
|
||||
r += mul(s1_3, M4(-1.206e-01, 1.584e-03, -1.789e-02, -1.335e-02, 2.398e-01, 8.681e-01, -1.241e-01, -4.454e-02, -7.396e-02, 1.759e-02, -9.138e-02, 1.573e-01, -2.025e-01, 8.569e-02, 2.132e-02, 9.791e-02));
|
||||
r += mul(s1_4, M4(-4.834e-02, -7.974e-01, 2.858e-01, -2.441e-01, 4.163e-01, -1.650e-01, -1.897e-01, 1.309e-01, 4.031e-02, -8.242e-02, 3.338e-01, 3.567e-01, -1.532e-01, 2.807e-01, -7.324e-02, 5.093e-03));
|
||||
r += mul(s1_5, M4(-1.538e-01, 9.244e-02, -7.570e-02, -4.333e-02, -1.407e-01, -4.201e-02, -4.186e-02, -1.603e-01, -2.031e-01, 6.309e-02, -8.191e-02, 9.121e-02, -8.138e-02, -4.037e-02, 3.793e-02, 4.240e-02));
|
||||
r += mul(s1_6, M4(1.780e-01, 1.059e-01, -5.233e-03, 1.087e-01, 1.808e-01, -1.409e-01, 1.162e-02, -1.312e-01, 6.866e-02, 1.401e-02, 6.420e-02, 5.614e-02, -6.830e-02, 1.731e-02, 5.889e-02, 2.257e-02));
|
||||
r += mul(s1_7, M4(2.057e-01, -2.093e-02, -1.741e-01, 9.891e-02, -3.673e-02, 3.314e-02, -2.223e-01, -3.177e-01, 2.374e-01, -5.871e-02, -5.086e-02, -9.418e-02, -1.935e-02, -1.902e-02, -1.255e-01, -2.744e-01));
|
||||
r += mul(s1_8, M4(1.654e-01, 7.328e-02, 2.874e-02, 1.256e-01, -2.608e-01, 1.926e-03, 4.500e-02, -7.882e-02, -1.035e-02, -3.478e-02, -1.061e-01, -8.474e-02, -2.438e-01, -6.889e-02, -7.579e-02, -1.871e-01));
|
||||
// Group s2_*.
r += mul(s2_0, M4(6.493e-02, 1.357e-01, -6.197e-02, -5.055e-02, 2.568e-01, -5.699e-02, -1.266e-01, -1.411e-02, 2.936e-02, -5.234e-02, -5.882e-03, -8.014e-02, -5.334e-02, -8.555e-02, 5.632e-02, 8.296e-03));
|
||||
r += mul(s2_1, M4(-3.582e-01, 2.351e-01, -1.636e-01, 2.172e-01, -1.840e-01, 9.838e-02, -7.565e-02, 1.535e-01, 8.151e-02, 3.002e-02, 1.149e-01, 1.180e-01, 1.323e-01, -7.682e-03, 5.013e-02, -2.190e-02));
|
||||
r += mul(s2_2, M4(-1.957e-01, -5.823e-02, -1.131e-01, -7.025e-02, 3.355e-01, 1.378e-01, -2.046e-01, 2.575e-01, 1.663e-01, 2.567e-02, -3.703e-02, -9.489e-02, -6.431e-02, -6.700e-02, 9.598e-02, 4.460e-03));
|
||||
r += mul(s2_3, M4(-1.522e-01, 1.335e-01, -2.140e-01, 3.368e-02, -5.076e-02, 2.412e-01, 6.141e-03, 2.456e-02, -9.105e-03, 1.014e-02, -1.056e-02, 1.368e-01, 8.030e-02, -2.874e-02, -7.499e-02, -2.675e-02));
|
||||
r += mul(s2_4, M4(2.115e-02, -6.849e-02, -8.528e-02, -3.270e-01, 2.112e-02, 7.309e-02, -3.852e-02, 2.604e-01, 1.772e-01, 4.115e-01, -2.443e-01, 3.100e-01, 3.139e-01, 3.829e-01, -2.701e-01, 1.463e-01));
|
||||
r += mul(s2_5, M4(2.664e-03, 4.352e-02, -2.378e-01, 5.316e-02, -1.369e-01, -1.293e-01, 1.587e-01, 2.153e-01, 3.820e-01, -1.515e-01, -4.429e-02, 2.391e-01, -3.720e-01, -1.154e-01, -1.196e-01, 3.172e-01));
|
||||
r += mul(s2_6, M4(-3.174e-01, -2.340e-01, 1.286e-01, -1.076e-01, 5.834e-02, 6.138e-02, -6.854e-03, 5.658e-02, 5.314e-02, -1.751e-02, 9.115e-03, 8.328e-03, 8.394e-03, 2.608e-02, 1.125e-01, 1.593e-01));
|
||||
r += mul(s2_7, M4(-6.600e-01, 1.899e-01, 1.094e-01, 1.665e-02, 1.089e-01, -1.034e-01, -1.811e-01, -3.040e-01, 4.782e-01, 3.160e-02, -4.648e-02, 1.286e-01, 1.070e-01, -1.022e-01, 5.693e-02, -5.195e-02));
|
||||
r += mul(s2_8, M4(3.748e-03, -4.142e-02, -7.021e-02, -2.596e-01, -2.444e-01, -6.341e-05, 4.125e-02, -7.382e-02, 4.456e-02, 3.144e-02, -5.055e-02, -1.724e-01, -1.835e-01, 4.462e-02, -1.398e-01, -2.631e-02));
|
||||
// Group s3_*.
r += mul(s3_0, M4(-1.892e-01, -2.298e-01, 7.045e-02, -6.423e-02, 7.789e-02, -9.540e-02, -3.161e-02, -5.171e-02, -3.656e-02, -6.148e-02, -1.413e-02, -8.995e-02, 2.536e-02, 1.995e-03, 3.317e-02, 1.918e-02));
|
||||
r += mul(s3_1, M4(1.245e-02, -4.971e-03, 1.026e-02, -7.525e-02, -2.233e-01, -4.502e-01, -4.530e-03, -1.802e-01, -1.799e-01, 1.915e-02, 1.043e-02, 4.008e-02, 1.524e-01, 1.881e-03, -7.387e-02, 1.566e-02));
|
||||
r += mul(s3_2, M4(-1.750e-01, 3.216e-03, -1.033e-03, -7.055e-02, -1.263e-01, 1.586e-01, 2.603e-02, -1.282e-01, 5.606e-02, -1.498e-02, -3.338e-02, -8.978e-03, -2.218e-02, -5.852e-02, -3.208e-03, -1.352e-02));
|
||||
r += mul(s3_3, M4(-9.577e-02, -8.859e-02, 7.921e-02, -1.569e-02, -7.962e-02, 2.890e-02, 4.107e-02, -5.870e-02, 2.510e-02, 1.765e-02, 4.458e-02, 1.891e-02, 7.541e-02, 3.492e-02, 3.160e-02, 1.201e-02));
|
||||
r += mul(s3_4, M4(-6.228e-02, 9.576e-02, -1.743e-01, -1.935e-01, 2.054e-01, 1.479e-01, 8.056e-04, 3.321e-02, -1.362e-01, 5.003e-01, 9.071e-02, 8.153e-02, 2.283e-01, -3.484e-01, 4.509e-02, -4.658e-01));
|
||||
r += mul(s3_5, M4(2.528e-01, -9.286e-04, -2.468e-02, 1.338e-01, 4.431e-02, 3.503e-02, 1.304e-01, 1.652e-01, 4.628e-01, -2.670e-01, 1.880e-01, 1.516e-01, -1.538e-01, 1.379e-01, -3.334e-02, 2.977e-02));
|
||||
r += mul(s3_6, M4(1.385e-01, -6.592e-02, -1.225e-01, -1.381e-01, -4.498e-02, -6.343e-03, 4.811e-02, 9.639e-02, 1.635e-02, -3.467e-02, 3.640e-03, -3.186e-02, 6.265e-02, 2.282e-01, 9.661e-02, 1.295e-01));
|
||||
r += mul(s3_7, M4(-3.053e-03, 7.999e-02, 2.407e-01, 2.655e-01, -3.969e-01, -9.502e-03, 1.900e-02, 9.557e-02, -6.199e-02, -3.574e-02, 8.350e-02, -7.837e-02, -1.442e-02, -5.281e-03, 4.503e-01, 4.026e-01));
|
||||
r += mul(s3_8, M4(-1.313e-01, 4.424e-02, -1.155e-02, 6.769e-02, 2.192e-02, 6.721e-02, 5.694e-03, 7.376e-02, -2.155e-01, -7.512e-02, 6.252e-03, -3.428e-01, 3.324e-01, 2.784e-03, -5.606e-02, 2.108e-01));
|
||||
// Constant term added once, independent of the samples.
r += V4(-6.039e-04, -3.875e-03, -3.020e-03, 2.282e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point: evaluates f0/f1 for one pixel and stores the results
// in t0 and t1.
//
// blockStart - added to the per-thread offset to form the pixel coordinate.
// tid        - thread id; only tid.x is used, remapped through Rmp8x8().
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are not referenced by name in this function;
	// presumably the O() macro behind l0/l1 (defined outside this function)
	// reads them. Confirm before renaming or removing either one.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads whose pixel lies outside the input do nothing.
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// 3x3 neighbourhood fetched through l0, enumerated row by row:
	// index 0 is offset (-1, -1), index 4 is the centre, index 8 is (1, 1).
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	// s1_* keeps the non-positive part of each l0 sample (-max(-s, 0)).
	// It must be computed before s0_* is overwritten below.
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	// s0_* is reduced in place to the non-negative part of each sample.
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	// Same neighbourhood and same split for the samples fetched through l1:
	// s2_* ends up non-negative, s3_* non-positive.
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_0 = max(s2_0, 0.0);
	s2_1 = max(s2_1, 0.0);
	s2_2 = max(s2_2, 0.0);
	s2_3 = max(s2_3, 0.0);
	s2_4 = max(s2_4, 0.0);
	s2_5 = max(s2_5, 0.0);
	s2_6 = max(s2_6, 0.0);
	s2_7 = max(s2_7, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both functions receive the same 36 vectors; each yields one V4 that is
	// written to its own output at this pixel.
	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t1[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
|
||||
|
||||
//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// l0/l1 fetch this pass's two inputs (t0 and t1) at the neighbour offset
// (x, y) and wrap the result in V4. O is defined elsewhere in the file;
// presumably it samples relative to the pos/pt locals of the pass function.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// First output of pass 4 (conv3): a weighted sum of the 36 input vectors
// plus a constant bias.
//
// As supplied by Pass4, sG_K is sample K (0..8, row by row over the 3x3
// neighbourhood, 4 = centre) of group G:
//   s0_* non-negative part of the l0 samples, s1_* their non-positive part,
//   s2_* non-negative part of the l1 samples, s3_* their non-positive part.
//
// Returns the accumulated V4. The M4/V4 constants are presumably trained
// network weights emitted by a generator; do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	// Group s0: non-negative part of the l0 samples.
	r += mul(s0_0, M4(8.947e-02, -1.234e-01, -3.169e-02, -9.158e-02, -1.406e-01, 6.941e-02, -1.367e-02, -1.406e-02, 9.073e-02, 5.642e-01, -2.007e-02, 9.725e-02, 7.122e-03, -1.956e-03, 6.532e-03, -5.457e-02));
	r += mul(s0_1, M4(-1.130e-01, -4.645e-02, 3.624e-02, 3.391e-02, 3.882e-01, 2.453e-01, -2.237e-01, -2.271e-01, 2.803e-01, 1.718e-01, 3.255e-02, -2.046e-01, 1.441e-01, -1.880e-03, 2.335e-02, -1.232e-01));
	r += mul(s0_2, M4(2.016e-01, 1.243e-01, -3.895e-02, -1.135e-01, -2.167e-02, 1.465e-02, -7.776e-02, -1.213e-01, -7.195e-03, 4.404e-03, 6.598e-02, -5.135e-02, -2.062e-01, -3.725e-02, -8.296e-03, 8.739e-03));
	r += mul(s0_3, M4(-2.068e-02, -3.876e-02, 5.737e-02, 9.886e-02, -9.663e-02, -2.569e-01, 6.761e-02, -1.454e-01, 4.660e-02, 7.810e-01, -2.254e-01, 1.899e-01, -9.628e-02, 8.080e-02, -1.093e-02, 1.451e-02));
	r += mul(s0_4, M4(3.133e-01, 2.759e-01, -9.917e-02, -3.134e-01, 1.137e-01, -5.446e-01, -2.044e-03, -5.215e-01, -6.867e-02, 5.254e-01, -1.466e-01, -3.048e-01, 3.408e-01, 5.791e-01, -2.594e-01, -4.879e-04));
	r += mul(s0_5, M4(6.871e-02, -1.221e-01, -5.702e-02, -2.731e-02, 6.025e-01, 1.350e-01, -3.119e-01, -4.130e-01, 2.091e-01, 1.003e-01, 4.509e-02, -1.541e-01, 1.151e-01, -1.558e-01, 6.309e-03, -2.192e-01));
	r += mul(s0_6, M4(2.139e-02, 1.540e-02, -9.451e-02, 8.898e-02, 1.983e-02, -1.259e-01, 2.162e-01, -9.477e-02, -2.253e-01, -1.456e-01, -2.432e-02, 9.649e-02, 2.147e-02, -9.523e-02, 2.042e-02, -7.790e-02));
	r += mul(s0_7, M4(-3.105e-03, 1.944e-01, -1.808e-01, -3.058e-02, 4.007e-01, 5.645e-01, -2.452e-01, -7.366e-02, 1.279e-02, 3.212e-02, -1.573e-01, -1.267e-01, 1.613e-02, -1.976e-01, -1.519e-01, -2.687e-02));
	r += mul(s0_8, M4(-1.906e-04, 8.306e-02, 2.480e-02, 1.696e-02, 1.275e-01, 1.372e-01, 1.205e-01, 1.120e-02, 1.424e-02, -1.526e-01, -6.629e-02, -9.104e-02, 2.042e-02, -1.167e-01, 1.050e-01, 1.560e-02));
	// Group s1: non-positive part of the l0 samples.
	r += mul(s1_0, M4(-2.398e-02, -1.009e-01, 2.671e-02, -8.841e-02, -7.277e-03, -4.411e-02, -1.240e-02, -5.367e-04, -1.223e-01, -7.251e-02, 4.941e-02, 7.545e-02, 6.688e-02, 1.727e-02, -1.144e-02, -7.713e-02));
	r += mul(s1_1, M4(-1.507e-01, -3.095e-01, 5.017e-02, -1.145e-01, 3.430e-02, -2.241e-01, -9.050e-02, -8.470e-02, -8.624e-02, -1.021e-02, -1.620e-02, 3.932e-03, 7.775e-02, -2.376e-02, 6.270e-02, -7.896e-02));
	r += mul(s1_2, M4(3.578e-02, -3.242e-02, 9.400e-03, -2.998e-02, -1.545e-02, -1.481e-01, -6.667e-02, 3.496e-02, 6.722e-02, 7.676e-04, -8.215e-04, 2.142e-03, 4.007e-02, 9.690e-02, -1.652e-03, 3.858e-02));
	r += mul(s1_3, M4(6.321e-02, -1.472e-01, 6.571e-02, -1.929e-01, -7.340e-02, -8.067e-02, 1.715e-02, 2.182e-02, -8.623e-02, -2.195e-01, -6.101e-02, 8.246e-02, -4.908e-02, -3.293e-02, -7.341e-02, -1.941e-01));
	r += mul(s1_4, M4(5.609e-01, 5.581e-01, -1.143e-01, -1.052e-01, 2.477e-01, 2.387e-01, 1.272e-01, 3.284e-03, -3.135e-01, 8.385e-02, -7.393e-02, -2.270e-01, 4.403e-01, -1.179e-01, -1.620e-01, 2.978e-01));
	r += mul(s1_5, M4(-3.015e-02, 1.055e-01, 1.072e-01, 1.177e-01, 3.838e-01, 3.206e-02, -4.556e-03, -5.072e-02, 4.250e-02, -1.665e-02, -1.759e-02, 2.822e-02, -2.408e-01, -2.204e-02, -3.440e-02, 6.520e-02));
	r += mul(s1_6, M4(9.180e-04, 3.395e-02, -1.211e-02, -5.605e-03, -7.356e-03, -2.439e-02, -2.498e-02, -6.361e-04, -5.167e-02, -1.009e-02, 7.202e-02, 3.652e-02, 3.036e-03, -7.672e-03, -2.822e-02, -9.942e-02));
	r += mul(s1_7, M4(-7.041e-02, -2.366e-01, -1.556e-01, 1.499e-01, -2.674e-02, 6.601e-03, -1.490e-01, 1.329e-02, -1.127e-01, 8.363e-03, -1.333e-01, 1.038e-02, -1.219e-02, -1.366e-01, 8.814e-02, 4.260e-03));
	r += mul(s1_8, M4(-1.397e-02, 2.863e-02, 5.459e-03, -1.166e-02, -1.201e-02, 1.346e-01, 5.461e-02, 1.584e-02, -8.155e-02, 8.451e-03, -3.444e-02, 3.920e-02, 2.082e-02, -4.174e-02, 6.205e-02, 5.646e-02));
	// Group s2: non-negative part of the l1 samples.
	r += mul(s2_0, M4(5.465e-02, 7.303e-02, 1.200e-01, 8.938e-03, -8.960e-02, -2.248e-01, -1.073e-02, 6.882e-02, 4.637e-02, -1.215e-01, -2.319e-02, -2.049e-01, -8.235e-02, -2.689e-02, 8.521e-02, 2.612e-02));
	r += mul(s2_1, M4(-1.284e-01, -8.509e-02, 6.859e-02, 2.538e-02, -7.401e-02, 2.860e-01, -2.240e-01, 1.754e-01, -2.073e-01, -9.333e-02, -9.310e-02, -3.311e-01, 2.251e-01, 1.948e-01, -1.091e-01, 2.448e-02));
	r += mul(s2_2, M4(4.550e-03, 2.884e-02, -1.023e-02, -1.793e-02, 1.472e-01, 1.728e-02, -5.533e-02, -4.606e-02, -1.128e-01, 1.845e-01, -9.297e-02, 7.245e-02, 2.303e-02, -1.293e-01, -2.277e-02, -1.523e-02));
	r += mul(s2_3, M4(5.703e-02, 4.629e-03, -7.495e-02, -7.220e-02, -1.245e-01, 1.142e-01, -1.688e-03, -9.906e-03, 9.714e-02, -2.851e-02, 7.069e-03, -3.250e-01, -5.029e-03, -1.421e-01, -4.162e-02, 1.032e-01));
	r += mul(s2_4, M4(5.200e-02, -3.414e-02, -3.809e-02, -9.742e-02, 8.686e-01, 1.140e+00, 2.062e-01, 8.598e-02, 4.073e-01, -3.313e-01, 2.673e-01, 1.050e-01, -9.355e-02, 1.764e-01, 8.423e-02, 1.156e-01));
	r += mul(s2_5, M4(-5.260e-03, 8.804e-02, 3.636e-02, 3.074e-03, 1.724e-01, 2.433e-01, -1.126e-02, -2.652e-01, -1.229e-01, 3.135e-02, 1.187e-02, -6.661e-02, -1.872e-02, -6.508e-02, -7.109e-02, 1.141e-01));
	r += mul(s2_6, M4(6.180e-03, 2.059e-03, -1.768e-02, 4.877e-03, -7.838e-02, 1.366e-01, -7.231e-02, -2.826e-02, 6.251e-02, 7.375e-02, 2.531e-02, 2.038e-02, -4.462e-03, -4.896e-02, -4.376e-02, -7.998e-03));
	r += mul(s2_7, M4(1.011e-01, 8.753e-02, -5.554e-02, 6.949e-04, 4.137e-02, 2.710e-01, -3.203e-01, 6.752e-02, 9.720e-02, 3.447e-02, -5.777e-02, -1.723e-02, -9.154e-03, 5.461e-02, 1.248e-01, -3.906e-04));
	r += mul(s2_8, M4(4.126e-02, 3.442e-02, 9.763e-03, -4.560e-02, -4.233e-04, -1.519e-01, 2.421e-02, -4.043e-02, -1.281e-02, 1.166e-02, 2.489e-04, -3.061e-02, -4.476e-02, 4.493e-03, -4.164e-02, 9.694e-03));
	// Group s3: non-positive part of the l1 samples.
	r += mul(s3_0, M4(-1.352e-01, -1.938e-01, 7.285e-02, -4.706e-02, 1.920e-02, 1.891e-02, 1.233e-02, 3.876e-02, 1.342e-02, 2.020e-01, 3.292e-02, 2.778e-02, -5.017e-02, 3.560e-02, 7.028e-02, 7.562e-03));
	r += mul(s3_1, M4(3.014e-01, 1.243e-01, -2.656e-02, -9.796e-02, 1.585e-01, 2.259e-01, -6.651e-02, 4.080e-02, 1.902e-01, 2.705e-01, -9.774e-02, -1.144e-02, -4.653e-01, -3.536e-01, 2.515e-02, 9.628e-02));
	r += mul(s3_2, M4(-7.724e-02, 1.181e-01, 2.182e-02, 1.999e-02, -7.114e-02, -4.414e-02, -5.748e-06, -8.931e-03, 4.985e-03, 6.360e-02, 4.422e-02, 6.005e-02, 1.335e-01, -8.144e-03, -3.979e-02, 6.952e-03));
	r += mul(s3_3, M4(-1.826e-03, 2.390e-02, 4.665e-03, -3.357e-02, 2.088e-02, 1.436e-01, -2.474e-02, 1.100e-02, 2.727e-02, -1.649e-02, -9.539e-02, -1.112e-01, -2.427e-02, 1.811e-01, -4.267e-02, 1.060e-01));
	r += mul(s3_4, M4(9.873e-02, -1.417e-01, -1.365e-01, -3.187e-01, -7.583e-03, 3.047e-01, -2.480e-02, 2.623e-01, -3.193e-01, -1.539e-01, -2.986e-01, 2.350e-01, 4.367e-01, 2.441e-01, -3.426e-01, -5.108e-02));
	r += mul(s3_5, M4(-8.384e-02, 1.343e-01, 1.653e-01, 7.978e-02, 6.329e-02, 7.040e-02, 2.203e-02, -2.280e-01, 2.531e-02, -9.408e-02, -5.137e-02, -1.717e-01, -1.577e-01, 4.030e-02, -2.802e-01, 1.155e-01));
	r += mul(s3_6, M4(1.895e-02, 1.436e-01, 3.568e-04, -6.075e-02, 3.213e-02, -3.462e-02, -2.821e-02, -8.374e-03, 3.451e-02, -2.349e-02, 9.517e-03, 2.092e-02, -8.229e-02, 6.530e-02, 6.116e-03, -2.414e-02));
	r += mul(s3_7, M4(1.530e-01, 2.073e-01, -7.258e-02, -6.975e-02, -6.610e-03, -3.885e-02, -5.636e-03, 1.227e-01, 8.913e-02, 4.336e-02, -1.931e-03, -3.869e-02, -2.019e-02, -1.340e-02, -1.506e-02, 1.591e-02));
	r += mul(s3_8, M4(2.536e-02, -3.220e-02, 6.413e-02, -1.835e-02, -9.124e-02, -8.098e-02, -5.479e-02, -1.361e-02, -3.146e-03, 1.204e-01, -4.020e-02, -6.924e-02, -1.030e-01, -1.301e-01, 1.634e-02, 1.029e-01));
	// Constant bias term.
	r += V4(8.385e-03, 1.035e-02, -6.465e-04, -6.502e-03);
	return r;
}
|
||||
|
||||
// Second output of pass 4 (conv3): same inputs and structure as f0 of this
// pass, with its own constants.
//
// As supplied by Pass4, sG_K is sample K (0..8, row by row over the 3x3
// neighbourhood, 4 = centre) of group G:
//   s0_* non-negative part of the l0 samples, s1_* their non-positive part,
//   s2_* non-negative part of the l1 samples, s3_* their non-positive part.
//
// Returns the accumulated V4. The M4/V4 constants are presumably trained
// network weights emitted by a generator; do not edit them by hand.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	// Group s0: non-negative part of the l0 samples.
	r += mul(s0_0, M4(-5.222e-02, -3.069e-02, 2.456e-03, -1.117e-02, 4.933e-02, 5.166e-02, -6.284e-03, -9.151e-02, 1.439e-02, 1.755e-02, -8.848e-02, -9.796e-02, -2.835e-02, 3.699e-02, 2.912e-02, 4.373e-02));
	r += mul(s0_1, M4(6.333e-03, -2.767e-02, -7.247e-02, 8.441e-02, -3.433e-02, -4.699e-02, -1.193e-02, -1.729e-01, 2.481e-02, -4.121e-02, -2.861e-01, -1.202e-02, 2.687e-02, -1.313e-01, 1.747e-02, -9.108e-02));
	r += mul(s0_2, M4(1.309e-02, -1.968e-02, -1.246e-01, -3.915e-02, -1.159e-01, -6.491e-03, 3.316e-01, -6.851e-02, -2.940e-02, -1.787e-02, -5.850e-03, -6.207e-02, 5.272e-02, 9.800e-02, 4.709e-02, 7.491e-02));
	r += mul(s0_3, M4(-1.127e-01, -3.748e-02, -1.091e-01, 1.788e-01, -7.982e-02, -7.528e-02, 1.898e-01, -1.355e-01, -1.568e-01, 9.648e-02, 2.337e-01, -9.666e-02, -7.316e-02, 2.915e-02, 2.259e-02, -1.310e-02));
	r += mul(s0_4, M4(1.689e-02, -1.028e-01, 1.304e-01, -6.012e-02, -8.030e-02, -1.823e-01, 4.179e-01, -3.553e-01, 9.095e-04, 9.972e-02, 3.227e-01, -4.967e-02, -2.329e-01, 1.272e-01, 4.332e-01, -8.456e-01));
	r += mul(s0_5, M4(1.815e-02, -5.743e-02, 7.236e-02, -8.782e-02, 1.161e-01, 2.258e-01, 7.053e-01, -2.993e-01, 6.605e-02, -2.666e-03, -4.733e-02, -1.087e-01, -1.101e-01, 1.554e-01, 1.656e-01, 2.530e-01));
	r += mul(s0_6, M4(-7.750e-02, -6.619e-02, 2.202e-02, 4.186e-02, -1.519e-01, -8.918e-03, -1.919e-01, -7.085e-02, -1.356e-01, -1.363e-01, 1.782e-01, -1.499e-02, 9.670e-02, 1.450e-03, 5.675e-02, -3.337e-02));
	r += mul(s0_7, M4(-9.267e-02, 1.661e-01, 1.306e-01, -2.387e-01, -2.261e-02, 2.870e-01, -2.711e-01, 6.281e-02, 2.181e-02, 1.010e-01, 2.979e-01, -9.254e-02, 1.307e-01, -2.024e-02, 2.013e-01, -1.862e-02));
	r += mul(s0_8, M4(-7.233e-02, 8.276e-02, 1.279e-02, -3.778e-02, -3.737e-01, -2.422e-01, -1.352e-01, -1.631e-01, 6.518e-02, 2.511e-01, 1.588e-01, -3.599e-02, 8.821e-02, 3.757e-02, -1.340e-01, 1.006e-01));
	// Group s1: non-positive part of the l0 samples.
	r += mul(s1_0, M4(1.034e-02, 8.194e-02, 9.844e-02, -1.052e-01, 4.683e-03, 4.432e-03, 8.420e-03, 7.511e-03, 7.210e-02, -8.697e-03, -9.834e-02, 1.366e-01, 3.221e-04, 1.836e-02, 1.307e-02, -6.823e-02));
	r += mul(s1_1, M4(-7.232e-02, 1.103e-01, 2.975e-01, 4.747e-02, -1.075e-01, -6.863e-02, 2.378e-01, -2.994e-02, 6.426e-02, 2.459e-02, -1.361e-01, 4.394e-02, 4.558e-02, -5.684e-02, -3.386e-02, 8.075e-02));
	r += mul(s1_2, M4(-1.568e-02, 6.463e-02, 4.001e-02, 3.549e-02, -3.385e-02, -1.547e-02, 2.510e-01, 3.198e-02, 2.533e-02, -6.612e-02, -5.453e-02, 1.387e-03, 3.071e-02, -5.115e-03, -9.345e-02, 1.790e-02));
	r += mul(s1_3, M4(1.723e-01, 2.119e-02, -3.394e-01, -1.101e-01, 7.882e-03, -4.188e-02, -6.882e-02, 5.060e-02, 4.902e-02, 2.919e-02, 7.773e-02, 1.080e-01, 8.944e-02, -2.819e-02, -1.252e-02, -2.744e-01));
	r += mul(s1_4, M4(2.682e-01, 8.840e-03, -3.974e-01, 2.436e-01, 1.156e-02, 3.806e-04, -5.090e-01, -1.339e-02, 1.677e-02, -1.337e-01, -1.050e-01, 2.647e-01, -1.971e-01, -1.145e-02, 1.471e-01, -7.814e-02));
	r += mul(s1_5, M4(-5.376e-02, 2.321e-02, -1.908e-01, -1.538e-01, 5.032e-03, 2.979e-02, -3.934e-02, -1.754e-01, 3.674e-02, 8.713e-03, -7.429e-02, -2.768e-03, -1.878e-01, -1.382e-01, 1.114e-01, 4.843e-02));
	r += mul(s1_6, M4(4.390e-03, 1.082e-02, 6.300e-03, -2.220e-02, -1.578e-02, -3.883e-02, 6.290e-02, 5.752e-03, 9.478e-02, 5.108e-03, 6.174e-02, 8.270e-02, -5.128e-02, -3.664e-02, 3.095e-02, -1.575e-01));
	r += mul(s1_7, M4(2.131e-01, 8.669e-03, 8.288e-02, 1.767e-01, -8.764e-02, -6.440e-03, 1.179e-01, -9.407e-02, -1.114e-01, -1.384e-01, 7.349e-02, 2.379e-02, 6.264e-02, -6.347e-02, -1.973e-01, 3.150e-02));
	r += mul(s1_8, M4(6.920e-02, 2.737e-01, 5.444e-02, -1.065e-01, -8.435e-02, 1.268e-01, -7.219e-03, -4.022e-02, -3.687e-02, -3.873e-02, 5.773e-02, 1.171e-02, 5.552e-02, -2.870e-02, -4.903e-02, 2.162e-02));
	// Group s2: non-negative part of the l1 samples.
	r += mul(s2_0, M4(-6.811e-02, 3.915e-02, -1.970e-02, 5.496e-02, -3.225e-02, -5.284e-02, -3.737e-03, -1.864e-03, -1.361e-01, -7.308e-02, -4.948e-02, -1.634e-01, 5.283e-02, 1.746e-02, -8.374e-02, 7.123e-02));
	r += mul(s2_1, M4(4.868e-03, 7.851e-02, 1.067e-01, 5.576e-02, 1.276e-01, -7.837e-02, -2.875e-01, 3.754e-02, -1.315e-01, -9.095e-02, 8.041e-02, -1.156e-01, 1.309e-02, 1.086e-01, -1.335e-01, 9.059e-02));
	r += mul(s2_2, M4(-1.092e-02, 1.501e-01, -3.542e-02, 2.500e-02, 1.500e-02, -1.832e-01, -3.447e-01, -2.562e-02, -1.110e-01, 1.362e-01, 1.634e-01, -5.146e-02, -1.184e-02, -1.154e-01, 4.862e-02, 1.344e-03));
	r += mul(s2_3, M4(3.103e-02, -2.009e-02, 2.266e-02, 5.094e-02, 5.909e-01, 1.844e-01, -3.418e-02, -1.460e-01, 1.218e-02, -3.631e-02, -2.582e-01, -2.230e-01, 9.666e-02, -6.432e-02, 7.267e-02, 7.577e-02));
	r += mul(s2_4, M4(8.062e-02, -3.981e-02, -3.232e-02, -1.032e-01, -9.859e-02, 6.539e-01, 5.533e-01, -1.046e-02, -5.348e-01, 1.009e-02, -3.879e-01, 1.190e-01, -1.151e-01, 1.835e-01, -7.797e-02, 1.418e-01));
	r += mul(s2_5, M4(-1.404e-02, -1.730e-01, -4.516e-02, -2.158e-02, 2.544e-01, 4.463e-01, 1.404e-01, -6.854e-02, -9.712e-02, -4.920e-01, -2.485e-02, -6.416e-02, 3.612e-02, 2.451e-01, 2.327e-02, -1.251e-03));
	r += mul(s2_6, M4(6.507e-02, -2.267e-02, -7.660e-02, 3.043e-02, 3.541e-01, 2.804e-01, 2.783e-01, -2.580e-01, -1.185e-01, 8.028e-02, -1.395e-01, -4.988e-03, 4.702e-02, -5.327e-02, 4.580e-02, 3.130e-03));
	r += mul(s2_7, M4(9.806e-02, 6.990e-02, -4.317e-02, -2.415e-02, -2.263e-01, -1.723e-01, 2.669e-02, -3.393e-01, 9.368e-02, -6.775e-02, -1.883e-01, -8.601e-02, -2.278e-01, 1.612e-01, 1.625e-01, 8.821e-02));
	r += mul(s2_8, M4(-1.921e-02, 1.119e-01, 3.717e-02, -2.554e-02, 2.852e-02, 8.987e-02, 1.246e-01, 6.463e-03, 2.548e-02, -2.950e-02, 7.289e-02, 1.802e-02, 2.576e-02, 5.798e-02, 6.021e-02, -5.030e-03));
	// Group s3: non-positive part of the l1 samples.
	r += mul(s3_0, M4(-1.023e-01, -3.759e-02, -2.437e-02, 1.032e-01, -2.143e-02, -4.189e-02, -6.139e-02, 9.887e-02, -9.094e-03, 3.087e-02, -1.056e-01, 1.376e-01, 1.702e-02, 3.138e-02, -1.243e-01, -5.115e-02));
	r += mul(s3_1, M4(3.439e-02, -1.018e-01, -3.260e-01, 6.226e-02, 3.794e-02, -6.747e-02, -1.743e-01, -9.149e-02, 6.116e-02, -3.539e-02, -3.971e-01, -2.458e-02, -1.436e-01, 4.323e-02, 5.595e-01, 1.160e-01));
	r += mul(s3_2, M4(-7.596e-02, -9.502e-02, -1.112e-02, -7.256e-02, -1.625e-02, -1.013e-01, -7.450e-02, 2.969e-03, -1.481e-02, -1.199e-01, -8.230e-02, 2.952e-02, -3.199e-02, 8.852e-02, -1.541e-02, 1.722e-02));
	r += mul(s3_3, M4(2.768e-03, -9.600e-02, 1.333e-01, -1.174e-01, -7.190e-02, 1.265e-02, 8.135e-02, -6.909e-03, 9.249e-02, -2.800e-02, 2.029e-01, -1.212e-02, 9.955e-02, -2.791e-02, -1.172e-01, 2.079e-01));
	r += mul(s3_4, M4(-1.948e-01, -1.936e-01, 5.127e-01, -7.970e-02, -1.135e-01, 1.060e-01, 1.226e-01, -3.195e-01, -4.980e-01, -5.665e-03, 3.167e-01, -2.413e-01, 2.036e-01, 1.519e-01, 7.793e-04, -1.316e-01));
	r += mul(s3_5, M4(-8.284e-02, -1.590e-01, 5.041e-03, -2.936e-02, 1.485e-01, 8.341e-02, -3.804e-02, 3.576e-02, 1.499e-01, -8.989e-02, 7.085e-02, -4.898e-02, 1.070e-01, 5.825e-02, 1.863e-01, -9.850e-03));
	r += mul(s3_6, M4(-3.057e-01, 2.794e-02, -7.737e-02, -4.168e-02, 2.696e-02, 1.279e-02, 2.638e-02, 8.177e-02, 1.217e-01, 2.531e-02, -1.188e-01, 1.018e-01, -5.486e-02, -6.606e-03, 1.868e-01, -1.050e-01));
	r += mul(s3_7, M4(-3.018e-01, -1.795e-01, -1.578e-01, -1.809e-01, 1.241e-01, -4.960e-02, -1.067e-01, -1.004e-02, -8.835e-02, 6.620e-02, 1.309e-01, -1.399e-01, 4.651e-02, 4.837e-02, -9.106e-02, 1.670e-01));
	r += mul(s3_8, M4(1.081e-02, -9.947e-02, 1.643e-02, -2.769e-02, 9.803e-02, -8.389e-02, -2.782e-02, -2.689e-02, 3.693e-02, -3.436e-03, 1.229e-02, -2.929e-02, -1.751e-01, -5.859e-03, 1.543e-01, 8.225e-02));
	// Constant bias term.
	r += V4(-2.994e-04, -3.163e-05, 4.528e-03, -1.285e-02);
	return r;
}
|
||||
|
||||
// Pass 4 entry point (conv3): reads t0/t1 through l0/l1, evaluates f0/f1 for
// one pixel and stores the results in t2 and t3.
//
// blockStart - added to the per-thread offset to form the pixel coordinate.
// tid        - thread id; only tid.x is used, remapped through Rmp8x8().
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt and pos are not referenced by name in this function;
	// presumably the O() macro behind l0/l1 reads them. Confirm before
	// renaming or removing either one.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads whose pixel lies outside the input do nothing.
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// 3x3 neighbourhood of t0 (via l0), enumerated row by row:
	// index 0 is offset (-1, -1), index 4 is the centre, index 8 is (1, 1).
	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	// s1_* keeps the non-positive part of each t0 sample (-max(-s, 0)).
	// It must be computed before s0_* is overwritten below.
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	// s0_* is reduced in place to the non-negative part of each sample.
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	// Same neighbourhood and same split for t1 (via l1):
	// s2_* ends up non-negative, s3_* non-positive.
	V4 s2_0 = l1(-1.0, -1.0);
	V4 s2_1 = l1(0.0, -1.0);
	V4 s2_2 = l1(1.0, -1.0);
	V4 s2_3 = l1(-1.0, 0.0);
	V4 s2_4 = l1(0.0, 0.0);
	V4 s2_5 = l1(1.0, 0.0);
	V4 s2_6 = l1(-1.0, 1.0);
	V4 s2_7 = l1(0.0, 1.0);
	V4 s2_8 = l1(1.0, 1.0);
	V4 s3_0 = -max(-s2_0, 0.0);
	V4 s3_1 = -max(-s2_1, 0.0);
	V4 s3_2 = -max(-s2_2, 0.0);
	V4 s3_3 = -max(-s2_3, 0.0);
	V4 s3_4 = -max(-s2_4, 0.0);
	V4 s3_5 = -max(-s2_5, 0.0);
	V4 s3_6 = -max(-s2_6, 0.0);
	V4 s3_7 = -max(-s2_7, 0.0);
	V4 s3_8 = -max(-s2_8, 0.0);
	s2_0 = max(s2_0, 0.0);
	s2_1 = max(s2_1, 0.0);
	s2_2 = max(s2_2, 0.0);
	s2_3 = max(s2_3, 0.0);
	s2_4 = max(s2_4, 0.0);
	s2_5 = max(s2_5, 0.0);
	s2_6 = max(s2_6, 0.0);
	s2_7 = max(s2_7, 0.0);
	s2_8 = max(s2_8, 0.0);

	// Both functions receive the same 36 vectors; each yields one V4 that is
	// written to its own output texture at this pixel.
	t2[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[gxy] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}
|
||||
|
||||
//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// l0/l1 fetch this pass's two inputs (t2 and t3) at the neighbour offset
// (x, y) and wrap the result in V4. O is defined elsewhere in the file;
// presumably it samples relative to the pos/pt locals of the pass function.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))
|
||||
|
||||
// First output of pass 5 (conv4): a weighted sum of the 36 input vectors
// plus a constant bias.
//
// Parameters are named sG_K (group G = 0..3, sample K = 0..8). The caller of
// this pass is not shown next to this function; presumably it follows the
// same convention as Pass4 (s0/s1 = non-negative / non-positive part of the
// l0 samples, s2/s3 = the same for l1, K enumerating the 3x3 neighbourhood
// row by row) - TODO confirm against Pass5.
//
// Returns the accumulated V4. The M4/V4 constants are presumably trained
// network weights emitted by a generator; do not edit them by hand.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	// Group s0.
	r += mul(s0_0, M4(5.681e-02, -7.933e-02, -1.161e-02, -3.257e-02, -1.507e-02, 2.248e-02, -1.351e-02, 2.789e-02, -1.713e-01, 9.482e-02, 2.715e-02, 9.506e-02, 1.714e-01, -1.090e-01, -7.237e-02, -1.563e-01));
	r += mul(s0_1, M4(-9.622e-03, -7.774e-04, -4.095e-02, 1.106e-02, -3.592e-02, -4.358e-02, 1.983e-02, -1.134e-02, -1.313e-02, -1.086e-01, 1.102e-01, -3.091e-01, 1.982e-01, 1.438e-01, -6.038e-02, 9.579e-02));
	r += mul(s0_2, M4(-3.893e-02, 1.554e-02, -7.763e-05, 1.610e-02, 3.470e-03, 9.915e-03, -9.881e-03, 5.331e-02, -9.152e-02, 6.899e-02, -3.615e-02, 1.558e-01, -3.300e-02, 4.493e-02, 2.148e-02, -3.677e-02));
	r += mul(s0_3, M4(1.939e-01, -7.700e-02, -1.449e-01, -1.942e-02, 9.649e-02, -3.580e-03, -1.767e-02, 2.394e-02, -1.299e-01, 1.160e-01, 8.000e-02, 9.737e-02, 2.751e-01, -4.435e-01, 1.013e-01, -1.782e-01));
	r += mul(s0_4, M4(-2.745e-01, 2.922e-01, -2.008e-01, 1.636e-01, -4.843e-02, 4.172e-01, 3.097e-02, 3.326e-01, -1.798e-02, -3.860e-01, 3.246e-02, 4.225e-01, -1.057e-01, 2.302e-01, -7.879e-02, 4.832e-02));
	r += mul(s0_5, M4(-6.834e-04, -3.372e-02, -9.351e-02, 1.547e-02, 5.621e-02, -1.195e-02, -9.402e-03, 6.439e-02, 8.787e-02, 1.499e-02, 1.928e-01, 6.693e-02, 6.516e-02, -1.145e-01, -6.610e-02, 3.986e-02));
	r += mul(s0_6, M4(7.682e-02, -9.222e-02, 1.566e-01, -1.438e-02, 5.080e-02, -2.762e-02, -3.121e-02, -1.242e-02, 2.046e-02, -1.131e-02, 4.555e-02, -3.006e-02, 1.125e-01, -7.883e-02, 1.063e-01, 3.027e-03));
	r += mul(s0_7, M4(-1.395e-01, 4.847e-02, 1.605e-01, 1.363e-01, 6.243e-02, -1.464e-02, 3.336e-02, -8.862e-02, 3.286e-02, -2.398e-02, -2.326e-02, -8.408e-02, 1.274e-01, -4.997e-02, 1.548e-01, -8.650e-02));
	r += mul(s0_8, M4(4.236e-02, 3.116e-02, 7.690e-02, 3.084e-02, 6.290e-03, 1.016e-02, 7.155e-02, -9.786e-02, -1.453e-02, -4.564e-04, -3.654e-02, 7.179e-03, -2.110e-02, -2.766e-02, 1.022e-01, -6.664e-02));
	// Group s1.
	r += mul(s1_0, M4(-2.814e-02, 6.473e-02, 5.209e-02, 6.202e-02, -1.898e-02, 6.061e-02, -1.557e-02, 3.561e-02, 2.137e-01, -1.913e-01, 2.387e-03, -1.470e-01, 4.553e-02, -3.358e-02, 1.936e-03, -4.798e-02));
	r += mul(s1_1, M4(4.947e-03, -8.431e-02, -3.362e-03, -1.057e-01, -6.735e-02, 8.463e-03, -4.622e-02, -2.022e-02, -1.450e-01, -1.687e-03, -1.541e-02, -1.116e-02, 4.447e-02, 5.088e-02, -7.198e-03, 3.279e-02));
	r += mul(s1_2, M4(1.202e-03, -2.591e-02, -5.357e-03, -3.844e-02, -7.403e-03, 3.771e-02, -6.171e-02, 8.820e-02, 6.744e-03, -4.156e-02, -1.377e-02, 9.398e-02, -2.643e-02, 4.991e-02, -2.000e-02, 1.056e-02));
	r += mul(s1_3, M4(3.923e-01, 3.525e-02, -1.294e-01, 1.478e-02, 9.667e-02, 1.289e-01, 8.960e-02, 1.946e-02, 3.128e-01, -3.315e-01, -3.019e-01, 1.021e-01, 2.095e-01, -1.488e-01, -9.439e-02, -9.635e-02));
	r += mul(s1_4, M4(-3.641e-01, -9.985e-02, -3.482e-01, -2.646e-01, -5.257e-01, 9.475e-01, 1.714e-01, 5.842e-01, -2.199e-01, -6.131e-02, -4.597e-01, 5.556e-01, 7.933e-02, -2.150e-01, -3.469e-01, -1.978e-01));
	r += mul(s1_5, M4(7.883e-05, -2.207e-02, -1.735e-02, 2.167e-02, 4.628e-02, 8.814e-02, -4.837e-02, 6.515e-02, 1.617e-01, -4.460e-02, -1.002e-01, 7.496e-02, -1.180e-01, 5.540e-02, -5.708e-02, 5.715e-02));
	r += mul(s1_6, M4(1.680e-01, -5.262e-02, 6.143e-02, -4.758e-02, -5.343e-02, 4.332e-02, 1.191e-01, 8.545e-03, 1.171e-01, -8.169e-02, 1.535e-02, -2.281e-01, 8.009e-02, -9.744e-02, 6.114e-02, 8.379e-03));
	r += mul(s1_7, M4(-9.744e-02, 2.573e-02, 6.125e-02, 1.265e-01, 9.253e-02, -1.227e-01, 3.224e-01, -2.402e-01, 1.083e-01, 1.607e-02, 1.155e-01, -4.014e-01, -2.347e-02, -3.821e-02, 2.379e-01, 2.605e-02));
	r += mul(s1_8, M4(5.428e-02, -5.434e-02, -2.345e-02, -2.189e-03, 1.274e-02, 7.503e-02, 1.442e-01, -8.839e-02, -3.480e-02, 1.444e-02, -3.859e-02, -1.089e-01, -3.183e-02, 9.172e-02, 1.092e-01, 6.688e-02));
	// Group s2.
	r += mul(s2_0, M4(2.283e-01, 3.872e-02, -5.533e-02, -1.704e-02, -1.533e-02, 1.459e-02, 3.842e-02, 6.367e-02, -4.041e-02, -6.411e-03, -5.052e-03, -8.331e-03, 2.786e-03, -5.502e-02, 6.695e-03, -1.982e-02));
	r += mul(s2_1, M4(-4.716e-01, 4.092e-01, -1.581e-01, 4.209e-01, 1.255e-01, -7.138e-02, 7.300e-02, -1.357e-01, -6.908e-02, -1.986e-02, 1.801e-02, -4.505e-02, -1.611e-01, -1.216e-01, -6.522e-02, -9.093e-02));
	r += mul(s2_2, M4(1.019e-01, -3.650e-02, 1.353e-02, 2.487e-01, -1.344e-04, 4.653e-02, 1.721e-02, 4.005e-02, 7.572e-03, -4.357e-02, -3.720e-02, 2.091e-02, 6.051e-03, -6.957e-02, -9.009e-02, -1.788e-02));
	r += mul(s2_3, M4(2.159e-02, -3.325e-02, 3.084e-02, 1.091e-01, -9.662e-02, 1.040e-01, 1.078e-01, -2.572e-02, 2.237e-04, -2.571e-02, -2.335e-02, -1.554e-02, 1.275e-01, -4.579e-02, -1.772e-02, 3.282e-02));
	r += mul(s2_4, M4(4.984e-02, 2.302e-01, 6.568e-02, 1.279e-01, 6.857e-02, -1.499e-01, -4.461e-02, -1.977e-01, -1.903e-01, 1.430e-01, 3.271e-02, 1.978e-01, 2.410e-01, 5.980e-01, -1.394e-01, 2.261e-01));
	r += mul(s2_5, M4(2.188e-02, -8.976e-03, 2.475e-02, 1.340e-02, -4.458e-02, 5.360e-02, 2.628e-02, -1.405e-02, 6.166e-02, -4.895e-02, 1.348e-03, 5.680e-02, -1.123e-01, 7.224e-02, -6.458e-02, 1.314e-01));
	r += mul(s2_6, M4(3.252e-02, -2.389e-02, -2.067e-02, -6.871e-02, -8.327e-02, 7.793e-02, 7.681e-03, 5.095e-02, -1.693e-02, -3.622e-02, 3.065e-02, -1.582e-02, -6.963e-03, 2.835e-02, 6.805e-02, -1.475e-02));
	r += mul(s2_7, M4(4.783e-02, -2.945e-02, 4.732e-02, -9.789e-04, -1.619e-02, -2.603e-02, -1.368e-01, 2.956e-02, 9.844e-02, -1.214e-01, 1.776e-01, -1.461e-01, -5.165e-02, -1.055e-02, 1.793e-01, -4.355e-02));
	r += mul(s2_8, M4(2.619e-03, 4.801e-02, 6.393e-02, -2.399e-02, -1.280e-03, -2.210e-02, -4.649e-02, 1.561e-03, -1.789e-02, 5.576e-02, 1.200e-01, 3.338e-03, 4.475e-02, -2.957e-02, 9.300e-02, -7.837e-02));
	// Group s3.
	r += mul(s3_0, M4(-1.536e-01, -3.593e-03, -1.064e-02, 1.740e-02, 9.197e-02, 2.772e-01, 5.258e-01, 5.745e-01, 2.331e-02, 8.995e-02, 2.611e-02, 5.463e-02, 4.872e-02, -8.230e-03, -1.742e-02, 3.405e-03));
	r += mul(s3_1, M4(4.799e-02, 1.088e-01, -7.562e-02, 5.926e-02, 4.190e-01, -4.922e-01, -1.822e-01, -2.309e-01, 1.776e-01, 1.799e-01, 1.213e-01, 3.198e-01, -1.565e-01, 2.118e-02, -5.914e-02, 1.048e-01));
	r += mul(s3_2, M4(-6.867e-02, -2.488e-02, 2.563e-02, -3.161e-02, -4.038e-02, 5.042e-02, 2.474e-02, 3.962e-03, -4.263e-02, 4.382e-02, -6.197e-03, 5.435e-02, 8.477e-02, -7.694e-02, -2.473e-02, -2.000e-02));
	r += mul(s3_3, M4(-6.567e-02, 7.271e-02, -2.275e-02, -4.345e-03, -4.825e-02, -7.541e-01, 5.163e-01, 9.170e-01, -1.040e-01, -9.911e-03, 3.569e-02, 2.347e-01, 2.350e-02, 6.202e-02, 7.421e-03, 2.377e-02));
	r += mul(s3_4, M4(-3.371e-02, -2.738e-02, 1.670e-01, 2.607e-01, -5.009e-02, 5.743e-03, -6.991e-01, -2.858e-02, -6.907e-02, -4.016e-01, 3.462e-01, 9.128e-01, -1.622e-01, 1.392e-01, 2.250e-01, 1.183e-01));
	r += mul(s3_5, M4(-8.330e-03, 1.029e-01, 1.045e-01, 2.013e-01, 2.609e-02, 7.939e-02, -1.054e-01, 6.487e-02, 1.165e-01, -6.250e-02, 1.274e-01, 2.396e-01, 2.390e-01, -2.468e-01, 1.178e-02, 6.794e-02));
	r += mul(s3_6, M4(5.411e-02, -5.669e-02, 2.831e-02, -3.762e-02, 1.186e-01, 1.750e-01, -2.862e-01, -9.876e-02, 5.851e-02, 2.750e-02, 7.348e-03, -2.151e-01, -3.151e-02, 5.225e-02, 3.178e-02, 1.438e-02));
	r += mul(s3_7, M4(-2.053e-03, 2.875e-02, -4.633e-02, -7.843e-02, -5.216e-02, -1.497e-04, -2.534e-01, -5.098e-01, 3.092e-02, -4.215e-02, -1.330e-01, -9.137e-02, 5.062e-02, 5.514e-02, -1.958e-01, 6.162e-03));
	r += mul(s3_8, M4(3.627e-02, 1.482e-02, 2.228e-02, -7.151e-02, -1.770e-02, 6.009e-02, 2.013e-01, 2.403e-02, 1.912e-03, -9.001e-03, 1.673e-02, -3.465e-02, 5.222e-02, -3.027e-02, -4.458e-03, -6.391e-02));
	// Constant bias term.
	r += V4(-3.261e-03, 1.350e-04, -6.605e-05, 1.307e-04);
	return r;
}
|
||||
|
||||
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-5.528e-02, 2.435e-02, -2.728e-02, 5.042e-02, -2.357e-02, 1.752e-02, 6.730e-02, -1.869e-02, 5.562e-02, 2.108e-03, -2.535e-02, -7.791e-02, -6.984e-02, 8.842e-02, 7.203e-02, 3.709e-02));
|
||||
r += mul(s0_1, M4(-6.164e-02, -1.824e-02, 8.179e-02, -3.238e-02, 5.338e-02, -5.506e-02, -1.020e-01, 1.520e-02, 1.953e-01, -2.850e-02, 8.323e-02, -8.899e-02, 5.112e-02, 6.369e-02, -5.510e-02, 1.997e-02));
|
||||
r += mul(s0_2, M4(6.117e-02, -1.311e-02, -9.258e-03, -1.479e-02, -2.710e-02, 2.958e-02, 2.946e-02, -9.472e-03, 4.257e-02, -7.053e-02, -5.896e-02, 5.475e-02, 6.131e-02, -1.827e-02, -2.909e-02, -6.470e-02));
|
||||
r += mul(s0_3, M4(-1.411e-01, 1.597e-01, 2.142e-01, 6.972e-02, 1.704e-02, 4.423e-02, -8.405e-02, 4.993e-02, 1.176e-02, -8.471e-02, 4.062e-02, -1.001e-01, -3.805e-02, 3.820e-02, -6.258e-01, 2.568e-01));
|
||||
r += mul(s0_4, M4(3.384e-01, -2.619e-01, 1.799e-01, -3.175e-01, 3.472e-03, -1.186e-01, 7.886e-02, -1.126e-01, 1.378e-01, -3.772e-02, -1.396e-02, 6.889e-02, -1.383e-01, 1.958e-01, 7.297e-02, -1.066e+00));
|
||||
r += mul(s0_5, M4(-4.115e-04, 8.733e-03, 3.432e-02, 5.650e-02, 9.203e-02, 6.899e-02, -9.987e-03, 5.139e-02, 2.075e-01, -1.229e-02, 5.912e-02, -2.866e-02, -1.602e-01, 1.654e-01, 6.957e-02, 5.472e-02));
|
||||
r += mul(s0_6, M4(-1.000e-01, 9.401e-02, -3.864e-02, 1.160e-01, 1.108e-03, 8.814e-02, 6.570e-04, 2.167e-02, 6.762e-05, -1.080e-02, -1.670e-02, -4.178e-03, -9.704e-03, 2.164e-01, 3.748e-02, -1.258e-02));
|
||||
r += mul(s0_7, M4(7.557e-02, -2.360e-01, -2.727e-02, -7.688e-02, -3.110e-02, 1.671e-02, -4.238e-02, 5.553e-02, 6.518e-02, 3.357e-02, -2.725e-02, -2.524e-02, -1.352e-01, -1.005e-01, -4.108e-02, 2.664e-01));
|
||||
r += mul(s0_8, M4(9.624e-02, 5.754e-03, 8.412e-02, -2.955e-02, 2.850e-02, 8.830e-03, -4.162e-02, -1.337e-02, -4.374e-02, -2.352e-02, -1.566e-02, 1.822e-02, 7.979e-02, -9.058e-02, -1.071e-01, -3.379e-03));
|
||||
r += mul(s1_0, M4(1.395e-02, 1.801e-02, 1.899e-03, -3.313e-02, 2.251e-02, -3.697e-03, 5.577e-02, -3.001e-02, -6.090e-02, 1.645e-01, -1.047e-01, 1.483e-01, -6.634e-03, 3.917e-04, -1.999e-02, 2.114e-02));
|
||||
r += mul(s1_1, M4(2.859e-03, 5.455e-02, 4.336e-02, -2.717e-02, 9.302e-02, -9.807e-02, 7.046e-02, -3.707e-02, -1.275e-01, -3.463e-02, -1.160e-01, -4.227e-02, 3.162e-02, 3.583e-02, 4.579e-02, -1.196e-02));
|
||||
r += mul(s1_2, M4(-7.086e-03, 2.542e-03, 1.500e-03, -6.273e-03, 5.711e-02, -5.317e-02, -5.455e-03, 4.847e-02, 8.830e-02, 5.991e-02, 3.356e-02, 1.214e-03, -5.272e-03, -5.211e-02, -2.142e-02, -1.246e-02));
|
||||
r += mul(s1_3, M4(-4.807e-02, 4.530e-02, 2.719e-01, -1.035e-02, 4.911e-02, -5.824e-03, -6.478e-02, -1.051e-03, -1.348e-02, 6.405e-01, -4.257e-01, 3.690e-01, -9.665e-02, 2.101e-01, 6.571e-02, 9.738e-02));
|
||||
r += mul(s1_4, M4(2.423e-01, -2.074e-01, -4.394e-01, -2.830e-02, 5.415e-02, -2.337e-01, 6.080e-01, -1.843e-01, -5.128e-01, 1.559e-01, -2.033e-01, -6.040e-02, -6.726e-02, 2.589e-01, 1.901e-01, -9.598e-02));
|
||||
r += mul(s1_5, M4(-1.456e-01, 6.484e-02, 1.125e-01, -1.183e-02, 2.186e-01, 2.930e-02, -4.285e-02, 6.272e-02, 1.500e-01, 1.033e-01, 2.173e-01, -3.328e-02, -6.785e-02, -7.882e-02, -1.450e-01, 7.182e-02));
|
||||
r += mul(s1_6, M4(-4.062e-02, 9.988e-02, -5.106e-02, 1.546e-01, 5.122e-02, -7.398e-02, -5.320e-03, -5.669e-02, -4.188e-02, 2.035e-01, -5.253e-02, -7.554e-03, -6.233e-02, 1.285e-01, 1.152e-02, 7.495e-02));
|
||||
r += mul(s1_7, M4(1.168e-01, -1.061e-01, -8.798e-02, -2.456e-01, -1.274e-01, -9.338e-02, 6.064e-04, 1.255e-01, 2.944e-02, -9.599e-02, -1.606e-01, 1.477e-01, -5.541e-02, -9.992e-02, -5.652e-02, 1.402e-02));
|
||||
r += mul(s1_8, M4(-8.447e-02, -2.272e-02, 3.291e-02, 1.141e-01, 2.835e-01, 2.747e-02, 9.338e-03, -1.271e-01, 1.118e-03, -3.543e-02, -3.201e-02, 5.803e-02, 1.793e-01, -6.889e-02, -3.139e-02, -1.000e-01));
|
||||
r += mul(s2_0, M4(3.477e-02, 8.152e-03, -8.100e-03, 3.869e-02, 4.675e-02, 8.080e-02, -4.909e-02, 6.764e-03, -2.946e-03, -7.021e-02, -1.191e-02, -1.660e-02, -5.967e-02, -1.872e-02, -3.485e-02, 3.391e-02));
|
||||
r += mul(s2_1, M4(1.685e-01, -2.681e-01, -2.340e-01, -1.748e-01, -1.593e-01, 7.496e-02, 3.748e-02, 1.562e-02, 5.150e-02, -3.648e-02, 3.739e-02, -4.384e-02, -1.521e-02, -1.061e-01, -1.381e-01, 1.733e-02));
|
||||
r += mul(s2_2, M4(1.573e-01, 1.415e-01, 1.714e-01, -5.175e-02, -2.442e-02, 1.054e-02, 3.047e-03, -5.944e-03, -6.027e-03, 1.034e-02, -3.381e-02, 4.299e-02, -9.763e-02, 4.729e-02, 9.642e-02, -1.450e-02));
|
||||
r += mul(s2_3, M4(8.191e-03, 1.353e-01, -6.018e-02, 5.677e-02, -1.725e-02, -1.324e-01, 1.646e-01, -1.154e-01, -9.796e-03, 3.066e-02, -5.975e-02, 2.878e-02, -1.381e-01, 1.550e-01, 3.556e-02, 8.926e-02));
|
||||
r += mul(s2_4, M4(1.715e-01, -2.115e-02, 8.179e-02, -2.066e-01, 1.275e-01, 1.599e-01, 2.325e-02, -9.637e-03, 6.565e-02, -1.901e-01, 7.185e-02, -1.559e-01, 1.106e-01, -6.210e-02, -3.672e-01, 6.248e-02));
|
||||
r += mul(s2_5, M4(-3.453e-03, 5.284e-02, -1.031e-01, 5.091e-02, 1.538e-02, -9.971e-02, -5.610e-02, -2.585e-02, 6.441e-02, 1.113e-01, 3.085e-02, 6.860e-02, -6.167e-02, -6.774e-02, -6.898e-02, -4.397e-03));
|
||||
r += mul(s2_6, M4(-1.561e-02, 5.106e-02, 2.999e-03, -7.663e-03, 6.665e-02, -1.217e-01, -9.529e-03, -2.096e-02, -2.825e-02, 4.854e-02, -2.196e-02, -7.191e-03, 2.274e-03, 1.698e-02, -1.727e-02, 1.967e-03));
|
||||
r += mul(s2_7, M4(3.534e-02, -1.077e-02, 1.607e-02, 4.542e-02, -7.989e-02, 1.294e-01, 4.920e-02, -6.332e-02, -9.402e-02, 2.028e-02, -6.305e-03, 9.061e-02, 2.225e-03, 2.352e-02, -4.032e-03, -4.985e-02));
|
||||
r += mul(s2_8, M4(7.112e-02, -1.427e-02, -2.352e-02, -2.989e-02, -5.633e-02, -6.039e-03, 3.496e-03, 2.535e-02, 1.265e-01, -4.541e-02, -5.393e-02, -5.355e-02, 1.498e-03, 2.057e-02, 1.278e-02, 5.662e-02));
|
||||
r += mul(s3_0, M4(9.523e-02, -7.183e-02, -2.740e-01, -1.569e-02, 1.008e-01, 3.065e+00, -2.003e-01, 1.938e-01, 7.503e-02, -1.096e-01, -3.177e-02, -4.074e-02, 1.090e-03, -2.250e-02, -4.727e-02, 2.528e-02));
|
||||
r += mul(s3_1, M4(-7.789e-02, 7.186e-03, 3.838e-01, -1.314e-01, -4.119e-01, 1.344e-01, 5.252e-02, -4.478e-02, -2.421e-01, 8.221e-02, 1.588e-01, 5.943e-02, -6.960e-02, -7.055e-02, -5.857e-02, -2.367e-02));
|
||||
r += mul(s3_2, M4(1.578e-01, -5.477e-02, -1.343e-01, 7.698e-02, 9.761e-02, -2.725e-02, -6.329e-02, -5.552e-02, -6.854e-02, 1.143e-02, -8.043e-02, 1.416e-02, 5.387e-02, 1.371e-01, 1.146e-01, -5.881e-04));
|
||||
r += mul(s3_3, M4(7.307e-03, -8.177e-02, 5.634e-02, -1.149e-01, -4.060e-01, 1.613e+00, -3.145e-01, 2.057e-02, -9.555e-02, 2.548e-01, 5.932e-02, 7.789e-02, 7.174e-03, -6.399e-03, -2.315e-02, 8.381e-03));
|
||||
r += mul(s3_4, M4(1.200e-01, 1.356e-01, 8.711e-03, 7.537e-02, -1.751e-01, 3.458e-02, 2.391e-01, -1.111e-01, 1.506e-01, -3.165e-01, -4.619e-01, -9.386e-02, -4.377e-02, -1.492e-01, -5.002e-01, 9.821e-02));
|
||||
r += mul(s3_5, M4(1.539e-01, 7.309e-02, 4.257e-03, -1.539e-01, -4.757e-01, 1.070e-01, 1.702e-02, 9.709e-02, -1.140e-01, 1.938e-01, 1.982e-01, -3.215e-02, -3.822e-01, 3.408e-01, 1.647e-01, 1.597e-01));
|
||||
r += mul(s3_6, M4(-3.320e-02, 4.854e-02, -1.957e-02, 3.353e-02, 1.823e-01, 8.532e-02, 3.236e-02, -1.874e-01, -1.073e-02, -6.598e-03, -2.954e-02, -2.175e-02, 1.184e-02, -3.856e-02, 2.166e-02, -2.608e-02));
|
||||
r += mul(s3_7, M4(2.038e-02, -4.606e-02, -3.841e-02, -4.008e-02, -2.542e-01, -1.076e-01, -2.891e-02, 1.837e-01, 3.842e-02, 1.753e-01, 3.043e-02, -3.298e-02, 2.990e-02, 1.215e-01, 9.583e-02, -5.860e-02));
|
||||
r += mul(s3_8, M4(6.138e-02, 3.405e-02, 3.364e-04, 6.037e-03, 1.811e-01, 9.691e-04, 3.497e-02, -1.810e-02, -3.940e-02, -1.159e-01, -7.007e-02, 1.170e-01, 1.829e-02, -2.216e-02, -1.689e-02, 1.150e-01));
|
||||
r += V4(-1.782e-04, -1.204e-03, 6.004e-04, -1.736e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 5 entry point. Each thread handles one texel of an input-sized grid:
// it gathers two 3x3 neighbourhoods through the l0 / l1 macros, splits every
// tap into its max(x, 0) and -max(-x, 0) halves, and stores f0(...) into t0
// and f1(...) into t1.
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the l0 / l1 macros are defined
	// outside this function and may reference them by name.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input-sized grid do nothing.
	if (any(texel >= bounds)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 taps from l0, one row per line (y = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Non-positive halves of the l0 taps.
	V4 an0 = -max(-a0, 0.0), an1 = -max(-a1, 0.0), an2 = -max(-a2, 0.0);
	V4 an3 = -max(-a3, 0.0), an4 = -max(-a4, 0.0), an5 = -max(-a5, 0.0);
	V4 an6 = -max(-a6, 0.0), an7 = -max(-a7, 0.0), an8 = -max(-a8, 0.0);

	// Non-negative halves of the l0 taps.
	V4 ap0 = max(a0, 0.0), ap1 = max(a1, 0.0), ap2 = max(a2, 0.0);
	V4 ap3 = max(a3, 0.0), ap4 = max(a4, 0.0), ap5 = max(a5, 0.0);
	V4 ap6 = max(a6, 0.0), ap7 = max(a7, 0.0), ap8 = max(a8, 0.0);

	// Raw 3x3 taps from l1, same ordering.
	V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Non-positive halves of the l1 taps.
	V4 bn0 = -max(-b0, 0.0), bn1 = -max(-b1, 0.0), bn2 = -max(-b2, 0.0);
	V4 bn3 = -max(-b3, 0.0), bn4 = -max(-b4, 0.0), bn5 = -max(-b5, 0.0);
	V4 bn6 = -max(-b6, 0.0), bn7 = -max(-b7, 0.0), bn8 = -max(-b8, 0.0);

	// Non-negative halves of the l1 taps.
	V4 bp0 = max(b0, 0.0), bp1 = max(b1, 0.0), bp2 = max(b2, 0.0);
	V4 bp3 = max(b3, 0.0), bp4 = max(b4, 0.0), bp5 = max(b5, 0.0);
	V4 bp6 = max(b6, 0.0), bp7 = max(b7, 0.0), bp8 = max(b8, 0.0);

	// Argument order expected by f0 / f1: l0 positive, l0 negative,
	// l1 positive, l1 negative (nine taps each).
	t0[texel] = f0(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
	t1[texel] = f1(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);
}
|
||||
|
||||
//!PASS 6
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0, t1
|
||||
//!OUT OUTPUT
|
||||
|
||||
// Pass 6 tap loaders: l0 reads t0 and l1 reads t1 at offset (x, y), both
// through the O() macro (defined outside this pass), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
#define l1(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 6 kernel: accumulates mul(tap, 4x4 weight matrix) over the 36 tap
// vectors, adds a constant bias vector, and returns tanh() of the total.
// As called from Pass6: s0_* / s1_* are the max(x, 0) / -max(-x, 0) halves of
// the nine l0 taps, and s2_* / s3_* are the same halves of the nine l1 taps.
// The numeric literals are presumably machine-generated network weights;
// do not hand-edit them.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-6.910e-02, 1.215e-03, -2.039e-03, -1.079e-04, 8.088e-02, -2.119e-02, -1.929e-02, 1.865e-02, -6.142e-02, 2.499e-02, -4.185e-03, 1.951e-03, -1.099e-02, 1.071e-02, 3.133e-03, -9.539e-03));
|
||||
r += mul(s0_1, M4(-2.129e-02, 6.812e-02, 2.738e-02, -2.965e-02, -1.569e-01, -7.369e-02, 6.714e-02, -2.416e-02, 6.421e-02, -3.329e-02, 4.397e-03, 1.902e-02, 1.426e-01, 7.469e-02, -3.306e-02, 1.260e-02));
|
||||
r += mul(s0_2, M4(-2.521e-02, -1.556e-02, -1.880e-02, 1.813e-02, -2.926e-03, -3.967e-02, -2.562e-02, 1.669e-02, 1.699e-03, 2.545e-02, 9.862e-03, 1.052e-02, -1.392e-02, 1.215e-02, 2.436e-02, 2.113e-04));
|
||||
r += mul(s0_3, M4(1.800e-02, -2.761e-02, 1.145e-02, -6.469e-02, 1.392e-01, 1.033e-02, 1.406e-01, -7.326e-03, -2.077e-02, 2.985e-03, -1.102e-01, 2.804e-02, -1.544e-02, 5.050e-02, 2.915e-02, 2.396e-02));
|
||||
r += mul(s0_4, M4(1.242e-01, -4.463e-01, -3.829e-01, 1.871e-01, -8.392e-02, 6.470e-02, -3.115e-01, -1.970e-01, -1.186e-01, -1.204e-01, -2.296e-02, -1.763e-01, -1.265e-01, -1.919e-01, 6.718e-02, 8.923e-02));
|
||||
r += mul(s0_5, M4(-2.493e-02, 3.014e-02, 2.446e-02, -1.488e-01, 1.299e-02, -5.759e-02, 2.138e-02, -9.211e-02, -8.051e-03, -4.216e-02, -1.327e-02, -9.724e-04, 3.675e-02, 7.968e-03, -3.353e-02, -4.044e-02));
|
||||
r += mul(s0_6, M4(2.027e-02, 3.813e-03, -2.557e-03, -2.670e-02, 2.068e-02, 1.886e-02, 6.014e-02, 3.191e-02, -1.917e-03, -2.659e-03, 1.273e-02, 3.109e-03, 9.881e-03, -4.410e-04, 7.569e-03, 1.276e-02));
|
||||
r += mul(s0_7, M4(-1.802e-03, 4.820e-02, 4.201e-02, 4.574e-02, 2.826e-02, 2.044e-02, 1.196e-01, 9.132e-02, 1.800e-02, 2.670e-02, -3.398e-03, 1.359e-02, 1.247e-02, 1.268e-02, 1.628e-03, -1.067e-02));
|
||||
r += mul(s0_8, M4(5.233e-03, 3.648e-02, 2.719e-02, 2.838e-02, 1.857e-03, -1.999e-03, 1.703e-02, 5.921e-02, 7.925e-03, -2.543e-03, 5.431e-03, -1.102e-02, -1.116e-02, -5.510e-03, -9.183e-03, -8.054e-03));
|
||||
r += mul(s1_0, M4(-6.423e-02, -5.758e-03, -8.948e-03, -2.227e-03, 5.802e-02, -2.252e-02, -8.134e-03, 1.448e-02, -3.642e-02, 4.476e-03, 7.865e-03, 3.269e-03, 1.053e-02, 1.269e-02, -1.530e-03, -9.628e-03));
|
||||
r += mul(s1_1, M4(-2.553e-02, 4.747e-02, 4.136e-02, -2.368e-02, -1.401e-01, -4.967e-02, 6.372e-02, -1.788e-04, 3.663e-01, 2.193e-01, -8.228e-02, -8.507e-02, 1.404e-01, 8.229e-02, -5.862e-02, -1.161e-02));
|
||||
r += mul(s1_2, M4(-2.216e-02, -7.521e-03, -2.522e-02, 2.337e-02, -2.651e-03, -3.786e-02, -9.854e-03, 2.033e-02, 9.696e-03, 1.237e-01, 6.173e-03, 2.898e-02, -1.335e-02, 2.948e-02, 9.778e-03, -1.243e-02));
|
||||
r += mul(s1_3, M4(-1.598e-02, -1.677e-02, -4.726e-02, -2.250e-02, 2.076e-01, -2.825e-02, 1.389e-01, -2.552e-02, 3.209e-02, -3.267e-03, -9.876e-02, 3.775e-02, -5.440e-02, 6.367e-02, 8.425e-02, 7.583e-03));
|
||||
r += mul(s1_4, M4(-2.339e-01, -8.617e-02, -3.313e-01, 1.470e-01, -1.249e-01, 3.994e-01, -7.191e-01, -2.121e-01, 2.521e-02, 4.601e-02, -3.584e-01, -4.014e-01, -4.299e-01, -4.828e-01, 4.034e-01, 3.633e-01));
|
||||
r += mul(s1_5, M4(3.413e-02, -4.685e-03, 4.308e-02, -1.211e-01, 3.722e-02, -1.000e-01, 5.938e-02, -1.900e-01, 3.286e-03, 6.076e-03, 2.628e-02, -1.190e-01, 3.968e-02, -3.583e-02, -4.724e-02, 5.713e-02));
|
||||
r += mul(s1_6, M4(3.008e-02, -2.083e-02, 7.970e-03, -2.011e-02, -8.809e-03, 9.741e-03, 7.228e-02, 1.875e-02, -8.374e-03, -2.245e-03, 1.642e-02, -9.996e-03, 2.093e-02, 6.393e-03, 6.227e-03, -6.775e-03));
|
||||
r += mul(s1_7, M4(1.113e-02, 5.783e-02, -1.430e-02, 2.826e-02, -1.250e-02, -3.106e-02, 1.754e-01, 2.001e-01, -1.431e-02, -1.368e-02, 4.329e-02, 4.832e-02, 4.089e-02, 3.702e-02, -5.774e-03, 8.701e-03));
|
||||
r += mul(s1_8, M4(1.395e-03, 3.747e-02, 2.706e-02, 4.675e-02, -1.191e-02, -2.163e-02, 3.137e-02, 7.056e-02, 4.929e-03, -6.465e-03, 1.083e-03, 1.816e-02, -3.896e-03, 1.081e-02, -1.507e-02, -1.412e-02));
|
||||
r += mul(s2_0, M4(5.551e-02, 3.061e-02, 2.172e-02, -4.435e-04, 7.341e-02, -4.254e-03, -3.710e-02, 2.005e-02, 3.528e-02, 1.764e-02, 4.547e-03, -6.460e-03, 1.949e-01, 2.466e-02, 7.886e-02, -2.722e-03));
|
||||
r += mul(s2_1, M4(-1.216e-03, 4.895e-02, -2.548e-02, 1.354e-02, 1.184e-01, -2.592e-01, 3.262e-02, 3.213e-02, -7.885e-02, -2.429e-02, -5.811e-02, 1.909e-02, 3.185e-02, -7.057e-02, -2.388e-02, 1.018e-01));
|
||||
r += mul(s2_2, M4(-4.325e-03, 8.278e-03, -7.126e-04, -3.013e-03, -2.277e-02, 6.470e-02, -3.258e-02, 6.558e-03, 2.954e-02, 9.175e-03, -1.066e-03, -1.931e-02, 3.523e-03, 1.347e-03, -1.837e-03, -3.765e-03));
|
||||
r += mul(s2_3, M4(-1.063e-01, 1.364e-02, -1.031e-01, 7.569e-02, -3.770e-02, 3.667e-02, 2.683e-02, 5.980e-02, -1.057e-01, -1.107e-02, -7.272e-02, 5.094e-02, 7.605e-02, 1.566e-02, 1.708e-01, 2.124e-01));
|
||||
r += mul(s2_4, M4(1.344e-02, -6.091e-02, 2.694e-02, -2.727e-02, 2.786e-01, 5.187e-02, 6.738e-01, -9.220e-01, 1.745e-01, -1.468e-02, 1.843e-01, -1.866e-01, -9.396e-02, -1.505e-01, 2.471e-01, -1.138e+00));
|
||||
r += mul(s2_5, M4(6.506e-03, 7.226e-03, 9.650e-03, 3.959e-03, -2.858e-02, -1.124e-01, -5.599e-02, 8.081e-02, -3.923e-02, 6.977e-02, 2.327e-03, 1.164e-01, 1.242e-02, -1.947e-02, -4.582e-02, 2.119e-02));
|
||||
r += mul(s2_6, M4(-1.730e-02, -2.202e-02, -2.408e-02, -6.448e-02, -3.767e-03, 2.506e-02, -4.165e-02, 4.527e-02, 1.431e-02, -2.421e-02, -1.170e-02, -6.665e-02, -1.236e-02, 5.709e-03, -6.345e-03, -3.440e-02));
|
||||
r += mul(s2_7, M4(-4.211e-02, -5.191e-02, -9.762e-02, -1.275e-01, 2.079e-02, -1.004e-01, 7.470e-02, 1.084e-02, -1.789e-02, 8.006e-02, 3.170e-02, 1.111e-01, -4.772e-02, -6.100e-02, 2.375e-02, 2.545e-03));
|
||||
r += mul(s2_8, M4(-7.109e-03, 1.968e-03, -9.159e-03, -1.523e-02, -1.024e-02, -5.787e-04, -4.581e-02, -1.496e-02, 2.302e-02, -1.568e-02, 2.850e-02, 9.731e-03, -1.219e-02, 1.316e-03, -1.859e-02, 8.662e-02));
|
||||
r += mul(s3_0, M4(2.241e-01, 1.599e-02, -3.007e-02, -8.278e-02, -2.343e-02, -1.323e-02, 6.153e-03, 8.030e-03, 1.988e-02, 1.870e-02, 7.620e-03, -1.035e-02, 2.443e-01, 4.061e-02, 3.123e-02, -4.152e-03));
|
||||
r += mul(s3_1, M4(-1.500e-02, -2.365e-02, -2.046e-02, 4.369e-02, 7.611e-03, -9.342e-03, 4.413e-03, -1.110e-03, -1.238e-01, -3.394e-02, -4.442e-02, 2.423e-02, -9.742e-02, -2.324e-02, -3.479e-02, 4.742e-02));
|
||||
r += mul(s3_2, M4(5.839e-03, 1.560e-02, -3.631e-03, 6.730e-03, -2.371e-03, -1.011e-02, -3.821e-03, 1.830e-03, 2.255e-02, 1.426e-02, -1.146e-02, -1.650e-02, 9.035e-03, 5.831e-03, 2.660e-03, -4.854e-03));
|
||||
r += mul(s3_3, M4(-1.694e-01, -2.771e-01, 6.449e-01, -2.979e-01, 9.108e-02, -2.277e-02, -5.309e-02, -3.552e-02, -1.626e-01, 2.544e-02, -7.033e-02, 7.145e-02, -1.334e-01, 1.008e-01, 1.121e-01, 1.733e-01));
|
||||
r += mul(s3_4, M4(-1.019e-01, 1.989e-01, -6.682e-02, -7.066e-02, -3.795e-02, 1.362e-01, 4.307e-02, -4.383e-02, 6.286e-01, -3.881e-01, 1.970e-01, -3.421e-01, -5.374e-03, -2.446e-01, -8.874e-02, -4.099e-01));
|
||||
r += mul(s3_5, M4(1.279e-02, -1.406e-02, 7.997e-03, 1.743e-02, 2.251e-02, -4.285e-02, -2.154e-03, -1.441e-02, -2.329e-02, 1.667e-02, 4.333e-02, 1.229e-01, -2.284e-03, -2.450e-02, -8.000e-03, -1.712e-02));
|
||||
r += mul(s3_6, M4(7.251e-02, 9.488e-03, -1.511e-01, -6.947e-02, -2.728e-02, 7.342e-03, 2.289e-02, 1.443e-02, 1.492e-02, -8.903e-03, -5.817e-02, -4.836e-02, -1.677e-03, 1.964e-02, -6.858e-03, -1.328e-02));
|
||||
r += mul(s3_7, M4(-8.618e-02, -5.596e-02, -1.276e-01, -1.230e-01, 4.851e-03, -5.676e-02, 2.939e-02, -4.192e-02, -2.508e-02, 4.430e-02, 1.352e-01, 2.072e-02, -8.584e-03, -3.983e-02, 1.177e-02, -4.721e-02));
|
||||
r += mul(s3_8, M4(6.050e-03, -3.781e-04, -3.124e-03, -1.667e-02, -1.291e-02, -1.315e-02, -2.106e-02, -5.240e-03, 1.412e-02, -2.504e-02, 3.138e-02, -2.989e-02, -6.363e-03, -1.480e-04, 1.157e-03, 1.933e-02));
|
||||
r += V4(-8.480e-04, -1.222e-04, -8.629e-04, -1.828e-04);
|
||||
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass 6 entry point ("out-shuffle"). Each thread owns a 2x2 block of OUTPUT
// texels. It evaluates f0 once on the 3x3 neighbourhoods of t0 and t1 around
// the matching input texel, then for each of the four output texels samples
// INPUT with the linear sampler, adds one component of the f0 result to the
// first rgb2yuv component, saturates it, and converts back with yuv2rgb.
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` keep these exact names: the l0 / l1 macros go through
	// O(), which is defined outside this pass and may reference them by name.
	float2 pt = float2(GetInputPt());
	// Top-left output texel of this thread's 2x2 block.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 dstSize = GetOutputSize();
	if (any(dst >= dstSize)) {
		return;
	}
	// Centre of the input texel that covers this 2x2 output block.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Raw 3x3 taps of t0, one row per line (y = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Non-positive halves of the t0 taps.
	V4 an0 = -max(-a0, 0.0), an1 = -max(-a1, 0.0), an2 = -max(-a2, 0.0);
	V4 an3 = -max(-a3, 0.0), an4 = -max(-a4, 0.0), an5 = -max(-a5, 0.0);
	V4 an6 = -max(-a6, 0.0), an7 = -max(-a7, 0.0), an8 = -max(-a8, 0.0);

	// Non-negative halves of the t0 taps.
	V4 ap0 = max(a0, 0.0), ap1 = max(a1, 0.0), ap2 = max(a2, 0.0);
	V4 ap3 = max(a3, 0.0), ap4 = max(a4, 0.0), ap5 = max(a5, 0.0);
	V4 ap6 = max(a6, 0.0), ap7 = max(a7, 0.0), ap8 = max(a8, 0.0);

	// Raw 3x3 taps of t1, same ordering.
	V4 b0 = l1(-1.0, -1.0), b1 = l1(0.0, -1.0), b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0), b4 = l1(0.0, 0.0), b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0), b7 = l1(0.0, 1.0), b8 = l1(1.0, 1.0);

	// Non-positive halves of the t1 taps.
	V4 bn0 = -max(-b0, 0.0), bn1 = -max(-b1, 0.0), bn2 = -max(-b2, 0.0);
	V4 bn3 = -max(-b3, 0.0), bn4 = -max(-b4, 0.0), bn5 = -max(-b5, 0.0);
	V4 bn6 = -max(-b6, 0.0), bn7 = -max(-b7, 0.0), bn8 = -max(-b8, 0.0);

	// Non-negative halves of the t1 taps.
	V4 bp0 = max(b0, 0.0), bp1 = max(b1, 0.0), bp2 = max(b2, 0.0);
	V4 bp3 = max(b3, 0.0), bp4 = max(b4, 0.0), bp5 = max(b5, 0.0);
	V4 bp6 = max(b6, 0.0), bp7 = max(b7, 0.0), bp8 = max(b8, 0.0);

	// One f0 evaluation yields the four per-texel corrections (x, y, z, w).
	V4 r = f0(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8,
		bp0, bp1, bp2, bp3, bp4, bp5, bp6, bp7, bp8,
		bn0, bn1, bn2, bn3, bn4, bn5, bn6, bn7, bn8);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 opt = float2(GetOutputPt());

	// Move from the input-texel centre to the top-left output texel, then
	// walk the 2x2 block clockwise. The coordinate is updated incrementally,
	// in this exact order, so the sampled positions match bit for bit.
	pos -= 0.5f * opt;
	float3 src = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(src.r + r.x), src.yz)), 1);

	// Top-right texel takes r.y.
	++dst.x;
	pos.x += opt.x;
	src = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(src.r + r.y), src.yz)), 1);

	// Bottom-right texel takes r.w.
	++dst.y;
	pos.y += opt.y;
	src = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(src.r + r.w), src.yz)), 1);

	// Bottom-left texel takes r.z.
	--dst.x;
	pos.x -= opt.x;
	src = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(src.r + r.z), src.yz)), 1);
}
|
||||
1247
src/Effects/CuNNy/CuNNy-6x8C-NVL-DN.hlsl
Normal file
1247
src/Effects/CuNNy/CuNNy-6x8C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1247
src/Effects/CuNNy/CuNNy-6x8C-NVL.hlsl
Normal file
1247
src/Effects/CuNNy/CuNNy-6x8C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
4027
src/Effects/CuNNy/CuNNy-8x16C-NVL-DN.hlsl
Normal file
4027
src/Effects/CuNNy/CuNNy-8x16C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
4027
src/Effects/CuNNy/CuNNy-8x16C-NVL.hlsl
Normal file
4027
src/Effects/CuNNy/CuNNy-8x16C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
778
src/Effects/CuNNy/CuNNy-8x4C-NVL-DN.hlsl
Normal file
778
src/Effects/CuNNy/CuNNy-8x4C-NVL-DN.hlsl
Normal file
|
|
@ -0,0 +1,778 @@
|
|||
// CuNNy 8x4C BILINEAR RGB NVL DN - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-DN-D04N08
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
// O(t, p): samples texture t with the SP sampler at mip level 0, at
// `pos + p * pt`. `pos` and `pt` are not macro parameters; they must exist as
// variables with exactly those names wherever the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
// V4 / M4: shorthand for the min16float vector and matrix types used by the
// per-pass kernels.
#define V4 min16float4
|
||||
#define M4 min16float4x4
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
// Pass 1 tap loader: samples INPUT at offset (x, y) through O() and reduces
// the rgb value to one min16float scalar: a fixed weighted sum of the three
// channels plus a constant offset.
#define l0(x, y) min16float((dot(float3(-1.880e-01, -3.696e-01, -8.936e-02), O(INPUT, float2(x, y)).rgb) + 5.137e-01))
|
||||
|
||||
// Pass 1 kernel: each of the nine scalar taps scales its own constant V4
// weight vector; the products are summed with a constant bias vector and
// returned without any activation applied here.
// The numeric literals are presumably machine-generated network weights;
// do not hand-edit them.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(6.049e-03, -3.524e-01, -1.308e-01, -6.691e-02) * s0_0;
|
||||
r += V4(1.720e-02, -7.092e-02, -3.030e-01, 1.654e-01) * s0_1;
|
||||
r += V4(-6.706e-03, 2.289e-01, 1.982e-03, -5.756e-02) * s0_2;
|
||||
r += V4(-2.761e-02, 5.050e-01, -2.036e-01, 1.265e-01) * s0_3;
|
||||
r += V4(-8.654e-01, -6.035e-01, -2.119e-01, 5.055e-01) * s0_4;
|
||||
r += V4(-7.114e-03, 2.325e-02, 5.721e-02, 4.585e-02) * s0_5;
|
||||
r += V4(2.796e-01, 1.680e-01, 1.353e-01, 1.286e-02) * s0_6;
|
||||
r += V4(5.684e-01, 3.022e-01, 6.426e-01, 8.931e-02) * s0_7;
|
||||
r += V4(3.723e-02, -2.036e-01, 2.732e-02, -4.101e-02) * s0_8;
|
||||
r += V4(1.324e-02, -9.379e-05, 8.452e-03, 5.165e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 entry point ("in"). Each thread handles one input texel: it reads
// the 3x3 neighbourhood of INPUT as scalars through l0 and stores the f0
// result into t0.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0 expands
	// to an expression that references both.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input do nothing.
	if (any(texel >= bounds)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// 3x3 scalar taps, one row per line (y = -1, 0, +1).
	min16float c0 = l0(-1.0, -1.0), c1 = l0(0.0, -1.0), c2 = l0(1.0, -1.0);
	min16float c3 = l0(-1.0, 0.0), c4 = l0(0.0, 0.0), c5 = l0(1.0, 0.0);
	min16float c6 = l0(-1.0, 1.0), c7 = l0(0.0, 1.0), c8 = l0(1.0, 1.0);

	t0[texel] = f0(c0, c1, c2, c3, c4, c5, c6, c7, c8);
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// Pass 2 tap loader: samples t0 at offset (x, y) through O(), converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Pass 2 kernel ("conv1"): accumulates mul(tap, 4x4 weight matrix) over the
// 18 tap vectors, adds a constant bias vector, and returns the sum without
// any activation applied here.
// As called from Pass2: s0_* are the max(x, 0) halves and s1_* the
// -max(-x, 0) halves of the nine t0 taps.
// The numeric literals are presumably machine-generated network weights;
// do not hand-edit them.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(2.216e-02, 1.062e-01, -3.433e-03, -1.923e-01, 6.300e-02, -4.594e-01, 2.025e-01, 8.655e-03, -5.497e-02, 1.694e-01, -1.806e-01, 2.115e-01, -6.176e-02, 1.167e-02, -5.987e-02, 1.167e-01));
|
||||
r += mul(s0_1, M4(-1.646e-01, -5.524e-01, -1.352e-01, 1.704e-01, 3.398e-02, -2.598e-01, 1.616e-01, -1.772e-01, -5.648e-02, 2.755e-01, 2.638e-02, -2.657e-02, 3.774e-02, -6.833e-02, -1.141e-01, -2.438e-01));
|
||||
r += mul(s0_2, M4(-1.459e-01, 9.939e-02, -6.457e-04, 2.352e-02, 5.006e-02, -7.759e-01, -4.862e-02, -3.366e-02, 9.508e-02, 1.537e-01, -6.771e-02, -1.260e-01, 1.067e-01, -5.893e-02, -9.811e-02, -1.060e-02));
|
||||
r += mul(s0_3, M4(-2.901e-01, 2.907e-01, 2.178e-01, -3.877e-01, 9.034e-03, 8.718e-03, -1.213e-01, 9.252e-02, 3.286e-01, -8.247e-02, -5.573e-02, -3.852e-01, -1.371e-01, 1.877e-01, 2.337e-01, 5.324e-01));
|
||||
r += mul(s0_4, M4(-9.182e-01, 1.013e-01, 2.969e-01, 7.117e-01, -2.367e-01, -7.128e-02, 1.828e-01, 5.993e-01, -2.965e-01, 1.323e-01, 3.117e-02, -3.215e-01, -1.410e-01, 5.359e-02, -1.137e-01, -2.603e-01));
|
||||
r += mul(s0_5, M4(-1.071e-01, -8.801e-02, 9.524e-03, -2.937e-02, 7.723e-02, 1.195e-01, -9.056e-02, 6.161e-02, 1.962e-01, -2.740e-01, -9.418e-02, 1.141e-01, 6.203e-02, -1.084e-01, 2.402e-01, -2.066e-01));
|
||||
r += mul(s0_6, M4(2.226e-01, -2.259e-01, -2.499e-02, -9.184e-02, -1.499e-01, -3.737e-02, 1.576e-01, 1.084e-01, -2.221e-01, -1.080e-02, 2.643e-02, -1.023e-01, 1.068e-01, 1.193e-01, -2.781e-01, 3.396e-01));
|
||||
r += mul(s0_7, M4(7.520e-01, -1.043e-01, -4.535e-02, 2.775e-01, 1.577e-01, -1.526e-01, 1.796e-01, 1.085e-01, -1.012e+00, 4.333e-02, 1.270e-02, -1.692e-01, 1.127e-01, -2.847e-01, -1.784e-01, -3.956e-01));
|
||||
r += mul(s0_8, M4(2.206e-01, 1.370e-01, -7.453e-02, 1.050e-01, 8.412e-02, -1.396e-01, 1.707e-02, -1.654e-02, -2.116e-01, -7.944e-02, 1.244e-01, -6.709e-02, -5.577e-02, 1.619e-01, -2.818e-01, 1.460e-01));
|
||||
r += mul(s1_0, M4(1.180e-01, -2.345e-01, 5.406e-02, -1.102e-01, 1.559e-02, -3.865e-01, -1.077e-01, 1.442e-02, -1.405e-01, 1.578e-01, -3.338e-02, 1.157e-01, -1.676e-01, 4.656e-02, -1.507e-01, 2.590e-02));
|
||||
r += mul(s1_1, M4(-3.112e-02, -5.537e-01, -3.626e-01, -2.915e-01, 7.495e-02, 4.473e-01, -1.847e-01, -8.743e-02, -3.290e-02, 3.660e-02, 1.252e-01, 1.058e-02, 1.193e-01, 6.421e-02, -1.456e-01, -1.693e-01));
|
||||
r += mul(s1_2, M4(-1.047e-01, -4.306e-01, 6.486e-03, 1.137e-01, 2.935e-02, -3.608e-01, 5.242e-02, -2.374e-02, 1.130e-01, -4.864e-02, -7.302e-02, -2.205e-02, 8.227e-02, -8.403e-02, -9.468e-02, 8.095e-02));
|
||||
r += mul(s1_3, M4(-3.759e-02, 2.709e-01, 1.269e-01, -4.994e-01, -1.577e-02, 1.871e-01, -2.532e-01, 8.960e-02, 2.298e-01, -2.462e-01, -1.634e-02, -3.955e-01, 2.750e-02, -4.812e-02, -2.441e-01, 9.926e-01));
|
||||
r += mul(s1_4, M4(-7.288e-01, 5.644e-01, 1.042e+00, 6.160e-01, -4.271e-01, 4.419e-01, 1.437e-01, 3.840e-01, -1.220e-01, -8.627e-01, 6.664e-02, -1.220e-02, 5.260e-02, 1.505e-01, -2.182e-01, -6.116e-01));
|
||||
r += mul(s1_5, M4(1.659e-01, 2.566e-01, -5.954e-02, -9.187e-02, -8.251e-02, 1.091e-01, -1.506e-01, 1.370e-01, 3.056e-01, -3.512e-01, -4.956e-03, 7.008e-02, 1.320e-01, -3.995e-01, -8.603e-03, -3.542e-01));
|
||||
r += mul(s1_6, M4(2.549e-01, -7.946e-02, -1.755e-01, -2.902e-02, -1.912e-01, 2.349e-01, 6.770e-02, 9.683e-02, -2.690e-01, -1.715e-01, 5.692e-02, -1.064e-01, 2.998e-01, 7.619e-02, 8.040e-03, 2.706e-01));
|
||||
r += mul(s1_7, M4(7.320e-01, 1.397e-01, -5.600e-02, 9.609e-02, -1.267e-01, 6.841e-02, 2.429e-01, 3.167e-02, -6.816e-01, -3.313e-03, 5.622e-02, -4.727e-02, -3.420e-01, 4.283e-02, -3.250e-01, -4.118e-01));
|
||||
r += mul(s1_8, M4(1.607e-01, 1.581e-01, -6.049e-02, 9.118e-02, -1.583e-02, 2.918e-01, 1.703e-02, -1.206e-01, -2.114e-01, -1.248e-01, 6.689e-02, -2.131e-02, -7.779e-02, 1.069e-01, -1.181e-01, 2.230e-01));
|
||||
r += V4(1.959e-02, -5.807e-03, 9.415e-02, 7.247e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point ("conv1"). Each thread handles one input-sized texel:
// it gathers the 3x3 neighbourhood of t0, splits every tap into its
// max(x, 0) and -max(-x, 0) halves, and stores the f0 result into t1.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these names: the O() macro behind l0 expands
	// to an expression that references both.
	float2 pt = float2(GetInputPt());
	uint2 texel = Rmp8x8(tid.x) + blockStart;
	uint2 bounds = GetInputSize();
	// Threads that fall outside the input-sized grid do nothing.
	if (any(texel >= bounds)) {
		return;
	}
	float2 pos = (texel + 0.5) * pt;

	// Raw 3x3 taps of t0, one row per line (y = -1, 0, +1).
	V4 a0 = l0(-1.0, -1.0), a1 = l0(0.0, -1.0), a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0), a4 = l0(0.0, 0.0), a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0), a7 = l0(0.0, 1.0), a8 = l0(1.0, 1.0);

	// Non-positive halves of the taps.
	V4 an0 = -max(-a0, 0.0), an1 = -max(-a1, 0.0), an2 = -max(-a2, 0.0);
	V4 an3 = -max(-a3, 0.0), an4 = -max(-a4, 0.0), an5 = -max(-a5, 0.0);
	V4 an6 = -max(-a6, 0.0), an7 = -max(-a7, 0.0), an8 = -max(-a8, 0.0);

	// Non-negative halves of the taps.
	V4 ap0 = max(a0, 0.0), ap1 = max(a1, 0.0), ap2 = max(a2, 0.0);
	V4 ap3 = max(a3, 0.0), ap4 = max(a4, 0.0), ap5 = max(a5, 0.0);
	V4 ap6 = max(a6, 0.0), ap7 = max(a7, 0.0), ap8 = max(a8, 0.0);

	// f0 takes the nine positive halves first, then the nine negative halves.
	t1[texel] = f0(
		ap0, ap1, ap2, ap3, ap4, ap5, ap6, ap7, ap8,
		an0, an1, an2, an3, an4, an5, an6, an7, an8);
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// Pass 3 tap loader: samples t1 at offset (x, y) through O(), converted to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Pass 3 kernel ("conv2"): accumulates mul(tap, 4x4 weight matrix) over the
// 18 tap vectors, adds a constant bias vector, and returns the sum without
// any activation applied here.
// As called from Pass3: s0_* are the max(x, 0) halves and s1_* the
// -max(-x, 0) halves of the nine t1 taps.
// The numeric literals are presumably machine-generated network weights;
// do not hand-edit them.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(9.727e-02, 1.849e-01, 2.125e-02, 1.933e-01, 9.183e-02, 8.307e-03, -9.035e-02, 3.241e-02, 1.141e-01, 8.739e-02, -9.547e-02, 1.616e-01, 2.912e-02, -1.780e-02, 5.433e-02, 2.720e-02));
|
||||
r += mul(s0_1, M4(-1.524e-01, -9.138e-02, 8.798e-02, -1.691e-01, 8.519e-03, 3.597e-02, -1.784e-02, 3.049e-02, 3.078e-02, 1.823e-01, 1.051e-02, -5.317e-02, -1.977e-01, 1.013e-01, 1.215e-01, 4.261e-02));
|
||||
r += mul(s0_2, M4(-1.992e-02, -1.191e-01, -1.365e-03, 3.976e-02, 3.452e-03, 7.503e-03, 4.850e-03, 8.970e-03, -7.652e-03, 1.166e-01, 9.888e-02, 3.423e-03, -3.354e-01, -3.335e-01, -2.226e-02, -1.509e-01));
|
||||
r += mul(s0_3, M4(-7.994e-02, 1.374e-01, -1.701e-02, -2.530e-01, 2.153e-01, -6.957e-03, -1.405e-01, -6.175e-02, 7.274e-03, 1.734e-01, -9.107e-02, -1.303e-01, -1.265e-01, 1.669e-02, 3.494e-02, -8.377e-02));
|
||||
r += mul(s0_4, M4(-1.124e+00, 1.355e-02, -1.979e-01, -4.092e-01, -1.276e-01, -1.096e-01, 5.949e-02, 1.073e-01, -4.780e-02, 1.378e-01, 1.905e-01, -9.525e-02, -5.999e-01, 1.274e-01, 8.416e-01, 2.483e-01));
|
||||
r += mul(s0_5, M4(3.312e-01, 2.036e-01, -5.231e-02, 5.357e-02, 1.666e-03, -2.102e-03, -3.213e-03, 4.747e-02, 1.130e-01, 3.492e-01, -1.263e-01, 4.100e-01, -5.859e-01, 4.875e-02, 2.227e-01, 3.127e-01));
|
||||
r += mul(s0_6, M4(-3.699e-02, 6.066e-02, 3.448e-03, -4.158e-03, -4.048e-03, -3.619e-02, -8.830e-02, -8.917e-03, 2.990e-02, 6.919e-03, 9.803e-02, 2.188e-02, 5.674e-02, -3.122e-02, -6.793e-02, 8.573e-02));
|
||||
r += mul(s0_7, M4(-1.255e-01, 1.754e-01, -1.332e-01, -1.124e-01, -2.163e-01, 1.552e-02, -7.485e-04, 4.194e-02, -1.899e-01, 1.334e-01, -1.721e-01, -3.487e-01, 3.847e-01, -3.823e-02, 1.121e-02, -7.128e-02));
|
||||
r += mul(s0_8, M4(7.152e-02, -1.631e-02, 4.810e-02, 1.435e-01, 3.881e-02, -3.596e-02, -7.544e-03, -1.071e-01, -8.509e-02, 1.110e-01, 8.542e-02, 1.980e-02, -1.134e-01, -7.967e-02, -1.586e-01, 2.511e-01));
|
||||
r += mul(s1_0, M4(2.326e-01, 4.791e-02, -1.996e-01, 1.352e-02, -9.909e-03, 1.117e-01, 2.198e-02, -6.683e-02, 1.356e-01, 2.830e-01, -8.418e-02, 2.137e-01, -1.401e-02, -7.056e-02, 5.360e-02, 6.243e-02));
|
||||
r += mul(s1_1, M4(7.739e-01, -3.172e-01, -2.031e-01, 2.054e-01, -1.263e-01, -7.571e-03, 8.090e-02, -1.372e-01, 1.053e-01, 2.982e-01, -6.235e-02, 1.452e-02, 1.973e-01, 9.233e-02, -1.067e-01, 1.088e-01));
|
||||
r += mul(s1_2, M4(-1.136e-01, -1.332e-01, -7.369e-02, 2.046e-01, -9.302e-02, 2.722e-02, 9.461e-02, -1.895e-01, 1.216e-02, 2.595e-01, 1.028e-01, 8.413e-02, -1.339e-01, -2.259e-01, -1.047e-01, 5.994e-02));
|
||||
r += mul(s1_3, M4(1.224e-01, -3.713e-02, -2.383e-01, -1.743e-01, -1.876e-01, 1.155e-01, 2.212e-01, -1.375e-01, 1.618e-01, 2.628e-01, -1.161e-01, -1.826e-01, 8.003e-02, -1.961e-02, -6.278e-02, -5.710e-02));
|
||||
r += mul(s1_4, M4(-2.647e-01, -1.603e-01, -7.731e-01, 1.958e-01, -4.093e-01, -1.110e-01, 3.352e-01, -3.093e-02, -6.201e-01, 3.073e-01, 3.779e-01, -2.733e-01, 4.035e-01, 1.230e-01, -1.606e-01, 9.421e-02));
|
||||
r += mul(s1_5, M4(1.981e-01, -8.801e-03, -9.874e-03, -4.003e-02, 2.686e-03, -1.346e-01, -1.813e-02, -1.003e-01, 1.561e-01, 3.252e-01, -1.189e-01, 2.014e-01, 1.343e-01, 4.088e-02, -9.918e-02, 1.025e+00));
|
||||
r += mul(s1_6, M4(-2.323e-02, 3.284e-02, -5.099e-03, -3.025e-02, -1.458e-02, -1.640e-02, 1.268e-01, -3.787e-02, 5.078e-02, 4.529e-02, 1.050e-02, -8.079e-03, -1.530e-02, -6.509e-02, -1.620e-01, 6.662e-02));
|
||||
r += mul(s1_7, M4(3.972e-02, 8.570e-02, -8.723e-02, -3.746e-02, -1.902e-01, 5.121e-02, 1.161e-01, -4.624e-02, -6.268e-02, 1.852e-01, -1.535e-01, -2.023e-01, 2.476e-01, -2.211e-02, -1.590e-01, -3.109e-02));
|
||||
r += mul(s1_8, M4(-8.025e-03, -4.798e-02, 5.162e-02, 6.616e-02, -2.416e-02, -5.815e-02, -1.334e-02, -1.029e-01, 5.381e-02, 1.539e-01, 4.511e-02, 1.426e-01, -5.511e-02, -9.311e-02, -3.072e-02, 1.572e-01));
|
||||
r += V4(3.240e-02, -1.989e-01, -2.700e-02, 6.578e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t0[gxy].
void Pass3(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t0[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): the V4 built from O(t0, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t0 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 4 (//!DESC conv3). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass4 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(9.384e-02, 1.183e-01, 5.136e-02, -4.583e-01, -1.060e-01, 6.124e-02, -1.479e-01, -2.457e-01, -5.881e-02, 4.756e-03, -2.540e-02, -5.047e-02, -1.897e-01, 4.062e-02, 1.226e-02, 1.465e-01));
	acc += mul(s0_1, M4(-1.890e-01, -9.535e-02, 2.627e-01, 3.224e-01, 1.050e-01, -3.922e-02, -3.551e-01, -2.632e-01, -2.349e-01, -5.605e-02, -2.856e-01, 4.331e-01, -2.614e-02, -6.027e-02, -3.236e-02, 2.873e-01));
	acc += mul(s0_2, M4(-1.702e-01, 7.462e-02, 2.168e-01, 4.212e-01, 8.150e-03, 6.671e-02, -2.781e-01, -1.322e-01, -3.933e-02, 2.698e-02, -3.420e-01, -1.116e-02, -1.788e-02, 8.701e-03, -1.044e-01, 1.264e-01));
	acc += mul(s0_3, M4(3.573e-01, -4.592e-02, 4.539e-01, 2.854e-01, -6.463e-01, -1.763e-01, 6.236e-01, 7.125e-02, 4.126e-01, -1.621e-02, 1.685e-02, 2.328e-01, -5.456e-01, -2.113e-01, 1.424e-01, 1.414e-01));
	acc += mul(s0_4, M4(3.838e-01, -1.008e+00, 4.023e-01, 1.302e+00, -1.503e-01, 4.245e-02, 1.496e+00, -3.479e-01, -3.763e-01, -7.877e-01, 4.081e-01, -2.192e-01, -2.853e-01, 2.123e-01, -3.407e-01, 2.423e-01));
	acc += mul(s0_5, M4(5.073e-03, -2.123e-01, 1.851e-01, 1.482e-01, -2.814e-01, 1.262e-01, 6.890e-01, -2.317e-01, 6.427e-02, -5.801e-02, -3.684e-02, 7.526e-02, 1.309e-02, -2.125e-02, -7.760e-02, 4.795e-02));
	acc += mul(s0_6, M4(1.409e-01, -1.062e-01, 1.665e-01, 5.277e-01, 6.676e-01, -1.872e-01, 1.251e+00, 1.165e-01, -2.287e-02, -5.235e-02, -2.028e-03, -3.305e-02, -1.968e-01, 1.898e-01, -9.538e-02, -1.418e-01));
	acc += mul(s0_7, M4(7.353e-02, -3.073e-01, 1.789e-01, 2.137e-01, -6.435e-01, -6.052e-01, 2.259e+00, 2.884e-02, 7.105e-04, 1.247e-01, -7.393e-02, 2.539e-02, 1.194e-01, 1.870e-01, -1.126e-01, 2.444e-02));
	acc += mul(s0_8, M4(3.853e-02, -2.242e-01, 1.470e-01, 1.701e-02, 4.586e-02, 2.027e-01, 7.448e-01, -4.414e-01, 9.096e-03, 1.277e-01, 4.010e-02, 1.064e-02, 2.401e-02, 1.901e-02, 1.956e-02, 8.744e-02));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(-4.741e-02, 1.819e-03, -8.321e-02, -1.496e-01, -1.801e-02, 4.682e-02, -6.041e-02, -7.243e-02, -1.478e-01, 4.970e-02, 6.424e-02, -5.378e-02, -9.117e-02, 5.496e-02, -2.648e-02, -4.042e-02));
	acc += mul(s1_1, M4(-8.815e-02, 5.938e-02, -2.433e-01, 1.737e-01, 1.095e-01, -5.108e-02, -5.729e-02, 8.334e-03, -2.763e-01, -6.431e-02, -2.454e-02, 4.055e-01, 2.113e-02, -1.298e-01, -3.908e-02, -1.780e-02));
	acc += mul(s1_2, M4(-1.905e-02, 3.894e-02, -1.293e-01, 8.303e-03, -7.800e-03, -5.508e-03, 8.606e-02, -7.501e-02, 1.542e-02, 3.046e-02, -2.920e-01, -4.240e-02, -3.932e-02, -1.813e-02, -8.213e-02, 1.017e-01));
	acc += mul(s1_3, M4(1.965e-01, 3.626e-02, 3.418e-02, 9.779e-02, -6.664e-02, -2.295e-02, -2.736e-02, 1.091e-01, 1.129e-01, -3.896e-02, 1.171e-02, -2.870e-02, -1.382e-01, -1.691e-01, 3.018e-01, -1.186e-01));
	acc += mul(s1_4, M4(1.075e-01, -6.894e-01, 1.714e-01, 5.097e-01, 9.868e-03, 1.087e-01, 2.107e-01, -6.591e-02, -3.233e-01, -9.792e-01, -1.189e-01, -5.480e-01, -1.157e-01, 5.941e-02, -5.770e-01, -1.030e-01));
	acc += mul(s1_5, M4(3.289e-02, 3.941e-02, 1.824e-01, 7.260e-04, -9.787e-03, 3.128e-02, -1.333e-01, 1.352e-01, 5.954e-03, -2.520e-01, -8.536e-02, -3.566e-01, 2.998e-02, -5.941e-02, -8.531e-02, -4.232e-02));
	acc += mul(s1_6, M4(2.592e-02, -7.528e-02, -1.956e-02, 1.002e-01, 2.992e-02, -1.673e-01, 4.413e-02, 1.683e-01, 1.440e-02, -1.047e-02, 1.425e-02, -1.292e-01, -1.777e-01, 1.220e-01, -6.381e-02, 4.174e-02));
	acc += mul(s1_7, M4(-3.107e-02, -8.612e-02, 1.248e-02, -8.544e-02, -1.161e-01, 7.718e-02, -1.150e-01, -1.699e-01, -1.392e-02, 7.590e-02, -5.195e-02, -3.599e-01, 4.872e-02, 1.381e-01, -1.143e-01, -1.473e-03));
	acc += mul(s1_8, M4(1.277e-02, 3.020e-02, 4.658e-02, 8.071e-02, 6.867e-02, -2.693e-02, 7.897e-02, -1.264e-02, -1.035e-03, 1.509e-01, 4.169e-02, -1.716e-01, -4.694e-03, 1.627e-02, 7.171e-03, -4.496e-02));
	// Constant term, added after all products.
	acc += V4(4.014e-03, -2.020e-02, 1.560e-02, -2.352e-02);
	return acc;
}
|
||||
|
||||
// Pass 4 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t1[gxy].
void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t1[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC conv4
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): the V4 built from O(t1, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t1 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 5 (//!DESC conv4). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass5 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(3.174e-02, -2.020e-01, -6.843e-03, 1.049e-01, 1.680e-01, -6.387e-01, -1.541e-01, -1.952e-01, -4.586e-02, -1.580e-01, -5.507e-02, 1.065e-01, -5.257e-03, -9.464e-02, -9.788e-02, 1.221e-01));
	acc += mul(s0_1, M4(1.365e-01, -4.220e-02, -4.186e-02, -1.569e-01, -5.527e-01, -1.180e-01, -2.274e-01, -2.007e-01, 2.207e-02, 1.190e-02, 3.746e-02, -1.565e-01, -2.808e-02, 1.657e-02, -5.376e-02, -1.093e-02));
	acc += mul(s0_2, M4(-7.935e-02, -3.809e-02, -3.727e-02, -4.730e-02, -8.556e-02, 3.451e-04, -8.191e-02, 8.086e-02, 2.051e-02, 7.072e-03, 2.537e-02, 2.793e-02, 9.384e-04, -3.624e-02, -2.171e-02, 7.103e-02));
	acc += mul(s0_3, M4(-1.261e-02, 2.716e-01, 2.739e-01, -7.349e-02, -2.130e-02, -4.131e-01, -1.851e-01, 1.065e-01, -7.827e-02, 2.868e-01, -1.500e-01, -1.442e-01, -1.842e-02, -2.983e-01, -4.232e-02, 1.395e-01));
	acc += mul(s0_4, M4(2.733e-01, 4.015e-01, 4.102e-01, -2.027e-01, 4.229e-01, 2.213e-01, 3.628e-01, -1.011e-01, -4.893e-01, 1.333e-01, -4.245e-02, -8.133e-02, -1.086e-02, -1.089e-01, -8.720e-02, 1.513e-01));
	acc += mul(s0_5, M4(8.521e-02, 1.460e-01, 1.589e-01, -2.075e-01, -5.391e-02, 7.449e-03, -6.763e-02, -2.352e-01, 4.055e-02, -1.812e-02, -1.413e-02, 9.240e-02, -3.070e-02, -4.975e-03, -8.972e-02, -2.225e-02));
	acc += mul(s0_6, M4(1.880e-01, -1.481e-01, 1.001e-01, 6.339e-02, -6.208e-02, -2.814e-02, -5.944e-03, 1.002e-01, -7.822e-02, 1.010e-01, -2.161e-02, 9.175e-02, 1.495e-02, 1.645e-02, 8.901e-03, -3.865e-02));
	acc += mul(s0_7, M4(4.449e-01, -1.089e-01, -1.249e-01, -8.911e-01, 3.096e-02, 1.724e-01, 5.605e-02, -7.605e-02, -9.644e-02, -1.191e-01, -1.332e-01, 2.544e-02, 5.659e-02, -2.706e-04, -9.886e-02, 9.218e-02));
	acc += mul(s0_8, M4(7.394e-02, -2.112e-01, 1.505e-02, -1.236e-01, -1.848e-02, -2.716e-02, -6.663e-02, 2.764e-02, -1.120e-02, 3.440e-03, -1.443e-02, 1.745e-02, -3.847e-02, -4.228e-03, -8.888e-02, 2.134e-02));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(6.588e-03, -6.764e-02, -2.660e-02, -3.967e-02, 6.459e-02, -6.345e-01, -5.784e-01, 9.294e-02, 2.426e-02, -9.858e-02, -9.036e-02, -9.545e-02, 2.094e-02, -1.001e-01, -1.145e-01, -6.470e-02));
	acc += mul(s1_1, M4(-2.633e-03, 5.849e-02, 3.154e-02, -7.386e-02, -6.412e-01, -4.405e-01, -5.885e-01, 1.657e-01, -1.757e-01, -1.882e-02, -1.023e-01, -1.713e-01, -1.047e-01, -1.558e-01, -1.509e-01, -2.815e-01));
	acc += mul(s1_2, M4(1.880e-01, -3.790e-02, 1.112e-01, 1.672e-02, -1.713e-01, 2.611e-02, -9.008e-02, 9.359e-02, -6.567e-02, 9.399e-02, 3.743e-02, 3.662e-02, 3.190e-02, -1.466e-01, -1.154e-01, 1.692e-02));
	acc += mul(s1_3, M4(-1.733e-02, 1.381e-01, 8.342e-02, -5.893e-02, -1.467e-02, -4.365e-01, -3.057e-01, 1.506e-01, 7.300e-02, 6.777e-01, -5.484e-03, -3.499e-01, 1.978e-01, -6.846e-01, -2.921e-01, -1.173e-01));
	acc += mul(s1_4, M4(-1.829e-01, -4.506e-01, -5.685e-02, 8.260e-01, 3.056e-01, 1.803e-01, 1.908e-01, -2.029e-01, -1.578e-01, 5.039e-01, 3.016e-01, -4.971e-01, -4.977e-01, 4.537e-01, -4.268e-01, 7.878e-01));
	acc += mul(s1_5, M4(-3.251e-01, -1.229e-01, -1.447e-01, 3.290e-01, -2.134e-01, -6.542e-03, -7.109e-02, -1.004e-01, 3.887e-02, -1.008e-01, -7.490e-02, 6.126e-02, 2.757e-01, -1.980e-01, -1.792e-01, 2.722e-01));
	acc += mul(s1_6, M4(4.765e-02, -5.401e-02, 4.164e-02, 1.847e-03, -3.178e-02, -4.201e-02, -2.504e-02, 1.350e-02, -1.436e-01, 1.654e-01, -1.099e-02, -3.733e-02, 1.118e-01, -2.529e-01, -1.353e-01, -9.309e-02));
	acc += mul(s1_7, M4(1.684e-01, -1.978e-01, 2.645e-02, -9.582e-02, 2.618e-02, 9.350e-02, -2.281e-02, -1.901e-01, 1.176e-02, -1.571e-01, 1.491e-02, -2.105e-01, -1.685e-01, -2.459e-01, -2.166e-01, 1.082e-01));
	acc += mul(s1_8, M4(2.225e-02, 7.813e-02, -4.112e-02, 6.166e-02, -4.143e-02, -2.160e-02, -7.478e-02, -2.251e-02, -1.306e-02, -6.002e-02, -7.496e-02, -2.538e-03, 7.824e-02, 9.597e-02, -3.546e-03, -1.794e-01));
	// Constant term, added after all products.
	acc += V4(-5.942e-03, -2.718e-02, -1.234e-02, 3.307e-02);
	return acc;
}
|
||||
|
||||
// Pass 5 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t0[gxy].
void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t0[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 6
|
||||
//!DESC conv5
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): the V4 built from O(t0, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t0 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 6 (//!DESC conv5). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass6 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(-1.069e-01, 1.009e-01, -5.972e-02, -1.732e-02, -9.217e-02, 9.177e-03, -3.127e-02, -5.872e-02, -1.364e-02, -9.990e-04, 1.518e-01, 5.861e-02, -9.835e-02, -1.155e-01, 6.714e-02, -5.142e-02));
	acc += mul(s0_1, M4(1.404e-02, 1.372e-01, -2.759e-01, -4.361e-02, -1.407e-01, 1.570e-01, -1.216e-01, -7.289e-02, 3.088e-01, -1.285e-01, 1.107e-01, 1.651e-01, 1.596e-01, -1.569e-01, 1.437e-02, -1.455e-01));
	acc += mul(s0_2, M4(-4.001e-02, 1.772e-01, -2.761e-01, 4.916e-02, -1.489e-01, 1.680e-01, -5.244e-02, 1.334e-01, 1.245e-01, -2.321e-01, 5.371e-01, -2.549e-01, -9.624e-02, -1.072e-01, 2.322e-01, -2.261e-01));
	acc += mul(s0_3, M4(-2.291e-01, 7.774e-04, -1.015e-02, 6.036e-02, -1.133e-01, 7.554e-02, 1.081e-01, 1.704e-01, 2.123e-01, -2.065e-01, 4.928e-02, 2.352e-03, -2.488e-01, -1.765e-01, 2.044e-01, 1.302e-02));
	acc += mul(s0_4, M4(3.195e-01, -5.410e-01, -4.771e-01, -1.713e-01, 2.778e-01, -1.028e-01, 8.603e-02, 2.162e-01, 1.466e-02, 2.633e-02, -3.299e-01, -5.183e-02, -3.598e-01, -4.015e-01, 5.674e-02, -1.429e-01));
	acc += mul(s0_5, M4(-1.480e-01, 2.440e-01, -2.189e-01, 1.407e-01, -3.439e-01, 2.624e-01, 4.947e-01, 7.813e-01, 1.067e-01, -6.781e-02, -5.271e-02, -1.331e-02, -2.133e-01, -1.038e-01, 4.267e-01, -4.026e-01));
	acc += mul(s0_6, M4(-1.086e-01, 2.607e-01, -1.897e-01, -1.710e-01, 6.096e-02, -1.121e-01, 8.797e-02, -8.204e-02, 4.825e-02, -9.364e-02, 8.472e-02, -1.923e-02, -1.755e-01, 1.086e-01, -3.987e-02, 1.737e-02));
	acc += mul(s0_7, M4(5.606e-02, 4.516e-02, -7.352e-02, 7.654e-02, -6.706e-02, 2.674e-01, -2.388e-01, -1.997e-01, 9.871e-02, -9.055e-02, 1.274e-01, 1.854e-01, -1.765e-01, -1.779e-01, 1.114e-01, -1.882e-01));
	acc += mul(s0_8, M4(-4.811e-02, 2.057e-01, -2.913e-01, 1.265e-01, 1.304e-01, 1.462e-01, -4.432e-03, 4.191e-01, 6.606e-02, -1.382e-01, 1.052e-01, -3.990e-01, 9.737e-02, -9.675e-02, 6.216e-02, -2.130e-01));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(-1.183e-01, -5.696e-02, 9.372e-02, 3.074e-03, -2.694e-02, -2.272e-02, -3.489e-02, -2.667e-02, 1.635e-01, -5.761e-04, -1.677e-03, -1.076e-01, -5.411e-02, -1.100e-02, 1.742e-02, 6.403e-02));
	acc += mul(s1_1, M4(-4.462e-03, 3.912e-02, -1.208e-01, -9.360e-02, -1.260e-01, 1.602e-02, -1.047e-01, -1.252e-01, 2.940e-01, 1.068e-01, -2.602e-01, 1.692e-01, 1.120e-01, -2.613e-02, -1.083e-02, 1.754e-02));
	acc += mul(s1_2, M4(2.307e-02, 1.240e-01, -2.024e-01, 1.761e-01, -2.326e-01, 3.209e-02, 5.352e-02, 3.399e-02, 1.754e-01, -3.059e-01, 4.554e-01, -2.412e-01, 4.242e-03, 3.919e-02, 7.769e-02, -1.155e-01));
	acc += mul(s1_3, M4(-1.946e-01, -9.445e-02, 1.698e-01, 1.165e-01, -1.571e-01, 1.700e-02, 5.682e-02, 4.628e-02, 4.425e-01, -1.872e-01, 3.713e-02, 8.537e-02, 4.211e-02, -6.178e-02, 1.398e-02, 5.929e-02));
	acc += mul(s1_4, M4(5.957e-01, -6.855e-01, -3.668e-01, -2.565e-01, -4.383e-02, -8.094e-02, -2.101e-02, -2.446e-01, -7.781e-02, 5.879e-01, -5.272e-01, -1.786e-01, -2.396e-01, -4.148e-01, 5.226e-02, 9.011e-02));
	acc += mul(s1_5, M4(-4.655e-02, 1.107e-01, -1.109e-01, 3.601e-01, -2.103e-01, 3.712e-01, 1.666e-01, 3.972e-01, -2.227e-02, -2.115e-02, -7.054e-02, -1.216e-01, 4.739e-03, 1.201e-01, 1.335e-01, -1.775e-01));
	acc += mul(s1_6, M4(-7.542e-02, 9.157e-02, 1.143e-02, -7.961e-02, -3.812e-02, 1.722e-02, 1.396e-02, -3.920e-02, -6.220e-03, -6.723e-02, 9.364e-02, -4.804e-02, -8.885e-02, 1.313e-01, -7.872e-02, 2.733e-02));
	acc += mul(s1_7, M4(-3.879e-01, -2.705e-01, 3.305e-01, -1.542e-01, -1.179e-01, 9.695e-02, -1.353e-01, -2.320e-01, 1.433e-02, -2.689e-01, 2.066e-01, 3.704e-01, -5.587e-02, -6.296e-02, 6.326e-02, 1.881e-02));
	acc += mul(s1_8, M4(-4.722e-02, -5.909e-02, 4.089e-02, -8.851e-02, 2.017e-01, -2.652e-02, 9.432e-02, 3.252e-01, -2.219e-01, 2.142e-02, -4.496e-02, 5.456e-02, 2.364e-02, 1.081e-01, -9.898e-02, 9.928e-02));
	// Constant term, added after all products.
	acc += V4(1.102e-03, 4.481e-03, 3.096e-03, -9.818e-03);
	return acc;
}
|
||||
|
||||
// Pass 6 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t1[gxy].
void Pass6(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t1[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 7
|
||||
//!DESC conv6
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): the V4 built from O(t1, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t1 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 7 (//!DESC conv6). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass7 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(-3.488e-02, 3.507e-02, 3.848e-02, -5.906e-02, 9.669e-02, 3.121e-02, -2.182e-02, 1.691e-01, -1.132e-01, -7.602e-02, -5.000e-02, -6.017e-03, 3.962e-02, 1.086e-01, -3.343e-04, 9.002e-02));
	acc += mul(s0_1, M4(9.453e-02, -1.793e-01, -6.074e-02, 5.317e-03, 1.056e-01, 3.460e-01, 5.291e-02, 7.825e-02, 5.510e-02, 4.818e-02, -1.119e-02, 3.913e-02, -8.177e-02, -1.060e-01, -9.989e-03, -9.245e-02));
	acc += mul(s0_2, M4(-8.190e-02, 1.375e-01, -4.322e-02, -6.721e-02, 1.645e-02, -1.392e-01, 7.103e-02, -1.950e-02, 4.302e-03, -3.213e-02, -7.517e-03, -3.406e-03, -2.132e-02, 1.333e-01, -6.553e-02, 7.300e-02));
	acc += mul(s0_3, M4(-1.102e-01, 3.005e-01, -8.521e-02, 3.002e-01, 1.866e-01, 1.089e-01, -2.968e-02, 1.271e-01, -3.566e-01, 1.224e-01, -7.462e-02, -2.765e-01, 5.175e-02, 1.567e-01, 1.450e-01, -1.948e-01));
	acc += mul(s0_4, M4(1.558e-01, 3.780e-02, 9.697e-02, -2.485e-01, -3.560e-01, -3.667e-01, 1.396e-01, 1.020e+00, -2.319e-01, -2.878e-01, -2.849e-01, 5.648e-01, 2.094e-01, -5.684e-01, 1.482e-01, -6.172e-01));
	acc += mul(s0_5, M4(-1.276e-01, -1.685e-01, 4.271e-01, -1.489e-01, 2.154e-01, 2.661e-01, -1.093e-01, -7.859e-02, 6.618e-02, 9.795e-02, 2.778e-02, -1.286e-01, -1.527e-01, -3.586e-01, 2.523e-01, 9.196e-02));
	acc += mul(s0_6, M4(-1.354e-01, -6.680e-02, 5.541e-02, -5.314e-02, 1.639e-02, -1.639e-01, -1.856e-01, -1.863e-01, -1.519e-01, -5.459e-02, 1.027e-01, 6.492e-02, 3.482e-02, -9.074e-03, 1.861e-01, 1.393e-01));
	acc += mul(s0_7, M4(1.907e-02, 1.189e-02, -5.038e-01, -8.478e-02, 3.643e-01, 1.086e-02, 3.067e-01, 1.071e-01, -6.552e-01, 1.505e-01, -7.394e-01, 1.155e-01, -1.815e-01, -1.739e-02, -2.723e-01, -1.607e-01));
	acc += mul(s0_8, M4(-8.319e-02, -2.563e-02, -1.127e-01, -7.792e-02, 1.295e-01, 1.091e-01, 2.920e-02, -5.761e-02, -9.443e-02, 7.429e-03, -2.117e-01, -3.670e-02, -7.118e-02, -4.469e-02, -6.460e-02, -1.261e-02));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(2.400e-02, -2.740e-02, -3.394e-02, 5.817e-02, -6.716e-02, -5.672e-02, -7.339e-02, -3.921e-02, -9.506e-02, -3.805e-02, -3.235e-02, -8.145e-02, 1.265e-02, 7.308e-02, -5.707e-02, 1.141e-01));
	acc += mul(s1_1, M4(-1.565e-01, 1.052e-01, -8.934e-02, -6.945e-02, 3.804e-02, 2.091e-01, -1.102e-01, 2.394e-01, 6.041e-02, -9.942e-02, -6.054e-03, 4.857e-02, -7.265e-02, 1.596e-02, 9.135e-02, -8.397e-02));
	acc += mul(s1_2, M4(-9.449e-02, 1.121e-01, -1.101e-01, -2.980e-02, 5.100e-02, -6.337e-02, 1.692e-01, -5.062e-02, -3.931e-02, 1.083e-01, 3.952e-03, 9.801e-04, -6.425e-02, 8.015e-02, -1.628e-01, 8.317e-02));
	acc += mul(s1_3, M4(7.400e-02, 8.412e-02, 2.984e-02, 8.693e-02, -1.474e-01, -3.529e-02, -6.134e-02, -1.107e-01, -3.264e-01, 8.009e-02, -2.261e-01, -1.472e-01, -4.683e-02, -1.258e-01, 1.061e-01, -1.125e-01));
	acc += mul(s1_4, M4(4.970e-01, -1.211e-01, 2.379e-01, 2.124e-01, -1.003e-01, -5.656e-01, 5.001e-02, 4.959e-01, 1.538e-01, -7.985e-01, -2.085e-01, 2.220e-01, 7.247e-02, 6.581e-02, -9.437e-02, -3.066e-01));
	acc += mul(s1_5, M4(-8.611e-02, -9.199e-02, 2.518e-01, -7.482e-02, -1.208e-01, 1.015e-01, 3.428e-02, -1.354e-01, 1.038e-01, -4.497e-02, 2.744e-01, -4.281e-02, 4.090e-02, -2.726e-01, 1.839e-01, 1.138e-01));
	acc += mul(s1_6, M4(-8.703e-02, -4.776e-02, -1.477e-01, -1.870e-02, -1.072e-01, 3.204e-02, -8.396e-03, 1.175e-01, 1.685e-01, -1.427e-01, 2.152e-01, -2.155e-01, 1.898e-03, 5.924e-02, -1.089e-02, 5.197e-02));
	acc += mul(s1_7, M4(-9.679e-02, 1.961e-02, 1.636e-01, -6.049e-02, -7.071e-03, 1.519e-01, -6.303e-01, 4.739e-02, -3.331e-01, 8.291e-02, -5.944e-01, -7.677e-02, -1.164e-01, -4.580e-02, 1.419e-01, 6.839e-02));
	acc += mul(s1_8, M4(6.174e-02, -4.004e-02, 1.256e-02, -4.981e-02, 4.659e-03, 8.371e-02, -1.664e-01, -2.897e-02, -1.253e-01, 2.381e-02, -1.147e-01, -8.724e-02, -3.736e-02, 1.140e-02, -1.550e-01, 6.350e-03));
	// Constant term, added after all products.
	acc += V4(-6.918e-03, -1.945e-03, -7.751e-03, 1.645e-02);
	return acc;
}
|
||||
|
||||
// Pass 7 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t0[gxy].
void Pass7(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t0[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 8
|
||||
//!DESC conv7
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): the V4 built from O(t0, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t0 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 8 (//!DESC conv7). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass8 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(-4.116e-02, -3.385e-02, -4.697e-02, 4.650e-02, -3.488e-02, 1.006e-01, -4.538e-03, 4.637e-02, 1.288e-01, 7.769e-03, 1.150e-01, -7.930e-03, 1.045e-02, 4.849e-02, 2.767e-02, 4.909e-02));
	acc += mul(s0_1, M4(-5.332e-01, -5.254e-01, -3.541e-01, -3.525e-01, 1.117e-02, 2.929e-02, 6.817e-02, 9.115e-02, 1.055e+00, 6.141e-02, 3.976e-01, 4.649e-02, 2.561e-01, -1.191e-01, 1.230e-03, -1.047e-01));
	acc += mul(s0_2, M4(-1.450e-01, -2.829e-01, -6.857e-01, -4.294e-01, 3.217e-02, 2.745e-02, 5.242e-02, 3.556e-02, 1.284e-01, 4.292e-01, 7.161e-01, 2.220e-01, -1.508e-02, 1.802e-01, 1.842e-01, 9.827e-02));
	acc += mul(s0_3, M4(1.381e-01, 5.690e-02, 5.107e-02, 6.625e-02, -1.173e-01, -7.448e-02, -1.152e-01, -1.808e-01, -1.470e-01, -1.833e-01, -1.653e-01, -1.217e-01, 9.096e-02, 5.579e-02, 1.128e-02, 9.791e-02));
	acc += mul(s0_4, M4(5.936e-01, -2.579e-01, 5.761e-01, -7.051e-01, -7.023e-01, 2.824e-01, 2.057e-01, 3.628e-01, 1.006e-02, 3.209e-01, 6.969e-02, -3.464e-01, 4.768e-01, -3.194e-01, -4.817e-02, 3.050e-02));
	acc += mul(s0_5, M4(2.145e-01, -1.899e-01, 1.446e-01, 2.497e-02, -8.750e-02, -3.154e-01, -5.060e-01, -7.413e-02, -8.542e-02, -4.198e-02, -1.528e-01, -1.812e-01, -2.597e-01, 8.374e-02, -5.592e-01, -2.557e-01));
	acc += mul(s0_6, M4(5.713e-02, -4.294e-03, 2.388e-02, -7.124e-02, -2.163e-02, -3.642e-03, 3.839e-02, -6.934e-02, -9.052e-02, -1.153e-02, 1.213e-02, 7.120e-02, -3.698e-02, 4.260e-02, -7.245e-02, 7.898e-02));
	acc += mul(s0_7, M4(2.780e-02, 1.944e-02, 1.415e-01, 1.216e-01, 9.163e-02, -3.069e-02, -1.829e-02, -2.182e-01, 5.815e-02, -1.923e-02, -5.934e-02, -3.487e-02, -1.082e-01, 1.362e-01, 8.120e-02, 2.621e-01));
	acc += mul(s0_8, M4(9.334e-03, -1.300e-02, 4.936e-02, 1.751e-01, -1.214e-01, 1.629e-02, -1.131e-01, 7.402e-02, 1.134e-02, 1.663e-03, -5.887e-03, -8.862e-02, 1.029e-01, -5.629e-02, 9.127e-02, -6.668e-02));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(1.165e-02, 4.389e-02, 6.299e-03, 7.939e-02, -2.769e-02, 9.353e-02, 6.239e-02, 1.341e-02, 4.713e-02, -2.731e-03, 5.256e-02, -3.515e-02, -8.911e-02, -1.425e-01, -7.889e-02, -1.627e-01));
	acc += mul(s1_1, M4(-2.869e-01, 2.838e-02, -2.541e-02, 5.216e-02, 2.660e-01, -2.095e-01, 1.375e-01, -2.562e-02, 2.715e-01, 1.694e-01, 9.471e-02, -7.292e-03, 3.257e-01, -2.247e-01, 7.698e-03, -2.076e-01));
	acc += mul(s1_2, M4(1.832e-02, -1.860e-01, -4.951e-02, -1.392e-03, 9.307e-02, 7.671e-02, 1.043e-01, -3.675e-02, 1.433e-03, 1.219e-01, 1.978e-01, 5.960e-02, 9.624e-02, 1.448e-01, 3.561e-01, 3.054e-02));
	acc += mul(s1_3, M4(-4.647e-02, 4.225e-03, 3.830e-02, -3.233e-02, -1.532e-01, -6.289e-01, -3.037e-01, -4.131e-01, -1.794e-01, -4.090e-02, -9.644e-02, -4.828e-02, 7.978e-02, 6.792e-03, 5.043e-02, 4.905e-02));
	acc += mul(s1_4, M4(2.967e-01, -5.750e-02, 1.168e-01, -2.681e-02, 1.232e-01, -2.481e-03, 8.164e-01, 2.468e-01, -3.721e-01, -5.041e-02, -4.796e-01, -2.778e-02, 3.623e-01, -8.387e-01, -5.229e-01, -4.492e-01));
	acc += mul(s1_5, M4(-1.507e-02, 1.343e-01, 8.567e-02, 8.923e-02, -2.766e-03, -1.548e-01, -2.588e-01, -1.295e-01, 1.777e-02, -9.243e-02, -4.495e-02, -5.528e-02, -1.071e-01, -1.284e-01, -4.142e-01, -1.800e-01));
	acc += mul(s1_6, M4(-4.695e-03, -9.431e-04, -1.256e-02, -4.959e-03, 1.607e-01, -8.763e-02, 2.039e-01, -1.243e-01, 3.725e-02, -5.612e-02, -3.615e-03, -2.475e-02, 4.955e-02, 4.065e-02, -1.879e-02, -1.195e-01));
	acc += mul(s1_7, M4(-1.039e-02, 5.631e-02, 2.655e-02, 7.419e-02, 1.286e-01, -6.430e-02, 4.800e-02, -4.480e-02, 5.067e-03, -4.197e-02, -3.342e-02, -7.461e-02, -3.225e-02, 6.062e-03, 5.391e-02, -7.135e-02));
	acc += mul(s1_8, M4(5.195e-02, 3.799e-02, 1.130e-01, -8.811e-03, -4.285e-02, 1.609e-02, -8.972e-03, 3.530e-02, -6.932e-02, -3.013e-03, -5.208e-02, 5.823e-02, -4.561e-02, -1.068e-01, -1.458e-01, -5.739e-02));
	// Constant term, added after all products.
	acc += V4(1.569e-02, 1.505e-02, 2.765e-02, 1.258e-02);
	return acc;
}
|
||||
|
||||
// Pass 8 entry point. Works on the pixel gxy = Rmp8x8(tid.x) + blockStart:
// takes nine l0() samples at offsets (-1,-1) .. (1,1), splits each one into
// its non-negative and non-positive part, and stores f0() of those eighteen
// vectors in t1[gxy].
void Pass8(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	// Threads that land outside the input have nothing to write.
	if (any(gxy >= size)) {
		return;
	}
	// NOTE(review): pos and pt are not referenced directly below; presumably
	// the O() macro behind l0() picks them up by name - confirm before
	// renaming or removing either of them.
	float2 pos = (gxy + 0.5) * pt;

	// Raw samples, row by row: c0..c2 at y = -1, c3..c5 at y = 0, c6..c8 at y = 1.
	V4 c0 = l0(-1.0, -1.0);
	V4 c1 = l0(0.0, -1.0);
	V4 c2 = l0(1.0, -1.0);
	V4 c3 = l0(-1.0, 0.0);
	V4 c4 = l0(0.0, 0.0);
	V4 c5 = l0(1.0, 0.0);
	V4 c6 = l0(-1.0, 1.0);
	V4 c7 = l0(0.0, 1.0);
	V4 c8 = l0(1.0, 1.0);

	// First nine arguments are max(c, 0); the last nine are -max(-c, 0).
	t1[gxy] = f0(
		max(c0, 0.0), max(c1, 0.0), max(c2, 0.0),
		max(c3, 0.0), max(c4, 0.0), max(c5, 0.0),
		max(c6, 0.0), max(c7, 0.0), max(c8, 0.0),
		-max(-c0, 0.0), -max(-c1, 0.0), -max(-c2, 0.0),
		-max(-c3, 0.0), -max(-c4, 0.0), -max(-c5, 0.0),
		-max(-c6, 0.0), -max(-c7, 0.0), -max(-c8, 0.0));
}
|
||||
|
||||
//!PASS 9
|
||||
//!DESC conv8
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): the V4 built from O(t1, float2(x, y)). O() is defined outside this
// part of the file; presumably it fetches t1 at offset (x, y) from the current
// pixel - confirm against its definition.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 9 (//!DESC conv8). Each of the eighteen inputs is
// multiplied by its own M4 constant via mul(), the products are accumulated in
// argument order, and a constant V4 term is added last.
// Pass9 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0), where
// k = 0..8 runs over the offsets (-1,-1) .. (1,1) row by row.
// The order of the additions is kept as-is: reordering floating-point sums
// could change the rounded result.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;
	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(4.724e-03, 6.987e-03, -3.797e-03, 2.147e-02, -5.616e-03, 1.123e-02, -2.768e-02, 8.185e-03, -4.051e-03, 5.608e-05, -9.522e-02, 2.924e-02, -5.976e-03, 8.331e-03, 7.513e-02, -2.513e-02));
	acc += mul(s0_1, M4(-2.224e-02, 1.093e-04, 5.901e-02, 2.350e-02, 1.167e-01, -7.837e-02, 1.939e-01, 1.987e-01, 5.530e-02, -4.759e-05, 1.221e-01, 4.764e-02, -8.813e-02, 7.695e-02, -4.577e-01, 1.671e-02));
	acc += mul(s0_2, M4(-5.696e-02, 6.005e-03, -5.620e-02, -8.978e-02, 4.014e-02, -3.822e-02, 1.081e-01, -6.532e-03, 9.444e-03, 7.498e-03, -3.228e-02, 4.908e-02, -2.043e-02, 2.374e-02, 2.163e-02, -4.505e-02));
	acc += mul(s0_3, M4(-8.098e-02, 1.943e-02, -5.744e-02, 3.824e-02, -2.071e-01, 1.036e-01, -6.926e-02, -2.348e-01, 2.378e-01, -1.069e-01, -5.307e-02, 1.161e-01, 1.881e-01, -5.785e-02, -6.570e-02, 2.227e-01));
	acc += mul(s0_4, M4(7.577e-02, -4.125e-02, 1.714e-01, -6.934e-01, -2.448e-01, 1.146e-01, 2.354e-01, -4.935e-01, -2.321e-01, -8.273e-02, 5.890e-02, 5.704e-01, 4.833e-02, 2.875e-02, 1.163e-01, -1.802e-01));
	acc += mul(s0_5, M4(2.287e-01, -3.461e-02, -2.542e-02, 2.882e-02, 7.142e-02, -1.556e-01, 4.055e-02, 1.534e-02, -1.647e-01, 3.087e-03, -6.811e-02, -3.896e-02, 1.334e-01, 1.188e-01, -1.847e-01, 4.293e-02));
	acc += mul(s0_6, M4(-3.094e-02, 2.712e-03, 3.387e-03, 1.877e-02, 9.494e-02, -2.863e-02, -4.239e-02, -3.402e-02, 5.541e-03, -1.178e-02, 1.795e-02, -3.515e-02, -3.044e-02, -2.463e-02, -1.320e-02, 8.952e-02));
	acc += mul(s0_7, M4(1.035e-01, -3.181e-02, 1.902e-02, 3.973e-03, 2.267e-01, -2.620e-01, 1.821e-01, 1.631e-01, 1.494e-02, 6.125e-02, -6.176e-02, -2.497e-02, -1.364e-02, 7.542e-02, -8.480e-02, -4.648e-02));
	acc += mul(s0_8, M4(-1.466e-01, 3.028e-02, 2.798e-02, -7.887e-02, -4.370e-02, 1.408e-02, -6.161e-02, -3.034e-02, 6.567e-02, 2.071e-02, 3.126e-02, 6.993e-02, -5.556e-02, 1.507e-02, 2.991e-02, -4.924e-02));
	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(1.637e-02, -2.767e-02, 8.568e-02, -4.254e-02, 3.215e-02, 1.987e-04, -3.697e-02, 3.787e-02, 2.236e-02, -6.576e-02, 7.400e-02, 1.093e-01, 3.271e-03, 1.809e-03, 1.011e-02, 1.509e-01));
	acc += mul(s1_1, M4(5.538e-02, -5.865e-02, 4.351e-01, 2.494e-01, 1.101e-01, -1.484e-02, 5.176e-01, 3.999e-02, -4.782e-03, 1.155e-01, -2.099e-01, 5.012e-03, -1.919e-01, 2.292e-01, -5.378e-01, -1.223e-01));
	acc += mul(s1_2, M4(-5.691e-02, 7.653e-02, -2.572e-01, -1.332e-01, -5.652e-02, -5.008e-02, 7.840e-02, -3.729e-02, 6.942e-02, 6.483e-04, -2.243e-05, 8.430e-02, -6.848e-02, -2.096e-02, -3.908e-02, -9.062e-02));
	acc += mul(s1_3, M4(2.725e-01, -1.841e-01, -6.710e-03, 3.965e-01, -1.298e-01, -4.014e-03, 2.007e-01, -3.700e-01, 5.329e-01, -4.014e-01, 2.619e-02, 1.606e-01, 2.179e-01, -1.403e-01, 4.227e-02, 8.568e-02));
	acc += mul(s1_4, M4(4.188e-01, -7.320e-01, -5.609e-01, -6.087e-01, -7.521e-01, 7.363e-01, -6.253e-01, -2.011e-01, -1.017e+00, 3.331e-02, -2.135e-02, 2.084e-01, 6.074e-01, -9.824e-01, 5.154e-01, 1.748e-01));
	acc += mul(s1_5, M4(1.733e-01, 5.176e-01, -7.335e-02, 1.899e-02, 1.028e-01, -6.330e-02, -1.632e-01, 9.241e-05, -1.357e-01, -1.131e-01, 9.644e-02, -1.424e-02, -1.835e-02, 7.296e-01, -3.204e-01, -2.966e-02));
	acc += mul(s1_6, M4(4.798e-02, -1.047e-01, 3.646e-02, 6.703e-02, 5.371e-02, 4.759e-02, -2.975e-02, -6.945e-02, 7.985e-02, -1.101e-01, 3.034e-02, -1.472e-02, -3.827e-02, 9.839e-03, -4.922e-03, 4.307e-03));
	acc += mul(s1_7, M4(-8.950e-02, -1.253e-02, -1.730e-05, 3.862e-02, 2.692e-01, -4.645e-01, 2.399e-01, 2.744e-01, -4.503e-02, 1.724e-01, -7.935e-02, -5.200e-02, -2.132e-03, -1.926e-02, 2.926e-02, -2.288e-02));
	acc += mul(s1_8, M4(-1.072e-01, -1.145e-02, 6.605e-03, -1.090e-01, -7.524e-03, 8.598e-02, -7.698e-02, -6.976e-02, 5.869e-02, -5.499e-02, 3.529e-02, 7.813e-02, -1.794e-01, 4.212e-02, -4.479e-03, -7.253e-02));
	// Constant term, added after all products.
	acc += V4(8.112e-05, 3.290e-03, -6.342e-04, 1.340e-02);
	return acc;
}
|
||||
|
||||
// Pass 9 entry point: one thread handles one texel. It gathers the 3x3
// neighbourhood through l0(), splits every sample into its non-negative and
// non-positive parts, and stores f0(...) of those 18 vectors into t0.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass9(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` must keep these exact names; they are not
	// used directly below, so the l0() sampling macro (defined outside this
	// function) presumably reads them from the enclosing scope - confirm.
	float2 pt = float2(GetInputPt());

	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	V4 plus[9];		// max(sample, 0)
	V4 minus[9];	// -max(-sample, 0)
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		const V4 tap = l0(kTaps[i].x, kTaps[i].y);
		minus[i] = -max(-tap, 0.0);
		plus[i] = max(tap, 0.0);
	}

	t0[texel] = f0(
		plus[0], plus[1], plus[2], plus[3], plus[4], plus[5], plus[6], plus[7], plus[8],
		minus[0], minus[1], minus[2], minus[3], minus[4], minus[5], minus[6], minus[7], minus[8]);
}
|
||||
|
||||
//!PASS 10
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
// l0(x, y): samples t0 through the O() macro at offset float2(x, y) and
// converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Output function of Pass 10.
// Each of the 18 input vectors is multiplied by its own constant 4x4 matrix
// with mul(); the products are accumulated in the order written, a constant
// bias vector is added, and tanh() of the sum is returned.
// Pass10 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0) for
// the nine taps k = 0..8.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-1.348e-01, -9.107e-02, -4.849e-02, 4.484e-04, -3.384e-02, -6.768e-02, -9.628e-03, -1.766e-02, -9.939e-03, -2.182e-02, -1.288e-02, 8.518e-03, 2.218e-02, -1.184e-03, 1.240e-03, 1.065e-02));
|
||||
r += mul(s0_1, M4(7.301e-02, 1.014e-01, -1.363e-02, -4.850e-02, -2.842e-01, -3.060e-02, -4.154e-02, 4.057e-03, -3.458e-02, -6.335e-02, -2.660e-02, -1.335e-02, -7.944e-03, 8.560e-03, 4.588e-02, 7.580e-03));
|
||||
r += mul(s0_2, M4(-9.349e-05, -2.142e-02, -1.258e-04, -9.330e-03, -5.058e-03, 3.912e-02, -2.976e-02, 2.410e-02, -8.512e-03, 4.954e-02, -2.093e-02, -2.582e-03, 1.648e-02, 7.942e-03, 1.520e-02, 3.414e-02));
|
||||
r += mul(s0_3, M4(-1.021e-01, -1.140e-01, 2.412e-01, 1.289e-02, -1.192e-01, -1.140e-01, 2.395e-01, 6.930e-03, -2.027e-01, -4.824e-02, 1.243e-01, 3.820e-03, 9.280e-03, 2.866e-02, 2.106e-02, 3.644e-03));
|
||||
r += mul(s0_4, M4(8.727e-02, 8.162e-02, 1.478e-01, 5.348e-01, -3.115e-01, -2.605e-02, 1.510e-01, 7.249e-01, -8.110e-02, -6.698e-01, 1.080e-01, -8.090e-02, -3.492e-01, -1.891e-01, -1.877e-01, -1.319e-01));
|
||||
r += mul(s0_5, M4(-5.622e-03, 2.237e-02, -4.008e-03, -1.980e-02, -1.837e-02, -3.311e-02, 4.289e-02, 3.256e-02, -2.178e-02, 2.653e-02, -1.722e-03, 8.373e-02, -8.042e-02, -2.962e-01, -4.643e-03, -6.865e-02));
|
||||
r += mul(s0_6, M4(6.248e-03, -2.320e-02, 1.883e-03, -1.430e-02, 1.224e-02, 5.634e-03, -1.964e-02, -1.627e-02, 2.010e-02, 1.174e-02, -3.919e-02, 9.559e-04, 3.016e-02, -2.836e-03, 7.667e-02, 3.552e-02));
|
||||
r += mul(s0_7, M4(-6.141e-03, 1.380e-02, 1.024e-02, -1.210e-02, 4.548e-02, 3.626e-02, -9.142e-02, -7.666e-02, -3.241e-02, -2.296e-02, -3.244e-02, -2.870e-01, 4.427e-02, 8.899e-02, -1.327e-01, 4.920e-02));
|
||||
r += mul(s0_8, M4(-2.801e-03, -9.930e-04, -2.770e-03, 1.623e-02, 2.158e-03, -1.258e-02, -3.089e-02, 3.211e-02, -9.620e-03, 1.776e-02, -4.337e-03, 4.676e-02, 1.130e-02, -7.436e-03, -3.572e-02, -1.742e-01));
|
||||
r += mul(s1_0, M4(-4.306e-02, -6.039e-02, -1.642e-02, -1.966e-02, -5.996e-02, -1.743e-01, -3.128e-02, 1.714e-02, -4.357e-03, -7.720e-03, -4.532e-03, 4.571e-03, 3.988e-02, 2.067e-02, 1.548e-02, -2.964e-04));
|
||||
r += mul(s1_1, M4(-6.070e-02, -9.324e-02, 7.472e-03, 2.173e-02, -7.996e-02, -5.139e-02, -5.545e-02, -1.891e-02, -1.767e-02, -1.527e-02, -2.906e-02, 1.310e-02, 2.594e-02, 7.495e-02, -7.681e-03, -4.678e-03));
|
||||
r += mul(s1_2, M4(4.883e-02, -3.167e-02, 2.862e-02, 3.357e-02, -8.454e-03, 9.265e-03, -1.657e-02, -8.086e-03, -1.170e-02, -3.549e-02, 7.437e-03, 1.425e-02, 1.441e-02, -1.961e-02, 1.560e-02, -1.122e-02));
|
||||
r += mul(s1_3, M4(-6.156e-02, -6.763e-02, 1.987e-01, 2.459e-02, -5.710e-02, -2.009e-01, 4.581e-01, -1.181e-02, -9.054e-02, -5.658e-02, 3.432e-02, 2.004e-02, 6.965e-03, -1.655e-02, 5.178e-03, -1.236e-02));
|
||||
r += mul(s1_4, M4(-1.301e-01, -5.093e-02, 7.676e-01, 6.003e-01, -9.216e-02, -2.228e-03, 7.034e-02, 1.851e-01, -5.318e-01, -1.852e-01, -2.980e-02, 1.919e-02, -8.147e-01, -1.773e-01, -2.675e-01, 2.314e-02));
|
||||
r += mul(s1_5, M4(-9.962e-03, -1.888e-01, -1.877e-02, 1.190e-01, -2.283e-02, -1.241e-02, 2.969e-04, 3.894e-02, -5.077e-02, 2.300e-01, -6.192e-02, 1.793e-01, 5.384e-03, -4.378e-01, 2.970e-02, -1.125e-01));
|
||||
r += mul(s1_6, M4(1.376e-02, -2.891e-03, 9.292e-03, -1.288e-03, 2.615e-02, 2.656e-02, -8.111e-02, -1.779e-02, -7.512e-03, 9.174e-03, -4.553e-02, -1.139e-02, 2.259e-02, 4.351e-03, 4.963e-02, 2.002e-02));
|
||||
r += mul(s1_7, M4(9.993e-03, 1.250e-02, -2.090e-02, 6.839e-03, 3.502e-03, 2.070e-03, -5.530e-02, -2.855e-03, 1.144e-02, -4.191e-02, -8.395e-02, -2.056e-01, 6.909e-02, 7.425e-02, -2.374e-01, 8.636e-02));
|
||||
r += mul(s1_8, M4(-1.762e-02, -3.804e-04, 2.643e-02, 4.383e-02, 1.748e-03, -1.201e-02, -8.452e-03, -1.216e-02, -1.203e-02, -3.454e-02, -1.957e-02, 2.212e-01, -1.375e-02, -1.094e-02, -1.245e-02, -2.124e-01));
|
||||
// Constant bias term.
r += V4(3.107e-03, 3.655e-03, 5.416e-04, 5.397e-04);
|
||||
// tanh keeps every component strictly inside (-1, 1).
return tanh(r);
|
||||
}
|
||||
|
||||
// Pass 10 entry point ("out-shuffle"): one thread produces a 2x2 quad of
// OUTPUT pixels. f0() is evaluated once on the 3x3 neighbourhood of t0; each
// of its four components is then added to the Y channel of one quad pixel,
// whose base colour is INPUT sampled with the SL sampler.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass10(uint2 blockStart, uint3 tid) {
	// NOTE(review): `pt` and `pos` must keep these exact names; the O() macro
	// behind l0() is defined outside this excerpt and presumably reads them
	// from the enclosing scope - confirm.
	float2 pt = float2(GetInputPt());

	// Top-left pixel of the 2x2 output quad handled by this thread.
	uint2 dst = (Rmp8x8(tid.x) << 1) + blockStart;
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}

	// Centre of the source texel (dst >> 1) that the quad is derived from.
	float2 pos = ((dst >> 1) + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	V4 plus[9];		// max(sample, 0)
	V4 minus[9];	// -max(-sample, 0)
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		const V4 tap = l0(kTaps[i].x, kTaps[i].y);
		minus[i] = -max(-tap, 0.0);
		plus[i] = max(tap, 0.0);
	}

	const V4 delta = f0(
		plus[0], plus[1], plus[2], plus[3], plus[4], plus[5], plus[6], plus[7], plus[8],
		minus[0], minus[1], minus[2], minus[3], minus[4], minus[5], minus[6], minus[7], minus[8]);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	const float2 outPt = float2(GetOutputPt());

	// The quad is walked (0,0) -> (1,0) -> (1,1) -> (0,1). `pos` is stepped
	// incrementally on purpose so the sampling coordinates stay bit-identical.
	float3 yuv;

	// (0, 0): uses delta.x
	pos -= 0.5f * outPt;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.x), yuv.yz)), 1);

	// (1, 0): uses delta.y
	dst.x += 1;
	pos.x += outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.y), yuv.yz)), 1);

	// (1, 1): uses delta.w
	dst.y += 1;
	pos.y += outPt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.w), yuv.yz)), 1);

	// (0, 1): uses delta.z
	dst.x -= 1;
	pos.x -= outPt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv.r + delta.z), yuv.yz)), 1);
}
|
||||
778
src/Effects/CuNNy/CuNNy-8x4C-NVL.hlsl
Normal file
778
src/Effects/CuNNy/CuNNy-8x4C-NVL.hlsl
Normal file
|
|
@ -0,0 +1,778 @@
|
|||
// CuNNy 8x4C BILINEAR RGB NVL - https://github.com/cunnyplapper/CuNNy
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME CuNNy-D04N08
|
||||
|
||||
// INPUT: the texture Pass 1 declares as its //!IN and samples.
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
// OUTPUT: declared at twice the INPUT width and twice the INPUT height.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
// SP: point-filtered sampler; this is the sampler the O() macro uses.
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState SP;
|
||||
|
||||
// SL: linear-filtered sampler.
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState SL;
|
||||
|
||||
//!COMMON
|
||||
// O(t, p): samples texture t with the point sampler SP at mip level 0, at
// `pos + p * pt`. `pos` and `pt` are not macro parameters: they must be
// variables with exactly those names in scope wherever the macro expands.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
|
||||
// V4: shorthand for a 4-component minimum-16-bit float vector.
#define V4 min16float4
|
||||
// M4: shorthand for a 4x4 minimum-16-bit float matrix.
#define M4 min16float4x4
|
||||
|
||||
// t0 and t1: two textures of INPUT size in R8G8B8A8_SNORM format. The passes
// alternate between them: Pass 1 writes t0, Pass 2 reads t0 and writes t1,
// Pass 3 reads t1 and writes t0, and so on.
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t0;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
//!FORMAT R8G8B8A8_SNORM
|
||||
Texture2D t1;
|
||||
|
||||
//!PASS 1
|
||||
//!DESC in
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): samples INPUT through O() at offset float2(x, y), reduces the RGB
// value to one scalar with a fixed weighted dot product, adds a constant
// offset (-8.258e-01) and converts the result to min16float.
#define l0(x, y) min16float((dot(float3(2.666e-01, 5.050e-01, 1.135e-01), O(INPUT, float2(x, y)).rgb) + -8.258e-01))
|
||||
|
||||
// Input function of Pass 1.
// Each of the nine scalar inputs s0_0..s0_8 (the 3x3 taps gathered by Pass1)
// scales its own constant 4-component vector; the scaled vectors are summed
// in the order written, a constant bias vector is added, and the sum is
// returned.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
|
||||
V4 r = 0.0;
|
||||
r += V4(-2.544e-02, -4.130e-01, -2.634e-01, 2.417e-02) * s0_0;
|
||||
r += V4(1.256e-02, -8.013e-02, 9.539e-02, -7.111e-02) * s0_1;
|
||||
r += V4(1.768e-02, -2.469e-01, -1.627e-01, 8.569e-02) * s0_2;
|
||||
r += V4(-1.554e-01, 3.441e-02, -1.508e-01, 2.491e-02) * s0_3;
|
||||
r += V4(1.628e-01, 8.679e-01, -1.960e-02, -5.810e-01) * s0_4;
|
||||
r += V4(-1.237e-02, -1.704e-01, 2.915e-01, -5.922e-01) * s0_5;
|
||||
r += V4(7.925e-01, 5.570e-03, 7.074e-02, 4.442e-04) * s0_6;
|
||||
r += V4(-7.910e-01, -1.530e-02, -8.229e-02, 3.149e-03) * s0_7;
|
||||
r += V4(-3.973e-03, 2.262e-02, -1.213e-01, 3.843e-02) * s0_8;
|
||||
// Constant bias term.
r += V4(-8.495e-04, -1.121e-04, 1.842e-02, 5.844e-02);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 1 entry point ("in"): one thread handles one texel. It gathers nine
// scalar taps from the 3x3 neighbourhood through l0() and stores f0(...) of
// them into t0.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro behind l0()
	// reads them from the enclosing scope.
	float2 pt = float2(GetInputPt());

	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	min16float tap[9];
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		tap[i] = l0(kTaps[i].x, kTaps[i].y);
	}

	t0[texel] = f0(tap[0], tap[1], tap[2], tap[3], tap[4], tap[5], tap[6], tap[7], tap[8]);
}
|
||||
|
||||
//!PASS 2
|
||||
//!DESC conv1
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): samples t0 (this pass's //!IN) through O() at offset
// float2(x, y) and converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Function of Pass 2 ("conv1").
// Each of the 18 input vectors is multiplied by its own constant 4x4 matrix
// with mul(); the products are accumulated in the order written, a constant
// bias vector is added, and the sum is returned without further processing.
// Pass2 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0) for
// the nine taps k = 0..8.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(4.254e-02, 1.997e-01, 4.636e-02, -4.800e-02, 2.043e-01, -4.096e-02, -7.212e-02, 1.408e-02, -3.916e-01, 2.630e-03, 7.016e-02, 9.613e-02, 1.773e-01, -2.723e-01, -9.458e-02, -1.890e-01));
|
||||
r += mul(s0_1, M4(2.350e-01, -8.474e-01, -4.044e-01, -9.188e-01, 9.560e-03, 5.061e-02, 1.092e-02, 1.781e-01, -2.144e-01, 3.203e-02, 6.349e-02, -8.272e-02, -3.105e-01, -3.917e-02, -1.320e-02, -1.541e-01));
|
||||
r += mul(s0_2, M4(-8.130e-01, -1.003e-01, 8.195e-02, -7.597e-01, 5.207e-02, 3.470e-02, -8.823e-03, -1.131e-01, -4.029e-02, 7.571e-02, -2.010e-01, 2.487e-01, 1.677e-01, -5.118e-02, -1.070e-01, 7.606e-02));
|
||||
r += mul(s0_3, M4(-1.158e-02, 4.898e-02, 1.202e-02, 5.012e-01, -5.343e-02, 4.756e-02, -2.438e-01, 6.399e-02, 2.822e-01, -2.863e-02, 1.996e-01, -7.099e-02, -1.323e-01, -3.797e-01, 5.385e-02, -1.014e-01));
|
||||
r += mul(s0_4, M4(2.812e-01, 7.903e-01, -1.733e-01, 6.668e-01, 4.775e-01, 5.452e-01, 7.089e-01, -1.851e-01, -2.382e-01, -5.180e-02, -3.623e-01, -3.040e-01, -4.313e-01, -1.167e-02, 1.235e-01, 1.436e-01));
|
||||
r += mul(s0_5, M4(-1.291e-01, -3.022e-02, -4.083e-01, -5.939e-02, -4.249e-01, -1.750e-01, 1.094e-01, -1.176e-01, 1.374e-02, 1.342e-01, 2.086e-01, 2.841e-01, 2.347e-01, 1.450e-01, 7.604e-02, 2.176e-01));
|
||||
r += mul(s0_6, M4(8.130e-02, -7.215e-02, -5.249e-02, 9.518e-03, -1.979e-01, -4.441e-02, -1.857e-01, -4.227e-01, 2.149e-01, -1.610e-01, 1.655e-01, -8.841e-02, 1.409e-01, -1.059e-01, 2.037e-01, -2.744e-03));
|
||||
r += mul(s0_7, M4(-7.266e-02, 1.638e-02, -1.639e-01, 1.957e-02, -2.857e-01, 1.936e-01, -1.243e-01, -1.490e-01, 1.525e-01, -8.934e-02, 7.415e-02, -1.779e-01, 1.648e-02, -6.456e-02, 7.053e-02, -9.530e-02));
|
||||
r += mul(s0_8, M4(-6.960e-02, -8.960e-02, -1.757e-02, -1.370e-01, -5.137e-01, -1.179e-01, -4.053e-01, -1.987e-01, 7.100e-02, 2.928e-02, -9.682e-02, 2.403e-01, 1.814e-01, 2.131e-02, 5.579e-02, 5.457e-02));
|
||||
r += mul(s1_0, M4(-2.737e-02, 5.272e-02, -1.801e-02, -2.491e-01, 2.871e-01, -3.704e-02, -6.568e-02, 2.905e-02, 1.011e-01, -3.782e-01, -8.696e-02, 4.682e-01, 3.233e-01, -3.060e-01, -3.251e-02, 1.165e+00));
|
||||
r += mul(s1_1, M4(-4.994e-01, 3.049e-02, -8.802e-02, -6.179e-02, 7.133e-02, -1.957e-02, -4.465e-02, 1.130e-01, 7.255e-02, 6.956e-03, -1.204e-01, 3.699e-01, -8.844e-02, 4.624e-01, -9.881e-02, -2.512e-01));
|
||||
r += mul(s1_2, M4(-3.645e-01, 1.274e-01, 2.387e-01, -1.963e-01, -5.995e-02, -5.943e-02, 9.694e-02, -2.518e-01, -2.797e-01, 1.598e-01, -1.371e-02, 4.000e-01, 2.213e-01, 9.692e-02, -3.302e-01, 1.132e+00));
|
||||
r += mul(s1_3, M4(-8.539e-03, -6.535e-02, 5.575e-02, 1.928e-01, 1.156e-01, 5.227e-02, -3.039e-01, 4.794e-01, 1.441e-01, 1.929e-01, -4.689e-02, 2.023e-02, 1.330e-01, -1.358e+00, -5.393e-01, 7.907e-01));
|
||||
r += mul(s1_4, M4(1.701e-01, -3.479e-02, 5.404e-01, -2.491e-01, 4.564e-01, 6.659e-01, 7.009e-01, -2.288e-02, -7.696e-01, -4.959e-01, 2.881e-01, -4.322e-01, -9.013e-01, -4.765e-01, 5.556e-02, -1.805e-01));
|
||||
r += mul(s1_5, M4(-2.424e-01, 8.034e-03, -4.699e-02, -2.628e-01, -4.682e-01, 2.977e-02, 2.258e-01, -1.419e-01, 3.514e-01, 6.860e-03, 2.147e-01, 3.806e-01, 3.747e-01, 1.403e-01, 3.106e-01, 9.680e-01));
|
||||
r += mul(s1_6, M4(1.776e-01, -4.873e-02, -1.403e-01, -1.817e-02, -3.551e-01, 4.838e-04, -2.786e-01, -6.048e-01, 3.082e-01, -4.703e-01, 2.419e-01, -3.002e-01, -4.310e-01, -6.490e-01, 1.343e+00, -1.019e+00));
|
||||
r += mul(s1_7, M4(4.689e-02, -2.927e-02, -7.494e-02, -3.516e-02, -2.217e-01, -3.189e-01, 2.202e-01, -2.936e-01, 4.772e-02, -1.609e-01, 9.853e-02, -4.214e-01, 2.780e-01, -1.073e-01, 1.102e-01, -2.033e-01));
|
||||
r += mul(s1_8, M4(-9.468e-02, 4.428e-02, 1.269e-01, -1.086e-01, -1.106e-01, -1.367e-01, -3.356e-01, 4.656e-03, 4.648e-02, -1.743e-02, -2.074e-01, -3.745e-02, 1.281e-01, -3.233e-01, 6.533e-01, 3.705e-01));
|
||||
// Constant bias term.
r += V4(1.016e-03, 5.583e-03, -1.608e-02, -1.996e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 2 entry point ("conv1"): one thread handles one texel. It gathers the
// 3x3 neighbourhood of t0 through l0(), splits every sample into its
// non-negative and non-positive parts, and stores f0(...) of those 18
// vectors into t1.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro behind l0()
	// reads them from the enclosing scope.
	float2 pt = float2(GetInputPt());

	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	V4 plus[9];		// max(sample, 0)
	V4 minus[9];	// -max(-sample, 0)
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		const V4 tap = l0(kTaps[i].x, kTaps[i].y);
		minus[i] = -max(-tap, 0.0);
		plus[i] = max(tap, 0.0);
	}

	t1[texel] = f0(
		plus[0], plus[1], plus[2], plus[3], plus[4], plus[5], plus[6], plus[7], plus[8],
		minus[0], minus[1], minus[2], minus[3], minus[4], minus[5], minus[6], minus[7], minus[8]);
}
|
||||
|
||||
//!PASS 3
|
||||
//!DESC conv2
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): samples t1 (this pass's //!IN) through O() at offset
// float2(x, y) and converts the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Function of Pass 3 ("conv2").
// Each of the 18 input vectors is multiplied by its own constant 4x4 matrix
// with mul(); the products are accumulated in the order written, a constant
// bias vector is added, and the sum is returned without further processing.
// Pass3 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0) for
// the nine taps k = 0..8.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-4.810e-02, 2.379e-02, -8.471e-02, 1.305e-01, -5.897e-02, 1.263e-01, -9.639e-02, 9.150e-02, 9.002e-03, -1.763e-01, 8.275e-02, -2.357e-01, 7.181e-02, -7.360e-02, 4.629e-02, -8.259e-02));
|
||||
r += mul(s0_1, M4(6.774e-02, 9.108e-02, -3.750e-01, 8.014e-02, 2.890e-01, 9.986e-02, -1.262e-02, -1.285e-01, -2.789e-01, -1.145e-01, -4.982e-02, -1.101e-01, -2.051e-02, -2.271e-01, 1.343e-01, -8.643e-02));
|
||||
r += mul(s0_2, M4(-5.433e-02, 6.899e-02, -3.350e-01, -7.837e-02, -1.076e-01, 1.912e-02, -9.061e-02, 1.919e-01, 9.387e-02, -4.206e-02, 1.861e-01, -4.416e-03, -1.560e-01, -4.364e-02, 4.364e-01, 8.765e-02));
|
||||
r += mul(s0_3, M4(2.382e-01, 3.032e-01, -1.313e-01, -1.154e-01, 1.008e-01, 3.058e-01, -8.513e-02, 2.713e-01, -9.875e-02, 3.017e-01, 3.203e-02, 5.762e-01, -2.056e-03, -7.698e-02, 8.681e-02, 4.245e-02));
|
||||
r += mul(s0_4, M4(2.643e-01, 1.750e-01, 4.850e-02, 3.131e-03, 2.785e-01, 1.598e-01, 5.772e-01, -4.118e-04, -4.270e-01, -2.447e-01, 4.486e-01, 9.155e-02, -3.428e-01, -2.583e-01, -3.721e-02, 6.278e-02));
|
||||
r += mul(s0_5, M4(-1.080e-01, -5.514e-02, -3.648e-01, -2.319e-02, -2.100e-01, -4.065e-02, 1.126e-01, 3.970e-02, 9.824e-02, 1.377e-02, 1.295e-01, -2.512e-02, 1.115e-01, 7.094e-02, 3.413e-01, -5.245e-02));
|
||||
r += mul(s0_6, M4(1.991e-01, 4.710e-02, -9.305e-02, -1.471e-01, -8.221e-02, 1.134e-01, -1.718e-01, -2.606e-01, -8.167e-02, -1.462e-02, -1.094e-01, -1.569e-01, 2.133e-02, 3.374e-02, 4.583e-02, 1.228e-01));
|
||||
r += mul(s0_7, M4(-2.135e-01, 6.874e-02, -4.993e-02, 1.156e-02, -4.261e-01, 1.366e-01, 4.250e-02, -5.707e-02, -1.966e-01, -6.106e-02, 1.265e-01, -3.076e-03, 2.043e-03, -3.072e-02, 1.043e-01, 3.422e-01));
|
||||
r += mul(s0_8, M4(7.235e-02, -3.542e-04, -1.435e-02, -3.815e-02, -8.855e-02, 8.327e-02, 1.954e-01, 1.462e-01, 1.615e-01, -4.957e-02, 1.596e-02, -8.625e-02, 6.574e-02, -9.799e-02, 5.401e-03, 7.595e-02));
|
||||
r += mul(s1_0, M4(1.245e-01, -2.812e-03, 1.486e-02, 1.246e-01, -5.943e-02, 1.170e-01, -1.068e-01, 8.960e-02, 5.354e-03, -2.039e-01, 8.228e-02, -2.530e-01, -2.789e-03, -6.932e-02, -3.187e-02, -5.794e-02));
|
||||
r += mul(s1_1, M4(-2.539e-02, 4.598e-02, -1.205e-01, 1.597e-01, 2.391e-01, 1.269e-01, -1.116e-02, 1.498e-02, -2.388e-01, -1.548e-01, -7.389e-02, -1.083e-02, -1.181e-01, -7.069e-02, 9.383e-03, -2.018e-01));
|
||||
r += mul(s1_2, M4(-1.248e-02, 3.267e-02, -2.761e-01, -2.043e-02, -8.520e-02, 3.937e-02, -1.372e-01, 1.821e-02, 6.915e-02, -4.061e-02, 1.782e-01, -4.619e-02, 6.811e-02, -5.458e-04, 3.193e-01, 8.892e-03));
|
||||
r += mul(s1_3, M4(-1.580e-01, 7.536e-02, -6.680e-02, 1.891e-01, 1.196e-01, 3.476e-01, -6.321e-02, 1.972e-01, -9.851e-02, 4.483e-01, 9.326e-03, 5.272e-01, -1.478e-01, -4.009e-02, -3.561e-02, -2.549e-01));
|
||||
r += mul(s1_4, M4(-1.253e-01, 1.345e-01, 4.994e-01, 2.000e-01, 2.728e-01, 1.672e-01, 5.501e-01, -1.736e-02, -5.782e-01, -2.191e-01, 4.380e-01, 4.346e-02, -3.006e-01, -5.220e-02, -1.613e-01, 6.023e-02));
|
||||
r += mul(s1_5, M4(1.276e-01, -8.319e-02, -2.115e-01, 1.471e-01, -1.669e-01, -2.484e-02, 9.906e-02, 1.836e-02, 1.010e-01, 1.847e-02, 1.027e-01, -1.680e-02, -1.880e-01, 1.377e-01, 3.823e-02, -8.256e-02));
|
||||
r += mul(s1_6, M4(-3.200e-01, -7.023e-02, -1.243e-01, -2.003e-02, -7.863e-02, 6.650e-02, -1.264e-01, -1.862e-01, -9.119e-02, -4.374e-02, -1.195e-01, -6.902e-02, -1.360e-01, 3.356e-02, -3.667e-02, -1.815e-01));
|
||||
r += mul(s1_7, M4(1.462e-02, 1.001e-01, 2.453e-01, -1.298e-02, -4.372e-01, 1.509e-01, 8.011e-02, -1.323e-01, -1.980e-01, -4.785e-02, 1.733e-01, 1.100e-02, -2.153e-01, 6.711e-02, 2.595e-03, 1.213e-01));
|
||||
r += mul(s1_8, M4(-3.794e-03, 2.239e-02, -6.960e-02, 7.342e-02, -1.882e-01, 1.159e-01, 1.876e-01, 3.125e-02, 2.242e-01, -5.956e-02, 1.328e-02, -5.400e-02, 2.205e-02, -6.049e-02, -9.151e-02, -1.137e-01));
|
||||
// Constant bias term.
r += V4(-1.437e-02, -2.276e-02, 2.275e-02, 6.547e-04);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 3 entry point ("conv2"): one thread handles one texel. It gathers the
// 3x3 neighbourhood of t1 through l0(), splits every sample into its
// non-negative and non-positive parts, and stores f0(...) of those 18
// vectors into t0.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass3(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro behind l0()
	// reads them from the enclosing scope.
	float2 pt = float2(GetInputPt());

	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	V4 plus[9];		// max(sample, 0)
	V4 minus[9];	// -max(-sample, 0)
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		const V4 tap = l0(kTaps[i].x, kTaps[i].y);
		minus[i] = -max(-tap, 0.0);
		plus[i] = max(tap, 0.0);
	}

	t0[texel] = f0(
		plus[0], plus[1], plus[2], plus[3], plus[4], plus[5], plus[6], plus[7], plus[8],
		minus[0], minus[1], minus[2], minus[3], minus[4], minus[5], minus[6], minus[7], minus[8]);
}
|
||||
|
||||
//!PASS 4
|
||||
//!DESC conv3
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): samples t0 (this pass's //!IN) through O() at offset
// float2(x, y) and converts the result to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Function of Pass 4 ("conv3").
// Each of the 18 input vectors is multiplied by its own constant 4x4 matrix
// with mul(); the products are accumulated in the order written, a constant
// bias vector is added, and the sum is returned without further processing.
// Pass4 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0) for
// the nine taps k = 0..8.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(3.886e-03, -1.503e-01, -6.378e-01, 4.214e-02, -1.255e-01, 1.146e-01, -1.917e-01, -6.556e-02, -3.368e-02, 6.874e-02, 2.796e-01, -2.936e-02, -3.239e-02, 3.923e-02, -6.439e-02, 1.313e-02));
|
||||
r += mul(s0_1, M4(4.357e-01, -1.067e-01, 3.330e-01, -8.295e-02, -4.004e-01, 3.113e-01, -4.222e-02, 2.290e-01, -1.861e-01, 9.039e-02, -1.132e-01, 1.077e-01, -1.603e-02, 6.296e-02, 4.907e-01, 3.396e-02));
|
||||
r += mul(s0_2, M4(-3.290e-01, -1.073e-01, 1.064e-02, -2.792e-03, -4.366e-01, 3.239e-01, -1.383e-01, 1.918e-01, 3.058e-02, 1.006e-01, -6.898e-02, -1.451e-02, -1.882e-01, 2.248e-01, 1.744e-02, -3.155e-02));
|
||||
r += mul(s0_3, M4(2.403e-02, -1.353e-01, 1.895e-01, -2.285e-01, -1.211e-01, 1.771e-01, 2.135e-01, 1.900e-01, -4.204e-03, 3.719e-02, -4.772e-01, 2.006e-01, -2.532e-03, 5.872e-02, 2.901e-01, -9.450e-02));
|
||||
r += mul(s0_4, M4(8.054e-02, 1.389e-02, -2.060e-02, -3.042e-01, -2.476e-01, 9.905e-02, -9.248e-01, 3.372e-01, -5.254e-01, 4.455e-01, 5.707e-02, 1.057e-01, -3.525e-01, 3.349e-01, -3.414e-01, 7.090e-02));
|
||||
r += mul(s0_5, M4(-1.889e-01, -2.290e-01, -4.930e-02, -1.824e-01, -2.062e+00, 6.868e-02, 2.552e-01, 3.883e-01, 5.778e-02, 9.141e-02, 9.917e-02, -1.164e-01, 4.359e-02, 2.105e-01, -7.911e-02, -1.916e-01));
|
||||
r += mul(s0_6, M4(-2.267e-02, -6.231e-03, -9.718e-03, 3.770e-04, -6.982e-02, 4.184e-02, -2.296e-01, -9.542e-02, 5.236e-02, -5.412e-02, -1.757e-01, -1.054e-01, 1.414e-02, -7.772e-02, -1.338e-02, 3.928e-02));
|
||||
r += mul(s0_7, M4(5.776e-02, 4.703e-02, 3.914e-02, -1.617e-02, -3.606e-01, 3.037e-01, -3.096e-01, 3.562e-02, 3.108e-01, -3.684e-01, 3.725e-02, -2.050e-01, -1.494e-02, 8.741e-02, 5.992e-02, 2.655e-02));
|
||||
r += mul(s0_8, M4(3.614e-02, -1.212e-01, 2.507e-02, -5.858e-02, -1.121e-01, -3.433e-01, 6.613e-02, -6.943e-01, 2.233e-02, -5.467e-02, -6.900e-03, -2.566e-01, -1.106e-01, 2.016e-02, -3.700e-02, -2.886e-01));
|
||||
r += mul(s1_0, M4(-5.136e-02, -2.190e-01, -1.035e+00, -5.722e-02, 2.876e-02, 5.070e-02, 3.532e-01, -6.778e-03, 2.930e-04, -6.219e-02, 2.314e-01, -5.210e-02, 1.508e-02, -4.390e-02, -7.749e-02, -9.658e-03));
|
||||
r += mul(s1_1, M4(3.663e-01, -9.746e-02, -6.582e-01, -3.676e-01, -1.694e-01, 7.883e-02, -1.613e-01, 2.328e-02, 2.595e-04, -3.763e-02, -9.946e-02, -6.137e-02, 1.429e-01, -1.964e-01, 2.439e-01, 4.898e-02));
|
||||
r += mul(s1_2, M4(7.884e-02, 1.842e-01, -1.309e-01, 4.895e-02, 4.820e-02, 8.364e-02, 1.189e-02, -1.438e-02, -7.934e-02, 4.775e-02, -6.137e-02, -1.335e-02, -4.416e-02, 3.584e-02, 1.751e-04, -1.178e-02));
|
||||
r += mul(s1_3, M4(-9.861e-03, -1.277e-01, 2.389e-03, -3.232e-01, -2.782e-03, 1.115e-01, -6.485e-02, 2.093e-01, 2.056e-01, 2.527e-02, -1.772e-01, 1.863e-02, 5.983e-02, -8.103e-02, 3.076e-01, -2.027e-01));
|
||||
r += mul(s1_4, M4(1.001e-01, 3.476e-01, -1.305e-01, -1.653e-01, 8.890e-02, -4.170e-01, -1.530e-01, 7.048e-02, -5.605e-01, 1.093e-01, 2.038e-01, -2.320e-01, -1.287e-01, -2.173e-01, -1.630e-01, -9.691e-02));
|
||||
r += mul(s1_5, M4(-2.778e-01, 1.393e-01, -2.802e-02, -5.375e-02, -4.550e-01, -1.661e-01, 2.293e-03, -5.984e-02, -5.070e-02, -8.852e-02, 7.806e-02, 2.187e-02, 1.901e-01, -3.219e-01, -1.937e-01, -2.336e-01));
|
||||
r += mul(s1_6, M4(-8.489e-02, 1.968e-01, -7.760e-02, 1.388e-01, 4.713e-03, 1.527e-01, 8.535e-02, 1.643e-02, 1.429e-01, -1.558e-01, 2.339e-01, 2.762e-01, 1.694e-02, -4.245e-02, -2.793e-02, -3.332e-02));
|
||||
r += mul(s1_7, M4(-4.377e-02, 3.486e-01, -1.766e-01, -1.065e-01, -1.645e-01, -8.722e-04, -1.147e-01, 1.663e-01, 6.801e-02, -3.539e-01, 1.560e-02, -1.819e-01, 1.440e-02, -1.221e-02, 3.693e-02, 5.886e-03));
|
||||
r += mul(s1_8, M4(5.940e-02, 1.624e-01, 1.526e-02, 6.692e-02, 1.812e-01, -8.647e-02, 3.210e-02, -3.751e-04, 2.884e-02, -4.717e-02, 4.121e-03, 5.144e-02, -1.995e-02, -2.827e-01, 6.148e-03, 7.209e-02));
|
||||
// Constant bias term.
r += V4(1.575e-02, -2.007e-01, -3.519e-03, -9.082e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 4 entry point ("conv3"): one thread handles one texel. It gathers the
// 3x3 neighbourhood of t0 through l0(), splits every sample into its
// non-negative and non-positive parts, and stores f0(...) of those 18
// vectors into t1.
//   blockStart - origin of the block this thread group covers
//   tid        - thread id; only tid.x is used, remapped through Rmp8x8()
void Pass4(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O() macro behind l0()
	// reads them from the enclosing scope.
	float2 pt = float2(GetInputPt());

	const uint2 texel = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}

	float2 pos = (texel + 0.5) * pt;

	// Texel offsets of the 3x3 window, row by row from (-1, -1) to (1, 1).
	static const float2 kTaps[9] = {
		float2(-1.0, -1.0), float2(0.0, -1.0), float2(1.0, -1.0),
		float2(-1.0, 0.0), float2(0.0, 0.0), float2(1.0, 0.0),
		float2(-1.0, 1.0), float2(0.0, 1.0), float2(1.0, 1.0)
	};

	V4 plus[9];		// max(sample, 0)
	V4 minus[9];	// -max(-sample, 0)
	[unroll]
	for (uint i = 0; i < 9; ++i) {
		const V4 tap = l0(kTaps[i].x, kTaps[i].y);
		minus[i] = -max(-tap, 0.0);
		plus[i] = max(tap, 0.0);
	}

	t1[texel] = f0(
		plus[0], plus[1], plus[2], plus[3], plus[4], plus[5], plus[6], plus[7], plus[8],
		minus[0], minus[1], minus[2], minus[3], minus[4], minus[5], minus[6], minus[7], minus[8]);
}
|
||||
|
||||
//!PASS 5
|
||||
//!DESC conv4
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): samples t1 (this pass's //!IN) through O() at offset
// float2(x, y) and converts the result to V4.
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Function of Pass 5 ("conv4").
// Each of the 18 input vectors is multiplied by its own constant 4x4 matrix
// with mul(); the products are accumulated in the order written, a constant
// bias vector is added, and the sum is returned without further processing.
// Pass5 supplies s0_k = max(sample_k, 0) and s1_k = -max(-sample_k, 0) for
// the nine taps k = 0..8.
// The literals are generated network parameters; keep their values and the
// accumulation order unchanged.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
|
||||
V4 r = 0.0;
|
||||
r += mul(s0_0, M4(-6.479e-02, -9.976e-02, -1.507e-01, -9.934e-02, -1.046e-02, -1.471e-01, -4.218e-02, -8.348e-04, -5.963e-02, 1.519e-03, 5.897e-03, 5.284e-02, -4.467e-01, 4.779e-01, -1.953e-02, 1.951e-01));
|
||||
r += mul(s0_1, M4(-5.276e-02, -1.201e-01, -1.160e-01, 6.076e-02, -4.798e-02, -3.491e-01, -3.055e-01, -1.607e-01, -8.989e-02, 1.221e-01, -1.561e-01, 6.227e-02, -1.598e-01, -6.666e-01, 6.029e-01, -5.466e-01));
|
||||
r += mul(s0_2, M4(-1.331e-01, -4.988e-02, -2.217e-02, 3.405e-02, 2.261e-02, 1.352e-01, 1.124e-02, 8.259e-02, -3.548e-02, 2.454e-01, 4.417e-02, 2.297e-01, 1.780e-01, -2.203e-01, 5.913e-02, -2.201e-01));
|
||||
r += mul(s0_3, M4(1.348e-01, 5.544e-01, -4.335e-01, -3.619e-01, 1.011e-01, 2.665e-01, -2.627e-01, -1.800e-01, -1.158e-01, -8.543e-02, -7.868e-03, 2.056e-01, 1.988e-01, 1.174e+00, -1.291e-01, 1.131e-01));
|
||||
r += mul(s0_4, M4(4.504e-01, 1.025e-01, -1.449e-01, -3.442e-02, -4.525e-01, -1.513e-01, -8.135e-02, -9.669e-02, -3.287e-01, 5.251e-01, -6.540e-01, 7.386e-02, 2.603e-01, -8.246e-01, -1.378e-01, 2.363e+00));
|
||||
r += mul(s0_5, M4(-7.102e-02, -5.554e-02, -3.489e-02, -6.688e-02, 2.877e-01, -6.258e-02, 8.515e-02, -2.109e-01, -2.723e-01, 1.543e-01, 1.285e-01, 9.366e-02, 3.135e-02, -3.700e-01, -4.111e-01, 1.822e+00));
|
||||
r += mul(s0_6, M4(-4.018e-02, -3.412e-01, 5.388e-02, 4.947e-01, -3.234e-02, -6.778e-02, 3.825e-02, 1.313e-01, -6.083e-02, 3.439e-02, -1.081e-01, 6.456e-02, 2.287e-02, -2.470e-01, 2.026e-02, -1.886e-02));
|
||||
r += mul(s0_7, M4(2.410e-01, 1.529e-01, -1.370e-01, -1.389e-01, 1.549e-01, 8.308e-03, 3.064e-02, 3.925e-02, -9.013e-02, 1.131e-01, -9.240e-02, 3.740e-01, -1.009e-01, -6.576e-02, -1.491e-01, -3.452e-02));
|
||||
r += mul(s0_8, M4(-1.628e-01, -2.480e-02, -6.569e-02, 3.873e-02, 1.604e-02, 1.651e-02, -4.681e-02, -1.647e-02, -1.648e-02, 1.541e-01, 2.284e-02, 6.545e-01, 1.799e-03, 1.193e-03, -1.215e-01, 5.919e-02));
|
||||
r += mul(s1_0, M4(-1.115e-02, -5.014e-02, -1.499e-01, -7.414e-04, -6.944e-02, -4.168e-02, -1.254e-01, -6.576e-02, 2.946e-04, -2.669e-02, 4.109e-02, 1.949e-02, 1.242e-01, 1.753e-01, 9.717e-02, 1.446e-01));
|
||||
r += mul(s1_1, M4(-1.327e-02, -1.462e-01, -8.510e-02, -1.228e-02, 1.772e-01, 1.009e-01, -4.342e-02, -8.827e-02, -6.663e-02, -1.245e-01, -4.625e-02, -4.285e-02, 7.586e-02, -1.208e-01, 2.705e-01, -1.558e-01));
|
||||
r += mul(s1_2, M4(-7.024e-02, -3.045e-02, -1.916e-02, 4.979e-02, -9.145e-02, 2.285e-01, 4.612e-02, 2.217e-01, 7.690e-02, -4.332e-02, 6.032e-03, -2.370e-02, 3.802e-01, -8.124e-02, 1.982e-02, -8.310e-02));
|
||||
r += mul(s1_3, M4(1.238e-01, 5.787e-01, -5.332e-01, -2.806e-01, 1.208e-01, 6.549e-02, -2.040e-01, -2.578e-02, -5.878e-02, -1.496e-01, 1.213e-01, 1.489e-02, 9.569e-02, 1.964e-01, 6.477e-02, -2.939e-01));
|
||||
r += mul(s1_4, M4(5.825e-01, 2.257e-01, -1.943e-01, 1.101e-01, -3.240e-01, -2.967e-01, -4.203e-02, -3.636e-01, -1.062e-01, -3.799e-02, -4.444e-01, -7.607e-02, -3.056e-01, -2.926e-01, -4.582e-02, 2.795e-01));
|
||||
r += mul(s1_5, M4(-9.076e-02, -5.130e-02, -3.718e-02, -6.163e-02, 1.831e-01, -1.199e-01, 9.176e-02, -2.456e-01, 2.362e-01, -1.854e-01, -1.394e-01, 3.560e-03, 2.070e-02, -6.903e-02, -5.061e-02, 3.068e-02));
|
||||
r += mul(s1_6, M4(-4.988e-02, -3.880e-01, 3.001e-02, 3.892e-01, -2.827e-02, -2.880e-02, 4.071e-02, 2.861e-01, -4.016e-02, -1.085e-01, 9.207e-03, -7.367e-02, 9.072e-03, 8.960e-02, 5.334e-03, -6.480e-02));
|
||||
r += mul(s1_7, M4(2.900e-01, 1.450e-01, -1.401e-01, -2.809e-01, 1.218e-01, -3.153e-03, -2.544e-02, 1.898e-01, -7.197e-02, -3.721e-01, 4.042e-02, 9.918e-02, -1.132e-01, 3.578e-02, 4.000e-02, 6.991e-02));
|
||||
r += mul(s1_8, M4(-1.493e-01, -2.310e-02, -6.133e-02, 5.322e-02, -4.879e-02, -5.139e-02, -8.058e-02, 4.140e-02, 2.511e-01, 3.669e-02, -1.003e-01, -1.457e-01, 1.528e-01, 1.177e-01, 6.665e-02, -3.084e-02));
|
||||
// Constant bias term.
r += V4(2.513e-04, -2.994e-02, -5.133e-02, -8.977e-03);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass 5 entry point: evaluates f0 for one pixel of the input grid and stores
// the result in t0.
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into the pixel offset inside the block.
void Pass5(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		// This thread maps to a pixel outside the input; nothing to write.
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	t0[gxy] = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));
}
|
||||
|
||||
//!PASS 6
|
||||
//!DESC conv5
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): reads t0 through the O() helper with the offset float2(x, y) and
// converts the result to V4. O and V4 are defined outside this excerpt;
// presumably O samples relative to the pass function's pos/pt locals (confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 6 (conv5): each of the 18 inputs is multiplied by its
// own M4 weight matrix with mul(), the products are accumulated in the order
// listed, and a constant V4 bias is added last.
//
// s0_0..s0_8: nine V4 taps; Pass6 passes max(tap, 0) of its 3x3 neighbourhood.
// s1_0..s1_8: nine V4 taps; Pass6 passes -max(-tap, 0) of the same neighbourhood.
// Returns the accumulated V4; no activation is applied here.
// V4 and M4 are defined outside this excerpt (presumably 4-vector / 4x4 matrix
// aliases). The accumulation order is kept as-is because float sums depend on it.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(4.575e-01, 2.412e-01, 1.926e-01, 5.873e-02, 2.954e-02, -1.424e-01, 7.881e-03, 2.358e-04, -5.872e-02, -1.007e-01, -3.632e-02, 5.718e-02, 1.389e-01, -4.163e-02, -1.379e-01, 2.160e-03));
	acc += mul(s0_1, M4(1.347e-01, -8.074e-01, -1.155e-01, 2.242e-01, -2.673e-01, 4.053e-01, 8.867e-02, -2.840e-02, 9.443e-02, 2.632e-01, 9.207e-02, -1.793e-02, 1.519e-01, 3.302e-03, 2.027e-01, 2.643e-02));
	acc += mul(s0_2, M4(1.462e-02, -7.543e-02, -6.080e-02, 7.431e-02, -3.673e-02, -1.665e-01, -2.745e-01, -4.416e-02, -3.270e-01, 7.677e-01, 7.241e-01, -1.157e-01, -8.204e-03, 2.172e-02, 3.183e-01, 3.931e-02));
	acc += mul(s0_3, M4(1.168e+00, -8.427e-01, -3.237e-03, 5.416e-02, 1.694e-02, -1.042e-01, -2.173e-01, -1.089e-01, -9.881e-02, -1.109e-01, -1.003e-01, -5.080e-02, -9.279e-02, -1.111e-01, -2.699e-02, -2.297e-02));
	acc += mul(s0_4, M4(-4.884e-01, -4.472e-01, -9.701e-02, 8.789e-01, 1.962e-02, 5.041e-01, 3.221e-01, -4.622e-02, 9.039e-02, -2.531e-01, 6.228e-01, 1.590e-02, 1.804e-02, 7.795e-02, -8.005e-02, -6.310e-03));
	acc += mul(s0_5, M4(-6.567e-02, -5.161e-02, 5.550e-02, 5.285e-02, -6.147e-02, -1.840e-01, 2.028e-01, 4.014e-01, 4.070e-01, -1.022e-01, 1.414e+00, -3.126e-01, 7.508e-03, 1.013e-01, -7.300e-02, -4.282e-01));
	acc += mul(s0_6, M4(1.721e+00, 1.776e-01, -8.690e-02, -1.102e-01, -8.467e-02, -2.165e-02, 6.238e-02, 2.052e-02, 2.763e-01, -3.472e-02, -1.179e-01, 2.993e-02, -6.860e-02, 1.887e-02, 3.140e-02, -6.853e-02));
	acc += mul(s0_7, M4(1.937e-01, 1.975e-01, -2.456e-01, -1.360e+00, 1.792e-01, -5.969e-02, -7.670e-02, 2.606e-01, 1.355e-01, -9.109e-03, 2.756e-01, 6.674e-02, 1.312e-02, -1.542e-02, 2.236e-02, 1.997e-01));
	acc += mul(s0_8, M4(4.255e-02, -1.452e-02, -8.732e-02, -1.084e-01, 1.495e-02, 1.302e-02, -9.151e-02, -2.814e-01, 5.197e-02, 2.866e-02, 5.490e-01, 4.310e-01, 3.666e-02, -3.380e-03, -2.830e-02, -8.223e-02));

	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(2.549e-02, 7.469e-02, -5.290e-02, -4.972e-02, -2.340e-01, -1.875e-01, 1.656e-01, 5.697e-02, -8.570e-02, -1.520e-01, -2.622e-02, 1.043e-02, -2.377e-01, -3.927e-02, 1.539e-01, 4.528e-02));
	acc += mul(s1_1, M4(-1.188e-02, -9.781e-02, 1.606e-01, 5.138e-02, -4.165e-01, 8.262e-01, 1.709e-01, -1.063e-01, 8.393e-03, 7.300e-02, -9.347e-02, -6.226e-02, -3.633e-01, -4.453e-01, 2.190e-01, 2.415e-01));
	acc += mul(s1_2, M4(-4.011e-02, 3.404e-02, 1.013e-01, 3.551e-02, 9.692e-02, -2.109e-01, 1.897e-01, -2.192e-01, -1.703e-01, 5.317e-01, 1.354e-01, -2.027e-01, -3.658e-01, -1.845e-01, -5.465e-01, 1.436e-01));
	acc += mul(s1_3, M4(7.674e-01, 1.677e-01, -7.875e-02, 7.537e-03, -4.911e-01, -1.083e-01, 7.183e-03, -1.107e-01, -2.514e-02, -1.257e-01, -5.070e-02, -3.886e-02, 1.368e-01, -1.991e-02, -1.698e-01, -7.850e-03));
	acc += mul(s1_4, M4(-5.096e-02, 7.912e-02, -2.105e-01, 1.149e-01, 9.798e-02, 2.243e-01, -3.434e-01, 3.492e-01, -1.265e-01, -1.839e-01, -1.337e-01, -6.909e-02, -8.552e-01, 1.334e-01, 8.652e-01, -3.408e-01));
	acc += mul(s1_5, M4(-2.933e-02, 1.424e-01, 6.542e-02, -1.710e-01, -1.459e-01, -3.069e-02, -1.275e-01, -9.443e-02, 2.657e-01, -4.784e-04, -6.729e-03, -1.910e-01, -4.628e-01, 3.808e-02, -1.470e-01, 1.480e-01));
	acc += mul(s1_6, M4(1.512e-01, -1.755e-02, -5.440e-02, 1.317e-02, -7.181e-02, -6.842e-03, -7.375e-02, -8.356e-02, 7.332e-02, -9.437e-02, -1.008e-01, -4.731e-02, -9.102e-02, -8.192e-03, 7.862e-04, 6.417e-02));
	acc += mul(s1_7, M4(2.457e-01, -1.058e-01, -2.777e-02, -1.532e-03, 7.609e-02, 3.452e-02, 1.774e-01, 3.296e-01, 6.779e-02, -6.683e-02, 1.485e-01, 7.321e-02, -3.082e-02, -4.348e-02, 3.558e-03, 9.111e-03));
	acc += mul(s1_8, M4(1.104e-01, 5.040e-03, 9.642e-03, -8.991e-02, -2.134e-01, 3.758e-02, -1.244e-01, -1.987e-01, -7.007e-02, 6.792e-03, 1.369e-01, 5.332e-01, -5.354e-02, -2.024e-02, -1.038e-01, -4.812e-02));

	// Bias.
	acc += V4(4.102e-03, 1.192e-03, -2.598e-03, -2.812e-03);
	return acc;
}
|
||||
|
||||
// Pass 6 (conv5) entry point: evaluates f0 for one pixel of the input grid
// and stores the result in t1. The taps are read from t0 through l0().
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into the pixel offset inside the block.
void Pass6(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		// This thread maps to a pixel outside the input; nothing to write.
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	t1[gxy] = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));
}
|
||||
|
||||
//!PASS 7
|
||||
//!DESC conv6
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): reads t1 through the O() helper with the offset float2(x, y) and
// converts the result to V4. O and V4 are defined outside this excerpt;
// presumably O samples relative to the pass function's pos/pt locals (confirm).
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 7 (conv6): each of the 18 inputs is multiplied by its
// own M4 weight matrix with mul(), the products are accumulated in the order
// listed, and a constant V4 bias is added last.
//
// s0_0..s0_8: nine V4 taps; Pass7 passes max(tap, 0) of its 3x3 neighbourhood.
// s1_0..s1_8: nine V4 taps; Pass7 passes -max(-tap, 0) of the same neighbourhood.
// Returns the accumulated V4; no activation is applied here.
// V4 and M4 are defined outside this excerpt (presumably 4-vector / 4x4 matrix
// aliases). The accumulation order is kept as-is because float sums depend on it.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(6.200e-02, 5.385e-02, -5.478e-02, 3.955e-02, -1.722e-02, -1.194e-01, 8.331e-02, -9.296e-02, -2.161e-02, 8.716e-02, -5.918e-02, 1.032e-01, 4.954e-02, -3.822e-02, 8.472e-02, -2.191e-01));
	acc += mul(s0_1, M4(2.503e-01, 5.635e-02, 7.355e-03, -2.025e-01, 7.104e-02, -1.324e-01, -3.051e-02, 2.246e-02, -4.480e-02, 6.693e-03, 4.467e-02, 3.388e-02, 4.262e-01, 1.488e-01, -8.809e-01, 5.350e-01));
	acc += mul(s0_2, M4(-7.511e-03, 1.921e-01, -3.653e-01, 2.096e-02, 2.413e-02, 4.846e-02, -1.538e-01, 3.359e-02, 5.958e-03, -1.033e-02, 2.389e-02, 1.283e-02, -5.270e-02, 2.842e-01, 5.681e-02, -3.578e-02));
	acc += mul(s0_3, M4(-2.198e-02, -1.674e-02, 3.330e-02, 3.249e-02, -4.430e-02, 9.217e-02, -3.348e-02, -3.546e-01, 1.228e-01, 3.875e-02, 7.220e-03, 6.719e-02, -8.768e-01, -1.165e-02, -3.862e-02, -2.045e-02));
	acc += mul(s0_4, M4(-6.935e-01, -4.898e-01, 2.252e-01, -1.647e-01, -6.408e-02, 4.562e-01, -6.617e-01, 1.220e-01, 1.053e-02, -9.937e-02, -1.118e-02, 3.272e-01, -9.081e-02, 2.353e-02, 4.776e-01, -1.238e-01));
	acc += mul(s0_5, M4(2.481e-01, -3.296e-01, -3.372e-02, -2.008e-02, 5.924e-03, 1.762e-02, 3.642e-01, -1.182e-01, -2.219e-02, -4.332e-02, -9.762e-02, 3.537e-02, 2.114e-02, -5.440e-02, 3.124e-01, 5.069e-02));
	acc += mul(s0_6, M4(-5.465e-02, -5.352e-03, -3.419e-03, -6.733e-02, -8.079e-02, -6.569e-02, -1.494e-02, -3.462e-01, -8.125e-03, 2.572e-03, -3.894e-02, -3.246e-02, -1.566e-02, -3.004e-02, 1.145e-01, 6.794e-02));
	acc += mul(s0_7, M4(4.788e-02, 7.675e-03, -7.030e-02, -2.384e-02, -3.070e-01, -7.080e-01, -2.017e-01, 9.579e-02, 1.259e-01, -1.004e-02, -1.287e-01, 3.334e-02, -9.642e-02, -8.073e-02, 2.546e-02, 5.204e-02));
	acc += mul(s0_8, M4(-6.015e-02, 1.650e-01, -5.471e-02, -1.454e-01, -2.785e-02, -1.831e-01, 1.123e-01, 3.453e-02, -1.179e-02, 1.722e-02, -1.068e-02, -2.608e-02, 1.514e-04, -1.287e-02, -7.741e-03, -9.765e-03));

	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(-4.922e-02, -5.675e-03, -2.161e-02, 3.164e-02, -2.003e-02, -3.890e-02, 5.198e-02, -1.811e-03, -3.385e-02, -1.510e-02, -2.289e-02, 1.009e-01, 4.427e-02, -1.763e-01, 1.255e-01, -5.073e-02));
	acc += mul(s1_1, M4(1.057e-01, -8.124e-02, 1.131e-01, -1.361e-01, 4.740e-02, -6.425e-02, 8.930e-03, 5.318e-02, 5.266e-02, -6.003e-02, 1.320e-01, 4.163e-02, 1.277e-01, -2.404e-01, -1.696e-01, 2.204e-01));
	acc += mul(s1_2, M4(2.723e-02, 1.918e-01, -2.822e-01, -1.877e-02, -4.599e-03, 7.591e-02, -1.128e-01, -6.519e-03, 2.311e-02, -1.684e-01, 2.293e-01, -1.042e-01, -1.882e-02, 4.970e-02, -1.309e-01, -8.894e-03));
	acc += mul(s1_3, M4(4.883e-02, 2.819e-02, 4.318e-02, 3.186e-02, 7.782e-02, 1.741e-01, -8.927e-02, 4.005e-02, 5.888e-02, -1.057e-01, 9.692e-02, 8.032e-02, -1.086e-01, 6.323e-02, -8.520e-02, -1.273e-02));
	acc += mul(s1_4, M4(-1.746e-01, -2.834e-02, -3.694e-02, 3.226e-01, -2.541e-01, 6.860e-01, -1.436e-01, 1.705e-01, 2.614e-01, -6.751e-02, 5.646e-02, 3.666e-01, -2.621e-02, 4.951e-01, -1.090e-01, -3.168e-01));
	acc += mul(s1_5, M4(1.513e-01, 5.210e-02, 2.625e-01, -6.303e-02, -2.252e-02, -9.485e-02, 4.776e-01, -1.789e-01, -1.291e-01, -9.714e-02, -1.427e-01, -1.165e-01, 2.415e-02, 9.790e-02, 6.024e-02, -9.622e-02));
	acc += mul(s1_6, M4(3.751e-02, -2.907e-02, -1.762e-02, -9.545e-02, 2.866e-01, -7.329e-02, -9.787e-03, 4.513e-03, -9.486e-02, -2.446e-02, -2.357e-02, -5.002e-02, 4.973e-02, 6.256e-02, -2.532e-02, -1.817e-02));
	acc += mul(s1_7, M4(-6.855e-02, -6.762e-02, -6.269e-02, -6.947e-02, -1.389e-01, -1.915e-01, -4.806e-02, 1.870e-01, 1.298e-01, 6.268e-03, -5.985e-02, -5.396e-02, -3.048e-02, -5.396e-03, -9.720e-02, 3.289e-03));
	acc += mul(s1_8, M4(-2.052e-02, -8.106e-02, -1.721e-02, 9.911e-03, -8.521e-02, 4.832e-02, -1.708e-01, -6.445e-02, -9.788e-02, 8.836e-02, -1.204e-01, -1.123e-01, 1.514e-02, 1.628e-02, -5.003e-02, -6.128e-03));

	// Bias.
	acc += V4(1.448e-03, -2.432e-03, -8.004e-04, 5.896e-05);
	return acc;
}
|
||||
|
||||
// Pass 7 (conv6) entry point: evaluates f0 for one pixel of the input grid
// and stores the result in t0. The taps are read from t1 through l0().
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into the pixel offset inside the block.
void Pass7(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		// This thread maps to a pixel outside the input; nothing to write.
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	t0[gxy] = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));
}
|
||||
|
||||
//!PASS 8
|
||||
//!DESC conv7
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t0
|
||||
//!OUT t1
|
||||
|
||||
// l0(x, y): reads t0 through the O() helper with the offset float2(x, y) and
// converts the result to V4. O and V4 are defined outside this excerpt;
// presumably O samples relative to the pass function's pos/pt locals (confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 8 (conv7): each of the 18 inputs is multiplied by its
// own M4 weight matrix with mul(), the products are accumulated in the order
// listed, and a constant V4 bias is added last.
//
// s0_0..s0_8: nine V4 taps; Pass8 passes max(tap, 0) of its 3x3 neighbourhood.
// s1_0..s1_8: nine V4 taps; Pass8 passes -max(-tap, 0) of the same neighbourhood.
// Returns the accumulated V4; no activation is applied here.
// V4 and M4 are defined outside this excerpt (presumably 4-vector / 4x4 matrix
// aliases). The accumulation order is kept as-is because float sums depend on it.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(5.901e-02, -1.033e-01, -1.441e-01, 4.291e-02, 2.355e-02, -1.199e-01, -1.741e-01, -5.263e-03, -6.030e-03, -4.043e-02, 1.910e-01, 8.326e-03, 2.913e-02, 1.969e-02, -1.380e-01, 9.492e-02));
	acc += mul(s0_1, M4(-1.616e-01, 1.649e-01, -1.133e-02, -1.037e-01, -1.060e-02, 2.299e-01, -5.302e-02, -2.329e-01, -8.540e-02, 2.232e-01, 2.647e-01, 3.922e-01, 5.387e-02, 5.841e-01, -1.264e-01, -1.440e-01));
	acc += mul(s0_2, M4(-1.944e-02, -7.262e-02, 9.583e-02, 3.448e-02, 4.402e-02, 5.319e-02, -2.384e-02, 4.652e-02, 6.280e-02, -4.195e-02, 1.573e-02, 7.059e-02, 1.029e-01, -1.784e-02, -3.735e-02, -4.952e-02));
	acc += mul(s0_3, M4(7.393e-02, -1.825e-01, -2.983e-01, -5.798e-02, -2.475e-01, -4.958e-02, 6.660e-01, -2.202e-01, -9.158e-02, 4.280e-04, 2.472e-01, -2.979e-01, -9.887e-02, 6.188e-02, 2.163e-01, -9.358e-03));
	acc += mul(s0_4, M4(-8.664e-01, 2.357e-01, 3.390e-01, -5.275e-01, -2.213e-01, -4.992e-01, 5.479e-01, 4.245e-01, -7.542e-02, 4.854e-01, -3.525e-01, 3.950e-01, 3.619e-01, -3.968e-01, -3.447e-01, 5.089e-01));
	acc += mul(s0_5, M4(-9.239e-02, -6.370e-01, -7.252e-02, -3.435e-01, -1.057e-01, 1.616e-01, -4.413e-02, 1.824e-01, 2.001e-02, -1.343e-01, -5.730e-02, 7.302e-02, -2.361e-02, -9.044e-02, -1.041e-01, 2.971e-01));
	acc += mul(s0_6, M4(-2.803e-02, -8.707e-02, -1.407e-01, -2.685e-02, 1.099e-01, 1.721e-01, 1.612e-01, 6.962e-02, -1.659e-02, 7.845e-02, 2.165e-01, -7.067e-02, 1.666e-02, 7.051e-02, 6.373e-02, 4.391e-02));
	acc += mul(s0_7, M4(-1.560e-01, -2.698e-02, -5.684e-01, -1.184e-01, 7.742e-01, -1.023e-03, -8.177e-02, 2.857e-01, 2.253e-02, -1.400e-02, -6.523e-02, 7.644e-02, 1.789e-01, -8.433e-03, 1.041e-01, 7.009e-02));
	acc += mul(s0_8, M4(-1.491e-01, -2.037e-01, -2.499e-01, -7.730e-02, 1.051e-01, -1.718e-02, -1.762e-01, 4.808e-02, -3.068e-03, 1.737e-02, -3.772e-04, 4.732e-02, 7.205e-02, 7.901e-02, -1.759e-02, 8.476e-02));

	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(4.810e-02, -1.822e-02, -1.150e-01, -1.679e-02, -5.481e-02, -7.544e-02, 2.213e-01, 2.615e-02, -2.628e-03, -1.482e-01, -5.570e-02, 5.137e-02, -1.381e-02, -1.878e-03, -3.132e-02, -3.309e-02));
	acc += mul(s1_1, M4(1.101e-01, 1.003e-01, -4.307e-01, -2.520e-02, 1.138e-02, -1.966e-01, 6.664e-02, 1.114e-01, -1.431e-01, 3.634e-01, 4.274e-02, -8.279e-02, -5.291e-02, 3.540e-01, 8.995e-02, -1.401e-01));
	acc += mul(s1_2, M4(7.230e-02, 4.684e-01, -6.542e-02, -2.792e-01, 2.936e-02, 3.476e-03, -1.024e-02, 1.880e-01, 1.898e-02, 2.529e-02, 8.537e-03, -6.073e-03, 1.025e-01, -2.320e-01, -1.804e-02, 5.471e-02));
	acc += mul(s1_3, M4(-9.258e-03, -7.731e-03, 4.285e-02, -4.725e-02, -3.878e-02, -1.749e-02, -1.681e-02, -1.020e-01, -3.975e-02, 1.609e-02, 8.299e-02, -1.824e-01, -2.500e-02, 3.516e-02, 8.591e-02, 1.714e-02));
	acc += mul(s1_4, M4(-2.210e-01, 1.534e-01, 3.410e-01, -2.552e-01, -5.090e-02, 1.582e-02, 1.802e-01, -1.333e-01, -5.371e-01, 3.751e-01, -1.323e-01, 3.018e-01, 1.756e-01, -9.756e-02, -4.873e-01, 4.985e-01));
	acc += mul(s1_5, M4(-1.073e-02, 2.919e-01, -2.025e-01, 3.240e-01, 4.318e-02, -1.972e-02, -1.612e-01, 3.528e-01, -6.472e-02, -6.212e-02, 3.146e-02, 6.391e-02, 4.950e-02, -6.270e-01, -1.985e-02, 4.680e-02));
	acc += mul(s1_6, M4(-2.215e-02, 1.836e-02, 5.021e-02, -3.016e-02, -7.854e-03, 1.135e-02, 3.407e-02, -2.923e-02, -5.384e-03, 6.570e-02, 2.437e-01, -8.712e-02, 2.275e-02, -2.291e-03, -7.378e-02, 5.231e-02));
	acc += mul(s1_7, M4(-4.186e-02, 6.944e-02, 8.353e-02, -1.927e-02, 3.937e-02, 2.105e-02, 7.152e-02, 5.635e-03, 1.114e-01, -3.772e-02, -1.853e-01, 6.636e-02, 4.654e-02, -1.008e-01, -1.625e-01, 7.888e-02));
	acc += mul(s1_8, M4(5.288e-02, -5.516e-02, -4.014e-02, 8.854e-02, 2.434e-02, 9.192e-02, -1.203e-02, 6.813e-02, 4.626e-02, -4.892e-02, 4.700e-03, 7.578e-02, -5.040e-02, 3.497e-02, 3.176e-02, -9.741e-02));

	// Bias.
	acc += V4(2.671e-03, -5.536e-03, -4.013e-03, 4.378e-03);
	return acc;
}
|
||||
|
||||
// Pass 8 (conv7) entry point: evaluates f0 for one pixel of the input grid
// and stores the result in t1. The taps are read from t0 through l0().
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into the pixel offset inside the block.
void Pass8(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		// This thread maps to a pixel outside the input; nothing to write.
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	t1[gxy] = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));
}
|
||||
|
||||
//!PASS 9
|
||||
//!DESC conv8
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
//!IN t1
|
||||
//!OUT t0
|
||||
|
||||
// l0(x, y): reads t1 through the O() helper with the offset float2(x, y) and
// converts the result to V4. O and V4 are defined outside this excerpt;
// presumably O samples relative to the pass function's pos/pt locals (confirm).
#define l0(x, y) V4(O(t1, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 9 (conv8): each of the 18 inputs is multiplied by its
// own M4 weight matrix with mul(), the products are accumulated in the order
// listed, and a constant V4 bias is added last.
//
// s0_0..s0_8: nine V4 taps; Pass9 passes max(tap, 0) of its 3x3 neighbourhood.
// s1_0..s1_8: nine V4 taps; Pass9 passes -max(-tap, 0) of the same neighbourhood.
// Returns the accumulated V4; no activation is applied here.
// V4 and M4 are defined outside this excerpt (presumably 4-vector / 4x4 matrix
// aliases). The accumulation order is kept as-is because float sums depend on it.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(8.283e-02, 5.262e-02, 1.580e-02, 4.991e-02, 6.836e-02, -3.234e-02, 5.630e-02, 1.275e-01, 5.398e-03, 9.866e-04, -1.054e-02, 1.601e-02, 1.546e-02, -7.786e-02, -2.630e-02, -3.023e-02));
	acc += mul(s0_1, M4(9.285e-02, 3.403e-01, -4.572e-02, 1.431e-01, 2.876e-01, -3.271e-01, -8.133e-04, 5.998e-01, 4.515e-02, 9.836e-02, 2.315e-02, 1.724e-01, -8.080e-02, -1.978e-01, -5.366e-02, -4.535e-02));
	acc += mul(s0_2, M4(1.708e-02, -8.374e-02, -1.831e-02, 1.744e-02, 4.902e-02, -1.037e-02, -3.508e-02, 3.501e-02, 1.160e-01, 2.529e-01, 4.235e-02, 4.233e-02, -5.953e-03, -1.398e-01, -8.815e-03, 1.053e-02));
	acc += mul(s0_3, M4(-2.836e-03, -2.496e-01, 2.703e-02, 9.490e-02, 3.985e-01, -9.458e-02, 1.355e-01, 5.917e-01, 5.597e-03, -8.963e-02, 5.238e-02, 4.360e-02, -1.070e-01, 7.593e-02, 6.376e-02, -1.498e-01));
	acc += mul(s0_4, M4(3.214e-01, -8.045e-01, 6.621e-01, -1.261e-01, -1.487e+00, 1.086e+00, 3.779e-01, -1.762e+00, 2.721e-01, -3.815e-02, -1.450e-01, 4.063e-01, 2.804e-01, 3.876e-01, 2.607e-01, 2.174e-01));
	acc += mul(s0_5, M4(-3.896e-01, 3.340e-01, -2.529e-01, -6.519e-02, -1.815e-01, 5.542e-02, -1.669e-01, 1.732e-02, 2.995e-01, 4.942e-02, 6.557e-02, -1.386e-01, -1.392e-01, 2.822e-01, 2.016e-02, -1.313e-01));
	acc += mul(s0_6, M4(-2.130e-02, 4.137e-02, 7.324e-02, 4.834e-03, 9.333e-02, -2.998e-01, 4.229e-01, 9.535e-02, -2.595e-02, 2.955e-02, 7.491e-02, -3.028e-02, -2.850e-02, 1.582e-02, -1.076e-01, -3.159e-02));
	acc += mul(s0_7, M4(-3.601e-02, 5.993e-02, -1.190e-02, -6.800e-02, 6.894e-03, -2.095e-01, -9.548e-02, -2.539e-02, -2.390e-02, 2.947e-02, 1.581e-01, -5.305e-03, 1.029e-01, -1.456e-01, -3.526e-02, 9.251e-02));
	acc += mul(s0_8, M4(-7.206e-02, 9.690e-02, -4.464e-02, -6.999e-03, 3.140e-02, -4.201e-02, -6.364e-03, 5.280e-03, -1.412e-01, 1.696e-01, -1.274e-01, -9.546e-02, 5.285e-02, -1.072e-01, 5.994e-02, 1.293e-02));

	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(-1.808e-02, 1.243e-01, -6.814e-02, -4.219e-03, 1.273e-02, 2.752e-02, 3.764e-02, 3.650e-02, 7.663e-04, 6.843e-03, 1.380e-02, -3.235e-02, 5.400e-02, -5.352e-02, 1.190e-02, -1.028e-01));
	acc += mul(s1_1, M4(2.568e-01, 2.764e-01, 7.740e-02, 1.273e-01, 7.059e-02, 6.668e-02, 4.211e-02, 6.293e-02, -4.164e-02, 2.210e-01, -1.293e-02, 8.369e-02, 2.046e-01, 1.238e-01, 9.491e-02, 4.614e-02));
	acc += mul(s1_2, M4(-2.387e-02, 3.174e-01, 8.165e-02, -6.680e-02, -1.516e-02, 1.482e-02, -1.342e-02, 1.692e-02, -2.288e-02, -6.891e-02, -5.559e-02, 4.771e-02, 3.290e-02, 1.234e-01, 4.334e-02, -5.106e-02));
	acc += mul(s1_3, M4(6.216e-02, -2.114e-01, -1.616e-01, 1.664e-01, 3.796e-02, 6.036e-02, -1.106e-01, 1.398e-01, -3.139e-02, -6.274e-02, 4.988e-02, -6.274e-02, 2.296e-02, -5.131e-02, 5.052e-02, -8.866e-02));
	acc += mul(s1_4, M4(2.647e-01, -7.858e-01, 1.597e-01, -8.262e-01, -3.213e-01, 2.427e-01, 1.686e-01, -4.251e-01, 1.505e-01, 3.244e-02, 1.023e-01, 1.962e-01, -1.116e-01, 3.525e-01, 8.848e-01, -1.945e-01));
	acc += mul(s1_5, M4(-2.549e-01, -1.429e-01, -3.696e-02, 3.042e-01, -1.256e-01, 2.760e-02, -3.650e-02, 7.985e-02, -1.958e-01, 3.076e-01, -9.253e-02, -8.512e-02, -1.708e-01, -3.422e-04, -8.181e-02, 2.319e-01));
	acc += mul(s1_6, M4(-3.382e-02, 6.627e-02, 1.158e-01, -3.044e-02, -7.983e-03, -7.855e-02, 1.729e-02, 3.219e-04, -1.764e-02, 4.065e-02, -1.400e-02, -2.387e-02, 2.673e-03, 5.460e-03, -4.992e-02, -1.573e-02));
	acc += mul(s1_7, M4(-2.505e-02, 1.763e-01, -4.433e-01, -1.024e-01, 1.391e-01, -2.435e-01, -5.358e-02, 5.203e-02, 3.157e-02, 2.012e-02, 7.424e-03, 3.723e-02, -2.388e-02, 7.204e-02, -4.522e-01, -1.187e-02));
	acc += mul(s1_8, M4(9.737e-02, 7.067e-02, 4.072e-02, 4.303e-02, 2.890e-02, -1.810e-02, 5.156e-03, -1.953e-02, -3.503e-02, 7.492e-02, 1.402e-02, -9.796e-03, 2.320e-01, -2.135e-01, 1.462e-01, 1.194e-01));

	// Bias.
	acc += V4(-5.006e-05, -2.252e-04, -1.752e-03, 4.586e-04);
	return acc;
}
|
||||
|
||||
// Pass 9 (conv8) entry point: evaluates f0 for one pixel of the input grid
// and stores the result in t0. The taps are read from t1 through l0().
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into the pixel offset inside the block.
void Pass9(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 size = GetInputSize();
	if (any(gxy >= size)) {
		// This thread maps to a pixel outside the input; nothing to write.
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	t0[gxy] = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));
}
|
||||
|
||||
//!PASS 10
|
||||
//!DESC out-shuffle
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
//!IN INPUT, t0
|
||||
//!OUT OUTPUT
|
||||
|
||||
// l0(x, y): reads t0 through the O() helper with the offset float2(x, y) and
// converts the result to V4. O and V4 are defined outside this excerpt;
// presumably O samples relative to the pass function's pos/pt locals (confirm).
#define l0(x, y) V4(O(t0, float2(x, y)))
|
||||
|
||||
// Weighted sum for pass 10 (out-shuffle): each of the 18 inputs is multiplied
// by its own M4 weight matrix with mul(), the products are accumulated in the
// order listed, a constant V4 bias is added, and tanh() is applied to the sum.
//
// s0_0..s0_8: nine V4 taps; Pass10 passes max(tap, 0) of its 3x3 neighbourhood.
// s1_0..s1_8: nine V4 taps; Pass10 passes -max(-tap, 0) of the same neighbourhood.
// Returns tanh of the accumulated V4.
// V4 and M4 are defined outside this excerpt (presumably 4-vector / 4x4 matrix
// aliases). The accumulation order is kept as-is because float sums depend on it.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 acc = 0.0;

	// Contributions of the s0_* inputs.
	acc += mul(s0_0, M4(2.670e-02, -1.964e-03, 2.191e-02, 3.109e-02, 1.911e-02, -2.017e-02, -2.948e-02, -2.237e-02, -3.845e-02, -7.954e-03, -3.472e-02, -2.253e-02, -1.571e-02, -6.613e-03, -1.489e-02, -2.647e-02));
	acc += mul(s0_1, M4(-6.714e-02, -2.106e-02, 7.577e-03, 1.788e-02, 8.081e-02, 8.813e-02, -5.510e-02, -2.724e-02, 1.150e-01, 5.284e-02, -8.964e-02, -3.024e-02, 5.215e-02, 5.334e-02, -1.180e-02, 6.927e-03));
	acc += mul(s0_2, M4(1.036e-02, 1.826e-02, -8.095e-03, -9.967e-03, 1.368e-03, 3.479e-02, -1.887e-03, -2.161e-02, -3.464e-02, -1.124e-01, -4.623e-03, -5.295e-03, -7.199e-03, -4.285e-02, 8.862e-03, -1.610e-02));
	acc += mul(s0_3, M4(2.388e-01, -1.001e-03, 1.699e-01, -4.519e-02, -3.274e-01, 1.550e-01, 3.748e-02, 3.435e-02, -1.655e-01, 1.227e-02, -1.372e-01, 4.700e-02, -1.636e-01, 1.222e-02, -1.323e-01, 3.239e-02));
	acc += mul(s0_4, M4(1.698e-01, 4.561e-01, -1.355e-01, 1.831e-01, -3.815e-01, -7.832e-01, 1.738e-01, 4.516e-02, 2.803e-01, -4.239e-01, 8.945e-01, -1.339e-02, -3.701e-01, -3.731e-01, 1.765e-01, -1.343e-01));
	acc += mul(s0_5, M4(-4.653e-02, -8.470e-02, -1.076e-03, -7.153e-02, 1.022e-02, -2.560e-02, -1.154e-02, 2.252e-02, -1.053e-01, 4.014e-01, -1.479e-01, 3.667e-01, 9.425e-02, -8.079e-02, 5.594e-03, 4.870e-02));
	acc += mul(s0_6, M4(-6.274e-02, -3.430e-02, -5.955e-02, 1.220e-02, -6.075e-02, 1.284e-02, -8.384e-02, 2.143e-01, -2.050e-02, -8.887e-03, -1.445e-02, 1.797e-02, 1.436e-01, -8.067e-04, 1.013e-01, 3.847e-03));
	acc += mul(s0_7, M4(6.862e-02, -7.230e-02, -2.461e-01, -3.760e-01, 4.038e-02, -2.634e-02, -2.725e-01, -4.389e-01, 9.088e-03, -1.873e-02, -9.497e-02, -1.860e-01, -1.038e-01, 2.502e-01, -6.194e-01, 4.470e-02));
	acc += mul(s0_8, M4(-1.984e-02, 4.173e-02, 5.328e-02, 5.554e-02, 1.241e-03, -2.290e-03, 5.972e-02, 4.381e-02, -3.320e-03, -1.434e-04, -5.754e-02, -6.072e-02, -6.854e-03, 6.781e-02, 1.208e-01, -5.469e-02));

	// Contributions of the s1_* inputs.
	acc += mul(s1_0, M4(7.050e-02, -3.676e-02, 7.009e-03, 1.431e-02, -1.258e-02, -6.854e-03, -9.803e-04, 5.955e-03, -3.077e-03, -2.372e-02, 8.060e-03, -5.992e-02, -7.957e-02, 2.905e-02, 3.914e-04, -1.408e-02));
	acc += mul(s1_1, M4(-1.068e-01, 4.589e-02, -1.399e-02, -8.157e-03, 1.811e-02, 7.241e-03, 9.447e-03, 3.242e-03, 5.152e-02, 8.667e-02, -2.512e-02, -2.978e-02, 1.382e-01, 5.481e-02, -2.199e-02, -2.739e-02));
	acc += mul(s1_2, M4(3.676e-02, 1.705e-02, -4.520e-03, -6.449e-03, 1.006e-02, 9.807e-03, -6.046e-03, -1.299e-03, -5.035e-02, -4.415e-02, 9.619e-03, -1.059e-02, -6.952e-03, -1.803e-02, -4.042e-03, -1.751e-02));
	acc += mul(s1_3, M4(5.123e-02, 4.500e-02, 2.099e-01, -7.254e-03, -7.977e-02, 2.822e-02, -1.546e-01, -3.748e-02, -2.378e-01, -1.836e-02, -3.508e-02, -2.147e-03, 3.371e-02, -4.720e-02, -5.574e-02, -1.592e-02));
	acc += mul(s1_4, M4(-5.764e-01, 5.998e-01, -2.288e-01, 7.223e-01, -1.855e-01, -3.467e-01, 5.173e-02, -8.967e-02, 3.308e-01, -8.987e-02, 2.397e-01, 3.701e-01, -7.970e-02, -9.046e-01, 2.397e-01, -1.626e-01));
	acc += mul(s1_5, M4(1.177e-02, -1.538e-01, 4.138e-02, -5.198e-02, 3.165e-03, 3.827e-02, -5.913e-03, 8.727e-03, 7.885e-02, 2.979e-01, -6.160e-02, 1.198e-01, 1.186e-02, 9.421e-02, -4.101e-02, 4.185e-03));
	acc += mul(s1_6, M4(-7.690e-02, -4.820e-03, -1.106e-01, 4.040e-02, -6.883e-02, -3.284e-02, 1.259e-02, 1.509e-01, 6.378e-03, -5.293e-04, -3.690e-02, 6.274e-02, 1.401e-01, -3.801e-03, 1.489e-01, -1.044e-02));
	acc += mul(s1_7, M4(1.140e-01, -1.333e-01, -1.739e-01, -1.739e-01, 4.736e-02, -1.306e-02, -3.673e-01, -6.127e-01, -3.477e-02, -6.090e-02, 2.430e-02, -2.666e-01, -6.599e-02, 2.794e-01, -1.724e-01, -2.744e-01));
	acc += mul(s1_8, M4(1.045e-02, 6.106e-02, 3.463e-02, 6.708e-02, -1.028e-02, -2.277e-02, 6.536e-02, 8.227e-02, -5.566e-02, -3.941e-02, -6.862e-03, -1.219e-02, -1.438e-02, -4.651e-02, 5.359e-02, 4.650e-02));

	// Bias, then the tanh output activation.
	acc += V4(-1.731e-03, -2.098e-03, -1.131e-03, -1.644e-03);
	return tanh(acc);
}
|
||||
|
||||
// Pass 10 (out-shuffle) entry point: evaluates f0 once and spreads its four
// components over a 2x2 group of OUTPUT pixels. For each output pixel, INPUT
// is sampled, converted with rgb2yuv, one f0 component is added to the first
// (Y) channel and saturated, and the result is converted back with yuv2rgb.
//
// blockStart: origin of the block this thread group works on.
// tid:        thread id; only tid.x is used. Rmp8x8 (defined outside this
//             excerpt) turns it into an offset that is doubled here, so each
//             thread owns a 2x2 output group.
void Pass10(uint2 blockStart, uint3 tid) {
	// NOTE: pt, gxy, size and pos deliberately keep their original names. l0()
	// expands to O(), which is defined outside this excerpt and may refer to
	// these locals by name.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + (Rmp8x8(tid.x) << 1);
	uint2 size = GetOutputSize();
	if (any(gxy >= size)) {
		// The top-left pixel of this 2x2 group is outside the output.
		return;
	}
	// gxy >> 1 is the input-grid pixel that this output group belongs to.
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	// Raw 3x3 neighbourhood. tapRC: R = y offset + 1, C = x offset + 1.
	V4 tap00 = l0(-1.0, -1.0);
	V4 tap01 = l0(0.0, -1.0);
	V4 tap02 = l0(1.0, -1.0);
	V4 tap10 = l0(-1.0, 0.0);
	V4 tap11 = l0(0.0, 0.0);
	V4 tap12 = l0(1.0, 0.0);
	V4 tap20 = l0(-1.0, 1.0);
	V4 tap21 = l0(0.0, 1.0);
	V4 tap22 = l0(1.0, 1.0);

	// f0 receives max(tap, 0) for all nine taps first, followed by
	// -max(-tap, 0) for the same taps in the same order.
	V4 lumaDelta = f0(
		max(tap00, 0.0), max(tap01, 0.0), max(tap02, 0.0),
		max(tap10, 0.0), max(tap11, 0.0), max(tap12, 0.0),
		max(tap20, 0.0), max(tap21, 0.0), max(tap22, 0.0),
		-max(-tap00, 0.0), -max(-tap01, 0.0), -max(-tap02, 0.0),
		-max(-tap10, 0.0), -max(-tap11, 0.0), -max(-tap12, 0.0),
		-max(-tap20, 0.0), -max(-tap21, 0.0), -max(-tap22, 0.0));

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 opt = float2(GetOutputPt());

	// Sample positions of the four output pixels. Each one is derived from the
	// previous one with the same add/subtract steps as a running position
	// (left-top -> right-top -> right-bottom -> left-bottom), so the float
	// results stay identical to stepping a single variable.
	float2 posLT = pos - 0.5f * opt;
	float2 posRT = float2(posLT.x + opt.x, posLT.y);
	float2 posRB = float2(posRT.x, posRT.y + opt.y);
	float2 posLB = float2(posRB.x - opt.x, posRB.y);

	// Left-top pixel uses component x.
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, posLT, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + lumaDelta.x), yuv.yz)), 1);

	// Right-top pixel uses component y.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, posRT, 0).rgb);
	OUTPUT[gxy + uint2(1, 0)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + lumaDelta.y), yuv.yz)), 1);

	// Right-bottom pixel uses component w.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, posRB, 0).rgb);
	OUTPUT[gxy + uint2(1, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + lumaDelta.w), yuv.yz)), 1);

	// Left-bottom pixel uses component z.
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, posLB, 0).rgb);
	OUTPUT[gxy + uint2(0, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv.r + lumaDelta.z), yuv.yz)), 1);
}
|
||||
1573
src/Effects/CuNNy/CuNNy-8x8C-NVL-DN.hlsl
Normal file
1573
src/Effects/CuNNy/CuNNy-8x8C-NVL-DN.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1573
src/Effects/CuNNy/CuNNy-8x8C-NVL.hlsl
Normal file
1573
src/Effects/CuNNy/CuNNy-8x8C-NVL.hlsl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -2,9 +2,7 @@
|
|||
// Port from https://github.com/haasn/gentoo-conf/blob/xor/home/nand/.mpv/shaders/deband.glsl
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Threshold
|
||||
|
|
@ -54,6 +52,11 @@ float grain;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
|
@ -66,6 +69,7 @@ SamplerState sam1;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post
|
||||
// Floored remainder of x with respect to 289: subtracts the largest multiple
// of 289 that does not exceed x.
float mod289(float x) {
	const float period = 289.0;
	return x - floor(x / period) * period;
}
|
||||
|
|
|
|||
|
|
@ -29,36 +29,24 @@
|
|||
<CopyFileToFolders Include="ACNet.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Bicubic.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Jinc.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Lanczos.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Bilinear.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Nearest.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="SSimDownscaler.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="Bicubic.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="ImageAdjustment.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="SMAA\SMAA.hlsli">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="SMAA\SMAA_High.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -71,20 +59,14 @@
|
|||
<CopyFileToFolders Include="SMAA\SMAA_Ultra.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="FSR\FSR_EASU.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="FSR\FSR_RCAS.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="SMAA\AreaTex.dds" />
|
||||
<CopyFileToFolders Include="SMAA\SearchTex.dds" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_3D_AA_Upscale_US.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -160,8 +142,6 @@
|
|||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_VL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="CRT\CRT_Easymode.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -177,8 +157,6 @@
|
|||
<CopyFileToFolders Include="CRT\GTU_v050.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="xBRZ\xBRZ_2x.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -197,13 +175,9 @@
|
|||
<CopyFileToFolders Include="xBRZ\xBRZ_Freescale.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="FXAA\FXAA.hlsli">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="FXAA\FXAA_High.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -213,56 +187,153 @@
|
|||
<CopyFileToFolders Include="FXAA\FXAA_Ultra.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="RAVU\prescalers.hlsli">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R2_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R3_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R4_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R2_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R3_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R4_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3_Weights.dds" />
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_Weights.dds" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_RGB.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut2_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut3_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut4_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut2_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut3_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut4_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut2_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut3_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut4_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_ar_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_ar_f16.dds" />
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_f16.dds" />
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x6.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x6.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x6.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x6.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x4.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x6.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="NNEDI3\prescalers.hlsli">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NIS\NIS.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NIS\NVSharpen.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="NIS\Coef_Scale.dds" />
|
||||
<CopyFileToFolders Include="NIS\Coef_USM.dds" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="CAS\CAS.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CAS\CAS_Scaling.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="FSRCNNX\FSRCNNX.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="FSRCNNX\FSRCNNX_LineArt.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="Sharpen\AdaptiveSharpen.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -275,8 +346,6 @@
|
|||
<CopyFileToFolders Include="Sharpen\LumaSharpen.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="Pixel Art\MMPX.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -286,11 +355,81 @@
|
|||
<CopyFileToFolders Include="Pixel Art\SharpBilinear.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="Deband.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Bilinear.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Nearest.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x3_L.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
|
||||
<FileType>Document</FileType>
|
||||
</CopyFileToFolders>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
</Project>
|
||||
|
|
@ -2,11 +2,8 @@
|
|||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<CopyFileToFolders Include="ACNet.hlsl" />
|
||||
<CopyFileToFolders Include="Bicubic.hlsl" />
|
||||
<CopyFileToFolders Include="Jinc.hlsl" />
|
||||
<CopyFileToFolders Include="Lanczos.hlsl" />
|
||||
<CopyFileToFolders Include="Bilinear.hlsl" />
|
||||
<CopyFileToFolders Include="Nearest.hlsl" />
|
||||
<CopyFileToFolders Include="SSimDownscaler.hlsl" />
|
||||
<CopyFileToFolders Include="ImageAdjustment.hlsl" />
|
||||
<CopyFileToFolders Include="SMAA\SMAA.hlsli">
|
||||
|
|
@ -59,7 +56,7 @@
|
|||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Restore_Soft_UL.hlsl">
|
||||
<Filter>Anime4K</Filter>
|
||||
</CopyFileToFolders>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Restore_Soft_VL.hlsl">
|
||||
<Filter>Anime4K</Filter>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -156,24 +153,159 @@
|
|||
<CopyFileToFolders Include="FXAA\FXAA.hlsli">
|
||||
<Filter>FXAA</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\prescalers.hlsli">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut2_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut3_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_3x_lut4_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R2_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R3_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R4_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_3x_R4.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_AR_R4.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut2_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut3_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lite_lut4_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R3_Weights.dds">
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Lite_R4.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut2_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut3_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_lut4_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R2_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R3_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R4_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_R4.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_AR_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_ar_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut2_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_ar_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\ravu_zoom_lut3_f16.dds">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R2.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_Weights.dds">
|
||||
<CopyFileToFolders Include="RAVU\RAVU_Zoom_R3_RGB.hlsl">
|
||||
<Filter>RAVU</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x4.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns128_win8x6.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x4.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns16_win8x6.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x4.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns256_win8x6.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x4.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns32_win8x6.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x4.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\NNEDI3_nns64_win8x6.hlsl">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NNEDI3\prescalers.hlsli">
|
||||
<Filter>NNEDI3</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="NIS\NIS.hlsl">
|
||||
<Filter>NIS</Filter>
|
||||
</CopyFileToFolders>
|
||||
|
|
@ -220,6 +352,75 @@
|
|||
<Filter>Pixel Art</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Deband.hlsl" />
|
||||
<CopyFileToFolders Include="Nearest.hlsl" />
|
||||
<CopyFileToFolders Include="Bilinear.hlsl" />
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x3_L.hlsl">
|
||||
<Filter>Anime4K</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Anime4K\Anime4K_Upscale_GAN_x2_M.hlsl">
|
||||
<Filter>Anime4K</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-2x4C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-3x4C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x4C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x8C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-4x16C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-6x8C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x4C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x8C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-8x16C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="CuNNy\CuNNy-16x16C-NVL-DN.hlsl">
|
||||
<Filter>CuNNy</Filter>
|
||||
</CopyFileToFolders>
|
||||
<CopyFileToFolders Include="Bicubic.hlsl" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Filter Include="Anime4K">
|
||||
|
|
@ -261,5 +462,8 @@
|
|||
<Filter Include="Pixel Art">
|
||||
<UniqueIdentifier>{0b58f073-84cb-4c38-919d-80176ae408bc}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="CuNNy">
|
||||
<UniqueIdentifier>{9157745b-aa96-42ce-bdc6-1230dffa326b}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
|
|
@ -2,11 +2,13 @@
|
|||
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
|
|
@ -15,6 +17,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -228,12 +231,13 @@ float3 FsrEasuF(uint2 pos, float4 con0, float4 con1, float4 con2, float2 con3) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = blockStart + Rmp8x8(threadId.x);
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint2 inputSize = GetInputSize();
|
||||
uint2 outputSize = GetOutputSize();
|
||||
float2 inputPt = GetInputPt();
|
||||
|
||||
float4 con0, con1, con2;
|
||||
|
|
@ -271,20 +275,20 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
con3[0] = 0;
|
||||
con3[1] = 4.0f * inputPt.y;
|
||||
|
||||
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
|
||||
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
|
||||
|
||||
gxy.x += 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
|
||||
}
|
||||
|
||||
gxy.y += 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
|
||||
}
|
||||
|
||||
gxy.x -= 8u;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrEasuF(gxy, con0, con1, con2, con3));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrEasuF(gxy, con0, con1, con2, con3), 1);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,9 +2,7 @@
|
|||
// 移植自 https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -18,12 +16,19 @@ float sharpness;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -108,7 +113,9 @@ float3 FsrRcasF(float3 b, float3 d, float3 e, float3 f, float3 h) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = blockStart + (Rmp8x8(threadId.x) << 1);
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -126,20 +133,20 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
src[3][1] = INPUT.Load(int3(gxy.x + 2, gxy.y, 0)).rgb;
|
||||
src[3][2] = INPUT.Load(int3(gxy.x + 2, gxy.y + 1, 0)).rgb;
|
||||
|
||||
WriteToOutput(gxy, FsrRcasF(src[1][0], src[0][1], src[1][1], src[2][1], src[1][2]));
|
||||
OUTPUT[gxy] = float4(FsrRcasF(src[1][0], src[0][1], src[1][1], src[2][1], src[1][2]), 1);
|
||||
|
||||
++gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrRcasF(src[2][0], src[1][1], src[2][1], src[3][1], src[2][2]));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrRcasF(src[2][0], src[1][1], src[2][1], src[3][1], src[2][2]), 1);
|
||||
}
|
||||
|
||||
++gxy.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrRcasF(src[2][1], src[1][2], src[2][2], src[3][2], src[2][3]));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrRcasF(src[2][1], src[1][2], src[2][2], src[3][2], src[2][3]), 1);
|
||||
}
|
||||
|
||||
--gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, FsrRcasF(src[1][1], src[0][2], src[1][2], src[2][2], src[1][3]));
|
||||
if (gxy.x < outputSize.x && gxy.y < outputSize.y) {
|
||||
OUTPUT[gxy] = float4(FsrRcasF(src[1][1], src[0][2], src[1][2], src[2][2], src[1][3]), 1);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,14 +3,17 @@
|
|||
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -534,6 +537,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 6
|
||||
//!DESC sub-pixel convolution, aggregation
|
||||
//!IN tex3, tex4, INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -551,7 +555,8 @@ const static float3x3 yuv2rgb = {
|
|||
void Pass6(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -609,15 +614,9 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
|||
for (uint j = 0; j <= 1; ++j) {
|
||||
const uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(destPos)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||
const uint index = i * 2 + j;
|
||||
WriteToOutput(destPos, mul(yuv2rgb, float3(result[index], originUV)));
|
||||
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(result[index], originUV)), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,14 +2,17 @@
|
|||
// 移植自 https://github.com/igv/FSRCNN-TensorFlow
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
|
|
@ -531,6 +534,7 @@ void Pass5(uint2 blockStart, uint3 threadId) {
|
|||
//!PASS 6
|
||||
//!DESC sub-pixel convolution, aggregation
|
||||
//!IN tex3, tex4, INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -548,7 +552,8 @@ const static float3x3 yuv2rgb = {
|
|||
void Pass6(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -606,15 +611,9 @@ void Pass6(uint2 blockStart, uint3 threadId) {
|
|||
for (uint j = 0; j <= 1; ++j) {
|
||||
const uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(destPos)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||
const uint index = i * 2 + j;
|
||||
WriteToOutput(destPos, mul(yuv2rgb, float3(result[index], originUV)));
|
||||
OUTPUT[destPos] = float4(mul(yuv2rgb, float3(result[index], originUV)), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME FXAA_1
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
|
@ -17,6 +20,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -26,7 +30,9 @@ SamplerState sam;
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
[unroll]
|
||||
for (j = 0; j <= 1; ++j) {
|
||||
uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(gxy)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
|
||||
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME FXAA_0
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
|
@ -17,6 +20,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -26,7 +30,9 @@ SamplerState sam;
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
[unroll]
|
||||
for (j = 0; j <= 1; ++j) {
|
||||
uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(gxy)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
|
||||
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
// 移植自 https://github.com/libretro/slang-shaders/blob/master/anti-aliasing/shaders/fxaa.slang
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME FXAA_2
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam;
|
||||
|
|
@ -17,6 +20,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -26,7 +30,9 @@ SamplerState sam;
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -57,14 +63,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
[unroll]
|
||||
for (j = 0; j <= 1; ++j) {
|
||||
uint2 destPos = gxy + uint2(i, j);
|
||||
|
||||
if (i != 0 || j != 0) {
|
||||
if (!CheckViewport(gxy)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
WriteToOutput(destPos, FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt));
|
||||
OUTPUT[destPos] = float4(FXAA(src, i + 1, j + 1, INPUT, sam, (destPos + 0.5f) * inputPt, inputPt), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
// 移植自 https://github.com/libretro/slang-shaders/blob/3f67e1870dbd5be74ae2f09eaed0eeadce6abd15/misc/image-adjustment.slang
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -89,6 +87,11 @@ float b;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -97,6 +100,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
float3 RGBtoHSV(float3 c) {
|
||||
float4 K = float4(0.0, -1.0 / 3.0, 2.0 / 3.0, -1.0);
|
||||
|
|
|
|||
|
|
@ -10,8 +10,7 @@
|
|||
// B = 0.825 to get rid of dithering. Increase B to get a fine sharpness, though dithering returns.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!GENERIC_DOWNSCALER
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -41,6 +40,9 @@ float ARStrength;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -48,6 +50,7 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 8
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
|
@ -70,7 +73,9 @@ float4 resampler(float4 x, float wa, float wb) {
|
|||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = Rmp8x8(threadId.x) + blockStart;
|
||||
if (!CheckViewport(gxy)) {
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -126,5 +131,5 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
color = lerp(color, clamp(color, min_sample, max_sample), ARStrength);
|
||||
|
||||
// final sum and weight normalization
|
||||
WriteToOutput(gxy, color);
|
||||
OUTPUT[gxy] = float4(color, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@
|
|||
// 移植自 https://github.com/libretro/common-shaders/blob/master/windowed/shaders/lanczos6.cg
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!GENERIC_DOWNSCALER
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -17,6 +16,9 @@ float ARStrength;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -25,6 +27,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
#define FIX(c) max(abs(c), 1e-5)
|
||||
#define PI 3.14159265359
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!PARAMETER
|
||||
|
|
@ -15,6 +15,9 @@ float sharpness;
|
|||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!SOURCE Coef_Scale.dds
|
||||
//!FORMAT R16G16B16A16_FLOAT
|
||||
|
|
@ -32,6 +35,7 @@ SamplerState samplerLinearClamp;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT, coef_scaler, coef_usm
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 32,32
|
||||
//!NUM_THREADS 256
|
||||
|
||||
|
|
@ -431,12 +435,18 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
// discretized phase
|
||||
const int fx_int = int(fx * kPhaseCount);
|
||||
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (dstX >= outputSize.x) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k) {
|
||||
// y coord inside the output image
|
||||
const int dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
|
||||
if (!CheckViewport(int2(dstX, dstY))) {
|
||||
if (dstY >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
// y coord inside the input image
|
||||
const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
|
||||
|
||||
|
|
@ -487,13 +497,13 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
// do bilinear tap for chroma upscaling
|
||||
|
||||
float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0).rgb;
|
||||
float4 op = INPUT.SampleLevel(samplerLinearClamp, float2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY), 0);
|
||||
|
||||
const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
|
||||
op.x += corr;
|
||||
op.y += corr;
|
||||
op.z += corr;
|
||||
|
||||
WriteToOutput(uint2(dstX, dstY), op);
|
||||
OUTPUT[uint2(dstX, dstY)] = op;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,9 @@
|
|||
// 移植自 https://github.com/NVIDIAGameWorks/NVIDIAImageScaling/blob/main/NIS/NIS_Scaler.h
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState samplerLinearClamp;
|
||||
|
||||
//!PARAMETER
|
||||
//!LABEL Sharpness
|
||||
//!DEFAULT 0.5
|
||||
|
|
@ -21,9 +12,22 @@ SamplerState samplerLinearClamp;
|
|||
//!STEP 0.01
|
||||
float sharpness;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState samplerLinearClamp;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 32, 32
|
||||
//!NUM_THREADS 256
|
||||
|
||||
|
|
@ -208,6 +212,8 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
const int2 outputSize = (int2)GetOutputSize();
|
||||
|
||||
for (int k = int(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE) {
|
||||
const int2 pos = int2(uint(k) % uint(NIS_BLOCK_WIDTH), uint(k) / uint(NIS_BLOCK_WIDTH));
|
||||
|
||||
|
|
@ -215,7 +221,7 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
const int dstX = dstBlockX + pos.x;
|
||||
const int dstY = dstBlockY + pos.y;
|
||||
|
||||
if (!CheckViewport(int2(dstX, dstY))) {
|
||||
if (dstX >= outputSize.x || dstY >= outputSize.y) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -238,9 +244,9 @@ void Pass1(uint2 blockStart, uint3 threadId) {
|
|||
// final USM is a weighted sum filter outputs
|
||||
const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);
|
||||
|
||||
float3 op = INPUT.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kSrcNormX, (dstY + 0.5f) * kSrcNormY), 0).rgb;
|
||||
float4 op = INPUT.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kSrcNormX, (dstY + 0.5f) * kSrcNormY), 0);
|
||||
op += usmY;
|
||||
|
||||
WriteToOutput(uint2(dstX, dstY), op);
|
||||
OUTPUT[uint2(dstX, dstY)] = op;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
4024
src/Effects/NNEDI3/NNEDI3_nns128_win8x4.hlsl
Normal file
4024
src/Effects/NNEDI3/NNEDI3_nns128_win8x4.hlsl
Normal file
File diff suppressed because it is too large
Load diff
5735
src/Effects/NNEDI3/NNEDI3_nns128_win8x6.hlsl
Normal file
5735
src/Effects/NNEDI3/NNEDI3_nns128_win8x6.hlsl
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,206 +1,732 @@
|
|||
// nnedi3-nns16-win8x4
|
||||
// 移植自 https://github.com/bjin/mpv-prescalers/blob/cc02ed95c1fe05b72bc21d41257c4c085e6e409b/compute/nnedi3-nns16-win8x4.hook
|
||||
// 有半像素的偏移
|
||||
// This file is generated by the scripts available at https://github.com/hauuau/magpie-prescalers
|
||||
// Please don't edit this file directly.
|
||||
// Generated by: nnedi3.py --nns 16 --win 8x4 --use-compute-shader --use-magpie
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
|
||||
//!VERSION 4
|
||||
//!SORT_NAME NNEDI3_016_4
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
SamplerState sam_INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 1 * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2 * 1
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam1;
|
||||
SamplerState sam_INPUT_LINEAR;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
//!FORMAT R16_FLOAT
|
||||
Texture2D tex1;
|
||||
//!WIDTH INPUT_WIDTH * 1
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D temp;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam_temp;
|
||||
|
||||
//!COMMON
|
||||
#include "prescalers.hlsli"
|
||||
|
||||
#define T(x) asfloat(x)
|
||||
#define W(i,w0,w1,w2,w3) dot(samples[i],float4(T(w0),T(w1),T(w2),T(w3)))
|
||||
#define WS(w0,w1) sum1 = exp(sum1 * mstd2 + T(w0)); sum2 = sum2 * mstd2 + T(w1); wsum += sum1; vsum += sum1*(sum2/(1.0+abs(sum2)))
|
||||
|
||||
#define LAST_PASS 2
|
||||
|
||||
//!PASS 1
|
||||
//!DESC double_y
|
||||
//!DESC NNEDI3 (double_y, nns16, win8x4)
|
||||
//!IN INPUT
|
||||
//!OUT tex1
|
||||
//!BLOCK_SIZE 32,16
|
||||
//!NUM_THREADS 32,8
|
||||
|
||||
|
||||
float nnedi3(float4 samples[8]) {
|
||||
//!OUT temp
|
||||
//!BLOCK_SIZE 32, 16
|
||||
//!NUM_THREADS 32, 8
|
||||
#pragma optionNV(inline none)
|
||||
float nnedi3(vec4 samples[8]) {
|
||||
float sum = 0.0, sumsq = 0.0;
|
||||
[unroll]
|
||||
for (int i = 0; i < 8; i++) {
|
||||
sum += dot(samples[i], 1.0f);
|
||||
[unroll] for (int i = 0; i < 8; i++) {
|
||||
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
|
||||
sumsq += dot(samples[i], samples[i]);
|
||||
}
|
||||
|
||||
float mstd0 = sum / 32.0;
|
||||
float mstd1 = sumsq / 32.0 - mstd0 * mstd0;
|
||||
// 不能使用 lerp,否则结果可能为 nan
|
||||
float mstd2 = mstd1 >= 1.192092896e-7 ? rsqrt(mstd1) : 0.0;
|
||||
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
|
||||
mstd1 *= mstd2;
|
||||
|
||||
float vsum = 0.0, wsum = 0.0, sum1, sum2;
|
||||
#define T(x) intBitsToFloat(x)
|
||||
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
|
||||
#define WS(w0, w1) \
|
||||
sum1 = exp(sum1 * mstd2 + T(w0)); \
|
||||
sum2 = sum2 * mstd2 + T(w1); \
|
||||
wsum += sum1; \
|
||||
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
|
||||
sum1 = W(0, -1123354974, -1112248839, 1046299686, -1143613552)
|
||||
+ W(1, -1118620174, 1024662558, 1028038478, -1129268360)
|
||||
+ W(2, 1016130204, -1087068557, 1063313277, -1103342192)
|
||||
+ W(3, -1103968288, 1048182784, 1047279381, -1115088511)
|
||||
+ W(4, -1101453425, 1059583965, -1088182320, 1003350800)
|
||||
+ W(5, -1117908518, -1119323982, 1034186247, -1134684248)
|
||||
+ W(6, -1122284590, 1027638054, -1124394588, -1111377363)
|
||||
+ W(7, -1122818124, -1137723992, 978245507, 1028117438);
|
||||
sum2 = W(0, -1162931039, -1131063526, 1029801649, -1117642655)
|
||||
+ W(1, -1136248556, -1131086728, 1031011705, -1128864654)
|
||||
+ W(2, -1115594515, -1128443230, 1042762789, -1107118398)
|
||||
+ W(3, -1119907402, 1044675527, 1050674207, -1113986381)
|
||||
+ W(4, 1022791334, -1107588397, 1009001220, -1186206458)
|
||||
+ W(5, 1017500018, -1111169922, -1112569685, 1017255694)
|
||||
+ W(6, -1156766128, -1125594766, -1148613464, 993928432)
|
||||
+ W(7, 1014782692, -1135599628, -1114139175, 1007622876);
|
||||
WS(1038828992, 1041685264);
|
||||
sum1 = W(0, -1114329248, 1049950910, -1097681183, 1028668144) + W(1, 995958527, 1027336960, -1107326552, 1025858258)
|
||||
+ W(2, -1117673776, 1060640651, -1085831405, 1033402064)
|
||||
+ W(3, 1034401008, 1045782072, -1105157973, -1122828000)
|
||||
+ W(4, 1038612842, -1098159517, 1053136924, -1110558370)
|
||||
+ W(5, 1035088196, -1106507532, 1032016120, -1113173980)
|
||||
+ W(6, 1008781376, -1124000392, 1023707152, 1012109856)
|
||||
+ W(7, 1029875310, -1105439902, 1034119968, -1114749520);
|
||||
sum2 =
|
||||
W(0, 1031315360, -1099468189, -1112139926, 1036663822) + W(1, -1131767489, -1140834082, 1024287080, -1122285462)
|
||||
+ W(2, 1023637252, -1100127579, -1117241706, 1038018354)
|
||||
+ W(3, -1107869385, 1052854494, 1052996200, -1112496415)
|
||||
+ W(4, -1107666272, 1034036134, 1027811452, -1110479054)
|
||||
+ W(5, -1117110288, 1024451620, 1027157968, -1112615559) + W(6, -1124350185, 1003450083, -1131082337, 998992195)
|
||||
+ W(7, -1110538107, 1041131277, 1035032776, -1106762474);
|
||||
WS(-1086074680, 1053637716);
|
||||
sum1 = W(0, -1121345387, 1042002951, -1113042450, -1121398619)
|
||||
+ W(1, -1148805338, -1165378922, -1115297518, 991217235)
|
||||
+ W(2, -1136570733, 1052460699, -1107443934, -1117268427)
|
||||
+ W(3, 1049266593, -1094571489, -1098765182, 1036113926)
|
||||
+ W(4, 1027081787, -1124281856, 1043313411, -1136658365)
|
||||
+ W(5, -1133439181, 1040734807, 1006695533, -1112513138)
|
||||
+ W(6, -1158465386, -1121708851, 1016359031, 1021173351)
|
||||
+ W(7, -1120818857, 1035650578, 1027853163, -1106476275);
|
||||
sum2 = W(0, 1026517575, -1170492850, -1138816415, -1143472678)
|
||||
+ W(1, 1017334370, 1003954710, -1132363566, 998846550)
|
||||
+ W(2, 1051558711, -1096673587, -1136175651, -1124275402)
|
||||
+ W(3, 1071692777, -1077357700, -1098960792, 1018703670)
|
||||
+ W(4, 1049822619, -1098179385, -1116986501, 1007812651) + W(5, 1020207734, 996694924, 1003290486, 1007766851)
|
||||
+ W(6, 1022251878, -1122577241, -1141894102, 1009415395)
|
||||
+ W(7, 1019995718, 1015494226, -1126828734, -1163222937);
|
||||
WS(1051521136, 1027207116);
|
||||
sum1 =
|
||||
W(0, -1122694020, 1010830545, -1124291704, 1018062184) + W(1, -1121133108, -1124202632, 1037913146, -1116091286)
|
||||
+ W(2, -1102175837, 1057246783, -1093542759, 1041281977) + W(3, -1116351908, 1026322980, 982577970, -1125394504)
|
||||
+ W(4, 1045518980, -1089509425, 1055793637, 1008755233) + W(5, 1009393969, 1025178484, -1118947636, -1127575032)
|
||||
+ W(6, 1008379217, -1117338572, 1001093793, 1015898776) + W(7, 1015772516, 1009646833, 1001810977, -1121163492);
|
||||
sum2 = W(0, -1137495011, -1135527491, 1027730022, -1118108263)
|
||||
+ W(1, 1013616911, -1123650952, 1024465134, -1128775579)
|
||||
+ W(2, -1135578111, 1013443151, 1049128967, -1098008683)
|
||||
+ W(3, 1029346938, -1114797945, 1068130737, -1080443718)
|
||||
+ W(4, 1017473747, -1122100892, 1046423571, -1101482344)
|
||||
+ W(5, 1012413655, -1128721387, -1143058109, -1137148015)
|
||||
+ W(6, -1133405571, -1166794345, 1020545683, -1128178767)
|
||||
+ W(7, 1008139351, -1156685818, -1126785325, 991435034);
|
||||
WS(1057767608, -1132080751);
|
||||
sum1 = W(0, 1026028453, 1025766741, 1035118319, 1012106581) + W(1, 1026017621, -1135552917, 1040474693, -1138611630)
|
||||
+ W(2, -1117947285, 1051769667, -1111744027, 1030333189)
|
||||
+ W(3, 1048679017, -1083959172, -1084413328, 1045191121)
|
||||
+ W(4, 1025261389, -1120826122, 1049618505, -1122181545)
|
||||
+ W(5, 1011196341, 1045191525, -1110336171, 1030480605) + W(6, 1015828970, 1028389741, 1028257397, 1027514349)
|
||||
+ W(7, 1025013027, 1039505775, -1123719333, 1020294666);
|
||||
sum2 = W(0, 1017587161, -1101123140, 1040188371, 988296658) + W(1, 1028118553, -1103020887, 1022642341, 1010063898)
|
||||
+ W(2, 1008167722, -1099714612, 1039093756, 1026403646) + W(3, 1005112948, 1049070164, 1046164698, 1033545355)
|
||||
+ W(4, -1125344655, 1032013714, -1111525569, 1002132020)
|
||||
+ W(5, 1015776789, 1022049457, -1098832696, 1037334715)
|
||||
+ W(6, -1148301500, 1009340114, -1115917000, -1139728254)
|
||||
+ W(7, -1138850406, -1167693540, -1103378287, 1035581889);
|
||||
WS(-1099372256, -1088618788);
|
||||
sum1 = W(0, -1112538182, 1048693927, -1112344546, -1109099742)
|
||||
+ W(1, -1113349022, 1033711782, -1129092599, -1110127398)
|
||||
+ W(2, -1103996671, 1064716592, -1086749016, 1032699126)
|
||||
+ W(3, 1024020908, -1143605597, 1044926535, -1121424940)
|
||||
+ W(4, 1046614908, -1085173359, 1062252083, -1130166943)
|
||||
+ W(5, -1111225386, 1004694493, 1040479887, -1106709441)
|
||||
+ W(6, -1110537326, -1108087402, 1034104622, -1120726228)
|
||||
+ W(7, -1114146165, -1138402062, 1042110371, -1106064827);
|
||||
sum2 = W(0, 987083788, 1013472954, -1120418118, 979955865)
|
||||
+ W(1, -1144106823, -1131186779, -1122269098, -1163904780)
|
||||
+ W(2, -1120467381, -1139561796, 1038342084, -1115615181)
|
||||
+ W(3, -1121977305, 1044091298, 1042996066, -1127292875)
|
||||
+ W(4, -1118651341, 1038343490, -1118476220, -1123141745)
|
||||
+ W(5, -1162389292, -1115306287, -1128689408, 1014320394)
|
||||
+ W(6, -1152635694, -1155962630, -1132569906, -1135582470)
|
||||
+ W(7, 964510307, -1117365756, -1141833923, 1008840046);
|
||||
WS(1041282784, 1044242623);
|
||||
sum1 = W(0, -1119885764, -1171512555, 1003864029, 1025494836)
|
||||
+ W(1, -1119816052, -1121861252, 1040963149, -1113504879)
|
||||
+ W(2, -1100880653, 1057266723, -1094412795, 1043843337)
|
||||
+ W(3, -1113812594, 1010135439, -1118004569, -1125989575)
|
||||
+ W(4, 1046531310, -1089952515, 1056310444, -1156936827)
|
||||
+ W(5, 1015358999, 1031135156, -1114099002, -1122714492)
|
||||
+ W(6, 1005085853, -1115226950, 1015234855, 1003362397) + W(7, 1021011107, 1003139037, 992693307, -1120612644);
|
||||
sum2 =
|
||||
W(0, 1005317381, -1142619324, -1126266146, 1026462555) + W(1, -1143827754, 1012902153, -1128784654, 1020893616)
|
||||
+ W(2, 1019060164, -1114788024, -1094218173, 1054132458)
|
||||
+ W(3, 1009279342, -1098688460, -1078812823, 1070492026)
|
||||
+ W(4, 1014092605, -1120377499, -1099532818, 1048935725) + W(5, -1131000233, 1017453102, 1007638067, 1011358224)
|
||||
+ W(6, 1012779564, -1139793504, -1130333980, 1015734963)
|
||||
+ W(7, -1137528453, -1147729078, 1018177647, 987943782);
|
||||
WS(1046635232, 1024078131);
|
||||
sum1 = W(0, 1002735212, 1035063871, -1097977761, 1040314319) + W(1, 1025138813, 1034039879, -1105608655, 1035664624)
|
||||
+ W(2, 1017042555, 1044122447, -1094991056, 1038536855)
|
||||
+ W(3, -1132524982, -1110416695, 1051547730, -1114843703)
|
||||
+ W(4, 1031803657, -1092481954, 1050188814, 1003107468) + W(5, 1033606155, -1094320024, 1047410847, 1019470987)
|
||||
+ W(6, 1021596219, -1107502027, 1031346589, 1021345835)
|
||||
+ W(7, 1015508823, -1103391009, 1046101811, -1136683190);
|
||||
sum2 =
|
||||
W(0, -1096475926, 1044036812, 1052862983, -1106234474) + W(1, -1112281069, -1112231286, 1024115789, -1121785528)
|
||||
+ W(2, -1116645717, -1111398905, 1051331710, -1130292776)
|
||||
+ W(3, 1041647377, -1096068583, 1038036111, 1037359643) + W(4, -1113263240, 1026411348, 1042458641, -1111704128)
|
||||
+ W(5, 1023473494, -1114320784, 1028002558, -1123406807)
|
||||
+ W(6, -1117017643, -1138574198, 1037890580, -1109714921)
|
||||
+ W(7, 1039764966, -1104710548, -1106844581, 1041123403);
|
||||
WS(-1088554040, -1076674880);
|
||||
sum1 =
|
||||
W(0, 1026292820, -1132973070, -1144171612, -1130131975) + W(1, 1016736263, 1034501898, -1110973538, 1028857234)
|
||||
+ W(2, 1042339025, -1089525132, 1052671191, -1108906970)
|
||||
+ W(3, -1110236986, 1037427962, -1123890785, -1112145786)
|
||||
+ W(4, -1103961368, 1056478885, -1092344862, 1002874044) + W(5, 1016313655, -1118983748, 1041641985, 1025897228)
|
||||
+ W(6, -1151588920, 1038469390, 1010979982, -1130905399)
|
||||
+ W(7, 1014755782, -1123320716, 1017396903, 1033705562);
|
||||
sum2 = W(0, 1013915195, -1133182691, -1127318198, 1020584890)
|
||||
+ W(1, 1007730851, 1024414743, -1121307593, 1005058566) + W(2, 981970521, -1111248658, 1035588225, -1124411850)
|
||||
+ W(3, 1028189234, 1040952978, 1057294107, 1029625115)
|
||||
+ W(4, -1121038101, -1109339192, -1107404728, 1026110889)
|
||||
+ W(5, -1142484934, -1094377458, 1024397525, 1023925523)
|
||||
+ W(6, -1146368902, -1116592821, -1118541421, -1140327971)
|
||||
+ W(7, 1010322539, -1112421528, 1019759378, -1199698720);
|
||||
WS(1063581112, 1015292283);
|
||||
sum1 =
|
||||
W(0, -1123806598, -1125096044, 1046804719, -1117498166) + W(1, -1124445804, 1037634467, 1028314614, 1006823135)
|
||||
+ W(2, 1036776315, -1083793455, 1064148787, -1106689849)
|
||||
+ W(3, -1112186771, -1098422117, 1034155462, 1004978479)
|
||||
+ W(4, -1102837698, 1058965073, -1089226130, 1033810693)
|
||||
+ W(5, -1117642958, -1106625757, 1037373467, 1029436414) + W(6, -1137018200, 1036181095, 994321759, -1119765454)
|
||||
+ W(7, 1010580432, -1127761788, 1021285644, 1034713459);
|
||||
sum2 = W(0, -1127012521, -1110373665, -1121983257, 1021812843)
|
||||
+ W(1, -1129458054, -1122115974, -1121551577, 1015201109)
|
||||
+ W(2, -1134632819, -1118435057, -1107711610, 1039413537)
|
||||
+ W(3, -1113739078, 1041258512, 1043546644, -1127386873)
|
||||
+ W(4, -1106078947, 1025961773, 1048226293, -1110385416)
|
||||
+ W(5, -1115241196, 1041055451, -1131486243, -1135801459)
|
||||
+ W(6, -1122814807, 1025056413, -1139476701, -1132245806)
|
||||
+ W(7, -1119046895, 1029845331, 1018415015, -1140149017);
|
||||
WS(-1109010880, -1087548956);
|
||||
sum1 = W(0, 1034947768, -1095012676, 1046023882, 1029737824) + W(1, 1034343312, -1102610188, 1039446704, 1025692706)
|
||||
+ W(2, 1016751552, -1096454908, 1042564604, 1038373096)
|
||||
+ W(3, 1019661856, -1091443170, -1105694067, 1039271048)
|
||||
+ W(4, -1126501287, -1131030249, 1044246468, 1012879825)
|
||||
+ W(5, 1017025648, 1042942296, -1103700296, 1041317114) + W(6, 1030724160, 1019936112, -1141422594, 1029263800)
|
||||
+ W(7, -1140792121, 1024647464, -1107855416, 1041193844);
|
||||
sum2 =
|
||||
W(0, 1034034732, -1107522705, -1105460279, 1021740679) + W(1, -1113997103, -1121503695, 1038975878, -1112744336)
|
||||
+ W(2, 1028771217, -1114143244, 1032873918, -1121564954) + W(3, 1025456143, -1105773446, 1059420344, 1024971971)
|
||||
+ W(4, 1035315492, -1109746606, 1040681265, -1122379806)
|
||||
+ W(5, -1102403849, -1106040358, 1046039582, -1106873869)
|
||||
+ W(6, 1018212015, -1106459627, 1026290649, -1130313815)
|
||||
+ W(7, -1099438501, 1039219872, 1046943722, -1105420350);
|
||||
WS(-1086299832, -1077288694);
|
||||
sum1 = W(0, 1021716686, -1099039878, -1111509136, 1039618828)
|
||||
+ W(1, -1132921948, -1108540692, 1021468846, -1131678690)
|
||||
+ W(2, -1113901292, -1158126306, -1096197083, 1041516082)
|
||||
+ W(3, -1108835908, 1055092577, 1062013047, -1118733319)
|
||||
+ W(4, 1023078294, -1089051407, 1050708993, -1122936235) + W(5, 965138311, -1113759276, 1022391342, 1015065790)
|
||||
+ W(6, 998651320, -1107695832, -1133490396, 997649137) + W(7, -1130194922, -1113503632, 991635057, 1023538631);
|
||||
sum2 = W(0, -1133976495, 1035891239, -1130801609, -1113698362)
|
||||
+ W(1, 1027343155, 1030599513, -1108453664, 1016406968)
|
||||
+ W(2, -1149877867, 1037590422, 1012747883, -1108226898)
|
||||
+ W(3, -1119506980, 1054189655, -1119322812, -1120928356)
|
||||
+ W(4, -1126385541, 1041308688, -1107379808, 1016225738)
|
||||
+ W(5, 1016526837, -1112736561, -1119223720, 988482485) + W(6, 994153115, 1004824957, -1116360142, 1018050885)
|
||||
+ W(7, -1140785051, -1120347934, -1129452107, -1117792638);
|
||||
WS(-1113279936, 1066223903);
|
||||
sum1 = W(0, -1128171420, 1040261344, -1112013315, -1123695998)
|
||||
+ W(1, -1141738481, -1140107833, -1116929726, -1154978689)
|
||||
+ W(2, -1138940153, 1050703688, -1108200895, -1123177006)
|
||||
+ W(3, 1044160156, -1100167260, -1100730273, 1034288823)
|
||||
+ W(4, 1020686276, -1130335589, 1040782300, -1141423761)
|
||||
+ W(5, -1129655596, 1035637471, 1024316286, -1114187043) + W(6, 964173357, -1124525100, 1014134393, 1013984857)
|
||||
+ W(7, -1123239900, 1032644739, 1029624526, -1108229911);
|
||||
sum2 = W(0, -1115606620, 1021458196, 1009639320, -1131253088)
|
||||
+ W(1, -1125272644, 1017345212, 1016051020, -1143902384)
|
||||
+ W(2, -1099614716, 1047257730, -1120838650, 1020803060)
|
||||
+ W(3, -1080575150, 1068148121, -1113655261, 1032085971)
|
||||
+ W(4, -1102155153, 1044966894, -1132238288, 1016311348)
|
||||
+ W(5, -1122847678, 1026244022, -1130782536, -1137376840)
|
||||
+ W(6, -1123394906, 1017049220, 967940860, -1137115752)
|
||||
+ W(7, -1129056732, 1010161976, 1004223696, -1136984808);
|
||||
WS(1060545080, -1126581603);
|
||||
sum1 =
|
||||
W(0, 1032630360, -1112268976, 1045186906, -1125010622) + W(1, 1037657648, -1128752350, 1032285712, 1029508223)
|
||||
+ W(2, 1043836232, -1090205186, 1053340438, -1108078856) + W(3, 1037448680, 1048595306, -1094666759, 1041691860)
|
||||
+ W(4, 976149203, 1057651571, -1082657749, 1042698525) + W(5, 1031833596, 1035187792, -1092127852, 1040118132)
|
||||
+ W(6, 1031675647, 1034806588, -1104761760, 1033087420) + W(7, 1025282125, 1043419290, -1096441814, 1034587656);
|
||||
sum2 =
|
||||
W(0, -1123698886, 1034075649, 998149095, -1113635181) + W(1, -1126365381, 1026991402, -1118780236, -1168196508)
|
||||
+ W(2, -1135914762, 1019253181, 1023543366, -1114469118)
|
||||
+ W(3, -1121651762, 1047572688, 1038479879, -1145545780) + W(4, -1118625490, 1035108181, -1114677625, 992781287)
|
||||
+ W(5, -1122087574, -1115886918, 1011684618, -1139655050)
|
||||
+ W(6, -1147908244, 1016718341, -1132109957, -1142844852)
|
||||
+ W(7, -1134045690, -1117034488, -1137057610, 1007905050);
|
||||
WS(-1083899832, -1105526146);
|
||||
sum1 = W(0, 1026357515, -1119744955, -1117075907, -1111407198)
|
||||
+ W(1, -1139718894, -1125720471, -1106102943, -1152407445)
|
||||
+ W(2, 1044187583, -1092285679, 1048719011, -1107209883)
|
||||
+ W(3, -1105573131, 1062437883, 1052836221, -1107292779)
|
||||
+ W(4, -1104526300, 1058460257, -1089717563, -1122559055)
|
||||
+ W(5, -1119529939, 1022150135, -1123085499, -1119739267)
|
||||
+ W(6, -1125768375, 1033366698, -1114009838, -1119196243)
|
||||
+ W(7, -1132776678, 1009731342, -1112611206, -1129505495);
|
||||
sum2 = W(0, -1110807022, 1025172792, 1033543849, -1123816828)
|
||||
+ W(1, -1129400032, -1117035240, 999654946, -1144812946)
|
||||
+ W(2, -1105612607, 1035443403, 1039345667, -1120747576)
|
||||
+ W(3, -1123619892, -1135427545, 1053020794, -1113498942)
|
||||
+ W(4, -1131262448, -1111010692, 1047843748, -1113301822)
|
||||
+ W(5, 1016529300, -1115955576, -1135856481, -1146605522)
|
||||
+ W(6, -1129444600, -1117326476, 1022819536, -1119691028)
|
||||
+ W(7, -1136239801, -1121250556, 998047364, -1135792457);
|
||||
WS(-1107513792, 1064663354);
|
||||
sum1 = W(0, 1030862455, -1113532308, 1032378968, -1123071015)
|
||||
+ W(1, -1161118946, 1021510766, -1127591630, 1009770420)
|
||||
+ W(2, 1040244826, -1091621085, 1051734861, -1107582956)
|
||||
+ W(3, -1104300038, 1046262406, 1034822530, -1108820108)
|
||||
+ W(4, -1102940181, 1054782000, -1095483267, -1125175670)
|
||||
+ W(5, -1135077628, 1019068110, 1031948820, 1025488559)
|
||||
+ W(6, -1135539484, 1036941280, -1172984259, -1126076542)
|
||||
+ W(7, 1011863892, -1128724830, -1120336759, 1036426604);
|
||||
sum2 =
|
||||
W(0, -1135206239, -1140752647, 1022777359, 974924014) + W(1, -1139065871, -1123380440, 1021581075, -1133276463)
|
||||
+ W(2, 1026230428, 988696695, -1122295168, 1029689087) + W(3, 1025917606, -1092786651, -1085937537, -1140169471)
|
||||
+ W(4, 1027050280, 1049996339, 1032573953, -1135329695) + W(5, 1013849783, 1057784826, -1130048007, -1124883951)
|
||||
+ W(6, 1016077019, 1033822297, 1032545188, 1011238415) + W(7, -1127829351, 1034470972, -1137094527, 1001568686);
|
||||
WS(1058918200, -1121082995);
|
||||
|
||||
sum1 = W(0, -1123354974, -1112248839, 1046299686, -1143613552) + W(1, -1118620174, 1024662558, 1028038478, -1129268360) + W(2, 1016130204, -1087068557, 1063313277, -1103342192) + W(3, -1103968288, 1048182784, 1047279381, -1115088511) + W(4, -1101453425, 1059583965, -1088182320, 1003350800) + W(5, -1117908518, -1119323982, 1034186247, -1134684248) + W(6, -1122284590, 1027638054, -1124394588, -1111377363) + W(7, -1122818124, -1137723992, 978245507, 1028117438); sum2 = W(0, -1162931039, -1131063526, 1029801649, -1117642655) + W(1, -1136248556, -1131086728, 1031011705, -1128864654) + W(2, -1115594515, -1128443230, 1042762789, -1107118398) + W(3, -1119907402, 1044675527, 1050674207, -1113986381) + W(4, 1022791334, -1107588397, 1009001220, -1186206458) + W(5, 1017500018, -1111169922, -1112569685, 1017255694) + W(6, -1156766128, -1125594766, -1148613464, 993928432) + W(7, 1014782692, -1135599628, -1114139175, 1007622876); WS(1038828992, 1041685264);
|
||||
sum1 = W(0, -1114329248, 1049950910, -1097681183, 1028668144) + W(1, 995958527, 1027336960, -1107326552, 1025858258) + W(2, -1117673776, 1060640651, -1085831405, 1033402064) + W(3, 1034401008, 1045782072, -1105157973, -1122828000) + W(4, 1038612842, -1098159517, 1053136924, -1110558370) + W(5, 1035088196, -1106507532, 1032016120, -1113173980) + W(6, 1008781376, -1124000392, 1023707152, 1012109856) + W(7, 1029875310, -1105439902, 1034119968, -1114749520); sum2 = W(0, 1031315360, -1099468189, -1112139926, 1036663822) + W(1, -1131767489, -1140834082, 1024287080, -1122285462) + W(2, 1023637252, -1100127579, -1117241706, 1038018354) + W(3, -1107869385, 1052854494, 1052996200, -1112496415) + W(4, -1107666272, 1034036134, 1027811452, -1110479054) + W(5, -1117110288, 1024451620, 1027157968, -1112615559) + W(6, -1124350185, 1003450083, -1131082337, 998992195) + W(7, -1110538107, 1041131277, 1035032776, -1106762474); WS(-1086074680, 1053637716);
|
||||
sum1 = W(0, -1121345387, 1042002951, -1113042450, -1121398619) + W(1, -1148805338, -1165378922, -1115297518, 991217235) + W(2, -1136570733, 1052460699, -1107443934, -1117268427) + W(3, 1049266593, -1094571489, -1098765182, 1036113926) + W(4, 1027081787, -1124281856, 1043313411, -1136658365) + W(5, -1133439181, 1040734807, 1006695533, -1112513138) + W(6, -1158465386, -1121708851, 1016359031, 1021173351) + W(7, -1120818857, 1035650578, 1027853163, -1106476275); sum2 = W(0, 1026517575, -1170492850, -1138816415, -1143472678) + W(1, 1017334370, 1003954710, -1132363566, 998846550) + W(2, 1051558711, -1096673587, -1136175651, -1124275402) + W(3, 1071692777, -1077357700, -1098960792, 1018703670) + W(4, 1049822619, -1098179385, -1116986501, 1007812651) + W(5, 1020207734, 996694924, 1003290486, 1007766851) + W(6, 1022251878, -1122577241, -1141894102, 1009415395) + W(7, 1019995718, 1015494226, -1126828734, -1163222937); WS(1051521136, 1027207116);
|
||||
sum1 = W(0, -1122694020, 1010830545, -1124291704, 1018062184) + W(1, -1121133108, -1124202632, 1037913146, -1116091286) + W(2, -1102175837, 1057246783, -1093542759, 1041281977) + W(3, -1116351908, 1026322980, 982577970, -1125394504) + W(4, 1045518980, -1089509425, 1055793637, 1008755233) + W(5, 1009393969, 1025178484, -1118947636, -1127575032) + W(6, 1008379217, -1117338572, 1001093793, 1015898776) + W(7, 1015772516, 1009646833, 1001810977, -1121163492); sum2 = W(0, -1137495011, -1135527491, 1027730022, -1118108263) + W(1, 1013616911, -1123650952, 1024465134, -1128775579) + W(2, -1135578111, 1013443151, 1049128967, -1098008683) + W(3, 1029346938, -1114797945, 1068130737, -1080443718) + W(4, 1017473747, -1122100892, 1046423571, -1101482344) + W(5, 1012413655, -1128721387, -1143058109, -1137148015) + W(6, -1133405571, -1166794345, 1020545683, -1128178767) + W(7, 1008139351, -1156685818, -1126785325, 991435034); WS(1057767608, -1132080751);
|
||||
sum1 = W(0, 1026028453, 1025766741, 1035118319, 1012106581) + W(1, 1026017621, -1135552917, 1040474693, -1138611630) + W(2, -1117947285, 1051769667, -1111744027, 1030333189) + W(3, 1048679017, -1083959172, -1084413328, 1045191121) + W(4, 1025261389, -1120826122, 1049618505, -1122181545) + W(5, 1011196341, 1045191525, -1110336171, 1030480605) + W(6, 1015828970, 1028389741, 1028257397, 1027514349) + W(7, 1025013027, 1039505775, -1123719333, 1020294666); sum2 = W(0, 1017587161, -1101123140, 1040188371, 988296658) + W(1, 1028118553, -1103020887, 1022642341, 1010063898) + W(2, 1008167722, -1099714612, 1039093756, 1026403646) + W(3, 1005112948, 1049070164, 1046164698, 1033545355) + W(4, -1125344655, 1032013714, -1111525569, 1002132020) + W(5, 1015776789, 1022049457, -1098832696, 1037334715) + W(6, -1148301500, 1009340114, -1115917000, -1139728254) + W(7, -1138850406, -1167693540, -1103378287, 1035581889); WS(-1099372256, -1088618788);
|
||||
sum1 = W(0, -1112538182, 1048693927, -1112344546, -1109099742) + W(1, -1113349022, 1033711782, -1129092599, -1110127398) + W(2, -1103996671, 1064716592, -1086749016, 1032699126) + W(3, 1024020908, -1143605597, 1044926535, -1121424940) + W(4, 1046614908, -1085173359, 1062252083, -1130166943) + W(5, -1111225386, 1004694493, 1040479887, -1106709441) + W(6, -1110537326, -1108087402, 1034104622, -1120726228) + W(7, -1114146165, -1138402062, 1042110371, -1106064827); sum2 = W(0, 987083788, 1013472954, -1120418118, 979955865) + W(1, -1144106823, -1131186779, -1122269098, -1163904780) + W(2, -1120467381, -1139561796, 1038342084, -1115615181) + W(3, -1121977305, 1044091298, 1042996066, -1127292875) + W(4, -1118651341, 1038343490, -1118476220, -1123141745) + W(5, -1162389292, -1115306287, -1128689408, 1014320394) + W(6, -1152635694, -1155962630, -1132569906, -1135582470) + W(7, 964510307, -1117365756, -1141833923, 1008840046); WS(1041282784, 1044242623);
|
||||
sum1 = W(0, -1119885764, -1171512555, 1003864029, 1025494836) + W(1, -1119816052, -1121861252, 1040963149, -1113504879) + W(2, -1100880653, 1057266723, -1094412795, 1043843337) + W(3, -1113812594, 1010135439, -1118004569, -1125989575) + W(4, 1046531310, -1089952515, 1056310444, -1156936827) + W(5, 1015358999, 1031135156, -1114099002, -1122714492) + W(6, 1005085853, -1115226950, 1015234855, 1003362397) + W(7, 1021011107, 1003139037, 992693307, -1120612644); sum2 = W(0, 1005317381, -1142619324, -1126266146, 1026462555) + W(1, -1143827754, 1012902153, -1128784654, 1020893616) + W(2, 1019060164, -1114788024, -1094218173, 1054132458) + W(3, 1009279342, -1098688460, -1078812823, 1070492026) + W(4, 1014092605, -1120377499, -1099532818, 1048935725) + W(5, -1131000233, 1017453102, 1007638067, 1011358224) + W(6, 1012779564, -1139793504, -1130333980, 1015734963) + W(7, -1137528453, -1147729078, 1018177647, 987943782); WS(1046635232, 1024078131);
|
||||
sum1 = W(0, 1002735212, 1035063871, -1097977761, 1040314319) + W(1, 1025138813, 1034039879, -1105608655, 1035664624) + W(2, 1017042555, 1044122447, -1094991056, 1038536855) + W(3, -1132524982, -1110416695, 1051547730, -1114843703) + W(4, 1031803657, -1092481954, 1050188814, 1003107468) + W(5, 1033606155, -1094320024, 1047410847, 1019470987) + W(6, 1021596219, -1107502027, 1031346589, 1021345835) + W(7, 1015508823, -1103391009, 1046101811, -1136683190); sum2 = W(0, -1096475926, 1044036812, 1052862983, -1106234474) + W(1, -1112281069, -1112231286, 1024115789, -1121785528) + W(2, -1116645717, -1111398905, 1051331710, -1130292776) + W(3, 1041647377, -1096068583, 1038036111, 1037359643) + W(4, -1113263240, 1026411348, 1042458641, -1111704128) + W(5, 1023473494, -1114320784, 1028002558, -1123406807) + W(6, -1117017643, -1138574198, 1037890580, -1109714921) + W(7, 1039764966, -1104710548, -1106844581, 1041123403); WS(-1088554040, -1076674880);
|
||||
sum1 = W(0, 1026292820, -1132973070, -1144171612, -1130131975) + W(1, 1016736263, 1034501898, -1110973538, 1028857234) + W(2, 1042339025, -1089525132, 1052671191, -1108906970) + W(3, -1110236986, 1037427962, -1123890785, -1112145786) + W(4, -1103961368, 1056478885, -1092344862, 1002874044) + W(5, 1016313655, -1118983748, 1041641985, 1025897228) + W(6, -1151588920, 1038469390, 1010979982, -1130905399) + W(7, 1014755782, -1123320716, 1017396903, 1033705562); sum2 = W(0, 1013915195, -1133182691, -1127318198, 1020584890) + W(1, 1007730851, 1024414743, -1121307593, 1005058566) + W(2, 981970521, -1111248658, 1035588225, -1124411850) + W(3, 1028189234, 1040952978, 1057294107, 1029625115) + W(4, -1121038101, -1109339192, -1107404728, 1026110889) + W(5, -1142484934, -1094377458, 1024397525, 1023925523) + W(6, -1146368902, -1116592821, -1118541421, -1140327971) + W(7, 1010322539, -1112421528, 1019759378, -1199698720); WS(1063581112, 1015292283);
|
||||
sum1 = W(0, -1123806598, -1125096044, 1046804719, -1117498166) + W(1, -1124445804, 1037634467, 1028314614, 1006823135) + W(2, 1036776315, -1083793455, 1064148787, -1106689849) + W(3, -1112186771, -1098422117, 1034155462, 1004978479) + W(4, -1102837698, 1058965073, -1089226130, 1033810693) + W(5, -1117642958, -1106625757, 1037373467, 1029436414) + W(6, -1137018200, 1036181095, 994321759, -1119765454) + W(7, 1010580432, -1127761788, 1021285644, 1034713459); sum2 = W(0, -1127012521, -1110373665, -1121983257, 1021812843) + W(1, -1129458054, -1122115974, -1121551577, 1015201109) + W(2, -1134632819, -1118435057, -1107711610, 1039413537) + W(3, -1113739078, 1041258512, 1043546644, -1127386873) + W(4, -1106078947, 1025961773, 1048226293, -1110385416) + W(5, -1115241196, 1041055451, -1131486243, -1135801459) + W(6, -1122814807, 1025056413, -1139476701, -1132245806) + W(7, -1119046895, 1029845331, 1018415015, -1140149017); WS(-1109010880, -1087548956);
|
||||
sum1 = W(0, 1034947768, -1095012676, 1046023882, 1029737824) + W(1, 1034343312, -1102610188, 1039446704, 1025692706) + W(2, 1016751552, -1096454908, 1042564604, 1038373096) + W(3, 1019661856, -1091443170, -1105694067, 1039271048) + W(4, -1126501287, -1131030249, 1044246468, 1012879825) + W(5, 1017025648, 1042942296, -1103700296, 1041317114) + W(6, 1030724160, 1019936112, -1141422594, 1029263800) + W(7, -1140792121, 1024647464, -1107855416, 1041193844); sum2 = W(0, 1034034732, -1107522705, -1105460279, 1021740679) + W(1, -1113997103, -1121503695, 1038975878, -1112744336) + W(2, 1028771217, -1114143244, 1032873918, -1121564954) + W(3, 1025456143, -1105773446, 1059420344, 1024971971) + W(4, 1035315492, -1109746606, 1040681265, -1122379806) + W(5, -1102403849, -1106040358, 1046039582, -1106873869) + W(6, 1018212015, -1106459627, 1026290649, -1130313815) + W(7, -1099438501, 1039219872, 1046943722, -1105420350); WS(-1086299832, -1077288694);
|
||||
sum1 = W(0, 1021716686, -1099039878, -1111509136, 1039618828) + W(1, -1132921948, -1108540692, 1021468846, -1131678690) + W(2, -1113901292, -1158126306, -1096197083, 1041516082) + W(3, -1108835908, 1055092577, 1062013047, -1118733319) + W(4, 1023078294, -1089051407, 1050708993, -1122936235) + W(5, 965138311, -1113759276, 1022391342, 1015065790) + W(6, 998651320, -1107695832, -1133490396, 997649137) + W(7, -1130194922, -1113503632, 991635057, 1023538631); sum2 = W(0, -1133976495, 1035891239, -1130801609, -1113698362) + W(1, 1027343155, 1030599513, -1108453664, 1016406968) + W(2, -1149877867, 1037590422, 1012747883, -1108226898) + W(3, -1119506980, 1054189655, -1119322812, -1120928356) + W(4, -1126385541, 1041308688, -1107379808, 1016225738) + W(5, 1016526837, -1112736561, -1119223720, 988482485) + W(6, 994153115, 1004824957, -1116360142, 1018050885) + W(7, -1140785051, -1120347934, -1129452107, -1117792638); WS(-1113279936, 1066223903);
|
||||
sum1 = W(0, -1128171420, 1040261344, -1112013315, -1123695998) + W(1, -1141738481, -1140107833, -1116929726, -1154978689) + W(2, -1138940153, 1050703688, -1108200895, -1123177006) + W(3, 1044160156, -1100167260, -1100730273, 1034288823) + W(4, 1020686276, -1130335589, 1040782300, -1141423761) + W(5, -1129655596, 1035637471, 1024316286, -1114187043) + W(6, 964173357, -1124525100, 1014134393, 1013984857) + W(7, -1123239900, 1032644739, 1029624526, -1108229911); sum2 = W(0, -1115606620, 1021458196, 1009639320, -1131253088) + W(1, -1125272644, 1017345212, 1016051020, -1143902384) + W(2, -1099614716, 1047257730, -1120838650, 1020803060) + W(3, -1080575150, 1068148121, -1113655261, 1032085971) + W(4, -1102155153, 1044966894, -1132238288, 1016311348) + W(5, -1122847678, 1026244022, -1130782536, -1137376840) + W(6, -1123394906, 1017049220, 967940860, -1137115752) + W(7, -1129056732, 1010161976, 1004223696, -1136984808); WS(1060545080, -1126581603);
|
||||
sum1 = W(0, 1032630360, -1112268976, 1045186906, -1125010622) + W(1, 1037657648, -1128752350, 1032285712, 1029508223) + W(2, 1043836232, -1090205186, 1053340438, -1108078856) + W(3, 1037448680, 1048595306, -1094666759, 1041691860) + W(4, 976149203, 1057651571, -1082657749, 1042698525) + W(5, 1031833596, 1035187792, -1092127852, 1040118132) + W(6, 1031675647, 1034806588, -1104761760, 1033087420) + W(7, 1025282125, 1043419290, -1096441814, 1034587656); sum2 = W(0, -1123698886, 1034075649, 998149095, -1113635181) + W(1, -1126365381, 1026991402, -1118780236, -1168196508) + W(2, -1135914762, 1019253181, 1023543366, -1114469118) + W(3, -1121651762, 1047572688, 1038479879, -1145545780) + W(4, -1118625490, 1035108181, -1114677625, 992781287) + W(5, -1122087574, -1115886918, 1011684618, -1139655050) + W(6, -1147908244, 1016718341, -1132109957, -1142844852) + W(7, -1134045690, -1117034488, -1137057610, 1007905050); WS(-1083899832, -1105526146);
|
||||
sum1 = W(0, 1026357515, -1119744955, -1117075907, -1111407198) + W(1, -1139718894, -1125720471, -1106102943, -1152407445) + W(2, 1044187583, -1092285679, 1048719011, -1107209883) + W(3, -1105573131, 1062437883, 1052836221, -1107292779) + W(4, -1104526300, 1058460257, -1089717563, -1122559055) + W(5, -1119529939, 1022150135, -1123085499, -1119739267) + W(6, -1125768375, 1033366698, -1114009838, -1119196243) + W(7, -1132776678, 1009731342, -1112611206, -1129505495); sum2 = W(0, -1110807022, 1025172792, 1033543849, -1123816828) + W(1, -1129400032, -1117035240, 999654946, -1144812946) + W(2, -1105612607, 1035443403, 1039345667, -1120747576) + W(3, -1123619892, -1135427545, 1053020794, -1113498942) + W(4, -1131262448, -1111010692, 1047843748, -1113301822) + W(5, 1016529300, -1115955576, -1135856481, -1146605522) + W(6, -1129444600, -1117326476, 1022819536, -1119691028) + W(7, -1136239801, -1121250556, 998047364, -1135792457); WS(-1107513792, 1064663354);
|
||||
sum1 = W(0, 1030862455, -1113532308, 1032378968, -1123071015) + W(1, -1161118946, 1021510766, -1127591630, 1009770420) + W(2, 1040244826, -1091621085, 1051734861, -1107582956) + W(3, -1104300038, 1046262406, 1034822530, -1108820108) + W(4, -1102940181, 1054782000, -1095483267, -1125175670) + W(5, -1135077628, 1019068110, 1031948820, 1025488559) + W(6, -1135539484, 1036941280, -1172984259, -1126076542) + W(7, 1011863892, -1128724830, -1120336759, 1036426604); sum2 = W(0, -1135206239, -1140752647, 1022777359, 974924014) + W(1, -1139065871, -1123380440, 1021581075, -1133276463) + W(2, 1026230428, 988696695, -1122295168, 1029689087) + W(3, 1025917606, -1092786651, -1085937537, -1140169471) + W(4, 1027050280, 1049996339, 1032573953, -1135329695) + W(5, 1013849783, 1057784826, -1130048007, -1124883951) + W(6, 1016077019, 1033822297, 1032545188, 1011238415) + W(7, -1127829351, 1034470972, -1137094527, 1001568686); WS(1058918200, -1121082995);
|
||||
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
|
||||
}
|
||||
|
||||
float GetLuma(float3 color) {
|
||||
return dot(float3(0.299f, 0.587f, 0.114f), color);
|
||||
}
|
||||
shared float inp[429];
|
||||
|
||||
groupshared float inp[429];
|
||||
#define CURRENT_PASS 1
|
||||
|
||||
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
|
||||
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
|
||||
void imageStoreOverride(uint2 pos, float value) { temp[pos] = (value); }
|
||||
|
||||
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
|
||||
static const float2 INPUT_size = float2(GetInputSize());
|
||||
static const float2 INPUT_pt = float2(GetInputPt());
|
||||
|
||||
#define HOOKED_tex(pos) INPUT_tex(pos)
|
||||
#define HOOKED_size INPUT_size
|
||||
#define HOOKED_pt INPUT_pt
|
||||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
const float2 inputPt = GetInputPt();
|
||||
|
||||
const uint2 group_base = uint2(blockStart.x, blockStart.y >> 1);
|
||||
for (int id = threadId.x * MP_NUM_THREADS_Y + threadId.y; id < 429; id += MP_NUM_THREADS_X * MP_NUM_THREADS_Y) {
|
||||
ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
|
||||
int local_pos = int(gl_LocalInvocationID.x) * 11 + int(gl_LocalInvocationID.y);
|
||||
for (int id = int(gl_LocalInvocationIndex); id < 429; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
|
||||
uint x = (uint)id / 11, y = (uint)id % 11;
|
||||
inp[id] = GetLuma(INPUT.SampleLevel(sam, inputPt * float2(group_base.x + x - 3 + 0.5, group_base.y + y - 1 + 0.5), 0).rgb);
|
||||
inp[id] =
|
||||
HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (3)) + 0.5, float(group_base.y + y - (1)) + 0.5)).x;
|
||||
}
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
float4 ret = 0.0;
|
||||
float4 ret0 = 0.0;
|
||||
float4 samples[8];
|
||||
const uint local_pos = threadId.x * 11 + threadId.y;
|
||||
[unroll]
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
[unroll]
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
samples[i][j] = inp[local_pos + i * 11 + j];
|
||||
}
|
||||
barrier();
|
||||
vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
|
||||
vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);
|
||||
vec4 samples[8];
|
||||
samples[0][0] = inp[local_pos + 0];
|
||||
samples[0][1] = inp[local_pos + 1];
|
||||
samples[0][2] = inp[local_pos + 2];
|
||||
samples[0][3] = inp[local_pos + 3];
|
||||
samples[1][0] = inp[local_pos + 11];
|
||||
samples[1][1] = inp[local_pos + 12];
|
||||
samples[1][2] = inp[local_pos + 13];
|
||||
samples[1][3] = inp[local_pos + 14];
|
||||
samples[2][0] = inp[local_pos + 22];
|
||||
samples[2][1] = inp[local_pos + 23];
|
||||
samples[2][2] = inp[local_pos + 24];
|
||||
samples[2][3] = inp[local_pos + 25];
|
||||
samples[3][0] = inp[local_pos + 33];
|
||||
samples[3][1] = inp[local_pos + 34];
|
||||
samples[3][2] = inp[local_pos + 35];
|
||||
samples[3][3] = inp[local_pos + 36];
|
||||
samples[4][0] = inp[local_pos + 44];
|
||||
samples[4][1] = inp[local_pos + 45];
|
||||
samples[4][2] = inp[local_pos + 46];
|
||||
samples[4][3] = inp[local_pos + 47];
|
||||
samples[5][0] = inp[local_pos + 55];
|
||||
samples[5][1] = inp[local_pos + 56];
|
||||
samples[5][2] = inp[local_pos + 57];
|
||||
samples[5][3] = inp[local_pos + 58];
|
||||
samples[6][0] = inp[local_pos + 66];
|
||||
samples[6][1] = inp[local_pos + 67];
|
||||
samples[6][2] = inp[local_pos + 68];
|
||||
samples[6][3] = inp[local_pos + 69];
|
||||
samples[7][0] = inp[local_pos + 77];
|
||||
samples[7][1] = inp[local_pos + 78];
|
||||
samples[7][2] = inp[local_pos + 79];
|
||||
samples[7][3] = inp[local_pos + 80];
|
||||
ret[0] = nnedi3(samples);
|
||||
ret0[0] = inp[local_pos + 34];
|
||||
#if CURRENT_PASS == LAST_PASS
|
||||
uint2 destPos = blockStart + threadId.xy * 2;
|
||||
uint2 outputSize = GetOutputSize();
|
||||
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint2 destPos = blockStart + uint2(threadId.x, threadId.y * 2);
|
||||
tex1[destPos] = samples[3][1];
|
||||
tex1[destPos + uint2(0, 1)] = nnedi3(samples);
|
||||
#endif
|
||||
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2), ret0);
|
||||
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2) + ivec2(0, 1), ret);
|
||||
}
|
||||
|
||||
|
||||
//!PASS 2
|
||||
//!DESC double_x
|
||||
//!IN tex1, INPUT
|
||||
//!BLOCK_SIZE 64,8
|
||||
//!NUM_THREADS 32,8
|
||||
|
||||
float nnedi3(float4 samples[8]) {
|
||||
//!DESC NNEDI3 (double_x, nns16, win8x4)
|
||||
//!IN INPUT, temp
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 64, 8
|
||||
//!NUM_THREADS 32, 8
|
||||
#pragma optionNV(inline none)
|
||||
float nnedi3(vec4 samples[8]) {
|
||||
float sum = 0.0, sumsq = 0.0;
|
||||
[unroll]
|
||||
for (int i = 0; i < 8; i++) {
|
||||
sum += dot(samples[i], 1.0f);
|
||||
[unroll] for (int i = 0; i < 8; i++) {
|
||||
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
|
||||
sumsq += dot(samples[i], samples[i]);
|
||||
}
|
||||
|
||||
float mstd0 = sum / 32.0;
|
||||
float mstd1 = sumsq / 32.0 - mstd0 * mstd0;
|
||||
// 不能使用 lerp,否则结果可能为 nan
|
||||
float mstd2 = mstd1 >= 1.192092896e-7 ? rsqrt(mstd1) : 0.0;
|
||||
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
|
||||
mstd1 *= mstd2;
|
||||
|
||||
float vsum = 0.0, wsum = 0.0, sum1, sum2;
|
||||
#define T(x) intBitsToFloat(x)
|
||||
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
|
||||
#define WS(w0, w1) \
|
||||
sum1 = exp(sum1 * mstd2 + T(w0)); \
|
||||
sum2 = sum2 * mstd2 + T(w1); \
|
||||
wsum += sum1; \
|
||||
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
|
||||
sum1 = W(0, -1123354974, -1118620174, 1016130204, -1103968288)
|
||||
+ W(1, -1101453425, -1117908518, -1122284590, -1122818124)
|
||||
+ W(2, -1112248839, 1024662558, -1087068557, 1048182784)
|
||||
+ W(3, 1059583965, -1119323982, 1027638054, -1137723992) + W(4, 1046299686, 1028038478, 1063313277, 1047279381)
|
||||
+ W(5, -1088182320, 1034186247, -1124394588, 978245507)
|
||||
+ W(6, -1143613552, -1129268360, -1103342192, -1115088511)
|
||||
+ W(7, 1003350800, -1134684248, -1111377363, 1028117438);
|
||||
sum2 =
|
||||
W(0, -1162931039, -1136248556, -1115594515, -1119907402) + W(1, 1022791334, 1017500018, -1156766128, 1014782692)
|
||||
+ W(2, -1131063526, -1131086728, -1128443230, 1044675527)
|
||||
+ W(3, -1107588397, -1111169922, -1125594766, -1135599628)
|
||||
+ W(4, 1029801649, 1031011705, 1042762789, 1050674207) + W(5, 1009001220, -1112569685, -1148613464, -1114139175)
|
||||
+ W(6, -1117642655, -1128864654, -1107118398, -1113986381)
|
||||
+ W(7, -1186206458, 1017255694, 993928432, 1007622876);
|
||||
WS(1038828992, 1041685264);
|
||||
sum1 = W(0, -1114329248, 995958527, -1117673776, 1034401008) + W(1, 1038612842, 1035088196, 1008781376, 1029875310)
|
||||
+ W(2, 1049950910, 1027336960, 1060640651, 1045782072)
|
||||
+ W(3, -1098159517, -1106507532, -1124000392, -1105439902)
|
||||
+ W(4, -1097681183, -1107326552, -1085831405, -1105157973)
|
||||
+ W(5, 1053136924, 1032016120, 1023707152, 1034119968) + W(6, 1028668144, 1025858258, 1033402064, -1122828000)
|
||||
+ W(7, -1110558370, -1113173980, 1012109856, -1114749520);
|
||||
sum2 = W(0, 1031315360, -1131767489, 1023637252, -1107869385)
|
||||
+ W(1, -1107666272, -1117110288, -1124350185, -1110538107)
|
||||
+ W(2, -1099468189, -1140834082, -1100127579, 1052854494)
|
||||
+ W(3, 1034036134, 1024451620, 1003450083, 1041131277) + W(4, -1112139926, 1024287080, -1117241706, 1052996200)
|
||||
+ W(5, 1027811452, 1027157968, -1131082337, 1035032776)
|
||||
+ W(6, 1036663822, -1122285462, 1038018354, -1112496415)
|
||||
+ W(7, -1110479054, -1112615559, 998992195, -1106762474);
|
||||
WS(-1086074680, 1053637716);
|
||||
sum1 = W(0, -1121345387, -1148805338, -1136570733, 1049266593)
|
||||
+ W(1, 1027081787, -1133439181, -1158465386, -1120818857)
|
||||
+ W(2, 1042002951, -1165378922, 1052460699, -1094571489)
|
||||
+ W(3, -1124281856, 1040734807, -1121708851, 1035650578)
|
||||
+ W(4, -1113042450, -1115297518, -1107443934, -1098765182)
|
||||
+ W(5, 1043313411, 1006695533, 1016359031, 1027853163) + W(6, -1121398619, 991217235, -1117268427, 1036113926)
|
||||
+ W(7, -1136658365, -1112513138, 1021173351, -1106476275);
|
||||
sum2 = W(0, 1026517575, 1017334370, 1051558711, 1071692777) + W(1, 1049822619, 1020207734, 1022251878, 1019995718)
|
||||
+ W(2, -1170492850, 1003954710, -1096673587, -1077357700)
|
||||
+ W(3, -1098179385, 996694924, -1122577241, 1015494226)
|
||||
+ W(4, -1138816415, -1132363566, -1136175651, -1098960792)
|
||||
+ W(5, -1116986501, 1003290486, -1141894102, -1126828734)
|
||||
+ W(6, -1143472678, 998846550, -1124275402, 1018703670)
|
||||
+ W(7, 1007812651, 1007766851, 1009415395, -1163222937);
|
||||
WS(1051521136, 1027207116);
|
||||
sum1 = W(0, -1122694020, -1121133108, -1102175837, -1116351908)
|
||||
+ W(1, 1045518980, 1009393969, 1008379217, 1015772516) + W(2, 1010830545, -1124202632, 1057246783, 1026322980)
|
||||
+ W(3, -1089509425, 1025178484, -1117338572, 1009646833)
|
||||
+ W(4, -1124291704, 1037913146, -1093542759, 982577970) + W(5, 1055793637, -1118947636, 1001093793, 1001810977)
|
||||
+ W(6, 1018062184, -1116091286, 1041281977, -1125394504)
|
||||
+ W(7, 1008755233, -1127575032, 1015898776, -1121163492);
|
||||
sum2 = W(0, -1137495011, 1013616911, -1135578111, 1029346938)
|
||||
+ W(1, 1017473747, 1012413655, -1133405571, 1008139351)
|
||||
+ W(2, -1135527491, -1123650952, 1013443151, -1114797945)
|
||||
+ W(3, -1122100892, -1128721387, -1166794345, -1156685818)
|
||||
+ W(4, 1027730022, 1024465134, 1049128967, 1068130737) + W(5, 1046423571, -1143058109, 1020545683, -1126785325)
|
||||
+ W(6, -1118108263, -1128775579, -1098008683, -1080443718)
|
||||
+ W(7, -1101482344, -1137148015, -1128178767, 991435034);
|
||||
WS(1057767608, -1132080751);
|
||||
sum1 =
|
||||
W(0, 1026028453, 1026017621, -1117947285, 1048679017) + W(1, 1025261389, 1011196341, 1015828970, 1025013027)
|
||||
+ W(2, 1025766741, -1135552917, 1051769667, -1083959172) + W(3, -1120826122, 1045191525, 1028389741, 1039505775)
|
||||
+ W(4, 1035118319, 1040474693, -1111744027, -1084413328)
|
||||
+ W(5, 1049618505, -1110336171, 1028257397, -1123719333) + W(6, 1012106581, -1138611630, 1030333189, 1045191121)
|
||||
+ W(7, -1122181545, 1030480605, 1027514349, 1020294666);
|
||||
sum2 = W(0, 1017587161, 1028118553, 1008167722, 1005112948)
|
||||
+ W(1, -1125344655, 1015776789, -1148301500, -1138850406)
|
||||
+ W(2, -1101123140, -1103020887, -1099714612, 1049070164)
|
||||
+ W(3, 1032013714, 1022049457, 1009340114, -1167693540) + W(4, 1040188371, 1022642341, 1039093756, 1046164698)
|
||||
+ W(5, -1111525569, -1098832696, -1115917000, -1103378287)
|
||||
+ W(6, 988296658, 1010063898, 1026403646, 1033545355) + W(7, 1002132020, 1037334715, -1139728254, 1035581889);
|
||||
WS(-1099372256, -1088618788);
|
||||
sum1 = W(0, -1112538182, -1113349022, -1103996671, 1024020908)
|
||||
+ W(1, 1046614908, -1111225386, -1110537326, -1114146165)
|
||||
+ W(2, 1048693927, 1033711782, 1064716592, -1143605597)
|
||||
+ W(3, -1085173359, 1004694493, -1108087402, -1138402062)
|
||||
+ W(4, -1112344546, -1129092599, -1086749016, 1044926535)
|
||||
+ W(5, 1062252083, 1040479887, 1034104622, 1042110371)
|
||||
+ W(6, -1109099742, -1110127398, 1032699126, -1121424940)
|
||||
+ W(7, -1130166943, -1106709441, -1120726228, -1106064827);
|
||||
sum2 = W(0, 987083788, -1144106823, -1120467381, -1121977305)
|
||||
+ W(1, -1118651341, -1162389292, -1152635694, 964510307)
|
||||
+ W(2, 1013472954, -1131186779, -1139561796, 1044091298)
|
||||
+ W(3, 1038343490, -1115306287, -1155962630, -1117365756)
|
||||
+ W(4, -1120418118, -1122269098, 1038342084, 1042996066)
|
||||
+ W(5, -1118476220, -1128689408, -1132569906, -1141833923)
|
||||
+ W(6, 979955865, -1163904780, -1115615181, -1127292875)
|
||||
+ W(7, -1123141745, 1014320394, -1135582470, 1008840046);
|
||||
WS(1041282784, 1044242623);
|
||||
sum1 = W(0, -1119885764, -1119816052, -1100880653, -1113812594)
|
||||
+ W(1, 1046531310, 1015358999, 1005085853, 1021011107) + W(2, -1171512555, -1121861252, 1057266723, 1010135439)
|
||||
+ W(3, -1089952515, 1031135156, -1115226950, 1003139037)
|
||||
+ W(4, 1003864029, 1040963149, -1094412795, -1118004569) + W(5, 1056310444, -1114099002, 1015234855, 992693307)
|
||||
+ W(6, 1025494836, -1113504879, 1043843337, -1125989575)
|
||||
+ W(7, -1156936827, -1122714492, 1003362397, -1120612644);
|
||||
sum2 = W(0, 1005317381, -1143827754, 1019060164, 1009279342)
|
||||
+ W(1, 1014092605, -1131000233, 1012779564, -1137528453)
|
||||
+ W(2, -1142619324, 1012902153, -1114788024, -1098688460)
|
||||
+ W(3, -1120377499, 1017453102, -1139793504, -1147729078)
|
||||
+ W(4, -1126266146, -1128784654, -1094218173, -1078812823)
|
||||
+ W(5, -1099532818, 1007638067, -1130333980, 1018177647) + W(6, 1026462555, 1020893616, 1054132458, 1070492026)
|
||||
+ W(7, 1048935725, 1011358224, 1015734963, 987943782);
|
||||
WS(1046635232, 1024078131);
|
||||
sum1 = W(0, 1002735212, 1025138813, 1017042555, -1132524982) + W(1, 1031803657, 1033606155, 1021596219, 1015508823)
|
||||
+ W(2, 1035063871, 1034039879, 1044122447, -1110416695)
|
||||
+ W(3, -1092481954, -1094320024, -1107502027, -1103391009)
|
||||
+ W(4, -1097977761, -1105608655, -1094991056, 1051547730)
|
||||
+ W(5, 1050188814, 1047410847, 1031346589, 1046101811) + W(6, 1040314319, 1035664624, 1038536855, -1114843703)
|
||||
+ W(7, 1003107468, 1019470987, 1021345835, -1136683190);
|
||||
sum2 = W(0, -1096475926, -1112281069, -1116645717, 1041647377)
|
||||
+ W(1, -1113263240, 1023473494, -1117017643, 1039764966)
|
||||
+ W(2, 1044036812, -1112231286, -1111398905, -1096068583)
|
||||
+ W(3, 1026411348, -1114320784, -1138574198, -1104710548)
|
||||
+ W(4, 1052862983, 1024115789, 1051331710, 1038036111) + W(5, 1042458641, 1028002558, 1037890580, -1106844581)
|
||||
+ W(6, -1106234474, -1121785528, -1130292776, 1037359643)
|
||||
+ W(7, -1111704128, -1123406807, -1109714921, 1041123403);
|
||||
WS(-1088554040, -1076674880);
|
||||
sum1 = W(0, 1026292820, 1016736263, 1042339025, -1110236986)
|
||||
+ W(1, -1103961368, 1016313655, -1151588920, 1014755782)
|
||||
+ W(2, -1132973070, 1034501898, -1089525132, 1037427962)
|
||||
+ W(3, 1056478885, -1118983748, 1038469390, -1123320716)
|
||||
+ W(4, -1144171612, -1110973538, 1052671191, -1123890785)
|
||||
+ W(5, -1092344862, 1041641985, 1010979982, 1017396903)
|
||||
+ W(6, -1130131975, 1028857234, -1108906970, -1112145786)
|
||||
+ W(7, 1002874044, 1025897228, -1130905399, 1033705562);
|
||||
sum2 = W(0, 1013915195, 1007730851, 981970521, 1028189234) + W(1, -1121038101, -1142484934, -1146368902, 1010322539)
|
||||
+ W(2, -1133182691, 1024414743, -1111248658, 1040952978)
|
||||
+ W(3, -1109339192, -1094377458, -1116592821, -1112421528)
|
||||
+ W(4, -1127318198, -1121307593, 1035588225, 1057294107)
|
||||
+ W(5, -1107404728, 1024397525, -1118541421, 1019759378)
|
||||
+ W(6, 1020584890, 1005058566, -1124411850, 1029625115)
|
||||
+ W(7, 1026110889, 1023925523, -1140327971, -1199698720);
|
||||
WS(1063581112, 1015292283);
|
||||
sum1 = W(0, -1123806598, -1124445804, 1036776315, -1112186771)
|
||||
+ W(1, -1102837698, -1117642958, -1137018200, 1010580432)
|
||||
+ W(2, -1125096044, 1037634467, -1083793455, -1098422117)
|
||||
+ W(3, 1058965073, -1106625757, 1036181095, -1127761788) + W(4, 1046804719, 1028314614, 1064148787, 1034155462)
|
||||
+ W(5, -1089226130, 1037373467, 994321759, 1021285644) + W(6, -1117498166, 1006823135, -1106689849, 1004978479)
|
||||
+ W(7, 1033810693, 1029436414, -1119765454, 1034713459);
|
||||
sum2 =
|
||||
W(0, -1127012521, -1129458054, -1134632819, -1113739078)
|
||||
+ W(1, -1106078947, -1115241196, -1122814807, -1119046895)
|
||||
+ W(2, -1110373665, -1122115974, -1118435057, 1041258512) + W(3, 1025961773, 1041055451, 1025056413, 1029845331)
|
||||
+ W(4, -1121983257, -1121551577, -1107711610, 1043546644)
|
||||
+ W(5, 1048226293, -1131486243, -1139476701, 1018415015) + W(6, 1021812843, 1015201109, 1039413537, -1127386873)
|
||||
+ W(7, -1110385416, -1135801459, -1132245806, -1140149017);
|
||||
WS(-1109010880, -1087548956);
|
||||
sum1 = W(0, 1034947768, 1034343312, 1016751552, 1019661856) + W(1, -1126501287, 1017025648, 1030724160, -1140792121)
|
||||
+ W(2, -1095012676, -1102610188, -1096454908, -1091443170)
|
||||
+ W(3, -1131030249, 1042942296, 1019936112, 1024647464) + W(4, 1046023882, 1039446704, 1042564604, -1105694067)
|
||||
+ W(5, 1044246468, -1103700296, -1141422594, -1107855416)
|
||||
+ W(6, 1029737824, 1025692706, 1038373096, 1039271048) + W(7, 1012879825, 1041317114, 1029263800, 1041193844);
|
||||
sum2 = W(0, 1034034732, -1113997103, 1028771217, 1025456143)
|
||||
+ W(1, 1035315492, -1102403849, 1018212015, -1099438501)
|
||||
+ W(2, -1107522705, -1121503695, -1114143244, -1105773446)
|
||||
+ W(3, -1109746606, -1106040358, -1106459627, 1039219872)
|
||||
+ W(4, -1105460279, 1038975878, 1032873918, 1059420344) + W(5, 1040681265, 1046039582, 1026290649, 1046943722)
|
||||
+ W(6, 1021740679, -1112744336, -1121564954, 1024971971)
|
||||
+ W(7, -1122379806, -1106873869, -1130313815, -1105420350);
|
||||
WS(-1086299832, -1077288694);
|
||||
sum1 = W(0, 1021716686, -1132921948, -1113901292, -1108835908) + W(1, 1023078294, 965138311, 998651320, -1130194922)
|
||||
+ W(2, -1099039878, -1108540692, -1158126306, 1055092577)
|
||||
+ W(3, -1089051407, -1113759276, -1107695832, -1113503632)
|
||||
+ W(4, -1111509136, 1021468846, -1096197083, 1062013047) + W(5, 1050708993, 1022391342, -1133490396, 991635057)
|
||||
+ W(6, 1039618828, -1131678690, 1041516082, -1118733319)
|
||||
+ W(7, -1122936235, 1015065790, 997649137, 1023538631);
|
||||
sum2 = W(0, -1133976495, 1027343155, -1149877867, -1119506980)
|
||||
+ W(1, -1126385541, 1016526837, 994153115, -1140785051) + W(2, 1035891239, 1030599513, 1037590422, 1054189655)
|
||||
+ W(3, 1041308688, -1112736561, 1004824957, -1120347934)
|
||||
+ W(4, -1130801609, -1108453664, 1012747883, -1119322812)
|
||||
+ W(5, -1107379808, -1119223720, -1116360142, -1129452107)
|
||||
+ W(6, -1113698362, 1016406968, -1108226898, -1120928356)
|
||||
+ W(7, 1016225738, 988482485, 1018050885, -1117792638);
|
||||
WS(-1113279936, 1066223903);
|
||||
sum1 =
|
||||
W(0, -1128171420, -1141738481, -1138940153, 1044160156) + W(1, 1020686276, -1129655596, 964173357, -1123239900)
|
||||
+ W(2, 1040261344, -1140107833, 1050703688, -1100167260)
|
||||
+ W(3, -1130335589, 1035637471, -1124525100, 1032644739)
|
||||
+ W(4, -1112013315, -1116929726, -1108200895, -1100730273)
|
||||
+ W(5, 1040782300, 1024316286, 1014134393, 1029624526) + W(6, -1123695998, -1154978689, -1123177006, 1034288823)
|
||||
+ W(7, -1141423761, -1114187043, 1013984857, -1108229911);
|
||||
sum2 = W(0, -1115606620, -1125272644, -1099614716, -1080575150)
|
||||
+ W(1, -1102155153, -1122847678, -1123394906, -1129056732)
|
||||
+ W(2, 1021458196, 1017345212, 1047257730, 1068148121) + W(3, 1044966894, 1026244022, 1017049220, 1010161976)
|
||||
+ W(4, 1009639320, 1016051020, -1120838650, -1113655261)
|
||||
+ W(5, -1132238288, -1130782536, 967940860, 1004223696)
|
||||
+ W(6, -1131253088, -1143902384, 1020803060, 1032085971)
|
||||
+ W(7, 1016311348, -1137376840, -1137115752, -1136984808);
|
||||
WS(1060545080, -1126581603);
|
||||
sum1 = W(0, 1032630360, 1037657648, 1043836232, 1037448680) + W(1, 976149203, 1031833596, 1031675647, 1025282125)
|
||||
+ W(2, -1112268976, -1128752350, -1090205186, 1048595306)
|
||||
+ W(3, 1057651571, 1035187792, 1034806588, 1043419290) + W(4, 1045186906, 1032285712, 1053340438, -1094666759)
|
||||
+ W(5, -1082657749, -1092127852, -1104761760, -1096441814)
|
||||
+ W(6, -1125010622, 1029508223, -1108078856, 1041691860)
|
||||
+ W(7, 1042698525, 1040118132, 1033087420, 1034587656);
|
||||
sum2 = W(0, -1123698886, -1126365381, -1135914762, -1121651762)
|
||||
+ W(1, -1118625490, -1122087574, -1147908244, -1134045690)
|
||||
+ W(2, 1034075649, 1026991402, 1019253181, 1047572688) + W(3, 1035108181, -1115886918, 1016718341, -1117034488)
|
||||
+ W(4, 998149095, -1118780236, 1023543366, 1038479879)
|
||||
+ W(5, -1114677625, 1011684618, -1132109957, -1137057610)
|
||||
+ W(6, -1113635181, -1168196508, -1114469118, -1145545780)
|
||||
+ W(7, 992781287, -1139655050, -1142844852, 1007905050);
|
||||
WS(-1083899832, -1105526146);
|
||||
sum1 = W(0, 1026357515, -1139718894, 1044187583, -1105573131)
|
||||
+ W(1, -1104526300, -1119529939, -1125768375, -1132776678)
|
||||
+ W(2, -1119744955, -1125720471, -1092285679, 1062437883)
|
||||
+ W(3, 1058460257, 1022150135, 1033366698, 1009731342) + W(4, -1117075907, -1106102943, 1048719011, 1052836221)
|
||||
+ W(5, -1089717563, -1123085499, -1114009838, -1112611206)
|
||||
+ W(6, -1111407198, -1152407445, -1107209883, -1107292779)
|
||||
+ W(7, -1122559055, -1119739267, -1119196243, -1129505495);
|
||||
sum2 = W(0, -1110807022, -1129400032, -1105612607, -1123619892)
|
||||
+ W(1, -1131262448, 1016529300, -1129444600, -1136239801)
|
||||
+ W(2, 1025172792, -1117035240, 1035443403, -1135427545)
|
||||
+ W(3, -1111010692, -1115955576, -1117326476, -1121250556)
|
||||
+ W(4, 1033543849, 999654946, 1039345667, 1053020794) + W(5, 1047843748, -1135856481, 1022819536, 998047364)
|
||||
+ W(6, -1123816828, -1144812946, -1120747576, -1113498942)
|
||||
+ W(7, -1113301822, -1146605522, -1119691028, -1135792457);
|
||||
WS(-1107513792, 1064663354);
|
||||
sum1 = W(0, 1030862455, -1161118946, 1040244826, -1104300038)
|
||||
+ W(1, -1102940181, -1135077628, -1135539484, 1011863892)
|
||||
+ W(2, -1113532308, 1021510766, -1091621085, 1046262406)
|
||||
+ W(3, 1054782000, 1019068110, 1036941280, -1128724830) + W(4, 1032378968, -1127591630, 1051734861, 1034822530)
|
||||
+ W(5, -1095483267, 1031948820, -1172984259, -1120336759)
|
||||
+ W(6, -1123071015, 1009770420, -1107582956, -1108820108)
|
||||
+ W(7, -1125175670, 1025488559, -1126076542, 1036426604);
|
||||
sum2 =
|
||||
W(0, -1135206239, -1139065871, 1026230428, 1025917606) + W(1, 1027050280, 1013849783, 1016077019, -1127829351)
|
||||
+ W(2, -1140752647, -1123380440, 988696695, -1092786651) + W(3, 1049996339, 1057784826, 1033822297, 1034470972)
|
||||
+ W(4, 1022777359, 1021581075, -1122295168, -1085937537)
|
||||
+ W(5, 1032573953, -1130048007, 1032545188, -1137094527) + W(6, 974924014, -1133276463, 1029689087, -1140169471)
|
||||
+ W(7, -1135329695, -1124883951, 1011238415, 1001568686);
|
||||
WS(1058918200, -1121082995);
|
||||
|
||||
sum1 = W(0, -1123354974, -1118620174, 1016130204, -1103968288) + W(1, -1101453425, -1117908518, -1122284590, -1122818124) + W(2, -1112248839, 1024662558, -1087068557, 1048182784) + W(3, 1059583965, -1119323982, 1027638054, -1137723992) + W(4, 1046299686, 1028038478, 1063313277, 1047279381) + W(5, -1088182320, 1034186247, -1124394588, 978245507) + W(6, -1143613552, -1129268360, -1103342192, -1115088511) + W(7, 1003350800, -1134684248, -1111377363, 1028117438); sum2 = W(0, -1162931039, -1136248556, -1115594515, -1119907402) + W(1, 1022791334, 1017500018, -1156766128, 1014782692) + W(2, -1131063526, -1131086728, -1128443230, 1044675527) + W(3, -1107588397, -1111169922, -1125594766, -1135599628) + W(4, 1029801649, 1031011705, 1042762789, 1050674207) + W(5, 1009001220, -1112569685, -1148613464, -1114139175) + W(6, -1117642655, -1128864654, -1107118398, -1113986381) + W(7, -1186206458, 1017255694, 993928432, 1007622876); WS(1038828992, 1041685264);
|
||||
sum1 = W(0, -1114329248, 995958527, -1117673776, 1034401008) + W(1, 1038612842, 1035088196, 1008781376, 1029875310) + W(2, 1049950910, 1027336960, 1060640651, 1045782072) + W(3, -1098159517, -1106507532, -1124000392, -1105439902) + W(4, -1097681183, -1107326552, -1085831405, -1105157973) + W(5, 1053136924, 1032016120, 1023707152, 1034119968) + W(6, 1028668144, 1025858258, 1033402064, -1122828000) + W(7, -1110558370, -1113173980, 1012109856, -1114749520); sum2 = W(0, 1031315360, -1131767489, 1023637252, -1107869385) + W(1, -1107666272, -1117110288, -1124350185, -1110538107) + W(2, -1099468189, -1140834082, -1100127579, 1052854494) + W(3, 1034036134, 1024451620, 1003450083, 1041131277) + W(4, -1112139926, 1024287080, -1117241706, 1052996200) + W(5, 1027811452, 1027157968, -1131082337, 1035032776) + W(6, 1036663822, -1122285462, 1038018354, -1112496415) + W(7, -1110479054, -1112615559, 998992195, -1106762474); WS(-1086074680, 1053637716);
|
||||
sum1 = W(0, -1121345387, -1148805338, -1136570733, 1049266593) + W(1, 1027081787, -1133439181, -1158465386, -1120818857) + W(2, 1042002951, -1165378922, 1052460699, -1094571489) + W(3, -1124281856, 1040734807, -1121708851, 1035650578) + W(4, -1113042450, -1115297518, -1107443934, -1098765182) + W(5, 1043313411, 1006695533, 1016359031, 1027853163) + W(6, -1121398619, 991217235, -1117268427, 1036113926) + W(7, -1136658365, -1112513138, 1021173351, -1106476275); sum2 = W(0, 1026517575, 1017334370, 1051558711, 1071692777) + W(1, 1049822619, 1020207734, 1022251878, 1019995718) + W(2, -1170492850, 1003954710, -1096673587, -1077357700) + W(3, -1098179385, 996694924, -1122577241, 1015494226) + W(4, -1138816415, -1132363566, -1136175651, -1098960792) + W(5, -1116986501, 1003290486, -1141894102, -1126828734) + W(6, -1143472678, 998846550, -1124275402, 1018703670) + W(7, 1007812651, 1007766851, 1009415395, -1163222937); WS(1051521136, 1027207116);
|
||||
sum1 = W(0, -1122694020, -1121133108, -1102175837, -1116351908) + W(1, 1045518980, 1009393969, 1008379217, 1015772516) + W(2, 1010830545, -1124202632, 1057246783, 1026322980) + W(3, -1089509425, 1025178484, -1117338572, 1009646833) + W(4, -1124291704, 1037913146, -1093542759, 982577970) + W(5, 1055793637, -1118947636, 1001093793, 1001810977) + W(6, 1018062184, -1116091286, 1041281977, -1125394504) + W(7, 1008755233, -1127575032, 1015898776, -1121163492); sum2 = W(0, -1137495011, 1013616911, -1135578111, 1029346938) + W(1, 1017473747, 1012413655, -1133405571, 1008139351) + W(2, -1135527491, -1123650952, 1013443151, -1114797945) + W(3, -1122100892, -1128721387, -1166794345, -1156685818) + W(4, 1027730022, 1024465134, 1049128967, 1068130737) + W(5, 1046423571, -1143058109, 1020545683, -1126785325) + W(6, -1118108263, -1128775579, -1098008683, -1080443718) + W(7, -1101482344, -1137148015, -1128178767, 991435034); WS(1057767608, -1132080751);
|
||||
sum1 = W(0, 1026028453, 1026017621, -1117947285, 1048679017) + W(1, 1025261389, 1011196341, 1015828970, 1025013027) + W(2, 1025766741, -1135552917, 1051769667, -1083959172) + W(3, -1120826122, 1045191525, 1028389741, 1039505775) + W(4, 1035118319, 1040474693, -1111744027, -1084413328) + W(5, 1049618505, -1110336171, 1028257397, -1123719333) + W(6, 1012106581, -1138611630, 1030333189, 1045191121) + W(7, -1122181545, 1030480605, 1027514349, 1020294666); sum2 = W(0, 1017587161, 1028118553, 1008167722, 1005112948) + W(1, -1125344655, 1015776789, -1148301500, -1138850406) + W(2, -1101123140, -1103020887, -1099714612, 1049070164) + W(3, 1032013714, 1022049457, 1009340114, -1167693540) + W(4, 1040188371, 1022642341, 1039093756, 1046164698) + W(5, -1111525569, -1098832696, -1115917000, -1103378287) + W(6, 988296658, 1010063898, 1026403646, 1033545355) + W(7, 1002132020, 1037334715, -1139728254, 1035581889); WS(-1099372256, -1088618788);
|
||||
sum1 = W(0, -1112538182, -1113349022, -1103996671, 1024020908) + W(1, 1046614908, -1111225386, -1110537326, -1114146165) + W(2, 1048693927, 1033711782, 1064716592, -1143605597) + W(3, -1085173359, 1004694493, -1108087402, -1138402062) + W(4, -1112344546, -1129092599, -1086749016, 1044926535) + W(5, 1062252083, 1040479887, 1034104622, 1042110371) + W(6, -1109099742, -1110127398, 1032699126, -1121424940) + W(7, -1130166943, -1106709441, -1120726228, -1106064827); sum2 = W(0, 987083788, -1144106823, -1120467381, -1121977305) + W(1, -1118651341, -1162389292, -1152635694, 964510307) + W(2, 1013472954, -1131186779, -1139561796, 1044091298) + W(3, 1038343490, -1115306287, -1155962630, -1117365756) + W(4, -1120418118, -1122269098, 1038342084, 1042996066) + W(5, -1118476220, -1128689408, -1132569906, -1141833923) + W(6, 979955865, -1163904780, -1115615181, -1127292875) + W(7, -1123141745, 1014320394, -1135582470, 1008840046); WS(1041282784, 1044242623);
|
||||
sum1 = W(0, -1119885764, -1119816052, -1100880653, -1113812594) + W(1, 1046531310, 1015358999, 1005085853, 1021011107) + W(2, -1171512555, -1121861252, 1057266723, 1010135439) + W(3, -1089952515, 1031135156, -1115226950, 1003139037) + W(4, 1003864029, 1040963149, -1094412795, -1118004569) + W(5, 1056310444, -1114099002, 1015234855, 992693307) + W(6, 1025494836, -1113504879, 1043843337, -1125989575) + W(7, -1156936827, -1122714492, 1003362397, -1120612644); sum2 = W(0, 1005317381, -1143827754, 1019060164, 1009279342) + W(1, 1014092605, -1131000233, 1012779564, -1137528453) + W(2, -1142619324, 1012902153, -1114788024, -1098688460) + W(3, -1120377499, 1017453102, -1139793504, -1147729078) + W(4, -1126266146, -1128784654, -1094218173, -1078812823) + W(5, -1099532818, 1007638067, -1130333980, 1018177647) + W(6, 1026462555, 1020893616, 1054132458, 1070492026) + W(7, 1048935725, 1011358224, 1015734963, 987943782); WS(1046635232, 1024078131);
|
||||
sum1 = W(0, 1002735212, 1025138813, 1017042555, -1132524982) + W(1, 1031803657, 1033606155, 1021596219, 1015508823) + W(2, 1035063871, 1034039879, 1044122447, -1110416695) + W(3, -1092481954, -1094320024, -1107502027, -1103391009) + W(4, -1097977761, -1105608655, -1094991056, 1051547730) + W(5, 1050188814, 1047410847, 1031346589, 1046101811) + W(6, 1040314319, 1035664624, 1038536855, -1114843703) + W(7, 1003107468, 1019470987, 1021345835, -1136683190); sum2 = W(0, -1096475926, -1112281069, -1116645717, 1041647377) + W(1, -1113263240, 1023473494, -1117017643, 1039764966) + W(2, 1044036812, -1112231286, -1111398905, -1096068583) + W(3, 1026411348, -1114320784, -1138574198, -1104710548) + W(4, 1052862983, 1024115789, 1051331710, 1038036111) + W(5, 1042458641, 1028002558, 1037890580, -1106844581) + W(6, -1106234474, -1121785528, -1130292776, 1037359643) + W(7, -1111704128, -1123406807, -1109714921, 1041123403); WS(-1088554040, -1076674880);
|
||||
sum1 = W(0, 1026292820, 1016736263, 1042339025, -1110236986) + W(1, -1103961368, 1016313655, -1151588920, 1014755782) + W(2, -1132973070, 1034501898, -1089525132, 1037427962) + W(3, 1056478885, -1118983748, 1038469390, -1123320716) + W(4, -1144171612, -1110973538, 1052671191, -1123890785) + W(5, -1092344862, 1041641985, 1010979982, 1017396903) + W(6, -1130131975, 1028857234, -1108906970, -1112145786) + W(7, 1002874044, 1025897228, -1130905399, 1033705562); sum2 = W(0, 1013915195, 1007730851, 981970521, 1028189234) + W(1, -1121038101, -1142484934, -1146368902, 1010322539) + W(2, -1133182691, 1024414743, -1111248658, 1040952978) + W(3, -1109339192, -1094377458, -1116592821, -1112421528) + W(4, -1127318198, -1121307593, 1035588225, 1057294107) + W(5, -1107404728, 1024397525, -1118541421, 1019759378) + W(6, 1020584890, 1005058566, -1124411850, 1029625115) + W(7, 1026110889, 1023925523, -1140327971, -1199698720); WS(1063581112, 1015292283);
|
||||
sum1 = W(0, -1123806598, -1124445804, 1036776315, -1112186771) + W(1, -1102837698, -1117642958, -1137018200, 1010580432) + W(2, -1125096044, 1037634467, -1083793455, -1098422117) + W(3, 1058965073, -1106625757, 1036181095, -1127761788) + W(4, 1046804719, 1028314614, 1064148787, 1034155462) + W(5, -1089226130, 1037373467, 994321759, 1021285644) + W(6, -1117498166, 1006823135, -1106689849, 1004978479) + W(7, 1033810693, 1029436414, -1119765454, 1034713459); sum2 = W(0, -1127012521, -1129458054, -1134632819, -1113739078) + W(1, -1106078947, -1115241196, -1122814807, -1119046895) + W(2, -1110373665, -1122115974, -1118435057, 1041258512) + W(3, 1025961773, 1041055451, 1025056413, 1029845331) + W(4, -1121983257, -1121551577, -1107711610, 1043546644) + W(5, 1048226293, -1131486243, -1139476701, 1018415015) + W(6, 1021812843, 1015201109, 1039413537, -1127386873) + W(7, -1110385416, -1135801459, -1132245806, -1140149017); WS(-1109010880, -1087548956);
|
||||
sum1 = W(0, 1034947768, 1034343312, 1016751552, 1019661856) + W(1, -1126501287, 1017025648, 1030724160, -1140792121) + W(2, -1095012676, -1102610188, -1096454908, -1091443170) + W(3, -1131030249, 1042942296, 1019936112, 1024647464) + W(4, 1046023882, 1039446704, 1042564604, -1105694067) + W(5, 1044246468, -1103700296, -1141422594, -1107855416) + W(6, 1029737824, 1025692706, 1038373096, 1039271048) + W(7, 1012879825, 1041317114, 1029263800, 1041193844); sum2 = W(0, 1034034732, -1113997103, 1028771217, 1025456143) + W(1, 1035315492, -1102403849, 1018212015, -1099438501) + W(2, -1107522705, -1121503695, -1114143244, -1105773446) + W(3, -1109746606, -1106040358, -1106459627, 1039219872) + W(4, -1105460279, 1038975878, 1032873918, 1059420344) + W(5, 1040681265, 1046039582, 1026290649, 1046943722) + W(6, 1021740679, -1112744336, -1121564954, 1024971971) + W(7, -1122379806, -1106873869, -1130313815, -1105420350); WS(-1086299832, -1077288694);
|
||||
sum1 = W(0, 1021716686, -1132921948, -1113901292, -1108835908) + W(1, 1023078294, 965138311, 998651320, -1130194922) + W(2, -1099039878, -1108540692, -1158126306, 1055092577) + W(3, -1089051407, -1113759276, -1107695832, -1113503632) + W(4, -1111509136, 1021468846, -1096197083, 1062013047) + W(5, 1050708993, 1022391342, -1133490396, 991635057) + W(6, 1039618828, -1131678690, 1041516082, -1118733319) + W(7, -1122936235, 1015065790, 997649137, 1023538631); sum2 = W(0, -1133976495, 1027343155, -1149877867, -1119506980) + W(1, -1126385541, 1016526837, 994153115, -1140785051) + W(2, 1035891239, 1030599513, 1037590422, 1054189655) + W(3, 1041308688, -1112736561, 1004824957, -1120347934) + W(4, -1130801609, -1108453664, 1012747883, -1119322812) + W(5, -1107379808, -1119223720, -1116360142, -1129452107) + W(6, -1113698362, 1016406968, -1108226898, -1120928356) + W(7, 1016225738, 988482485, 1018050885, -1117792638); WS(-1113279936, 1066223903);
|
||||
sum1 = W(0, -1128171420, -1141738481, -1138940153, 1044160156) + W(1, 1020686276, -1129655596, 964173357, -1123239900) + W(2, 1040261344, -1140107833, 1050703688, -1100167260) + W(3, -1130335589, 1035637471, -1124525100, 1032644739) + W(4, -1112013315, -1116929726, -1108200895, -1100730273) + W(5, 1040782300, 1024316286, 1014134393, 1029624526) + W(6, -1123695998, -1154978689, -1123177006, 1034288823) + W(7, -1141423761, -1114187043, 1013984857, -1108229911); sum2 = W(0, -1115606620, -1125272644, -1099614716, -1080575150) + W(1, -1102155153, -1122847678, -1123394906, -1129056732) + W(2, 1021458196, 1017345212, 1047257730, 1068148121) + W(3, 1044966894, 1026244022, 1017049220, 1010161976) + W(4, 1009639320, 1016051020, -1120838650, -1113655261) + W(5, -1132238288, -1130782536, 967940860, 1004223696) + W(6, -1131253088, -1143902384, 1020803060, 1032085971) + W(7, 1016311348, -1137376840, -1137115752, -1136984808); WS(1060545080, -1126581603);
|
||||
sum1 = W(0, 1032630360, 1037657648, 1043836232, 1037448680) + W(1, 976149203, 1031833596, 1031675647, 1025282125) + W(2, -1112268976, -1128752350, -1090205186, 1048595306) + W(3, 1057651571, 1035187792, 1034806588, 1043419290) + W(4, 1045186906, 1032285712, 1053340438, -1094666759) + W(5, -1082657749, -1092127852, -1104761760, -1096441814) + W(6, -1125010622, 1029508223, -1108078856, 1041691860) + W(7, 1042698525, 1040118132, 1033087420, 1034587656); sum2 = W(0, -1123698886, -1126365381, -1135914762, -1121651762) + W(1, -1118625490, -1122087574, -1147908244, -1134045690) + W(2, 1034075649, 1026991402, 1019253181, 1047572688) + W(3, 1035108181, -1115886918, 1016718341, -1117034488) + W(4, 998149095, -1118780236, 1023543366, 1038479879) + W(5, -1114677625, 1011684618, -1132109957, -1137057610) + W(6, -1113635181, -1168196508, -1114469118, -1145545780) + W(7, 992781287, -1139655050, -1142844852, 1007905050); WS(-1083899832, -1105526146);
|
||||
sum1 = W(0, 1026357515, -1139718894, 1044187583, -1105573131) + W(1, -1104526300, -1119529939, -1125768375, -1132776678) + W(2, -1119744955, -1125720471, -1092285679, 1062437883) + W(3, 1058460257, 1022150135, 1033366698, 1009731342) + W(4, -1117075907, -1106102943, 1048719011, 1052836221) + W(5, -1089717563, -1123085499, -1114009838, -1112611206) + W(6, -1111407198, -1152407445, -1107209883, -1107292779) + W(7, -1122559055, -1119739267, -1119196243, -1129505495); sum2 = W(0, -1110807022, -1129400032, -1105612607, -1123619892) + W(1, -1131262448, 1016529300, -1129444600, -1136239801) + W(2, 1025172792, -1117035240, 1035443403, -1135427545) + W(3, -1111010692, -1115955576, -1117326476, -1121250556) + W(4, 1033543849, 999654946, 1039345667, 1053020794) + W(5, 1047843748, -1135856481, 1022819536, 998047364) + W(6, -1123816828, -1144812946, -1120747576, -1113498942) + W(7, -1113301822, -1146605522, -1119691028, -1135792457); WS(-1107513792, 1064663354);
|
||||
sum1 = W(0, 1030862455, -1161118946, 1040244826, -1104300038) + W(1, -1102940181, -1135077628, -1135539484, 1011863892) + W(2, -1113532308, 1021510766, -1091621085, 1046262406) + W(3, 1054782000, 1019068110, 1036941280, -1128724830) + W(4, 1032378968, -1127591630, 1051734861, 1034822530) + W(5, -1095483267, 1031948820, -1172984259, -1120336759) + W(6, -1123071015, 1009770420, -1107582956, -1108820108) + W(7, -1125175670, 1025488559, -1126076542, 1036426604); sum2 = W(0, -1135206239, -1139065871, 1026230428, 1025917606) + W(1, 1027050280, 1013849783, 1016077019, -1127829351) + W(2, -1140752647, -1123380440, 988696695, -1092786651) + W(3, 1049996339, 1057784826, 1033822297, 1034470972) + W(4, 1022777359, 1021581075, -1122295168, -1085937537) + W(5, 1032573953, -1130048007, 1032545188, -1137094527) + W(6, 974924014, -1133276463, 1029689087, -1140169471) + W(7, -1135329695, -1124883951, 1011238415, 1001568686); WS(1058918200, -1121082995);
|
||||
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
|
||||
}
|
||||
|
||||
const static float2x3 rgb2uv = {
|
||||
-0.169, -0.331, 0.5,
|
||||
0.5, -0.419, -0.081
|
||||
};
|
||||
shared float inp[525];
|
||||
|
||||
const static float3x3 yuv2rgb = {
|
||||
1, -0.00093, 1.401687,
|
||||
1, -0.3437, -0.71417,
|
||||
1, 1.77216, 0.00099
|
||||
};
|
||||
#define CURRENT_PASS 2
|
||||
|
||||
groupshared float inp[525];
|
||||
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
|
||||
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
|
||||
void imageStoreOverride(uint2 pos, float value) {
|
||||
float2 UV = mul(rgb2uv, INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb);
|
||||
OUTPUT[pos] = float4(mul(yuv2rgb, float3(value.x, UV)), 1.0);
|
||||
}
|
||||
|
||||
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
|
||||
static const float2 INPUT_size = float2(GetInputSize());
|
||||
static const float2 INPUT_pt = float2(GetInputPt());
|
||||
|
||||
#define temp_tex(pos) (float(texture(temp, pos).x))
|
||||
static const float2 temp_size = float2(GetInputSize().x * 1, GetInputSize().y * 2);
|
||||
static const float2 temp_pt = float2(1.0 / (temp_size.x), 1.0 / (temp_size.y));
|
||||
|
||||
#define HOOKED_tex(pos) temp_tex(pos)
|
||||
#define HOOKED_size temp_size
|
||||
#define HOOKED_pt temp_pt
|
||||
|
||||
void Pass2(uint2 blockStart, uint3 threadId) {
|
||||
const float2 inputPt = GetInputPt();
|
||||
const float2 outputPt = GetOutputPt();
|
||||
|
||||
const uint2 group_base = uint2(blockStart.x >> 1, blockStart.y);
|
||||
for (int id = threadId.x * MP_NUM_THREADS_Y + threadId.y; id < 525; id += MP_NUM_THREADS_X * MP_NUM_THREADS_Y) {
|
||||
ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
|
||||
int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);
|
||||
for (int id = int(gl_LocalInvocationIndex); id < 525; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
|
||||
uint x = (uint)id / 15, y = (uint)id % 15;
|
||||
inp[id] = tex1.SampleLevel(sam, inputPt * float2(group_base.x + x - 1 + 0.5, (group_base.y + y - 3 + 0.5) * 0.5), 0).r;
|
||||
inp[id] =
|
||||
HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (1)) + 0.5, float(group_base.y + y - (3)) + 0.5)).x;
|
||||
}
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
uint2 destPos = blockStart + uint2(threadId.x * 2, threadId.y);
|
||||
if (!CheckViewport(destPos)) {
|
||||
barrier();
|
||||
vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
|
||||
vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);
|
||||
vec4 samples[8];
|
||||
samples[0][0] = inp[local_pos + 0];
|
||||
samples[0][1] = inp[local_pos + 1];
|
||||
samples[0][2] = inp[local_pos + 2];
|
||||
samples[0][3] = inp[local_pos + 3];
|
||||
samples[1][0] = inp[local_pos + 4];
|
||||
samples[1][1] = inp[local_pos + 5];
|
||||
samples[1][2] = inp[local_pos + 6];
|
||||
samples[1][3] = inp[local_pos + 7];
|
||||
samples[2][0] = inp[local_pos + 15];
|
||||
samples[2][1] = inp[local_pos + 16];
|
||||
samples[2][2] = inp[local_pos + 17];
|
||||
samples[2][3] = inp[local_pos + 18];
|
||||
samples[3][0] = inp[local_pos + 19];
|
||||
samples[3][1] = inp[local_pos + 20];
|
||||
samples[3][2] = inp[local_pos + 21];
|
||||
samples[3][3] = inp[local_pos + 22];
|
||||
samples[4][0] = inp[local_pos + 30];
|
||||
samples[4][1] = inp[local_pos + 31];
|
||||
samples[4][2] = inp[local_pos + 32];
|
||||
samples[4][3] = inp[local_pos + 33];
|
||||
samples[5][0] = inp[local_pos + 34];
|
||||
samples[5][1] = inp[local_pos + 35];
|
||||
samples[5][2] = inp[local_pos + 36];
|
||||
samples[5][3] = inp[local_pos + 37];
|
||||
samples[6][0] = inp[local_pos + 45];
|
||||
samples[6][1] = inp[local_pos + 46];
|
||||
samples[6][2] = inp[local_pos + 47];
|
||||
samples[6][3] = inp[local_pos + 48];
|
||||
samples[7][0] = inp[local_pos + 49];
|
||||
samples[7][1] = inp[local_pos + 50];
|
||||
samples[7][2] = inp[local_pos + 51];
|
||||
samples[7][3] = inp[local_pos + 52];
|
||||
ret[0] = nnedi3(samples);
|
||||
ret0[0] = inp[local_pos + 18];
|
||||
#if CURRENT_PASS == LAST_PASS
|
||||
uint2 destPos = blockStart + threadId.xy * 2;
|
||||
uint2 outputSize = GetOutputSize();
|
||||
if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float4 ret = 0.0;
|
||||
float4 ret0 = 0.0;
|
||||
float4 samples[8];
|
||||
const uint local_pos = threadId.x * 15 + threadId.y;
|
||||
[unroll]
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
[unroll]
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
samples[i][j] = inp[local_pos + (i / 2) * 15 + (i % 2) * 4 + j];
|
||||
}
|
||||
}
|
||||
|
||||
float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||
WriteToOutput(destPos, mul(yuv2rgb, float3(samples[2][3], originUV)));
|
||||
|
||||
++destPos.x;
|
||||
if (!CheckViewport(destPos)) {
|
||||
return;
|
||||
}
|
||||
|
||||
originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
|
||||
WriteToOutput(destPos, mul(yuv2rgb, float3(nnedi3(samples), originUV)));
|
||||
#endif
|
||||
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1), ret0);
|
||||
imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1) + ivec2(1, 0), ret);
|
||||
}
|
||||
|
|
|
|||
953
src/Effects/NNEDI3/NNEDI3_nns16_win8x6.hlsl
Normal file
953
src/Effects/NNEDI3/NNEDI3_nns16_win8x6.hlsl
Normal file
|
|
@ -0,0 +1,953 @@
|
|||
// This file is generated by the scripts available at https://github.com/hauuau/magpie-prescalers
|
||||
// Please don't edit this file directly.
|
||||
// Generated by: nnedi3.py --nns 16 --win 8x6 --use-compute-shader --use-magpie
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 4
|
||||
//!SORT_NAME NNEDI3_016_6
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam_INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 1 * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2 * 1
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER LINEAR
|
||||
SamplerState sam_INPUT_LINEAR;
|
||||
|
||||
//!TEXTURE
|
||||
//!FORMAT R16_FLOAT
|
||||
//!WIDTH INPUT_WIDTH * 1
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D temp;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam_temp;
|
||||
|
||||
//!COMMON
|
||||
#include "prescalers.hlsli"
|
||||
|
||||
#define LAST_PASS 2
|
||||
|
||||
//!PASS 1
|
||||
//!DESC NNEDI3 (double_y, nns16, win8x6)
|
||||
//!IN INPUT
|
||||
//!OUT temp
|
||||
//!BLOCK_SIZE 32, 16
|
||||
//!NUM_THREADS 32, 8
|
||||
#pragma optionNV(inline none)
|
||||
// Pass 1 predictor: evaluates the NNEDI3 network on a 48-sample neighbourhood (12 vec4s)
// and returns the predicted value clamped to [0, 1].
//
// The window is normalised by its mean and standard deviation, 16 weight pairs are
// evaluated (one sum1 / sum2 / WS group per pair), and the exp-weighted average of the
// squashed sum2 outputs is scaled by the standard deviation and offset by the mean.
//
// The integer literals are generated network weights; do not edit or reorder them.
// NOTE(review): vec4, mix, inversesqrt and intBitsToFloat are not defined in this chunk --
// presumably GLSL-compatibility helpers from prescalers.hlsli; confirm.
float nnedi3(vec4 samples[12]) {
|
||||
float sum = 0.0, sumsq = 0.0;
|
||||
[unroll] for (int i = 0; i < 12; i++) {
|
||||
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
|
||||
sumsq += dot(samples[i], samples[i]);
|
||||
}
|
||||
// mstd0: mean of the 48 samples.
float mstd0 = sum / 48.0;
|
||||
// mstd1: variance (mean of squares minus squared mean).
float mstd1 = sumsq / 48.0 - mstd0 * mstd0;
|
||||
// mstd2: 1 / sqrt(variance). The selector is "variance >= 2^-23"; presumably mix() yields
// 0.0 when it is false so flat windows do not divide by zero -- confirm mix()'s bool handling.
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
|
||||
// variance * (1 / sqrt(variance)): mstd1 now holds the standard deviation.
mstd1 *= mstd2;
|
||||
float vsum = 0.0, wsum = 0.0, sum1, sum2;
|
||||
// T passes an integer literal through intBitsToFloat: the weights below are stored as
// integers (presumably the raw IEEE-754 bit patterns of the float weights).
#define T(x) intBitsToFloat(x)
|
||||
// W: dot product of samples[i] with four weights.
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
|
||||
// WS: finishes one weight pair. sum1 becomes exp(sum1 * mstd2 + bias w0) and is added to
// wsum; sum2 (scaled by mstd2, plus bias w1) is squashed with x / (1 + |x|) and added to
// vsum weighted by sum1.
#define WS(w0, w1) \
|
||||
sum1 = exp(sum1 * mstd2 + T(w0)); \
|
||||
sum2 = sum2 * mstd2 + T(w1); \
|
||||
wsum += sum1; \
|
||||
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
|
||||
sum1 =
|
||||
W(0, -1126897990, 1027745880, 1024250604, 1024642508) + W(1, -1121959908, -1149906049, -1130469888, -1121396864)
|
||||
+ W(2, 1039079928, -1107295041, -1147395201, -1126556538)
|
||||
+ W(3, -1113607518, 1041026790, 1022159130, 1044630722) + W(4, -1107136294, 1005058137, -1116173177, 1042195560)
|
||||
+ W(5, -1098313415, 999141354, 1019497054, 1007702352) + W(6, 1015526727, 1018714920, 1042189511, -1106681307)
|
||||
+ W(7, 1035880216, -1121374916, -1133977224, 1026239260)
|
||||
+ W(8, -1106606352, 1038936227, -1124106064, 1025050132) + W(9, 990390561, -1131068140, 1013770942, -1122507740)
|
||||
+ W(10, -1136584888, -1135809122, -1122292152, 1015308851)
|
||||
+ W(11, -1122039043, 1031978820, -1116330759, 1018900008);
|
||||
sum2 = W(0, 1017133506, 1011515348, -1139818306, -1123730089) + W(1, 996184056, -1138856554, 1023321012, 1029416248)
|
||||
+ W(2, -1115999672, 1020129658, 1015618084, 1007066512)
|
||||
+ W(3, -1119553894, 1057587887, -1090489276, -1109933138)
|
||||
+ W(4, 1016266760, -1145378916, -1112177411, 1071604647)
|
||||
+ W(5, -1079392139, -1097028615, 1028448562, 1008681896)
|
||||
+ W(6, -1165256880, 1051025857, -1098617840, -1105405946)
|
||||
+ W(7, -1155286464, 1000343320, -1133574805, 1035052104)
|
||||
+ W(8, -1139515542, -1135392452, -1138601606, 991053648)
|
||||
+ W(9, 1020043526, 1010374724, -1121583660, -1142174380)
|
||||
+ W(10, 997185888, -1155288808, -1135761830, 1018728192)
|
||||
+ W(11, 1024878156, 1002597928, -1131188096, -1132781834);
|
||||
WS(1018288640, 1027735986);
|
||||
sum1 =
|
||||
W(0, 1012158232, -1178449286, 1044498160, -1128542910) + W(1, -1115962871, 1002517720, 1006778572, -1114624234)
|
||||
+ W(2, 1032943202, 1027108853, 1017365062, 964628492) + W(3, 1025063871, -1104570115, 1059928494, -1088743921)
|
||||
+ W(4, 1032615126, -1134936888, -1156175041, 1028919475)
|
||||
+ W(5, -1097612337, -1106124541, 1026836706, -1146238776)
|
||||
+ W(6, 1010747802, 1034856692, -1085331503, 1059914122)
|
||||
+ W(7, -1114177498, 1020458158, -1140348884, -1127457566)
|
||||
+ W(8, 1031833306, 1032056909, -1122073627, 1016604174) + W(9, 1020162890, -1122825993, -1119592595, 1033999672)
|
||||
+ W(10, 1022377282, 998219705, -1172026051, -1115773453)
|
||||
+ W(11, 1038136595, 1027508251, -1129465364, 1023799671);
|
||||
sum2 =
|
||||
W(0, -1126840972, -1130460798, 1019075916, 1017322604) + W(1, -1131054760, -1131047996, -1145399745, 985194115)
|
||||
+ W(2, -1120812206, -1129997452, 1006903064, -1143360737)
|
||||
+ W(3, -1139273136, -1112997847, -1139625904, 1042717692)
|
||||
+ W(4, -1114175000, -1130986946, 991527106, -1120456092) + W(5, 1043975251, 1051048254, -1113881740, 1007107280)
|
||||
+ W(6, -1135317632, 1001121889, -1150833602, -1121880440)
|
||||
+ W(7, 978663174, -1143215153, -1139461992, 1017866680)
|
||||
+ W(8, -1128878392, -1112673669, 1026044394, -1125685806)
|
||||
+ W(9, -1129486378, 1006765920, -1133504840, -1126929736)
|
||||
+ W(10, 1014584312, -1144361281, 995542402, 1000306721)
|
||||
+ W(11, -1142139489, -1114488494, 1007041936, -1134951296);
|
||||
WS(1042433344, -1111851638);
|
||||
sum1 = W(0, -1128612156, -1112658226, -1119638967, 1043958886)
|
||||
+ W(1, -1120465263, -1128976934, -1139940268, -1123380939)
|
||||
+ W(2, -1126908022, 1033805831, -1115346894, -1142120768)
|
||||
+ W(3, -1122042583, -1128727592, -1097703246, 1057665642)
|
||||
+ W(4, -1104545545, 1005565040, 984858240, -1107767030)
|
||||
+ W(5, 1052387104, 1046318672, -1108167869, -1148354296)
|
||||
+ W(6, 999630836, -1114896432, 1054789077, -1095395475)
|
||||
+ W(7, 1029397739, -1133849404, -1146630760, -1115281716)
|
||||
+ W(8, 1030603948, -1117224401, -1163176544, -1117808895)
|
||||
+ W(9, -1126512698, -1129996802, 1028419819, -1123618471)
|
||||
+ W(10, -1117439993, 1013349902, 996431920, -1123547845)
|
||||
+ W(11, 1026334318, -1113258842, -1134051464, -1120421311);
|
||||
sum2 =
|
||||
W(0, 1022431497, -1109389142, 1004613154, 1028727631) + W(1, 1029503922, -1132574761, -1132240188, -1119299282)
|
||||
+ W(2, -1139248009, -1129989652, -1140046689, -1114039002)
|
||||
+ W(3, 1024165374, -1107432916, 1041447926, 1047487962) + W(4, 1017218352, -1135952741, -1114822837, 1044244351)
|
||||
+ W(5, -1108646182, -1100679909, 1040665470, -1123756570)
|
||||
+ W(6, -1120729932, 1031006195, 1047688354, -1126089152)
|
||||
+ W(7, -1120804126, -1148002498, -1124855948, 983982854) + W(8, 1009435309, 1033956847, -1107003694, 1028342876)
|
||||
+ W(9, -1126342960, -1158996358, -1122846542, -1123334894)
|
||||
+ W(10, -1140927562, -1117057946, -1128289576, -1121099750)
|
||||
+ W(11, 1036127241, 1039673953, -1102421772, 1026336008);
|
||||
WS(1015433728, 1058400049);
|
||||
sum1 =
|
||||
W(0, -1139873791, 1031161269, -1113693508, 1033801204) + W(1, -1119172737, -1143910182, -1133909491, 1032977294)
|
||||
+ W(2, -1112917766, -1131731326, 989007258, 1019358132) + W(3, 1023506921, -1116372870, -1116140698, 1045725159)
|
||||
+ W(4, -1122523445, 1008313039, -1230944644, 1035249566)
|
||||
+ W(5, -1103376612, -1102794347, 1044071755, -1115540344)
|
||||
+ W(6, -1118840528, -1120831281, 1044830734, -1116748777)
|
||||
+ W(7, 1030473357, -1126204226, 1028378783, -1114963068)
|
||||
+ W(8, -1141442286, 1032646513, 1018738506, -1118552369)
|
||||
+ W(9, -1121050287, 1032892305, 1023234585, -1112562780)
|
||||
+ W(10, 1021910870, 1016154651, 1033465034, -1105610222)
|
||||
+ W(11, 1034039600, 1030129285, -1122899972, -1124368226);
|
||||
sum2 = W(0, -1138428449, -1158711528, -1124467432, -1140697417)
|
||||
+ W(1, 1030243467, 1012442941, 992976916, 1013039401) + W(2, -1130455464, -1123518198, 1033499227, 975746961)
|
||||
+ W(3, -1142924106, -1128734961, -1113146735, -1099387353)
|
||||
+ W(4, 1051222006, -1122081826, 976851025, 1036130613)
|
||||
+ W(5, -1097860430, -1077268149, 1072898808, -1117904739)
|
||||
+ W(6, 989093448, 1010050489, -1108810723, -1091225653) + W(7, 1056060393, -1131990027, 997652548, -1137359275)
|
||||
+ W(8, -1122996798, 1032494444, 1025590581, 951236744) + W(9, -1153131756, 990210276, -1140348735, -1115493835)
|
||||
+ W(10, 1025171621, 1006284898, -1134977059, -1138876101)
|
||||
+ W(11, -1127238416, 1018469149, 1026307569, -1146863422);
|
||||
WS(-1143089152, 1030017260);
|
||||
sum1 =
|
||||
W(0, 1012276081, -1116644609, 1019444907, -1124688427) + W(1, 1029853709, -1130860131, 1001605962, -1127223379)
|
||||
+ W(2, -1119160665, 1035777366, -1136557285, -1130309965)
|
||||
+ W(3, 1024406997, -1109637089, 1048989101, -1098625404)
|
||||
+ W(4, 1038057505, -1130883561, -1155861797, -1115433381)
|
||||
+ W(5, 1044433671, 1006101820, -1111190908, 1009046005) + W(6, -1155627981, 1036571679, -1098184025, 1048780603)
|
||||
+ W(7, -1112291813, 1025361773, -1122534699, 1028189701)
|
||||
+ W(8, 1039597237, -1104960796, -1130076067, 1018788475)
|
||||
+ W(9, 1018348791, -1126280255, -1117935161, 1029641477)
|
||||
+ W(10, 1012573277, -1125993892, -1120990241, 1036379833)
|
||||
+ W(11, -1136463217, -1111599465, -1154886405, 1020397819);
|
||||
sum2 = W(0, -1153319600, 1008405084, -1118973116, -1140784820)
|
||||
+ W(1, 1012585128, 1010769460, -1147284080, 985822624) + W(2, 1010505984, -1129308604, 1021293048, 1001814848)
|
||||
+ W(3, 1008968960, -1142311064, -1101248908, 1037448945) + W(4, 1024969278, -1160749952, 995456320, 1022276922)
|
||||
+ W(5, -1089187936, 1057794596, 1033366347, -1123619202)
|
||||
+ W(6, -1140178660, -1140411728, -1109859050, 1029773785)
|
||||
+ W(7, 1024400778, -1136545168, -1146954776, 1005012008) + W(8, 1017518401, 1015531414, 1007802556, 1000322872)
|
||||
+ W(9, -1142030464, 1003782736, 982409184, 974134143) + W(10, 1003482728, -1152799248, -1170856127, 1006946188)
|
||||
+ W(11, 995727232, 960534268, 1009923956, 985284128);
|
||||
WS(1064472528, -1121594920);
|
||||
sum1 =
|
||||
W(0, -1142654991, 1027230343, -1112807213, 1027061019) + W(1, -1128825126, -1164359388, -1143599223, 1032290711)
|
||||
+ W(2, -1113392623, 1016010466, 991342574, 1014490160) + W(3, 1014568428, -1136037408, -1115590690, 1034098395)
|
||||
+ W(4, 1008695068, -1148094031, 1010500896, 1002050167)
|
||||
+ W(5, -1113734161, -1112872467, 1027642302, -1127829894)
|
||||
+ W(6, -1124387333, -1122938499, 1038834309, -1130883382)
|
||||
+ W(7, 1013984188, -1138058188, 1020884834, -1120250507) + W(8, 1029912912, 1015162858, 1015817710, -1124941766)
|
||||
+ W(9, -1131205634, 1025589157, 1019867389, -1123484555)
|
||||
+ W(10, 1015459258, 1008886302, 1026841191, -1110863224)
|
||||
+ W(11, 1031947569, 1019435182, -1129521612, -1130075526);
|
||||
sum2 =
|
||||
W(0, 1003807591, -1154115373, 1000124719, 1017182228) + W(1, -1126980607, -1130234859, -1147429191, -1139843175)
|
||||
+ W(2, 1001833687, 1024488826, -1116401990, 987658746) + W(3, 1002635095, 1018649088, 1008095031, 1040714709)
|
||||
+ W(4, -1105844805, 1013729967, -1132089351, 1016729308)
|
||||
+ W(5, -1105992985, 1063780536, -1085442794, 1024604622) + W(6, -1147602519, 1024344696, 1014141127, 1047200342)
|
||||
+ W(7, -1101306502, 995366957, -1151072125, -1155997437)
|
||||
+ W(8, -1132427785, 1020609216, -1122913939, -1147894927) + W(9, 964968041, 1001714367, -1141957575, 1023684454)
|
||||
+ W(10, -1125194898, -1146690231, 1011860423, -1141691791)
|
||||
+ W(11, -1139390003, 1017456200, -1128761080, -1146063807);
|
||||
WS(1061878800, -1131153991);
|
||||
sum1 =
|
||||
W(0, -1123872727, 1015115512, -1099302516, 1041224340) + W(1, -1144166978, -1171049230, 1018625288, 1031144036)
|
||||
+ W(2, -1102371221, 1009910425, 1014687697, 1022902338) + W(3, -1127640224, 1036357847, -1085394744, 1052022073)
|
||||
+ W(4, -1115552350, -1132534141, 1026350045, -1108974562)
|
||||
+ W(5, 1059569738, 1058525661, -1125187302, 1016189168) + W(6, 1013916191, -1107191102, 1050617832, -1088226291)
|
||||
+ W(7, 1037730450, -1123531112, 1018183052, 1006433282) + W(8, 1032504563, -1097316565, 1040234099, -1127405808)
|
||||
+ W(9, -1145362866, 1014427177, 1031877738, -1109508096) + W(10, 1015825508, 1018548825, 1016048056, 1026198990)
|
||||
+ W(11, 1033421596, -1098228398, 1035235966, -1137247201);
|
||||
sum2 = W(0, -1131301730, 1031269327, -1127010401, -1109842974)
|
||||
+ W(1, -1181736700, -1180777340, 973798558, -1131640108)
|
||||
+ W(2, 1028981651, -1125259759, -1167651134, -1160957999)
|
||||
+ W(3, -1127780866, 1013454096, -1149526184, -1113692773)
|
||||
+ W(4, -1123287814, 993986728, 1013478572, -1109509101) + W(5, 1051779317, 1047088883, -1109788940, 1020962386)
|
||||
+ W(6, -1160424319, -1117315078, 1028380081, -1134194124)
|
||||
+ W(7, -1115287133, -1136947718, -1135840779, -1131160392)
|
||||
+ W(8, -1137527992, 1028175261, -1121515979, -1138138790)
|
||||
+ W(9, -1164912671, -1145619912, 998238336, 1018886164)
|
||||
+ W(10, -1125209194, -1152989064, -1138738786, -1127332243)
|
||||
+ W(11, -1148504424, 1027237057, -1142455024, -1123011340);
|
||||
WS(-1146021888, 1053974589);
|
||||
sum1 =
|
||||
W(0, 1029642476, -1119368753, 1042969521, -1095098901) + W(1, 1046685039, 984849429, 1013890275, -1134074211)
|
||||
+ W(2, 1042359026, -1107285127, 1031018217, -1135393367)
|
||||
+ W(3, -1176939092, 1007708103, 1045769551, -1096985546) + W(4, 1036262392, -1139413615, 1022266947, 1017736689)
|
||||
+ W(5, -1101301107, 1034918881, 1003810877, 1024875117) + W(6, -1146466657, 1027345005, -1094644679, 1050538529)
|
||||
+ W(7, -1120828825, -1172526890, 1004183253, 1032510570) + W(8, -1091538585, 1051699648, 1011534979, 1017671961)
|
||||
+ W(9, -1160650069, 1019378973, -1107179580, 1036824506)
|
||||
+ W(10, -1133351451, -1160823333, -1127783457, 1031489314)
|
||||
+ W(11, -1095508207, 1048776768, 1035618600, 1006585957);
|
||||
sum2 = W(0, 1031363252, -1091101506, 1048232756, 1057852755) + W(1, -1095952784, 1016290300, 1030774484, 1001500224)
|
||||
+ W(2, -1110436898, -1132290932, -1131305343, -1126601761)
|
||||
+ W(3, 1015165558, -1110787951, 1016237906, 1043794074) + W(4, -1113356328, 1003743696, 1007437656, 965388167)
|
||||
+ W(5, 1014973676, 1047525730, -1152923833, 1022650220) + W(6, 1020087968, 1003188992, -1123006886, 1011818344)
|
||||
+ W(7, -1111245491, 1021501454, -1158035650, 1041338676)
|
||||
+ W(8, -1105090874, -1129296549, -1131940021, 1017537464)
|
||||
+ W(9, -1137051446, -1134903850, -1123217223, 1034851396)
|
||||
+ W(10, -1117639196, -1133259176, 1018262350, 1033269727)
|
||||
+ W(11, -1104724635, -1106365430, 1024945328, 1019937714);
|
||||
WS(-1077057896, -1083600334);
|
||||
sum1 = W(0, 1017420011, 1011471785, 1029223422, -1116040414) + W(1, 1017123181, 1016511669, 1014201033, 1019976613)
|
||||
+ W(2, -1126437509, 1015478313, 1024110818, -1167731667)
|
||||
+ W(3, 1017846781, -1138042285, 1049638570, -1103217262) + W(4, 1023111893, 1009386661, 999765850, 1040273597)
|
||||
+ W(5, -1090770241, -1087230893, 1030676769, 1023090125)
|
||||
+ W(6, -1162024122, 1016487629, 1029091694, 1046437488) + W(7, -1112046985, 1020460717, 985808522, 1027730222)
|
||||
+ W(8, 1037672698, 1024768280, -1120839802, 1025489318) + W(9, 1019153993, 1010855969, 1027546578, 1028909230)
|
||||
+ W(10, 1023955584, -1134545259, 1011766057, 1025127228)
|
||||
+ W(11, 1025680213, 1017109109, -1128064723, 1027741830);
|
||||
sum2 =
|
||||
W(0, 1023774756, -1107003878, 1020767940, -1118294055) + W(1, -1113997093, 1021408408, -1152708847, 1013240776)
|
||||
+ W(2, -1108605887, -1128830540, -1139588328, -1119578529)
|
||||
+ W(3, 1005727232, -1108761818, 1050907301, -1097736561)
|
||||
+ W(4, 1032528025, -1135972104, -1128030280, 1032847770) + W(5, 1058054639, 1008347200, 1039669350, -1131826954)
|
||||
+ W(6, 1004577664, 1024878510, -1106188814, 1049418167) + W(7, -1108856812, 999382680, -1116453887, -1129071264)
|
||||
+ W(8, 1040942692, -1105809360, -1104688291, 1019392776)
|
||||
+ W(9, 1020705336, -1124253692, -1115446820, 1014050712)
|
||||
+ W(10, 1018266740, -1117167612, -1127775332, -1114566712)
|
||||
+ W(11, 1042743894, -1132221182, -1103534695, 1022204104);
|
||||
WS(1034686080, -1080904524);
|
||||
sum1 =
|
||||
W(0, -1139332721, 1025190657, -1143163562, 1041601261) + W(1, 1024768205, -1137907141, -1156631187, 1024127465)
|
||||
+ W(2, 1040892278, 1028605547, -1129308018, 1012089369) + W(3, 1023562901, 1006799241, -1104914606, 1052908885)
|
||||
+ W(4, -1117860929, 1019594656, 1011454089, -1145135178)
|
||||
+ W(5, -1089193318, -1091833281, 1036300940, -1143330794)
|
||||
+ W(6, 1009225011, -1129417722, 1043909393, -1103073573) + W(7, 1040987970, 992909011, 1012327853, 1017495114)
|
||||
+ W(8, -1119873834, 1025246703, 1033652713, -1123933213)
|
||||
+ W(9, 1010687981, 1027561839, -1136185891, -1124345098) + W(10, 1024209623, 1018355139, 1010798725, 1010795083)
|
||||
+ W(11, -1118482716, 1032670633, 1027144528, -1123266333);
|
||||
sum2 =
|
||||
W(0, 998154484, -1124228589, -1132108902, -1115676434) + W(1, -1123985162, 1004957466, -1136847690, 1028193069)
|
||||
+ W(2, -1123281782, -1123302060, -1132306691, 1011392625)
|
||||
+ W(3, -1120010648, 1043298286, -1097765474, 1027211577)
|
||||
+ W(4, -1114822183, -1127542967, -1145824866, -1115567961)
|
||||
+ W(5, 1059221182, 1034703777, -1131429597, 1022587458) + W(6, 1015307650, -1106126812, 1048600788, -1099334080)
|
||||
+ W(7, 1029215805, -1127163397, 994166396, -1111174068) + W(8, -1130476352, 1015056080, 1023836215, -1122559367)
|
||||
+ W(9, 1000606426, -1128437454, 1026255089, -1137618020)
|
||||
+ W(10, -1127893362, -1171736302, 1010815409, -1110538383)
|
||||
+ W(11, -1118584150, 1028199647, 1025007180, -1124423270);
|
||||
WS(-1097173920, -1100403112);
|
||||
sum1 = W(0, -1133792968, -1126599342, 1026626987, -1109988694)
|
||||
+ W(1, -1128510918, -1124691470, -1124511038, -1134319356)
|
||||
+ W(2, -1112479512, -1122054529, -1138055228, -1131431128)
|
||||
+ W(3, -1133667884, -1113753548, 1051379210, -1097159959)
|
||||
+ W(4, 1031366423, -1128464692, -1126404688, -1113718896)
|
||||
+ W(5, 1058852431, 1058630415, -1108453759, -1122909907)
|
||||
+ W(6, -1129657589, 1034489098, -1097104011, 1049904553)
|
||||
+ W(7, -1111244112, 1006087192, -1123548289, 1017816566)
|
||||
+ W(8, 1007326848, -1104990865, -1129654222, -1138955724)
|
||||
+ W(9, -1134226372, -1122628437, -1112737379, 983139170)
|
||||
+ W(10, -1143321192, -1123473736, -1120375479, 1029275393)
|
||||
+ W(11, -1116837058, -1110311540, -1132471000, -1149064600);
|
||||
sum2 = W(0, -1133003813, -1145103116, -1105221269, 1033080040)
|
||||
+ W(1, 1016862101, -1129731365, -1170659932, 1024883426)
|
||||
+ W(2, -1117429423, 1028547885, -1128891234, -1147341896)
|
||||
+ W(3, 1006656308, -1122208183, -1098340061, 1042272545)
|
||||
+ W(4, -1121562483, -1121650606, 1031055883, -1101651786)
|
||||
+ W(5, 1055658740, 1058321046, -1100689547, 1031708925)
|
||||
+ W(6, -1122785076, -1107240567, 1035604404, -1112738821)
|
||||
+ W(7, -1115182870, -1123396988, -1138148825, -1137951645)
|
||||
+ W(8, -1131811521, 1003752088, 1026865631, -1133076983)
|
||||
+ W(9, -1134424500, -1131665157, -1130287800, 1015669581)
|
||||
+ W(10, -1129373191, -1131162259, -1131089901, -1116779622)
|
||||
+ W(11, -1123356625, 1033205575, -1134576021, -1127933595);
|
||||
WS(1049422752, 1064394145);
|
||||
sum1 = W(0, 1016583527, -1106085006, 995307718, 1042273115) + W(1, -1113049442, 1025810280, 997641734, -1123841888)
|
||||
+ W(2, 1031369872, 1021597381, -1122854832, 1006187755)
|
||||
+ W(3, -1129211865, 1041111742, -1088517333, 1058826428)
|
||||
+ W(4, -1113933244, 1019889767, -1131677043, 1032245856)
|
||||
+ W(5, -1098988005, -1105331685, 1032610296, -1131685097)
|
||||
+ W(6, 1021172552, -1110939130, 1058612208, -1090507155)
|
||||
+ W(7, 1037338632, -1155049030, 1021691141, -1105269375)
|
||||
+ W(8, 1030057089, 1043687978, -1122591528, -1134096210)
|
||||
+ W(9, -1133007562, -1137128282, 1036830720, -1120823228)
|
||||
+ W(10, -1116248270, 1025994697, 1026669144, -1106745812)
|
||||
+ W(11, 1034516890, 1038691348, -1117945591, -1126546729);
|
||||
sum2 = W(0, 1015668141, -1138201662, -1111996311, -1127284815)
|
||||
+ W(1, -1125087482, 1020174885, -1124041461, -1140877219)
|
||||
+ W(2, -1116450062, -1123578506, 1024732308, -1139064970)
|
||||
+ W(3, 1005775275, 1027346708, -1125910350, -1106280325)
|
||||
+ W(4, 1034158307, -1133423524, 1015274173, 1016303395)
|
||||
+ W(5, -1108948194, 1052974100, 1032925063, -1161498797)
|
||||
+ W(6, -1138139200, -1106503093, -1104963655, 1053021197)
|
||||
+ W(7, -1107449032, -1134898868, 992639399, -1117618841) + W(8, 1031763952, 957951850, 994113735, 1013272790)
|
||||
+ W(9, -1132053353, -1115775134, 1015724405, 1016609913)
|
||||
+ W(10, -1132927280, -1132485274, -1129319398, -1122071744)
|
||||
+ W(11, 1034411590, -1140595900, -1140186580, -1164791981);
|
||||
WS(-1101497152, -1084603877);
|
||||
sum1 =
|
||||
W(0, -1136425045, 1016522037, 967194407, 1019848413) + W(1, -1129523533, -1142614610, -1140218249, -1157845066)
|
||||
+ W(2, 1029505522, -1119357636, -1140249161, -1135395837)
|
||||
+ W(3, -1121565262, 1035402982, 1022903246, 1027088345)
|
||||
+ W(4, -1121932442, -1148904362, -1122160667, 1027884002)
|
||||
+ W(5, -1107598171, 1024422013, -1127296803, 1002411186) + W(6, 1006883159, 1025282390, 1025270942, -1117602990)
|
||||
+ W(7, 1030372258, -1130529549, -1132497425, 1022271101)
|
||||
+ W(8, -1120772739, 1030415880, -1129818261, 1018540973)
|
||||
+ W(9, 1004502690, -1138792353, -1154700189, -1171556244)
|
||||
+ W(10, -1138666305, -1138856043, -1128604789, 995143101)
|
||||
+ W(11, -1128284203, 1025955498, -1121511513, 1011955033);
|
||||
sum2 = W(0, -1126668299, -1131366283, 1024971228, 1000957181)
|
||||
+ W(1, -1151515419, 1005199725, -1137964827, -1117612139)
|
||||
+ W(2, 1034620123, -1119890411, -1145021381, -1136862175)
|
||||
+ W(3, 1015963121, -1097765254, 1049249869, 1026062254) + W(4, 1001872029, 1007955643, 1030757650, -1083955387)
|
||||
+ W(5, 1064229708, -1107214224, 1026637176, -1125717658)
|
||||
+ W(6, -1137547503, -1103492737, 1047078464, -1122275403)
|
||||
+ W(7, 1027173860, -1169614250, 997720155, -1118797430) + W(8, 1017921725, 1016072153, -1135832789, 923654805)
|
||||
+ W(9, -1132279825, -1131387718, 1024786888, -1133941049)
|
||||
+ W(10, -1148432117, 1002011725, -1152589275, -1140632131)
|
||||
+ W(11, -1144191965, 996433547, -1140699475, 1005736109);
|
||||
WS(1059552336, -1136539026);
|
||||
sum1 =
|
||||
W(0, 990367896, 1041343484, -1096612504, 1033353841) + W(1, -1125599349, 1028944863, 1010957914, 1036710283)
|
||||
+ W(2, -1107358947, 1029016441, -1132821402, 1024290996)
|
||||
+ W(3, -1154541352, 1045269292, -1087221074, 1042554433)
|
||||
+ W(4, -1154580200, 1023892422, 1017372383, -1112141659) + W(5, 1058232297, 1029783110, -1114120867, 1023410731)
|
||||
+ W(6, 1026284586, -1116984235, 1051438086, -1087458720) + W(7, 1033522371, -1144215764, 1015461809, 1018013925)
|
||||
+ W(8, 1047713030, -1095293300, 1032365167, -1144750420) + W(9, 1014364322, 1006339428, 1032067931, -1114380761)
|
||||
+ W(10, 1004597796, 1001346936, 1021777309, 1032228520)
|
||||
+ W(11, 1045851190, -1099415088, 1030006574, -1130073781);
|
||||
sum2 = W(0, -1153914788, -1101809160, 1052877341, 1046574229)
|
||||
+ W(1, -1095334336, 1023520281, -1126180245, -1115520194)
|
||||
+ W(2, 1022007580, 1000424166, -1113807813, 1021218858) + W(3, 995844276, -1114410922, 1055965696, 1034680258)
|
||||
+ W(4, -1109583292, 1008634443, -1141303142, 1033573989)
|
||||
+ W(5, -1098900400, -1098051352, 1033797491, -1115608949) + W(6, 1026951758, 998799030, 1023481081, 1045079279)
|
||||
+ W(7, 1032986287, 1032307290, 990856044, -1110191966) + W(8, 1023185808, -1106708743, 1025876178, -1128938562)
|
||||
+ W(9, 1004850742, -1129252703, 1031073312, 984863273) + W(10, -1137844345, 1017335440, 1015235936, 1016759632)
|
||||
+ W(11, -1104219784, -1103050031, 1038371038, 1020607644);
|
||||
WS(-1080660584, -1085825159);
|
||||
sum1 =
|
||||
W(0, 1013708199, -1123370319, -1145658646, -1118786339) + W(1, 1028171867, -1144908790, 998525366, -1131079022)
|
||||
+ W(2, -1111041043, 1035331132, 1017605134, -1131113128)
|
||||
+ W(3, 1026247587, -1110742584, 1047524760, -1095527502)
|
||||
+ W(4, 1042485668, -1130744068, 1009982783, -1113918027) + W(5, 1038280501, 1041941518, -1110999603, 992723116)
|
||||
+ W(6, -1136883881, 1032009669, -1096311074, 1051037928)
|
||||
+ W(7, -1106204846, 1025830203, -1128223794, 1025751155)
|
||||
+ W(8, 1042402294, -1106649743, -1132447358, 1017749654) + W(9, 999596614, -1126831290, -1118872454, 1032615945)
|
||||
+ W(10, 1002160934, -1127230527, -1126850910, 1033490448)
|
||||
+ W(11, 1023947050, -1111971999, 971034337, 1018668086);
|
||||
sum2 = W(0, 988660617, 1017543700, 1015794522, -1133704409) + W(1, 1003471274, -1140119133, -1145776834, 1002138986)
|
||||
+ W(2, 1001599498, 1024621822, -1135257421, -1136500105)
|
||||
+ W(3, -1133422913, 1031822055, 1041494739, -1102581932)
|
||||
+ W(4, 970658596, -1163479081, -1126488793, 1032911160) + W(5, 1056510750, -1089051586, 1026713544, 1009057465)
|
||||
+ W(6, 999416722, 1018658069, 1023998101, -1111744235) + W(7, 945757471, 1000517690, 999055930, 1007351961)
|
||||
+ W(8, -1138508317, 1009295285, 998080468, -1137960905) + W(9, 987033481, -1162261577, 991201876, -1140892226)
|
||||
+ W(10, -1156050276, -1186683976, -1179419172, 999395634)
|
||||
+ W(11, -1141702058, -1147317506, 1007988669, -1146609818);
|
||||
WS(1064784784, -1120346387);
|
||||
sum1 = W(0, -1150678408, 1015721531, 1049255678, -1099108228)
|
||||
+ W(1, -1149551256, -1136953142, 1000581420, -1110077251)
|
||||
+ W(2, 1043607805, -1107416484, 1017163947, -1140022794)
|
||||
+ W(3, 1006062348, -1107299655, 1059242626, -1089544734)
|
||||
+ W(4, 1023526494, -1139533474, 1015088861, -1132691862)
|
||||
+ W(5, -1123916922, -1130977491, 1022505321, 1012221798)
|
||||
+ W(6, -1136518116, -1148196556, -1096371932, 1057929313)
|
||||
+ W(7, -1104456865, 1014035238, -1126533711, 1013224070)
|
||||
+ W(8, -1100407642, 1048500643, -1111675367, 1026165050)
|
||||
+ W(9, 1012432222, -1124886999, -1132580564, 1035479729)
|
||||
+ W(10, -1127245287, -1136458552, -1122704190, 1014270588)
|
||||
+ W(11, -1102354822, 1044504531, 1007459698, 1017479699);
|
||||
sum2 = W(0, -1140771860, 1031694512, -1104948969, -1115570202)
|
||||
+ W(1, 1040745971, -1127298441, -1125513054, -1122230843) + W(2, 993388690, 1042093481, -1111499166, 995262946)
|
||||
+ W(3, -1131667695, 979286214, 1026183534, 1042830623) + W(4, -1119680402, 1002124441, -1131288705, 1025077104)
|
||||
+ W(5, -1111209187, -1112764939, 982469091, -1123012516) + W(6, 978159878, -1108853537, 1041617383, 1043422569)
|
||||
+ W(7, -1120447085, -1129740789, 1012596136, -1102087836)
|
||||
+ W(8, 1045410736, 1034771561, -1109907689, -1125016939)
|
||||
+ W(9, 1011933560, -1117751010, 1030126174, 1014235016)
|
||||
+ W(10, -1127258987, 1004566649, -1121534607, -1113389694)
|
||||
+ W(11, 1044425994, 1025820984, -1115100280, -1119639931);
|
||||
WS(-1088649680, 1067112300);
|
||||
|
||||
// Exp-weighted average of the squashed outputs, scaled by 5 * stddev, offset by the mean,
// then clamped to [0, 1].
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
|
||||
}
|
||||
|
||||
// Sample tile filled by Pass1: 507 = 39 * 13 entries, indexed as x * 13 + y.
// NOTE(review): "shared" is not an HLSL keyword -- presumably a macro for groupshared
// from prescalers.hlsli; confirm.
shared float inp[507];
|
||||
|
||||
// Compared against LAST_PASS by the #if inside Pass1.
#define CURRENT_PASS 1
|
||||
|
||||
// Reduces a colour sample to one value via dot(rgb, rgb2y). rgb2y is not defined in this
// chunk (presumably luma weights from prescalers.hlsli -- confirm).
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
|
||||
// GLSL-style imageStore shim: the out_image argument is ignored and only val.x is written.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
|
||||
// Writes one value to this pass's output texture, temp, at pos.
void imageStoreOverride(uint2 pos, float value) { temp[pos] = (value); }
|
||||
|
||||
// Samples INPUT at pos and reduces the result with GET_SAMPLE. texture() is not defined
// in this chunk (presumably a sampling helper from prescalers.hlsli -- confirm).
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
|
||||
// NOTE(review): GetInputSize() / GetInputPt() are not defined in this chunk. Pass1
// multiplies INPUT_pt by texel-centre coordinates, so it is presumably 1 / INPUT_size.
static const float2 INPUT_size = float2(GetInputSize());
|
||||
static const float2 INPUT_pt = float2(GetInputPt());
|
||||
|
||||
// HOOKED_* aliases: in this pass the hooked texture is INPUT.
#define HOOKED_tex(pos) INPUT_tex(pos)
|
||||
#define HOOKED_size INPUT_size
|
||||
#define HOOKED_pt INPUT_pt
|
||||
|
||||
// Pass 1 entry point (double_y): for every source sample, writes the sample itself to an
// even row of the output and the nnedi3() prediction to the odd row below it.
//
// blockStart, threadId: only read inside the "#if CURRENT_PASS == LAST_PASS" branch.
// NOTE(review): gl_WorkGroupID, gl_WorkGroupSize, gl_LocalInvocationID,
// gl_LocalInvocationIndex, gl_GlobalInvocationID, barrier(), ivec2, vec2 and vec4 are not
// defined in this chunk -- presumably GLSL-compatibility helpers from prescalers.hlsli.
void Pass1(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	int local_pos = int(gl_LocalInvocationID.x) * 13 + int(gl_LocalInvocationID.y);

	// Cooperatively fill the shared tile: 39 columns x 13 rows stored as x * 13 + y,
	// covering the group's samples plus an apron of 3 columns / 2 rows before them.
	for (int id = int(gl_LocalInvocationIndex); id < 507; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		// Unsigned div/mod (id is never negative), converted back to int immediately.
		// Mixing uint with the int group base would make "base + x - 3" unsigned, so the
		// apron texels of the first groups would wrap to ~4.29e9 instead of going negative
		// and would be fetched from the opposite border of the image.
		int x = int((uint)id / 13), y = int((uint)id % 13);
		inp[id] =
			HOOKED_tex(HOOKED_pt * vec2(float(int(group_base.x) + x - 3) + 0.5, float(int(group_base.y) + y - 2) + 0.5)).x;
	}
	barrier();

	vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
	vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);

	// Gather this thread's window from the tile: 8 tile columns (stride 13) with 6
	// consecutive entries each, packed in that order into 12 vec4s.
	vec4 samples[12];
	samples[0][0] = inp[local_pos + 0];
	samples[0][1] = inp[local_pos + 1];
	samples[0][2] = inp[local_pos + 2];
	samples[0][3] = inp[local_pos + 3];
	samples[1][0] = inp[local_pos + 4];
	samples[1][1] = inp[local_pos + 5];
	samples[1][2] = inp[local_pos + 13];
	samples[1][3] = inp[local_pos + 14];
	samples[2][0] = inp[local_pos + 15];
	samples[2][1] = inp[local_pos + 16];
	samples[2][2] = inp[local_pos + 17];
	samples[2][3] = inp[local_pos + 18];
	samples[3][0] = inp[local_pos + 26];
	samples[3][1] = inp[local_pos + 27];
	samples[3][2] = inp[local_pos + 28];
	samples[3][3] = inp[local_pos + 29];
	samples[4][0] = inp[local_pos + 30];
	samples[4][1] = inp[local_pos + 31];
	samples[4][2] = inp[local_pos + 39];
	samples[4][3] = inp[local_pos + 40];
	samples[5][0] = inp[local_pos + 41];
	samples[5][1] = inp[local_pos + 42];
	samples[5][2] = inp[local_pos + 43];
	samples[5][3] = inp[local_pos + 44];
	samples[6][0] = inp[local_pos + 52];
	samples[6][1] = inp[local_pos + 53];
	samples[6][2] = inp[local_pos + 54];
	samples[6][3] = inp[local_pos + 55];
	samples[7][0] = inp[local_pos + 56];
	samples[7][1] = inp[local_pos + 57];
	samples[7][2] = inp[local_pos + 65];
	samples[7][3] = inp[local_pos + 66];
	samples[8][0] = inp[local_pos + 67];
	samples[8][1] = inp[local_pos + 68];
	samples[8][2] = inp[local_pos + 69];
	samples[8][3] = inp[local_pos + 70];
	samples[9][0] = inp[local_pos + 78];
	samples[9][1] = inp[local_pos + 79];
	samples[9][2] = inp[local_pos + 80];
	samples[9][3] = inp[local_pos + 81];
	samples[10][0] = inp[local_pos + 82];
	samples[10][1] = inp[local_pos + 83];
	samples[10][2] = inp[local_pos + 91];
	samples[10][3] = inp[local_pos + 92];
	samples[11][0] = inp[local_pos + 93];
	samples[11][1] = inp[local_pos + 94];
	samples[11][2] = inp[local_pos + 95];
	samples[11][3] = inp[local_pos + 96];

	// Predicted in-between sample.
	ret[0] = nnedi3(samples);
	// Offset 41 = 3 * 13 + 2 is this thread's own source sample (apron is 3 columns, 2 rows).
	ret0[0] = inp[local_pos + 41];

#if CURRENT_PASS == LAST_PASS
	// Only compiled when this is the final pass: skip threads outside the output.
	uint2 destPos = blockStart + threadId.xy * 2;
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	// Even output row: original sample. Odd output row: prediction.
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2), ret0);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(1, 2) + ivec2(0, 1), ret);
}
|
||||
//!PASS 2
|
||||
//!DESC NNEDI3 (double_x, nns16, win8x6)
|
||||
//!IN INPUT, temp
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 64, 8
|
||||
//!NUM_THREADS 32, 8
|
||||
#pragma optionNV(inline none)
|
||||
// Predicts one interpolated value from a window of 48 samples (12 vec4s).
// The window is reduced to its mean and standard deviation, 16 pairs of
// weighted sums (sum1/sum2) are evaluated against hard-coded weights, and the
// result is mean + 5 * (vsum / wsum) * stddev, clamped to [0, 1].
// samples: the 48 window values packed four per vec4.
// Returns the predicted value in [0, 1].
float nnedi3(vec4 samples[12]) {
|
||||
// Running sum and sum of squares over all 48 window values.
float sum = 0.0, sumsq = 0.0;
|
||||
[unroll] for (int i = 0; i < 12; i++) {
|
||||
sum += dot(samples[i], vec4(1.0, 1.0, 1.0, 1.0));
|
||||
sumsq += dot(samples[i], samples[i]);
|
||||
}
|
||||
// mstd0: mean of the window.
float mstd0 = sum / 48.0;
|
||||
// mstd1: variance, computed as E[x^2] - mean^2.
float mstd1 = sumsq / 48.0 - mstd0 * mstd0;
|
||||
// mstd2: 1/sqrt(variance), or 0 when the variance is below 1.192092896e-7,
// so a (nearly) flat window gets no correction added to its mean.
float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= 1.192092896e-7);
|
||||
// variance * 1/sqrt(variance): mstd1 now holds the standard deviation (or 0).
mstd1 *= mstd2;
|
||||
// wsum accumulates the exp() weights, vsum the weighted squashed responses.
float vsum = 0.0, wsum = 0.0, sum1, sum2;
|
||||
// T reinterprets an integer bit pattern as a float; the weights below are
// written as raw bit patterns.
#define T(x) intBitsToFloat(x)
|
||||
// W is the dot product of sample vector i with four weights.
#define W(i, w0, w1, w2, w3) dot(samples[i], vec4(T(w0), T(w1), T(w2), T(w3)))
|
||||
// WS finishes one sum1/sum2 pair: both sums are scaled by mstd2 and offset by
// their bias (w0, w1); exp(sum1) becomes the weight added to wsum, and
// sum2 / (1 + |sum2|) times that weight is added to vsum.
#define WS(w0, w1) \
|
||||
sum1 = exp(sum1 * mstd2 + T(w0)); \
|
||||
sum2 = sum2 * mstd2 + T(w1); \
|
||||
wsum += sum1; \
|
||||
vsum += sum1 * (sum2 / (1.0 + abs(sum2)));
|
||||
sum1 = W(0, -1126897990, -1130469888, -1113607518, -1116173177)
|
||||
+ W(1, 1015526727, -1133977224, 990390561, -1122292152) + W(2, 1027745880, -1121396864, 1041026790, 1042195560)
|
||||
+ W(3, 1018714920, 1026239260, -1131068140, 1015308851) + W(4, 1024250604, 1039079928, 1022159130, -1098313415)
|
||||
+ W(5, 1042189511, -1106606352, 1013770942, -1122039043) + W(6, 1024642508, -1107295041, 1044630722, 999141354)
|
||||
+ W(7, -1106681307, 1038936227, -1122507740, 1031978820)
|
||||
+ W(8, -1121959908, -1147395201, -1107136294, 1019497054)
|
||||
+ W(9, 1035880216, -1124106064, -1136584888, -1116330759)
|
||||
+ W(10, -1149906049, -1126556538, 1005058137, 1007702352)
|
||||
+ W(11, -1121374916, 1025050132, -1135809122, 1018900008);
|
||||
sum2 = W(0, 1017133506, 1023321012, -1119553894, -1112177411)
|
||||
+ W(1, -1165256880, -1133574805, 1020043526, -1135761830)
|
||||
+ W(2, 1011515348, 1029416248, 1057587887, 1071604647) + W(3, 1051025857, 1035052104, 1010374724, 1018728192)
|
||||
+ W(4, -1139818306, -1115999672, -1090489276, -1079392139)
|
||||
+ W(5, -1098617840, -1139515542, -1121583660, 1024878156)
|
||||
+ W(6, -1123730089, 1020129658, -1109933138, -1097028615)
|
||||
+ W(7, -1105405946, -1135392452, -1142174380, 1002597928) + W(8, 996184056, 1015618084, 1016266760, 1028448562)
|
||||
+ W(9, -1155286464, -1138601606, 997185888, -1131188096)
|
||||
+ W(10, -1138856554, 1007066512, -1145378916, 1008681896)
|
||||
+ W(11, 1000343320, 991053648, -1155288808, -1132781834);
|
||||
WS(1018288640, 1027735986);
|
||||
sum1 =
|
||||
W(0, 1012158232, 1006778572, 1025063871, -1156175041) + W(1, 1010747802, -1140348884, 1020162890, -1172026051)
|
||||
+ W(2, -1178449286, -1114624234, -1104570115, 1028919475)
|
||||
+ W(3, 1034856692, -1127457566, -1122825993, -1115773453)
|
||||
+ W(4, 1044498160, 1032943202, 1059928494, -1097612337) + W(5, -1085331503, 1031833306, -1119592595, 1038136595)
|
||||
+ W(6, -1128542910, 1027108853, -1088743921, -1106124541) + W(7, 1059914122, 1032056909, 1033999672, 1027508251)
|
||||
+ W(8, -1115962871, 1017365062, 1032615126, 1026836706)
|
||||
+ W(9, -1114177498, -1122073627, 1022377282, -1129465364)
|
||||
+ W(10, 1002517720, 964628492, -1134936888, -1146238776) + W(11, 1020458158, 1016604174, 998219705, 1023799671);
|
||||
sum2 = W(0, -1126840972, -1145399745, -1139273136, 991527106)
|
||||
+ W(1, -1135317632, -1139461992, -1129486378, 995542402)
|
||||
+ W(2, -1130460798, 985194115, -1112997847, -1120456092) + W(3, 1001121889, 1017866680, 1006765920, 1000306721)
|
||||
+ W(4, 1019075916, -1120812206, -1139625904, 1043975251)
|
||||
+ W(5, -1150833602, -1128878392, -1133504840, -1142139489)
|
||||
+ W(6, 1017322604, -1129997452, 1042717692, 1051048254)
|
||||
+ W(7, -1121880440, -1112673669, -1126929736, -1114488494)
|
||||
+ W(8, -1131054760, 1006903064, -1114175000, -1113881740) + W(9, 978663174, 1026044394, 1014584312, 1007041936)
|
||||
+ W(10, -1131047996, -1143360737, -1130986946, 1007107280)
|
||||
+ W(11, -1143215153, -1125685806, -1144361281, -1134951296);
|
||||
WS(1042433344, -1111851638);
|
||||
sum1 = W(0, -1128612156, -1139940268, -1122042583, 984858240) + W(1, 999630836, -1146630760, -1126512698, 996431920)
|
||||
+ W(2, -1112658226, -1123380939, -1128727592, -1107767030)
|
||||
+ W(3, -1114896432, -1115281716, -1129996802, -1123547845)
|
||||
+ W(4, -1119638967, -1126908022, -1097703246, 1052387104)
|
||||
+ W(5, 1054789077, 1030603948, 1028419819, 1026334318) + W(6, 1043958886, 1033805831, 1057665642, 1046318672)
|
||||
+ W(7, -1095395475, -1117224401, -1123618471, -1113258842)
|
||||
+ W(8, -1120465263, -1115346894, -1104545545, -1108167869)
|
||||
+ W(9, 1029397739, -1163176544, -1117439993, -1134051464)
|
||||
+ W(10, -1128976934, -1142120768, 1005565040, -1148354296)
|
||||
+ W(11, -1133849404, -1117808895, 1013349902, -1120421311);
|
||||
sum2 =
|
||||
W(0, 1022431497, -1132240188, 1024165374, -1114822837)
|
||||
+ W(1, -1120729932, -1124855948, -1126342960, -1128289576)
|
||||
+ W(2, -1109389142, -1119299282, -1107432916, 1044244351)
|
||||
+ W(3, 1031006195, 983982854, -1158996358, -1121099750) + W(4, 1004613154, -1139248009, 1041447926, -1108646182)
|
||||
+ W(5, 1047688354, 1009435309, -1122846542, 1036127241) + W(6, 1028727631, -1129989652, 1047487962, -1100679909)
|
||||
+ W(7, -1126089152, 1033956847, -1123334894, 1039673953) + W(8, 1029503922, -1140046689, 1017218352, 1040665470)
|
||||
+ W(9, -1120804126, -1107003694, -1140927562, -1102421772)
|
||||
+ W(10, -1132574761, -1114039002, -1135952741, -1123756570)
|
||||
+ W(11, -1148002498, 1028342876, -1117057946, 1026336008);
|
||||
WS(1015433728, 1058400049);
|
||||
sum1 =
|
||||
W(0, -1139873791, -1133909491, 1023506921, -1230944644) + W(1, -1118840528, 1028378783, -1121050287, 1033465034)
|
||||
+ W(2, 1031161269, 1032977294, -1116372870, 1035249566)
|
||||
+ W(3, -1120831281, -1114963068, 1032892305, -1105610222)
|
||||
+ W(4, -1113693508, -1112917766, -1116140698, -1103376612)
|
||||
+ W(5, 1044830734, -1141442286, 1023234585, 1034039600) + W(6, 1033801204, -1131731326, 1045725159, -1102794347)
|
||||
+ W(7, -1116748777, 1032646513, -1112562780, 1030129285) + W(8, -1119172737, 989007258, -1122523445, 1044071755)
|
||||
+ W(9, 1030473357, 1018738506, 1021910870, -1122899972)
|
||||
+ W(10, -1143910182, 1019358132, 1008313039, -1115540344)
|
||||
+ W(11, -1126204226, -1118552369, 1016154651, -1124368226);
|
||||
sum2 = W(0, -1138428449, 992976916, -1142924106, 976851025) + W(1, 989093448, 997652548, -1153131756, -1134977059)
|
||||
+ W(2, -1158711528, 1013039401, -1128734961, 1036130613)
|
||||
+ W(3, 1010050489, -1137359275, 990210276, -1138876101)
|
||||
+ W(4, -1124467432, -1130455464, -1113146735, -1097860430)
|
||||
+ W(5, -1108810723, -1122996798, -1140348735, -1127238416)
|
||||
+ W(6, -1140697417, -1123518198, -1099387353, -1077268149)
|
||||
+ W(7, -1091225653, 1032494444, -1115493835, 1018469149) + W(8, 1030243467, 1033499227, 1051222006, 1072898808)
|
||||
+ W(9, 1056060393, 1025590581, 1025171621, 1026307569) + W(10, 1012442941, 975746961, -1122081826, -1117904739)
|
||||
+ W(11, -1131990027, 951236744, 1006284898, -1146863422);
|
||||
WS(-1143089152, 1030017260);
|
||||
sum1 = W(0, 1012276081, 1001605962, 1024406997, -1155861797)
|
||||
+ W(1, -1155627981, -1122534699, 1018348791, -1120990241)
|
||||
+ W(2, -1116644609, -1127223379, -1109637089, -1115433381)
|
||||
+ W(3, 1036571679, 1028189701, -1126280255, 1036379833) + W(4, 1019444907, -1119160665, 1048989101, 1044433671)
|
||||
+ W(5, -1098184025, 1039597237, -1117935161, -1136463217)
|
||||
+ W(6, -1124688427, 1035777366, -1098625404, 1006101820)
|
||||
+ W(7, 1048780603, -1104960796, 1029641477, -1111599465)
|
||||
+ W(8, 1029853709, -1136557285, 1038057505, -1111190908)
|
||||
+ W(9, -1112291813, -1130076067, 1012573277, -1154886405)
|
||||
+ W(10, -1130860131, -1130309965, -1130883561, 1009046005)
|
||||
+ W(11, 1025361773, 1018788475, -1125993892, 1020397819);
|
||||
sum2 = W(0, -1153319600, -1147284080, 1008968960, 995456320)
|
||||
+ W(1, -1140178660, -1146954776, -1142030464, -1170856127)
|
||||
+ W(2, 1008405084, 985822624, -1142311064, 1022276922) + W(3, -1140411728, 1005012008, 1003782736, 1006946188)
|
||||
+ W(4, -1118973116, 1010505984, -1101248908, -1089187936) + W(5, -1109859050, 1017518401, 982409184, 995727232)
|
||||
+ W(6, -1140784820, -1129308604, 1037448945, 1057794596) + W(7, 1029773785, 1015531414, 974134143, 960534268)
|
||||
+ W(8, 1012585128, 1021293048, 1024969278, 1033366347) + W(9, 1024400778, 1007802556, 1003482728, 1009923956)
|
||||
+ W(10, 1010769460, 1001814848, -1160749952, -1123619202)
|
||||
+ W(11, -1136545168, 1000322872, -1152799248, 985284128);
|
||||
WS(1064472528, -1121594920);
|
||||
sum1 = W(0, -1142654991, -1143599223, 1014568428, 1010500896)
|
||||
+ W(1, -1124387333, 1020884834, -1131205634, 1026841191)
|
||||
+ W(2, 1027230343, 1032290711, -1136037408, 1002050167)
|
||||
+ W(3, -1122938499, -1120250507, 1025589157, -1110863224)
|
||||
+ W(4, -1112807213, -1113392623, -1115590690, -1113734161)
|
||||
+ W(5, 1038834309, 1029912912, 1019867389, 1031947569) + W(6, 1027061019, 1016010466, 1034098395, -1112872467)
|
||||
+ W(7, -1130883382, 1015162858, -1123484555, 1019435182) + W(8, -1128825126, 991342574, 1008695068, 1027642302)
|
||||
+ W(9, 1013984188, 1015817710, 1015459258, -1129521612)
|
||||
+ W(10, -1164359388, 1014490160, -1148094031, -1127829894)
|
||||
+ W(11, -1138058188, -1124941766, 1008886302, -1130075526);
|
||||
sum2 =
|
||||
W(0, 1003807591, -1147429191, 1002635095, -1132089351) + W(1, -1147602519, -1151072125, 964968041, 1011860423)
|
||||
+ W(2, -1154115373, -1139843175, 1018649088, 1016729308)
|
||||
+ W(3, 1024344696, -1155997437, 1001714367, -1141691791) + W(4, 1000124719, 1001833687, 1008095031, -1105992985)
|
||||
+ W(5, 1014141127, -1132427785, -1141957575, -1139390003) + W(6, 1017182228, 1024488826, 1040714709, 1063780536)
|
||||
+ W(7, 1047200342, 1020609216, 1023684454, 1017456200)
|
||||
+ W(8, -1126980607, -1116401990, -1105844805, -1085442794)
|
||||
+ W(9, -1101306502, -1122913939, -1125194898, -1128761080)
|
||||
+ W(10, -1130234859, 987658746, 1013729967, 1024604622)
|
||||
+ W(11, 995366957, -1147894927, -1146690231, -1146063807);
|
||||
WS(1061878800, -1131153991);
|
||||
sum1 =
|
||||
W(0, -1123872727, 1018625288, -1127640224, 1026350045) + W(1, 1013916191, 1018183052, -1145362866, 1016048056)
|
||||
+ W(2, 1015115512, 1031144036, 1036357847, -1108974562) + W(3, -1107191102, 1006433282, 1014427177, 1026198990)
|
||||
+ W(4, -1099302516, -1102371221, -1085394744, 1059569738) + W(5, 1050617832, 1032504563, 1031877738, 1033421596)
|
||||
+ W(6, 1041224340, 1009910425, 1052022073, 1058525661)
|
||||
+ W(7, -1088226291, -1097316565, -1109508096, -1098228398)
|
||||
+ W(8, -1144166978, 1014687697, -1115552350, -1125187302) + W(9, 1037730450, 1040234099, 1015825508, 1035235966)
|
||||
+ W(10, -1171049230, 1022902338, -1132534141, 1016189168)
|
||||
+ W(11, -1123531112, -1127405808, 1018548825, -1137247201);
|
||||
sum2 =
|
||||
W(0, -1131301730, 973798558, -1127780866, 1013478572) + W(1, -1160424319, -1135840779, -1164912671, -1138738786)
|
||||
+ W(2, 1031269327, -1131640108, 1013454096, -1109509101)
|
||||
+ W(3, -1117315078, -1131160392, -1145619912, -1127332243)
|
||||
+ W(4, -1127010401, 1028981651, -1149526184, 1051779317) + W(5, 1028380081, -1137527992, 998238336, -1148504424)
|
||||
+ W(6, -1109842974, -1125259759, -1113692773, 1047088883)
|
||||
+ W(7, -1134194124, 1028175261, 1018886164, 1027237057)
|
||||
+ W(8, -1181736700, -1167651134, -1123287814, -1109788940)
|
||||
+ W(9, -1115287133, -1121515979, -1125209194, -1142455024)
|
||||
+ W(10, -1180777340, -1160957999, 993986728, 1020962386)
|
||||
+ W(11, -1136947718, -1138138790, -1152989064, -1123011340);
|
||||
WS(-1146021888, 1053974589);
|
||||
sum1 =
|
||||
W(0, 1029642476, 1013890275, -1176939092, 1022266947) + W(1, -1146466657, 1004183253, -1160650069, -1127783457)
|
||||
+ W(2, -1119368753, -1134074211, 1007708103, 1017736689) + W(3, 1027345005, 1032510570, 1019378973, 1031489314)
|
||||
+ W(4, 1042969521, 1042359026, 1045769551, -1101301107)
|
||||
+ W(5, -1094644679, -1091538585, -1107179580, -1095508207)
|
||||
+ W(6, -1095098901, -1107285127, -1096985546, 1034918881) + W(7, 1050538529, 1051699648, 1036824506, 1048776768)
|
||||
+ W(8, 1046685039, 1031018217, 1036262392, 1003810877) + W(9, -1120828825, 1011534979, -1133351451, 1035618600)
|
||||
+ W(10, 984849429, -1135393367, -1139413615, 1024875117)
|
||||
+ W(11, -1172526890, 1017671961, -1160823333, 1006585957);
|
||||
sum2 = W(0, 1031363252, 1030774484, 1015165558, 1007437656) + W(1, 1020087968, -1158035650, -1137051446, 1018262350)
|
||||
+ W(2, -1091101506, 1001500224, -1110787951, 965388167) + W(3, 1003188992, 1041338676, -1134903850, 1033269727)
|
||||
+ W(4, 1048232756, -1110436898, 1016237906, 1014973676)
|
||||
+ W(5, -1123006886, -1105090874, -1123217223, -1104724635)
|
||||
+ W(6, 1057852755, -1132290932, 1043794074, 1047525730)
|
||||
+ W(7, 1011818344, -1129296549, 1034851396, -1106365430)
|
||||
+ W(8, -1095952784, -1131305343, -1113356328, -1152923833)
|
||||
+ W(9, -1111245491, -1131940021, -1117639196, 1024945328)
|
||||
+ W(10, 1016290300, -1126601761, 1003743696, 1022650220)
|
||||
+ W(11, 1021501454, 1017537464, -1133259176, 1019937714);
|
||||
WS(-1077057896, -1083600334);
|
||||
sum1 = W(0, 1017420011, 1014201033, 1017846781, 999765850) + W(1, -1162024122, 985808522, 1019153993, 1011766057)
|
||||
+ W(2, 1011471785, 1019976613, -1138042285, 1040273597) + W(3, 1016487629, 1027730222, 1010855969, 1025127228)
|
||||
+ W(4, 1029223422, -1126437509, 1049638570, -1090770241) + W(5, 1029091694, 1037672698, 1027546578, 1025680213)
|
||||
+ W(6, -1116040414, 1015478313, -1103217262, -1087230893)
|
||||
+ W(7, 1046437488, 1024768280, 1028909230, 1017109109) + W(8, 1017123181, 1024110818, 1023111893, 1030676769)
|
||||
+ W(9, -1112046985, -1120839802, 1023955584, -1128064723)
|
||||
+ W(10, 1016511669, -1167731667, 1009386661, 1023090125)
|
||||
+ W(11, 1020460717, 1025489318, -1134545259, 1027741830);
|
||||
sum2 =
|
||||
W(0, 1023774756, -1152708847, 1005727232, -1128030280) + W(1, 1004577664, -1116453887, 1020705336, -1127775332)
|
||||
+ W(2, -1107003878, 1013240776, -1108761818, 1032847770)
|
||||
+ W(3, 1024878510, -1129071264, -1124253692, -1114566712)
|
||||
+ W(4, 1020767940, -1108605887, 1050907301, 1058054639) + W(5, -1106188814, 1040942692, -1115446820, 1042743894)
|
||||
+ W(6, -1118294055, -1128830540, -1097736561, 1008347200)
|
||||
+ W(7, 1049418167, -1105809360, 1014050712, -1132221182)
|
||||
+ W(8, -1113997093, -1139588328, 1032528025, 1039669350)
|
||||
+ W(9, -1108856812, -1104688291, 1018266740, -1103534695)
|
||||
+ W(10, 1021408408, -1119578529, -1135972104, -1131826954)
|
||||
+ W(11, 999382680, 1019392776, -1117167612, 1022204104);
|
||||
WS(1034686080, -1080904524);
|
||||
sum1 = W(0, -1139332721, -1156631187, 1023562901, 1011454089) + W(1, 1009225011, 1012327853, 1010687981, 1010798725)
|
||||
+ W(2, 1025190657, 1024127465, 1006799241, -1145135178) + W(3, -1129417722, 1017495114, 1027561839, 1010795083)
|
||||
+ W(4, -1143163562, 1040892278, -1104914606, -1089193318)
|
||||
+ W(5, 1043909393, -1119873834, -1136185891, -1118482716)
|
||||
+ W(6, 1041601261, 1028605547, 1052908885, -1091833281)
|
||||
+ W(7, -1103073573, 1025246703, -1124345098, 1032670633)
|
||||
+ W(8, 1024768205, -1129308018, -1117860929, 1036300940) + W(9, 1040987970, 1033652713, 1024209623, 1027144528)
|
||||
+ W(10, -1137907141, 1012089369, 1019594656, -1143330794)
|
||||
+ W(11, 992909011, -1123933213, 1018355139, -1123266333);
|
||||
sum2 = W(0, 998154484, -1136847690, -1120010648, -1145824866) + W(1, 1015307650, 994166396, 1000606426, 1010815409)
|
||||
+ W(2, -1124228589, 1028193069, 1043298286, -1115567961)
|
||||
+ W(3, -1106126812, -1111174068, -1128437454, -1110538383)
|
||||
+ W(4, -1132108902, -1123281782, -1097765474, 1059221182)
|
||||
+ W(5, 1048600788, -1130476352, 1026255089, -1118584150)
|
||||
+ W(6, -1115676434, -1123302060, 1027211577, 1034703777)
|
||||
+ W(7, -1099334080, 1015056080, -1137618020, 1028199647)
|
||||
+ W(8, -1123985162, -1132306691, -1114822183, -1131429597)
|
||||
+ W(9, 1029215805, 1023836215, -1127893362, 1025007180)
|
||||
+ W(10, 1004957466, 1011392625, -1127542967, 1022587458)
|
||||
+ W(11, -1127163397, -1122559367, -1171736302, -1124423270);
|
||||
WS(-1097173920, -1100403112);
|
||||
sum1 = W(0, -1133792968, -1124511038, -1133667884, -1126404688)
|
||||
+ W(1, -1129657589, -1123548289, -1134226372, -1120375479)
|
||||
+ W(2, -1126599342, -1134319356, -1113753548, -1113718896)
|
||||
+ W(3, 1034489098, 1017816566, -1122628437, 1029275393) + W(4, 1026626987, -1112479512, 1051379210, 1058852431)
|
||||
+ W(5, -1097104011, 1007326848, -1112737379, -1116837058)
|
||||
+ W(6, -1109988694, -1122054529, -1097159959, 1058630415)
|
||||
+ W(7, 1049904553, -1104990865, 983139170, -1110311540)
|
||||
+ W(8, -1128510918, -1138055228, 1031366423, -1108453759)
|
||||
+ W(9, -1111244112, -1129654222, -1143321192, -1132471000)
|
||||
+ W(10, -1124691470, -1131431128, -1128464692, -1122909907)
|
||||
+ W(11, 1006087192, -1138955724, -1123473736, -1149064600);
|
||||
sum2 = W(0, -1133003813, -1170659932, 1006656308, 1031055883)
|
||||
+ W(1, -1122785076, -1138148825, -1134424500, -1131089901)
|
||||
+ W(2, -1145103116, 1024883426, -1122208183, -1101651786)
|
||||
+ W(3, -1107240567, -1137951645, -1131665157, -1116779622)
|
||||
+ W(4, -1105221269, -1117429423, -1098340061, 1055658740)
|
||||
+ W(5, 1035604404, -1131811521, -1130287800, -1123356625)
|
||||
+ W(6, 1033080040, 1028547885, 1042272545, 1058321046) + W(7, -1112738821, 1003752088, 1015669581, 1033205575)
|
||||
+ W(8, 1016862101, -1128891234, -1121562483, -1100689547)
|
||||
+ W(9, -1115182870, 1026865631, -1129373191, -1134576021)
|
||||
+ W(10, -1129731365, -1147341896, -1121650606, 1031708925)
|
||||
+ W(11, -1123396988, -1133076983, -1131162259, -1127933595);
|
||||
WS(1049422752, 1064394145);
|
||||
sum1 = W(0, 1016583527, 997641734, -1129211865, -1131677043) + W(1, 1021172552, 1021691141, -1133007562, 1026669144)
|
||||
+ W(2, -1106085006, -1123841888, 1041111742, 1032245856)
|
||||
+ W(3, -1110939130, -1105269375, -1137128282, -1106745812)
|
||||
+ W(4, 995307718, 1031369872, -1088517333, -1098988005) + W(5, 1058612208, 1030057089, 1036830720, 1034516890)
|
||||
+ W(6, 1042273115, 1021597381, 1058826428, -1105331685)
|
||||
+ W(7, -1090507155, 1043687978, -1120823228, 1038691348)
|
||||
+ W(8, -1113049442, -1122854832, -1113933244, 1032610296)
|
||||
+ W(9, 1037338632, -1122591528, -1116248270, -1117945591)
|
||||
+ W(10, 1025810280, 1006187755, 1019889767, -1131685097)
|
||||
+ W(11, -1155049030, -1134096210, 1025994697, -1126546729);
|
||||
sum2 =
|
||||
W(0, 1015668141, -1124041461, 1005775275, 1015274173) + W(1, -1138139200, 992639399, -1132053353, -1129319398)
|
||||
+ W(2, -1138201662, -1140877219, 1027346708, 1016303395)
|
||||
+ W(3, -1106503093, -1117618841, -1115775134, -1122071744)
|
||||
+ W(4, -1111996311, -1116450062, -1125910350, -1108948194)
|
||||
+ W(5, -1104963655, 1031763952, 1015724405, 1034411590)
|
||||
+ W(6, -1127284815, -1123578506, -1106280325, 1052974100) + W(7, 1053021197, 957951850, 1016609913, -1140595900)
|
||||
+ W(8, -1125087482, 1024732308, 1034158307, 1032925063) + W(9, -1107449032, 994113735, -1132927280, -1140186580)
|
||||
+ W(10, 1020174885, -1139064970, -1133423524, -1161498797)
|
||||
+ W(11, -1134898868, 1013272790, -1132485274, -1164791981);
|
||||
WS(-1101497152, -1084603877);
|
||||
sum1 =
|
||||
W(0, -1136425045, -1140218249, -1121565262, -1122160667)
|
||||
+ W(1, 1006883159, -1132497425, 1004502690, -1128604789) + W(2, 1016522037, -1157845066, 1035402982, 1027884002)
|
||||
+ W(3, 1025282390, 1022271101, -1138792353, 995143101) + W(4, 967194407, 1029505522, 1022903246, -1107598171)
|
||||
+ W(5, 1025270942, -1120772739, -1154700189, -1128284203)
|
||||
+ W(6, 1019848413, -1119357636, 1027088345, 1024422013) + W(7, -1117602990, 1030415880, -1171556244, 1025955498)
|
||||
+ W(8, -1129523533, -1140249161, -1121932442, -1127296803)
|
||||
+ W(9, 1030372258, -1129818261, -1138666305, -1121511513)
|
||||
+ W(10, -1142614610, -1135395837, -1148904362, 1002411186)
|
||||
+ W(11, -1130529549, 1018540973, -1138856043, 1011955033);
|
||||
sum2 =
|
||||
W(0, -1126668299, -1137964827, 1015963121, 1030757650) + W(1, -1137547503, 997720155, -1132279825, -1152589275)
|
||||
+ W(2, -1131366283, -1117612139, -1097765254, -1083955387)
|
||||
+ W(3, -1103492737, -1118797430, -1131387718, -1140632131)
|
||||
+ W(4, 1024971228, 1034620123, 1049249869, 1064229708) + W(5, 1047078464, 1017921725, 1024786888, -1144191965)
|
||||
+ W(6, 1000957181, -1119890411, 1026062254, -1107214224) + W(7, -1122275403, 1016072153, -1133941049, 996433547)
|
||||
+ W(8, -1151515419, -1145021381, 1001872029, 1026637176)
|
||||
+ W(9, 1027173860, -1135832789, -1148432117, -1140699475)
|
||||
+ W(10, 1005199725, -1136862175, 1007955643, -1125717658)
|
||||
+ W(11, -1169614250, 923654805, 1002011725, 1005736109);
|
||||
WS(1059552336, -1136539026);
|
||||
sum1 = W(0, 990367896, 1010957914, -1154541352, 1017372383) + W(1, 1026284586, 1015461809, 1014364322, 1021777309)
|
||||
+ W(2, 1041343484, 1036710283, 1045269292, -1112141659) + W(3, -1116984235, 1018013925, 1006339428, 1032228520)
|
||||
+ W(4, -1096612504, -1107358947, -1087221074, 1058232297)
|
||||
+ W(5, 1051438086, 1047713030, 1032067931, 1045851190) + W(6, 1033353841, 1029016441, 1042554433, 1029783110)
|
||||
+ W(7, -1087458720, -1095293300, -1114380761, -1099415088)
|
||||
+ W(8, -1125599349, -1132821402, -1154580200, -1114120867)
|
||||
+ W(9, 1033522371, 1032365167, 1004597796, 1030006574) + W(10, 1028944863, 1024290996, 1023892422, 1023410731)
|
||||
+ W(11, -1144215764, -1144750420, 1001346936, -1130073781);
|
||||
sum2 = W(0, -1153914788, -1126180245, 995844276, -1141303142) + W(1, 1026951758, 990856044, 1004850742, 1015235936)
|
||||
+ W(2, -1101809160, -1115520194, -1114410922, 1033573989)
|
||||
+ W(3, 998799030, -1110191966, -1129252703, 1016759632) + W(4, 1052877341, 1022007580, 1055965696, -1098900400)
|
||||
+ W(5, 1023481081, 1023185808, 1031073312, -1104219784) + W(6, 1046574229, 1000424166, 1034680258, -1098051352)
|
||||
+ W(7, 1045079279, -1106708743, 984863273, -1103050031)
|
||||
+ W(8, -1095334336, -1113807813, -1109583292, 1033797491)
|
||||
+ W(9, 1032986287, 1025876178, -1137844345, 1038371038)
|
||||
+ W(10, 1023520281, 1021218858, 1008634443, -1115608949)
|
||||
+ W(11, 1032307290, -1128938562, 1017335440, 1020607644);
|
||||
WS(-1080660584, -1085825159);
|
||||
sum1 = W(0, 1013708199, 998525366, 1026247587, 1009982783) + W(1, -1136883881, -1128223794, 999596614, -1126850910)
|
||||
+ W(2, -1123370319, -1131079022, -1110742584, -1113918027)
|
||||
+ W(3, 1032009669, 1025751155, -1126831290, 1033490448)
|
||||
+ W(4, -1145658646, -1111041043, 1047524760, 1038280501)
|
||||
+ W(5, -1096311074, 1042402294, -1118872454, 1023947050)
|
||||
+ W(6, -1118786339, 1035331132, -1095527502, 1041941518)
|
||||
+ W(7, 1051037928, -1106649743, 1032615945, -1111971999)
|
||||
+ W(8, 1028171867, 1017605134, 1042485668, -1110999603) + W(9, -1106204846, -1132447358, 1002160934, 971034337)
|
||||
+ W(10, -1144908790, -1131113128, -1130744068, 992723116)
|
||||
+ W(11, 1025830203, 1017749654, -1127230527, 1018668086);
|
||||
sum2 = W(0, 988660617, -1145776834, -1133422913, -1126488793) + W(1, 999416722, 999055930, 987033481, -1179419172)
|
||||
+ W(2, 1017543700, 1002138986, 1031822055, 1032911160) + W(3, 1018658069, 1007351961, -1162261577, 999395634)
|
||||
+ W(4, 1015794522, 1001599498, 1041494739, 1056510750) + W(5, 1023998101, -1138508317, 991201876, -1141702058)
|
||||
+ W(6, -1133704409, 1024621822, -1102581932, -1089051586)
|
||||
+ W(7, -1111744235, 1009295285, -1140892226, -1147317506)
|
||||
+ W(8, 1003471274, -1135257421, 970658596, 1026713544) + W(9, 945757471, 998080468, -1156050276, 1007988669)
|
||||
+ W(10, -1140119133, -1136500105, -1163479081, 1009057465)
|
||||
+ W(11, 1000517690, -1137960905, -1186683976, -1146609818);
|
||||
WS(1064784784, -1120346387);
|
||||
sum1 =
|
||||
W(0, -1150678408, 1000581420, 1006062348, 1015088861) + W(1, -1136518116, -1126533711, 1012432222, -1122704190)
|
||||
+ W(2, 1015721531, -1110077251, -1107299655, -1132691862)
|
||||
+ W(3, -1148196556, 1013224070, -1124886999, 1014270588) + W(4, 1049255678, 1043607805, 1059242626, -1123916922)
|
||||
+ W(5, -1096371932, -1100407642, -1132580564, -1102354822)
|
||||
+ W(6, -1099108228, -1107416484, -1089544734, -1130977491)
|
||||
+ W(7, 1057929313, 1048500643, 1035479729, 1044504531) + W(8, -1149551256, 1017163947, 1023526494, 1022505321)
|
||||
+ W(9, -1104456865, -1111675367, -1127245287, 1007459698)
|
||||
+ W(10, -1136953142, -1140022794, -1139533474, 1012221798)
|
||||
+ W(11, 1014035238, 1026165050, -1136458552, 1017479699);
|
||||
sum2 = W(0, -1140771860, -1125513054, -1131667695, -1131288705)
|
||||
+ W(1, 978159878, 1012596136, 1011933560, -1121534607) + W(2, 1031694512, -1122230843, 979286214, 1025077104)
|
||||
+ W(3, -1108853537, -1102087836, -1117751010, -1113389694)
|
||||
+ W(4, -1104948969, 993388690, 1026183534, -1111209187) + W(5, 1041617383, 1045410736, 1030126174, 1044425994)
|
||||
+ W(6, -1115570202, 1042093481, 1042830623, -1112764939) + W(7, 1043422569, 1034771561, 1014235016, 1025820984)
|
||||
+ W(8, 1040745971, -1111499166, -1119680402, 982469091)
|
||||
+ W(9, -1120447085, -1109907689, -1127258987, -1115100280)
|
||||
+ W(10, -1127298441, 995262946, 1002124441, -1123012516)
|
||||
+ W(11, -1129740789, -1125016939, 1004566649, -1119639931);
|
||||
WS(-1088649680, 1067112300);
|
||||
|
||||
// Scale the weighted average by the standard deviation, add back the mean,
// and clamp the prediction to [0, 1].
return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);
|
||||
}
|
||||
|
||||
// Group-shared tile of 555 floats; Pass2 fills it with index = x * 15 + y
// (x = id / 15, y = id % 15) before any thread reads its window.
shared float inp[555];
|
||||
|
||||
// Index of this pass; compared against LAST_PASS inside Pass2.
#define CURRENT_PASS 2
|
||||
|
||||
// Converts an RGB(A) sample to a single luma value using the rgb2y weights.
#define GET_SAMPLE(x) dot(x.rgb, rgb2y)
|
||||
// Redirects GLSL-style imageStore calls to imageStoreOverride; the out_image
// argument is ignored and only the first component of val is forwarded.
#define imageStore(out_image, pos, val) imageStoreOverride(pos, val.x)
|
||||
// Writes one texel of OUTPUT: the supplied luma is combined with the chroma of
// INPUT sampled (linear filter) at the matching normalized position, and the
// result is converted back to RGB with alpha 1.
// pos:   destination texel in OUTPUT.
// value: luma to store.
void imageStoreOverride(uint2 pos, float value) {
	const float3 srcRgb = INPUT.SampleLevel(sam_INPUT_LINEAR, HOOKED_map(pos), 0).rgb;
	const float2 chroma = mul(rgb2uv, srcRgb);
	const float3 yuv = float3(value, chroma);
	OUTPUT[pos] = float4(mul(yuv2rgb, yuv), 1.0);
}
|
||||
|
||||
// Samples INPUT at a normalized position and reduces it to luma.
#define INPUT_tex(pos) GET_SAMPLE(vec4(texture(INPUT, pos)))
|
||||
// Size of INPUT in texels and the size of one INPUT texel in normalized units.
static const float2 INPUT_size = float2(GetInputSize());
|
||||
static const float2 INPUT_pt = float2(GetInputPt());
|
||||
|
||||
// Samples the first channel of the intermediate texture `temp`.
#define temp_tex(pos) (float(texture(temp, pos).x))
|
||||
// `temp` has the input width and twice the input height.
static const float2 temp_size = float2(GetInputSize().x * 1, GetInputSize().y * 2);
|
||||
// Size of one `temp` texel in normalized coordinates.
static const float2 temp_pt = float2(1.0 / (temp_size.x), 1.0 / (temp_size.y));
|
||||
|
||||
// In this pass the GLSL "HOOKED" texture is `temp`.
#define HOOKED_tex(pos) temp_tex(pos)
|
||||
#define HOOKED_size temp_size
|
||||
#define HOOKED_pt temp_pt
|
||||
|
||||
// Pass 2 (double_x): doubles the width of `temp`.
// Each thread group first loads a 37x15 tile of `temp` into the shared `inp`
// array (37 columns starting 2 texels left of the group, 15 rows starting
// 3 texels above it, stored as x * 15 + y). Each thread then gathers a window
// of 6 columns x 8 rows from the tile, predicts the in-between sample with
// nnedi3(), and writes two horizontally adjacent output texels: the original
// sample at (2 * x, y) and the predicted one at (2 * x + 1, y).
// blockStart: top-left output texel of this thread group's block.
// threadId:   thread index inside the group.
void Pass2(uint2 blockStart, uint3 threadId) {
	ivec2 group_base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize);
	int local_pos = int(gl_LocalInvocationID.x) * 15 + int(gl_LocalInvocationID.y);

	// Cooperative load: the threads of the group stride over the 555 tile entries.
	for (int id = int(gl_LocalInvocationIndex); id < 555; id += int(gl_WorkGroupSize.x * gl_WorkGroupSize.y)) {
		// The division and modulo stay unsigned, but x and y must be signed:
		// with uint operands "group_base.x + x - 2" is evaluated as unsigned
		// and wraps around for the first block column/row, so the left/top
		// padding would be sampled far outside the opposite edge.
		int x = int((uint)id / 15), y = int((uint)id % 15);
		inp[id] =
			HOOKED_tex(HOOKED_pt * vec2(float(group_base.x + x - (2)) + 0.5, float(group_base.y + y - (3)) + 0.5)).x;
	}

	// Every thread reads entries written by other threads.
	barrier();

	vec4 ret = vec4(0.0, 0.0, 0.0, 0.0);
	vec4 ret0 = vec4(0.0, 0.0, 0.0, 0.0);

	// Gather the window: tile columns are 15 entries apart, and each column
	// contributes 8 consecutive rows split over two vec4s.
	vec4 samples[12];
	samples[0][0] = inp[local_pos + 0];
	samples[0][1] = inp[local_pos + 1];
	samples[0][2] = inp[local_pos + 2];
	samples[0][3] = inp[local_pos + 3];
	samples[1][0] = inp[local_pos + 4];
	samples[1][1] = inp[local_pos + 5];
	samples[1][2] = inp[local_pos + 6];
	samples[1][3] = inp[local_pos + 7];
	samples[2][0] = inp[local_pos + 15];
	samples[2][1] = inp[local_pos + 16];
	samples[2][2] = inp[local_pos + 17];
	samples[2][3] = inp[local_pos + 18];
	samples[3][0] = inp[local_pos + 19];
	samples[3][1] = inp[local_pos + 20];
	samples[3][2] = inp[local_pos + 21];
	samples[3][3] = inp[local_pos + 22];
	samples[4][0] = inp[local_pos + 30];
	samples[4][1] = inp[local_pos + 31];
	samples[4][2] = inp[local_pos + 32];
	samples[4][3] = inp[local_pos + 33];
	samples[5][0] = inp[local_pos + 34];
	samples[5][1] = inp[local_pos + 35];
	samples[5][2] = inp[local_pos + 36];
	samples[5][3] = inp[local_pos + 37];
	samples[6][0] = inp[local_pos + 45];
	samples[6][1] = inp[local_pos + 46];
	samples[6][2] = inp[local_pos + 47];
	samples[6][3] = inp[local_pos + 48];
	samples[7][0] = inp[local_pos + 49];
	samples[7][1] = inp[local_pos + 50];
	samples[7][2] = inp[local_pos + 51];
	samples[7][3] = inp[local_pos + 52];
	samples[8][0] = inp[local_pos + 60];
	samples[8][1] = inp[local_pos + 61];
	samples[8][2] = inp[local_pos + 62];
	samples[8][3] = inp[local_pos + 63];
	samples[9][0] = inp[local_pos + 64];
	samples[9][1] = inp[local_pos + 65];
	samples[9][2] = inp[local_pos + 66];
	samples[9][3] = inp[local_pos + 67];
	samples[10][0] = inp[local_pos + 75];
	samples[10][1] = inp[local_pos + 76];
	samples[10][2] = inp[local_pos + 77];
	samples[10][3] = inp[local_pos + 78];
	samples[11][0] = inp[local_pos + 79];
	samples[11][1] = inp[local_pos + 80];
	samples[11][2] = inp[local_pos + 81];
	samples[11][3] = inp[local_pos + 82];

	ret[0] = nnedi3(samples);
	// Tile column 2, row 3 is this thread's own source texel.
	ret0[0] = inp[local_pos + 33];

#if CURRENT_PASS == LAST_PASS
	// The guard has to use the same (2, 1) scaling as the stores below: this
	// pass emits two texels horizontally but only one row per thread, so
	// scaling threadId.y by 2 would skip valid rows of the bottom block.
	uint2 destPos = uint2(gl_GlobalInvocationID) * uint2(2, 1);
	uint2 outputSize = GetOutputSize();
	if (destPos.x >= outputSize.x || destPos.y >= outputSize.y) {
		return;
	}
#endif

	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1), ret0);
	imageStore(out_image, ivec2(gl_GlobalInvocationID) * ivec2(2, 1) + ivec2(1, 0), ret);
}
|
||||
7880
src/Effects/NNEDI3/NNEDI3_nns256_win8x4.hlsl
Normal file
7880
src/Effects/NNEDI3/NNEDI3_nns256_win8x4.hlsl
Normal file
File diff suppressed because it is too large
Load diff
11232
src/Effects/NNEDI3/NNEDI3_nns256_win8x6.hlsl
Normal file
11232
src/Effects/NNEDI3/NNEDI3_nns256_win8x6.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1193
src/Effects/NNEDI3/NNEDI3_nns32_win8x4.hlsl
Normal file
1193
src/Effects/NNEDI3/NNEDI3_nns32_win8x4.hlsl
Normal file
File diff suppressed because it is too large
Load diff
1643
src/Effects/NNEDI3/NNEDI3_nns32_win8x6.hlsl
Normal file
1643
src/Effects/NNEDI3/NNEDI3_nns32_win8x6.hlsl
Normal file
File diff suppressed because it is too large
Load diff
2111
src/Effects/NNEDI3/NNEDI3_nns64_win8x4.hlsl
Normal file
2111
src/Effects/NNEDI3/NNEDI3_nns64_win8x4.hlsl
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
73
src/Effects/NNEDI3/prescalers.hlsli
Normal file
73
src/Effects/NNEDI3/prescalers.hlsli
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Conversion from GLSL to HLSL is done through defines as much as possible to ease synchronization and comparison with upstream
|
||||
// GLSL vector type names mapped to their HLSL equivalents.
#define ivec2 int2
|
||||
|
||||
#define vec2 float2
|
||||
#define vec3 float3
|
||||
#define vec4 float4
|
||||
|
||||
// NOTE(review): GLSL matNxM is columns x rows while HLSL floatNxM is
// rows x columns; confirm call sites expect this layout.
#define mat4x3 float4x3
|
||||
// The HLSL * operator on two matrices is already component-wise.
#define matrixCompMult(mtx1, mtx2) (mtx1 * mtx2)
|
||||
|
||||
// GLSL workgroup-shared storage qualifier.
#define shared groupshared
|
||||
|
||||
// Only the two-argument GLSL form atan(y, x) is covered by this alias.
#define atan atan2
|
||||
// Used as barrier(); the call parentheses come from the call site.
#define barrier GroupMemoryBarrierWithGroupSync
|
||||
#define fract frac
|
||||
// Bit-pattern reinterpretation, not a numeric conversion.
#define intBitsToFloat asfloat
|
||||
#define inversesqrt rsqrt
|
||||
// mod deals only with positive numbers here and it could be substituted by fmod
|
||||
#define mod fmod
|
||||
|
||||
// lerp handles bools as the third argument differently from mix
|
||||
// Boolean overload of GLSL mix(): selects b when c is true and a otherwise,
// instead of interpolating between the two.
float mix(float a, float b, bool c) {
	if (c) {
		return b;
	}
	return a;
}
|
||||
|
||||
#define MIX_LERP(type1, type3) type1 mix(type1 a, type1 b, type3 c) { return lerp(a, b, c); }
|
||||
MIX_LERP(float, float)
|
||||
MIX_LERP(float2, float2)
|
||||
MIX_LERP(float3, float)
|
||||
MIX_LERP(float4, float)
|
||||
|
||||
#define texture(tex, pos) tex.SampleLevel(sam_##tex, pos, 0.0)
|
||||
|
||||
#define OUTPUT_pt float2(GetOutputPt())
|
||||
#define frag_pos(id) (vec2(id) + vec2(0.5, 0.5))
|
||||
#define frag_map(id) (OUTPUT_pt * frag_pos(id))
|
||||
#define HOOKED_map(id) frag_map(id)
|
||||
|
||||
#define gl_LocalInvocationIndex (threadId.y*MP_NUM_THREADS_X + threadId.x)
|
||||
#define gl_LocalInvocationID threadId
|
||||
#define gl_WorkGroupSize (uint2(MP_NUM_THREADS_X, MP_NUM_THREADS_Y))
|
||||
#define gl_WorkGroupID (blockStart / uint2(MP_BLOCK_WIDTH, MP_BLOCK_HEIGHT))
|
||||
#define gl_GlobalInvocationID (gl_WorkGroupID*gl_WorkGroupSize + threadId.xy)
|
||||
|
||||
// disable warning about unknown pragma
|
||||
#pragma warning(disable: 3568)
|
||||
// disable warning about too many threads (ravu-r4-rgb triggers it)
|
||||
#pragma warning(disable: 4714)
|
||||
|
||||
// https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.709-6-201506-I!!PDF-E.pdf
|
||||
static const float3 rgb2y = float3(0.2126, 0.7152, 0.0722);
|
||||
static const float2x3 rgb2uv = {
|
||||
-0.2126/1.8556, -0.7152/1.8556, 0.9278/1.8556,
|
||||
0.7874/1.5748, -0.7152/1.5748, -0.0722/1.5748
|
||||
};
|
||||
static const float3x3 yuv2rgb = {
|
||||
1, 0, 1.5748,
|
||||
1, -0.187324, -0.468124,
|
||||
1, 1.8556, 0
|
||||
};
|
||||
|
|
@ -1,20 +1,20 @@
|
|||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!GENERIC_DOWNSCALER
|
||||
|
||||
//!VERSION 4
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
||||
|
||||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
|
||||
//!OUT OUTPUT
|
||||
float4 Pass1(float2 pos) {
|
||||
return INPUT.SampleLevel(sam, pos, 0);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,14 +2,17 @@
|
|||
// 移植自 https://casual-effects.com/research/McGuire2021PixelArt/index.html
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!OUTPUT_WIDTH INPUT_WIDTH * 2
|
||||
//!OUTPUT_HEIGHT INPUT_HEIGHT * 2
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
//!WIDTH INPUT_WIDTH * 2
|
||||
//!HEIGHT INPUT_HEIGHT * 2
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -17,118 +20,113 @@ SamplerState sam;
|
|||
|
||||
//!PASS 1
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
//!BLOCK_SIZE 16
|
||||
//!NUM_THREADS 64
|
||||
|
||||
|
||||
#define src(x, y) INPUT.SampleLevel(sam, float2(x, y) * GetInputPt(), 0).rgb
|
||||
|
||||
float luma(float3 C) {
|
||||
return C.r + C.g + C.b;
|
||||
return C.r + C.g + C.b;
|
||||
}
|
||||
|
||||
bool all_eq2(float3 B, float3 A0, float3 A1) {
|
||||
return all(B == A0) && all(B == A1);
|
||||
return all(B == A0) && all(B == A1);
|
||||
}
|
||||
|
||||
bool all_eq3(float3 B, float3 A0, float3 A1, float3 A2) {
|
||||
return all(B == A0) && all(B == A1) && all(B == A2);
|
||||
return all(B == A0) && all(B == A1) && all(B == A2);
|
||||
}
|
||||
|
||||
bool all_eq4(float3 B, float3 A0, float3 A1, float3 A2, float3 A3) {
|
||||
return all(B == A0) && all(B == A1) && all(B == A2) && all(B == A3);
|
||||
return all(B == A0) && all(B == A1) && all(B == A2) && all(B == A3);
|
||||
}
|
||||
|
||||
bool any_eq3(float3 B, float3 A0, float3 A1, float3 A2) {
|
||||
return all(B == A0) || all(B == A1) || all(B == A2);
|
||||
return all(B == A0) || all(B == A1) || all(B == A2);
|
||||
}
|
||||
|
||||
bool none_eq2(float3 B, float3 A0, float3 A1) {
|
||||
return any(B != A0) && any(B != A1);
|
||||
return any(B != A0) && any(B != A1);
|
||||
}
|
||||
|
||||
bool none_eq4(float3 B, float3 A0, float3 A1, float3 A2, float3 A3) {
|
||||
return any(B != A0) && any(B != A1) && any(B != A2) && any(B != A3);
|
||||
return any(B != A0) && any(B != A1) && any(B != A2) && any(B != A3);
|
||||
}
|
||||
|
||||
void Pass1(uint2 blockStart, uint3 threadId) {
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
|
||||
|
||||
if (!CheckViewport(gxy)) {
|
||||
return;
|
||||
}
|
||||
const uint2 outputSize = GetOutputSize();
|
||||
if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
|
||||
return;
|
||||
}
|
||||
|
||||
float srcX = (gxy.x >> 1) + 0.5f;
|
||||
float srcY = (gxy.y >> 1) + 0.5f;
|
||||
float srcX = (gxy.x >> 1) + 0.5f;
|
||||
float srcY = (gxy.y >> 1) + 0.5f;
|
||||
|
||||
float3 A = src(srcX - 1, srcY - 1), B = src(srcX, srcY - 1), C = src(srcX + 1, srcY - 1);
|
||||
float3 D = src(srcX - 1, srcY + 0), E = src(srcX, srcY + 0), F = src(srcX + 1, srcY + 0);
|
||||
float3 G = src(srcX - 1, srcY + 1), H = src(srcX, srcY + 1), I = src(srcX + 1, srcY + 1);
|
||||
float3 A = src(srcX - 1, srcY - 1), B = src(srcX, srcY - 1), C = src(srcX + 1, srcY - 1);
|
||||
float3 D = src(srcX - 1, srcY + 0), E = src(srcX, srcY + 0), F = src(srcX + 1, srcY + 0);
|
||||
float3 G = src(srcX - 1, srcY + 1), H = src(srcX, srcY + 1), I = src(srcX + 1, srcY + 1);
|
||||
|
||||
float3 J = E, K = E, L = E, M = E;
|
||||
float3 J = E, K = E, L = E, M = E;
|
||||
|
||||
if (any(A != E) || any(B != E) || any(C != E) || any(D != E) || any(F != E) || any(G != E) || any(H != E) || any(I != E)) {
|
||||
float3 P = src(srcX, srcY - 2), S = src(srcX, srcY + 2);
|
||||
float3 Q = src(srcX - 2, srcY), R = src(srcX + 2, srcY);
|
||||
float Bl = luma(B), Dl = luma(D), El = luma(E), Fl = luma(F), Hl = luma(H);
|
||||
if (any(A != E) || any(B != E) || any(C != E) || any(D != E) || any(F != E) || any(G != E) || any(H != E) || any(I != E)) {
|
||||
float3 P = src(srcX, srcY - 2), S = src(srcX, srcY + 2);
|
||||
float3 Q = src(srcX - 2, srcY), R = src(srcX + 2, srcY);
|
||||
float Bl = luma(B), Dl = luma(D), El = luma(E), Fl = luma(F), Hl = luma(H);
|
||||
|
||||
// 1:1 slope rules
|
||||
if ((all(D == B) && any(D != H) && any(D != F)) && (El >= Dl || all(E == A)) && any_eq3(E, A, C, G) && ((El < Dl) || any(A != D) || any(E != P) || any(E != Q))) J = D;
|
||||
if ((all(B == F) && any(B != D) && any(B != H)) && (El >= Bl || all(E == C)) && any_eq3(E, A, C, I) && ((El < Bl) || any(C != B) || any(E != P) || any(E != R))) K = B;
|
||||
if ((all(H == D) && any(H != F) && any(H != B)) && (El >= Hl || all(E == G)) && any_eq3(E, A, G, I) && ((El < Hl) || any(G != H) || any(E != S) || any(E != Q))) L = H;
|
||||
if ((all(F == H) && any(F != B) && any(F != D)) && (El >= Fl || all(E == I)) && any_eq3(E, C, G, I) && ((El < Fl) || any(I != H) || any(E != R) || any(E != S))) M = F;
|
||||
// 1:1 slope rules
|
||||
if ((all(D == B) && any(D != H) && any(D != F)) && (El >= Dl || all(E == A)) && any_eq3(E, A, C, G) && ((El < Dl) || any(A != D) || any(E != P) || any(E != Q))) J = D;
|
||||
if ((all(B == F) && any(B != D) && any(B != H)) && (El >= Bl || all(E == C)) && any_eq3(E, A, C, I) && ((El < Bl) || any(C != B) || any(E != P) || any(E != R))) K = B;
|
||||
if ((all(H == D) && any(H != F) && any(H != B)) && (El >= Hl || all(E == G)) && any_eq3(E, A, G, I) && ((El < Hl) || any(G != H) || any(E != S) || any(E != Q))) L = H;
|
||||
if ((all(F == H) && any(F != B) && any(F != D)) && (El >= Fl || all(E == I)) && any_eq3(E, C, G, I) && ((El < Fl) || any(I != H) || any(E != R) || any(E != S))) M = F;
|
||||
|
||||
// Intersection rules
|
||||
if ((any(E != F) && all_eq4(E, C, I, D, Q) && all_eq2(F, B, H)) && (any(F != src(srcX + 3, srcY)))) K = M = F;
|
||||
if ((any(E != D) && all_eq4(E, A, G, F, R) && all_eq2(D, B, H)) && (any(D != src(srcX - 3, srcY)))) J = L = D;
|
||||
if ((any(E != H) && all_eq4(E, G, I, B, P) && all_eq2(H, D, F)) && (any(H != src(srcX, srcY + 3)))) L = M = H;
|
||||
if ((any(E != B) && all_eq4(E, A, C, H, S) && all_eq2(B, D, F)) && (any(B != src(srcX, srcY - 3)))) J = K = B;
|
||||
if (Bl < El && all_eq4(E, G, H, I, S) && none_eq4(E, A, D, C, F)) J = K = B;
|
||||
if (Hl < El && all_eq4(E, A, B, C, P) && none_eq4(E, D, G, I, F)) L = M = H;
|
||||
if (Fl < El && all_eq4(E, A, D, G, Q) && none_eq4(E, B, C, I, H)) K = M = F;
|
||||
if (Dl < El && all_eq4(E, C, F, I, R) && none_eq4(E, B, A, G, H)) J = L = D;
|
||||
// Intersection rules
|
||||
if ((any(E != F) && all_eq4(E, C, I, D, Q) && all_eq2(F, B, H)) && (any(F != src(srcX + 3, srcY)))) K = M = F;
|
||||
if ((any(E != D) && all_eq4(E, A, G, F, R) && all_eq2(D, B, H)) && (any(D != src(srcX - 3, srcY)))) J = L = D;
|
||||
if ((any(E != H) && all_eq4(E, G, I, B, P) && all_eq2(H, D, F)) && (any(H != src(srcX, srcY + 3)))) L = M = H;
|
||||
if ((any(E != B) && all_eq4(E, A, C, H, S) && all_eq2(B, D, F)) && (any(B != src(srcX, srcY - 3)))) J = K = B;
|
||||
if (Bl < El && all_eq4(E, G, H, I, S) && none_eq4(E, A, D, C, F)) J = K = B;
|
||||
if (Hl < El && all_eq4(E, A, B, C, P) && none_eq4(E, D, G, I, F)) L = M = H;
|
||||
if (Fl < El && all_eq4(E, A, D, G, Q) && none_eq4(E, B, C, I, H)) K = M = F;
|
||||
if (Dl < El && all_eq4(E, C, F, I, R) && none_eq4(E, B, A, G, H)) J = L = D;
|
||||
|
||||
// 2:1 slope rules
|
||||
if (any(H != B)) {
|
||||
if (any(H != A) && any(H != E) && any(H != C)) {
|
||||
if (all_eq3(H, G, F, R) && none_eq2(H, D, src(srcX + 2, srcY - 1))) L = M;
|
||||
if (all_eq3(H, I, D, Q) && none_eq2(H, F, src(srcX - 2, srcY - 1))) M = L;
|
||||
}
|
||||
// 2:1 slope rules
|
||||
if (any(H != B)) {
|
||||
if (any(H != A) && any(H != E) && any(H != C)) {
|
||||
if (all_eq3(H, G, F, R) && none_eq2(H, D, src(srcX + 2, srcY - 1))) L = M;
|
||||
if (all_eq3(H, I, D, Q) && none_eq2(H, F, src(srcX - 2, srcY - 1))) M = L;
|
||||
}
|
||||
|
||||
if (any(B != I) && any(B != G) && any(B != E)) {
|
||||
if (all_eq3(B, A, F, R) && none_eq2(B, D, src(srcX + 2, srcY + 1))) J = K;
|
||||
if (all_eq3(B, C, D, Q) && none_eq2(B, F, src(srcX - 2, srcY + 1))) K = J;
|
||||
}
|
||||
} // H !== B
|
||||
if (any(B != I) && any(B != G) && any(B != E)) {
|
||||
if (all_eq3(B, A, F, R) && none_eq2(B, D, src(srcX + 2, srcY + 1))) J = K;
|
||||
if (all_eq3(B, C, D, Q) && none_eq2(B, F, src(srcX - 2, srcY + 1))) K = J;
|
||||
}
|
||||
} // H !== B
|
||||
|
||||
if (any(F != D)) {
|
||||
if (any(D != I) && any(D != E) && any(D != C)) {
|
||||
if (all_eq3(D, A, H, S) && none_eq2(D, B, src(srcX + 1, srcY + 2))) J = L;
|
||||
if (all_eq3(D, G, B, P) && none_eq2(D, H, src(srcX + 1, srcY - 2))) L = J;
|
||||
}
|
||||
if (any(F != D)) {
|
||||
if (any(D != I) && any(D != E) && any(D != C)) {
|
||||
if (all_eq3(D, A, H, S) && none_eq2(D, B, src(srcX + 1, srcY + 2))) J = L;
|
||||
if (all_eq3(D, G, B, P) && none_eq2(D, H, src(srcX + 1, srcY - 2))) L = J;
|
||||
}
|
||||
|
||||
if (any(F != E) && any(F != A) && any(F != G)) {
|
||||
if (all_eq3(F, C, H, S) && none_eq2(F, B, src(srcX - 1, srcY + 2))) K = M;
|
||||
if (all_eq3(F, I, B, P) && none_eq2(F, H, src(srcX - 1, srcY - 2))) M = K;
|
||||
}
|
||||
} // F !== D
|
||||
} // not constant
|
||||
if (any(F != E) && any(F != A) && any(F != G)) {
|
||||
if (all_eq3(F, C, H, S) && none_eq2(F, B, src(srcX - 1, srcY + 2))) K = M;
|
||||
if (all_eq3(F, I, B, P) && none_eq2(F, H, src(srcX - 1, srcY - 2))) M = K;
|
||||
}
|
||||
} // F !== D
|
||||
} // not constant
|
||||
|
||||
// Write four pixels at once
|
||||
WriteToOutput(gxy, J);
|
||||
// Write four pixels at once
|
||||
OUTPUT[gxy] = float4(J, 1);
|
||||
|
||||
++gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, K);
|
||||
}
|
||||
++gxy.x;
|
||||
OUTPUT[gxy] = float4(K, 1);
|
||||
|
||||
++gxy.y;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, M);
|
||||
}
|
||||
++gxy.y;
|
||||
OUTPUT[gxy] = float4(M, 1);
|
||||
|
||||
--gxy.x;
|
||||
if (CheckViewport(gxy)) {
|
||||
WriteToOutput(gxy, L);
|
||||
}
|
||||
--gxy.x;
|
||||
OUTPUT[gxy] = float4(L, 1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +1,15 @@
|
|||
// 移植自 https://github.com/libretro/common-shaders/blob/master/interpolation/shaders/pixellate.cg
|
||||
|
||||
//!MAGPIE EFFECT
|
||||
//!VERSION 3
|
||||
//!VERSION 4
|
||||
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D INPUT;
|
||||
|
||||
//!TEXTURE
|
||||
Texture2D OUTPUT;
|
||||
|
||||
//!SAMPLER
|
||||
//!FILTER POINT
|
||||
SamplerState sam;
|
||||
|
|
@ -15,6 +18,7 @@ SamplerState sam;
|
|||
//!PASS 1
|
||||
//!STYLE PS
|
||||
//!IN INPUT
|
||||
//!OUT OUTPUT
|
||||
|
||||
float4 Pass1(float2 pos) {
|
||||
float2 texelSize = GetInputPt();
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue