Fix groupshared local variable declaration error and correct texelFetch translation in HLSL generator

This commit is contained in:
PANDATREE 2026-05-21 19:19:13 +09:00
commit b02cde3f59
3 changed files with 146 additions and 108 deletions

View file

@ -77,6 +77,61 @@ def translate_matrix_vector(code):
return code
# Start of a texelFetch call; the word boundary keeps identifiers that merely
# end in "texelFetch" (e.g. "mytexelFetch(") from being rewritten.
_TEXEL_FETCH_CALL = re.compile(r'\btexelFetch\(')
# "* int2(1, 1)" / "* float2(1, 1)": multiplying by one is a no-op.
_IDENTITY_SCALE = re.compile(r'\*\s*(?:int2|float2)\(1,\s*1\)')
# "+ int2(0, 0)" / "+ float2(0, 0)": adding zero is a no-op, unless the zero
# vector is itself the left operand of a higher-precedence operator.
_ZERO_OFFSET = re.compile(r'\+\s*(?:int2|float2)\(0,\s*0\)(?!\s*[*/%])')


def _find_closing_paren(text, start):
    """Return the index of the ')' closing a '(' opened just before *start*, or -1."""
    depth = 1
    for index in range(start, len(text)):
        ch = text[index]
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                return index
    return -1


def _split_top_level_args(inner):
    """Split *inner* on commas that are not nested inside parentheses."""
    args = []
    depth = 0
    arg_start = 0
    for index, ch in enumerate(inner):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
        elif ch == ',' and depth == 0:
            args.append(inner[arg_start:index].strip())
            arg_start = index + 1
    args.append(inner[arg_start:].strip())
    return args


def replace_texel_fetch_robust(code):
    """Rewrite ``texelFetch(TEX, POS, LOD)`` calls as ``TEX.Load(int3(POS, LOD))``.

    Calls are located by balancing parentheses rather than by a single regex,
    so position expressions that themselves contain parentheses and commas
    (e.g. ``int2(x,y)``) are handled.

    Args:
        code: Shader source text in which vector type names have already been
            translated (the simplification step looks for ``int2``/``float2``).

    Returns:
        The text with every three-argument ``texelFetch`` call rewritten.
        A trailing ``_raw`` suffix is dropped from the texture name, no-op
        ``* int2(1, 1)`` / ``+ int2(0, 0)`` terms are removed from the
        position, and the LOD argument is carried over unchanged. Calls with
        a different argument count are left verbatim; scanning stops at the
        first call whose parentheses are unbalanced.
    """
    pieces = []
    cursor = 0
    while True:
        match = _TEXEL_FETCH_CALL.search(code, cursor)
        if match is None:
            break
        args_start = match.end()
        close = _find_closing_paren(code, args_start)
        if close == -1:
            # Unbalanced call: nothing after this point can be parsed safely.
            break
        call_end = close + 1
        args = _split_top_level_args(code[args_start:close])
        if len(args) != 3:
            # Not the (sampler, position, lod) form: keep the call as written.
            pieces.append(code[cursor:call_end])
            cursor = call_end
            continue

        tex_name, pos_expr, lod = args
        if tex_name.endswith('_raw'):
            tex_name = tex_name[:-len('_raw')]
        # A fetch may be nested inside the position or LOD expression.
        pos_expr = replace_texel_fetch_robust(pos_expr)
        lod = replace_texel_fetch_robust(lod)
        pos_expr = _IDENTITY_SCALE.sub('', pos_expr)
        # Strip so the removed terms leave no stray blank before the comma.
        pos_expr = _ZERO_OFFSET.sub('', pos_expr).strip()

        pieces.append(code[cursor:match.start()])
        pieces.append(f"{tex_name}.Load(int3({pos_expr}, {lod}))")
        cursor = call_end
    pieces.append(code[cursor:])
    return "".join(pieces)
def port_standard(glsl_path, hlsl_path):
header, passes = parse_glsl_passes(glsl_path)
@ -478,32 +533,14 @@ def port_cmp(glsl_path, hlsl_path):
# Since gl_GlobalInvocationID is base + tid.xy:
translated_code = re.sub(r'\bgl_GlobalInvocationID\b', '(base + tid.xy)', translated_code)
# Replace texelFetch calls
# e.g., texelFetch(LUMA_raw, (base + ivec2(x,y) - offset) * ivec2(1, 1) + ivec2(0, 0), 0)
# In HLSL: LUMA.Load(int3(pos, 0))
# Let's write a regex to convert this fetch pattern
# texelFetch(TEX_raw, POS, 0) -> TEX.Load(int3(POS, 0))
def repl_texel_fetch(match):
tex_raw = match.group(1)
pos_expr = match.group(2)
# Map TEX_raw to HLSL texture name
tex_name = tex_raw
if tex_name.endswith('_raw'):
tex_name = tex_name[:-4]
if tex_name == 'LUMA':
tex_name = 'LUMA'
# Simplify POS expression (remove * ivec2(1,1) + ivec2(0,0))
pos_expr = re.sub(r'\*\s*i?vec2\(1,\s*1\)', '', pos_expr)
pos_expr = re.sub(r'\+\s*i?vec2\(0,\s*0\)', '', pos_expr)
return f"{tex_name}.Load(int3({pos_expr.strip()}, 0))"
translated_code = re.sub(r'\btexelFetch\((\w+),\s*(.*?),\s*0\)', repl_texel_fetch, translated_code)
# Replace texelFetch calls using our robust parser
translated_code = replace_texel_fetch_robust(translated_code)
# Convert imageStore to output assignments
# e.g., imageStore(out_image, store_pos0, result0); -> conv2d[store_pos0] = result0;
translated_code = re.sub(r'imageStore\(out_image,\s*(.*?),\s*(.*?)\);', rf'{save_target}[\1] = \2;', translated_code)
# Add local declarations prepended to the function body
# Add global declarations above the function body
if save_target == 'conv2d_6':
isize_x = 18
isize_y = 18
@ -517,12 +554,13 @@ def port_cmp(glsl_path, hlsl_path):
isize_y = 18
inp_decl = f"groupshared MF4 inp[8][{isize_y}][{isize_x}];"
prepend_block = f"\tstatic const int2 ksize = int2(3, 3);\n\tstatic const int2 offset = int2(1, 1);\n\tstatic const int2 isize = int2({isize_x}, {isize_y});\n\t{inp_decl}\n"
global_decl = f"static const int2 ksize = int2(3, 3);\nstatic const int2 offset = int2(1, 1);\nstatic const int2 isize = int2({isize_x}, {isize_y});\n{inp_decl}\n"
# Rewrite the hook function signature to match MagpieFX style
func_sig = f"void Pass{pass_num}(uint2 blockStart, uint3 tid) {{"
translated_code = translated_code.replace("void hook() {", func_sig + "\n" + prepend_block)
translated_code = translated_code.replace("void hook() {", func_sig)
hlsl_content.append(global_decl)
hlsl_content.append(translated_code)
hlsl_content.append("\n")

View file

@ -1,4 +1,4 @@
// Generated from Ani4Kv2_ArtCNN_C4F32_i2.glsl - Action Trigger 2026-05-21
// Generated from Ani4Kv2_ArtCNN_C4F32_i2.glsl
// Ani4Kv2 ArtCNN - trained by Sirosky and distributed on the CC BY-NC 4.0 license.
// The ArtCNN architecture is further subject to the below license.
// MIT License

View file

@ -125,16 +125,16 @@ void Pass1(uint2 blockStart, uint3 tid) {
//!IN LUMA
//!OUT conv2d
void Pass2(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF inp[1][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF inp[1][18][4];
void Pass2(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF(LUMA_mul * LUMA.Load(int3((base + int2(x,y) - offset) * int2(1, 1) + int2(0, 0)), 0).x);
inp[0][y][x] = MF(LUMA_mul * LUMA.Load(int3((base + int2(x,y) - offset) , 0)).x);
}
}
@ -246,23 +246,23 @@ void Pass2(uint2 blockStart, uint3 tid) {
//!IN conv2d
//!OUT conv2d_1
void Pass3(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
void Pass3(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}
@ -878,23 +878,23 @@ void Pass3(uint2 blockStart, uint3 tid) {
//!IN conv2d_1
//!OUT conv2d_2
void Pass4(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
void Pass4(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_1_mul * conv2d_1.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}
@ -1510,23 +1510,23 @@ void Pass4(uint2 blockStart, uint3 tid) {
//!IN conv2d_2
//!OUT conv2d_3
void Pass5(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
void Pass5(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_2_mul * conv2d_2.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}
@ -2142,23 +2142,23 @@ void Pass5(uint2 blockStart, uint3 tid) {
//!IN conv2d_3
//!OUT conv2d_4
void Pass6(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
void Pass6(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_3_mul * conv2d_3.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}
@ -2774,23 +2774,23 @@ void Pass6(uint2 blockStart, uint3 tid) {
//!IN conv2d_4
//!OUT conv2d_5
void Pass7(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(4, 18);
groupshared MF4 inp[8][18][4];
void Pass7(uint2 blockStart, uint3 tid) {
uint2 base = uint2(blockStart.x / 8, blockStart.y);
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 2) {
inp[0][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_4_mul * conv2d_4.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}
@ -3406,23 +3406,23 @@ void Pass7(uint2 blockStart, uint3 tid) {
//!IN conv2d_5
//!OUT conv2d_6
void Pass8(uint2 blockStart, uint3 tid) {
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(18, 18);
groupshared MF4 inp[8][18][18];
static const int2 ksize = int2(3, 3);
static const int2 offset = int2(1, 1);
static const int2 isize = int2(18, 18);
groupshared MF4 inp[8][18][18];
void Pass8(uint2 blockStart, uint3 tid) {
uint2 base = blockStart;
for (uint y = tid.y; y < isize.y; y += 16) {
for (uint x = tid.x; x < isize.x; x += 16) {
inp[0][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(0, 0)), 0));
inp[1][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0)), 0));
inp[2][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0)), 0));
inp[3][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0)), 0));
inp[4][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0)), 0));
inp[5][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0)), 0));
inp[6][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0)), 0));
inp[7][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0)), 0));
inp[0][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) , 0)));
inp[1][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(1, 0), 0)));
inp[2][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(2, 0), 0)));
inp[3][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(3, 0), 0)));
inp[4][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(4, 0), 0)));
inp[5][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(5, 0), 0)));
inp[6][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(6, 0), 0)));
inp[7][y][x] = MF4(conv2d_5_mul * conv2d_5.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)) + conv2d_mul * conv2d.Load(int3((base + int2(x,y) - offset) * int2(8, 1) + int2(7, 0), 0)));
}
}