Merge 19e3a8142f into 2980a69156

2026-06-24 02:04:10 +00:00 · 2026-05-27 11:59:01 +00:00 · 2026-05-27 11:59:01 +00:00 · 13b1cd4a19
commit 13b1cd4a19
parent 2980a69156 19e3a8142f
5 changed files with 4230 additions and 2 deletions
--- a/src/Effects/ArtCNN/Ani4Kv2_ArtCNN_C4F32_i2.hlsl
+++ b/src/Effects/ArtCNN/Ani4Kv2_ArtCNN_C4F32_i2.hlsl
--- a/src/Effects/Effects.vcxproj
+++ b/src/Effects/Effects.vcxproj
@ -464,6 +464,9 @@
    <CopyFileToFolders Include="k7_modernAnime_FHD_x2.hlsl">
      <FileType>Document</FileType>
    </CopyFileToFolders>
+    <CopyFileToFolders Include="ArtCNN\Ani4Kv2_ArtCNN_C4F32_i2.hlsl">
+      <FileType>Document</FileType>
+    </CopyFileToFolders>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
 </Project>
--- a/src/Effects/Effects.vcxproj.filters
+++ b/src/Effects/Effects.vcxproj.filters
@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <CopyFileToFolders Include="ACNet.hlsl" />
@ -453,6 +453,9 @@
    </CopyFileToFolders>
    <CopyFileToFolders Include="SGSR.hlsl" />
    <CopyFileToFolders Include="k7_modernAnime_FHD_x2.hlsl" />
+    <CopyFileToFolders Include="ArtCNN\Ani4Kv2_ArtCNN_C4F32_i2.hlsl">
+      <Filter>ArtCNN</Filter>
+    </CopyFileToFolders>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="Anime4K">
@ -500,6 +503,9 @@
    <Filter Include="CuNNy2">
      <UniqueIdentifier>{52055d56-41dc-409a-a878-3c1278082f6d}</UniqueIdentifier>
    </Filter>
+    <Filter Include="ArtCNN">
+      <UniqueIdentifier>{a2055d56-41dc-409a-a878-3c1278082f6a}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <None Include="StubDefs.hlsli" />
--- a/src/Magpie/AppSettings.cpp
+++ b/src/Magpie/AppSettings.cpp
@ -1180,7 +1180,7 @@ bool AppSettings::_SetDefaultShortcuts() noexcept {
 }

 void AppSettings::_SetDefaultScalingModes() noexcept {
-	_scalingModes.resize(7);
+	_scalingModes.resize(8);

 	// Lanczos
 	{
@ -1246,6 +1246,12 @@ void AppSettings::_SetDefaultScalingModes() noexcept {
 		nearest.scalingType = ::Magpie::ScalingType::Normal;
 		nearest.scale = { 2.0f,2.0f };
 	}
+	// Anime4K v2 ArtCNN
+	{
+		auto& artcnn = _scalingModes[7];
+		artcnn.name = L"Anime4K v2 ArtCNN";
+		artcnn.effects.emplace_back().name = L"ArtCNN\\Ani4Kv2_ArtCNN_C4F32_i2";
+	}

 	// 全局缩放模式默认为 Lanczos
 	_defaultProfile.scalingMode = 0;
--- a/tools/ArtCNNPorter/port_artcnn.py
+++ b/tools/ArtCNNPorter/port_artcnn.py
@ -0,0 +1,662 @@
+import os
+import re
+
+def parse_glsl_passes(file_path):
+    """Split an mpv-style GLSL user shader into its header comment and passes.
+
+    The file is cut at every ``//!DESC`` marker. Text before the first marker
+    supplies the header; each later chunk becomes one pass.
+
+    Args:
+        file_path: Path of the GLSL file, read as UTF-8.
+
+    Returns:
+        A ``(header_comment, passes)`` tuple. ``header_comment`` is the
+        newline-joined ``//`` comment lines found before the first
+        ``//!DESC``. ``passes`` is a list of dicts with the keys:
+
+        * ``'desc'``: text following ``//!DESC`` on the same line, stripped.
+        * ``'directives'``: maps each ``//!CMD value`` line to its value
+          string. ``BIND`` is collected into a list because it may repeat;
+          any other repeated command keeps its last value.
+        * ``'hook_code'``: every line from the ``vec4 hook()`` /
+          ``void hook()`` line to the end of the chunk, joined with newlines.
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # str.split always returns at least one element, so index 0 is safe.
+    pass_blocks = content.split('//!DESC')
+
+    # Keep only plain comments (license text etc.) for the header. Lines that
+    # start with '//!' are directives of the source format and are left out so
+    # they cannot end up in the generated file's header.
+    header_comment = '\n'.join(
+        l for l in pass_blocks[0].splitlines()
+        if l.startswith('//') and not l.startswith('//!')
+    )
+
+    passes = []
+    for block in pass_blocks[1:]:
+        lines = block.splitlines()
+        # An empty chunk (file ends right after '//!DESC') has no lines at all.
+        desc = lines[0].strip() if lines else ""
+
+        directives = {}
+        code_lines = []
+        in_hook = False
+
+        for line in lines[1:]:
+            line_str = line.strip()
+            if line_str.startswith('//!'):
+                parts = line_str[3:].split(maxsplit=1)
+                if not parts:
+                    # A bare '//!' carries no command; ignore it.
+                    continue
+                cmd = parts[0]
+                val = parts[1] if len(parts) > 1 else ""
+                if cmd == 'BIND':
+                    directives.setdefault('BIND', []).append(val)
+                else:
+                    directives[cmd] = val
+            elif in_hook or 'vec4 hook()' in line or 'void hook()' in line:
+                # From the hook signature onwards every non-directive line is code.
+                in_hook = True
+                code_lines.append(line)
+
+        passes.append({
+            'desc': desc,
+            'directives': directives,
+            'hook_code': '\n'.join(code_lines)
+        })
+
+    return header_comment, passes
+
+def clean_float(val_str):
+    """Return ``val_str`` exactly as given.
+
+    Placeholder for normalising float literals (for example ``1.026e-03``).
+    No rewriting is performed at present; the input string is handed back
+    untouched.
+    """
+    return val_str
+
+def translate_matrix_vector(code):
+    """Rewrite GLSL matrix/vector syntax in ``code`` into the HLSL spellings used by the port.
+
+    The rewrites are plain regular-expression substitutions applied to the
+    whole text one after another; later rules rely on the output of earlier
+    ones, so the order of the table below must not change.
+
+    Args:
+        code: GLSL source text of one hook function.
+
+    Returns:
+        The rewritten text. ``texelFetch``, ``imageStore`` and the
+        ``*_texOff`` helpers are not handled here.
+    """
+    rewrites = (
+        # `matrix(...) * vector` -> `mul(vector, matrix(...))`. The vector side may be
+        # an `inp[a][b][c]` element, a `*_texOff(vec2(...))` call, or a parenthesised
+        # sum of two such calls.
+        (
+            r'\b(mat4|f16mat4|M4|MF4x4)\s*\(([^)]+)\)\s*\*\s*(inp\[[^\]]+\]\[[^\]]+\]\[[^\]]+\]|\w+_texOff\(vec2\([^\)]+\)\)|\(\s*\w+_texOff\(vec2\([^\)]+\)\)\s*\+\s*\w+_texOff\(vec2\([^\)]+\)\)\s*\))',
+            r'mul(\3, \1(\2))',
+        ),
+        # Drop the constructor wrapped around a shared-memory load, e.g.
+        # `inp[0][y][x] = V4(conv2d_mul * texelFetch(...));`
+        #   -> `inp[0][y][x] = conv2d_mul * texelFetch(...);`
+        (
+            r'\b(inp\[\d+\]\[y\]\[x\]\s*=\s*)(?:V4|vec4|f16vec4|MF4)\((.*)\);',
+            r'\1\2;',
+        ),
+        # Built-in GLSL types.
+        (r'\bvec4\b', 'MF4'),
+        (r'\bmat4\b', 'MF4x4'),
+        (r'\bvec2\b', 'float2'),
+        (r'\bivec2\b', 'int2'),
+        (r'\bvec3\b', 'float3'),
+        # Explicit float16 types and the short aliases F / V4 / M4.
+        (r'\bf16vec4\b', 'MF4'),
+        (r'\bf16mat4\b', 'MF4x4'),
+        (r'\bfloat16_t\b', 'MF'),
+        (r'\bF\b', 'MF'),
+        (r'\bV4\b', 'MF4'),
+        (r'\bM4\b', 'MF4x4'),
+        # Work-group barrier.
+        (r'\bbarrier\(\)', 'GroupMemoryBarrierWithGroupSync()'),
+        # A vector constructor around one plain numeric literal collapses to the
+        # literal itself, e.g. `MF4(0.0)` -> `0.0`.
+        (
+            r'\b(?:V4|vec4|f16vec4|MF4|MF3|MF2|float4|float3|float2)\(([-\d.]+)\)',
+            r'\1',
+        ),
+        # `resultN += mul(v, MF4x4(...));` -> `resultN = MulAdd(v, MF4x4(...), resultN);`
+        (
+            r'\b(result\d+)\s*\+=\s*mul\(([^,]+),\s*(MF4x4\([^)]+\))\);',
+            r'\1 = MulAdd(\2, \3, \1);',
+        ),
+    )
+
+    for pattern, replacement in rewrites:
+        code = re.sub(pattern, replacement, code)
+
+    return code
+
+
+def replace_texel_fetch_robust(code):
+    """Rewrite every three-argument ``texelFetch(tex, pos, lod)`` call as ``tex.Load(int3(pos, lod))``.
+
+    Calls are located by counting parentheses rather than by a regular
+    expression, so arguments that themselves contain calls or commas are
+    split correctly.
+
+    Rewrites applied to each call:
+
+    * a trailing ``_raw`` on the texture name is removed;
+    * the no-op coordinate terms ``* int2(1, 1)``, ``+ int2(0, 0)``,
+      ``* float2(1, 1)`` and ``+ float2(0, 0)`` are dropped;
+    * the level-of-detail argument is passed through to ``Load``.
+
+    Calls that do not have exactly three top-level arguments are left as they
+    are. Scanning stops at the first call whose parentheses never close.
+
+    Args:
+        code: Shader text in which GLSL types have already been renamed
+            (the coordinate clean-up looks for ``int2`` / ``float2``).
+
+    Returns:
+        The text with the matching calls replaced.
+    """
+    marker = 'texelFetch('
+    # Coordinate terms that change nothing and only clutter the output.
+    noop_terms = (
+        r'\*\s*int2\(1,\s*1\)',
+        r'\+\s*int2\(0,\s*0\)',
+        r'\*\s*float2\(1,\s*1\)',
+        r'\+\s*float2\(0,\s*0\)',
+    )
+
+    pos = 0
+    while True:
+        idx = code.find(marker, pos)
+        if idx == -1:
+            break
+
+        # Walk forward to the parenthesis that closes this call.
+        start = idx + len(marker)
+        depth = 1
+        i = start
+        while i < len(code) and depth > 0:
+            if code[i] == '(':
+                depth += 1
+            elif code[i] == ')':
+                depth -= 1
+            i += 1
+        if depth != 0:
+            # Unbalanced parentheses: nothing after this point can be parsed.
+            break
+
+        # Split the argument text on commas that are not nested in parentheses.
+        inner = code[start:i - 1]
+        args = []
+        arg_start = 0
+        inner_depth = 0
+        for j, ch in enumerate(inner):
+            if ch == '(':
+                inner_depth += 1
+            elif ch == ')':
+                inner_depth -= 1
+            elif ch == ',' and inner_depth == 0:
+                args.append(inner[arg_start:j].strip())
+                arg_start = j + 1
+        args.append(inner[arg_start:].strip())
+
+        if len(args) != 3:
+            # Not the (texture, position, lod) form; skip past it untouched.
+            pos = i
+            continue
+
+        tex_name, pos_expr, lod = args
+        if tex_name.endswith('_raw'):
+            tex_name = tex_name[:-4]
+
+        for term in noop_terms:
+            pos_expr = re.sub(term, '', pos_expr)
+        # Removing a term can leave whitespace dangling at either end.
+        pos_expr = pos_expr.strip()
+
+        replacement = f"{tex_name}.Load(int3({pos_expr}, {lod}))"
+        code = code[:idx] + replacement + code[i:]
+        # Resume after the inserted text so it is never rescanned.
+        pos = idx + len(replacement)
+
+    return code
+
+
+def port_standard(glsl_path, hlsl_path):
+    """Convert a fragment-style (``vec4 hook()``) GLSL shader into a MagpieFX HLSL effect file.
+
+    Layout of the generated file:
+
+    * pass 1 is a fixed luma pre-pass that fills ``LUMA`` from ``INPUT``;
+    * every parsed GLSL pass except the last becomes pass 2, 3, ... in order;
+    * a fixed depth-to-space pass that reads ``T6_0`` and writes a 2x
+      ``OUTPUT`` is appended last. The last parsed GLSL pass is not
+      translated; this fixed pass is emitted in its place.
+
+    GLSL texture names are mapped to the intermediate textures declared here:
+    ``conv2d`` -> ``T0_0``, ``conv2d_X`` -> ``T0_X``, ``conv2d_X_Y`` -> ``TX_Y``.
+    ``LUMA`` keeps its name.
+
+    Args:
+        glsl_path: Path of the GLSL shader to read.
+        hlsl_path: Path of the HLSL file to write. Missing parent directories
+            are created.
+    """
+    header, passes = parse_glsl_passes(glsl_path)
+
+    def to_hlsl_tex(glsl_name, allow_luma=True):
+        # Single place for the GLSL -> HLSL texture naming scheme described in
+        # the docstring. Returns "" for names outside the scheme.
+        if glsl_name == 'LUMA':
+            return 'LUMA' if allow_luma else ""
+        if glsl_name == 'conv2d':
+            return 'T0_0'
+        if glsl_name.startswith('conv2d_'):
+            parts = glsl_name.split('_')
+            if len(parts) == 2:
+                # conv2d_X
+                return f"T0_{parts[1]}"
+            if len(parts) == 3:
+                # conv2d_X_Y
+                return f"T{parts[1]}_{parts[2]}"
+        return ""
+
+    hlsl_content = []
+    hlsl_content.append(f"// Generated from {os.path.basename(glsl_path)}")
+    hlsl_content.append(header)
+    hlsl_content.append("\n//!MAGPIE EFFECT")
+    hlsl_content.append("//!VERSION 4")
+    hlsl_content.append("//!USE MulAdd")
+    hlsl_content.append("//!CAPABILITY FP16\n")
+    hlsl_content.append('#include "../StubDefs.hlsli"\n')
+
+    # Input/output textures and samplers
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("Texture2D INPUT;\n")
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH * 2")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT * 2")
+    hlsl_content.append("Texture2D OUTPUT;\n")
+    hlsl_content.append("//!SAMPLER")
+    hlsl_content.append("//!FILTER POINT")
+    hlsl_content.append("SamplerState SP;\n")
+    hlsl_content.append("//!SAMPLER")
+    hlsl_content.append("//!FILTER LINEAR")
+    hlsl_content.append("SamplerState SL;\n")
+
+    # Colour-space matrices and the offset-sampling macro used by the conv passes
+    hlsl_content.append("//!COMMON")
+    hlsl_content.append("static const MF3x3 RY = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};")
+    hlsl_content.append("static const MF3x3 YR = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};\n")
+    hlsl_content.append("#define O(t, x, y) t.SampleLevel(SP, pos + float2(x, y) * pt, 0)\n")
+
+    # Intermediate textures
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+    hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+    hlsl_content.append("Texture2D LUMA;\n")
+
+    # Layers 0..5 have eight textures each (T0_0..T5_7); layer 6 has one (T6_0)
+    for layer in range(6):
+        for p in range(8):
+            hlsl_content.append("//!TEXTURE")
+            hlsl_content.append("//!WIDTH INPUT_WIDTH")
+            hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+            hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+            hlsl_content.append(f"Texture2D T{layer}_{p};\n")
+
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+    hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+    hlsl_content.append("Texture2D T6_0;\n")
+
+    # Pass 1: Luma pre-pass
+    hlsl_content.append("//!PASS 1")
+    hlsl_content.append("//!DESC Luma pre-pass")
+    hlsl_content.append("//!BLOCK_SIZE 8")
+    hlsl_content.append("//!NUM_THREADS 64")
+    hlsl_content.append("//!IN INPUT")
+    hlsl_content.append("//!OUT LUMA")
+    hlsl_content.append("""void Pass1(uint2 blockStart, uint3 tid) {
+	uint2 gxy = Rmp8x8(tid.x) + blockStart;
+	uint2 sz = GetInputSize();
+	if (gxy.x >= sz.x || gxy.y >= sz.y)
+		return;
+	float2 pt = float2(GetInputPt());
+	float2 pos = (gxy + 0.5) * pt;
+	MF3 color = INPUT.SampleLevel(SP, pos, 0).rgb;
+	LUMA[gxy] = MF4(dot(MF3(0.299, 0.587, 0.114), color), 0.0, 0.0, 0.0);
+}
+""")
+
+    # Coordinate prologue inserted at the top of every translated pass
+    coord_block = """\tuint2 gxy = Rmp8x8(tid.x) + blockStart;
+	uint2 sz = GetInputSize();
+	if (gxy.x >= sz.x || gxy.y >= sz.y)
+		return;
+	float2 pt = float2(GetInputPt());
+	float2 pos = (gxy + 0.5) * pt;
+"""
+
+    # Port every parsed pass except the last; numbering starts at 2 because
+    # pass 1 is the luma pre-pass above.
+    conv_passes = passes[:-1]
+    for idx, p in enumerate(conv_passes):
+        pass_num = idx + 2
+        desc = p['desc']
+        directives = p['directives']
+        hook_code = p['hook_code']
+
+        # Output texture, e.g. conv2d_0 -> T0_0, conv2d_1_0 -> T1_0
+        out_tex = to_hlsl_tex(directives.get('SAVE', ''), allow_luma=False)
+
+        # Input textures: names outside the naming scheme are dropped,
+        # duplicates are removed and the result is sorted.
+        in_bindings = []
+        for bind_val in directives.get('BIND', []):
+            mapped = to_hlsl_tex(bind_val)
+            if mapped:
+                in_bindings.append(mapped)
+        in_bindings_str = ", ".join(sorted(set(in_bindings)))
+
+        hlsl_content.append(f"//!PASS {pass_num}")
+        hlsl_content.append(f"//!DESC {desc}")
+        hlsl_content.append("//!BLOCK_SIZE 8")
+        hlsl_content.append("//!NUM_THREADS 64")
+        hlsl_content.append(f"//!IN {in_bindings_str}")
+        hlsl_content.append(f"//!OUT {out_tex}\n")
+
+        translated_code = translate_matrix_vector(hook_code)
+
+        # Replace texture offset functions with the O(t, x, y) macro, e.g.
+        #   LUMA_texOff(float2(-1, -1))      -> O(LUMA, -1, -1)
+        #   conv2d_5_5_texOff(float2(0, -1)) -> O(T5_5, 0, -1)
+        # (vec2 has already been renamed to float2 by translate_matrix_vector.)
+        def repl_tex_off(match):
+            return f"O({to_hlsl_tex(match.group(1))}, {match.group(2)}, {match.group(3)})"
+
+        translated_code = re.sub(r'(\b\w+)_texOff\(float2\(([-\d]+),\s*([-\d]+)\)\)', repl_tex_off, translated_code)
+
+        # Rewrite the hook function signature to match MagpieFX style
+        func_sig = f"void Pass{pass_num}(uint2 blockStart, uint3 tid) {{"
+        translated_code = translated_code.replace("MF4 hook() {", func_sig)
+
+        # The GLSL hook returns its value; the generated pass stores it into
+        # the output texture at gxy instead.
+        translated_code = re.sub(r'return\s+max\((result),\s*MF4\(0\.0\)\);', rf'{out_tex}[gxy] = max(\1, 0.0);', translated_code)
+        translated_code = re.sub(r'return\s+max\((result),\s*0\.0\);', rf'{out_tex}[gxy] = max(\1, 0.0);', translated_code)
+        translated_code = re.sub(r'return\s+(result);', rf'{out_tex}[gxy] = \1;', translated_code)
+
+        # Insert the coordinate prologue right after the opening brace
+        func_start = translated_code.find(func_sig)
+        if func_start != -1:
+            brace_pos = translated_code.find("{", func_start)
+            translated_code = translated_code[:brace_pos+1] + "\n" + coord_block + translated_code[brace_pos+1:]
+
+        hlsl_content.append(translated_code)
+        hlsl_content.append("\n")
+
+    # Last pass: depth-to-space. Its number follows the translated passes
+    # (51 when 49 passes were translated) instead of being fixed.
+    final_pass = len(conv_passes) + 2
+    hlsl_content.append(f"//!PASS {final_pass}")
+    hlsl_content.append("//!DESC Depth-To-Space")
+    hlsl_content.append("//!BLOCK_SIZE 16")
+    hlsl_content.append("//!NUM_THREADS 64")
+    hlsl_content.append("//!IN INPUT, T6_0")
+    hlsl_content.append("//!OUT OUTPUT\n")
+    hlsl_content.append(f"void Pass{final_pass}(uint2 blockStart, uint3 tid) {{" + """
+	float2 pt = float2(GetInputPt());
+	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
+	uint2 sz = GetOutputSize();
+	if (gxy.x >= sz.x || gxy.y >= sz.y)
+		return;
+
+	MF4 channels = T6_0.Load(int3(gxy >> 1, 0));
+	float2 opt = float2(GetOutputPt());
+
+	float2 pos;
+	MF3 rgb;
+	MF3 yuv;
+
+	// (0, 0)
+	pos = (float2(gxy) + float2(0.5, 0.5)) * opt;
+	rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+	yuv = mul(RY, rgb);
+	yuv.r = saturate(channels.x);
+	OUTPUT[gxy + int2(0, 0)] = MF4(mul(YR, yuv), 1.0);
+
+	// (1, 0)
+	if (gxy.x + 1 < sz.x) {
+		pos = (float2(gxy) + float2(1.5, 0.5)) * opt;
+		rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+		yuv = mul(RY, rgb);
+		yuv.r = saturate(channels.y);
+		OUTPUT[gxy + int2(1, 0)] = MF4(mul(YR, yuv), 1.0);
+	}
+
+	// (0, 1)
+	if (gxy.y + 1 < sz.y) {
+		pos = (float2(gxy) + float2(0.5, 1.5)) * opt;
+		rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+		yuv = mul(RY, rgb);
+		yuv.r = saturate(channels.z);
+		OUTPUT[gxy + int2(0, 1)] = MF4(mul(YR, yuv), 1.0);
+	}
+
+	// (1, 1)
+	if (gxy.x + 1 < sz.x && gxy.y + 1 < sz.y) {
+		pos = (float2(gxy) + float2(1.5, 1.5)) * opt;
+		rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+		yuv = mul(RY, rgb);
+		yuv.r = saturate(channels.w);
+		OUTPUT[gxy + int2(1, 1)] = MF4(mul(YR, yuv), 1.0);
+	}
+}
+""")
+
+    # Write to file. dirname() is '' for a bare file name, and makedirs('')
+    # raises, so only create directories when there is one to create.
+    out_dir = os.path.dirname(hlsl_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(hlsl_path, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(hlsl_content))
+    print(f"Successfully generated {hlsl_path}")
+
+def port_cmp(glsl_path, hlsl_path):
+    """Convert a compute-style (``void hook()``) GLSL shader into a MagpieFX HLSL effect file.
+
+    Layout of the generated file:
+
+    * pass 1 is a fixed luma pre-pass that fills ``LUMA`` from ``INPUT``;
+    * every parsed GLSL pass except the last becomes pass 2, 3, ... in order.
+
+    Passes that save to anything other than ``conv2d_6`` write into a texture
+    declared 8x wider than the input. The pass that saves to ``conv2d_6`` is
+    merged with depth-to-space: it does not store ``conv2d_6`` but writes the
+    2x ``OUTPUT`` directly and therefore also binds ``INPUT``.
+
+    Args:
+        glsl_path: Path of the GLSL shader to read.
+        hlsl_path: Path of the HLSL file to write. Missing parent directories
+            are created.
+
+    Raises:
+        ValueError: If a thread-group dimension taken from ``//!COMPUTE`` is
+            not an integer.
+    """
+    header, passes = parse_glsl_passes(glsl_path)
+
+    hlsl_content = []
+    hlsl_content.append(f"// Generated from {os.path.basename(glsl_path)}")
+    hlsl_content.append(header)
+    hlsl_content.append("\n//!MAGPIE EFFECT")
+    hlsl_content.append("//!VERSION 4")
+    hlsl_content.append("//!USE MulAdd")
+    hlsl_content.append("//!CAPABILITY FP16\n")
+    hlsl_content.append('#include "../StubDefs.hlsli"\n')
+
+    # Input/output textures and samplers
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("Texture2D INPUT;\n")
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH * 2")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT * 2")
+    hlsl_content.append("Texture2D OUTPUT;\n")
+    hlsl_content.append("//!SAMPLER")
+    hlsl_content.append("//!FILTER POINT")
+    hlsl_content.append("SamplerState SP;\n")
+    hlsl_content.append("//!SAMPLER")
+    hlsl_content.append("//!FILTER LINEAR")
+    hlsl_content.append("SamplerState SL;\n")
+
+    # Colour-space matrices, plus the *_mul names referenced by the GLSL code
+    # (all defined as 1.0 here)
+    hlsl_content.append("//!COMMON")
+    hlsl_content.append("static const MF3x3 RY = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};")
+    hlsl_content.append("static const MF3x3 YR = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};")
+    hlsl_content.append("#define LUMA_mul 1.0f")
+    hlsl_content.append("#define conv2d_mul 1.0f")
+    hlsl_content.append("#define conv2d_1_mul 1.0f")
+    hlsl_content.append("#define conv2d_2_mul 1.0f")
+    hlsl_content.append("#define conv2d_3_mul 1.0f")
+    hlsl_content.append("#define conv2d_4_mul 1.0f")
+    hlsl_content.append("#define conv2d_5_mul 1.0f\n")
+
+    # Intermediate textures
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+    hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+    hlsl_content.append("Texture2D LUMA;\n")
+
+    # conv2d to conv2d_5 are horizontally packed 8x wider
+    for layer in range(6):
+        name = "conv2d" if layer == 0 else f"conv2d_{layer}"
+        hlsl_content.append("//!TEXTURE")
+        hlsl_content.append("//!WIDTH INPUT_WIDTH * 8")
+        hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+        hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+        hlsl_content.append(f"Texture2D {name};\n")
+
+    hlsl_content.append("//!TEXTURE")
+    hlsl_content.append("//!WIDTH INPUT_WIDTH")
+    hlsl_content.append("//!HEIGHT INPUT_HEIGHT")
+    hlsl_content.append("//!FORMAT R16G16B16A16_FLOAT")
+    hlsl_content.append("Texture2D conv2d_6;\n")
+
+    # Pass 1: Luma pre-pass
+    hlsl_content.append("//!PASS 1")
+    hlsl_content.append("//!DESC Luma pre-pass")
+    hlsl_content.append("//!BLOCK_SIZE 8")
+    hlsl_content.append("//!NUM_THREADS 64")
+    hlsl_content.append("//!IN INPUT")
+    hlsl_content.append("//!OUT LUMA")
+    hlsl_content.append("""void Pass1(uint2 blockStart, uint3 tid) {
+	uint2 gxy = Rmp8x8(tid.x) + blockStart;
+	uint2 sz = GetInputSize();
+	if (gxy.x >= sz.x || gxy.y >= sz.y)
+		return;
+	float2 pt = float2(GetInputPt());
+	float2 pos = (gxy + 0.5) * pt;
+	MF3 color = INPUT.SampleLevel(SP, pos, 0).rgb;
+	LUMA[gxy] = MF4(dot(MF3(0.299, 0.587, 0.114), color), 0.0, 0.0, 0.0);
+}
+""")
+
+    # Depth-to-space epilogue that replaces imageStore in the conv2d_6 pass:
+    # the four components of result0 become the luma of a 2x2 output quad,
+    # with the remaining colour information sampled from INPUT.
+    subpixel_code = """
+    uint2 dest_1x = base + tid.xy;
+    uint2 sz = GetOutputSize();
+    uint2 gxy = dest_1x << 1;
+    
+    if (gxy.x < sz.x && gxy.y < sz.y) {
+        float2 opt = float2(GetOutputPt());
+        float2 pos;
+        MF3 rgb;
+        MF3 yuv;
+
+        // (0, 0)
+        pos = (float2(gxy) + float2(0.5, 0.5)) * opt;
+        rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+        yuv = mul(RY, rgb);
+        yuv.r = saturate(result0.x);
+        OUTPUT[gxy + int2(0, 0)] = MF4(mul(YR, yuv), 1.0);
+
+        // (1, 0)
+        if (gxy.x + 1 < sz.x) {
+            pos = (float2(gxy) + float2(1.5, 0.5)) * opt;
+            rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+            yuv = mul(RY, rgb);
+            yuv.r = saturate(result0.y);
+            OUTPUT[gxy + int2(1, 0)] = MF4(mul(YR, yuv), 1.0);
+        }
+
+        // (0, 1)
+        if (gxy.y + 1 < sz.y) {
+            pos = (float2(gxy) + float2(0.5, 1.5)) * opt;
+            rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+            yuv = mul(RY, rgb);
+            yuv.r = saturate(result0.z);
+            OUTPUT[gxy + int2(0, 1)] = MF4(mul(YR, yuv), 1.0);
+        }
+
+        // (1, 1)
+        if (gxy.x + 1 < sz.x && gxy.y + 1 < sz.y) {
+            pos = (float2(gxy) + float2(1.5, 1.5)) * opt;
+            rgb = INPUT.SampleLevel(SL, pos, 0).rgb;
+            yuv = mul(RY, rgb);
+            yuv.r = saturate(result0.w);
+            OUTPUT[gxy + int2(1, 1)] = MF4(mul(YR, yuv), 1.0);
+        }
+    }"""
+
+    # Port every parsed pass except the last; numbering starts at 2 because
+    # pass 1 is the luma pre-pass above.
+    for idx, p in enumerate(passes[:-1]):
+        pass_num = idx + 2
+        desc = p['desc']
+        directives = p['directives']
+        hook_code = p['hook_code']
+
+        save_target = directives.get('SAVE', '')
+        is_final_conv = save_target == 'conv2d_6'
+
+        # Input bindings keep their GLSL names; the merged final pass also
+        # samples INPUT. Duplicates are removed and the result is sorted.
+        in_bindings = list(directives.get('BIND', []))
+        if is_final_conv:
+            in_bindings.append('INPUT')
+        in_bindings_str = ", ".join(sorted(set(in_bindings)))
+
+        hlsl_content.append(f"//!PASS {pass_num}")
+        hlsl_content.append(f"//!DESC {desc}")
+
+        # Thread-group size from `//!COMPUTE bw bh [tw th]`. When tw/th are
+        # omitted the block size is used as the group size; with no usable
+        # directive fall back to 2x16.
+        comp_parts = directives.get('COMPUTE', '').split()
+        if len(comp_parts) >= 4:
+            tx, ty = comp_parts[2], comp_parts[3]
+        elif len(comp_parts) >= 2:
+            tx, ty = comp_parts[0], comp_parts[1]
+        else:
+            tx, ty = '2', '16'
+
+        # Override dimensions for conv2d_6 to prevent shared memory overflow (limit is 32KB on cs_5_0)
+        if is_final_conv:
+            tx = '16'
+            ty = '8'
+
+        # BLOCK_SIZE is in output-texture coordinates and has to cover exactly
+        # the pixels one thread group produces, because `base` below is
+        # derived from blockStart with the same factors:
+        #   merged final pass: one thread -> 2x2 output pixels
+        #   packed passes:     one thread -> 8x1 texels of the 8x-wide texture
+        if is_final_conv:
+            block_w, block_h = int(tx) * 2, int(ty) * 2
+        else:
+            block_w, block_h = int(tx) * 8, int(ty)
+        hlsl_content.append(f"//!BLOCK_SIZE {block_w}, {block_h}")
+        hlsl_content.append(f"//!NUM_THREADS {tx}, {ty}")
+
+        hlsl_content.append(f"//!IN {in_bindings_str}")
+        if is_final_conv:
+            hlsl_content.append("//!OUT OUTPUT\n")
+        else:
+            hlsl_content.append(f"//!OUT {save_target}\n")
+
+        # Work-group origin in 1x coordinates, recovered from blockStart
+        if is_final_conv:
+            base_def = "uint2 base = blockStart >> 1;"
+        else:
+            base_def = "uint2 base = uint2(blockStart.x / 8, blockStart.y);"
+
+        hook_code = re.sub(
+            r'ivec2\s+base\s+=\s+ivec2\(gl_WorkGroupID\)\s*\*\s*ivec2\(gl_WorkGroupSize\);',
+            base_def,
+            hook_code
+        )
+
+        # Translate matrix, vector, and barrier
+        translated_code = translate_matrix_vector(hook_code)
+
+        # gl_LocalInvocationID -> tid
+        translated_code = re.sub(r'\bgl_LocalInvocationID\b', 'tid', translated_code)
+
+        # gl_WorkGroupSize (component-wise first, then the whole vector)
+        translated_code = re.sub(r'\bgl_WorkGroupSize\.x\b', tx, translated_code)
+        translated_code = re.sub(r'\bgl_WorkGroupSize\.y\b', ty, translated_code)
+        translated_code = re.sub(r'\bgl_WorkGroupSize\b', f'uint2({tx}, {ty})', translated_code)
+
+        # gl_GlobalInvocationID is the group origin plus the local id
+        translated_code = re.sub(r'\bgl_GlobalInvocationID\b', '(base + tid.xy)', translated_code)
+
+        # Replace texelFetch calls using the parenthesis-aware parser
+        translated_code = replace_texel_fetch_robust(translated_code)
+
+        # imageStore(out_image, pos, value);
+        #   final pass  -> depth-to-space epilogue writing OUTPUT
+        #   other passes -> <save_target>[pos] = value;
+        if is_final_conv:
+            # A callable replacement inserts the block verbatim; re.sub does
+            # not scan it for backslash escapes or group references.
+            translated_code = re.sub(r'imageStore\(out_image,\s*(.*?),\s*(.*?)\);', lambda _m: subpixel_code, translated_code)
+        else:
+            translated_code = re.sub(r'imageStore\(out_image,\s*(.*?),\s*(.*?)\);', rf'{save_target}[\1] = \2;', translated_code)
+
+        # Globals emitted above the function: 3x3 kernel constants and the
+        # shared tile, which is the thread group plus a one-pixel border.
+        isize_x = int(tx) + 2
+        isize_y = int(ty) + 2
+
+        if save_target == 'conv2d':
+            # The first layer declares a single scalar plane
+            inp_decl = f"groupshared MF inp[1][{isize_y}][{isize_x}];"
+        else:
+            inp_decl = f"groupshared MF4 inp[8][{isize_y}][{isize_x}];"
+
+        global_decl = f"static const int2 ksize = int2(3, 3);\nstatic const int2 offset = int2(1, 1);\nstatic const uint2 isize = uint2({isize_x}, {isize_y});\n{inp_decl}\n"
+
+        # Rewrite the hook function signature to match MagpieFX style
+        func_sig = f"void Pass{pass_num}(uint2 blockStart, uint3 tid) {{"
+        translated_code = translated_code.replace("void hook() {", func_sig)
+
+        hlsl_content.append(global_decl)
+        hlsl_content.append(translated_code)
+        hlsl_content.append("\n")
+
+    # Write to file. dirname() is '' for a bare file name, and makedirs('')
+    # raises, so only create directories when there is one to create.
+    out_dir = os.path.dirname(hlsl_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(hlsl_path, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(hlsl_content))
+    print(f"Successfully generated {hlsl_path}")
+
+if __name__ == "__main__":
+    # Usage: port_artcnn.py [input.glsl [output.hlsl]]
+    # Without arguments the original locations below are used, so existing
+    # invocations behave as before.
+    import sys
+
+    default_glsl_cmp = r"C:\Users\xiong\Desktop\APP\mpv\mpv\shaders\Ani4Kv2_ArtCNN_C4F32_i2_CMP.glsl"
+
+    # Output path inside workspace (relative to the current directory)
+    default_hlsl_cmp = r"src\Effects\ArtCNN\Ani4Kv2_ArtCNN_C4F32_i2.hlsl"
+
+    glsl_cmp = sys.argv[1] if len(sys.argv) > 1 else default_glsl_cmp
+    hlsl_cmp = sys.argv[2] if len(sys.argv) > 2 else default_hlsl_cmp
+
+    # Convert CMP
+    port_cmp(glsl_cmp, hlsl_cmp)
+