mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
* renderer/amd: move in tree * fix paths in tests * 24000 lines * no delete for amd files
11524 lines
554 KiB
Python
11524 lines
554 KiB
Python
from tinygrad.runtime.autogen.amd.cdna.ins import *
|
|
from tinygrad.dtype import dtypes
|
|
|
|
# CDNA encodes M0 as operand 124, the same code RDNA uses for NULL, so NULL is reused as M0 here
|
|
M0 = NULL
|
|
|
|
# Per-shape constants read by build_kernel: (M, N, K) -> (numWG, iters, total).
# Every entry listed here satisfies iters == K // 64 and total == (M // 256) * (N // 256) * iters.
GEMM_ARGS = {
  (8192,  4096,  4096): (256,  64,  32768),
  (8192, 14336,  4096): (256,  64, 114688),
  (8192,  4096, 14336): (256, 224, 114688),
  # TODO: get a fast gemm for this shape
  #(8192, 128256, 4096): (16032, 64, 1026048),
  (8192,  8192,  8192): (256, 128, 131072),
  (4096,  4096,  4096): (256,  64,  16384),
  (4096, 14336,  4096): (256,  64,  57344),
  (4096, 14336,  8192): (256, 128, 114688),
  (4096,  4096, 14336): (256, 224,  57344),
  (14336, 4096,  8192): (256, 128, 114688),
  (4096,  8192, 14336): (256, 224, 114688),
  (4096,  4096,  8192): (256, 128,  32768),
  (4096,  8192,  4096): (256,  64,  32768),
}

# iters -> (magic, shift), the pair build_kernel loads into s[47] and s[48] to divide by iters without a divide:
#   quotient = (mul_hi(x, magic) + x * (shift >> 31)) >> (shift & 0x7fffffff)
# i.e. bit 31 of shift selects adding x back before the final right shift.
ITERS_ARGS = {
  64: (1 << 26, 0),
  128: (1 << 25, 0),
  224: (613566757, (1 << 31) | 8),
}
|
|
|
|
class Kernel:
  """Collects the instructions and labels of one kernel and renders them as assembler source or a readable listing.

  Instruction objects are used duck-typed: this class calls ``size()`` and ``to_bytes()`` on them, reads
  ``op_name``, and writes the ``_target``, ``_pos`` and ``simm16`` attributes.
  """

  def __init__(self, name="gemm"):
    self.name = name          # symbol name used in the generated assembly and metadata
    self.instructions = []    # emitted instructions, in program order
    self.labels = {}          # label name -> byte offset at which it was defined
    self.label_at_pos = {}    # byte offset -> most recently defined label at that offset
    self.pos = 0              # byte offset at which the next emitted instruction starts

  def label(self, name):
    """Define label ``name`` at the current byte offset (the start of the next emitted instruction)."""
    self.labels[name] = self.pos
    self.label_at_pos[self.pos] = name

  def emit(self, inst, target=None):
    """Append ``inst`` to the kernel and return it.

    ``target`` is the name of the label a branch jumps to (it may be defined later); leave it as None for
    instructions that are not patched by ``to_asm``.
    """
    inst._target, inst._pos = target, self.pos
    self.instructions.append(inst)
    self.pos += inst.size()
    return inst

  def waitcnt(self, lgkm=None, vm=None):
    """Emit an ``s_waitcnt``. A counter left as None gets its all-ones field value (vm=63, lgkm=15); expcnt is always 7."""
    vmcnt = 63 if vm is None else vm
    lgkmcnt = 15 if lgkm is None else lgkm
    expcnt = 7
    # immediate layout built here: vmcnt[3:0] -> bits 3:0, expcnt -> bits 6:4, lgkmcnt -> bits 11:8, vmcnt[5:4] -> bits 15:14
    imm = (vmcnt & 0xF) | ((expcnt & 0x7) << 4) | ((lgkmcnt & 0xF) << 8) | (((vmcnt >> 4) & 0x3) << 14)
    self.emit(s_waitcnt(imm))

  def to_asm(self):
    """Resolve branch targets and return the kernel as assembler source text.

    Side effect: sets ``simm16`` on every instruction that was emitted with a target.
    Raises KeyError if an instruction targets a label that was never defined.
    """
    # patch branches: the offset is counted in 4-byte units from the end of the branch instruction
    for inst in self.instructions:
      if inst._target is None: continue
      if inst._target not in self.labels:
        raise KeyError(f"branch to undefined label {inst._target!r} in kernel {self.name!r}")
      inst.simm16 = (self.labels[inst._target] - inst._pos - inst.size()) // 4

    # the machine code is embedded as raw bytes, 16 per .byte directive
    inst_bytes = b"".join(inst.to_bytes() for inst in self.instructions)
    body = "\n".join(" .byte " + ",".join(f"0x{b:02x}" for b in inst_bytes[i:i+16]) for i in range(0, len(inst_bytes), 16))

    # values that must agree between the .amdhsa_ directives and the metadata document
    arg_names = ['C', 'A', 'B']
    group_segment_size, kernarg_size = 133120, 8 * len(arg_names)
    hsa = [('group_segment_fixed_size', group_segment_size), ('private_segment_fixed_size', 0), ('kernarg_size', kernarg_size),
           ('next_free_vgpr', 512), ('next_free_sgpr', 96), ('system_sgpr_workgroup_id_x', 1),
           ('system_sgpr_workgroup_id_y', 1), ('system_sgpr_workgroup_id_z', 1), ('user_sgpr_kernarg_segment_ptr', 1),
           ('user_sgpr_count', 2), ('user_sgpr_kernarg_preload_length', 0), ('user_sgpr_kernarg_preload_offset', 0),
           ('accum_offset', 256), ('uses_dynamic_stack', 0), ('tg_split', 0), ('float_round_mode_32', 0),
           ('float_round_mode_16_64', 0), ('float_denorm_mode_32', 3), ('float_denorm_mode_16_64', 3),
           ('ieee_mode', 1), ('fp16_overflow', 0), ('dx10_clamp', 1)]

    # The metadata is YAML, so nesting is significant: each argument is an item of the kernel's .args list,
    # and the remaining keys belong to the same kernel mapping as .args.
    args = '\n'.join(f'      - .address_space: generic\n        .name: {arg}\n        .offset: {i*8}\n'
                     f'        .size: 8\n        .value_kind: global_buffer' for i, arg in enumerate(arg_names))
    n = self.name
    return '\n'.join([
      '.text', '.section\t.text.', f'.global\t{n}', '.p2align\t8', f'.type\t{n},@function', '', f'{n}:', body, '',
      '.section .rodata,"a",@progbits', '.p2align 6, 0x0', f'.amdhsa_kernel {n}',
      *[f' .amdhsa_{key} {val}' for key, val in hsa], '.end_amdhsa_kernel', '',
      '.amdgpu_metadata', '---', 'amdhsa.kernels:', '  - .args:', args,
      f'    .group_segment_fixed_size: {group_segment_size}', '    .kernarg_segment_align: 8',
      f'    .kernarg_segment_size: {kernarg_size}', '    .max_flat_workgroup_size: 256', f'    .name: {n}',
      '    .private_segment_fixed_size: 0', '    .sgpr_count: 95', '    .sgpr_spill_count: 0', f'    .symbol: {n}.kd',
      '    .vgpr_count: 249', '    .vgpr_spill_count: 0', '    .wavefront_size: 64', 'amdhsa.version:', '  - 1',
      '  - 1', '...', '.end_amdgpu_metadata', ''])

  def to_text(self) -> str:
    """Return a readable listing of the kernel: label lines followed by one line per instruction.

    Instructions emitted with a target are shown as ``<op_name lowercased> <label>``; all others are
    disassembled. Every name in ``self.labels`` is printed, including several labels that share one
    offset and labels placed after the last instruction.
    """
    # group by offset from self.labels, because label_at_pos keeps only one name per offset
    names_at = {}
    for name, at in self.labels.items(): names_at.setdefault(at, []).append(name)

    disasm = None  # imported on first use, so listings that need no disassembly do not require the test package
    lines, pos = [], 0
    for inst in self.instructions:
      lines.extend(f"{name}:" for name in names_at.get(pos, ()))
      if inst._target is not None:
        lines.append(f" {inst.op_name.lower()} {inst._target}")
      else:
        if disasm is None: from test.amd.disasm import disasm
        lines.append(f" {disasm(inst)}")
      pos += inst.size()
    # labels defined after the final instruction
    lines.extend(f"{name}:" for name in names_at.get(pos, ()))
    return "\n".join(lines)
|
|
|
|
def build_kernel(batch, M, N, K, dtype):
|
|
numWG, iters, total = GEMM_ARGS[(M, N, K)]
|
|
total *= batch
|
|
magic, shift = ITERS_ARGS[iters]
|
|
v_mfma_16x16x32 = {dtypes.half:v_mfma_f32_16x16x32_f16, dtypes.bfloat16:v_mfma_f32_16x16x32_bf16}[dtype]
|
|
v_cvt_pk = {dtypes.half:v_cvt_pk_f16_f32, dtypes.bfloat16:v_cvt_pk_bf16_f32}[dtype]
|
|
v_cvt = {dtypes.half:v_cvt_f32_f16_e32, dtypes.bfloat16:v_cvt_f32_bf16_e32}[dtype]
|
|
k = Kernel(f"gemm_{batch}_{M}_{N}_{K}")
|
|
# load D, A, B pointers
|
|
k.emit(s_load_dwordx2(s[24:25], s[0:1], s[0], 0, 0, 0, 0, 1))
|
|
k.emit(s_load_dwordx2(s[30:31], s[0:1], s[0], 8, 0, 0, 0, 1))
|
|
k.emit(s_load_dwordx2(s[28:29], s[0:1], s[0], 16, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=0)
|
|
# params as constants
|
|
k.emit(s_mov_b32(s[69], numWG))
|
|
k.emit(s_mov_b32(s[20], N))
|
|
k.emit(s_mov_b32(s[21], batch * M))
|
|
k.emit(s_mov_b32(s[22], 1))
|
|
k.emit(s_mov_b32(s[23], K))
|
|
k.emit(s_mov_b32(s[36], N))
|
|
k.emit(s_mov_b32(s[37], 0))
|
|
k.emit(s_mov_b32(s[40], N))
|
|
k.emit(s_mov_b32(s[41], 0))
|
|
k.emit(s_mov_b32(s[42], K))
|
|
k.emit(s_mov_b32(s[43], 0))
|
|
k.emit(s_mov_b32(s[46], iters))
|
|
k.emit(s_mov_b32(s[47], magic))
|
|
k.emit(s_mov_b32(s[48], shift))
|
|
k.emit(s_mov_b32(s[49], total))
|
|
k.emit(s_mov_b32(s[62], 0))
|
|
k.emit(s_mov_b32(s[68], 0))
|
|
# kernel size is 256x256
|
|
k.emit(s_mov_b32(s[51], 256)); k.emit(s_mov_b32(s[52], 256))
|
|
k.emit(s_mov_b32(s[38], s[36]))
|
|
k.emit(s_mov_b32(s[39], s[37]))
|
|
k.emit(s_mov_b64(s[26:27], s[24:25]))
|
|
k.emit(s_and_b32(s[6], s[68], 4294901760))
|
|
k.emit(s_lshr_b32(s[6], s[6], 16))
|
|
k.emit(s_mov_b32(s[63], 0))
|
|
k.emit(s_setprio(3))
|
|
k.emit(s_mov_b32(M0, 133120))
|
|
k.emit(v_mov_b32_e32(v[180], v[0]))
|
|
# XCCG=256
|
|
# labels are named based on function:
|
|
# PGR = Prefetch Global Read (the global→LDS pipeline stage)
|
|
# SK = Stream-K (work partitioning by K-iterations, not tiles)
|
|
# WGM = WorkGroup Mapping (tile assignment scheme for cache locality)
|
|
# GLVW = Global Load Vector Width (edge tile width handling)
|
|
# BM0 = Block M offset 0 (register block position)
|
|
# OrdNLL = Ordered No-Load-Loop (final iteration without prefetch loads)
|
|
k.emit(s_mov_b32(s[75], 256))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[75]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[2]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75]))
|
|
k.emit(v_sub_u32_e32(v[19], s[2], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[75]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(v_mov_b32_e32(v[19], 0))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[75]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75]))
|
|
k.emit(v_sub_u32_e32(v[19], s[2], v[19]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[71], v[18]))
|
|
k.emit(v_readfirstlane_b32_e32(v[72], v[19]))
|
|
k.emit(s_mul_i32(s[71], s[71], s[75]))
|
|
k.emit(s_lshr_b32(s[72], s[72], 1))
|
|
k.emit(s_add_u32(s[71], s[71], s[72]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[75]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[69]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[75]))
|
|
k.emit(v_sub_u32_e32(v[19], s[69], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[75]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[75]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[72], v[18]))
|
|
k.emit(s_mul_i32(s[72], s[72], s[75]))
|
|
k.emit(s_sub_u32(s[73], s[69], s[72]))
|
|
k.emit(s_cmp_gt_u32(s[2], s[72]))
|
|
k.emit(s_cselect_b32(s[72], s[73], s[75]))
|
|
k.emit(s_lshr_b32(s[72], s[72], 1))
|
|
k.emit(s_bfm_b32(s[73], 1, 0))
|
|
k.emit(s_and_b32(s[73], s[2], s[73]))
|
|
k.emit(s_mul_i32(s[72], s[72], s[73]))
|
|
k.emit(s_add_u32(s[2], s[71], s[72]))
|
|
k.label('skip_WGMXCC')
|
|
k.emit(v_mov_b32_e32(v[20], 256))
|
|
k.emit(v_mov_b32_e32(v[19], s[20]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], v[20]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[21], v[19]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[21]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e32(v[21], v[18], v[20]))
|
|
k.emit(v_sub_u32_e32(v[21], v[19], v[21]))
|
|
k.emit(v_cmp_ne_u32_e64(VCC, v[21], 0))
|
|
k.emit(v_addc_co_u32(v[18], VCC, v[18], 0, VCC))
|
|
k.emit(v_mov_b32_e32(v[20], 256))
|
|
k.emit(v_mov_b32_e32(v[19], s[21]))
|
|
k.emit(v_readfirstlane_b32_e32(v[10], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], v[20]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[21], v[19]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[21]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e32(v[21], v[18], v[20]))
|
|
k.emit(v_sub_u32_e32(v[21], v[19], v[21]))
|
|
k.emit(v_cmp_ne_u32_e64(VCC, v[21], 0))
|
|
k.emit(v_addc_co_u32(v[18], VCC, v[18], 0, VCC))
|
|
k.emit(s_nop())
|
|
k.emit(v_readfirstlane_b32_e32(v[11], v[18]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_mov_b32(s[85], 84148480))
|
|
k.emit(s_mov_b32(s[86], 117834498))
|
|
k.emit(s_sub_u32(s[28], s[28], 16))
|
|
k.emit(s_subb_u32(s[29], s[29], 0))
|
|
k.emit(s_sub_u32(s[30], s[30], 16))
|
|
k.emit(s_subb_u32(s[31], s[31], 0))
|
|
k.label('AlphaNonZero')
|
|
k.emit(s_mov_b32(s[57], s[2]))
|
|
k.emit(s_mul_i32(s[58], s[57], s[46]))
|
|
k.emit(s_mov_b32(s[59], s[49]))
|
|
k.emit(s_mul_i32(s[87], s[52], s[46]))
|
|
k.emit(s_cmp_lt_u32(s[87], s[49]))
|
|
k.emit(s_cbranch_scc1(), target='SK_InitDone')
|
|
k.emit(s_mul_i32(s[87], s[52], s[46]))
|
|
k.emit(s_mul_i32(s[88], s[46], s[51]))
|
|
k.emit(s_sub_u32(s[87], s[87], s[88]))
|
|
k.emit(s_mul_i32(s[58], s[57], s[46]))
|
|
k.emit(s_add_u32(s[58], s[58], s[87]))
|
|
k.emit(s_add_u32(s[59], s[58], s[46]))
|
|
k.emit(s_add_u32(s[89], s[46], 1))
|
|
k.emit(s_mul_i32(s[88], s[57], s[89]))
|
|
k.emit(s_add_u32(s[89], s[88], s[89]))
|
|
k.emit(s_cmp_lt_u32(s[57], s[87]))
|
|
k.emit(s_cselect_b32(s[58], s[88], s[58]))
|
|
k.emit(s_cselect_b32(s[59], s[89], s[59]))
|
|
k.emit(s_mul_i32(s[87], s[52], s[46]))
|
|
k.emit(s_min_u32(s[59], s[59], s[87]))
|
|
k.label('SK_InitDone')
|
|
k.emit(s_cmp_ge_u32(s[58], s[49]))
|
|
k.emit(s_cbranch_scc1(), target='KernelEnd')
|
|
k.label('PersistentLoopStart')
|
|
k.emit(v_xor_b32_e32(v[18], v[178], v[16]))
|
|
k.emit(v_min_i32_e32(v[16], v[16], v[18]))
|
|
k.emit(v_xor_b32_e32(v[18], v[179], v[17]))
|
|
k.emit(v_min_i32_e32(v[17], v[17], v[18]))
|
|
k.emit(s_mul_hi_u32(s[89], s[58], s[47]))
|
|
k.emit(s_lshr_b32(s[90], s[48], 31))
|
|
k.emit(s_mul_i32(s[88], s[58], s[90]))
|
|
k.emit(s_add_u32(s[88], s[88], s[89]))
|
|
k.emit(s_and_b32(s[90], s[48], 2147483647))
|
|
k.emit(s_lshr_b32(s[88], s[88], s[90]))
|
|
k.emit(s_mul_i32(s[89], s[88], s[46]))
|
|
k.emit(s_add_u32(s[90], s[89], s[46]))
|
|
k.emit(s_sub_u32(s[60], s[58], s[89]))
|
|
k.emit(s_min_u32(s[61], s[59], s[90]))
|
|
k.emit(s_sub_u32(s[61], s[61], s[89]))
|
|
k.emit(s_mul_i32(s[91], s[52], s[46]))
|
|
k.emit(s_sub_u32(s[91], s[49], s[91]))
|
|
k.emit(s_mul_i32(s[89], s[51], s[46]))
|
|
k.emit(s_add_u32(s[89], s[89], s[58]))
|
|
k.emit(s_cmp_lt_u32(s[89], s[91]))
|
|
k.emit(s_cbranch_scc1(), target='NoBranch_8G3ZEUE1ZDJOP9IU')
|
|
k.emit(s_mov_b32(s[89], s[90]))
|
|
k.emit(s_cmp_le_u32(s[91], s[58]))
|
|
k.emit(s_cbranch_scc1(), target='NoBranch_8G3ZEUE1ZDJOP9IU')
|
|
k.emit(s_mul_i32(s[87], s[52], s[46]))
|
|
k.emit(s_mul_i32(s[92], s[46], s[51]))
|
|
k.emit(s_sub_u32(s[87], s[87], s[92]))
|
|
k.emit(s_mul_i32(s[58], s[57], s[46]))
|
|
k.emit(s_add_u32(s[58], s[58], s[87]))
|
|
k.emit(s_add_u32(s[59], s[58], s[46]))
|
|
k.emit(s_add_u32(s[93], s[46], 1))
|
|
k.emit(s_mul_i32(s[92], s[57], s[93]))
|
|
k.emit(s_add_u32(s[93], s[92], s[93]))
|
|
k.emit(s_cmp_lt_u32(s[57], s[87]))
|
|
k.emit(s_cselect_b32(s[58], s[92], s[58]))
|
|
k.emit(s_cselect_b32(s[59], s[93], s[59]))
|
|
k.emit(s_add_u32(s[89], s[58], s[91]))
|
|
k.emit(s_add_u32(s[59], s[59], s[91]))
|
|
k.emit(s_min_u32(s[59], s[59], s[49]))
|
|
k.emit(s_cmp_ge_u32(s[58], s[49]))
|
|
k.emit(s_cbranch_scc1(), target='KernelEnd')
|
|
k.label('NoBranch_8G3ZEUE1ZDJOP9IU')
|
|
k.emit(s_mov_b32(s[58], s[89]))
|
|
k.emit(s_mul_i32(s[89], s[10], s[11]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[89]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[88]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[89]))
|
|
k.emit(v_sub_u32_e32(v[19], s[88], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[89]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(v_mov_b32_e32(v[19], 0))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[89]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[89]))
|
|
k.emit(v_sub_u32_e32(v[19], s[88], v[19]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[4], v[18]))
|
|
k.emit(v_readfirstlane_b32_e32(v[90], v[19]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[10]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[90]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[10]))
|
|
k.emit(v_sub_u32_e32(v[19], s[90], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[10]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(v_mov_b32_e32(v[19], 0))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[10]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[10]))
|
|
k.emit(v_sub_u32_e32(v[19], s[90], v[19]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[3], v[18]))
|
|
k.emit(v_readfirstlane_b32_e32(v[2], v[19]))
|
|
k.label('SKAlphaCheck')
|
|
k.emit(s_mov_b32(s[91], 16))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[91]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[3]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[91]))
|
|
k.emit(v_sub_u32_e32(v[19], s[3], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[91]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[91]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[87], v[18]))
|
|
k.emit(s_mul_i32(s[90], s[87], s[91]))
|
|
k.emit(s_sub_u32(s[90], s[3], s[90]))
|
|
k.emit(s_mul_i32(s[90], s[90], s[10]))
|
|
k.emit(s_add_u32(s[90], s[90], s[2]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[91]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[11]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[91]))
|
|
k.emit(v_sub_u32_e32(v[19], s[11], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[91]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[91]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[88], v[18]))
|
|
k.emit(s_mul_i32(s[89], s[91], s[88]))
|
|
k.emit(s_sub_u32(s[89], s[11], s[89]))
|
|
k.emit(s_cmp_eq_u32(s[89], 0))
|
|
k.emit(s_cmov_b32(s[89], s[91]))
|
|
k.emit(s_cmp_ge_u32(s[87], s[88]))
|
|
k.emit(s_cselect_b32(s[88], s[89], s[91]))
|
|
k.emit(v_cvt_f32_u32_e32(v[18], s[88]))
|
|
k.emit(v_rcp_iflag_f32_e32(v[18], v[18]))
|
|
k.emit(v_cvt_f32_u32_e32(v[19], s[90]))
|
|
k.emit(v_mul_f32_e32(v[18], v[18], v[19]))
|
|
k.emit(v_cvt_u32_f32_e32(v[18], v[18]))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[88]))
|
|
k.emit(v_sub_u32_e32(v[19], s[90], v[19]))
|
|
k.emit(v_cmpx_eq_u32_e64(EXEC, v[19], s[88]))
|
|
k.emit(v_add_u32_e32(v[18], 1, v[18]))
|
|
k.emit(v_mov_b32_e32(v[19], 0))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_cmpx_gt_u32_e64(EXEC, v[19], s[88]))
|
|
k.emit(v_sub_u32_e64(v[18], v[18], 1))
|
|
k.emit(v_mul_u32_u24_e64(v[19], v[18], s[88]))
|
|
k.emit(v_sub_u32_e32(v[19], s[90], v[19]))
|
|
k.emit(s_mov_b64(EXEC, -1))
|
|
k.emit(v_readfirstlane_b32_e32(v[2], v[18]))
|
|
k.emit(v_readfirstlane_b32_e32(v[3], v[19]))
|
|
k.emit(s_mul_i32(s[3], s[2], s[88]))
|
|
k.emit(s_sub_u32(s[3], s[90], s[3]))
|
|
k.emit(s_mul_i32(s[87], s[87], s[91]))
|
|
k.emit(s_add_u32(s[3], s[3], s[87]))
|
|
k.label('WGM')
|
|
k.emit(v_and_b32_e32(v[19], 63, v[180]))
|
|
k.emit(v_and_b32_e32(v[18], 15, v[19]))
|
|
k.emit(v_lshlrev_b32_e32(v[18], 3, v[18]))
|
|
k.emit(v_lshrrev_b32_e32(v[19], 4, v[19]))
|
|
k.emit(v_lshl_add_u32_e64(v[18], v[19], 11, v[18]))
|
|
k.emit(v_lshrrev_b32_e32(v[22], 6, v[180]))
|
|
k.emit(v_and_b32_e32(v[22], 1, v[22]))
|
|
k.emit(v_lshl_add_u32_e64(v[18], v[22], 7, v[18]))
|
|
k.emit(v_and_b32_e32(v[20], 63, v[180]))
|
|
k.emit(v_and_b32_e32(v[19], 15, v[20]))
|
|
k.emit(v_lshlrev_b32_e32(v[19], 6, v[19]))
|
|
k.emit(v_lshlrev_b32_e32(v[19], 3, v[19]))
|
|
k.emit(v_lshrrev_b32_e32(v[20], 4, v[20]))
|
|
k.emit(v_lshl_add_u32_e64(v[19], v[20], 3, v[19]))
|
|
k.emit(v_lshrrev_b32_e32(v[21], 7, v[180]))
|
|
k.emit(v_and_b32_e32(v[21], 1, v[21]))
|
|
k.emit(v_lshl_add_u32_e64(v[19], v[21], 13, v[19]))
|
|
k.emit(v_lshrrev_b32_e32(v[20], 6, v[180]))
|
|
k.emit(v_lshrrev_b32_e32(v[20], 2, v[20]))
|
|
k.emit(s_mov_b32(s[87], 16384))
|
|
k.emit(v_mul_lo_u32(v[20], s[87], v[20]))
|
|
k.emit(v_add_lshl_u32_e64(v[16], v[20], v[18], 1))
|
|
k.emit(v_lshrrev_b32_e32(v[18], 6, v[180]))
|
|
k.emit(v_lshrrev_b32_e32(v[18], 2, v[18]))
|
|
k.emit(s_mov_b32(s[87], 64))
|
|
k.emit(v_mul_lo_u32(v[18], s[87], v[18]))
|
|
k.emit(v_add_lshl_u32_e64(v[17], v[18], v[19], 1))
|
|
k.emit(v_lshrrev_b32_e32(v[20], 10, v[17]))
|
|
k.emit(v_lshl_add_u32_e64(v[17], v[20], 5, v[17]))
|
|
k.emit(v_add_co_u32_e32(v[17], 32768, v[17]))
|
|
k.emit(v_add_u32_e32(v[178], 66560, v[16]))
|
|
k.emit(v_xor_b32_e32(v[178], v[178], v[16]))
|
|
k.emit(v_add_u32_e32(v[179], 66560, v[17]))
|
|
k.emit(v_xor_b32_e32(v[179], v[179], v[17]))
|
|
k.emit(v_lshrrev_b32_e32(v[19], 5, v[180]))
|
|
k.emit(v_and_b32_e32(v[18], 31, v[180]))
|
|
k.emit(v_lshlrev_b32_e32(v[18], 3, v[18]))
|
|
k.emit(v_mov_b32_e32(v[22], v[19]))
|
|
k.emit(v_lshrrev_b32_e32(v[20], 3, v[180]))
|
|
k.emit(v_and_b32_e32(v[21], 7, v[180]))
|
|
k.emit(v_lshlrev_b32_e32(v[21], 3, v[21]))
|
|
k.emit(v_mov_b32_e32(v[23], v[21]))
|
|
k.emit(v_mul_u32_u24_e32(v[24], 256, v[22]))
|
|
k.emit(v_add_lshl_u32_e64(v[24], v[18], v[24], 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_readfirstlane_b32_e32(v[53], v[24]))
|
|
k.emit(s_nop())
|
|
k.emit(s_add_u32(s[55], s[53], 66560))
|
|
k.emit(s_xor_b32(s[55], s[55], s[53]))
|
|
k.emit(v_mul_u32_u24_e32(v[24], 64, v[20]))
|
|
k.emit(v_add_lshl_u32_e64(v[24], v[23], v[24], 1))
|
|
k.emit(v_lshrrev_b32_e32(v[26], 10, v[24]))
|
|
k.emit(v_lshl_add_u32_e64(v[24], v[26], 5, v[24]))
|
|
k.emit(v_add_co_u32_e32(v[24], 32768, v[24]))
|
|
k.emit(s_nop())
|
|
k.emit(v_readfirstlane_b32_e32(v[54], v[24]))
|
|
k.emit(s_nop())
|
|
k.emit(s_add_u32(s[56], s[54], 66560))
|
|
k.emit(s_xor_b32(s[56], s[56], s[54]))
|
|
k.emit(v_mov_b32_e32(v[24], v[18]))
|
|
k.emit(v_mov_b32_e32(v[25], v[20]))
|
|
k.emit(v_add_co_u32_e32(v[26], 32, v[25]))
|
|
k.emit(v_add_co_u32_e32(v[27], 32, v[26]))
|
|
k.emit(v_add_co_u32_e32(v[28], 32, v[27]))
|
|
k.emit(v_add_co_u32_e32(v[29], 32, v[28]))
|
|
k.emit(v_add_co_u32_e32(v[30], 32, v[29]))
|
|
k.emit(v_add_co_u32_e32(v[31], 32, v[30]))
|
|
k.emit(v_add_co_u32_e32(v[32], 32, v[31]))
|
|
k.emit(v_mov_b32_e32(v[33], v[19]))
|
|
k.emit(v_add_co_u32_e32(v[34], 8, v[33]))
|
|
k.emit(v_add_co_u32_e32(v[35], 8, v[34]))
|
|
k.emit(v_add_co_u32_e32(v[36], 8, v[35]))
|
|
k.emit(v_add_co_u32_e32(v[37], 8, v[36]))
|
|
k.emit(v_add_co_u32_e32(v[38], 8, v[37]))
|
|
k.emit(v_add_co_u32_e32(v[39], 8, v[38]))
|
|
k.emit(v_add_co_u32_e32(v[40], 8, v[39]))
|
|
k.emit(v_mov_b32_e32(v[41], v[21]))
|
|
k.emit(s_mul_i32(s[87], s[2], 256))
|
|
k.emit(s_sub_u32(s[87], s[20], s[87]))
|
|
k.emit(s_sub_u32(s[87], s[87], 8))
|
|
k.emit(v_mov_b32_e32(v[42], s[87]))
|
|
k.emit(v_min_i32_e32(v[24], v[42], v[24]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[33]))
|
|
k.emit(v_add_co_u32_e32(v[0], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[0], 8))
|
|
k.emit(v_lshlrev_b32_e32(v[0], 1))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[34]))
|
|
k.emit(v_add_co_u32_e32(v[1], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[1], 8, v[1]))
|
|
k.emit(v_lshlrev_b32_e32(v[1], 1, v[1]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[35]))
|
|
k.emit(v_add_co_u32_e32(v[2], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[2], 8, v[2]))
|
|
k.emit(v_lshlrev_b32_e32(v[2], 1, v[2]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[36]))
|
|
k.emit(v_add_co_u32_e32(v[3], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[3], 8, v[3]))
|
|
k.emit(v_lshlrev_b32_e32(v[3], 1, v[3]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[37]))
|
|
k.emit(v_add_co_u32_e32(v[4], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[4], 8, v[4]))
|
|
k.emit(v_lshlrev_b32_e32(v[4], 1, v[4]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[38]))
|
|
k.emit(v_add_co_u32_e32(v[5], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[5], 8, v[5]))
|
|
k.emit(v_lshlrev_b32_e32(v[5], 1, v[5]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[39]))
|
|
k.emit(v_add_co_u32_e32(v[6], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[6], 8, v[6]))
|
|
k.emit(v_lshlrev_b32_e32(v[6], 1, v[6]))
|
|
k.emit(v_mul_lo_u32(v[42], s[40], v[40]))
|
|
k.emit(v_add_co_u32_e32(v[7], v[24], v[42]))
|
|
k.emit(v_add_u32_e32(v[7], 8, v[7]))
|
|
k.emit(v_lshlrev_b32_e32(v[7], 1, v[7]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[25]))
|
|
k.emit(v_add_co_u32_e32(v[8], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[8], 8, v[8]))
|
|
k.emit(v_lshlrev_b32_e32(v[8], 1, v[8]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[26]))
|
|
k.emit(v_add_co_u32_e32(v[9], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[9], 8, v[9]))
|
|
k.emit(v_lshlrev_b32_e32(v[9], 1, v[9]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[27]))
|
|
k.emit(v_add_co_u32_e32(v[10], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[10], 8, v[10]))
|
|
k.emit(v_lshlrev_b32_e32(v[10], 1, v[10]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[28]))
|
|
k.emit(v_add_co_u32_e32(v[11], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[11], 8, v[11]))
|
|
k.emit(v_lshlrev_b32_e32(v[11], 1, v[11]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[29]))
|
|
k.emit(v_add_co_u32_e32(v[12], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[12], 8, v[12]))
|
|
k.emit(v_lshlrev_b32_e32(v[12], 1, v[12]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[30]))
|
|
k.emit(v_add_co_u32_e32(v[13], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[13], 8, v[13]))
|
|
k.emit(v_lshlrev_b32_e32(v[13], 1, v[13]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[31]))
|
|
k.emit(v_add_co_u32_e32(v[14], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[14], 8, v[14]))
|
|
k.emit(v_lshlrev_b32_e32(v[14], 1, v[14]))
|
|
k.emit(v_mul_lo_u32(v[33], s[42], v[32]))
|
|
k.emit(v_add_co_u32_e32(v[15], v[41], v[33]))
|
|
k.emit(v_add_u32_e32(v[15], 8, v[15]))
|
|
k.emit(v_lshlrev_b32_e32(v[15], 1, v[15]))
|
|
k.emit(s_mul_hi_u32(s[91], s[2], 256))
|
|
k.emit(s_mul_i32(s[90], s[2], 256))
|
|
k.emit(s_mul_i32(s[88], s[60], 64))
|
|
k.emit(s_mul_hi_u32(s[89], s[88], s[40]))
|
|
k.emit(s_mul_i32(s[88], s[88], s[40]))
|
|
k.emit(s_add_u32(s[90], s[90], s[88]))
|
|
k.emit(s_addc_u32(s[91], s[91], s[89]))
|
|
k.emit(s_mov_b64(s[62:63], 1))
|
|
k.emit(s_sub_u32(s[88], s[20], 1))
|
|
k.emit(s_mul_hi_u32(s[89], 1, s[88]))
|
|
k.emit(s_mul_i32(s[88], 1, s[88]))
|
|
k.emit(s_add_u32(s[62], s[62], s[88]))
|
|
k.emit(s_addc_u32(s[63], s[63], s[89]))
|
|
k.emit(s_sub_u32(s[88], s[23], 1))
|
|
k.emit(s_mul_hi_u32(s[89], s[40], s[88]))
|
|
k.emit(s_mul_i32(s[88], s[40], s[88]))
|
|
k.emit(s_add_u32(s[62], s[62], s[88]))
|
|
k.emit(s_addc_u32(s[63], s[63], s[89]))
|
|
k.emit(s_sub_u32(s[62], s[62], s[90]))
|
|
k.emit(s_subb_u32(s[63], s[63], s[91]))
|
|
k.emit(s_lshl_b64(s[62:63], s[62:63], 1))
|
|
k.emit(s_add_u32(s[62], s[62], 16))
|
|
k.emit(s_addc_u32(s[63], s[63], 0))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(s_mul_hi_u32(s[89], s[41], s[4]))
|
|
k.emit(s_mul_i32(s[88], s[41], s[4]))
|
|
k.emit(s_add_u32(s[90], s[90], s[88]))
|
|
k.emit(s_addc_u32(s[91], s[91], s[89]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], 1))
|
|
k.emit(s_add_u32(s[68], s[28], s[90]))
|
|
k.emit(s_addc_u32(s[69], s[29], s[91]))
|
|
k.emit(s_mov_b32(s[71], 131072))
|
|
k.emit(s_mul_hi_u32(s[91], s[3], 256))
|
|
k.emit(s_mul_i32(s[90], s[3], 256))
|
|
k.emit(s_mul_hi_u32(s[91], s[90], s[42]))
|
|
k.emit(s_mul_i32(s[90], s[90], s[42]))
|
|
k.emit(s_mul_i32(s[88], s[60], 64))
|
|
k.emit(s_mul_hi_u32(s[89], s[88], 1))
|
|
k.emit(s_mul_i32(s[88], s[88], 1))
|
|
k.emit(s_add_u32(s[90], s[90], s[88]))
|
|
k.emit(s_addc_u32(s[91], s[91], s[89]))
|
|
k.emit(s_mov_b64(s[76:77], 1))
|
|
k.emit(s_sub_u32(s[88], s[23], 1))
|
|
k.emit(s_mul_hi_u32(s[89], 1, s[88]))
|
|
k.emit(s_mul_i32(s[88], 1, s[88]))
|
|
k.emit(s_add_u32(s[76], s[76], s[88]))
|
|
k.emit(s_addc_u32(s[77], s[77], s[89]))
|
|
k.emit(s_sub_u32(s[88], s[21], 1))
|
|
k.emit(s_mul_hi_u32(s[89], s[42], s[88]))
|
|
k.emit(s_mul_i32(s[88], s[42], s[88]))
|
|
k.emit(s_add_u32(s[76], s[76], s[88]))
|
|
k.emit(s_addc_u32(s[77], s[77], s[89]))
|
|
k.emit(s_sub_u32(s[76], s[76], s[90]))
|
|
k.emit(s_subb_u32(s[77], s[77], s[91]))
|
|
k.emit(s_lshl_b64(s[76:77], s[76:77], 1))
|
|
k.emit(s_add_u32(s[76], s[76], 16))
|
|
k.emit(s_addc_u32(s[77], s[77], 0))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
k.emit(s_mul_hi_u32(s[89], s[43], s[4]))
|
|
k.emit(s_mul_i32(s[88], s[43], s[4]))
|
|
k.emit(s_add_u32(s[90], s[90], s[88]))
|
|
k.emit(s_addc_u32(s[91], s[91], s[89]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], 1))
|
|
k.emit(s_add_u32(s[72], s[30], s[90]))
|
|
k.emit(s_addc_u32(s[73], s[31], s[91]))
|
|
k.emit(s_mov_b32(s[75], 131072))
|
|
k.emit(s_mul_i32(s[83], 128, s[40]))
|
|
k.emit(s_mov_b32(s[84], 128))
|
|
k.emit(s_sub_u32(s[8], s[61], s[60]))
|
|
k.label('SKAlphaCheck2')
|
|
k.emit(s_and_b32(s[89], 63, s[23]))
|
|
k.emit(s_cmp_eq_u32(s[89], 0))
|
|
k.emit(s_cselect_b32(s[88], 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[61], s[46]))
|
|
k.emit(s_cselect_b32(s[88], s[88], 0))
|
|
k.emit(s_sub_u32(s[8], s[8], s[88]))
|
|
k.emit(s_mov_b32(s[9], s[8]))
|
|
k.emit(s_and_b32(s[90], s[6], 7936))
|
|
k.emit(s_lshr_b32(s[90], s[90], 8))
|
|
k.emit(s_and_b32(s[91], s[6], 57344))
|
|
k.emit(s_and_b32(s[6], s[6], 255))
|
|
k.emit(s_mov_b32(s[88], s[6]))
|
|
k.label('beginStaggerUIter')
|
|
k.emit(s_lshl_b32(s[89], s[88], s[90]))
|
|
k.emit(s_cmp_ge_u32(s[9], s[89]))
|
|
k.emit(s_cbranch_scc1(), target='endStaggerUIter')
|
|
k.emit(s_lshr_b32(s[88], s[88], 1))
|
|
k.emit(s_branch(), target='beginStaggerUIter')
|
|
k.label('endStaggerUIter')
|
|
k.emit(s_sub_u32(s[89], s[88], 1))
|
|
k.emit(s_cmp_ge_u32(s[88], 1))
|
|
k.emit(s_cselect_b32(s[78], s[89], 0))
|
|
k.emit(s_cmp_eq_u32(s[91], 0))
|
|
k.emit(s_cbranch_scc1(), target='StaggerUMapping_1')
|
|
k.emit(s_mov_b32(s[88], s[2]))
|
|
k.emit(s_branch(), target='staggerInputEnd')
|
|
k.label('StaggerUMapping_1')
|
|
k.emit(s_cmp_eq_u32(s[91], 8192))
|
|
k.emit(s_cbranch_scc1(), target='StaggerUMapping_2')
|
|
k.emit(s_mov_b32(s[88], s[3]))
|
|
k.emit(s_branch(), target='staggerInputEnd')
|
|
k.label('StaggerUMapping_2')
|
|
k.emit(s_cmp_eq_u32(s[91], 16384))
|
|
k.emit(s_cbranch_scc1(), target='StaggerUMapping_3')
|
|
k.emit(s_mov_b32(s[88], -1))
|
|
k.emit(s_branch(), target='staggerInputEnd')
|
|
k.label('StaggerUMapping_3')
|
|
k.emit(s_cmp_eq_u32(s[91], 24576))
|
|
k.emit(s_cbranch_scc1(), target='StaggerUMapping_4')
|
|
k.emit(s_mul_i32(s[89], s[10], s[3]))
|
|
k.emit(s_add_u32(s[88], s[88], s[89]))
|
|
k.emit(s_add_u32(s[88], s[88], s[2]))
|
|
k.emit(s_branch(), target='staggerInputEnd')
|
|
k.label('StaggerUMapping_4')
|
|
k.emit(s_cmp_eq_u32(s[91], 32768))
|
|
k.emit(s_cbranch_scc1(), target='staggerInputEnd')
|
|
k.emit(s_mov_b32(s[88], -1))
|
|
k.emit(s_branch(), target='staggerInputEnd')
|
|
k.label('staggerInputEnd')
|
|
k.emit(s_and_b32(s[78], s[78], s[88]))
|
|
k.emit(s_lshl_b32(s[78], s[78], s[90]))
|
|
k.emit(s_cmp_gt_u32(s[60], 0))
|
|
k.emit(s_cmov_b32(s[78], 0))
|
|
k.emit(s_cmp_lt_u32(s[61], s[46]))
|
|
k.emit(s_cmov_b32(s[78], 0))
|
|
k.emit(s_mul_hi_i32(s[89], s[78], s[83]))
|
|
k.emit(s_mul_i32(s[88], s[78], s[83]))
|
|
k.emit(s_mul_hi_i32(s[80], s[8], s[83]))
|
|
k.emit(s_mul_i32(s[79], s[8], s[83]))
|
|
k.emit(s_sub_u32(s[79], s[83], s[79]))
|
|
k.emit(s_subb_u32(s[80], 0, s[80]))
|
|
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(s_mul_hi_i32(s[89], s[78], s[84]))
|
|
k.emit(s_mul_i32(s[88], s[78], s[84]))
|
|
k.emit(s_mul_hi_i32(s[82], s[8], s[84]))
|
|
k.emit(s_mul_i32(s[81], s[8], s[84]))
|
|
k.emit(s_sub_u32(s[81], s[84], s[81]))
|
|
k.emit(s_subb_u32(s[82], 0, s[82]))
|
|
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
k.emit(s_add_u32(s[78], s[78], 2))
|
|
k.emit(s_cmp_eq_u32(s[8], 0))
|
|
k.emit(s_setprio())
|
|
k.emit(s_cbranch_scc1(), target='ShadowInitStart')
|
|
k.emit(s_mov_b32(M0, s[53]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_mov_b32(M0, 133120))
|
|
k.emit(s_mov_b32(M0, s[54]))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_mov_b32(M0, 133120))
|
|
k.emit(s_add_u32(s[90], s[8], 1))
|
|
k.emit(s_cmp_eq_u32(s[78], s[90]))
|
|
k.emit(s_cselect_b32(s[88], s[79], s[83]))
|
|
k.emit(s_cselect_b32(s[89], s[80], 0))
|
|
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(s_add_u32(s[90], s[8], 1))
|
|
k.emit(s_cmp_eq_u32(s[78], s[90]))
|
|
k.emit(s_cselect_b32(s[88], s[81], s[84]))
|
|
k.emit(s_cselect_b32(s[89], s[82], 0))
|
|
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
# --- ShadowInitStart: build the two 4-dword descriptors s[12:15] and s[16:19] ---
# Both descriptors get the constants 2147483648 (0x80000000) and 131072 (0x20000) in their last two
# dwords, and both base-address pairs start out as a copy of s[24:25].
# NOTE(review): presumably these are the buffer resource descriptors for the output/epilogue
# tensors — confirm against the kernel-argument layout loaded earlier in the file.
k.label('ShadowInitStart')
|
|
k.emit(s_mov_b64(s[12:13], s[24:25]))
|
|
k.emit(s_mov_b32(s[14], 2147483648))
|
|
k.emit(s_mov_b32(s[15], 131072))
|
|
k.emit(s_mov_b64(s[16:17], s[24:25]))
|
|
k.emit(s_mov_b32(s[18], 2147483648))
|
|
k.emit(s_mov_b32(s[19], 131072))
|
|
# s[87] and s[88] both hold 1; they are only used below as the s_lshl_b64 shift amounts
# (i.e. every 64-bit offset is multiplied by 2).
k.emit(s_mov_b32(s[87], 1))
|
|
k.emit(s_mov_b32(s[88], 1))
|
|
# s[92] = 256 * s[3].  s[90:91] = 64-bit product s[92] * s[38] (low half from s_mul_i32, high half
# from s_mul_hi_u32), shifted left by s[87].  s[16:17] = s[26:27] + s[90:91]; this overwrites the
# s[24:25] copy placed in s[16:17] above.
k.emit(s_mul_i32(s[92], 256, s[3]))
|
|
k.emit(s_mul_hi_u32(s[91], s[92], s[38]))
|
|
k.emit(s_mul_i32(s[90], s[92], s[38]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], s[87]))
|
|
k.emit(s_add_u32(s[16], s[26], s[90]))
|
|
k.emit(s_addc_u32(s[17], s[27], s[91]))
|
|
# Same computation with s[36]: s[12:13] = s[24:25] + ((s[92] * s[36]) << s[88]).
k.emit(s_mul_hi_u32(s[91], s[92], s[36]))
|
|
k.emit(s_mul_i32(s[90], s[92], s[36]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], s[88]))
|
|
k.emit(s_add_u32(s[12], s[24], s[90]))
|
|
k.emit(s_addc_u32(s[13], s[25], s[91]))
|
|
# Second offset term: s[16:17] += (s[4] * s[39]) << s[87].
k.emit(s_mul_hi_u32(s[91], s[4], s[39]))
|
|
k.emit(s_mul_i32(s[90], s[4], s[39]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], s[87]))
|
|
k.emit(s_add_u32(s[16], s[16], s[90]))
|
|
k.emit(s_addc_u32(s[17], s[17], s[91]))
|
|
# Second offset term: s[12:13] += (s[4] * s[37]) << s[88].
k.emit(s_mul_hi_u32(s[91], s[4], s[37]))
|
|
k.emit(s_mul_i32(s[90], s[4], s[37]))
|
|
k.emit(s_lshl_b64(s[90:91], s[90:91], s[88]))
|
|
k.emit(s_add_u32(s[12], s[12], s[90]))
|
|
k.emit(s_addc_u32(s[13], s[13], s[91]))
|
|
# --- Zero-initialise the accumulation registers ---
# v[182:183] is set to 0 and registers 0..15 are written with 0 through v_accvgpr_write.
k.emit(v_mov_b64_e32(v[182:183], 0))
|
|
k.emit(v_accvgpr_write(v[0], 0))
|
|
k.emit(v_accvgpr_write(v[1], 0))
|
|
k.emit(v_accvgpr_write(v[2], 0))
|
|
k.emit(v_accvgpr_write(v[3], 0))
|
|
k.emit(v_accvgpr_write(v[4], 0))
|
|
k.emit(v_accvgpr_write(v[5], 0))
|
|
k.emit(v_accvgpr_write(v[6], 0))
|
|
k.emit(v_accvgpr_write(v[7], 0))
|
|
k.emit(v_accvgpr_write(v[8], 0))
|
|
k.emit(v_accvgpr_write(v[9], 0))
|
|
k.emit(v_accvgpr_write(v[10], 0))
|
|
k.emit(v_accvgpr_write(v[11], 0))
|
|
k.emit(v_accvgpr_write(v[12], 0))
|
|
k.emit(v_accvgpr_write(v[13], 0))
|
|
k.emit(v_accvgpr_write(v[14], 0))
|
|
k.emit(v_accvgpr_write(v[15], 0))
|
|
# Each MFMA below uses the zeroed v[182:183] for both multiplicands and the zeroed v[0:15] as the
# addend, so each 16-register destination window (v[16:31] .. v[240:255]) is expected to end up 0.
# NOTE(review): the trailing flags (0, 0, 1, 0, 0, 0, 0, 0, 1) presumably select the accumulator
# register file for the C/D operands — confirm against the generated instruction signature.
k.emit(v_mfma_i32_32x32x16_i8(v[16:31], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[32:47], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[48:63], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[64:79], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[80:95], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[96:111], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[112:127], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[128:143], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[144:159], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[160:175], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[176:191], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[192:207], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[208:223], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[224:239], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_i32_32x32x16_i8(v[240:255], v[182:183], v[182:183], v[0:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# --- Second round of global loads issued before the main loop ---
# If s[8] == 0, branch to toPGR1end_OrdNLL.  Otherwise wait until vmcnt reaches 0, barrier, and
# toggle s[53] / s[54] by XOR-ing them with s[55] / s[56].
k.emit(s_cmp_eq_u32(s[8], 0))
|
|
k.emit(s_cbranch_scc1(), target='toPGR1end_OrdNLL')
|
|
k.waitcnt(vm=0)
|
|
k.emit(s_barrier())
|
|
k.emit(s_xor_b32(s[53], s[55], s[53]))
|
|
k.emit(s_xor_b32(s[54], s[56], s[54]))
|
|
# If s[8] == 1, skip the loads below and go straight to skipPGR2 (s[53] / s[54] stay toggled).
k.emit(s_cmp_eq_u32(s[8], 1))
|
|
k.emit(s_cbranch_scc1(), target='skipPGR2')
|
|
# Eight buffer_load_dwordx4 through s[68:71] with v[0]..v[7]; M0 starts at s[53] and is advanced by
# 4096 between consecutive loads.
# NOTE(review): presumably these loads write directly to LDS at the address in M0 — confirm against
# the buffer_load_dwordx4 argument order.
k.emit(s_mov_b32(M0, s[53]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
# M0 is set to 133120 and then immediately overwritten with s[54] for the second group.
k.emit(s_mov_b32(M0, 133120))
|
|
k.emit(s_mov_b32(M0, s[54]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
# Eight buffer_load_dwordx4 through s[72:75] with v[8]..v[15]; M0 is advanced by 4224 between loads.
k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
# Put M0 back to 133120 and undo the s[53] / s[54] toggle performed above.
k.emit(s_mov_b32(M0, 133120))
|
|
k.emit(s_xor_b32(s[53], s[55], s[53]))
|
|
k.emit(s_xor_b32(s[54], s[56], s[54]))
|
|
# --- skipPGR2: first register fill before entering the loop ---
# After the barrier, sixteen ds_read_b128 fill v[82:113] (address register v[16]) and
# v[114:145] (address register v[17]).
k.label('skipPGR2')
|
|
k.emit(s_barrier())
|
|
k.emit(ds_read_b128(v[82:85], v[16]))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14))
|
|
k.emit(ds_read_b128(v[114:117], v[17]))
|
|
k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128))
|
|
k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1))
|
|
k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1))
|
|
k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2))
|
|
k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2))
|
|
k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3))
|
|
k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3))
|
|
# Wait until lgkmcnt reaches 0 (all ds_reads above have returned), then build v[18:49] from the
# v[82:113] data with v_perm_b32_e64, using s[85] and s[86] as the two selector operands.
k.waitcnt(lgkm=0)
|
|
k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85]))
|
|
k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85]))
|
|
k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86]))
|
|
k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86]))
|
|
k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85]))
|
|
k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85]))
|
|
k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86]))
|
|
k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86]))
|
|
k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85]))
|
|
k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85]))
|
|
k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86]))
|
|
k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86]))
|
|
k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85]))
|
|
k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85]))
|
|
k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86]))
|
|
k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86]))
|
|
# --- openLoopL: pick the entry point from the counter s[8] ---
# s[8] == 1  -> branch to toPGR1
# s[8] <= 2  -> branch to LoopEndL (s[8] == 1 was already taken above)
# otherwise  -> fall through into LoopBeginL
k.label('openLoopL')
|
|
k.emit(s_cmp_eq_u32(s[8], 1))
|
|
k.emit(s_cbranch_scc1(), target='toPGR1')
|
|
k.emit(s_cmp_le_u32(s[8], 2))
|
|
k.emit(s_cbranch_scc1(), target='LoopEndL')
|
|
# --- LoopBeginL: select one of the two loop-body variants ---
# s[87] receives the result of s_getreg_b32 with immediate 260; value 0 branches to LoopBeginL_0,
# value 1 branches to LoopBeginL_1, and any other value falls through into LoopBeginL_0.
# NOTE(review): immediate 260 (0x104) presumably decodes as hwreg id 4, bit offset 4, width 1 —
# confirm against the ISA's s_getreg_b32 simm16 encoding and which hardware register id 4 names.
k.label('LoopBeginL')
|
|
k.emit(s_getreg_b32(s[87], 260))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_cmp_eq_u32(s[87], 0))
|
|
k.emit(s_cbranch_scc1(), target='LoopBeginL_0')
|
|
k.emit(s_cmp_eq_u32(s[87], 1))
|
|
k.emit(s_cbranch_scc1(), target='LoopBeginL_1')
|
|
# --- LoopBeginL_0: loop body, first half ---
# The 64 v_mfma_16x16x32 ops in this half accumulate into v[0:3] .. v[252:255], using A operands
# v[114:117] .. v[142:145] and B operands v[18:21] .. v[46:49].  Interleaved with them are:
#   * ds_read_b128 into v[82:113] and v[146:177], and v_perm_b32_e64 into v[50:81] — the operands
#     that the second half of the body consumes;
#   * scalar updates of the s[68:71] / s[72:75] descriptors and the buffer_load_dwordx4 issues
#     that go through them, with M0 advanced between loads.
# NOTE(review): the buffer loads presumably write directly to LDS at the address in M0, and the
# MFMA destination indices presumably name accumulator registers — confirm against the generated
# instruction signatures.  Statement order here is the instruction schedule; do not reorder.
k.label('LoopBeginL_0')
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# SCC = (s[8] == s[78]) picks the increment: s[88:89] = SCC ? s[79:80] : (s[83], 0).
# Then s[68:69] += s[88:89], s[62:63] -= s[88:89], and s[70] = (s[63] == 0) ? s[62] : -1.
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[79], s[83]))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[80], 0))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Allow at most 4 LGKM operations to remain outstanding before the first v_perm reads v[82:97].
k.waitcnt(lgkm=4)
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64))
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=1)
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# First group of buffer loads (through s[68:71], v[0]..v[7]); M0 starts at s[53], step 4096.
k.emit(s_mov_b32(M0, s[53]))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Same SCC-selected update for the second descriptor: s[88:89] = SCC ? s[81:82] : (s[84], 0);
# s[72:73] += s[88:89], s[76:77] -= s[88:89], and s[74] = (s[77] == 0) ? s[76] : -1.
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[81], s[84]))
|
|
k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[82], 0))
|
|
k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Second group of buffer loads (through s[72:75], v[8]..v[15]); M0 starts at s[54], step 4224.
k.emit(s_mov_b32(M0, s[54]))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Wait until at most 17 vector-memory operations remain outstanding.
k.waitcnt(vm=17)
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
# Toggle the two ds_read address registers (v[16] ^= v[178], v[17] ^= v[179]); the ds_reads that
# follow start refilling v[82:97] from the toggled address.
k.emit(v_xor_b32_e32(v[16], v[178], v[16]))
|
|
k.emit(v_xor_b32_e32(v[17], v[179], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[82:85], v[16]))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# --- LoopBeginL_0: loop body, second half ---
# 64 more v_mfma_16x16x32 ops into the same destinations v[0:3] .. v[252:255], now with A operands
# v[146:149] .. v[174:177] and B operands v[50:53] .. v[78:81].  Interleaved with them are:
#   * ds_read_b128 into v[98:145] and v_perm_b32_e64 into v[18:49] — the operands read by the
#     first half of the body;
#   * the remaining buffer_load_dwordx4 issues through s[72:75] (v[10]..v[15]), M0 step 4224.
# The block ends with the s[53] / s[54] toggle, the s[8] decrement and the loop branch.
# Statement order here is the instruction schedule; do not reorder.
k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Wait until at most 9 vector-memory operations remain outstanding.
k.waitcnt(vm=9)
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(ds_read_b128(v[114:117], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Allow at most 4 LGKM operations to remain outstanding before the v_perm ops below read v[82:113].
k.waitcnt(lgkm=4)
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Toggle s[53] / s[54] (the M0 start values used by the two load groups) by XOR with s[55] / s[56].
k.emit(s_xor_b32(s[53], s[55], s[53]))
|
|
k.emit(s_xor_b32(s[54], s[56], s[54]))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Loop control: s[8] -= 1; repeat LoopBeginL_0 while s[8] != 2 (s_cbranch_scc0 is taken when the
# equality compare fails), otherwise branch to LoopEndL.
k.emit(s_sub_u32(s[8], s[8], 1))
|
|
k.emit(s_cmp_eq_i32(s[8], 2))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cbranch_scc0(), target='LoopBeginL_0')
|
|
k.emit(s_branch(), target='LoopEndL')
|
|
k.label('LoopBeginL_1')
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[79], s[83]))
|
|
k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[80], 0))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=1)
|
|
k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192))
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_mov_b32(M0, s[53]))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[0], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[1], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[2], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[3], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[4], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[81], s[84]))
|
|
k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86]))
|
|
k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[82], 0))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[5], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[6], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4096))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[7], s[68:71], 0, 0, 1, 0, 0, 1, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_mov_b32(M0, s[54]))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[8], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=17)
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[9], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_xor_b32_e32(v[16], v[178], v[16]))
|
|
k.emit(v_xor_b32_e32(v[17], v[179], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[82:85], v[16]))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6))
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=9)
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[114:117], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[10], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[11], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[12], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[13], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[162:165], v[62:65], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[162:165], v[66:69], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[14], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[162:165], v[70:73], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[162:165], v[74:77], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[162:165], v[78:81], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[166:169], v[50:53], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[166:169], v[54:57], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[166:169], v[58:61], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[166:169], v[62:65], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[166:169], v[66:69], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[166:169], v[70:73], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[166:169], v[74:77], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[166:169], v[78:81], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[170:173], v[50:53], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[170:173], v[54:57], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[170:173], v[58:61], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[170:173], v[62:65], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[170:173], v[66:69], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[170:173], v[70:73], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[170:173], v[74:77], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[170:173], v[78:81], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[174:177], v[50:53], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[174:177], v[54:57], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[174:177], v[58:61], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[174:177], v[62:65], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[174:177], v[66:69], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(M0, M0, 4224))
|
|
k.emit(buffer_load_dwordx4(v[0:3], v[15], s[72:75], 0, 0, 1, 0, 0, 0, 1, 1, 1))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[174:177], v[70:73], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_xor_b32(s[53], s[55], s[53]))
|
|
k.emit(s_xor_b32(s[54], s[56], s[54]))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[174:177], v[74:77], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Loop control for the LoopBeginL_1 body: s[8] acts as the loop counter here.
# It is decremented by one and then compared against 2 (the compare sets SCC).
k.emit(s_sub_u32(s[8], s[8], 1))
|
|
k.emit(s_cmp_eq_i32(s[8], 2))
|
|
# Last MFMA of this pass (accumulator v[252:255]); it sits between the compare
# and the branch that consumes the compare result.
k.emit(v_mfma_16x16x32(v[252:255], v[174:177], v[78:81], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# SCC == 0 (s[8] != 2): take the back-edge and run the body again. The branch
# offset is not known yet; it is filled in from the label table in to_asm().
k.emit(s_cbranch_scc0(), target='LoopBeginL_1')
|
|
# Otherwise leave the loop via LoopEndL.
# NOTE(review): LoopEndL is the label emitted immediately after this statement,
# so this jump lands on the next instruction -- presumably kept to mirror the
# other loop copy / the reference assembly; confirm before removing.
k.emit(s_branch(), target='LoopEndL')
|
|
# LoopEndL: common exit target of the loop copies (each ends with an
# s_branch to this label once its s_cmp_eq_i32(s[8], 2) succeeds).
k.label('LoopEndL')
|
|
# s_waitcnt() is emitted with its default operand rather than through
# k.waitcnt(...).
# NOTE(review): presumably this encodes simm16 = 0, i.e. wait until every
# outstanding counter has drained -- confirm against the generated s_waitcnt.
k.emit(s_waitcnt())
|
|
# Barrier before the post-loop sequence starts. The statements that follow
# repeat the MFMA / ds_read_b128 / v_perm_b32_e64 pattern of the loop body.
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[79], s[83]))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[80], 0))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64))
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=1)
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[8], s[78]))
|
|
k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[88], s[81], s[84]))
|
|
k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cselect_b32(s[89], s[82], 0))
|
|
k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=17)
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_xor_b32_e32(v[16], v[178], v[16]))
|
|
k.emit(v_xor_b32_e32(v[17], v[179], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[82:85], v[16]))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 6))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 8))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 10))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=9)
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 12))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 14))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(ds_read_b128(v[114:117], v[17]))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[122:125], v[17], v[0], v[0], 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[126:129], v[17], v[0], v[0], 0, 128, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[130:133], v[17], v[0], v[0], 0, 0, 2))
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[134:137], v[17], v[0], v[0], 0, 128, 2))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[138:141], v[17], v[0], v[0], 0, 0, 3))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[142:145], v[17], v[0], v[0], 0, 128, 3))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86]))
|
|
# Unrolled MFMA run expressed as a loop: accumulator groups v[140:143] .. v[252:255].
# The first source operand advances by 4 registers every 8 accumulator groups
# (v[162:165], v[166:169], v[170:173], v[174:177]); the second source operand cycles
# through v[50:53] .. v[78:81]. Emission order matches the unrolled form exactly.
for mfma_acc in range(140, 256, 4):
  mfma_a = 146 + 4 * (mfma_acc // 32)
  mfma_b = 50 + mfma_acc % 32
  k.emit(v_mfma_16x16x32(v[mfma_acc:mfma_acc + 3], v[mfma_a:mfma_a + 3], v[mfma_b:mfma_b + 3], v[mfma_acc:mfma_acc + 3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Entry label of the 'toPGR1' section; the section runs straight-line until the
# 'toPGR1end_OrdNLL' label further down.
k.label('toPGR1')
|
|
# s_waitcnt is emitted directly with no operand here, unlike Kernel.waitcnt which packs
# explicit vmcnt/expcnt/lgkmcnt values into simm16.
# NOTE(review): presumably the default operand means "wait for all counters" - confirm
# against the s_waitcnt definition in the autogen instruction module.
k.emit(s_waitcnt())
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[114:117], v[18:21], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[82:85], v[16], v[0], v[0], 0, 0, 64))
|
|
k.emit(ds_read_b128(v[86:89], v[16], v[0], v[0], 0, 0, 66))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[114:117], v[22:25], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[114:117], v[26:29], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[90:93], v[16], v[0], v[0], 0, 0, 68))
|
|
k.emit(ds_read_b128(v[94:97], v[16], v[0], v[0], 0, 0, 70))
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[114:117], v[30:33], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[114:117], v[34:37], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[98:101], v[16], v[0], v[0], 0, 0, 72))
|
|
k.emit(ds_read_b128(v[102:105], v[16], v[0], v[0], 0, 0, 74))
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[114:117], v[38:41], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[114:117], v[42:45], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[106:109], v[16], v[0], v[0], 0, 0, 76))
|
|
k.emit(ds_read_b128(v[110:113], v[16], v[0], v[0], 0, 0, 78))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[114:117], v[46:49], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[118:121], v[18:21], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(v_perm_b32_e64(v[50], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[51], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[118:121], v[22:25], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[146:149], v[17], v[0], v[0], 0, 64))
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[118:121], v[26:29], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[118:121], v[30:33], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 192))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[118:121], v[34:37], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=1)
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[118:121], v[38:41], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[118:121], v[42:45], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[118:121], v[46:49], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[154:157], v[17], v[0], v[0], 0, 64, 1))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[122:125], v[18:21], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[52], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[53], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[122:125], v[22:25], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[122:125], v[26:29], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[158:161], v[17], v[0], v[0], 0, 192, 1))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[122:125], v[30:33], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[54], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[55], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[122:125], v[34:37], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[122:125], v[38:41], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[162:165], v[17], v[0], v[0], 0, 64, 2))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[122:125], v[42:45], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[56], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[57], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[122:125], v[46:49], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[126:129], v[18:21], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[166:169], v[17], v[0], v[0], 0, 192, 2))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[126:129], v[22:25], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[58], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[59], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[126:129], v[26:29], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[126:129], v[30:33], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[170:173], v[17], v[0], v[0], 0, 64, 3))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[126:129], v[34:37], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[60], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[61], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[126:129], v[38:41], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[62], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[63], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[126:129], v[42:45], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(ds_read_b128(v[174:177], v[17], v[0], v[0], 0, 192, 3))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[126:129], v[46:49], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[64], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[65], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[66], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[67], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[68], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[69], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[70], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[71], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[72], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[73], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_perm_b32_e64(v[74], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[75], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[76], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[77], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[78], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[79], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[80], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[81], v[113], v[109], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=17)
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[0:3], v[146:149], v[50:53], v[0:3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[4:7], v[146:149], v[54:57], v[4:7], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[8:11], v[146:149], v[58:61], v[8:11], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(vm=9)
|
|
k.emit(v_mfma_16x16x32(v[12:15], v[146:149], v[62:65], v[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[16:19], v[146:149], v[66:69], v[16:19], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_barrier())
|
|
k.emit(v_mfma_16x16x32(v[20:23], v[146:149], v[70:73], v[20:23], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[24:27], v[146:149], v[74:77], v[24:27], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[28:31], v[146:149], v[78:81], v[28:31], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[32:35], v[150:153], v[50:53], v[32:35], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[36:39], v[150:153], v[54:57], v[36:39], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.waitcnt(lgkm=4)
|
|
k.emit(v_mfma_16x16x32(v[40:43], v[150:153], v[58:61], v[40:43], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[44:47], v[150:153], v[62:65], v[44:47], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[18], v[86], v[82], s[85]))
|
|
k.emit(v_perm_b32_e64(v[19], v[94], v[90], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[48:51], v[150:153], v[66:69], v[48:51], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[52:55], v[150:153], v[70:73], v[52:55], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[20], v[102], v[98], s[85]))
|
|
k.emit(v_perm_b32_e64(v[21], v[110], v[106], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[56:59], v[150:153], v[74:77], v[56:59], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[60:63], v[150:153], v[78:81], v[60:63], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[22], v[86], v[82], s[86]))
|
|
k.emit(v_perm_b32_e64(v[23], v[94], v[90], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[64:67], v[154:157], v[50:53], v[64:67], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[68:71], v[154:157], v[54:57], v[68:71], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[24], v[102], v[98], s[86]))
|
|
k.emit(v_perm_b32_e64(v[25], v[110], v[106], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[72:75], v[154:157], v[58:61], v[72:75], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[76:79], v[154:157], v[62:65], v[76:79], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[26], v[87], v[83], s[85]))
|
|
k.emit(v_perm_b32_e64(v[27], v[95], v[91], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[80:83], v[154:157], v[66:69], v[80:83], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[28], v[103], v[99], s[85]))
|
|
k.emit(v_perm_b32_e64(v[29], v[111], v[107], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[84:87], v[154:157], v[70:73], v[84:87], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[30], v[87], v[83], s[86]))
|
|
k.emit(v_perm_b32_e64(v[31], v[95], v[91], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[88:91], v[154:157], v[74:77], v[88:91], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[32], v[103], v[99], s[86]))
|
|
k.emit(v_perm_b32_e64(v[33], v[111], v[107], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[92:95], v[154:157], v[78:81], v[92:95], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[96:99], v[158:161], v[50:53], v[96:99], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[34], v[88], v[84], s[85]))
|
|
k.emit(v_perm_b32_e64(v[35], v[96], v[92], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[100:103], v[158:161], v[54:57], v[100:103], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[36], v[104], v[100], s[85]))
|
|
k.emit(v_perm_b32_e64(v[37], v[112], v[108], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[104:107], v[158:161], v[58:61], v[104:107], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[108:111], v[158:161], v[62:65], v[108:111], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[38], v[88], v[84], s[86]))
|
|
k.emit(v_perm_b32_e64(v[39], v[96], v[92], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[112:115], v[158:161], v[66:69], v[112:115], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[40], v[104], v[100], s[86]))
|
|
k.emit(v_perm_b32_e64(v[41], v[112], v[108], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[116:119], v[158:161], v[70:73], v[116:119], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[120:123], v[158:161], v[74:77], v[120:123], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[42], v[89], v[85], s[85]))
|
|
k.emit(v_perm_b32_e64(v[43], v[97], v[93], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[124:127], v[158:161], v[78:81], v[124:127], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[44], v[105], v[101], s[85]))
|
|
k.emit(v_perm_b32_e64(v[45], v[113], v[109], s[85]))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[162:165], v[50:53], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[162:165], v[54:57], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[46], v[89], v[85], s[86]))
|
|
k.emit(v_perm_b32_e64(v[47], v[97], v[93], s[86]))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[162:165], v[58:61], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_perm_b32_e64(v[48], v[105], v[101], s[86]))
|
|
k.emit(v_perm_b32_e64(v[49], v[113], v[109], s[86]))
|
|
# Unrolled MFMA run expressed as a loop: accumulator groups v[140:143] .. v[252:255].
# The first source operand advances by 4 registers every 8 accumulator groups
# (v[162:165], v[166:169], v[170:173], v[174:177]); the second source operand cycles
# through v[50:53] .. v[78:81]. Emission order matches the unrolled form exactly.
for mfma_acc in range(140, 256, 4):
  mfma_a = 146 + 4 * (mfma_acc // 32)
  mfma_b = 50 + mfma_acc % 32
  k.emit(v_mfma_16x16x32(v[mfma_acc:mfma_acc + 3], v[mfma_a:mfma_a + 3], v[mfma_b:mfma_b + 3], v[mfma_acc:mfma_acc + 3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Label marking the end of the 'toPGR1' section. The scalar code that follows decides
# whether execution jumps to 'SkipTailLoopL'.
# Operands below are read in assembly order (destination first) - TODO confirm against
# the instruction definitions.
k.label('toPGR1end_OrdNLL')
|
|
# s[53] = min(s[53], s[53] ^ s[55]); s[87] is only a scratch register.
# NOTE(review): the meaning of s[53]..s[56] is not visible in this part of the file.
k.emit(s_xor_b32(s[87], s[55], s[53]))
|
|
k.emit(s_min_u32(s[53], s[53], s[87]))
|
|
# Same xor/min update for s[54], using s[56].
k.emit(s_xor_b32(s[87], s[56], s[54]))
|
|
k.emit(s_min_u32(s[54], s[54], s[87]))
|
|
# s[8] = s[23] & 63, i.e. the low six bits of s[23].
k.emit(s_and_b32(s[8], 63, s[23]))
|
|
# Conditional move: s[8] is overwritten with 0 depending on the s[61] < s[46] compare
# (unsigned) issued just before it.
k.emit(s_cmp_lt_u32(s[61], s[46]))
|
|
k.emit(s_cmov_b32(s[8], 0))
|
|
# Branch to 'SkipTailLoopL' when s[8] == 0. s[9] is zeroed between the compare and the
# branch.
# NOTE(review): this ordering relies on s_mov_b32 not modifying SCC - confirm in the ISA.
k.emit(s_cmp_eq_u32(s[8], 0))
|
|
k.emit(s_mov_b32(s[9], 0))
|
|
# The branch offset is filled in later by Kernel.to_asm from the label table.
k.emit(s_cbranch_scc1(), target='SkipTailLoopL')
|
|
# Computes a signed 64-bit value in s[88] (low) / s[89] (high) and applies it to the
# s[68:69] and s[62:63] register pairs.
# Operands are read in assembly order (destination first) - TODO confirm.
#
# Step 1: s[88] = 3 - s[78]; the sign selects which multiply path is taken.
k.emit(s_sub_i32(s[88], 3, s[78]))
|
|
k.emit(s_cmp_ge_i32(s[88], 0))
|
|
k.emit(s_cbranch_scc0(), target='Negative_LHNOKZ26V2FLOONQ')
|
|
# Non-negative path: 64-bit product s[88] * s[83]. The high half is computed first
# because s_mul_i32 overwrites s[88], which is one of the multiplicands.
k.emit(s_mul_hi_u32(s[89], s[88], s[83]))
|
|
k.emit(s_mul_i32(s[88], s[88], s[83]))
|
|
k.emit(s_branch(), target='MultiplyDone_L9DK3KJL31S8WWGN')
|
|
# Negative path: multiply the absolute value, then negate the 64-bit result.
k.label('Negative_LHNOKZ26V2FLOONQ')
|
|
k.emit(s_abs_i32(s[88], s[88]))
|
|
k.emit(s_mul_hi_u32(s[89], s[88], s[83]))
|
|
k.emit(s_mul_i32(s[88], s[88], s[83]))
|
|
# Two's-complement negate of the s[88]/s[89] pair: invert both halves, then add 1 with
# the carry propagated into the high half.
k.emit(s_xor_b32(s[88], s[88], -1))
|
|
k.emit(s_xor_b32(s[89], s[89], -1))
|
|
k.emit(s_add_u32(s[88], s[88], 1))
|
|
k.emit(s_addc_u32(s[89], s[89], 0))
|
|
# Both paths join here with the signed product in s[88]/s[89].
k.label('MultiplyDone_L9DK3KJL31S8WWGN')
|
|
# 64-bit subtract of the s[79]/s[80] pair from the product (sub + subtract-with-borrow).
k.emit(s_sub_u32(s[88], s[88], s[79]))
|
|
k.emit(s_subb_u32(s[89], s[89], s[80]))
|
|
# Add the result to s[68]/s[69] and subtract the same amount from s[62]/s[63].
# NOTE(review): s[68:71] is the resource operand of the buffer loads that follow;
# presumably s[68:69] is its base address and s[62:63] the remaining size - confirm.
k.emit(s_add_u32(s[68], s[68], s[88]))
|
|
k.emit(s_addc_u32(s[69], s[69], s[89]))
|
|
k.emit(s_sub_u32(s[62], s[62], s[88]))
|
|
k.emit(s_subb_u32(s[63], s[63], s[89]))
|
|
# s[70] = s[62] when the high word s[63] is zero, otherwise -1 (all ones).
k.emit(s_cmp_eq_u32(s[63], 0))
|
|
k.emit(s_cselect_b32(s[70], s[62], -1))
|
|
# Computes a signed 64-bit value in s[88] (low) / s[89] (high) and applies it to the
# s[72:73] and s[76:77] register pairs. The multiplier is s[84] and the subtracted
# pair is s[81]/s[82].
# Operands are read in assembly order (destination first) - TODO confirm.
#
# Step 1: s[88] = 3 - s[78]; the sign selects which multiply path is taken.
k.emit(s_sub_i32(s[88], 3, s[78]))
|
|
k.emit(s_cmp_ge_i32(s[88], 0))
|
|
k.emit(s_cbranch_scc0(), target='Negative_3U2TZUPK3AVX5ODG')
|
|
# Non-negative path: 64-bit product s[88] * s[84]. The high half is computed first
# because s_mul_i32 overwrites s[88], which is one of the multiplicands.
k.emit(s_mul_hi_u32(s[89], s[88], s[84]))
|
|
k.emit(s_mul_i32(s[88], s[88], s[84]))
|
|
k.emit(s_branch(), target='MultiplyDone_NW6XNGOG77EAT0NM')
|
|
# Negative path: multiply the absolute value, then negate the 64-bit result.
k.label('Negative_3U2TZUPK3AVX5ODG')
|
|
k.emit(s_abs_i32(s[88], s[88]))
|
|
k.emit(s_mul_hi_u32(s[89], s[88], s[84]))
|
|
k.emit(s_mul_i32(s[88], s[88], s[84]))
|
|
# Two's-complement negate of the s[88]/s[89] pair: invert both halves, then add 1 with
# the carry propagated into the high half.
k.emit(s_xor_b32(s[88], s[88], -1))
|
|
k.emit(s_xor_b32(s[89], s[89], -1))
|
|
k.emit(s_add_u32(s[88], s[88], 1))
|
|
k.emit(s_addc_u32(s[89], s[89], 0))
|
|
# Both paths join here with the signed product in s[88]/s[89].
k.label('MultiplyDone_NW6XNGOG77EAT0NM')
|
|
# 64-bit subtract of the s[81]/s[82] pair from the product (sub + subtract-with-borrow).
k.emit(s_sub_u32(s[88], s[88], s[81]))
|
|
k.emit(s_subb_u32(s[89], s[89], s[82]))
|
|
# Add the result to s[72]/s[73] and subtract the same amount from s[76]/s[77].
# NOTE(review): presumably s[72:73] is a buffer base address and s[76:77] its remaining
# size - confirm where these registers are initialised.
k.emit(s_add_u32(s[72], s[72], s[88]))
|
|
k.emit(s_addc_u32(s[73], s[73], s[89]))
|
|
k.emit(s_sub_u32(s[76], s[76], s[88]))
|
|
k.emit(s_subb_u32(s[77], s[77], s[89]))
|
|
# s[74] = s[76] when the high word s[77] is zero, otherwise -1 (all ones).
k.emit(s_cmp_eq_u32(s[77], 0))
|
|
k.emit(s_cselect_b32(s[74], s[76], -1))
|
|
# Copy s[53] into M0. M0 is the module-level alias for NULL (encoding 124 on CDNA).
k.emit(s_mov_b32(M0, s[53]))
|
|
# Kernel.waitcnt(lgkm=0) leaves vmcnt and expcnt at their maximum (63 and 7), so only
# lgkmcnt is constrained to 0 before the barrier.
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
# 64 short loads through the s[68:71] resource, emitted in the same order as the
# unrolled form. For each address register v[0] .. v[7] there are four lo/hi pairs:
#   - buffer_load_short_d16    -> v[18 + 4*addr + pair], fifth argument 4*pair
#   - buffer_load_short_d16_hi -> v[84 + 4*addr + pair], fifth argument 4*pair + 2
# NOTE(review): the fifth positional argument is presumably the immediate byte offset.
for load_addr in range(8):
  for load_pair in range(4):
    k.emit(buffer_load_short_d16(v[18 + 4 * load_addr + load_pair], v[load_addr], s[68:71], 0, 4 * load_pair, 1, 0, 0, 1))
    k.emit(buffer_load_short_d16_hi(v[84 + 4 * load_addr + load_pair], v[load_addr], s[68:71], 0, 4 * load_pair + 2, 1, 0, 0, 1))
|
|
# Wait until the vector-memory counter reaches 0, then OR v[84+i] into v[18+i] for all 32 registers.
k.waitcnt(vm=0)
for _idx in range(32):
  k.emit(v_or_b32_e32(v[18 + _idx], v[18 + _idx], v[84 + _idx]))
|
|
# NOTE(review): M0 is written twice back to back; the 133120 value is overwritten by s[54]
# before any other instruction is emitted. Kept to reproduce the original instruction stream.
k.emit(s_mov_b32(M0, 133120))
k.emit(s_mov_b32(M0, s[54]))
# 16-bit loads through s[72:75] for the address registers v[8]..v[15].
# Same even/odd layout as the s[68:71] loads, but the destinations are v[50..81] (plain variant)
# and v[84..115] (_d16_hi variant), and the call carries three extra trailing flag arguments.
for _row in range(8):
  for _step in range(8):
    _is_hi = _step % 2 == 1
    _load = buffer_load_short_d16_hi if _is_hi else buffer_load_short_d16
    _dst = (84 if _is_hi else 50) + 4 * _row + _step // 2
    k.emit(_load(v[_dst], v[8 + _row], s[72:75], 0, 2 * _step, 1, 0, 0, 0, 1, 0, 1))
|
|
# Wait until the vector-memory counter reaches 0, then OR v[84+i] into v[50+i] for all 32 registers.
k.waitcnt(vm=0)
for _idx in range(32):
  k.emit(v_or_b32_e32(v[50 + _idx], v[50 + _idx], v[84 + _idx]))
|
|
# Set M0 to 133120, wait for vector memory and synchronise before touching LDS.
k.emit(s_mov_b32(M0, 133120))
k.waitcnt(vm=0)
k.emit(s_barrier())
# v[82] and v[83] are both built from v[180]: ((v[180] & 63) << 4) plus a scalar base (s[53] / s[54]).
for _dst, _base in ((82, 53), (83, 54)):
  k.emit(v_and_b32_e32(v[_dst], 63, v[180]))
  k.emit(v_lshlrev_b32_e32(v[_dst], 4, v[_dst]))
  k.emit(v_add_u32_e32(v[_dst], s[_base], v[_dst]))
|
|
# Eight 128-bit LDS writes per operand: v[18..49] through address v[82], v[50..81] through address v[83].
# The first write of each group uses the short call form. For the others, the last two positional
# arguments are the low and high byte of chunk * stride (stride 4096 for v[82], 4224 for v[83]).
# NOTE(review): presumably these are the two 8-bit halves of the DS offset - confirm against ds_write_b128.
for _addr, _first, _stride in ((82, 18, 4096), (83, 50, 4224)):
  k.emit(ds_write_b128(v[0], v[_addr], v[_first:_first + 3]))
  for _chunk in range(1, 8):
    _hi, _lo = divmod(_chunk * _stride, 256)
    _src = _first + 4 * _chunk
    k.emit(ds_write_b128(v[0], v[_addr], v[_src:_src + 3], v[0], 0, _lo, _hi))
|
|
# Wait for the LDS writes (lgkm counter to 0) and synchronise.
k.waitcnt(lgkm=0)
k.emit(s_barrier())
# For v[16] (with v[178]) and v[17] (with v[179]): v[181] = other ^ ptr, then ptr = min_i32(ptr, v[181]).
for _ptr, _other in ((16, 178), (17, 179)):
  k.emit(v_xor_b32_e32(v[181], v[_other], v[_ptr]))
  k.emit(v_min_i32_e32(v[_ptr], v[_ptr], v[181]))
|
|
k.label('TailLoopBeginL')
# Eight 128-bit LDS reads through v[16] into v[82..113]; the final positional argument steps by 2 per read.
k.emit(ds_read_b128(v[82:85], v[16]))
for _chunk in range(1, 8):
  _dst = 82 + 4 * _chunk
  k.emit(ds_read_b128(v[_dst:_dst + 3], v[16], v[0], v[0], 0, 0, 2 * _chunk))
# Eight 128-bit LDS reads through v[17] into v[114..145]. The trailing pair alternates (0|128, chunk // 2);
# the first two reads keep their shorter original call forms so defaults are not second-guessed.
k.emit(ds_read_b128(v[114:117], v[17]))
k.emit(ds_read_b128(v[118:121], v[17], v[0], v[0], 0, 128))
for _chunk in range(2, 8):
  _dst = 114 + 4 * _chunk
  k.emit(ds_read_b128(v[_dst:_dst + 3], v[17], v[0], v[0], 0, 128 * (_chunk % 2), _chunk // 2))
# Advance both read pointers via the scratch scalar s[87]: v[16] += 16384, v[17] += 64.
for _ptr, _advance in ((16, 16384), (17, 64)):
  k.emit(s_mov_b32(s[87], _advance))
  k.emit(v_add_co_u32_e32(v[_ptr], s[87], v[_ptr]))
k.waitcnt(lgkm=0)
|
|
# 32 byte-permutes that fill v[18..49] from the v[82..113] registers just read from LDS.
# For each word index (0..3) and each selector (s[85], then s[86]) four permutes are emitted,
# one per source pair (v[86+w], v[82+w]), (v[94+w], v[90+w]), (v[102+w], v[98+w]), (v[110+w], v[106+w]).
for _word in range(4):
  for _half, _sel in enumerate((85, 86)):
    for _pair in range(4):
      _lo = 82 + 8 * _pair + _word
      _dst = 18 + 8 * _word + 4 * _half + _pair
      k.emit(v_perm_b32_e64(v[_dst], v[_lo + 4], v[_lo], s[_sel]))
|
|
# v[181] = ((v[180] & 63) >> 4) << 3
k.emit(v_and_b32_e32(v[181], 63, v[180]))
k.emit(v_lshrrev_b32_e32(v[181], 4, v[181]))
k.emit(v_lshlrev_b32_e32(v[181], 3, v[181]))
# Two compare/select passes over v[18..49]. Pass 1 sets v[182] = v[181] + 0 and covers registers
# 18, 22, ... then 19, 23, ...; pass 2 sets v[182] += 4 and covers 20, 24, ... then 21, 25, ....
# Each v_cndmask picks between the register and the constant 0 using the v[182] >= s[8] mask in s[88:89].
for _src, _inc, _lanes in ((181, 0, (0, 1)), (182, 4, (2, 3))):
  k.emit(v_add_u32_e64(v[182], v[_src], _inc))
  k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8]))
  for _lane in _lanes:
    for _reg in range(18 + _lane, 50, 4):
      k.emit(v_cndmask_b32_e64(v[_reg], v[_reg], 0, s[88:89]))
|
|
# v[181] = ((v[180] & 63) >> 4) << 3
k.emit(v_and_b32_e32(v[181], 63, v[180]))
k.emit(v_lshrrev_b32_e32(v[181], 4, v[181]))
k.emit(v_lshlrev_b32_e32(v[181], 3, v[181]))
# Two compare/select passes over v[114..145]. Pass 1 sets v[182] = v[181] + 0 and covers registers
# 114, 118, ... then 115, 119, ...; pass 2 sets v[182] += 4 and covers 116, 120, ... then 117, 121, ....
# Each v_cndmask picks between the register and the constant 0 using the v[182] >= s[8] mask in s[88:89].
for _src, _inc, _lanes in ((181, 0, (0, 1)), (182, 4, (2, 3))):
  k.emit(v_add_u32_e64(v[182], v[_src], _inc))
  k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8]))
  for _lane in _lanes:
    for _reg in range(114 + _lane, 146, 4):
      k.emit(v_cndmask_b32_e64(v[_reg], v[_reg], 0, s[88:89]))
|
|
# Skip the whole section when (s[23] & 7) == 0.
k.emit(s_and_b32(s[87], s[23], 7))
k.emit(s_cmp_eq_u32(s[87], 0))
k.emit(s_cbranch_scc1(), target='TailLoop_SkipZeroOutMask_0FMPG10PI1CDGWZ9')
# Shift amount: s[87] = (8 - (s[8] & 7)) << 4.
k.emit(s_and_b32(s[87], s[8], 7))
k.emit(s_sub_u32(s[87], 8, s[87]))
k.emit(s_lshl_b32(s[87], s[87], 4))
# Sixteen identical 12-instruction groups, one per 4-register operand: v[18..49] first, then v[114..145].
# Each group shifts the operand's two 64-bit halves into v[184:187], then, register by register,
# recomputes the v[182] >= s[8] mask and selects between the original and the shifted value.
for _first in (18, 114):
  for _base in range(_first, _first + 32, 4):
    k.emit(v_lshlrev_b64(v[184:185], s[87], v[_base:_base + 1]))
    k.emit(v_lshlrev_b64(v[186:187], s[87], v[_base + 2:_base + 3]))
    # First register pair compares against v[181] + 4, second pair against that value + 4 again.
    for _half, _src in ((0, 181), (2, 182)):
      k.emit(v_add_u32_e64(v[182], v[_src], 4))
      for _off in (_half, _half + 1):
        k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8]))
        k.emit(v_cndmask_b32_e64(v[_base + _off], v[_base + _off], v[184 + _off], s[88:89]))
k.label('TailLoop_SkipZeroOutMask_0FMPG10PI1CDGWZ9')
|
|
k.emit(s_nop(1))
# First 32 MFMA tiles: for each of the four 4-register operands v[114:117] .. v[126:129], iterate
# over the eight 4-register operands v[18:21] .. v[46:49]. Tile t uses v[4t:4t+3] as both the
# destination and the accumulator input. Register slices here are written with inclusive end indices.
for _row in range(4):
  for _col in range(8):
    _acc = 4 * (8 * _row + _col)
    _lhs = 114 + 4 * _row
    _rhs = 18 + 4 * _col
    k.emit(v_mfma_16x16x32(v[_acc:_acc + 3], v[_lhs:_lhs + 3], v[_rhs:_rhs + 3], v[_acc:_acc + 3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[128:131], v[130:133], v[18:21], v[128:131], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[132:135], v[130:133], v[22:25], v[132:135], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[136:139], v[130:133], v[26:29], v[136:139], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[140:143], v[130:133], v[30:33], v[140:143], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[144:147], v[130:133], v[34:37], v[144:147], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[148:151], v[130:133], v[38:41], v[148:151], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[152:155], v[130:133], v[42:45], v[152:155], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[156:159], v[130:133], v[46:49], v[156:159], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[160:163], v[134:137], v[18:21], v[160:163], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[164:167], v[134:137], v[22:25], v[164:167], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[168:171], v[134:137], v[26:29], v[168:171], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[172:175], v[134:137], v[30:33], v[172:175], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[176:179], v[134:137], v[34:37], v[176:179], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[180:183], v[134:137], v[38:41], v[180:183], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[184:187], v[134:137], v[42:45], v[184:187], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[188:191], v[134:137], v[46:49], v[188:191], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[192:195], v[138:141], v[18:21], v[192:195], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[196:199], v[138:141], v[22:25], v[196:199], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[200:203], v[138:141], v[26:29], v[200:203], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[204:207], v[138:141], v[30:33], v[204:207], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[208:211], v[138:141], v[34:37], v[208:211], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[212:215], v[138:141], v[38:41], v[212:215], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[216:219], v[138:141], v[42:45], v[216:219], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[220:223], v[138:141], v[46:49], v[220:223], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[224:227], v[142:145], v[18:21], v[224:227], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[228:231], v[142:145], v[22:25], v[228:231], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[232:235], v[142:145], v[26:29], v[232:235], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[236:239], v[142:145], v[30:33], v[236:239], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[240:243], v[142:145], v[34:37], v[240:243], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[244:247], v[142:145], v[38:41], v[244:247], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[248:251], v[142:145], v[42:45], v[248:251], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mfma_16x16x32(v[252:255], v[142:145], v[46:49], v[252:255], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_sub_i32(s[8], s[8], 32))
|
|
k.emit(s_add_u32(s[9], s[9], 32))
|
|
k.emit(s_cmp_le_i32(s[8], 0))
|
|
k.emit(s_cbranch_scc1(), target='TailLoopEndL')
|
|
# Second half of the tail loop: issue the 128-bit LDS reads for both operands.
# Eight reads addressed by v[16] fill v[82:113]; the last positional argument steps by 2.
k.emit(ds_read_b128(v[82:85], v[16]))
for read_idx in range(1, 8):
  dst_lo = 82 + 4 * read_idx
  k.emit(ds_read_b128(v[dst_lo:dst_lo + 3], v[16], v[0], v[0], 0, 0, 2 * read_idx))
# Eight reads addressed by v[17] fill v[146:177]; they come in pairs (0 / 128 in the
# sixth argument) with the seventh argument counting 1..3 for the later pairs.
k.emit(ds_read_b128(v[146:149], v[17]))
k.emit(ds_read_b128(v[150:153], v[17], v[0], v[0], 0, 128))
for pair_idx in range(1, 4):
  pair_lo = 146 + 8 * pair_idx
  k.emit(ds_read_b128(v[pair_lo:pair_lo + 3], v[17], v[0], v[0], 0, 0, pair_idx))
  k.emit(ds_read_b128(v[pair_lo + 4:pair_lo + 7], v[17], v[0], v[0], 0, 128, pair_idx))
# advance both read addresses for the next pass: v[16] += 16384, v[17] += 64
k.emit(s_mov_b32(s[87], 16384))
k.emit(v_add_co_u32_e32(v[16], s[87], v[16]))
k.emit(s_mov_b32(s[87], 64))
k.emit(v_add_co_u32_e32(v[17], s[87], v[17]))
# wait for the LDS reads issued above before their results are consumed
k.waitcnt(lgkm=0)
|
|
# Repack the freshly read v[82:113] into v[50:81] with 32 v_perm_b32 instructions.
# For each of the 4 dwords of a 128-bit read, the selector in s[85] and then the one in
# s[86] is applied to the 4 register pairs (v[lo+4], v[lo]) with lo = 82, 90, 98, 106 (+dword).
perm_dst = 50
for dword_idx in range(4):
  for selector_reg in (85, 86):
    for pair_idx in range(4):
      pair_lo = 82 + 8 * pair_idx + dword_idx
      k.emit(v_perm_b32_e64(v[perm_dst], v[pair_lo + 4], v[pair_lo], s[selector_reg]))
      perm_dst += 1
|
|
# Zero-out masking of both operand register blocks (v[50:81], then v[146:177]).
# For each block: v[181] = ((v[180] & 63) >> 4) << 3, then two passes where v[182] is
# compared against s[8] (mask in s[88:89]) and sixteen v_cndmask_b32 instructions select
# between each register and the constant 0 under that mask.
for block_lo in (50, 146):
  k.emit(v_and_b32_e32(v[181], 63, v[180]))
  k.emit(v_lshrrev_b32_e32(v[181], 4, v[181]))
  k.emit(v_lshlrev_b32_e32(v[181], 3, v[181]))
  for pass_idx in range(2):
    if pass_idx == 0:
      k.emit(v_add_u32_e64(v[182], v[181], 0))
    else:
      k.emit(v_add_u32_e64(v[182], v[182], 4))
    k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8]))
    # first pass covers dwords 0 and 1 of every quad, second pass covers dwords 2 and 3
    for first_reg in (block_lo + 2 * pass_idx, block_lo + 2 * pass_idx + 1):
      for masked_reg in range(first_reg, first_reg + 32, 4):
        k.emit(v_cndmask_b32_e64(v[masked_reg], v[masked_reg], 0, s[88:89]))
|
|
# Partial-element handling: skipped entirely when (s[23] & 7) == 0.
k.emit(s_and_b32(s[87], s[23], 7))
k.emit(s_cmp_eq_u32(s[87], 0))
k.emit(s_cbranch_scc1(), target='TailLoop_SkipZeroOutMask_YVWB1RHZO1Z7SCZY')
# shift amount: s[87] = (8 - (s[8] & 7)) << 4
k.emit(s_and_b32(s[87], s[8], 7))
k.emit(s_sub_u32(s[87], 8, s[87]))
k.emit(s_lshl_b32(s[87], s[87], 4))
# Sixteen identical 12-instruction groups, one per register quad of v[50:81] and v[146:177]:
# both 64-bit halves of the quad are shifted left by s[87] into the scratch v[184:187], then
# each of the four registers is merged with its shifted copy under a freshly computed
# v[182] >= s[8] mask (v[182] = v[181] + 4 for the low half, + 4 again for the high half).
for quad_lo in (*range(50, 82, 4), *range(146, 178, 4)):
  k.emit(v_lshlrev_b64(v[184:185], s[87], v[quad_lo:quad_lo + 1]))
  k.emit(v_lshlrev_b64(v[186:187], s[87], v[quad_lo + 2:quad_lo + 3]))
  for half_off in (0, 2):
    k.emit(v_add_u32_e64(v[182], v[181] if half_off == 0 else v[182], 4))
    for dword_off in (half_off, half_off + 1):
      # the compare is re-emitted before every select, exactly as in the generated kernel
      k.emit(v_cmp_ge_i32_e64(s[88:89], v[182], s[8]))
      k.emit(v_cndmask_b32_e64(v[quad_lo + dword_off], v[quad_lo + dword_off], v[184 + dword_off], s[88:89]))
|
|
# Second MFMA phase of the tail loop: 8 x 8 = 64 v_mfma_16x16x32 instructions.
# Row r uses v[146+4r : 149+4r] as first source, column c uses v[50+4c : 53+4c] as second
# source, and each (r, c) pair accumulates in place into the quad starting at 32*r + 4*c.
k.label('TailLoop_SkipZeroOutMask_YVWB1RHZO1Z7SCZY')
k.emit(s_nop(1))
for mfma_row in range(8):
  row_src_lo = 146 + 4 * mfma_row
  for mfma_col in range(8):
    col_src_lo = 50 + 4 * mfma_col
    accum_lo = 32 * mfma_row + 4 * mfma_col
    # trailing flag arguments are reproduced verbatim from the generated kernel
    k.emit(v_mfma_16x16x32(v[accum_lo:accum_lo + 3], v[row_src_lo:row_src_lo + 3], v[col_src_lo:col_src_lo + 3],
                           v[accum_lo:accum_lo + 3], 0, 0, 1, 0, 0, 0, 0, 0, 1))
# loop bookkeeping: s[8] -= 32, s[9] += 32, go back to TailLoopBeginL while s[8] > 0
k.emit(s_sub_i32(s[8], s[8], 32))
k.emit(s_add_u32(s[9], s[9], 32))
k.emit(s_cmp_le_i32(s[8], 0))
k.emit(s_cbranch_scc0(), target='TailLoopBeginL')
|
|
# Exit point of the tail loop (target of the s_cbranch_scc1 emitted after the first MFMA phase;
# the second phase falls through to here when its backward branch is not taken).
k.label('TailLoopEndL')
|
|
# v[16] -= s[9] * 512. The tail-loop body visible above adds 16384 (= 32 * 512) to v[16]
# each time it adds 32 to s[9], so this presumably rewinds the v[16] read address.
k.emit(s_mov_b32(s[87], 512))
|
|
k.emit(s_mul_i32(s[87], s[9], s[87]))
|
|
k.emit(v_sub_u32_e64(v[16], v[16], s[87]))
|
|
# v[17] -= s[9] * 2. Same pattern for v[17], which the loop body advances by 64 (= 32 * 2).
k.emit(s_mov_b32(s[87], 2))
|
|
k.emit(s_mul_i32(s[87], s[9], s[87]))
|
|
k.emit(v_sub_u32_e64(v[17], v[17], s[87]))
|
|
# Start of the post-tail-loop section. The instructions below compute the values held in
# v[21]..v[25], which the ShiftVectorComponents0 dispatch that follows compares against.
# NOTE(review): register roles are not documented in this part of the file; the comments
# describe only the operands visible here.
k.label('SkipTailLoopL')
|
|
k.emit(s_setprio())
|
|
# clear s[68:69] and s[72]
k.emit(s_mov_b64(s[68:69], 0))
|
|
k.emit(s_mov_b32(s[72], 0))
|
|
# v[21] = s[2] * 4294967040 + s[20]
# (4294967040 == 0xFFFFFF00; presumably acts as -256 in the 24-bit signed multiply - TODO confirm)
k.emit(v_mov_b32_e32(v[21], s[2]))
|
|
k.emit(v_mul_i32_i24_e32(v[21], 4294967040, v[21]))
|
|
k.emit(v_add_co_u32_e32(v[21], s[20], v[21]))
|
|
# v[22] = 256; select between v[22] and v[21] under the v[21] < v[22] mask.
# Note s[8:9] (the tail-loop counters above) is reused here as the mask register pair.
# presumably this clamps v[21] to at most 256 - TODO confirm v_cndmask operand order
k.emit(v_mov_b32_e32(v[22], 256))
|
|
k.emit(v_cmp_lt_u32_e64(s[8:9], v[21], v[22]))
|
|
k.emit(v_cndmask_b32_e64(v[21], v[22], v[21], s[8:9]))
|
|
# v[23] = (v[180] >> 6) & 1
k.emit(v_lshrrev_b32_e32(v[23], 6, v[180]))
|
|
k.emit(v_and_b32_e32(v[23], 1, v[23]))
|
|
# v[24] = (v[21] >> 7) & 1; second select between v[22] and v[21], under the v[24] == v[23] mask
k.emit(v_lshrrev_b32_e32(v[24], 7, v[21]))
|
|
k.emit(v_and_b32_e32(v[24], 1, v[24]))
|
|
k.emit(v_cmp_eq_u32_e64(s[8:9], v[24], v[23]))
|
|
k.emit(v_cndmask_b32_e64(v[21], v[22], v[21], s[8:9]))
|
|
# v[22] = (v[21] >> 7) - (v[23] << 0); compared against 0 under each GLVWn label
k.emit(v_lshrrev_b32_e32(v[22], 7, v[21]))
|
|
k.emit(v_lshlrev_b32_e32(v[24], 0, v[23]))
|
|
k.emit(v_sub_u32_e32(v[22], v[22], v[24]))
|
|
# v[24] = (v[21] >> 3) - ((((v[180] >> 0) & 15) << 3 >> 3) + (v[23] << 4))
k.emit(v_lshrrev_b32_e32(v[24], 3, v[21]))
|
|
k.emit(v_lshrrev_b32_e32(v[25], 0, v[180]))
|
|
k.emit(v_and_b32_e32(v[25], 15, v[25]))
|
|
k.emit(v_lshlrev_b32_e32(v[25], 3, v[25]))
|
|
k.emit(v_lshrrev_b32_e32(v[25], 3, v[25]))
|
|
k.emit(v_lshlrev_b32_e32(v[23], 4, v[23]))
|
|
k.emit(v_add_co_u32_e32(v[25], v[23], v[25]))
|
|
k.emit(v_sub_u32_e32(v[24], v[24], v[25]))
|
|
# v[23] = (v[21] & 7) >> 3; compared against 0 under each GLVWn_BM0 label
k.emit(v_and_b32_e32(v[23], 7, v[21]))
|
|
k.emit(v_lshrrev_b32_e32(v[23], 3, v[23]))
|
|
# v[25] = v[21] & 7; selects the ShiftVectorComponents0_GLVWn branch taken next
k.emit(v_and_b32_e32(v[25], 7, v[21]))
|
|
# Dispatch on v[25]: values 1..7 branch to ShiftVectorComponents0_GLVW1..7 in turn,
# anything else takes the unconditional branch to ShiftVectorComponents0_GLVW0.
for glvw_value in range(1, 8):
  k.emit(v_cmp_eq_u32_e64(VCC, v[25], glvw_value))
  k.emit(s_cbranch_vccnz(), target=f'ShiftVectorComponents0_GLVW{glvw_value}')
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# Second dispatch level: one stanza per label 'ShiftVectorComponents0_GLVW1'
# through '..._GLVW7'.  Each stanza tests v[22] == 0 and conditionally branches
# to the corresponding '<label>_BM0'.  No unconditional branch is emitted
# between stanzas, so the stanzas sit back to back in the instruction stream
# exactly as in the unrolled form.
for _glvw in range(1, 8):
  _here = f'ShiftVectorComponents0_GLVW{_glvw}'
  k.label(_here)
  k.emit(v_cmp_eq_u32_e64(VCC, v[22], 0))
  k.emit(s_cbranch_vccnz(), target=_here + '_BM0')
|
|
# Third dispatch level: one stanza per label 'ShiftVectorComponents0_GLVW1_BM0'
# through '..._GLVW7_BM0'.  Each stanza tests v[23] == 0 and conditionally
# branches to the corresponding '<label>_VW0'.  As in the unrolled form, the
# stanzas are emitted back to back with nothing in between.
for _glvw in range(1, 8):
  _here = f'ShiftVectorComponents0_GLVW{_glvw}_BM0'
  k.label(_here)
  k.emit(v_cmp_eq_u32_e64(VCC, v[23], 0))
  k.emit(s_cbranch_vccnz(), target=_here + '_VW0')
|
|
# ---- ShiftVectorComponents0_GLVW1_BM0_VW0 ------------------------------------
# Prologue: s[8] <- 0, v_cmpx compare of v[24] against s[8] (destination
# s[8:9]), then v[18] <- (v[180] & 63) << 2.
# NOTE(review): v_cmpx presumably restricts the active lanes for the moves
# below -- confirm against the CDNA ISA guide.
k.label('ShiftVectorComponents0_GLVW1_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
# Register shuffle through the temporary v[25].  For each of the 4 row offsets
# and each of the 8 groups of 32 registers (bases 0, 32, ..., 224):
#   v_accvgpr_read  v[25]         <- index base + 28 + row
#   s_nop(1)
#   v_accvgpr_write base + row    <- v[25]
# This emits the same 32 read/nop/write triples, in the same order, as the
# unrolled listing.
for _row in range(4):
  for _base in range(0, 256, 32):
    k.emit(v_accvgpr_read(v[25], v[_base + 28 + _row]))
    k.emit(s_nop(1))
    k.emit(v_accvgpr_write(v[_base + _row], v[25]))
# Epilogue: s[8:9] <- -1, s_or_saveexec_b64 into VCC, then jump to the common
# 'ShiftVectorComponents0_GLVW0' label.
# NOTE(review): presumably this re-enables all lanes after the v_cmpx above.
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# ---- ShiftVectorComponents0_GLVW2_BM0_VW0 ------------------------------------
# Prologue: s[8] <- 0, v_cmpx compare of v[24] against s[8] (destination
# s[8:9]), then v[18] <- (v[180] & 63) << 2.
# NOTE(review): v_cmpx presumably restricts the active lanes for the moves
# below -- confirm against the CDNA ISA guide.
k.label('ShiftVectorComponents0_GLVW2_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
# Register shuffle through two temporaries v[25], v[26].  For each of the 4 row
# offsets and each of the 8 groups of 32 registers (bases 0, 32, ..., 224):
#   v_accvgpr_read  v[25 + j]              <- index base + 24 + 4*j + row   (j = 0, 1)
#   s_nop(1)
#   v_accvgpr_write base + 4*j + row       <- v[25 + j]                     (j = 0, 1)
# Both reads are emitted before the nop and both writes after it, matching the
# unrolled listing instruction for instruction.
for _row in range(4):
  for _base in range(0, 256, 32):
    for _j in range(2):
      k.emit(v_accvgpr_read(v[25 + _j], v[_base + 24 + 4 * _j + _row]))
    k.emit(s_nop(1))
    for _j in range(2):
      k.emit(v_accvgpr_write(v[_base + 4 * _j + _row], v[25 + _j]))
# Epilogue: s[8:9] <- -1, s_or_saveexec_b64 into VCC, then jump to the common
# 'ShiftVectorComponents0_GLVW0' label.
# NOTE(review): presumably this re-enables all lanes after the v_cmpx above.
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# ---- ShiftVectorComponents0_GLVW3_BM0_VW0 ------------------------------------
# Prologue: s[8] <- 0, v_cmpx compare of v[24] against s[8] (destination
# s[8:9]), then v[18] <- (v[180] & 63) << 2.
# NOTE(review): v_cmpx presumably restricts the active lanes for the moves
# below -- confirm against the CDNA ISA guide.
k.label('ShiftVectorComponents0_GLVW3_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
# Register shuffle through three temporaries v[25]..v[27].  For each of the 4
# row offsets and each of the 8 groups of 32 registers (bases 0, 32, ..., 224):
#   v_accvgpr_read  v[25 + j]              <- index base + 20 + 4*j + row   (j = 0..2)
#   s_nop(1)
#   v_accvgpr_write base + 4*j + row       <- v[25 + j]                     (j = 0..2)
# All reads are emitted before the nop and all writes after it, matching the
# unrolled listing instruction for instruction.
for _row in range(4):
  for _base in range(0, 256, 32):
    for _j in range(3):
      k.emit(v_accvgpr_read(v[25 + _j], v[_base + 20 + 4 * _j + _row]))
    k.emit(s_nop(1))
    for _j in range(3):
      k.emit(v_accvgpr_write(v[_base + 4 * _j + _row], v[25 + _j]))
# Epilogue: s[8:9] <- -1, s_or_saveexec_b64 into VCC, then jump to the common
# 'ShiftVectorComponents0_GLVW0' label.
# NOTE(review): presumably this re-enables all lanes after the v_cmpx above.
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# ---- ShiftVectorComponents0_GLVW4_BM0_VW0 ------------------------------------
# Prologue: s[8] <- 0, v_cmpx compare of v[24] against s[8] (destination
# s[8:9]), then v[18] <- (v[180] & 63) << 2.
# NOTE(review): v_cmpx presumably restricts the active lanes for the moves
# below -- confirm against the CDNA ISA guide.
k.label('ShiftVectorComponents0_GLVW4_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
# Register shuffle through four temporaries v[25]..v[28].  For each of the 4
# row offsets and each of the 8 groups of 32 registers (bases 0, 32, ..., 224):
#   v_accvgpr_read  v[25 + j]              <- index base + 16 + 4*j + row   (j = 0..3)
#   s_nop(1)
#   v_accvgpr_write base + 4*j + row       <- v[25 + j]                     (j = 0..3)
# All reads are emitted before the nop and all writes after it, matching the
# unrolled listing instruction for instruction.
for _row in range(4):
  for _base in range(0, 256, 32):
    for _j in range(4):
      k.emit(v_accvgpr_read(v[25 + _j], v[_base + 16 + 4 * _j + _row]))
    k.emit(s_nop(1))
    for _j in range(4):
      k.emit(v_accvgpr_write(v[_base + 4 * _j + _row], v[25 + _j]))
# Epilogue: s[8:9] <- -1, s_or_saveexec_b64 into VCC, then jump to the common
# 'ShiftVectorComponents0_GLVW0' label.
# NOTE(review): presumably this re-enables all lanes after the v_cmpx above.
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# ShiftVectorComponents0_GLVW5_BM0_VW0
# Emits exactly the instruction stream of the unrolled listing, but generated by
# loops. After the four setup instructions, for every lane offset (0..3, outer)
# and every 32-register tile (0, 32, ..., 224, inner) one group is emitted:
#   * five v_accvgpr_read calls pairing v[25]..v[29] with registers
#     base+12, base+16, ..., base+28
#   * one s_nop(1)
#   * five v_accvgpr_write calls pairing registers base+0, base+4, ..., base+16
#     with v[25]..v[29]
# where base = tile + lane.
# NOTE(review): presumably this shifts accumulator values down by three
# stride-4 slots for the lanes selected by v_cmpx -- confirm against the ISA docs.
k.label('ShiftVectorComponents0_GLVW5_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
_glvw5_count = 5                          # registers moved per group
_glvw5_src_off = 4 * (8 - _glvw5_count)   # first source register offset (12)
for _glvw5_lane in range(4):
  for _glvw5_tile in range(0, 256, 32):
    _glvw5_base = _glvw5_tile + _glvw5_lane
    # stage the source registers through the temporaries v[25]..v[29]
    for _glvw5_j in range(_glvw5_count):
      k.emit(v_accvgpr_read(v[25 + _glvw5_j], v[_glvw5_base + _glvw5_src_off + 4 * _glvw5_j]))
    k.emit(s_nop(1))
    # write the temporaries back starting at the group's first register
    for _glvw5_j in range(_glvw5_count):
      k.emit(v_accvgpr_write(v[_glvw5_base + 4 * _glvw5_j], v[25 + _glvw5_j]))
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
# ShiftVectorComponents0_GLVW6_BM0_VW0
# Emits exactly the instruction stream of the unrolled listing, but generated by
# loops. After the four setup instructions, for every lane offset (0..3, outer)
# and every 32-register tile (0, 32, ..., 224, inner) one group is emitted:
#   * six v_accvgpr_read calls pairing v[25]..v[30] with registers
#     base+8, base+12, ..., base+28
#   * one s_nop(1)
#   * six v_accvgpr_write calls pairing registers base+0, base+4, ..., base+20
#     with v[25]..v[30]
# where base = tile + lane.
# NOTE(review): presumably this shifts accumulator values down by two
# stride-4 slots for the lanes selected by v_cmpx -- confirm against the ISA docs.
k.label('ShiftVectorComponents0_GLVW6_BM0_VW0')
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
_glvw6_count = 6                          # registers moved per group
_glvw6_src_off = 4 * (8 - _glvw6_count)   # first source register offset (8)
for _glvw6_lane in range(4):
  for _glvw6_tile in range(0, 256, 32):
    _glvw6_base = _glvw6_tile + _glvw6_lane
    # stage the source registers through the temporaries v[25]..v[30]
    for _glvw6_j in range(_glvw6_count):
      k.emit(v_accvgpr_read(v[25 + _glvw6_j], v[_glvw6_base + _glvw6_src_off + 4 * _glvw6_j]))
    k.emit(s_nop(1))
    # write the temporaries back starting at the group's first register
    for _glvw6_j in range(_glvw6_count):
      k.emit(v_accvgpr_write(v[_glvw6_base + 4 * _glvw6_j], v[25 + _glvw6_j]))
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
k.emit(s_branch(), target='ShiftVectorComponents0_GLVW0')
|
|
k.label('ShiftVectorComponents0_GLVW7_BM0_VW0')
# Compare v[24] against s[8] (= 0) with a cmpx instruction (presumably this narrows EXEC
# to the matching lanes -- TODO confirm), then compute v[18] = (v[180] & 63) << 2.
k.emit(s_mov_b32(s[8], 0))
k.emit(v_cmpx_eq_u32_e64(s[8:9], v[24], s[8]))
k.emit(v_and_b32_e32(v[18], 63, v[180]))
k.emit(v_lshlrev_b32_e32(v[18], 2, v[18]))
# Shift the accumulators: for each column offset 0..3 and each base 0, 32, ..., 224,
# the seven AccVGPRs first+4, first+8, ..., first+28 are copied to first, first+4, ...,
# first+24 (each value moves down by 4 registers), staged through v[25]..v[31].
for _col in range(4):
  for _base in range(0, 256, 32):
    _first = _base + _col
    for _slot in range(7):
      k.emit(v_accvgpr_read(v[25 + _slot], v[_first + 4 + 4 * _slot]))
    # every group keeps an s_nop(1) between its reads and its writes
    # (presumably required spacing between the two -- TODO confirm)
    k.emit(s_nop(1))
    for _slot in range(7):
      k.emit(v_accvgpr_write(v[_first + 4 * _slot], v[25 + _slot]))
# Load all-ones into s[8:9] and OR it into EXEC (s_or_saveexec_b64 is given VCC as its
# destination) before falling through to the next label.
k.emit(s_mov_b64(s[8:9], -1))
k.emit(s_or_saveexec_b64(VCC, s[8:9]))
|
|
# Shared continuation point: the shift path directly above falls through to this label
# and an earlier shift path reaches it through an s_branch.
k.label('ShiftVectorComponents0_GLVW0')
|
|
# Derive per-lane offsets from v[180] (presumably the workitem serial id -- TODO confirm).
# v[22] = v[180] >> 6; v[23] = (v[22] >> 1) * 16
k.emit(v_lshrrev_b32_e32(v[22], 6, v[180]))
|
|
k.emit(v_lshrrev_b32_e32(v[23], 1, v[22]))
|
|
k.emit(v_mul_lo_u32(v[23], 16, v[23]))
|
|
# v[19] = (v[23] + (((v[180] & 63) >> 4) << 2)) << 3
k.emit(v_and_b32_e32(v[19], 63, v[180]))
|
|
k.emit(v_lshrrev_b32_e32(v[19], 4, v[19]))
|
|
k.emit(v_lshlrev_b32_e32(v[19], 2, v[19]))
|
|
k.emit(v_add_lshl_u32_e64(v[19], v[23], v[19], 3))
|
|
# v[20] = v[19] * s[38]; v[21] = v[19] * s[36]
# (presumably row offsets scaled by two strides -- TODO confirm)
k.emit(v_mul_lo_u32(v[20], v[19], s[38]))
|
|
k.emit(v_mul_lo_u32(v[21], v[19], s[36]))
|
|
# v[18] = ((v[180] & 15) + (v[22] & 1) * 16) << 3
k.emit(v_and_b32_e32(v[18], 1, v[22]))
|
|
k.emit(v_mul_lo_u32(v[18], 16, v[18]))
|
|
k.emit(v_and_b32_e32(v[23], 15, v[180]))
|
|
k.emit(v_add_lshl_u32_e64(v[18], v[23], v[18], 3))
|
|
# Add 256 * s[2] to v[18] and 256 * s[3] to v[19]
# (presumably the workgroup tile offsets -- TODO confirm).
k.emit(s_mul_i32(s[8], 256, s[2]))
|
|
k.emit(v_add_u32_e32(v[18], s[8], v[18]))
|
|
k.emit(s_mul_i32(s[8], 256, s[3]))
|
|
k.emit(v_add_u32_e32(v[19], s[8], v[19]))
|
|
# Wait until the LGKM counter reaches 0.
k.waitcnt(lgkm=0)
|
|
# NOTE(review): the s[8] value produced by the next four instructions is overwritten by
# the following s_mul_i32(s[8], 256, s[2]) without being read in between.
k.emit(s_add_u32(s[8], s[4], 1))
|
|
k.emit(s_mul_i32(s[8], s[73], s[8]))
|
|
k.emit(s_cmp_eq_u32(s[8], 0))
|
|
k.emit(s_cselect_b32(s[8], s[20], s[8]))
|
|
# s[90] and s[91] are the upper half of the s[88:91] operand used by the buffer loads
# below (presumably a buffer resource descriptor -- TODO confirm). s[90] starts at 0 and
# is only multiplied afterwards (by 4, then by 2), so it stays 0 in this section.
k.emit(s_mov_b32(s[91], 131072))
|
|
k.emit(s_mov_b32(s[90], 0))
|
|
# First load: v[24] = (s[73] * s[4] + 256 * s[2] + v[180]) << 2 is the offset for a
# dword load into v[22].
k.emit(s_mul_i32(s[8], 256, s[2]))
|
|
k.emit(v_add_u32_e32(v[26], s[8], v[180]))
|
|
k.emit(s_mul_i32(s[90], 4, s[90]))
|
|
k.emit(s_mul_i32(s[8], s[73], s[4]))
|
|
k.emit(v_add_u32_e32(v[24], s[8], v[26]))
|
|
k.emit(v_lshlrev_b32_e32(v[24], 2, v[24]))
|
|
# NOTE(review): this v[26] (256 * s[3] + v[180]) is replaced by v[180] << 2 right after
# the load without being read.
k.emit(s_mul_i32(s[8], 256, s[3]))
|
|
k.emit(v_add_u32_e32(v[26], s[8], v[180]))
|
|
k.emit(buffer_load_dword(v[22], v[24], s[88:91], 0, 0, 1))
|
|
# v[26] = v[180] << 2 is the LDS address used by the two ds_write_b32 below.
k.emit(v_lshlrev_b32_e32(v[26], 2, v[180]))
|
|
# Barrier, then wait for the buffer load (vmcnt == 0) before its result is written to LDS.
k.emit(s_barrier())
|
|
k.waitcnt(vm=0)
|
|
# Write the loaded v[22] to LDS at v[26], then the constant 1.0 at the same address with
# trailing offset arguments (0, 4) (presumably a byte offset of 1024 -- TODO confirm
# against the DS encoding).
k.emit(ds_write_b32(v[0], v[26], v[22]))
|
|
k.emit(v_mov_b32_e32(v[23], 1.0))
|
|
k.emit(ds_write_b32(v[0], v[26], v[23], v[0], 0, 0, 4))
|
|
# Second load: same address arithmetic, but with a shift of 1 instead of 2 and a 16-bit
# load (buffer_load_short_d16); the result goes through v_cvt with SDWA and is written to
# the same LDS addresses as above.
# NOTE(review): both load sequences are emitted back to back with no branch between them,
# so the second overwrites what the first stored -- confirm this is intended.
k.emit(s_mul_i32(s[8], 256, s[2]))
|
|
k.emit(v_add_u32_e32(v[26], s[8], v[180]))
|
|
k.emit(s_mul_i32(s[90], 2, s[90]))
|
|
k.emit(s_mul_i32(s[8], s[73], s[4]))
|
|
k.emit(v_add_u32_e32(v[24], s[8], v[26]))
|
|
k.emit(v_lshlrev_b32_e32(v[24], 1, v[24]))
|
|
k.emit(s_mul_i32(s[8], 256, s[3]))
|
|
k.emit(v_add_u32_e32(v[26], s[8], v[180]))
|
|
k.emit(buffer_load_short_d16(v[22], v[24], s[88:91], 0, 0, 1))
|
|
k.emit(v_lshlrev_b32_e32(v[26], 2, v[180]))
|
|
k.emit(s_barrier())
|
|
k.waitcnt(vm=0)
|
|
k.emit(v_cvt(v[22], SDWA, v[22], 0, 0, 0, 0, 0, 0, 6, 2, 4))
|
|
k.emit(ds_write_b32(v[0], v[26], v[22]))
|
|
k.emit(v_mov_b32_e32(v[23], 1.0))
|
|
k.emit(ds_write_b32(v[0], v[26], v[23], v[0], 0, 0, 4))
|
|
# Edge check 1: s[78] = (s[20] & 255) if s[2] >= s[10] - 1 else 0; branch to
# GW_B0_E1_M_1 when s_cmpk_gt_u32(s[78]) sets SCC (the immediate is not passed here;
# presumably it defaults to 0 -- TODO confirm).
k.emit(s_and_b32(s[78], 255, s[20]))
|
|
k.emit(s_add_u32(s[79], -1, s[10]))
|
|
k.emit(s_cmp_ge_u32(s[2], s[79]))
|
|
k.emit(s_cselect_b32(s[78], s[78], 0))
|
|
k.emit(s_cmpk_gt_u32(s[78]))
|
|
k.emit(s_cbranch_scc1(), target='GW_B0_E1_M_1')
|
|
# Edge check 2: the same test with s[21], s[3] and s[11]. SCC clear goes to GW_B0_E0_1
# (the label emitted right after this section), SCC set goes to GW_B0_E1_N_1.
k.emit(s_and_b32(s[78], 255, s[21]))
|
|
k.emit(s_add_u32(s[79], -1, s[11]))
|
|
k.emit(s_cmp_ge_u32(s[3], s[79]))
|
|
k.emit(s_cselect_b32(s[78], s[78], 0))
|
|
k.emit(s_cmpk_gt_u32(s[78]))
|
|
k.emit(s_cbranch_scc0(), target='GW_B0_E0_1')
|
|
k.emit(s_cbranch_scc1(), target='GW_B0_E1_N_1')
|
|
k.label('GW_B0_E0_1')
# v[37] = (v[18] - 256 * s[2]) << 2 is the LDS address for the four 128-bit reads below.
k.emit(s_mul_i32(s[68], 256, s[2]))
k.emit(v_sub_u32_e64(v[37], v[18], s[68]))
k.emit(v_lshlrev_b32_e32(v[37], 2, v[37]))
# Wait for lgkmcnt == 0 and synchronize before reading LDS.
k.waitcnt(lgkm=0)
k.emit(s_barrier())
# Fill v[88:95] and v[96:103] from LDS; the second pair of reads carries an extra
# trailing offset argument of 4 (presumably +1024 bytes -- TODO confirm).
k.emit(ds_read_b128(v[88:91], v[37]))
k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16))
k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4))
k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4))
# v[35] = (v[21] + v[18]) << 1 is the offset operand of the buffer stores that follow.
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
# Copy AccVGPRs 0, 4, 8, ..., 188 into the consecutive registers v[40]..v[87].
for _dst, _acc in zip(range(40, 88), range(0, 192, 4)):
  k.emit(v_accvgpr_read(v[_dst], v[_acc]))
|
|
# Constants 0xFFFF0000, 0x7FFF0000 and 0x7FFF in v[32], v[33], v[34]
# (presumably helpers for the packed conversion below -- TODO confirm).
k.emit(v_mov_b32_e32(v[32], 4294901760))
k.emit(v_mov_b32_e32(v[33], 2147418112))
k.emit(v_mov_b32_e32(v[34], 32767))
# Wait for lgkmcnt == 0 so the LDS reads into v[88:103] have landed.
k.waitcnt(lgkm=0)
# Six groups of eight registers (v[40:47], v[48:55], ..., v[80:87]). Each group is
# multiplied pairwise by v[96:103], added to v[88:95] into the temporaries v[22:29],
# moved back, packed into four registers by v_cvt_pk and stored with one
# buffer_store_dwordx4.
for _lo in range(40, 88, 8):
  _pairs = [(_lo + 2 * _p, _lo + 2 * _p + 1) for _p in range(4)]
  for _p, (_a, _b) in enumerate(_pairs):
    k.emit(v_pk_mul_f32(v[_a:_b], v[96 + 2 * _p:97 + 2 * _p], v[_a:_b]))
  for _p, (_a, _b) in enumerate(_pairs):
    k.emit(v_pk_add_f32(v[22 + 2 * _p:23 + 2 * _p], v[88 + 2 * _p:89 + 2 * _p], v[_a:_b]))
  for _p, (_a, _b) in enumerate(_pairs):
    k.emit(v_mov_b64_e32(v[_a:_b], v[22 + 2 * _p:23 + 2 * _p]))
  for _p, (_a, _b) in enumerate(_pairs):
    k.emit(v_cvt_pk(v[_lo + _p], v[_a], v[_b]))
  if _lo != 40:
    # every group after the first advances the 64-bit value in s[12:13] by s[36] << 1
    k.emit(s_lshl_b32(s[68], s[36], 1))
    k.emit(s_add_u32(s[12], s[12], s[68]))
    k.emit(s_addc_u32(s[13], s[13], 0))
  k.emit(buffer_store_dwordx4(v[_lo:_lo + 3], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
# Four 128-bit LDS reads addressed through v[37] fill v[88:91], v[92:95], v[96:99] and v[100:103].
# NOTE(review): the trailing positional arguments are presumably offset-style fields of ds_read_b128 - confirm.
k.emit(ds_read_b128(v[88:91], v[37]))
k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16))
k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4))
k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4))
# 48 accumulator reads into v[40]..v[87]; the source index steps by 4: 192..252, then 1..125.
for dst_reg, acc_reg in enumerate([*range(192, 256, 4), *range(1, 128, 4)], start=40):
  k.emit(v_accvgpr_read(v[dst_reg], v[acc_reg]))
# Constants: 4294901760 == 0xFFFF0000, 2147418112 == 0x7FFF0000, 32767 == 0x7FFF.
# NOTE(review): presumably masks/rounding terms consumed by the pack conversion - confirm.
for const_reg, const_val in ((32, 4294901760), (33, 2147418112), (34, 32767)):
  k.emit(v_mov_b32_e32(v[const_reg], const_val))
# lgkm=0 makes Kernel.waitcnt encode an lgkmcnt of 0 (vmcnt and expcnt keep their maximum values).
k.waitcnt(lgkm=0)
|
|
# Six 8-register tiles (v[40:47] ... v[80:87]) go through the same instruction sequence.
# NOTE(review): the first operand of each vector op is presumably the destination - confirm.
for tile_lo in range(40, 88, 8):
  # Packed multiply of each tile pair with the matching pair of v[96:103].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_mul_f32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[96+pair_off:97+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Packed add of v[88:95] with the tile pairs, targeting v[22:29].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_add_f32(v[22+pair_off:23+pair_off], v[88+pair_off:89+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Copy the v[22:29] pairs back over the tile registers.
  for pair_off in range(0, 8, 2):
    k.emit(v_mov_b64_e32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[22+pair_off:23+pair_off]))
  # v_cvt_pk takes two registers per call and targets one: 8 registers collapse into the tile's first 4.
  for out_idx in range(4):
    k.emit(v_cvt_pk(v[tile_lo+out_idx], v[tile_lo+2*out_idx], v[tile_lo+2*out_idx+1]))
  # Bump s[12] (with carry into s[13]) by s[36] << 1, then store the four packed dwords.
  k.emit(s_lshl_b32(s[68], s[36], 1))
  k.emit(s_add_u32(s[12], s[12], s[68]))
  k.emit(s_addc_u32(s[13], s[13], 0))
  k.emit(buffer_store_dwordx4(v[tile_lo:tile_lo+3], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
# Four 128-bit LDS reads addressed through v[37] fill v[88:91], v[92:95], v[96:99] and v[100:103].
# NOTE(review): the trailing positional arguments are presumably offset-style fields of ds_read_b128 - confirm.
k.emit(ds_read_b128(v[88:91], v[37]))
k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16))
k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4))
k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4))
# 48 accumulator reads into v[40]..v[87]; the source index steps by 4: 129..253, then 2..62.
for dst_reg, acc_reg in enumerate([*range(129, 256, 4), *range(2, 64, 4)], start=40):
  k.emit(v_accvgpr_read(v[dst_reg], v[acc_reg]))
# Constants: 4294901760 == 0xFFFF0000, 2147418112 == 0x7FFF0000, 32767 == 0x7FFF.
# NOTE(review): presumably masks/rounding terms consumed by the pack conversion - confirm.
for const_reg, const_val in ((32, 4294901760), (33, 2147418112), (34, 32767)):
  k.emit(v_mov_b32_e32(v[const_reg], const_val))
# lgkm=0 makes Kernel.waitcnt encode an lgkmcnt of 0 (vmcnt and expcnt keep their maximum values).
k.waitcnt(lgkm=0)
|
|
# Six 8-register tiles (v[40:47] ... v[80:87]) go through the same instruction sequence.
# NOTE(review): the first operand of each vector op is presumably the destination - confirm.
for tile_lo in range(40, 88, 8):
  # Packed multiply of each tile pair with the matching pair of v[96:103].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_mul_f32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[96+pair_off:97+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Packed add of v[88:95] with the tile pairs, targeting v[22:29].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_add_f32(v[22+pair_off:23+pair_off], v[88+pair_off:89+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Copy the v[22:29] pairs back over the tile registers.
  for pair_off in range(0, 8, 2):
    k.emit(v_mov_b64_e32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[22+pair_off:23+pair_off]))
  # v_cvt_pk takes two registers per call and targets one: 8 registers collapse into the tile's first 4.
  for out_idx in range(4):
    k.emit(v_cvt_pk(v[tile_lo+out_idx], v[tile_lo+2*out_idx], v[tile_lo+2*out_idx+1]))
  # Bump s[12] (with carry into s[13]) by s[36] << 1, then store the four packed dwords.
  k.emit(s_lshl_b32(s[68], s[36], 1))
  k.emit(s_add_u32(s[12], s[12], s[68]))
  k.emit(s_addc_u32(s[13], s[13], 0))
  k.emit(buffer_store_dwordx4(v[tile_lo:tile_lo+3], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
# Four 128-bit LDS reads addressed through v[37] fill v[88:91], v[92:95], v[96:99] and v[100:103].
# NOTE(review): the trailing positional arguments are presumably offset-style fields of ds_read_b128 - confirm.
k.emit(ds_read_b128(v[88:91], v[37]))
k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16))
k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4))
k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4))
# 48 accumulator reads into v[40]..v[87]; the source index steps by 4 from 66 up to 254.
for dst_reg, acc_reg in enumerate(range(66, 256, 4), start=40):
  k.emit(v_accvgpr_read(v[dst_reg], v[acc_reg]))
# Constants: 4294901760 == 0xFFFF0000, 2147418112 == 0x7FFF0000, 32767 == 0x7FFF.
# NOTE(review): presumably masks/rounding terms consumed by the pack conversion - confirm.
for const_reg, const_val in ((32, 4294901760), (33, 2147418112), (34, 32767)):
  k.emit(v_mov_b32_e32(v[const_reg], const_val))
# lgkm=0 makes Kernel.waitcnt encode an lgkmcnt of 0 (vmcnt and expcnt keep their maximum values).
k.waitcnt(lgkm=0)
|
|
# Six 8-register tiles (v[40:47] ... v[80:87]) go through the same instruction sequence.
# NOTE(review): the first operand of each vector op is presumably the destination - confirm.
for tile_lo in range(40, 88, 8):
  # Packed multiply of each tile pair with the matching pair of v[96:103].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_mul_f32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[96+pair_off:97+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Packed add of v[88:95] with the tile pairs, targeting v[22:29].
  for pair_off in range(0, 8, 2):
    k.emit(v_pk_add_f32(v[22+pair_off:23+pair_off], v[88+pair_off:89+pair_off], v[tile_lo+pair_off:tile_lo+pair_off+1]))
  # Copy the v[22:29] pairs back over the tile registers.
  for pair_off in range(0, 8, 2):
    k.emit(v_mov_b64_e32(v[tile_lo+pair_off:tile_lo+pair_off+1], v[22+pair_off:23+pair_off]))
  # v_cvt_pk takes two registers per call and targets one: 8 registers collapse into the tile's first 4.
  for out_idx in range(4):
    k.emit(v_cvt_pk(v[tile_lo+out_idx], v[tile_lo+2*out_idx], v[tile_lo+2*out_idx+1]))
  # Bump s[12] (with carry into s[13]) by s[36] << 1, then store the four packed dwords.
  k.emit(s_lshl_b32(s[68], s[36], 1))
  k.emit(s_add_u32(s[12], s[12], s[68]))
  k.emit(s_addc_u32(s[13], s[13], 0))
  k.emit(buffer_store_dwordx4(v[tile_lo:tile_lo+3], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
# Four 128-bit LDS reads addressed through v[37] fill v[88:91], v[92:95], v[96:99] and v[100:103].
# NOTE(review): the trailing positional arguments are presumably offset-style fields of ds_read_b128 - confirm.
k.emit(ds_read_b128(v[88:91], v[37]))
k.emit(ds_read_b128(v[92:95], v[37], v[0], v[0], 0, 16))
k.emit(ds_read_b128(v[96:99], v[37], v[0], v[0], 0, 0, 4))
k.emit(ds_read_b128(v[100:103], v[37], v[0], v[0], 0, 16, 4))
# 48 accumulator reads into v[40]..v[87]; the source index steps by 4 from 3 up to 191.
for dst_reg, acc_reg in enumerate(range(3, 192, 4), start=40):
  k.emit(v_accvgpr_read(v[dst_reg], v[acc_reg]))
# Constants: 4294901760 == 0xFFFF0000, 2147418112 == 0x7FFF0000, 32767 == 0x7FFF.
# NOTE(review): presumably masks/rounding terms consumed by the pack conversion - confirm.
for const_reg, const_val in ((32, 4294901760), (33, 2147418112), (34, 32767)):
  k.emit(v_mov_b32_e32(v[const_reg], const_val))
# lgkm=0 makes Kernel.waitcnt encode an lgkmcnt of 0 (vmcnt and expcnt keep their maximum values).
k.waitcnt(lgkm=0)
|
|
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
k.emit(buffer_store_dwordx4(v[56:59], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[64:71]: multiply by v[96:103], add v[88:95], pack into v[64:67], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
# Advance the 64-bit value in s[12:13] by s[36] << 1 (add, then add-with-carry).
# NOTE(review): presumably s[12:15] is the output buffer descriptor and s[36] a row stride in elements -- confirm.
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
# 128-bit store of v[64:67]; v[35] is presumably the per-lane offset (set before the visible lines).
k.emit(buffer_store_dwordx4(v[64:67], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[72:79]: multiply by v[96:103], add v[88:95], pack into v[72:75], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
# Advance the 64-bit value in s[12:13] by s[36] << 1 (add, then add-with-carry).
# NOTE(review): presumably s[12:15] is the output buffer descriptor and s[36] a row stride in elements -- confirm.
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
# 128-bit store of v[72:75]; v[35] is presumably the per-lane offset (set before the visible lines).
k.emit(buffer_store_dwordx4(v[72:75], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[80:87]: multiply by v[96:103], add v[88:95], pack into v[80:83], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
# Advance the 64-bit value in s[12:13] by s[36] << 1 (add, then add-with-carry).
# NOTE(review): presumably s[12:15] is the output buffer descriptor and s[36] a row stride in elements -- confirm.
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
# 128-bit store of v[80:83]; v[35] is presumably the per-lane offset (set before the visible lines).
k.emit(buffer_store_dwordx4(v[80:83], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Fetch inputs for the next two output groups.
k.emit(s_nop())
|
|
# Four 128-bit LDS reads addressed by v[37] fill v[56:71]; the multiply/add steps that
# follow use v[64:71] as the multiplier and v[56:63] as the addend.
# NOTE(review): the trailing positional arguments (16 / 4) are presumably offset fields -- confirm against the autogen signature.
k.emit(ds_read_b128(v[56:59], v[37]))
|
|
k.emit(ds_read_b128(v[60:63], v[37], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[64:67], v[37], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[68:71], v[37], v[0], v[0], 0, 16, 4))
|
|
# Sixteen accumulator reads into v[40:55]: source indices 195, 199, ..., 255 (stride 4).
# NOTE(review): the source is written as v[n] but presumably names accumulator register n -- confirm.
k.emit(v_accvgpr_read(v[40], v[195]))
|
|
k.emit(v_accvgpr_read(v[41], v[199]))
|
|
k.emit(v_accvgpr_read(v[42], v[203]))
|
|
k.emit(v_accvgpr_read(v[43], v[207]))
|
|
k.emit(v_accvgpr_read(v[44], v[211]))
|
|
k.emit(v_accvgpr_read(v[45], v[215]))
|
|
k.emit(v_accvgpr_read(v[46], v[219]))
|
|
k.emit(v_accvgpr_read(v[47], v[223]))
|
|
k.emit(v_accvgpr_read(v[48], v[227]))
|
|
k.emit(v_accvgpr_read(v[49], v[231]))
|
|
k.emit(v_accvgpr_read(v[50], v[235]))
|
|
k.emit(v_accvgpr_read(v[51], v[239]))
|
|
k.emit(v_accvgpr_read(v[52], v[243]))
|
|
k.emit(v_accvgpr_read(v[53], v[247]))
|
|
k.emit(v_accvgpr_read(v[54], v[251]))
|
|
k.emit(v_accvgpr_read(v[55], v[255]))
|
|
# v[32:34] = 0xFFFF0000, 0x7FFF0000, 0x7FFF.
# NOTE(review): these look like 16-bit rounding masks; the multiply/add/convert/store
# sequences visible in this chunk do not reference v[32:34] -- confirm they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
# Wait for lgkmcnt == 0 so the LDS reads above have landed; Kernel.waitcnt leaves
# vmcnt and expcnt at their maximum when only lgkm is given.
k.waitcnt(lgkm=0)
|
|
# Output group v[40:47]: multiply by v[64:71], add v[56:63], pack into v[40:43], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[40:41], v[64:65], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[66:67], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[68:69], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[70:71], v[46:47]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[56:57], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[58:59], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[60:61], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[62:63], v[46:47]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
# Advance the 64-bit value in s[12:13] by s[36] << 1 (add, then add-with-carry).
# NOTE(review): presumably s[12:15] is the output buffer descriptor and s[36] a row stride in elements -- confirm.
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
# 128-bit store of v[40:43]; v[35] is presumably the per-lane offset (set before the visible lines).
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[48:55]: multiply by v[64:71], add v[56:63], pack into v[48:51], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[48:49], v[64:65], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[66:67], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[68:69], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[70:71], v[54:55]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[56:57], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[58:59], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[60:61], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[62:63], v[54:55]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
# Advance the 64-bit value in s[12:13] by s[36] << 1 (add, then add-with-carry).
# NOTE(review): presumably s[12:15] is the output buffer descriptor and s[36] a row stride in elements -- confirm.
k.emit(s_lshl_b32(s[68], s[36], 1))
|
|
k.emit(s_add_u32(s[12], s[12], s[68]))
|
|
k.emit(s_addc_u32(s[13], s[13], 0))
|
|
# 128-bit store of v[48:51]; v[35] is presumably the per-lane offset (set before the visible lines).
k.emit(buffer_store_dwordx4(v[48:51], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
# Jump to the 'GW_End_1' label; the branch is emitted without an offset and
# Kernel.to_asm() later fills simm16 from the recorded label position.
k.emit(s_branch(), target='GW_End_1')
|
|
# Entry point of the store variant that bounds-checks every row before storing.
# NOTE(review): the label name suggests a "beta == 0, edge" global-write path -- inferred from the name only; confirm.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.label('GW_B0_E1_N_1')
|
|
# v[30] = 0x80000000, the offset substituted for rows that fail the bounds test below.
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
# Row 0 mask: s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# v[36] = (v[18] - 256 * s[2]) << 2, the LDS address used by the reads below.
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
# Drain outstanding lgkm operations, then synchronize with s_barrier before reading LDS.
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
# Four 128-bit LDS reads at v[36] fill v[88:95] and v[96:103]; the output groups after
# this block use v[88:95] as the addend and v[96:103] as the multiplier.
# NOTE(review): the trailing positional arguments (16 / 4) are presumably offset fields -- confirm.
k.emit(ds_read_b128(v[88:91], v[36]))
|
|
k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4))
|
|
# Row 0 store offset: v[35] = (v[21] + v[18]) << 1, swapped for v[30] under the mask.
# NOTE(review): presumably v_cndmask keeps v[35] where the mask bit is set and takes v[30]
# otherwise, so out-of-range lanes get an offset the buffer store will reject -- confirm.
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
# Step to the next row: v[19] += 1, v[20] += s[38], v[21] += s[36].
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 1: same mask/offset computation, offset kept in v[37].
# NOTE(review): v[38] (and v[104]/v[106]/v[108]/v[110] below) are computed like v[36],
# but no ds_read in the visible lines is addressed by them.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 2: offset kept in v[39].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[104], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[104], 2, v[104]))
|
|
k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 3: offset kept in v[105].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 4: offset kept in v[107].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 5: offset kept in v[109]; no row step follows the last row of the batch.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
# Copy 48 accumulator values into v[40:87]: source indices 0, 4, ..., 188 (stride 4).
# NOTE(review): the source is written as v[n] but presumably names accumulator register n -- confirm.
k.emit(v_accvgpr_read(v[40], v[0]))
|
|
k.emit(v_accvgpr_read(v[41], v[4]))
|
|
k.emit(v_accvgpr_read(v[42], v[8]))
|
|
k.emit(v_accvgpr_read(v[43], v[12]))
|
|
k.emit(v_accvgpr_read(v[44], v[16]))
|
|
k.emit(v_accvgpr_read(v[45], v[20]))
|
|
k.emit(v_accvgpr_read(v[46], v[24]))
|
|
k.emit(v_accvgpr_read(v[47], v[28]))
|
|
k.emit(v_accvgpr_read(v[48], v[32]))
|
|
k.emit(v_accvgpr_read(v[49], v[36]))
|
|
k.emit(v_accvgpr_read(v[50], v[40]))
|
|
k.emit(v_accvgpr_read(v[51], v[44]))
|
|
k.emit(v_accvgpr_read(v[52], v[48]))
|
|
k.emit(v_accvgpr_read(v[53], v[52]))
|
|
k.emit(v_accvgpr_read(v[54], v[56]))
|
|
k.emit(v_accvgpr_read(v[55], v[60]))
|
|
k.emit(v_accvgpr_read(v[56], v[64]))
|
|
k.emit(v_accvgpr_read(v[57], v[68]))
|
|
k.emit(v_accvgpr_read(v[58], v[72]))
|
|
k.emit(v_accvgpr_read(v[59], v[76]))
|
|
k.emit(v_accvgpr_read(v[60], v[80]))
|
|
k.emit(v_accvgpr_read(v[61], v[84]))
|
|
k.emit(v_accvgpr_read(v[62], v[88]))
|
|
k.emit(v_accvgpr_read(v[63], v[92]))
|
|
k.emit(v_accvgpr_read(v[64], v[96]))
|
|
k.emit(v_accvgpr_read(v[65], v[100]))
|
|
k.emit(v_accvgpr_read(v[66], v[104]))
|
|
k.emit(v_accvgpr_read(v[67], v[108]))
|
|
k.emit(v_accvgpr_read(v[68], v[112]))
|
|
k.emit(v_accvgpr_read(v[69], v[116]))
|
|
k.emit(v_accvgpr_read(v[70], v[120]))
|
|
k.emit(v_accvgpr_read(v[71], v[124]))
|
|
k.emit(v_accvgpr_read(v[72], v[128]))
|
|
k.emit(v_accvgpr_read(v[73], v[132]))
|
|
k.emit(v_accvgpr_read(v[74], v[136]))
|
|
k.emit(v_accvgpr_read(v[75], v[140]))
|
|
k.emit(v_accvgpr_read(v[76], v[144]))
|
|
k.emit(v_accvgpr_read(v[77], v[148]))
|
|
k.emit(v_accvgpr_read(v[78], v[152]))
|
|
k.emit(v_accvgpr_read(v[79], v[156]))
|
|
k.emit(v_accvgpr_read(v[80], v[160]))
|
|
k.emit(v_accvgpr_read(v[81], v[164]))
|
|
k.emit(v_accvgpr_read(v[82], v[168]))
|
|
k.emit(v_accvgpr_read(v[83], v[172]))
|
|
k.emit(v_accvgpr_read(v[84], v[176]))
|
|
k.emit(v_accvgpr_read(v[85], v[180]))
|
|
k.emit(v_accvgpr_read(v[86], v[184]))
|
|
k.emit(v_accvgpr_read(v[87], v[188]))
|
|
# Wait for lgkmcnt == 0 so the ds_read_b128 results in v[88:103] are available;
# Kernel.waitcnt leaves vmcnt and expcnt at their maximum when only lgkm is given.
k.waitcnt(lgkm=0)
|
|
# v[32:34] = 0xFFFF0000, 0x7FFF0000, 0x7FFF.
# NOTE(review): these look like 16-bit rounding masks; the multiply/add/convert/store
# sequences visible in this chunk do not reference v[32:34] -- confirm they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
# Output group v[40:47]: multiply by v[96:103], add v[88:95], pack into v[40:43], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
# 128-bit store of v[40:43] using the bounds-masked offset computed earlier in v[35].
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[48:55]: multiply by v[96:103], add v[88:95], pack into v[48:51], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
# 128-bit store of v[48:51] using the bounds-masked offset computed earlier in v[37].
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[56:63]: multiply by v[96:103], add v[88:95], pack into v[56:59], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
# 128-bit store of v[56:59] using the bounds-masked offset computed earlier in v[39].
k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[64:71]: multiply by v[96:103], add v[88:95], pack into v[64:67], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
# 128-bit store of v[64:67] using the bounds-masked offset computed earlier in v[105].
k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[72:79]: multiply by v[96:103], add v[88:95], pack into v[72:75], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
# 128-bit store of v[72:75] using the bounds-masked offset computed earlier in v[107].
k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[80:87]: multiply by v[96:103], add v[88:95], pack into v[80:83], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
# 128-bit store of v[80:83] using the bounds-masked offset computed earlier in v[109].
k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Second batch of six bounds-checked rows: build masks and store offsets, reload v[88:103].
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(s_nop())
|
|
# v[30] = 0x80000000, the offset substituted for rows that fail the bounds test below.
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
# Step to the next row first: v[19] += 1, v[20] += s[38], v[21] += s[36].
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 0 mask: s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# v[36] = (v[18] - 256 * s[2]) << 2, the LDS address used by the reads below.
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
# Four 128-bit LDS reads at v[36] refill v[88:95] (addend) and v[96:103] (multiplier).
# NOTE(review): the trailing positional arguments (16 / 4) are presumably offset fields -- confirm.
k.emit(ds_read_b128(v[88:91], v[36]))
|
|
k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4))
|
|
# Row 0 store offset: v[35] = (v[21] + v[18]) << 1, swapped for v[30] under the mask.
# NOTE(review): presumably v_cndmask keeps v[35] where the mask bit is set and takes v[30] otherwise -- confirm.
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 1: offset kept in v[37].
# NOTE(review): v[38] (and v[104]/v[106]/v[108]/v[110] below) are computed like v[36],
# but no ds_read in the visible lines is addressed by them.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 2: offset kept in v[39].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[104], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[104], 2, v[104]))
|
|
k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 3: offset kept in v[105].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 4: offset kept in v[107].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 5: offset kept in v[109]; no row step follows the last row of the batch.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
# Copy 48 accumulator values into v[40:87]: source indices 192, 196, ..., 252 go to
# v[40:55], then indices 1, 5, ..., 125 go to v[56:87] (stride 4 in both runs).
# NOTE(review): the source is written as v[n] but presumably names accumulator register n -- confirm.
k.emit(v_accvgpr_read(v[40], v[192]))
|
|
k.emit(v_accvgpr_read(v[41], v[196]))
|
|
k.emit(v_accvgpr_read(v[42], v[200]))
|
|
k.emit(v_accvgpr_read(v[43], v[204]))
|
|
k.emit(v_accvgpr_read(v[44], v[208]))
|
|
k.emit(v_accvgpr_read(v[45], v[212]))
|
|
k.emit(v_accvgpr_read(v[46], v[216]))
|
|
k.emit(v_accvgpr_read(v[47], v[220]))
|
|
k.emit(v_accvgpr_read(v[48], v[224]))
|
|
k.emit(v_accvgpr_read(v[49], v[228]))
|
|
k.emit(v_accvgpr_read(v[50], v[232]))
|
|
k.emit(v_accvgpr_read(v[51], v[236]))
|
|
k.emit(v_accvgpr_read(v[52], v[240]))
|
|
k.emit(v_accvgpr_read(v[53], v[244]))
|
|
k.emit(v_accvgpr_read(v[54], v[248]))
|
|
k.emit(v_accvgpr_read(v[55], v[252]))
|
|
# Second run: indices 1, 5, ..., 125.
k.emit(v_accvgpr_read(v[56], v[1]))
|
|
k.emit(v_accvgpr_read(v[57], v[5]))
|
|
k.emit(v_accvgpr_read(v[58], v[9]))
|
|
k.emit(v_accvgpr_read(v[59], v[13]))
|
|
k.emit(v_accvgpr_read(v[60], v[17]))
|
|
k.emit(v_accvgpr_read(v[61], v[21]))
|
|
k.emit(v_accvgpr_read(v[62], v[25]))
|
|
k.emit(v_accvgpr_read(v[63], v[29]))
|
|
k.emit(v_accvgpr_read(v[64], v[33]))
|
|
k.emit(v_accvgpr_read(v[65], v[37]))
|
|
k.emit(v_accvgpr_read(v[66], v[41]))
|
|
k.emit(v_accvgpr_read(v[67], v[45]))
|
|
k.emit(v_accvgpr_read(v[68], v[49]))
|
|
k.emit(v_accvgpr_read(v[69], v[53]))
|
|
k.emit(v_accvgpr_read(v[70], v[57]))
|
|
k.emit(v_accvgpr_read(v[71], v[61]))
|
|
k.emit(v_accvgpr_read(v[72], v[65]))
|
|
k.emit(v_accvgpr_read(v[73], v[69]))
|
|
k.emit(v_accvgpr_read(v[74], v[73]))
|
|
k.emit(v_accvgpr_read(v[75], v[77]))
|
|
k.emit(v_accvgpr_read(v[76], v[81]))
|
|
k.emit(v_accvgpr_read(v[77], v[85]))
|
|
k.emit(v_accvgpr_read(v[78], v[89]))
|
|
k.emit(v_accvgpr_read(v[79], v[93]))
|
|
k.emit(v_accvgpr_read(v[80], v[97]))
|
|
k.emit(v_accvgpr_read(v[81], v[101]))
|
|
k.emit(v_accvgpr_read(v[82], v[105]))
|
|
k.emit(v_accvgpr_read(v[83], v[109]))
|
|
k.emit(v_accvgpr_read(v[84], v[113]))
|
|
k.emit(v_accvgpr_read(v[85], v[117]))
|
|
k.emit(v_accvgpr_read(v[86], v[121]))
|
|
k.emit(v_accvgpr_read(v[87], v[125]))
|
|
# Wait for lgkmcnt == 0 so the ds_read_b128 results in v[88:103] are available;
# Kernel.waitcnt leaves vmcnt and expcnt at their maximum when only lgkm is given.
k.waitcnt(lgkm=0)
|
|
# v[32:34] = 0xFFFF0000, 0x7FFF0000, 0x7FFF.
# NOTE(review): these look like 16-bit rounding masks; the multiply/add/convert/store
# sequences visible in this chunk do not reference v[32:34] -- confirm they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
# Output group v[40:47]: multiply by v[96:103], add v[88:95], pack into v[40:43], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
# 128-bit store of v[40:43] using the bounds-masked offset computed earlier in v[35].
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[48:55]: multiply by v[96:103], add v[88:95], pack into v[48:51], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
# 128-bit store of v[48:51] using the bounds-masked offset computed earlier in v[37].
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[56:63]: multiply by v[96:103], add v[88:95], pack into v[56:59], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
# 128-bit store of v[56:59] using the bounds-masked offset computed earlier in v[39].
k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[64:71]: multiply by v[96:103], add v[88:95], pack into v[64:67], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
# 128-bit store of v[64:67] using the bounds-masked offset computed earlier in v[105].
k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[72:79]: multiply by v[96:103], add v[88:95], pack into v[72:75], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
# 128-bit store of v[72:75] using the bounds-masked offset computed earlier in v[107].
k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Output group v[80:87]: multiply by v[96:103], add v[88:95], pack into v[80:83], store.
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
# The sums are written to the scratch pairs v[22:29] ...
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
# ... and then copied back over the group's own registers.
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
# Each v_cvt_pk narrows one register pair into a single register, leaving four dwords.
# NOTE(review): the target format of v_cvt_pk is defined outside this chunk.
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
# 128-bit store of v[80:83] using the bounds-masked offset computed earlier in v[109].
k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Third batch of six bounds-checked rows: build masks and store offsets, reload v[88:103].
# NOTE(review): comments assume (dst, src0, src1) operand order -- confirm against the autogen DSL.
k.emit(s_nop())
|
|
# v[30] = 0x80000000, the offset substituted for rows that fail the bounds test below.
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
# Step to the next row first: v[19] += 1, v[20] += s[38], v[21] += s[36].
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 0 mask: s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# v[36] = (v[18] - 256 * s[2]) << 2, the LDS address used by the reads below.
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
# Four 128-bit LDS reads at v[36] refill v[88:95] (addend) and v[96:103] (multiplier).
# NOTE(review): the trailing positional arguments (16 / 4) are presumably offset fields -- confirm.
k.emit(ds_read_b128(v[88:91], v[36]))
|
|
k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4))
|
|
# Row 0 store offset: v[35] = (v[21] + v[18]) << 1, swapped for v[30] under the mask.
# NOTE(review): presumably v_cndmask keeps v[35] where the mask bit is set and takes v[30] otherwise -- confirm.
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 1: offset kept in v[37].
# NOTE(review): v[38] (and v[104]/v[106]/v[108]/v[110] below) are computed like v[36],
# but no ds_read in the visible lines is addressed by them.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 2: offset kept in v[39].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[104], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[104], 2, v[104]))
|
|
k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 3: offset kept in v[105].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 4: offset kept in v[107].
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Row 5: offset kept in v[109]; no row step follows the last row of the batch.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
# Copy 48 accumulator values into v[40:87]: source indices 129, 133, ..., 253 go to
# v[40:71], then indices 2, 6, ..., 62 go to v[72:87] (stride 4 in both runs).
# NOTE(review): the source is written as v[n] but presumably names accumulator register n -- confirm.
k.emit(v_accvgpr_read(v[40], v[129]))
|
|
k.emit(v_accvgpr_read(v[41], v[133]))
|
|
k.emit(v_accvgpr_read(v[42], v[137]))
|
|
k.emit(v_accvgpr_read(v[43], v[141]))
|
|
k.emit(v_accvgpr_read(v[44], v[145]))
|
|
k.emit(v_accvgpr_read(v[45], v[149]))
|
|
k.emit(v_accvgpr_read(v[46], v[153]))
|
|
k.emit(v_accvgpr_read(v[47], v[157]))
|
|
k.emit(v_accvgpr_read(v[48], v[161]))
|
|
k.emit(v_accvgpr_read(v[49], v[165]))
|
|
k.emit(v_accvgpr_read(v[50], v[169]))
|
|
k.emit(v_accvgpr_read(v[51], v[173]))
|
|
k.emit(v_accvgpr_read(v[52], v[177]))
|
|
k.emit(v_accvgpr_read(v[53], v[181]))
|
|
k.emit(v_accvgpr_read(v[54], v[185]))
|
|
k.emit(v_accvgpr_read(v[55], v[189]))
|
|
k.emit(v_accvgpr_read(v[56], v[193]))
|
|
k.emit(v_accvgpr_read(v[57], v[197]))
|
|
k.emit(v_accvgpr_read(v[58], v[201]))
|
|
k.emit(v_accvgpr_read(v[59], v[205]))
|
|
k.emit(v_accvgpr_read(v[60], v[209]))
|
|
k.emit(v_accvgpr_read(v[61], v[213]))
|
|
k.emit(v_accvgpr_read(v[62], v[217]))
|
|
k.emit(v_accvgpr_read(v[63], v[221]))
|
|
k.emit(v_accvgpr_read(v[64], v[225]))
|
|
k.emit(v_accvgpr_read(v[65], v[229]))
|
|
k.emit(v_accvgpr_read(v[66], v[233]))
|
|
k.emit(v_accvgpr_read(v[67], v[237]))
|
|
k.emit(v_accvgpr_read(v[68], v[241]))
|
|
k.emit(v_accvgpr_read(v[69], v[245]))
|
|
k.emit(v_accvgpr_read(v[70], v[249]))
|
|
k.emit(v_accvgpr_read(v[71], v[253]))
|
|
# Second run: indices 2, 6, ..., 62.
k.emit(v_accvgpr_read(v[72], v[2]))
|
|
k.emit(v_accvgpr_read(v[73], v[6]))
|
|
k.emit(v_accvgpr_read(v[74], v[10]))
|
|
k.emit(v_accvgpr_read(v[75], v[14]))
|
|
k.emit(v_accvgpr_read(v[76], v[18]))
|
|
k.emit(v_accvgpr_read(v[77], v[22]))
|
|
k.emit(v_accvgpr_read(v[78], v[26]))
|
|
k.emit(v_accvgpr_read(v[79], v[30]))
|
|
k.emit(v_accvgpr_read(v[80], v[34]))
|
|
k.emit(v_accvgpr_read(v[81], v[38]))
|
|
k.emit(v_accvgpr_read(v[82], v[42]))
|
|
k.emit(v_accvgpr_read(v[83], v[46]))
|
|
k.emit(v_accvgpr_read(v[84], v[50]))
|
|
k.emit(v_accvgpr_read(v[85], v[54]))
|
|
k.emit(v_accvgpr_read(v[86], v[58]))
|
|
k.emit(v_accvgpr_read(v[87], v[62]))
|
|
# Wait for lgkmcnt == 0 so the ds_read_b128 results in v[88:103] are available;
# Kernel.waitcnt leaves vmcnt and expcnt at their maximum when only lgkm is given.
k.waitcnt(lgkm=0)
|
|
# v[32:34] = 0xFFFF0000, 0x7FFF0000, 0x7FFF.
# NOTE(review): these look like 16-bit rounding masks; the multiply/add/convert/store
# sequences visible in this chunk do not reference v[32:34] -- confirm they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
k.emit(ds_read_b128(v[88:91], v[36]))
|
|
k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[104], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[104], 2, v[104]))
|
|
k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[40], v[66]))
|
|
k.emit(v_accvgpr_read(v[41], v[70]))
|
|
k.emit(v_accvgpr_read(v[42], v[74]))
|
|
k.emit(v_accvgpr_read(v[43], v[78]))
|
|
k.emit(v_accvgpr_read(v[44], v[82]))
|
|
k.emit(v_accvgpr_read(v[45], v[86]))
|
|
k.emit(v_accvgpr_read(v[46], v[90]))
|
|
k.emit(v_accvgpr_read(v[47], v[94]))
|
|
k.emit(v_accvgpr_read(v[48], v[98]))
|
|
k.emit(v_accvgpr_read(v[49], v[102]))
|
|
k.emit(v_accvgpr_read(v[50], v[106]))
|
|
k.emit(v_accvgpr_read(v[51], v[110]))
|
|
k.emit(v_accvgpr_read(v[52], v[114]))
|
|
k.emit(v_accvgpr_read(v[53], v[118]))
|
|
k.emit(v_accvgpr_read(v[54], v[122]))
|
|
k.emit(v_accvgpr_read(v[55], v[126]))
|
|
k.emit(v_accvgpr_read(v[56], v[130]))
|
|
k.emit(v_accvgpr_read(v[57], v[134]))
|
|
k.emit(v_accvgpr_read(v[58], v[138]))
|
|
k.emit(v_accvgpr_read(v[59], v[142]))
|
|
k.emit(v_accvgpr_read(v[60], v[146]))
|
|
k.emit(v_accvgpr_read(v[61], v[150]))
|
|
k.emit(v_accvgpr_read(v[62], v[154]))
|
|
k.emit(v_accvgpr_read(v[63], v[158]))
|
|
k.emit(v_accvgpr_read(v[64], v[162]))
|
|
k.emit(v_accvgpr_read(v[65], v[166]))
|
|
k.emit(v_accvgpr_read(v[66], v[170]))
|
|
k.emit(v_accvgpr_read(v[67], v[174]))
|
|
k.emit(v_accvgpr_read(v[68], v[178]))
|
|
k.emit(v_accvgpr_read(v[69], v[182]))
|
|
k.emit(v_accvgpr_read(v[70], v[186]))
|
|
k.emit(v_accvgpr_read(v[71], v[190]))
|
|
k.emit(v_accvgpr_read(v[72], v[194]))
|
|
k.emit(v_accvgpr_read(v[73], v[198]))
|
|
k.emit(v_accvgpr_read(v[74], v[202]))
|
|
k.emit(v_accvgpr_read(v[75], v[206]))
|
|
k.emit(v_accvgpr_read(v[76], v[210]))
|
|
k.emit(v_accvgpr_read(v[77], v[214]))
|
|
k.emit(v_accvgpr_read(v[78], v[218]))
|
|
k.emit(v_accvgpr_read(v[79], v[222]))
|
|
k.emit(v_accvgpr_read(v[80], v[226]))
|
|
k.emit(v_accvgpr_read(v[81], v[230]))
|
|
k.emit(v_accvgpr_read(v[82], v[234]))
|
|
k.emit(v_accvgpr_read(v[83], v[238]))
|
|
k.emit(v_accvgpr_read(v[84], v[242]))
|
|
k.emit(v_accvgpr_read(v[85], v[246]))
|
|
k.emit(v_accvgpr_read(v[86], v[250]))
|
|
k.emit(v_accvgpr_read(v[87], v[254]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
k.emit(ds_read_b128(v[88:91], v[36]))
|
|
k.emit(ds_read_b128(v[92:95], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[96:99], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[100:103], v[36], v[0], v[0], 0, 16, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[104], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[104], 2, v[104]))
|
|
k.emit(v_add_lshl_u32_e64(v[39], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[39], v[30], v[39], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[40], v[3]))
|
|
k.emit(v_accvgpr_read(v[41], v[7]))
|
|
k.emit(v_accvgpr_read(v[42], v[11]))
|
|
k.emit(v_accvgpr_read(v[43], v[15]))
|
|
k.emit(v_accvgpr_read(v[44], v[19]))
|
|
k.emit(v_accvgpr_read(v[45], v[23]))
|
|
k.emit(v_accvgpr_read(v[46], v[27]))
|
|
k.emit(v_accvgpr_read(v[47], v[31]))
|
|
k.emit(v_accvgpr_read(v[48], v[35]))
|
|
k.emit(v_accvgpr_read(v[49], v[39]))
|
|
k.emit(v_accvgpr_read(v[50], v[43]))
|
|
k.emit(v_accvgpr_read(v[51], v[47]))
|
|
k.emit(v_accvgpr_read(v[52], v[51]))
|
|
k.emit(v_accvgpr_read(v[53], v[55]))
|
|
k.emit(v_accvgpr_read(v[54], v[59]))
|
|
k.emit(v_accvgpr_read(v[55], v[63]))
|
|
k.emit(v_accvgpr_read(v[56], v[67]))
|
|
k.emit(v_accvgpr_read(v[57], v[71]))
|
|
k.emit(v_accvgpr_read(v[58], v[75]))
|
|
k.emit(v_accvgpr_read(v[59], v[79]))
|
|
k.emit(v_accvgpr_read(v[60], v[83]))
|
|
k.emit(v_accvgpr_read(v[61], v[87]))
|
|
k.emit(v_accvgpr_read(v[62], v[91]))
|
|
k.emit(v_accvgpr_read(v[63], v[95]))
|
|
k.emit(v_accvgpr_read(v[64], v[99]))
|
|
k.emit(v_accvgpr_read(v[65], v[103]))
|
|
k.emit(v_accvgpr_read(v[66], v[107]))
|
|
k.emit(v_accvgpr_read(v[67], v[111]))
|
|
k.emit(v_accvgpr_read(v[68], v[115]))
|
|
k.emit(v_accvgpr_read(v[69], v[119]))
|
|
k.emit(v_accvgpr_read(v[70], v[123]))
|
|
k.emit(v_accvgpr_read(v[71], v[127]))
|
|
k.emit(v_accvgpr_read(v[72], v[131]))
|
|
k.emit(v_accvgpr_read(v[73], v[135]))
|
|
k.emit(v_accvgpr_read(v[74], v[139]))
|
|
k.emit(v_accvgpr_read(v[75], v[143]))
|
|
k.emit(v_accvgpr_read(v[76], v[147]))
|
|
k.emit(v_accvgpr_read(v[77], v[151]))
|
|
k.emit(v_accvgpr_read(v[78], v[155]))
|
|
k.emit(v_accvgpr_read(v[79], v[159]))
|
|
k.emit(v_accvgpr_read(v[80], v[163]))
|
|
k.emit(v_accvgpr_read(v[81], v[167]))
|
|
k.emit(v_accvgpr_read(v[82], v[171]))
|
|
k.emit(v_accvgpr_read(v[83], v[175]))
|
|
k.emit(v_accvgpr_read(v[84], v[179]))
|
|
k.emit(v_accvgpr_read(v[85], v[183]))
|
|
k.emit(v_accvgpr_read(v[86], v[187]))
|
|
k.emit(v_accvgpr_read(v[87], v[191]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_pk_mul_f32(v[40:41], v[96:97], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[98:99], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[100:101], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[102:103], v[46:47]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[46:47]))
|
|
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[48:49], v[96:97], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[98:99], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[100:101], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[102:103], v[54:55]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[54:55]))
|
|
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[56:57], v[96:97], v[56:57]))
|
|
k.emit(v_pk_mul_f32(v[58:59], v[98:99], v[58:59]))
|
|
k.emit(v_pk_mul_f32(v[60:61], v[100:101], v[60:61]))
|
|
k.emit(v_pk_mul_f32(v[62:63], v[102:103], v[62:63]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[56:57]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[58:59]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[60:61]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[62:63]))
|
|
k.emit(v_mov_b64_e32(v[56:57], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[58:59], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[60:61], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[62:63], v[28:29]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[57]))
|
|
k.emit(v_cvt_pk(v[57], v[58], v[59]))
|
|
k.emit(v_cvt_pk(v[58], v[60], v[61]))
|
|
k.emit(v_cvt_pk(v[59], v[62], v[63]))
|
|
k.emit(buffer_store_dwordx4(v[56:59], v[39], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[64:65], v[96:97], v[64:65]))
|
|
k.emit(v_pk_mul_f32(v[66:67], v[98:99], v[66:67]))
|
|
k.emit(v_pk_mul_f32(v[68:69], v[100:101], v[68:69]))
|
|
k.emit(v_pk_mul_f32(v[70:71], v[102:103], v[70:71]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[64:65]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[66:67]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[68:69]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[70:71]))
|
|
k.emit(v_mov_b64_e32(v[64:65], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[66:67], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[68:69], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[70:71], v[28:29]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[65]))
|
|
k.emit(v_cvt_pk(v[65], v[66], v[67]))
|
|
k.emit(v_cvt_pk(v[66], v[68], v[69]))
|
|
k.emit(v_cvt_pk(v[67], v[70], v[71]))
|
|
k.emit(buffer_store_dwordx4(v[64:67], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[72:73], v[96:97], v[72:73]))
|
|
k.emit(v_pk_mul_f32(v[74:75], v[98:99], v[74:75]))
|
|
k.emit(v_pk_mul_f32(v[76:77], v[100:101], v[76:77]))
|
|
k.emit(v_pk_mul_f32(v[78:79], v[102:103], v[78:79]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[72:73]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[74:75]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[76:77]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[78:79]))
|
|
k.emit(v_mov_b64_e32(v[72:73], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[74:75], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[76:77], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[78:79], v[28:29]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[73]))
|
|
k.emit(v_cvt_pk(v[73], v[74], v[75]))
|
|
k.emit(v_cvt_pk(v[74], v[76], v[77]))
|
|
k.emit(v_cvt_pk(v[75], v[78], v[79]))
|
|
k.emit(buffer_store_dwordx4(v[72:75], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_pk_mul_f32(v[80:81], v[96:97], v[80:81]))
|
|
k.emit(v_pk_mul_f32(v[82:83], v[98:99], v[82:83]))
|
|
k.emit(v_pk_mul_f32(v[84:85], v[100:101], v[84:85]))
|
|
k.emit(v_pk_mul_f32(v[86:87], v[102:103], v[86:87]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[88:89], v[80:81]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[90:91], v[82:83]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[92:93], v[84:85]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[94:95], v[86:87]))
|
|
k.emit(v_mov_b64_e32(v[80:81], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[82:83], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[84:85], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[86:87], v[28:29]))
|
|
k.emit(v_cvt_pk(v[80], v[80], v[81]))
|
|
k.emit(v_cvt_pk(v[81], v[82], v[83]))
|
|
k.emit(v_cvt_pk(v[82], v[84], v[85]))
|
|
k.emit(v_cvt_pk(v[83], v[86], v[87]))
|
|
k.emit(buffer_store_dwordx4(v[80:83], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[36], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[36], 2, v[36]))
|
|
k.emit(ds_read_b128(v[56:59], v[36]))
|
|
k.emit(ds_read_b128(v[60:63], v[36], v[0], v[0], 0, 16))
|
|
k.emit(ds_read_b128(v[64:67], v[36], v[0], v[0], 0, 0, 4))
|
|
k.emit(ds_read_b128(v[68:71], v[36], v[0], v[0], 0, 16, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[35], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[35], v[30], v[35], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[38], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[38], 2, v[38]))
|
|
k.emit(v_add_lshl_u32_e64(v[37], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[37], v[30], v[37], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[40], v[195]))
|
|
k.emit(v_accvgpr_read(v[41], v[199]))
|
|
k.emit(v_accvgpr_read(v[42], v[203]))
|
|
k.emit(v_accvgpr_read(v[43], v[207]))
|
|
k.emit(v_accvgpr_read(v[44], v[211]))
|
|
k.emit(v_accvgpr_read(v[45], v[215]))
|
|
k.emit(v_accvgpr_read(v[46], v[219]))
|
|
k.emit(v_accvgpr_read(v[47], v[223]))
|
|
k.emit(v_accvgpr_read(v[48], v[227]))
|
|
k.emit(v_accvgpr_read(v[49], v[231]))
|
|
k.emit(v_accvgpr_read(v[50], v[235]))
|
|
k.emit(v_accvgpr_read(v[51], v[239]))
|
|
k.emit(v_accvgpr_read(v[52], v[243]))
|
|
k.emit(v_accvgpr_read(v[53], v[247]))
|
|
k.emit(v_accvgpr_read(v[54], v[251]))
|
|
k.emit(v_accvgpr_read(v[55], v[255]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_pk_mul_f32(v[40:41], v[64:65], v[40:41]))
|
|
k.emit(v_pk_mul_f32(v[42:43], v[66:67], v[42:43]))
|
|
k.emit(v_pk_mul_f32(v[44:45], v[68:69], v[44:45]))
|
|
k.emit(v_pk_mul_f32(v[46:47], v[70:71], v[46:47]))
|
|
k.emit(v_pk_add_f32(v[22:23], v[56:57], v[40:41]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[58:59], v[42:43]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[60:61], v[44:45]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[62:63], v[46:47]))
|
|
k.emit(v_mov_b64_e32(v[40:41], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[42:43], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[44:45], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[46:47], v[28:29]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[41]))
|
|
k.emit(v_cvt_pk(v[41], v[42], v[43]))
|
|
k.emit(v_cvt_pk(v[42], v[44], v[45]))
|
|
k.emit(v_cvt_pk(v[43], v[46], v[47]))
|
|
k.emit(buffer_store_dwordx4(v[40:43], v[35], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Second group of eight values, same shape as the v[40..47] group:
# v[48..55] = v[64..71] * v[48..55] (packed, two floats per instruction).
k.emit(v_pk_mul_f32(v[48:49], v[64:65], v[48:49]))
|
|
k.emit(v_pk_mul_f32(v[50:51], v[66:67], v[50:51]))
|
|
k.emit(v_pk_mul_f32(v[52:53], v[68:69], v[52:53]))
|
|
k.emit(v_pk_mul_f32(v[54:55], v[70:71], v[54:55]))
|
|
# v[22..29] = v[56..63] + v[48..55]; the v[22..29] temporaries are reused.
k.emit(v_pk_add_f32(v[22:23], v[56:57], v[48:49]))
|
|
k.emit(v_pk_add_f32(v[24:25], v[58:59], v[50:51]))
|
|
k.emit(v_pk_add_f32(v[26:27], v[60:61], v[52:53]))
|
|
k.emit(v_pk_add_f32(v[28:29], v[62:63], v[54:55]))
|
|
# Move the sums back over v[48..55].
k.emit(v_mov_b64_e32(v[48:49], v[22:23]))
|
|
k.emit(v_mov_b64_e32(v[50:51], v[24:25]))
|
|
k.emit(v_mov_b64_e32(v[52:53], v[26:27]))
|
|
k.emit(v_mov_b64_e32(v[54:55], v[28:29]))
|
|
# Combine register pairs into v[48]..v[51] (presumably float32 -> packed 16-bit;
# v_cvt_pk's definition is not visible here -- TODO confirm).
k.emit(v_cvt_pk(v[48], v[48], v[49]))
|
|
k.emit(v_cvt_pk(v[49], v[50], v[51]))
|
|
k.emit(v_cvt_pk(v[50], v[52], v[53]))
|
|
k.emit(v_cvt_pk(v[51], v[54], v[55]))
|
|
# Store v[48:51] using v[37], which the v_add_lshl_u32 / v_cndmask_b32 pair
# preceding the accumulator reads produced.
k.emit(buffer_store_dwordx4(v[48:51], v[37], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
# Unconditional branch to the 'GW_End_1' label. Kernel.emit only records the
# target name; Kernel.to_asm later fills simm16 with
# (label_pos - inst_pos - inst_size) // 4.
k.emit(s_branch(), target='GW_End_1')
|
|
# Bind the name 'GW_B0_E1_M_1' to the current byte position so branches emitted
# with target='GW_B0_E1_M_1' can be patched in Kernel.to_asm.
k.label('GW_B0_E1_M_1')
|
|
# v[30] = 0x80000000; used below as the alternative value in every v_cndmask_b32.
# NOTE(review): presumably an offset large enough that the buffer store is
# dropped for masked-off lanes -- confirm against the buffer descriptor setup.
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
# Per-lane bounds mask: s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# s[78] is reused as a scratch scalar now that the compare result in s[78:79]
# has been folded into s[82:83]: s[78] = 256 * s[2].
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
# v[78] = (v[18] - s[78]) << 2, used as the LDS address of the two reads below.
k.emit(v_sub_u32_e64(v[78], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[78], 2, v[78]))
|
|
# s_waitcnt with lgkmcnt=0 (vmcnt 63, expcnt 7 per Kernel.waitcnt), then a
# workgroup barrier before the LDS reads.
k.waitcnt(lgkm=0)
|
|
k.emit(s_barrier())
|
|
# Two LDS reads from the same address register. v[75] is later used as the
# addend and v[76] as the multiplier for this column's values.
# NOTE(review): the trailing positional fields (v[0], v[0], 0, 0, 4) of the
# second read presumably select a different LDS offset; the exact field
# mapping is not visible here -- confirm against ds_read_b32's signature.
k.emit(ds_read_b32(v[75], v[78]))
|
|
k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4))
|
|
# v[77] = (v[21] + v[18]) << 1, then a per-lane select between v[30] and that
# value under the s[82:83] mask (presumably keeps the computed offset for
# in-bounds lanes -- confirm v_cndmask_b32 operand order).
k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83]))
|
|
# Same mask/address pattern as for v[18], applied to the neighbouring index:
# v[22] = v[18] + 1 (carry-out goes to VCC and is not used in this group).
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
# s[82:83] = (v[22] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# s[78] = 256 * s[2] is recomputed because the compare above overwrote s[78:79].
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
# v[82] = (v[22] - s[78]) << 2: LDS address for this index.
k.emit(v_sub_u32_e64(v[82], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[82], 2, v[82]))
|
|
# v[79] is later used as the addend and v[80] as the multiplier for this index.
k.emit(ds_read_b32(v[79], v[82]))
|
|
k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4))
|
|
# v[81] = (v[21] + v[22]) << 1, then select between v[30] and it under s[82:83].
k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[86], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[86], 2, v[86]))
|
|
k.emit(ds_read_b32(v[83], v[86]))
|
|
k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[90], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[90], 2, v[90]))
|
|
k.emit(ds_read_b32(v[87], v[90]))
|
|
k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[94], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[94], 2, v[94]))
|
|
k.emit(ds_read_b32(v[91], v[94]))
|
|
k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[98], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98]))
|
|
k.emit(ds_read_b32(v[95], v[98]))
|
|
k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[102], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[102], 2, v[102]))
|
|
k.emit(ds_read_b32(v[99], v[102]))
|
|
k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(ds_read_b32(v[103], v[106]))
|
|
k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
# Advance to the next row: v[19] += 1, v[20] += s[38], v[21] += s[36].
# NOTE(review): s[38] and s[36] are presumably row strides -- their setup is
# outside this part of the sequence.
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# Bounds mask for the new row: s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]).
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
# v[108] = (v[18] - s[78]) << 2. Unlike the groups that precede the row
# advance, no ds_read_b32 follows in this group.
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
# v[107] = (v[21] + v[18]) << 1 using the advanced v[21], then select between
# v[30] and it under s[82:83].
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[112], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[112], 2, v[112]))
|
|
k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[114], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[114], 2, v[114]))
|
|
k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[116], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[116], 2, v[116]))
|
|
k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[118], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[118], 2, v[118]))
|
|
k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[120], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[120], 2, v[120]))
|
|
k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[122], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[122], 2, v[122]))
|
|
k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[124], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[124], 2, v[124]))
|
|
k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[126], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[126], 2, v[126]))
|
|
k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[128], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[128], 2, v[128]))
|
|
k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[130], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[130], 2, v[130]))
|
|
k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[132], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[132], 2, v[132]))
|
|
k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[134], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[134], 2, v[134]))
|
|
k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[136], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[136], 2, v[136]))
|
|
k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[138], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[138], 2, v[138]))
|
|
k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[140], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[140], 2, v[140]))
|
|
k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[142], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[142], 2, v[142]))
|
|
k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[144], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[144], 2, v[144]))
|
|
k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[146], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[146], 2, v[146]))
|
|
k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[148], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[148], 2, v[148]))
|
|
k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[150], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[150], 2, v[150]))
|
|
k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[152], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[152], 2, v[152]))
|
|
k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[154], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[154], 2, v[154]))
|
|
k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[156], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[156], 2, v[156]))
|
|
k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[158], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[158], 2, v[158]))
|
|
k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[160], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[160], 2, v[160]))
|
|
k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[162], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[162], 2, v[162]))
|
|
k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[164], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[164], 2, v[164]))
|
|
k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[166], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[166], 2, v[166]))
|
|
k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[168], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[168], 2, v[168]))
|
|
k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[170], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[170], 2, v[170]))
|
|
k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83]))
|
|
# Copy 40 accumulator values into v[35]..v[74]. The source index is
# 4 * (destination - 35), i.e. 0, 4, 8, ..., 156.
# NOTE(review): the source is spelled v[N] here but is presumably an AccVGPR
# index -- confirm against v_accvgpr_read's definition.
k.emit(v_accvgpr_read(v[35], v[0]))
|
|
k.emit(v_accvgpr_read(v[36], v[4]))
|
|
k.emit(v_accvgpr_read(v[37], v[8]))
|
|
k.emit(v_accvgpr_read(v[38], v[12]))
|
|
k.emit(v_accvgpr_read(v[39], v[16]))
|
|
k.emit(v_accvgpr_read(v[40], v[20]))
|
|
k.emit(v_accvgpr_read(v[41], v[24]))
|
|
k.emit(v_accvgpr_read(v[42], v[28]))
|
|
k.emit(v_accvgpr_read(v[43], v[32]))
|
|
k.emit(v_accvgpr_read(v[44], v[36]))
|
|
k.emit(v_accvgpr_read(v[45], v[40]))
|
|
k.emit(v_accvgpr_read(v[46], v[44]))
|
|
k.emit(v_accvgpr_read(v[47], v[48]))
|
|
k.emit(v_accvgpr_read(v[48], v[52]))
|
|
k.emit(v_accvgpr_read(v[49], v[56]))
|
|
k.emit(v_accvgpr_read(v[50], v[60]))
|
|
k.emit(v_accvgpr_read(v[51], v[64]))
|
|
k.emit(v_accvgpr_read(v[52], v[68]))
|
|
k.emit(v_accvgpr_read(v[53], v[72]))
|
|
k.emit(v_accvgpr_read(v[54], v[76]))
|
|
k.emit(v_accvgpr_read(v[55], v[80]))
|
|
k.emit(v_accvgpr_read(v[56], v[84]))
|
|
k.emit(v_accvgpr_read(v[57], v[88]))
|
|
k.emit(v_accvgpr_read(v[58], v[92]))
|
|
k.emit(v_accvgpr_read(v[59], v[96]))
|
|
k.emit(v_accvgpr_read(v[60], v[100]))
|
|
k.emit(v_accvgpr_read(v[61], v[104]))
|
|
k.emit(v_accvgpr_read(v[62], v[108]))
|
|
k.emit(v_accvgpr_read(v[63], v[112]))
|
|
k.emit(v_accvgpr_read(v[64], v[116]))
|
|
k.emit(v_accvgpr_read(v[65], v[120]))
|
|
k.emit(v_accvgpr_read(v[66], v[124]))
|
|
k.emit(v_accvgpr_read(v[67], v[128]))
|
|
k.emit(v_accvgpr_read(v[68], v[132]))
|
|
k.emit(v_accvgpr_read(v[69], v[136]))
|
|
k.emit(v_accvgpr_read(v[70], v[140]))
|
|
k.emit(v_accvgpr_read(v[71], v[144]))
|
|
k.emit(v_accvgpr_read(v[72], v[148]))
|
|
k.emit(v_accvgpr_read(v[73], v[152]))
|
|
k.emit(v_accvgpr_read(v[74], v[156]))
|
|
# s_waitcnt with lgkmcnt=0 (vmcnt 63, expcnt 7 per Kernel.waitcnt) before the
# values loaded by the earlier ds_read_b32 instructions are consumed.
k.waitcnt(lgkm=0)
|
|
# Constants: 4294901760 == 0xFFFF0000, 2147418112 == 0x7FFF0000, 32767 == 0x7FFF.
# NOTE(review): these look like 16-bit-float rounding masks; the per-element
# sequences right below do not name v[32], v[33] or v[34] as an operand --
# TODO confirm whether they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
# Per-element pattern (repeated for every value in v[35]..v[74]):
#   value *= multiplier; v[22] = addend + value; value = v[22];
#   value = v_cvt_pk(value, value); store 16 bits at the element's offset.
# Element 0: multiplier v[76], addend v[75], offset v[77].
k.emit(v_mul_f32_e32(v[35], v[76], v[35]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[35]))
|
|
k.emit(v_mov_b32_e32(v[35], v[22]))
|
|
# Both sources are the same register, so both halves of the result come from
# the same value (presumably float32 -> 16-bit; v_cvt_pk is defined elsewhere).
k.emit(v_cvt_pk(v[35], v[35], v[35]))
|
|
k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Element 1: multiplier v[80], addend v[79], offset v[81].
k.emit(v_mul_f32_e32(v[36], v[80], v[36]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[36]))
|
|
k.emit(v_mov_b32_e32(v[36], v[22]))
|
|
k.emit(v_cvt_pk(v[36], v[36], v[36]))
|
|
k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[37], v[84], v[37]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[37]))
|
|
k.emit(v_mov_b32_e32(v[37], v[22]))
|
|
k.emit(v_cvt_pk(v[37], v[37], v[37]))
|
|
k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[38], v[88], v[38]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[38]))
|
|
k.emit(v_mov_b32_e32(v[38], v[22]))
|
|
k.emit(v_cvt_pk(v[38], v[38], v[38]))
|
|
k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[39], v[92], v[39]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[39]))
|
|
k.emit(v_mov_b32_e32(v[39], v[22]))
|
|
k.emit(v_cvt_pk(v[39], v[39], v[39]))
|
|
k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[40], v[96], v[40]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[40]))
|
|
k.emit(v_mov_b32_e32(v[40], v[22]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[40]))
|
|
k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[41], v[100], v[41]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[41]))
|
|
k.emit(v_mov_b32_e32(v[41], v[22]))
|
|
k.emit(v_cvt_pk(v[41], v[41], v[41]))
|
|
k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[42], v[104], v[42]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[42]))
|
|
k.emit(v_mov_b32_e32(v[42], v[22]))
|
|
k.emit(v_cvt_pk(v[42], v[42], v[42]))
|
|
k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# First element of the second row: it reuses the multiplier v[76] and addend
# v[75] that the first row's first element used, with its own offset v[107].
k.emit(v_mul_f32_e32(v[43], v[76], v[43]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[43]))
|
|
k.emit(v_mov_b32_e32(v[43], v[22]))
|
|
k.emit(v_cvt_pk(v[43], v[43], v[43]))
|
|
k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[44], v[80], v[44]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[44]))
|
|
k.emit(v_mov_b32_e32(v[44], v[22]))
|
|
k.emit(v_cvt_pk(v[44], v[44], v[44]))
|
|
k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[45], v[84], v[45]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[45]))
|
|
k.emit(v_mov_b32_e32(v[45], v[22]))
|
|
k.emit(v_cvt_pk(v[45], v[45], v[45]))
|
|
k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[46], v[88], v[46]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[46]))
|
|
k.emit(v_mov_b32_e32(v[46], v[22]))
|
|
k.emit(v_cvt_pk(v[46], v[46], v[46]))
|
|
k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[47], v[92], v[47]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[47]))
|
|
k.emit(v_mov_b32_e32(v[47], v[22]))
|
|
k.emit(v_cvt_pk(v[47], v[47], v[47]))
|
|
k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[48], v[96], v[48]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[48]))
|
|
k.emit(v_mov_b32_e32(v[48], v[22]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[48]))
|
|
k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[49], v[100], v[49]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[49]))
|
|
k.emit(v_mov_b32_e32(v[49], v[22]))
|
|
k.emit(v_cvt_pk(v[49], v[49], v[49]))
|
|
k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[50], v[104], v[50]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[50]))
|
|
k.emit(v_mov_b32_e32(v[50], v[22]))
|
|
k.emit(v_cvt_pk(v[50], v[50], v[50]))
|
|
k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[51], v[76], v[51]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[51]))
|
|
k.emit(v_mov_b32_e32(v[51], v[22]))
|
|
k.emit(v_cvt_pk(v[51], v[51], v[51]))
|
|
k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[52], v[80], v[52]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[52]))
|
|
k.emit(v_mov_b32_e32(v[52], v[22]))
|
|
k.emit(v_cvt_pk(v[52], v[52], v[52]))
|
|
k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[53], v[84], v[53]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[53]))
|
|
k.emit(v_mov_b32_e32(v[53], v[22]))
|
|
k.emit(v_cvt_pk(v[53], v[53], v[53]))
|
|
k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[54], v[88], v[54]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[54]))
|
|
k.emit(v_mov_b32_e32(v[54], v[22]))
|
|
k.emit(v_cvt_pk(v[54], v[54], v[54]))
|
|
k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[55], v[92], v[55]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[55]))
|
|
k.emit(v_mov_b32_e32(v[55], v[22]))
|
|
k.emit(v_cvt_pk(v[55], v[55], v[55]))
|
|
k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[56], v[96], v[56]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[56]))
|
|
k.emit(v_mov_b32_e32(v[56], v[22]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[56]))
|
|
k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[57], v[100], v[57]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[57]))
|
|
k.emit(v_mov_b32_e32(v[57], v[22]))
|
|
k.emit(v_cvt_pk(v[57], v[57], v[57]))
|
|
k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[58], v[104], v[58]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[58]))
|
|
k.emit(v_mov_b32_e32(v[58], v[22]))
|
|
k.emit(v_cvt_pk(v[58], v[58], v[58]))
|
|
k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[59], v[76], v[59]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[59]))
|
|
k.emit(v_mov_b32_e32(v[59], v[22]))
|
|
k.emit(v_cvt_pk(v[59], v[59], v[59]))
|
|
k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[60], v[80], v[60]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[60]))
|
|
k.emit(v_mov_b32_e32(v[60], v[22]))
|
|
k.emit(v_cvt_pk(v[60], v[60], v[60]))
|
|
k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[61], v[84], v[61]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[61]))
|
|
k.emit(v_mov_b32_e32(v[61], v[22]))
|
|
k.emit(v_cvt_pk(v[61], v[61], v[61]))
|
|
k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[62], v[88], v[62]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[62]))
|
|
k.emit(v_mov_b32_e32(v[62], v[22]))
|
|
k.emit(v_cvt_pk(v[62], v[62], v[62]))
|
|
k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[63], v[92], v[63]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[63]))
|
|
k.emit(v_mov_b32_e32(v[63], v[22]))
|
|
k.emit(v_cvt_pk(v[63], v[63], v[63]))
|
|
k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[64], v[96], v[64]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[64]))
|
|
k.emit(v_mov_b32_e32(v[64], v[22]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[64]))
|
|
k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[65], v[100], v[65]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[65]))
|
|
k.emit(v_mov_b32_e32(v[65], v[22]))
|
|
k.emit(v_cvt_pk(v[65], v[65], v[65]))
|
|
k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[66], v[104], v[66]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[66]))
|
|
k.emit(v_mov_b32_e32(v[66], v[22]))
|
|
k.emit(v_cvt_pk(v[66], v[66], v[66]))
|
|
k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[67], v[76], v[67]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[67]))
|
|
k.emit(v_mov_b32_e32(v[67], v[22]))
|
|
k.emit(v_cvt_pk(v[67], v[67], v[67]))
|
|
k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[68], v[80], v[68]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[68]))
|
|
k.emit(v_mov_b32_e32(v[68], v[22]))
|
|
k.emit(v_cvt_pk(v[68], v[68], v[68]))
|
|
k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[69], v[84], v[69]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[69]))
|
|
k.emit(v_mov_b32_e32(v[69], v[22]))
|
|
k.emit(v_cvt_pk(v[69], v[69], v[69]))
|
|
k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[70], v[88], v[70]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[70]))
|
|
k.emit(v_mov_b32_e32(v[70], v[22]))
|
|
k.emit(v_cvt_pk(v[70], v[70], v[70]))
|
|
k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[71], v[92], v[71]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[71]))
|
|
k.emit(v_mov_b32_e32(v[71], v[22]))
|
|
k.emit(v_cvt_pk(v[71], v[71], v[71]))
|
|
k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[72], v[96], v[72]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[72]))
|
|
k.emit(v_mov_b32_e32(v[72], v[22]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[72]))
|
|
k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[73], v[100], v[73]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[73]))
|
|
k.emit(v_mov_b32_e32(v[73], v[22]))
|
|
k.emit(v_cvt_pk(v[73], v[73], v[73]))
|
|
k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[74], v[104], v[74]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[74]))
|
|
k.emit(v_mov_b32_e32(v[74], v[22]))
|
|
k.emit(v_cvt_pk(v[74], v[74], v[74]))
|
|
k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Set-up for one batch of 40 output stores, generated as 5 rows x 8 columns.
# For every (row, column) slot this emits: a bounds mask in s[82:83], an offset
# register (v[index - 256 * s[2]] << 2) and a store-address register
# ((v[21] + index) << 1) that is swapped against the v[30] sentinel under that mask.
# Row 0 additionally issues two ds_read_b32 per column from the offset register.
k.emit(s_nop())
# v[30] = 0x80000000, the alternative operand of every v_cndmask below.
# NOTE(review): presumably an offset the s[12:15] buffer descriptor rejects, so masked-off lanes store nothing -- confirm.
k.emit(v_mov_b32_e32(v[30], 2147483648))
for _row in range(5):
  # Advance to the next row: v[19] += 1, v[20] += s[38], v[21] += s[36].
  # NOTE(review): s[38] / s[36] look like per-row strides -- confirm against the kernel arguments.
  k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
  k.emit(v_add_u32_e64(v[20], v[20], s[38]))
  k.emit(v_add_u32_e64(v[21], v[21], s[36]))
  for _col in range(8):
    # Column 0 works on v[18] directly; columns 1..7 first stage v[18] + column in v[22].
    _idx = 22 if _col else 18
    if _col: k.emit(v_add_co_u32(v[22], VCC, v[18], _col))
    # Register layout: row 0 uses four registers per column starting at v[75]
    # (two ds_read destinations, address, offset); rows 1..4 use two per column
    # (address, offset) continuing from v[107].
    _addr = 77 + 4 * _col if _row == 0 else 91 + 2 * (8 * _row + _col)
    _off = _addr + 1
    # s[82:83] = (v[_idx] < s[20]) & (v[19] < s[21])
    k.emit(v_cmp_lt_u32_e64(s[78:79], v[_idx], s[20]))
    k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
    k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
    # s[78] is free again once the compare result has been folded into s[82:83]; reuse it for 256 * s[2].
    k.emit(s_mul_i32(s[78], 256, s[2]))
    k.emit(v_sub_u32_e64(v[_off], v[_idx], s[78]))
    k.emit(v_lshlrev_b32_e32(v[_off], 2, v[_off]))
    if _row == 0:
      # Only the first row reads from LDS; both reads share the offset register and the second carries a trailing 4.
      # NOTE(review): the trailing 4 is presumably an offset field of ds_read_b32 -- confirm against its signature.
      k.emit(ds_read_b32(v[_addr - 2], v[_off]))
      k.emit(ds_read_b32(v[_addr - 1], v[_off], v[0], v[0], 0, 0, 4))
    k.emit(v_add_lshl_u32_e64(v[_addr], v[21], v[_idx], 1))
    # NOTE(review): expected to keep v[_addr] where s[82:83] is set and take v[30] elsewhere -- confirm operand order.
    k.emit(v_cndmask_b32_e64(v[_addr], v[30], v[_addr], s[82:83]))
|
|
# Emit 40 v_accvgpr_read instructions filling v[35]..v[74] in order.
# Source indices are every fourth register: 160, 164, ..., 252 (24 reads), then 1, 5, ..., 61 (16 reads).
# NOTE(review): the source operand is spelled v[n]; presumably v_accvgpr_read encodes it as accumulator n -- confirm.
for _dst, _acc in enumerate((*range(160, 256, 4), *range(1, 62, 4)), start=35):
  k.emit(v_accvgpr_read(v[_dst], v[_acc]))
|
|
# Scale, convert and store the 40 values held in v[35]..v[74].
# s_waitcnt with lgkmcnt=0; vmcnt and expcnt stay at the defaults Kernel.waitcnt fills in (63 and 7).
# NOTE(review): presumably this is what makes the earlier ds_read_b32 results in v[75]..v[104] safe to use -- confirm.
k.waitcnt(lgkm=0)
# v[32] = 0xFFFF0000, v[33] = 0x7FFF0000, v[34] = 0x7FFF.
# NOTE(review): nothing in this span reads v[32]..v[34]; they look like float->16-bit rounding masks -- confirm still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
k.emit(v_mov_b32_e32(v[33], 2147418112))
k.emit(v_mov_b32_e32(v[34], 32767))
for _n in range(40):
  _val = 35 + _n
  # The eight (v[75 + 4c], v[76 + 4c]) operand pairs repeat every 8 values, i.e. once per row.
  _pair = 75 + 4 * (_n % 8)
  # Address registers: v[77], v[81], ..., v[105] for the first 8 values, then v[107], v[109], ..., v[169].
  _addr = 77 + 4 * _n if _n < 8 else 91 + 2 * _n
  # value = value * v[_pair + 1], then v[22] = v[_pair] + value; v[22] is only a temporary.
  k.emit(v_mul_f32_e32(v[_val], v[_pair + 1], v[_val]))
  k.emit(v_add_f32_e32(v[22], v[_pair], v[_val]))
  k.emit(v_mov_b32_e32(v[_val], v[22]))
  # Pack/convert in place, then store 16 bits through the s[12:15] descriptor.
  k.emit(v_cvt_pk(v[_val], v[_val], v[_val]))
  k.emit(buffer_store_short(v[_val], v[_addr], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# Address setup for a 5-row x 8-column group of stores (40 address registers).
# The emitted instruction stream is identical to the unrolled listing it replaces.
# NOTE(review): comments below read operands destination-first as in AMD assembly
# syntax -- confirm against the autogen instruction signatures.
k.emit(s_nop())
# v[30] holds 0x80000000 (decimal 2147483648); it is the alternative input of every
# v_cndmask below. Presumably an offset the buffer store discards for masked-off
# lanes -- TODO confirm.
k.emit(v_mov_b32_e32(v[30], 0x80000000))
for _row in range(5):
  # Step to the next row: v[19] += 1, v[20] += s[38], v[21] += s[36].
  k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
  k.emit(v_add_u32_e64(v[20], v[20], s[38]))
  k.emit(v_add_u32_e64(v[21], v[21], s[36]))
  for _col in range(8):
    # Row 0 spaces its address registers 4 apart (v[77], v[81], ..., v[105]) because
    # the two registers below each one receive the ds_read results; rows 1..4 pack
    # them 2 apart (v[107], v[109], ..., v[169]). The scratch register is always addr + 1.
    _addr = 77 + 4 * _col if _row == 0 else 107 + 2 * (8 * (_row - 1) + _col)
    _tmp = _addr + 1
    if _col == 0:
      _cidx = 18  # column 0 uses v[18] directly, no add is emitted
    else:
      k.emit(v_add_co_u32(v[22], VCC, v[18], _col))
      _cidx = 22
    # s[82:83] = (v[_cidx] < s[20]) & (v[19] < s[21])
    k.emit(v_cmp_lt_u32_e64(s[78:79], v[_cidx], s[20]))
    k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
    k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
    # v[_tmp] = (v[_cidx] - 256 * s[2]) << 2 ; note s[78] is overwritten here
    k.emit(s_mul_i32(s[78], 256, s[2]))
    k.emit(v_sub_u32_e64(v[_tmp], v[_cidx], s[78]))
    k.emit(v_lshlrev_b32_e32(v[_tmp], 2, v[_tmp]))
    if _row == 0:
      # Only the first row emits the two ds_read_b32 into v[_addr - 2] and v[_addr - 1];
      # the second read passes a trailing argument of 4 where the first uses defaults.
      k.emit(ds_read_b32(v[_addr - 2], v[_tmp]))
      k.emit(ds_read_b32(v[_addr - 1], v[_tmp], v[0], v[0], 0, 0, 4))
    # v[_addr] = (v[21] + v[_cidx]) << 1, then conditionally replaced with v[30] under s[82:83]
    k.emit(v_add_lshl_u32_e64(v[_addr], v[21], v[_cidx], 1))
    k.emit(v_cndmask_b32_e64(v[_addr], v[30], v[_addr], s[82:83]))
|
|
# Emit 40 v_accvgpr_read instructions: first operand v[35]..v[74] (consecutive),
# second operand index 65, 69, ..., 221 (stride 4).
# NOTE(review): the second operand is spelled v[n] here; presumably v_accvgpr_read
# encodes it as an accumulation register -- confirm against the instruction definition.
for _i in range(40):
  k.emit(v_accvgpr_read(v[35 + _i], v[65 + 4 * _i]))
|
|
# Emit s_waitcnt with lgkmcnt=0 (Kernel.waitcnt leaves vmcnt at 63 and expcnt at 7).
# Presumably this waits for the earlier ds_read_b32 results -- TODO confirm.
k.waitcnt(lgkm=0)
# Load three constants into v[32], v[33], v[34]. Values are unchanged, only written
# in hex: 4294901760, 2147418112 and 32767.
# NOTE(review): no instruction visible in this part of the file reads v[32..34];
# they look like masks for a float-to-16-bit conversion -- confirm their consumer.
for _reg, _imm in ((32, 0xFFFF0000), (33, 0x7FFF0000), (34, 0x7FFF)):
  k.emit(v_mov_b32_e32(v[_reg], _imm))
|
|
# Emit the multiply / add / move / pack-convert / short-store sequence for the 40
# result registers v[35]..v[74], in ascending order (five instructions each).
# Register numbering is taken directly from the unrolled listing this replaces:
#   * second operand of the add cycles v[75], v[79], ..., v[103] with period 8
#   * second operand of the multiply is the register right after it
#   * address operand of the store: v[77], v[81], ..., v[105] for the first eight
#     registers, then v[107], v[109], ..., v[169] for the remaining 32
# NOTE(review): operands are read destination-first as in AMD assembly syntax, which
# would make this reg = reg * v[_bias + 1]; v[22] = v[_bias] + reg; reg = v[22] -- confirm
# against the autogen instruction signatures.
_addr_regs = [77 + 4 * _c for _c in range(8)] + list(range(107, 170, 2))
for _i, _addr in enumerate(_addr_regs):
  _reg, _bias = 35 + _i, 75 + 4 * (_i % 8)
  k.emit(v_mul_f32_e32(v[_reg], v[_bias + 1], v[_reg]))
  k.emit(v_add_f32_e32(v[22], v[_bias], v[_reg]))
  k.emit(v_mov_b32_e32(v[_reg], v[22]))
  k.emit(v_cvt_pk(v[_reg], v[_reg], v[_reg]))
  k.emit(buffer_store_short(v[_reg], v[_addr], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[78], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[78], 2, v[78]))
|
|
k.emit(ds_read_b32(v[75], v[78]))
|
|
k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[82], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[82], 2, v[82]))
|
|
k.emit(ds_read_b32(v[79], v[82]))
|
|
k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[86], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[86], 2, v[86]))
|
|
k.emit(ds_read_b32(v[83], v[86]))
|
|
k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[90], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[90], 2, v[90]))
|
|
k.emit(ds_read_b32(v[87], v[90]))
|
|
k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[94], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[94], 2, v[94]))
|
|
k.emit(ds_read_b32(v[91], v[94]))
|
|
k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[98], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98]))
|
|
k.emit(ds_read_b32(v[95], v[98]))
|
|
k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[102], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[102], 2, v[102]))
|
|
k.emit(ds_read_b32(v[99], v[102]))
|
|
k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(ds_read_b32(v[103], v[106]))
|
|
k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[112], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[112], 2, v[112]))
|
|
k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[114], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[114], 2, v[114]))
|
|
k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[116], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[116], 2, v[116]))
|
|
k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[118], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[118], 2, v[118]))
|
|
k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[120], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[120], 2, v[120]))
|
|
k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[122], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[122], 2, v[122]))
|
|
k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[124], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[124], 2, v[124]))
|
|
k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[126], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[126], 2, v[126]))
|
|
k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[128], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[128], 2, v[128]))
|
|
k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[130], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[130], 2, v[130]))
|
|
k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[132], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[132], 2, v[132]))
|
|
k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[134], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[134], 2, v[134]))
|
|
k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[136], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[136], 2, v[136]))
|
|
k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[138], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[138], 2, v[138]))
|
|
k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[140], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[140], 2, v[140]))
|
|
k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[142], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[142], 2, v[142]))
|
|
k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[144], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[144], 2, v[144]))
|
|
k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[146], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[146], 2, v[146]))
|
|
k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[148], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[148], 2, v[148]))
|
|
k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[150], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[150], 2, v[150]))
|
|
k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[152], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[152], 2, v[152]))
|
|
k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[154], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[154], 2, v[154]))
|
|
k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[156], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[156], 2, v[156]))
|
|
k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[158], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[158], 2, v[158]))
|
|
k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[160], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[160], 2, v[160]))
|
|
k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[162], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[162], 2, v[162]))
|
|
k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[164], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[164], 2, v[164]))
|
|
k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[166], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[166], 2, v[166]))
|
|
k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[168], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[168], 2, v[168]))
|
|
k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[170], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[170], 2, v[170]))
|
|
k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[35], v[225]))
|
|
k.emit(v_accvgpr_read(v[36], v[229]))
|
|
k.emit(v_accvgpr_read(v[37], v[233]))
|
|
k.emit(v_accvgpr_read(v[38], v[237]))
|
|
k.emit(v_accvgpr_read(v[39], v[241]))
|
|
k.emit(v_accvgpr_read(v[40], v[245]))
|
|
k.emit(v_accvgpr_read(v[41], v[249]))
|
|
k.emit(v_accvgpr_read(v[42], v[253]))
|
|
k.emit(v_accvgpr_read(v[43], v[2]))
|
|
k.emit(v_accvgpr_read(v[44], v[6]))
|
|
k.emit(v_accvgpr_read(v[45], v[10]))
|
|
k.emit(v_accvgpr_read(v[46], v[14]))
|
|
k.emit(v_accvgpr_read(v[47], v[18]))
|
|
k.emit(v_accvgpr_read(v[48], v[22]))
|
|
k.emit(v_accvgpr_read(v[49], v[26]))
|
|
k.emit(v_accvgpr_read(v[50], v[30]))
|
|
k.emit(v_accvgpr_read(v[51], v[34]))
|
|
k.emit(v_accvgpr_read(v[52], v[38]))
|
|
k.emit(v_accvgpr_read(v[53], v[42]))
|
|
k.emit(v_accvgpr_read(v[54], v[46]))
|
|
k.emit(v_accvgpr_read(v[55], v[50]))
|
|
k.emit(v_accvgpr_read(v[56], v[54]))
|
|
k.emit(v_accvgpr_read(v[57], v[58]))
|
|
k.emit(v_accvgpr_read(v[58], v[62]))
|
|
k.emit(v_accvgpr_read(v[59], v[66]))
|
|
k.emit(v_accvgpr_read(v[60], v[70]))
|
|
k.emit(v_accvgpr_read(v[61], v[74]))
|
|
k.emit(v_accvgpr_read(v[62], v[78]))
|
|
k.emit(v_accvgpr_read(v[63], v[82]))
|
|
k.emit(v_accvgpr_read(v[64], v[86]))
|
|
k.emit(v_accvgpr_read(v[65], v[90]))
|
|
k.emit(v_accvgpr_read(v[66], v[94]))
|
|
k.emit(v_accvgpr_read(v[67], v[98]))
|
|
k.emit(v_accvgpr_read(v[68], v[102]))
|
|
k.emit(v_accvgpr_read(v[69], v[106]))
|
|
k.emit(v_accvgpr_read(v[70], v[110]))
|
|
k.emit(v_accvgpr_read(v[71], v[114]))
|
|
k.emit(v_accvgpr_read(v[72], v[118]))
|
|
k.emit(v_accvgpr_read(v[73], v[122]))
|
|
k.emit(v_accvgpr_read(v[74], v[126]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_mul_f32_e32(v[35], v[76], v[35]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[35]))
|
|
k.emit(v_mov_b32_e32(v[35], v[22]))
|
|
k.emit(v_cvt_pk(v[35], v[35], v[35]))
|
|
k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[36], v[80], v[36]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[36]))
|
|
k.emit(v_mov_b32_e32(v[36], v[22]))
|
|
k.emit(v_cvt_pk(v[36], v[36], v[36]))
|
|
k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[37], v[84], v[37]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[37]))
|
|
k.emit(v_mov_b32_e32(v[37], v[22]))
|
|
k.emit(v_cvt_pk(v[37], v[37], v[37]))
|
|
k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[38], v[88], v[38]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[38]))
|
|
k.emit(v_mov_b32_e32(v[38], v[22]))
|
|
k.emit(v_cvt_pk(v[38], v[38], v[38]))
|
|
k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[39], v[92], v[39]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[39]))
|
|
k.emit(v_mov_b32_e32(v[39], v[22]))
|
|
k.emit(v_cvt_pk(v[39], v[39], v[39]))
|
|
k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[40], v[96], v[40]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[40]))
|
|
k.emit(v_mov_b32_e32(v[40], v[22]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[40]))
|
|
k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[41], v[100], v[41]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[41]))
|
|
k.emit(v_mov_b32_e32(v[41], v[22]))
|
|
k.emit(v_cvt_pk(v[41], v[41], v[41]))
|
|
k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[42], v[104], v[42]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[42]))
|
|
k.emit(v_mov_b32_e32(v[42], v[22]))
|
|
k.emit(v_cvt_pk(v[42], v[42], v[42]))
|
|
k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[43], v[76], v[43]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[43]))
|
|
k.emit(v_mov_b32_e32(v[43], v[22]))
|
|
k.emit(v_cvt_pk(v[43], v[43], v[43]))
|
|
k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[44], v[80], v[44]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[44]))
|
|
k.emit(v_mov_b32_e32(v[44], v[22]))
|
|
k.emit(v_cvt_pk(v[44], v[44], v[44]))
|
|
k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[45], v[84], v[45]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[45]))
|
|
k.emit(v_mov_b32_e32(v[45], v[22]))
|
|
k.emit(v_cvt_pk(v[45], v[45], v[45]))
|
|
k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[46], v[88], v[46]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[46]))
|
|
k.emit(v_mov_b32_e32(v[46], v[22]))
|
|
k.emit(v_cvt_pk(v[46], v[46], v[46]))
|
|
k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[47], v[92], v[47]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[47]))
|
|
k.emit(v_mov_b32_e32(v[47], v[22]))
|
|
k.emit(v_cvt_pk(v[47], v[47], v[47]))
|
|
k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[48], v[96], v[48]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[48]))
|
|
k.emit(v_mov_b32_e32(v[48], v[22]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[48]))
|
|
k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[49], v[100], v[49]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[49]))
|
|
k.emit(v_mov_b32_e32(v[49], v[22]))
|
|
k.emit(v_cvt_pk(v[49], v[49], v[49]))
|
|
k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[50], v[104], v[50]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[50]))
|
|
k.emit(v_mov_b32_e32(v[50], v[22]))
|
|
k.emit(v_cvt_pk(v[50], v[50], v[50]))
|
|
k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[51], v[76], v[51]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[51]))
|
|
k.emit(v_mov_b32_e32(v[51], v[22]))
|
|
k.emit(v_cvt_pk(v[51], v[51], v[51]))
|
|
k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[52], v[80], v[52]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[52]))
|
|
k.emit(v_mov_b32_e32(v[52], v[22]))
|
|
k.emit(v_cvt_pk(v[52], v[52], v[52]))
|
|
k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[53], v[84], v[53]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[53]))
|
|
k.emit(v_mov_b32_e32(v[53], v[22]))
|
|
k.emit(v_cvt_pk(v[53], v[53], v[53]))
|
|
k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[54], v[88], v[54]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[54]))
|
|
k.emit(v_mov_b32_e32(v[54], v[22]))
|
|
k.emit(v_cvt_pk(v[54], v[54], v[54]))
|
|
k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[55], v[92], v[55]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[55]))
|
|
k.emit(v_mov_b32_e32(v[55], v[22]))
|
|
k.emit(v_cvt_pk(v[55], v[55], v[55]))
|
|
k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[56], v[96], v[56]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[56]))
|
|
k.emit(v_mov_b32_e32(v[56], v[22]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[56]))
|
|
k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[57], v[100], v[57]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[57]))
|
|
k.emit(v_mov_b32_e32(v[57], v[22]))
|
|
k.emit(v_cvt_pk(v[57], v[57], v[57]))
|
|
k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[58], v[104], v[58]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[58]))
|
|
k.emit(v_mov_b32_e32(v[58], v[22]))
|
|
k.emit(v_cvt_pk(v[58], v[58], v[58]))
|
|
k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[59], v[76], v[59]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[59]))
|
|
k.emit(v_mov_b32_e32(v[59], v[22]))
|
|
k.emit(v_cvt_pk(v[59], v[59], v[59]))
|
|
k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[60], v[80], v[60]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[60]))
|
|
k.emit(v_mov_b32_e32(v[60], v[22]))
|
|
k.emit(v_cvt_pk(v[60], v[60], v[60]))
|
|
k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[61], v[84], v[61]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[61]))
|
|
k.emit(v_mov_b32_e32(v[61], v[22]))
|
|
k.emit(v_cvt_pk(v[61], v[61], v[61]))
|
|
k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[62], v[88], v[62]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[62]))
|
|
k.emit(v_mov_b32_e32(v[62], v[22]))
|
|
k.emit(v_cvt_pk(v[62], v[62], v[62]))
|
|
k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[63], v[92], v[63]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[63]))
|
|
k.emit(v_mov_b32_e32(v[63], v[22]))
|
|
k.emit(v_cvt_pk(v[63], v[63], v[63]))
|
|
k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[64], v[96], v[64]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[64]))
|
|
k.emit(v_mov_b32_e32(v[64], v[22]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[64]))
|
|
k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[65], v[100], v[65]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[65]))
|
|
k.emit(v_mov_b32_e32(v[65], v[22]))
|
|
k.emit(v_cvt_pk(v[65], v[65], v[65]))
|
|
k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[66], v[104], v[66]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[66]))
|
|
k.emit(v_mov_b32_e32(v[66], v[22]))
|
|
k.emit(v_cvt_pk(v[66], v[66], v[66]))
|
|
k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[67], v[76], v[67]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[67]))
|
|
k.emit(v_mov_b32_e32(v[67], v[22]))
|
|
k.emit(v_cvt_pk(v[67], v[67], v[67]))
|
|
k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[68], v[80], v[68]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[68]))
|
|
k.emit(v_mov_b32_e32(v[68], v[22]))
|
|
k.emit(v_cvt_pk(v[68], v[68], v[68]))
|
|
k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[69], v[84], v[69]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[69]))
|
|
k.emit(v_mov_b32_e32(v[69], v[22]))
|
|
k.emit(v_cvt_pk(v[69], v[69], v[69]))
|
|
k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[70], v[88], v[70]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[70]))
|
|
k.emit(v_mov_b32_e32(v[70], v[22]))
|
|
k.emit(v_cvt_pk(v[70], v[70], v[70]))
|
|
k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[71], v[92], v[71]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[71]))
|
|
k.emit(v_mov_b32_e32(v[71], v[22]))
|
|
k.emit(v_cvt_pk(v[71], v[71], v[71]))
|
|
k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[72], v[96], v[72]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[72]))
|
|
k.emit(v_mov_b32_e32(v[72], v[22]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[72]))
|
|
k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[73], v[100], v[73]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[73]))
|
|
k.emit(v_mov_b32_e32(v[73], v[22]))
|
|
k.emit(v_cvt_pk(v[73], v[73], v[73]))
|
|
k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[74], v[104], v[74]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[74]))
|
|
k.emit(v_mov_b32_e32(v[74], v[22]))
|
|
k.emit(v_cvt_pk(v[74], v[74], v[74]))
|
|
k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[78], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[78], 2, v[78]))
|
|
k.emit(ds_read_b32(v[75], v[78]))
|
|
k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[82], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[82], 2, v[82]))
|
|
k.emit(ds_read_b32(v[79], v[82]))
|
|
k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[86], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[86], 2, v[86]))
|
|
k.emit(ds_read_b32(v[83], v[86]))
|
|
k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[90], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[90], 2, v[90]))
|
|
k.emit(ds_read_b32(v[87], v[90]))
|
|
k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[94], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[94], 2, v[94]))
|
|
k.emit(ds_read_b32(v[91], v[94]))
|
|
k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[98], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98]))
|
|
k.emit(ds_read_b32(v[95], v[98]))
|
|
k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[102], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[102], 2, v[102]))
|
|
k.emit(ds_read_b32(v[99], v[102]))
|
|
k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(ds_read_b32(v[103], v[106]))
|
|
k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[112], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[112], 2, v[112]))
|
|
k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[114], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[114], 2, v[114]))
|
|
k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[116], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[116], 2, v[116]))
|
|
k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[118], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[118], 2, v[118]))
|
|
k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[120], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[120], 2, v[120]))
|
|
k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[122], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[122], 2, v[122]))
|
|
k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[124], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[124], 2, v[124]))
|
|
k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[126], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[126], 2, v[126]))
|
|
k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[128], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[128], 2, v[128]))
|
|
k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[130], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[130], 2, v[130]))
|
|
k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[132], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[132], 2, v[132]))
|
|
k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[134], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[134], 2, v[134]))
|
|
k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[136], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[136], 2, v[136]))
|
|
k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[138], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[138], 2, v[138]))
|
|
k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[140], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[140], 2, v[140]))
|
|
k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[142], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[142], 2, v[142]))
|
|
k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[144], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[144], 2, v[144]))
|
|
k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[146], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[146], 2, v[146]))
|
|
k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[148], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[148], 2, v[148]))
|
|
k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[150], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[150], 2, v[150]))
|
|
k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[152], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[152], 2, v[152]))
|
|
k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[154], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[154], 2, v[154]))
|
|
k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[156], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[156], 2, v[156]))
|
|
k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[158], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[158], 2, v[158]))
|
|
k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[160], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[160], 2, v[160]))
|
|
k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[162], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[162], 2, v[162]))
|
|
k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[164], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[164], 2, v[164]))
|
|
k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[166], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[166], 2, v[166]))
|
|
k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[168], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[168], 2, v[168]))
|
|
k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[170], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[170], 2, v[170]))
|
|
k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[35], v[130]))
|
|
k.emit(v_accvgpr_read(v[36], v[134]))
|
|
k.emit(v_accvgpr_read(v[37], v[138]))
|
|
k.emit(v_accvgpr_read(v[38], v[142]))
|
|
k.emit(v_accvgpr_read(v[39], v[146]))
|
|
k.emit(v_accvgpr_read(v[40], v[150]))
|
|
k.emit(v_accvgpr_read(v[41], v[154]))
|
|
k.emit(v_accvgpr_read(v[42], v[158]))
|
|
k.emit(v_accvgpr_read(v[43], v[162]))
|
|
k.emit(v_accvgpr_read(v[44], v[166]))
|
|
k.emit(v_accvgpr_read(v[45], v[170]))
|
|
k.emit(v_accvgpr_read(v[46], v[174]))
|
|
k.emit(v_accvgpr_read(v[47], v[178]))
|
|
k.emit(v_accvgpr_read(v[48], v[182]))
|
|
k.emit(v_accvgpr_read(v[49], v[186]))
|
|
k.emit(v_accvgpr_read(v[50], v[190]))
|
|
k.emit(v_accvgpr_read(v[51], v[194]))
|
|
k.emit(v_accvgpr_read(v[52], v[198]))
|
|
k.emit(v_accvgpr_read(v[53], v[202]))
|
|
k.emit(v_accvgpr_read(v[54], v[206]))
|
|
k.emit(v_accvgpr_read(v[55], v[210]))
|
|
k.emit(v_accvgpr_read(v[56], v[214]))
|
|
k.emit(v_accvgpr_read(v[57], v[218]))
|
|
k.emit(v_accvgpr_read(v[58], v[222]))
|
|
k.emit(v_accvgpr_read(v[59], v[226]))
|
|
k.emit(v_accvgpr_read(v[60], v[230]))
|
|
k.emit(v_accvgpr_read(v[61], v[234]))
|
|
k.emit(v_accvgpr_read(v[62], v[238]))
|
|
k.emit(v_accvgpr_read(v[63], v[242]))
|
|
k.emit(v_accvgpr_read(v[64], v[246]))
|
|
k.emit(v_accvgpr_read(v[65], v[250]))
|
|
k.emit(v_accvgpr_read(v[66], v[254]))
|
|
k.emit(v_accvgpr_read(v[67], v[3]))
|
|
k.emit(v_accvgpr_read(v[68], v[7]))
|
|
k.emit(v_accvgpr_read(v[69], v[11]))
|
|
k.emit(v_accvgpr_read(v[70], v[15]))
|
|
k.emit(v_accvgpr_read(v[71], v[19]))
|
|
k.emit(v_accvgpr_read(v[72], v[23]))
|
|
k.emit(v_accvgpr_read(v[73], v[27]))
|
|
k.emit(v_accvgpr_read(v[74], v[31]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_mul_f32_e32(v[35], v[76], v[35]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[35]))
|
|
k.emit(v_mov_b32_e32(v[35], v[22]))
|
|
k.emit(v_cvt_pk(v[35], v[35], v[35]))
|
|
k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[36], v[80], v[36]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[36]))
|
|
k.emit(v_mov_b32_e32(v[36], v[22]))
|
|
k.emit(v_cvt_pk(v[36], v[36], v[36]))
|
|
k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[37], v[84], v[37]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[37]))
|
|
k.emit(v_mov_b32_e32(v[37], v[22]))
|
|
k.emit(v_cvt_pk(v[37], v[37], v[37]))
|
|
k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[38], v[88], v[38]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[38]))
|
|
k.emit(v_mov_b32_e32(v[38], v[22]))
|
|
k.emit(v_cvt_pk(v[38], v[38], v[38]))
|
|
k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[39], v[92], v[39]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[39]))
|
|
k.emit(v_mov_b32_e32(v[39], v[22]))
|
|
k.emit(v_cvt_pk(v[39], v[39], v[39]))
|
|
k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[40], v[96], v[40]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[40]))
|
|
k.emit(v_mov_b32_e32(v[40], v[22]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[40]))
|
|
k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[41], v[100], v[41]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[41]))
|
|
k.emit(v_mov_b32_e32(v[41], v[22]))
|
|
k.emit(v_cvt_pk(v[41], v[41], v[41]))
|
|
k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[42], v[104], v[42]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[42]))
|
|
k.emit(v_mov_b32_e32(v[42], v[22]))
|
|
k.emit(v_cvt_pk(v[42], v[42], v[42]))
|
|
k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[43], v[76], v[43]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[43]))
|
|
k.emit(v_mov_b32_e32(v[43], v[22]))
|
|
k.emit(v_cvt_pk(v[43], v[43], v[43]))
|
|
k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[44], v[80], v[44]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[44]))
|
|
k.emit(v_mov_b32_e32(v[44], v[22]))
|
|
k.emit(v_cvt_pk(v[44], v[44], v[44]))
|
|
k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[45], v[84], v[45]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[45]))
|
|
k.emit(v_mov_b32_e32(v[45], v[22]))
|
|
k.emit(v_cvt_pk(v[45], v[45], v[45]))
|
|
k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[46], v[88], v[46]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[46]))
|
|
k.emit(v_mov_b32_e32(v[46], v[22]))
|
|
k.emit(v_cvt_pk(v[46], v[46], v[46]))
|
|
k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[47], v[92], v[47]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[47]))
|
|
k.emit(v_mov_b32_e32(v[47], v[22]))
|
|
k.emit(v_cvt_pk(v[47], v[47], v[47]))
|
|
k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[48], v[96], v[48]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[48]))
|
|
k.emit(v_mov_b32_e32(v[48], v[22]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[48]))
|
|
k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[49], v[100], v[49]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[49]))
|
|
k.emit(v_mov_b32_e32(v[49], v[22]))
|
|
k.emit(v_cvt_pk(v[49], v[49], v[49]))
|
|
k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[50], v[104], v[50]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[50]))
|
|
k.emit(v_mov_b32_e32(v[50], v[22]))
|
|
k.emit(v_cvt_pk(v[50], v[50], v[50]))
|
|
k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[51], v[76], v[51]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[51]))
|
|
k.emit(v_mov_b32_e32(v[51], v[22]))
|
|
k.emit(v_cvt_pk(v[51], v[51], v[51]))
|
|
k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[52], v[80], v[52]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[52]))
|
|
k.emit(v_mov_b32_e32(v[52], v[22]))
|
|
k.emit(v_cvt_pk(v[52], v[52], v[52]))
|
|
k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[53], v[84], v[53]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[53]))
|
|
k.emit(v_mov_b32_e32(v[53], v[22]))
|
|
k.emit(v_cvt_pk(v[53], v[53], v[53]))
|
|
k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[54], v[88], v[54]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[54]))
|
|
k.emit(v_mov_b32_e32(v[54], v[22]))
|
|
k.emit(v_cvt_pk(v[54], v[54], v[54]))
|
|
k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[55], v[92], v[55]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[55]))
|
|
k.emit(v_mov_b32_e32(v[55], v[22]))
|
|
k.emit(v_cvt_pk(v[55], v[55], v[55]))
|
|
k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[56], v[96], v[56]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[56]))
|
|
k.emit(v_mov_b32_e32(v[56], v[22]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[56]))
|
|
k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[57], v[100], v[57]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[57]))
|
|
k.emit(v_mov_b32_e32(v[57], v[22]))
|
|
k.emit(v_cvt_pk(v[57], v[57], v[57]))
|
|
k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[58], v[104], v[58]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[58]))
|
|
k.emit(v_mov_b32_e32(v[58], v[22]))
|
|
k.emit(v_cvt_pk(v[58], v[58], v[58]))
|
|
k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[59], v[76], v[59]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[59]))
|
|
k.emit(v_mov_b32_e32(v[59], v[22]))
|
|
k.emit(v_cvt_pk(v[59], v[59], v[59]))
|
|
k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[60], v[80], v[60]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[60]))
|
|
k.emit(v_mov_b32_e32(v[60], v[22]))
|
|
k.emit(v_cvt_pk(v[60], v[60], v[60]))
|
|
k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[61], v[84], v[61]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[61]))
|
|
k.emit(v_mov_b32_e32(v[61], v[22]))
|
|
k.emit(v_cvt_pk(v[61], v[61], v[61]))
|
|
k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[62], v[88], v[62]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[62]))
|
|
k.emit(v_mov_b32_e32(v[62], v[22]))
|
|
k.emit(v_cvt_pk(v[62], v[62], v[62]))
|
|
k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[63], v[92], v[63]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[63]))
|
|
k.emit(v_mov_b32_e32(v[63], v[22]))
|
|
k.emit(v_cvt_pk(v[63], v[63], v[63]))
|
|
k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[64], v[96], v[64]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[64]))
|
|
k.emit(v_mov_b32_e32(v[64], v[22]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[64]))
|
|
k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[65], v[100], v[65]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[65]))
|
|
k.emit(v_mov_b32_e32(v[65], v[22]))
|
|
k.emit(v_cvt_pk(v[65], v[65], v[65]))
|
|
k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[66], v[104], v[66]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[66]))
|
|
k.emit(v_mov_b32_e32(v[66], v[22]))
|
|
k.emit(v_cvt_pk(v[66], v[66], v[66]))
|
|
k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[67], v[76], v[67]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[67]))
|
|
k.emit(v_mov_b32_e32(v[67], v[22]))
|
|
k.emit(v_cvt_pk(v[67], v[67], v[67]))
|
|
k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[68], v[80], v[68]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[68]))
|
|
k.emit(v_mov_b32_e32(v[68], v[22]))
|
|
k.emit(v_cvt_pk(v[68], v[68], v[68]))
|
|
k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[69], v[84], v[69]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[69]))
|
|
k.emit(v_mov_b32_e32(v[69], v[22]))
|
|
k.emit(v_cvt_pk(v[69], v[69], v[69]))
|
|
k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[70], v[88], v[70]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[70]))
|
|
k.emit(v_mov_b32_e32(v[70], v[22]))
|
|
k.emit(v_cvt_pk(v[70], v[70], v[70]))
|
|
k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[71], v[92], v[71]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[71]))
|
|
k.emit(v_mov_b32_e32(v[71], v[22]))
|
|
k.emit(v_cvt_pk(v[71], v[71], v[71]))
|
|
k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[72], v[96], v[72]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[72]))
|
|
k.emit(v_mov_b32_e32(v[72], v[22]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[72]))
|
|
k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[73], v[100], v[73]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[73]))
|
|
k.emit(v_mov_b32_e32(v[73], v[22]))
|
|
k.emit(v_cvt_pk(v[73], v[73], v[73]))
|
|
k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[74], v[104], v[74]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[74]))
|
|
k.emit(v_mov_b32_e32(v[74], v[22]))
|
|
k.emit(v_cvt_pk(v[74], v[74], v[74]))
|
|
k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
# No-op instruction emitted right after the preceding run of buffer_store_short writes.
k.emit(s_nop())
|
|
# v[30] = 2147483648 (0x80000000). It is the first source of every v_cndmask_b32_e64 below,
# i.e. the alternative to the computed store offset.
# NOTE(review): presumably an out-of-range offset so that masked-off lanes' stores are dropped
# by the buffer bounds check -- confirm against the ISA documentation.
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
# Advance to the next row: v[19] += 1 (carry-out goes to VCC), v[20] += s[38], v[21] += s[36].
# NOTE(review): v[19] looks like a row index and s[36]/s[38] like per-row strides -- confirm.
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
# --- First column of this row (column index v[18]) ---
# Mask s[82:83] = (v[18] < s[20]) AND (v[19] < s[21]); s[78:79] only holds the first compare.
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
# s[78] is now reused as a scalar temporary: s[78] = 256 * s[2].
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
# v[78] = (v[18] - s[78]) << 2, used as the address operand of the two ds_read_b32 below.
k.emit(v_sub_u32_e64(v[78], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[78], 2, v[78]))
|
|
# Load v[75] and v[76] from the address in v[78]. Later store groups use v[75] as the addend
# and v[76] as the multiplier for this column.
# NOTE(review): the trailing 4 on the second read is presumably an offset selecting a different
# dword than the first read -- confirm against the ds_read_b32 signature.
k.emit(ds_read_b32(v[75], v[78]))
|
|
k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4))
|
|
# v[77] = (v[21] + v[18]) << 1, then replaced by a select between v[30] and v[77] under
# mask s[82:83]. v[77] is later the address operand of a buffer_store_short.
k.emit(v_add_lshl_u32_e64(v[77], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[82], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[82], 2, v[82]))
|
|
k.emit(ds_read_b32(v[79], v[82]))
|
|
k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[86], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[86], 2, v[86]))
|
|
k.emit(ds_read_b32(v[83], v[86]))
|
|
k.emit(ds_read_b32(v[84], v[86], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[90], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[90], 2, v[90]))
|
|
k.emit(ds_read_b32(v[87], v[90]))
|
|
k.emit(ds_read_b32(v[88], v[90], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[94], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[94], 2, v[94]))
|
|
k.emit(ds_read_b32(v[91], v[94]))
|
|
k.emit(ds_read_b32(v[92], v[94], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[98], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98]))
|
|
k.emit(ds_read_b32(v[95], v[98]))
|
|
k.emit(ds_read_b32(v[96], v[98], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[102], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[102], 2, v[102]))
|
|
k.emit(ds_read_b32(v[99], v[102]))
|
|
k.emit(ds_read_b32(v[100], v[102], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[101], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[101], v[30], v[101], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[106], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[106], 2, v[106]))
|
|
k.emit(ds_read_b32(v[103], v[106]))
|
|
k.emit(ds_read_b32(v[104], v[106], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[105], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[105], v[30], v[105], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[108], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[108], 2, v[108]))
|
|
k.emit(v_add_lshl_u32_e64(v[107], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[107], v[30], v[107], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[110], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[110], 2, v[110]))
|
|
k.emit(v_add_lshl_u32_e64(v[109], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[109], v[30], v[109], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[112], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[112], 2, v[112]))
|
|
k.emit(v_add_lshl_u32_e64(v[111], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[111], v[30], v[111], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[114], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[114], 2, v[114]))
|
|
k.emit(v_add_lshl_u32_e64(v[113], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[113], v[30], v[113], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[116], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[116], 2, v[116]))
|
|
k.emit(v_add_lshl_u32_e64(v[115], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[115], v[30], v[115], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[118], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[118], 2, v[118]))
|
|
k.emit(v_add_lshl_u32_e64(v[117], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[117], v[30], v[117], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[120], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[120], 2, v[120]))
|
|
k.emit(v_add_lshl_u32_e64(v[119], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[119], v[30], v[119], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[122], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[122], 2, v[122]))
|
|
k.emit(v_add_lshl_u32_e64(v[121], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[121], v[30], v[121], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[124], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[124], 2, v[124]))
|
|
k.emit(v_add_lshl_u32_e64(v[123], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[123], v[30], v[123], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[126], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[126], 2, v[126]))
|
|
k.emit(v_add_lshl_u32_e64(v[125], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[125], v[30], v[125], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[128], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[128], 2, v[128]))
|
|
k.emit(v_add_lshl_u32_e64(v[127], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[127], v[30], v[127], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[130], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[130], 2, v[130]))
|
|
k.emit(v_add_lshl_u32_e64(v[129], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[129], v[30], v[129], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[132], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[132], 2, v[132]))
|
|
k.emit(v_add_lshl_u32_e64(v[131], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[131], v[30], v[131], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[134], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[134], 2, v[134]))
|
|
k.emit(v_add_lshl_u32_e64(v[133], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[133], v[30], v[133], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[136], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[136], 2, v[136]))
|
|
k.emit(v_add_lshl_u32_e64(v[135], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[135], v[30], v[135], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[138], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[138], 2, v[138]))
|
|
k.emit(v_add_lshl_u32_e64(v[137], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[137], v[30], v[137], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[140], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[140], 2, v[140]))
|
|
k.emit(v_add_lshl_u32_e64(v[139], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[139], v[30], v[139], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[142], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[142], 2, v[142]))
|
|
k.emit(v_add_lshl_u32_e64(v[141], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[141], v[30], v[141], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[144], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[144], 2, v[144]))
|
|
k.emit(v_add_lshl_u32_e64(v[143], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[143], v[30], v[143], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[146], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[146], 2, v[146]))
|
|
k.emit(v_add_lshl_u32_e64(v[145], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[145], v[30], v[145], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[148], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[148], 2, v[148]))
|
|
k.emit(v_add_lshl_u32_e64(v[147], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[147], v[30], v[147], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[150], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[150], 2, v[150]))
|
|
k.emit(v_add_lshl_u32_e64(v[149], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[149], v[30], v[149], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[152], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[152], 2, v[152]))
|
|
k.emit(v_add_lshl_u32_e64(v[151], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[151], v[30], v[151], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[154], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[154], 2, v[154]))
|
|
k.emit(v_add_lshl_u32_e64(v[153], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[153], v[30], v[153], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[156], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[156], 2, v[156]))
|
|
k.emit(v_add_lshl_u32_e64(v[155], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[155], v[30], v[155], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[158], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[158], 2, v[158]))
|
|
k.emit(v_add_lshl_u32_e64(v[157], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[157], v[30], v[157], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[160], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[160], 2, v[160]))
|
|
k.emit(v_add_lshl_u32_e64(v[159], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[159], v[30], v[159], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[162], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[162], 2, v[162]))
|
|
k.emit(v_add_lshl_u32_e64(v[161], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[161], v[30], v[161], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[164], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[164], 2, v[164]))
|
|
k.emit(v_add_lshl_u32_e64(v[163], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[163], v[30], v[163], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[166], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[166], 2, v[166]))
|
|
k.emit(v_add_lshl_u32_e64(v[165], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[165], v[30], v[165], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[168], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[168], 2, v[168]))
|
|
k.emit(v_add_lshl_u32_e64(v[167], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[167], v[30], v[167], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[170], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[170], 2, v[170]))
|
|
k.emit(v_add_lshl_u32_e64(v[169], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[169], v[30], v[169], s[82:83]))
|
|
# Fill v[35]..v[74] with 40 v_accvgpr_read moves. Destinations are consecutive; the source
# index starts at 35 and advances by 4 per read (35, 39, ..., 191).
# NOTE(review): the sources are spelled v[...] although the instruction name says it reads an
# accumulation register -- presumably the encoder maps the index to an AccVGPR; confirm.
k.emit(v_accvgpr_read(v[35], v[35]))
|
|
k.emit(v_accvgpr_read(v[36], v[39]))
|
|
k.emit(v_accvgpr_read(v[37], v[43]))
|
|
k.emit(v_accvgpr_read(v[38], v[47]))
|
|
k.emit(v_accvgpr_read(v[39], v[51]))
|
|
k.emit(v_accvgpr_read(v[40], v[55]))
|
|
k.emit(v_accvgpr_read(v[41], v[59]))
|
|
k.emit(v_accvgpr_read(v[42], v[63]))
|
|
k.emit(v_accvgpr_read(v[43], v[67]))
|
|
k.emit(v_accvgpr_read(v[44], v[71]))
|
|
k.emit(v_accvgpr_read(v[45], v[75]))
|
|
k.emit(v_accvgpr_read(v[46], v[79]))
|
|
k.emit(v_accvgpr_read(v[47], v[83]))
|
|
k.emit(v_accvgpr_read(v[48], v[87]))
|
|
k.emit(v_accvgpr_read(v[49], v[91]))
|
|
k.emit(v_accvgpr_read(v[50], v[95]))
|
|
k.emit(v_accvgpr_read(v[51], v[99]))
|
|
k.emit(v_accvgpr_read(v[52], v[103]))
|
|
k.emit(v_accvgpr_read(v[53], v[107]))
|
|
k.emit(v_accvgpr_read(v[54], v[111]))
|
|
k.emit(v_accvgpr_read(v[55], v[115]))
|
|
k.emit(v_accvgpr_read(v[56], v[119]))
|
|
k.emit(v_accvgpr_read(v[57], v[123]))
|
|
k.emit(v_accvgpr_read(v[58], v[127]))
|
|
k.emit(v_accvgpr_read(v[59], v[131]))
|
|
k.emit(v_accvgpr_read(v[60], v[135]))
|
|
k.emit(v_accvgpr_read(v[61], v[139]))
|
|
k.emit(v_accvgpr_read(v[62], v[143]))
|
|
k.emit(v_accvgpr_read(v[63], v[147]))
|
|
k.emit(v_accvgpr_read(v[64], v[151]))
|
|
k.emit(v_accvgpr_read(v[65], v[155]))
|
|
k.emit(v_accvgpr_read(v[66], v[159]))
|
|
k.emit(v_accvgpr_read(v[67], v[163]))
|
|
k.emit(v_accvgpr_read(v[68], v[167]))
|
|
k.emit(v_accvgpr_read(v[69], v[171]))
|
|
k.emit(v_accvgpr_read(v[70], v[175]))
|
|
k.emit(v_accvgpr_read(v[71], v[179]))
|
|
k.emit(v_accvgpr_read(v[72], v[183]))
|
|
k.emit(v_accvgpr_read(v[73], v[187]))
|
|
k.emit(v_accvgpr_read(v[74], v[191]))
|
|
# Emit s_waitcnt with lgkmcnt=0. Kernel.waitcnt keeps vmcnt at its default 63 and expcnt at 7,
# so the encoded immediate is 0xC07F.
# NOTE(review): presumably this waits for the ds_read_b32 results (v[75], v[76], ...) that the
# store groups below consume -- confirm.
k.waitcnt(lgkm=0)
|
|
# Constants: v[32] = 0xFFFF0000, v[33] = 0x7FFF0000, v[34] = 0x7FFF.
# NOTE(review): these look like 16-bit rounding/masking constants; the store groups that follow
# do not read v[32]..v[34] -- confirm whether they are still needed.
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
# --- Store group for v[35]; the same five-instruction shape repeats for v[36]..v[74] ---
# v[35] = v[76] * v[35]; v[22] = v[75] + v[35]; v[35] = v[22]  (v[22] is a scratch register).
k.emit(v_mul_f32_e32(v[35], v[76], v[35]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[35]))
|
|
k.emit(v_mov_b32_e32(v[35], v[22]))
|
|
# NOTE(review): v_cvt_pk presumably converts/packs the f32 result to a 16-bit format ahead of
# the 16-bit store -- confirm which conversion this helper encodes.
k.emit(v_cvt_pk(v[35], v[35], v[35]))
|
|
# Write v[35] with buffer_store_short using v[77] (the masked offset computed earlier) and s[12:15].
# NOTE(review): the meaning of the trailing flag arguments is not visible here -- check the
# buffer_store_short signature before changing them.
k.emit(buffer_store_short(v[35], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[36], v[80], v[36]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[36]))
|
|
k.emit(v_mov_b32_e32(v[36], v[22]))
|
|
k.emit(v_cvt_pk(v[36], v[36], v[36]))
|
|
k.emit(buffer_store_short(v[36], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[37], v[84], v[37]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[37]))
|
|
k.emit(v_mov_b32_e32(v[37], v[22]))
|
|
k.emit(v_cvt_pk(v[37], v[37], v[37]))
|
|
k.emit(buffer_store_short(v[37], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[38], v[88], v[38]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[38]))
|
|
k.emit(v_mov_b32_e32(v[38], v[22]))
|
|
k.emit(v_cvt_pk(v[38], v[38], v[38]))
|
|
k.emit(buffer_store_short(v[38], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[39], v[92], v[39]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[39]))
|
|
k.emit(v_mov_b32_e32(v[39], v[22]))
|
|
k.emit(v_cvt_pk(v[39], v[39], v[39]))
|
|
k.emit(buffer_store_short(v[39], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[40], v[96], v[40]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[40]))
|
|
k.emit(v_mov_b32_e32(v[40], v[22]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[40]))
|
|
k.emit(buffer_store_short(v[40], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[41], v[100], v[41]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[41]))
|
|
k.emit(v_mov_b32_e32(v[41], v[22]))
|
|
k.emit(v_cvt_pk(v[41], v[41], v[41]))
|
|
k.emit(buffer_store_short(v[41], v[101], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[42], v[104], v[42]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[42]))
|
|
k.emit(v_mov_b32_e32(v[42], v[22]))
|
|
k.emit(v_cvt_pk(v[42], v[42], v[42]))
|
|
k.emit(buffer_store_short(v[42], v[105], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[43], v[76], v[43]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[43]))
|
|
k.emit(v_mov_b32_e32(v[43], v[22]))
|
|
k.emit(v_cvt_pk(v[43], v[43], v[43]))
|
|
k.emit(buffer_store_short(v[43], v[107], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[44], v[80], v[44]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[44]))
|
|
k.emit(v_mov_b32_e32(v[44], v[22]))
|
|
k.emit(v_cvt_pk(v[44], v[44], v[44]))
|
|
k.emit(buffer_store_short(v[44], v[109], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[45], v[84], v[45]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[45]))
|
|
k.emit(v_mov_b32_e32(v[45], v[22]))
|
|
k.emit(v_cvt_pk(v[45], v[45], v[45]))
|
|
k.emit(buffer_store_short(v[45], v[111], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[46], v[88], v[46]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[46]))
|
|
k.emit(v_mov_b32_e32(v[46], v[22]))
|
|
k.emit(v_cvt_pk(v[46], v[46], v[46]))
|
|
k.emit(buffer_store_short(v[46], v[113], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[47], v[92], v[47]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[47]))
|
|
k.emit(v_mov_b32_e32(v[47], v[22]))
|
|
k.emit(v_cvt_pk(v[47], v[47], v[47]))
|
|
k.emit(buffer_store_short(v[47], v[115], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[48], v[96], v[48]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[48]))
|
|
k.emit(v_mov_b32_e32(v[48], v[22]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[48]))
|
|
k.emit(buffer_store_short(v[48], v[117], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[49], v[100], v[49]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[49]))
|
|
k.emit(v_mov_b32_e32(v[49], v[22]))
|
|
k.emit(v_cvt_pk(v[49], v[49], v[49]))
|
|
k.emit(buffer_store_short(v[49], v[119], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[50], v[104], v[50]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[50]))
|
|
k.emit(v_mov_b32_e32(v[50], v[22]))
|
|
k.emit(v_cvt_pk(v[50], v[50], v[50]))
|
|
k.emit(buffer_store_short(v[50], v[121], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[51], v[76], v[51]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[51]))
|
|
k.emit(v_mov_b32_e32(v[51], v[22]))
|
|
k.emit(v_cvt_pk(v[51], v[51], v[51]))
|
|
k.emit(buffer_store_short(v[51], v[123], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[52], v[80], v[52]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[52]))
|
|
k.emit(v_mov_b32_e32(v[52], v[22]))
|
|
k.emit(v_cvt_pk(v[52], v[52], v[52]))
|
|
k.emit(buffer_store_short(v[52], v[125], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[53], v[84], v[53]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[53]))
|
|
k.emit(v_mov_b32_e32(v[53], v[22]))
|
|
k.emit(v_cvt_pk(v[53], v[53], v[53]))
|
|
k.emit(buffer_store_short(v[53], v[127], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[54], v[88], v[54]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[54]))
|
|
k.emit(v_mov_b32_e32(v[54], v[22]))
|
|
k.emit(v_cvt_pk(v[54], v[54], v[54]))
|
|
k.emit(buffer_store_short(v[54], v[129], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[55], v[92], v[55]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[55]))
|
|
k.emit(v_mov_b32_e32(v[55], v[22]))
|
|
k.emit(v_cvt_pk(v[55], v[55], v[55]))
|
|
k.emit(buffer_store_short(v[55], v[131], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[56], v[96], v[56]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[56]))
|
|
k.emit(v_mov_b32_e32(v[56], v[22]))
|
|
k.emit(v_cvt_pk(v[56], v[56], v[56]))
|
|
k.emit(buffer_store_short(v[56], v[133], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[57], v[100], v[57]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[57]))
|
|
k.emit(v_mov_b32_e32(v[57], v[22]))
|
|
k.emit(v_cvt_pk(v[57], v[57], v[57]))
|
|
k.emit(buffer_store_short(v[57], v[135], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[58], v[104], v[58]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[58]))
|
|
k.emit(v_mov_b32_e32(v[58], v[22]))
|
|
k.emit(v_cvt_pk(v[58], v[58], v[58]))
|
|
k.emit(buffer_store_short(v[58], v[137], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[59], v[76], v[59]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[59]))
|
|
k.emit(v_mov_b32_e32(v[59], v[22]))
|
|
k.emit(v_cvt_pk(v[59], v[59], v[59]))
|
|
k.emit(buffer_store_short(v[59], v[139], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[60], v[80], v[60]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[60]))
|
|
k.emit(v_mov_b32_e32(v[60], v[22]))
|
|
k.emit(v_cvt_pk(v[60], v[60], v[60]))
|
|
k.emit(buffer_store_short(v[60], v[141], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[61], v[84], v[61]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[61]))
|
|
k.emit(v_mov_b32_e32(v[61], v[22]))
|
|
k.emit(v_cvt_pk(v[61], v[61], v[61]))
|
|
k.emit(buffer_store_short(v[61], v[143], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[62], v[88], v[62]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[62]))
|
|
k.emit(v_mov_b32_e32(v[62], v[22]))
|
|
k.emit(v_cvt_pk(v[62], v[62], v[62]))
|
|
k.emit(buffer_store_short(v[62], v[145], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[63], v[92], v[63]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[63]))
|
|
k.emit(v_mov_b32_e32(v[63], v[22]))
|
|
k.emit(v_cvt_pk(v[63], v[63], v[63]))
|
|
k.emit(buffer_store_short(v[63], v[147], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[64], v[96], v[64]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[64]))
|
|
k.emit(v_mov_b32_e32(v[64], v[22]))
|
|
k.emit(v_cvt_pk(v[64], v[64], v[64]))
|
|
k.emit(buffer_store_short(v[64], v[149], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[65], v[100], v[65]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[65]))
|
|
k.emit(v_mov_b32_e32(v[65], v[22]))
|
|
k.emit(v_cvt_pk(v[65], v[65], v[65]))
|
|
k.emit(buffer_store_short(v[65], v[151], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[66], v[104], v[66]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[66]))
|
|
k.emit(v_mov_b32_e32(v[66], v[22]))
|
|
k.emit(v_cvt_pk(v[66], v[66], v[66]))
|
|
k.emit(buffer_store_short(v[66], v[153], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[67], v[76], v[67]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[67]))
|
|
k.emit(v_mov_b32_e32(v[67], v[22]))
|
|
k.emit(v_cvt_pk(v[67], v[67], v[67]))
|
|
k.emit(buffer_store_short(v[67], v[155], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[68], v[80], v[68]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[68]))
|
|
k.emit(v_mov_b32_e32(v[68], v[22]))
|
|
k.emit(v_cvt_pk(v[68], v[68], v[68]))
|
|
k.emit(buffer_store_short(v[68], v[157], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[69], v[84], v[69]))
|
|
k.emit(v_add_f32_e32(v[22], v[83], v[69]))
|
|
k.emit(v_mov_b32_e32(v[69], v[22]))
|
|
k.emit(v_cvt_pk(v[69], v[69], v[69]))
|
|
k.emit(buffer_store_short(v[69], v[159], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[70], v[88], v[70]))
|
|
k.emit(v_add_f32_e32(v[22], v[87], v[70]))
|
|
k.emit(v_mov_b32_e32(v[70], v[22]))
|
|
k.emit(v_cvt_pk(v[70], v[70], v[70]))
|
|
k.emit(buffer_store_short(v[70], v[161], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[71], v[92], v[71]))
|
|
k.emit(v_add_f32_e32(v[22], v[91], v[71]))
|
|
k.emit(v_mov_b32_e32(v[71], v[22]))
|
|
k.emit(v_cvt_pk(v[71], v[71], v[71]))
|
|
k.emit(buffer_store_short(v[71], v[163], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[72], v[96], v[72]))
|
|
k.emit(v_add_f32_e32(v[22], v[95], v[72]))
|
|
k.emit(v_mov_b32_e32(v[72], v[22]))
|
|
k.emit(v_cvt_pk(v[72], v[72], v[72]))
|
|
k.emit(buffer_store_short(v[72], v[165], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[73], v[100], v[73]))
|
|
k.emit(v_add_f32_e32(v[22], v[99], v[73]))
|
|
k.emit(v_mov_b32_e32(v[73], v[22]))
|
|
k.emit(v_cvt_pk(v[73], v[73], v[73]))
|
|
k.emit(buffer_store_short(v[73], v[167], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[74], v[104], v[74]))
|
|
k.emit(v_add_f32_e32(v[22], v[103], v[74]))
|
|
k.emit(v_mov_b32_e32(v[74], v[22]))
|
|
k.emit(v_cvt_pk(v[74], v[74], v[74]))
|
|
k.emit(buffer_store_short(v[74], v[169], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(v_mov_b32_e32(v[30], 2147483648))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[54], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[54], 2, v[54]))
|
|
k.emit(ds_read_b32(v[51], v[54]))
|
|
k.emit(ds_read_b32(v[52], v[54], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[53], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[53], v[30], v[53], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[58], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[58], 2, v[58]))
|
|
k.emit(ds_read_b32(v[55], v[58]))
|
|
k.emit(ds_read_b32(v[56], v[58], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[57], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[57], v[30], v[57], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[62], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[62], 2, v[62]))
|
|
k.emit(ds_read_b32(v[59], v[62]))
|
|
k.emit(ds_read_b32(v[60], v[62], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[61], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[61], v[30], v[61], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[66], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[66], 2, v[66]))
|
|
k.emit(ds_read_b32(v[63], v[66]))
|
|
k.emit(ds_read_b32(v[64], v[66], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[65], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[65], v[30], v[65], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[70], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[70], 2, v[70]))
|
|
k.emit(ds_read_b32(v[67], v[70]))
|
|
k.emit(ds_read_b32(v[68], v[70], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[69], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[69], v[30], v[69], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[74], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[74], 2, v[74]))
|
|
k.emit(ds_read_b32(v[71], v[74]))
|
|
k.emit(ds_read_b32(v[72], v[74], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[73], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[73], v[30], v[73], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[78], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[78], 2, v[78]))
|
|
k.emit(ds_read_b32(v[75], v[78]))
|
|
k.emit(ds_read_b32(v[76], v[78], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[77], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[77], v[30], v[77], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[82], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[82], 2, v[82]))
|
|
k.emit(ds_read_b32(v[79], v[82]))
|
|
k.emit(ds_read_b32(v[80], v[82], v[0], v[0], 0, 0, 4))
|
|
k.emit(v_add_lshl_u32_e64(v[81], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[81], v[30], v[81], s[82:83]))
|
|
k.emit(v_add_co_u32(v[19], VCC, v[19], 1))
|
|
k.emit(v_add_u32_e64(v[20], v[20], s[38]))
|
|
k.emit(v_add_u32_e64(v[21], v[21], s[36]))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[18], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[84], v[18], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[84], 2, v[84]))
|
|
k.emit(v_add_lshl_u32_e64(v[83], v[21], v[18], 1))
|
|
k.emit(v_cndmask_b32_e64(v[83], v[30], v[83], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 1))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[86], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[86], 2, v[86]))
|
|
k.emit(v_add_lshl_u32_e64(v[85], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[85], v[30], v[85], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 2))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[88], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[88], 2, v[88]))
|
|
k.emit(v_add_lshl_u32_e64(v[87], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[87], v[30], v[87], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 3))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[90], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[90], 2, v[90]))
|
|
k.emit(v_add_lshl_u32_e64(v[89], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[89], v[30], v[89], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 4))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[92], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[92], 2, v[92]))
|
|
k.emit(v_add_lshl_u32_e64(v[91], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[91], v[30], v[91], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 5))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[94], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[94], 2, v[94]))
|
|
k.emit(v_add_lshl_u32_e64(v[93], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[93], v[30], v[93], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 6))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[96], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[96], 2, v[96]))
|
|
k.emit(v_add_lshl_u32_e64(v[95], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[95], v[30], v[95], s[82:83]))
|
|
k.emit(v_add_co_u32(v[22], VCC, v[18], 7))
|
|
k.emit(v_cmp_lt_u32_e64(s[78:79], v[22], s[20]))
|
|
k.emit(v_cmp_lt_u32_e64(s[82:83], v[19], s[21]))
|
|
k.emit(s_and_b64(s[82:83], s[78:79], s[82:83]))
|
|
k.emit(s_mul_i32(s[78], 256, s[2]))
|
|
k.emit(v_sub_u32_e64(v[98], v[22], s[78]))
|
|
k.emit(v_lshlrev_b32_e32(v[98], 2, v[98]))
|
|
k.emit(v_add_lshl_u32_e64(v[97], v[21], v[22], 1))
|
|
k.emit(v_cndmask_b32_e64(v[97], v[30], v[97], s[82:83]))
|
|
k.emit(v_accvgpr_read(v[35], v[195]))
|
|
k.emit(v_accvgpr_read(v[36], v[199]))
|
|
k.emit(v_accvgpr_read(v[37], v[203]))
|
|
k.emit(v_accvgpr_read(v[38], v[207]))
|
|
k.emit(v_accvgpr_read(v[39], v[211]))
|
|
k.emit(v_accvgpr_read(v[40], v[215]))
|
|
k.emit(v_accvgpr_read(v[41], v[219]))
|
|
k.emit(v_accvgpr_read(v[42], v[223]))
|
|
k.emit(v_accvgpr_read(v[43], v[227]))
|
|
k.emit(v_accvgpr_read(v[44], v[231]))
|
|
k.emit(v_accvgpr_read(v[45], v[235]))
|
|
k.emit(v_accvgpr_read(v[46], v[239]))
|
|
k.emit(v_accvgpr_read(v[47], v[243]))
|
|
k.emit(v_accvgpr_read(v[48], v[247]))
|
|
k.emit(v_accvgpr_read(v[49], v[251]))
|
|
k.emit(v_accvgpr_read(v[50], v[255]))
|
|
k.waitcnt(lgkm=0)
|
|
k.emit(v_mov_b32_e32(v[32], 4294901760))
|
|
k.emit(v_mov_b32_e32(v[33], 2147418112))
|
|
k.emit(v_mov_b32_e32(v[34], 32767))
|
|
k.emit(v_mul_f32_e32(v[35], v[52], v[35]))
|
|
k.emit(v_add_f32_e32(v[22], v[51], v[35]))
|
|
k.emit(v_mov_b32_e32(v[35], v[22]))
|
|
k.emit(v_cvt_pk(v[35], v[35], v[35]))
|
|
k.emit(buffer_store_short(v[35], v[53], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[36], v[56], v[36]))
|
|
k.emit(v_add_f32_e32(v[22], v[55], v[36]))
|
|
k.emit(v_mov_b32_e32(v[36], v[22]))
|
|
k.emit(v_cvt_pk(v[36], v[36], v[36]))
|
|
k.emit(buffer_store_short(v[36], v[57], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[37], v[60], v[37]))
|
|
k.emit(v_add_f32_e32(v[22], v[59], v[37]))
|
|
k.emit(v_mov_b32_e32(v[37], v[22]))
|
|
k.emit(v_cvt_pk(v[37], v[37], v[37]))
|
|
k.emit(buffer_store_short(v[37], v[61], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[38], v[64], v[38]))
|
|
k.emit(v_add_f32_e32(v[22], v[63], v[38]))
|
|
k.emit(v_mov_b32_e32(v[38], v[22]))
|
|
k.emit(v_cvt_pk(v[38], v[38], v[38]))
|
|
k.emit(buffer_store_short(v[38], v[65], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[39], v[68], v[39]))
|
|
k.emit(v_add_f32_e32(v[22], v[67], v[39]))
|
|
k.emit(v_mov_b32_e32(v[39], v[22]))
|
|
k.emit(v_cvt_pk(v[39], v[39], v[39]))
|
|
k.emit(buffer_store_short(v[39], v[69], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[40], v[72], v[40]))
|
|
k.emit(v_add_f32_e32(v[22], v[71], v[40]))
|
|
k.emit(v_mov_b32_e32(v[40], v[22]))
|
|
k.emit(v_cvt_pk(v[40], v[40], v[40]))
|
|
k.emit(buffer_store_short(v[40], v[73], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[41], v[76], v[41]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[41]))
|
|
k.emit(v_mov_b32_e32(v[41], v[22]))
|
|
k.emit(v_cvt_pk(v[41], v[41], v[41]))
|
|
k.emit(buffer_store_short(v[41], v[77], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[42], v[80], v[42]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[42]))
|
|
k.emit(v_mov_b32_e32(v[42], v[22]))
|
|
k.emit(v_cvt_pk(v[42], v[42], v[42]))
|
|
k.emit(buffer_store_short(v[42], v[81], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[43], v[52], v[43]))
|
|
k.emit(v_add_f32_e32(v[22], v[51], v[43]))
|
|
k.emit(v_mov_b32_e32(v[43], v[22]))
|
|
k.emit(v_cvt_pk(v[43], v[43], v[43]))
|
|
k.emit(buffer_store_short(v[43], v[83], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[44], v[56], v[44]))
|
|
k.emit(v_add_f32_e32(v[22], v[55], v[44]))
|
|
k.emit(v_mov_b32_e32(v[44], v[22]))
|
|
k.emit(v_cvt_pk(v[44], v[44], v[44]))
|
|
k.emit(buffer_store_short(v[44], v[85], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[45], v[60], v[45]))
|
|
k.emit(v_add_f32_e32(v[22], v[59], v[45]))
|
|
k.emit(v_mov_b32_e32(v[45], v[22]))
|
|
k.emit(v_cvt_pk(v[45], v[45], v[45]))
|
|
k.emit(buffer_store_short(v[45], v[87], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[46], v[64], v[46]))
|
|
k.emit(v_add_f32_e32(v[22], v[63], v[46]))
|
|
k.emit(v_mov_b32_e32(v[46], v[22]))
|
|
k.emit(v_cvt_pk(v[46], v[46], v[46]))
|
|
k.emit(buffer_store_short(v[46], v[89], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[47], v[68], v[47]))
|
|
k.emit(v_add_f32_e32(v[22], v[67], v[47]))
|
|
k.emit(v_mov_b32_e32(v[47], v[22]))
|
|
k.emit(v_cvt_pk(v[47], v[47], v[47]))
|
|
k.emit(buffer_store_short(v[47], v[91], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[48], v[72], v[48]))
|
|
k.emit(v_add_f32_e32(v[22], v[71], v[48]))
|
|
k.emit(v_mov_b32_e32(v[48], v[22]))
|
|
k.emit(v_cvt_pk(v[48], v[48], v[48]))
|
|
k.emit(buffer_store_short(v[48], v[93], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[49], v[76], v[49]))
|
|
k.emit(v_add_f32_e32(v[22], v[75], v[49]))
|
|
k.emit(v_mov_b32_e32(v[49], v[22]))
|
|
k.emit(v_cvt_pk(v[49], v[49], v[49]))
|
|
k.emit(buffer_store_short(v[49], v[95], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(v_mul_f32_e32(v[50], v[80], v[50]))
|
|
k.emit(v_add_f32_e32(v[22], v[79], v[50]))
|
|
k.emit(v_mov_b32_e32(v[50], v[22]))
|
|
k.emit(v_cvt_pk(v[50], v[50], v[50]))
|
|
k.emit(buffer_store_short(v[50], v[97], s[12:15], 0, 0, 1, 0, 0, 0, 0, 0, 1))
|
|
k.emit(s_nop())
|
|
k.emit(s_branch(), target='GW_End_1')
|
|
k.label('GW_End_1')
|
|
k.emit(s_cmp_ge_u32(s[58], s[59]))
|
|
k.emit(s_cbranch_scc1(), target='KernelEnd')
|
|
k.emit(s_branch(), target='PersistentLoopStart')
|
|
k.label('KernelEnd')
|
|
k.emit(s_endpgm())
|
|
return k
|