mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
fixes
This commit is contained in:
parent
12714337f0
commit
1eca96ea44
6 changed files with 56 additions and 36 deletions
|
|
@ -609,7 +609,6 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op([()], lambda x: x/2)
|
||||
helper_test_op([()], lambda x: 2/x)
|
||||
|
||||
@unittest.skip("seg fault")
|
||||
def test_mod(self):
|
||||
a = [-4, 7, 5, 4, -7, 8, -9]
|
||||
b = [2, -3, 8, -2, 3, 5, -5]
|
||||
|
|
@ -2150,7 +2149,6 @@ class TestOps(unittest.TestCase):
|
|||
lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride),
|
||||
lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5)
|
||||
|
||||
@unittest.skip("seg fault")
|
||||
@slow_test
|
||||
def test_output_padded_conv_transpose2d(self):
|
||||
for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]:
|
||||
|
|
@ -2571,7 +2569,6 @@ class TestOps(unittest.TestCase):
|
|||
self.helper_test_exception([shape], lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
|
||||
lambda x: Tensor.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)), expected=(RuntimeError, ValueError))
|
||||
|
||||
@unittest.skip("seg fault")
|
||||
@slow_test
|
||||
def test_avg_pool2d_padding_not_counted(self):
|
||||
shape = (32,2,111,28)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from dataclasses import dataclass
|
|||
from tinygrad.dtype import dtypes, ImageDType, DType, AddrSpace, Invalid, PtrDType
|
||||
from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, identity_element
|
||||
from tinygrad.uop.symbolic import uop_given_valid, parse_valid, invalid_gate
|
||||
from tinygrad.helpers import getenv, flatten, AMX, prod
|
||||
from tinygrad.helpers import getenv, flatten, AMX, CPU_X86, prod
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
# ***** image load valid simplification *****
|
||||
|
|
@ -152,6 +152,9 @@ def split_load_store(ctx:Renderer|None, ls:UOp, idx:UOp):
|
|||
pass
|
||||
elif isinstance(buf.dtype, ImageDType):
|
||||
lengths = [4]
|
||||
elif ctx is not None and CPU_X86:
|
||||
lengths = [4,2] if buf.dtype.base == dtypes.float32 else []
|
||||
#must_divide = False
|
||||
elif ctx is not None and ctx.supports_float4:
|
||||
# TODO: a better way to get this than ctx
|
||||
lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2])
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from tinygrad.uop.ops import PatternMatcher, UOp, Ops, UPat, multirange_str
|
|||
from tinygrad.helpers import prod, getenv, TUPLE_ORDER
|
||||
|
||||
def linearize(sink:UOp) -> list[UOp]:
|
||||
from tinygrad.renderer.x86 import RSP
|
||||
# this is a toposort with priority
|
||||
lst = list(sink.toposort())
|
||||
consumers: defaultdict[UOp, list[UOp]] = defaultdict(list)
|
||||
|
|
@ -37,7 +38,8 @@ def linearize(sink:UOp) -> list[UOp]:
|
|||
case Ops.RANGE: priority = 5 # placing RANGE is good
|
||||
case Ops.END: priority = -5 # placing END is bad
|
||||
# x86 op version
|
||||
case X86Ops.DEFINE_REG: priority = -20
|
||||
# stack pointer needs to be scheduled at the top of the kernel
|
||||
case X86Ops.DEFINE_REG: priority = -21 if u.arg == RSP else -20
|
||||
case X86Ops.IMM: priority = -10
|
||||
case _: priority = 0 # everything else has priority 0
|
||||
priorities[u] = (run_count, priority, extra)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from typing import Any, Generic, TypeVar, Iterator, Generator
|
|||
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
|
||||
from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
|
||||
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
|
||||
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
|
||||
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, CPU_X86, NV_PTX, CUDA_PTX, NV_NAK
|
||||
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
|
|
@ -347,7 +347,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
|
|||
if device == "METAL": return not CI
|
||||
if device == "CUDA": return not CI and not CUDA_PTX
|
||||
if device == "NV": return not CI and not NV_PTX and not NV_NAK
|
||||
if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
|
||||
if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP and not CPU_X86
|
||||
return device in {"AMD", "PYTHON", "NULL"}
|
||||
if dtype in dtypes.fp8s:
|
||||
if device == "CUDA": return not CI and not CUDA_PTX
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import sys, struct
|
||||
from typing import cast
|
||||
from tinygrad.dtype import dtypes, PtrDType, DType
|
||||
from tinygrad.dtype import dtypes, PtrDType, DType, truncate
|
||||
from tinygrad.uop import Ops, X86Ops, GroupOp, X86GroupOp
|
||||
from tinygrad.uop.ops import UOp, UPat, PatternMatcher
|
||||
from tinygrad.renderer import Renderer
|
||||
|
|
@ -154,6 +154,8 @@ x86_extra_matcher = base_extra_matcher + PatternMatcher([
|
|||
# if gate in scalar int cmove is not a comparison need to add one to set the flag
|
||||
(UPat.var("m", dtypes.bool).where(UPat.var("a", dtypes.ints), UPat.var("b")),
|
||||
lambda m,a,b: m.ne(0).where(a,b) if m.op not in GroupOp.Comparison and a.dtype.count == 1 else None),
|
||||
# TODO: do we want this? Kinda not needed if DEVECTORIZE=0. If yes make it general
|
||||
(UPat(Ops.VECTORIZE, dtypes.float16, name="x"), lambda x: x.replace(dtype=dtypes.float32.vec(x.dtype.count), src=tuple(s.src[0] for s in x.src)).cast(x.dtype) if all(s.op is Ops.CAST for s in x.src) else None),
|
||||
])
|
||||
|
||||
# ***** X86 instruction selection pre matcher *****
|
||||
|
|
@ -203,8 +205,9 @@ WGPR = tuple(r for r in GPR if r != RSP)
|
|||
def imm(dt:DType, v:int|float) -> UOp:
  # wrap the python value v in an X86Ops.IMM uop of dtype dt (the value is carried in arg)
  return UOp(X86Ops.IMM, dt, arg=v)
|
||||
def to_imm(c:UOp) -> UOp|None:
  # Turn a CONST uop into an X86Ops.IMM uop, or return None when it can't be used as an immediate.
  #  - anything that is not an Ops.CONST                 -> None
  #  - int64 / uint64 whose value fits int32 / uint32    -> 32-bit immediate
  #  - int64 / uint64 whose value overflows 32 bits      -> None
  #  - every other int dtype and bool                    -> immediate of the same dtype
  #  - remaining dtypes (e.g. floats)                    -> None
  # NOTE(review): the 32-bit cap on 64-bit constants presumably reflects the imm32 limit of most x86 encodings - confirm
  if c.op is not Ops.CONST: return None
  if c.dtype is dtypes.int64: return None if c.overflows(dtypes.int32) else imm(dtypes.int32, c.arg)
  if c.dtype is dtypes.uint64: return None if c.overflows(dtypes.uint32) else imm(dtypes.uint32, c.arg)
  if c.dtype in dtypes.ints+(dtypes.bool,): return imm(c.dtype, c.arg)
  return None
|
||||
def disp(c:UOp) -> UOp:
  # build the immediate for c.arg: int8 when c does not overflow int8, int32 otherwise
  width = dtypes.int32 if c.overflows(dtypes.int8) else dtypes.int8
  return imm(width, c.arg)
|
||||
def cmp(x:UOp):
  # use CMPi when the second source converts to an immediate, otherwise fall back to CMP on the original sources
  rhs_imm = to_imm(x.src[1])
  if rhs_imm is None: return UOp(X86Ops.CMP, src=x.src)
  return UOp(X86Ops.CMPi, src=(x.src[0], rhs_imm))
|
||||
|
|
@ -239,10 +242,19 @@ def vpins(x:UOp) -> UOp:
|
|||
for i,s in enumerate(x.src[1:], 1): shuf = UOp(op, x.dtype, (shuf, s, imm(dtypes.uint8, i)))
|
||||
return shuf
|
||||
|
||||
def div(ctx:IselContext, x:UOp):
  # Lower an unsigned integer division x = src[0] // src[1] to X86Ops.DIV.
  # the dividend is copied into a vreg constrained to rax
  dividend = UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX))
  # rdx is loaded with 0 (the immediate is at most 32 bits wide)
  rdx_zero = UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), ctx.vreg(RDX))
  # the divisor is copied into any writable GPR other than rax/rdx so it can't alias the dividend registers
  divisor_regs = tuple(r for r in WGPR if r not in (RAX, RDX))
  divisor = UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(divisor_regs))
  # DIV is defined in rax and takes (divisor, zeroed rdx, dividend) as sources
  quotient = UOp(X86Ops.DIV, x.dtype, (divisor, rdx_zero, dividend), ctx.vreg(RAX))
  # the result is handed to users through a MOV that carries no register constraint of its own
  return UOp(X86Ops.MOV, x.dtype, (quotient,))
|
||||
|
||||
def idiv(ctx:IselContext, x:UOp):
  # Lower a signed integer division x = src[0] // src[1] to X86Ops.IDIV.
  # pick the sign-extension op that matches the operand size in bytes
  cdq_op = {1: X86Ops.CBW, 2: X86Ops.CWD, 4: X86Ops.CDQ, 8: X86Ops.CQO}[x.dtype.itemsize]
  # the dividend is copied into rax, the sign extension of it is defined in rdx
  dividend = UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX))
  cdq = UOp(cdq_op, x.dtype, (dividend,), ctx.vreg(RDX))
  # the divisor must stay out of BOTH rax and rdx: rdx holds the sign extension, so excluding only rax
  # would let the register allocator place the divisor in a register that cdq overwrites
  divisor = UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r not in (RAX, RDX))))
  quotient = UOp(X86Ops.IDIV, x.dtype, (divisor, cdq), ctx.vreg(RAX))
  # this move "cleanses" the register constraint (rax) of idiv, this is because the constraint only applies on
  # definition and not on the uses of idiv
  return UOp(X86Ops.MOV, x.dtype, (quotient,))
|
||||
|
||||
def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:
|
||||
# fuse INDEX into the address if only used once, if there was a displacement it was already moved into the load/store to expose the base index
|
||||
|
|
@ -252,16 +264,12 @@ def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:
|
|||
|
||||
def fuse_load(ctx:IselContext, x:UOp, i:int) -> UOp|None:
  # Replace source i of x with the result of fuse_index on it, or return None when fusing is not allowed.
  # fusing is only done when that source has exactly one user and appears exactly once among x's sources
  candidate = x.src[i]
  if len(ctx.uses[candidate]) != 1 or x.src.count(candidate) != 1: return None
  return x.replace(src=x.src[:i] + fuse_index(ctx, candidate) + x.src[i+1:])
|
||||
|
||||
# TODO: args on the stack
|
||||
def x86_abi(ctx:IselContext, x:UOp):
  # Rewrite x into a DEFINE_REG pinned to the argument register selected by x.arg.
  # win32 uses (RCX, RDX, GPR[8], GPR[9]); every other platform uses (RDI, RSI, RDX, RCX, GPR[8], GPR[9]).
  # NOTE: arguments passed on the stack are not handled here, an x.arg past the end of the table raises IndexError
  arg_regs = (RCX, RDX, GPR[8], GPR[9]) if sys.platform == "win32" else (RDI, RSI, RDX, RCX, GPR[8], GPR[9])
  return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg((arg_regs[x.arg],)))
|
||||
def abi(ctx:IselContext, x:UOp):
  # Apply the function-argument constraints to x, where x.arg is the argument index.
  # Arguments covered by the register table become a DEFINE_REG pinned to that register; the rest are read
  # relative to RSP through a FRAME_INDEX displacement.
  is_win = sys.platform == "win32"
  arg_regs = (RCX, RDX, GPR[8], GPR[9]) if is_win else (RDI, RSI, RDX, RCX, GPR[8], GPR[9])
  if x.arg < len(arg_regs): return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg((arg_regs[x.arg],)))
  # 8 bytes per stack slot counted from the first non-register argument, win32 adds a further 32 bytes
  # NOTE(review): the extra 32 presumably skips the Win64 shadow space - confirm
  frame_disp = (x.arg - len(arg_regs) + 1) * 8 + (32 if is_win else 0)
  return UOp(X86Ops.MOV, x.dtype, (def_reg(dtypes.uint64, RSP), UOp(Ops.NOOP), UOp(X86Ops.FRAME_INDEX, dtypes.int32, arg=frame_disp)))
|
||||
|
||||
dts = dtypes.ints + dtypes.masks + (dtypes.bool, dtypes.float16, dtypes.float32, dtypes.float64)
|
||||
dt_16bit = tuple(dt.vec(l) for dt in dts for l in [2,1] if dt.vec(l).itemsize == 2 and dt.vec(l) not in dtypes.int16s)
|
||||
|
|
@ -272,14 +280,14 @@ dt_128bit = tuple(dt.vec(l) for dt in dts for l in [16,8,4,2,1] if dt.vec(l).ite
|
|||
isel_matcher = PatternMatcher([
|
||||
# **** Op rewrites ****
|
||||
# TODO: add callee saved registers on windows to RET
|
||||
# RET, add frame pointer to it. This makes it so the prologue and epilogue are automatically setup by the register allocator
|
||||
(UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
|
||||
# RET, add the stack pointer to it. Also add the frame pointer; this makes it so the prologue and epilogue are automatically set up by the register allocator
|
||||
(UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP),) + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
|
||||
# TODO: RANGE and END is tricky. Both linearizer and regalloc need them so they stay as Ops. This gets into a broader issue with tinygrad
|
||||
# not being able to represent control flow properly. For now they are rewritten after regalloc
|
||||
# HACK: annoying hack so const doesn't get rewritten because linearizer needs it
|
||||
(UPat(Ops.RANGE, name="x"), lambda ctx,x: x.replace(src=(x.src[0].replace(tag=1),) + x.src[1:], arg=ctx.vreg(WGPR)) if x.src[0].tag is None else None),
|
||||
# function abi constraints
|
||||
(UPat(Ops.DEFINE_GLOBAL, name="x"), x86_abi),
|
||||
(UPat(Ops.DEFINE_GLOBAL, name="x"), abi),
|
||||
# these are treated the same for now
|
||||
(UPat((Ops.DEFINE_REG, Ops.DEFINE_LOCAL), name="x"),
|
||||
lambda ctx,x: x.replace(op=X86Ops.LEA, src=(UOp(X86Ops.DEFINE_REG, x.dtype, arg=RSP), UOp(Ops.NOOP), imm(dtypes.int32, ctx.inc_stack(x.dtype.nbytes()))), arg=None)), # noqa: E501
|
||||
|
|
@ -378,7 +386,8 @@ isel_matcher = PatternMatcher([
|
|||
(UPat(Ops.SUB, dtypes.int64s, name="x"), lambda x: x.replace(op=X86Ops.VPSUBQ) if x.dtype.count > 1 else None),
|
||||
(UPat(Ops.MUL, dtypes.int16s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLW) if x.dtype.count > 1 else None),
|
||||
(UPat(Ops.MUL, dtypes.int32s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLD) if x.dtype.count > 1 else None),
|
||||
# scalar int binary TODO: uint idiv
|
||||
# scalar int binary
|
||||
((UPat(dtype=dtypes.uints) // UPat()).named("x"), div),
|
||||
((UPat(dtype=dtypes.sints) // UPat()).named("x"), idiv),
|
||||
((UPat.var("a", dtypes.ints) << UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHLi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHL)), # noqa: E501
|
||||
((UPat.var("a", dtypes.uints) >> UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHRi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHR)), # noqa: E501
|
||||
|
|
@ -465,9 +474,11 @@ isel_matcher = PatternMatcher([
|
|||
# final rewrite to match the isa spec
|
||||
post_regalloc_matcher = PatternMatcher([
|
||||
# alloc stack space
|
||||
(UPat(X86Ops.DEFINE_REG, arg=RDI, name="x"), lambda ctx,x: (x, [UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
|
||||
(UPat(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP, name="x"), lambda ctx,x: (x, [x, UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP)]) if ctx.stack_size > 0 else None),
|
||||
# dealloc stack space
|
||||
(UPat(X86Ops.RET, name="x"), lambda ctx,x: (x, [UOp(X86Ops.ADDi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
|
||||
# rewrite FRAME_INDEX to IMM now that the stack size is known
|
||||
(UPat(X86Ops.FRAME_INDEX, name="x"), lambda ctx,x: (nx:=x.replace(op=X86Ops.IMM, arg=ctx.stack_size + x.arg), [nx])),
|
||||
# this is the CONST in RANGE
|
||||
(UPat(Ops.CONST, name="x"), lambda x: (nx:=imm(x.dtype, x.arg), [nx])),
|
||||
# rewrite RANGE to MOV reg, 0. Terrible HACK to pass the CONST to the END
|
||||
|
|
@ -475,6 +486,9 @@ post_regalloc_matcher = PatternMatcher([
|
|||
# rewrite END to ADD 1 -> CMPLT -> JUMP
|
||||
(UPat(Ops.END, name="x"), lambda x: (jl:=x.replace(op=X86Ops.JL, src=(x.src[1], cmp:=UOp(X86Ops.CMPi,
|
||||
src=(add:=UOp(X86Ops.ADDi, x.src[1].dtype, (imm(x.src[1].dtype, 1),), x.src[1].arg), imm(x.src[1].dtype, x.src[1].tag))))), [add, cmp, jl])),
|
||||
# TODO: need a generic way to model clobbers, idiv and flags should be handled the same way, maybe add clobber field to Register?
|
||||
# fixup div, zero rdx again because scheduling constraint isn't being respected
|
||||
(UPat(X86Ops.DIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:1]), [UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), RDX), nx])),
|
||||
# remove cdq from idiv
|
||||
(UPat(X86Ops.IDIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:-1]), [nx])),
|
||||
# rewrite two address instructions to two address form, if reused src wasn't coalesced insert a move
|
||||
|
|
@ -484,6 +498,11 @@ post_regalloc_matcher = PatternMatcher([
|
|||
|
||||
# ***** X86 instruction encoding *****
|
||||
|
||||
def to_bytes(dt:DType, v:int|float):
  # Serialize v as the little-endian bytes of dtype dt; v is first passed through truncate[dt].
  value = truncate[dt](v)
  if dt not in dtypes.floats:
    # integer path: dt.itemsize bytes, two's complement only for the signed int dtypes
    return value.to_bytes(dt.itemsize, 'little', signed=dt in dtypes.sints)
  # float path: only float16/float32/float64 have a struct format here, any other float dtype raises KeyError
  float_formats = {dtypes.float16: "<e", dtypes.float32: "<f", dtypes.float64: "<d"}
  return struct.pack(float_formats[dt], value)
|
||||
|
||||
def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
|
||||
# get the encoding structure of the uop
|
||||
reg_uop, vvvv_uop, rm_uop, idx_uop, disp_uop, imm_uop = None, None, None, None, None, None
|
||||
|
|
@ -573,9 +592,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
|
|||
# IMM byte
|
||||
if imm_uop is not None:
|
||||
if isinstance(imm_uop.arg, Register): inst += bytes([(imm_uop.arg.index & 0b1111) << 4 | 0b0000])
|
||||
else:
|
||||
_imm = int.from_bytes(struct.pack({2: "<e", 4: "<f", 8: "<d"}[imm_uop.dtype.itemsize], imm_uop.arg), "little") if isinstance(imm_uop.arg, float) else imm_uop.arg
|
||||
inst += _imm.to_bytes(imm_uop.dtype.itemsize, 'little', signed=imm_uop.dtype in dtypes.sints)
|
||||
else: inst += to_bytes(imm_uop.dtype, imm_uop.arg)
|
||||
return inst
|
||||
|
||||
# https://www.felixcloutier.com/x86/
|
||||
|
|
@ -584,7 +601,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
|
|||
# map select: 0F == 1, 0F38 == 2, 0F3A == 3
|
||||
encodings = PatternMatcher([
|
||||
# moves
|
||||
(UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + cast(int, x.src[0].arg).to_bytes(8, 'little', signed=x.src[0].dtype in dtypes.sints)),
|
||||
(UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + to_bytes(x.src[0].dtype, x.src[0].arg)),
|
||||
(UPat(X86Ops.MOV, name="x"), lambda x: encode(x, 0x8B)), (UPat(X86Ops.MOVi, name="x"), lambda x: encode(x, 0xC7, reg=0)),
|
||||
(UPat(X86Ops.MOVm, name="x"), lambda x: encode(x, 0x89)), (UPat(X86Ops.LEA, name="x"), lambda x: encode(x, 0x8D)),
|
||||
(UPat(X86Ops.VMOVSS, name="x"), lambda x: encode(x, 0x10, pp=2, sel=1)), (UPat(X86Ops.VMOVSSm, name="x"), lambda x: encode(x, 0x11, pp=2, sel=1)),
|
||||
|
|
@ -613,7 +630,7 @@ encodings = PatternMatcher([
|
|||
# int division
|
||||
(UPat(X86Ops.CBW), lambda: bytes([0x66, 0x98])), (UPat(X86Ops.CWD), lambda: bytes([0x66, 0x99])),
|
||||
(UPat(X86Ops.CDQ), lambda: bytes([0x99])), (UPat(X86Ops.CQO), lambda: bytes([0x48, 0x99])),
|
||||
(UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.IDIV, dtypes.uints, name="x"), lambda x: encode(x, 0xF7, reg=6)),
|
||||
(UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.DIV, name="x"), lambda x: encode(x, 0xF7, reg=6)),
|
||||
# scalar int binary
|
||||
(UPat(X86Ops.SHLi, name="x"), lambda x: encode(x, 0xC1, reg=4)),
|
||||
(UPat(X86Ops.SHRi, name="x"), lambda x: encode(x, 0xC1, reg=5)), (UPat(X86Ops.SARi, name="x"), lambda x: encode(x, 0xC1, reg=7)),
|
||||
|
|
|
|||
|
|
@ -134,8 +134,8 @@ class GroupOp:
|
|||
|
||||
# NOTE: X86Ops with i suffix are variants that take an immediate, m suffix are variants that can write to memory instead of read from
|
||||
class X86Ops(FastEnum):
|
||||
# register, not an instruction
|
||||
DEFINE_REG = auto()
|
||||
# register, not an instruction. FRAME_INDEX is used when the function arg is on the stack and is rewritten to IMM when stack size is known
|
||||
DEFINE_REG = auto(); FRAME_INDEX = auto() # noqa: E702
|
||||
# const
|
||||
IMM = auto()
|
||||
# index
|
||||
|
|
@ -173,7 +173,7 @@ class X86Ops(FastEnum):
|
|||
VPBROADCASTB = auto(); VPBROADCASTW = auto(); VPBROADCASTD = auto(); VPBROADCASTQ = auto() # noqa: E702
|
||||
VBROADCASTSS = auto() # TODO: VBROADCASTSD is ymm only, add once they are supported
|
||||
# int division
|
||||
IDIV = auto()
|
||||
IDIV = auto(); DIV = auto() # noqa: E702
|
||||
CBW = auto(); CWD = auto(); CDQ = auto(); CQO = auto() # noqa: E702
|
||||
# int binary
|
||||
ADD = auto(); ADDi = auto(); SUB = auto(); SUBi = auto(); IMUL = auto(); IMULi = auto() # noqa: E702
|
||||
|
|
@ -216,7 +216,7 @@ class X86GroupOp:
|
|||
X86Ops.VPMOVSXBW, X86Ops.VPMOVSXBD, X86Ops.VPMOVSXBQ, X86Ops.VPMOVSXWD, X86Ops.VPMOVSXWQ, X86Ops.VPMOVSXDQ,
|
||||
X86Ops.VCVTDQ2PS, X86Ops.VCVTDQ2PD, X86Ops.VCVTTPS2DQ, X86Ops.VCVTTPD2DQ, X86Ops.VCVTTSS2SI, X86Ops.VCVTTSD2SI,
|
||||
X86Ops.VCVTPH2PS, X86Ops.VCVTPS2PD, X86Ops.VCVTPD2PS, X86Ops.CMOVNE, X86Ops.CMOVE, X86Ops.CMOVL, X86Ops.CMOVB,
|
||||
X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.LEA,
|
||||
X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV, X86Ops.LEA,
|
||||
X86Ops.VPBROADCASTB, X86Ops.VPBROADCASTW, X86Ops.VPBROADCASTD, X86Ops.VPBROADCASTQ, X86Ops.VBROADCASTSS}
|
||||
|
||||
# X86Ops whose second src can read from memory NOTE: some of these are TwoAddress1st so the second src is actually the first
|
||||
|
|
@ -243,7 +243,8 @@ class X86GroupOp:
|
|||
X86Ops.JE, X86Ops.JNE}
|
||||
|
||||
# X86Ops that write flags or can modify flags to undefined values
|
||||
WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
|
||||
X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.OR, X86Ops.ORi}
|
||||
WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV,
|
||||
X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
|
||||
X86Ops.OR, X86Ops.ORi}
|
||||
|
||||
All = set(X86Ops)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue