fixes

2026-06-24 02:14:17 +00:00 · 2026-01-01 02:26:48 +00:00 · 2026-01-01 02:26:48 +00:00 · 1eca96ea44
commit 1eca96ea44
parent 12714337f0
6 changed files with 56 additions and 36 deletions
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -609,7 +609,6 @@ class TestOps(unittest.TestCase):
    helper_test_op([()], lambda x: x/2)
    helper_test_op([()], lambda x: 2/x)

-  @unittest.skip("seg fault")
  def test_mod(self):
    a = [-4, 7, 5, 4, -7, 8, -9]
    b = [2, -3, 8, -2, 3, 5, -5]
@@ -2150,7 +2149,6 @@ class TestOps(unittest.TestCase):
        lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride),
        lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5)

-  @unittest.skip("seg fault")
  @slow_test
  def test_output_padded_conv_transpose2d(self):
    for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]:
@@ -2571,7 +2569,6 @@ class TestOps(unittest.TestCase):
    self.helper_test_exception([shape], lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
                               lambda x: Tensor.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)), expected=(RuntimeError, ValueError))

-  @unittest.skip("seg fault")
  @slow_test
  def test_avg_pool2d_padding_not_counted(self):
    shape = (32,2,111,28)
--- a/tinygrad/codegen/late/devectorizer.py
+++ b/tinygrad/codegen/late/devectorizer.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass
 from tinygrad.dtype import dtypes, ImageDType, DType, AddrSpace, Invalid, PtrDType
 from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, identity_element
 from tinygrad.uop.symbolic import uop_given_valid, parse_valid, invalid_gate
-from tinygrad.helpers import getenv, flatten, AMX, prod
+from tinygrad.helpers import getenv, flatten, AMX, CPU_X86, prod
 from tinygrad.renderer import Renderer

 # ***** image load valid simplification *****
@@ -152,6 +152,9 @@ def split_load_store(ctx:Renderer|None, ls:UOp, idx:UOp):
    pass
  elif isinstance(buf.dtype, ImageDType):
    lengths = [4]
+  elif ctx is not None and CPU_X86:
+    lengths = [4,2] if buf.dtype.base == dtypes.float32 else []
+    #must_divide = False
  elif ctx is not None and ctx.supports_float4:
    # TODO: a better way to get this than ctx
    lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2])
--- a/tinygrad/codegen/late/linearizer.py
+++ b/tinygrad/codegen/late/linearizer.py
@@ -6,6 +6,7 @@ from tinygrad.uop.ops import PatternMatcher, UOp, Ops, UPat, multirange_str
 from tinygrad.helpers import prod, getenv, TUPLE_ORDER

 def linearize(sink:UOp) -> list[UOp]:
+  from tinygrad.renderer.x86 import RSP
  # this is a toposort with priority
  lst = list(sink.toposort())
  consumers: defaultdict[UOp, list[UOp]] = defaultdict(list)
@@ -37,7 +38,8 @@ def linearize(sink:UOp) -> list[UOp]:
      case Ops.RANGE: priority = 5    # placing RANGE is good
      case Ops.END: priority = -5     # placing END is bad
      # x86 op version
-      case X86Ops.DEFINE_REG: priority = -20
+      # stack pointer needs to be scheduled at the top of the kernel
+      case X86Ops.DEFINE_REG: priority = -21 if u.arg == RSP else -20
      case X86Ops.IMM: priority = -10
      case _: priority = 0            # everything else has priority 0
    priorities[u] = (run_count, priority, extra)
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -5,7 +5,7 @@ from typing import Any, Generic, TypeVar, Iterator, Generator
 import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
 from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
 from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
-from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
+from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, CPU_X86, NV_PTX, CUDA_PTX, NV_NAK
 from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
 from tinygrad.renderer import Renderer

@@ -347,7 +347,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
    if device == "METAL": return not CI
    if device == "CUDA": return not CI and not CUDA_PTX
    if device == "NV": return not CI and not NV_PTX and not NV_NAK
-    if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
+    if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP and not CPU_X86
    return device in {"AMD", "PYTHON", "NULL"}
  if dtype in dtypes.fp8s:
    if device == "CUDA": return not CI and not CUDA_PTX
--- a/tinygrad/renderer/x86.py
+++ b/tinygrad/renderer/x86.py
@@ -1,6 +1,6 @@
 import sys, struct
 from typing import cast
-from tinygrad.dtype import dtypes, PtrDType, DType
+from tinygrad.dtype import dtypes, PtrDType, DType, truncate
 from tinygrad.uop import Ops, X86Ops, GroupOp, X86GroupOp
 from tinygrad.uop.ops import UOp, UPat, PatternMatcher
 from tinygrad.renderer import Renderer
@@ -154,6 +154,8 @@ x86_extra_matcher = base_extra_matcher + PatternMatcher([
  # if gate in scalar int cmove is not a comparison need to add one to set the flag
  (UPat.var("m", dtypes.bool).where(UPat.var("a", dtypes.ints), UPat.var("b")),
   lambda m,a,b: m.ne(0).where(a,b) if m.op not in GroupOp.Comparison and a.dtype.count == 1 else None),
+  # TODO: do we want this? Kinda not needed if DEVECTORIZE=0. If yes make it general
+  (UPat(Ops.VECTORIZE, dtypes.float16, name="x"), lambda x: x.replace(dtype=dtypes.float32.vec(x.dtype.count), src=tuple(s.src[0] for s in x.src)).cast(x.dtype) if all(s.op is Ops.CAST for s in x.src) else None),
 ])

 # ***** X86 instruction selection pre matcher *****
@@ -203,8 +205,9 @@ WGPR = tuple(r for r in GPR if r != RSP)
 def imm(dt:DType, v:int|float) -> UOp: return UOp(X86Ops.IMM, dt, arg=v)
 def to_imm(c:UOp) -> UOp|None:
  if c.op is not Ops.CONST: return None
-  if c.dtype in dtypes.uints+(dtypes.bool,) and not c.overflows(dtypes.uint32): return imm(min(dtypes.uint32, c.dtype), c.arg)
-  if c.dtype in dtypes.sints and not c.overflows(dtypes.int32): return imm(min(dtypes.int32, c.dtype), c.arg)
+  if c.dtype is dtypes.int64: return imm(dtypes.int32, c.arg) if not c.overflows(dtypes.int32) else None
+  if c.dtype is dtypes.uint64: return imm(dtypes.uint32, c.arg) if not c.overflows(dtypes.uint32) else None
+  if c.dtype in dtypes.ints+(dtypes.bool,): return imm(c.dtype, c.arg)
  return None
 def disp(c:UOp) -> UOp: return imm(dtypes.int32 if c.overflows(dtypes.int8) else dtypes.int8, c.arg)
 def cmp(x:UOp): return UOp(X86Ops.CMP, src=x.src) if (i:=to_imm(x.src[1])) is None else UOp(X86Ops.CMPi, src=(x.src[0], i))
@@ -239,10 +242,19 @@ def vpins(x:UOp) -> UOp:
  for i,s in enumerate(x.src[1:], 1): shuf = UOp(op, x.dtype, (shuf, s, imm(dtypes.uint8, i)))
  return shuf

+def div(ctx:IselContext, x:UOp):
+  # zero extend or move src[0] to x
+  move = UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX))
+  zero = UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), ctx.vreg(RDX))
+  div = UOp(X86Ops.DIV, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r not in (RAX, RDX)))), zero, move), ctx.vreg(RAX))
+  return UOp(X86Ops.MOV, x.dtype, (div,))
+
 def idiv(ctx:IselContext, x:UOp):
  cdq_op = {1: X86Ops.CBW, 2: X86Ops.CWD, 4: X86Ops.CDQ, 8: X86Ops.CQO}[x.dtype.itemsize]
  cdq = UOp(cdq_op, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX)),), ctx.vreg(RDX))
-  return UOp(X86Ops.IDIV, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r != RAX))), cdq), ctx.vreg(RAX))
+  idiv = UOp(X86Ops.IDIV, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r not in (RAX, RDX)))), cdq), ctx.vreg(RAX))
+  # this move "cleanses" the register constraint (rax) of idiv, this is because the constraint only applies on definition and not on the uses of idiv
+  return UOp(X86Ops.MOV, x.dtype, (idiv,))

 def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:
  # fuse INDEX into the address if only used once, if there was a displacement it was already moved into the load/store to expose the base index
@@ -252,16 +264,12 @@ def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:

 def fuse_load(ctx:IselContext, x:UOp, i:int) -> UOp|None:
  # if the load is used multiple times we don't fuse
-  return x.replace(src=x.src[:i] + fuse_index(ctx, x.src[i]) + x.src[i+1:]) if len(ctx.uses[x.src[i]]) == 1 and x.src.count(x.src[i]) == 1 else None
+  return x.replace(src=x.src[:i] + fuse_index(ctx, x.src[i]) + x.src[i+1:]) if len(ctx.uses[x.src[i]]) == x.src.count(x.src[i]) == 1 else None

-# TODO: args on the stack
-def x86_abi(ctx:IselContext, x:UOp):
-  # if arg is on the stack we move rsp to rbp, but this needs to be done before rsp is deincremented somehow
-  #def _stack_arg: return None
-  #if sys.platform == "win32": return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg(((RCX, RDX, GPR[8], GPR[9])[x.arg],))) if x.arg < 4 else None
-  #return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg(((RDI, RSI, RDX, RCX, GPR[8], GPR[9])[x.arg],))) if x.arg < 6 else x.replace(op=X86Ops.MOV, src=(def_reg(dtypes.uint64, RBP), UOp(Ops.NOOP), imm(dtypes.int8, (x.arg-5)*8)), arg=None)
-  reg = (RCX, RDX, GPR[8], GPR[9])[x.arg] if sys.platform == "win32" else (RDI, RSI, RDX, RCX, GPR[8], GPR[9])[x.arg]
-  return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg((reg,)))
+def abi(ctx:IselContext, x:UOp):
+  def _stack_arg(disp:int): return UOp(X86Ops.MOV, x.dtype, (def_reg(dtypes.uint64, RSP), UOp(Ops.NOOP), UOp(X86Ops.FRAME_INDEX, dtypes.int32, arg=disp)))
+  if sys.platform == "win32": return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg(((RCX, RDX, GPR[8], GPR[9])[x.arg],))) if x.arg < 4 else _stack_arg((x.arg-3)*8+32)
+  return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg(((RDI, RSI, RDX, RCX, GPR[8], GPR[9])[x.arg],))) if x.arg < 6 else _stack_arg((x.arg-5)*8)

 dts = dtypes.ints + dtypes.masks + (dtypes.bool, dtypes.float16, dtypes.float32, dtypes.float64)
 dt_16bit = tuple(dt.vec(l) for dt in dts for l in [2,1] if dt.vec(l).itemsize == 2 and dt.vec(l) not in dtypes.int16s)
@@ -272,14 +280,14 @@ dt_128bit = tuple(dt.vec(l) for dt in dts for l in [16,8,4,2,1] if dt.vec(l).ite
 isel_matcher = PatternMatcher([
  # **** Op rewrites ****
  # TODO: add callee saved registers on windows to RET
-  # RET, add frame pointer to it. This makes it so the prologue and epilogue are automatically setup by the register allocator
-  (UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
+  # RET, add stack pointer to it. Also add add frame pointer, this makes it so the prologue and epilogue are automatically setup by the register allocator
+  (UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP),) + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
  # TODO: RANGE and END is tricky. Both linearizer and regalloc need them so they stay as Ops. This gets into a broader issue with tinygrad
  # not being able to represent control flow properly. For now they are rewritten after regalloc
  # HACK: annoying hack so const doesn't get rewritten because linearizer needs it
  (UPat(Ops.RANGE, name="x"), lambda ctx,x: x.replace(src=(x.src[0].replace(tag=1),) + x.src[1:], arg=ctx.vreg(WGPR)) if x.src[0].tag is None else None),
  # function abi constraints
-  (UPat(Ops.DEFINE_GLOBAL, name="x"), x86_abi),
+  (UPat(Ops.DEFINE_GLOBAL, name="x"), abi),
  # these are treated the same for now
  (UPat((Ops.DEFINE_REG, Ops.DEFINE_LOCAL), name="x"),
   lambda ctx,x: x.replace(op=X86Ops.LEA, src=(UOp(X86Ops.DEFINE_REG, x.dtype, arg=RSP), UOp(Ops.NOOP), imm(dtypes.int32, ctx.inc_stack(x.dtype.nbytes()))), arg=None)), # noqa: E501
@@ -378,7 +386,8 @@ isel_matcher = PatternMatcher([
  (UPat(Ops.SUB, dtypes.int64s, name="x"), lambda x: x.replace(op=X86Ops.VPSUBQ) if x.dtype.count > 1 else None),
  (UPat(Ops.MUL, dtypes.int16s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLW) if x.dtype.count > 1 else None),
  (UPat(Ops.MUL, dtypes.int32s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLD) if x.dtype.count > 1 else None),
-  # scalar int binary TODO: uint idiv
+  # scalar int binary
+  ((UPat(dtype=dtypes.uints) // UPat()).named("x"), div),
  ((UPat(dtype=dtypes.sints) // UPat()).named("x"), idiv),
  ((UPat.var("a", dtypes.ints) << UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHLi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHL)), # noqa: E501
  ((UPat.var("a", dtypes.uints) >> UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHRi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHR)), # noqa: E501
@@ -465,9 +474,11 @@ isel_matcher = PatternMatcher([
 # final rewrite to match the isa spec
 post_regalloc_matcher = PatternMatcher([
  # alloc stack space
-  (UPat(X86Ops.DEFINE_REG, arg=RDI, name="x"), lambda ctx,x: (x, [UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
+  (UPat(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP, name="x"), lambda ctx,x: (x, [x, UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP)]) if ctx.stack_size > 0 else None),
  # dealloc stack space
  (UPat(X86Ops.RET, name="x"), lambda ctx,x: (x, [UOp(X86Ops.ADDi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
+  # rewrite FRAME_INDEX to IMM now that the stack size is known
+  (UPat(X86Ops.FRAME_INDEX, name="x"), lambda ctx,x: (nx:=x.replace(op=X86Ops.IMM, arg=ctx.stack_size + x.arg), [nx])),
  # this is the CONST in RANGE
  (UPat(Ops.CONST, name="x"), lambda x: (nx:=imm(x.dtype, x.arg), [nx])),
  # rewrite RANGE to MOV reg, 0. Terrible HACK to pass the CONST to the END
@@ -475,6 +486,9 @@ post_regalloc_matcher = PatternMatcher([
  # rewrite END to ADD 1 -> CMPLT -> JUMP
  (UPat(Ops.END, name="x"), lambda x: (jl:=x.replace(op=X86Ops.JL, src=(x.src[1], cmp:=UOp(X86Ops.CMPi,
    src=(add:=UOp(X86Ops.ADDi, x.src[1].dtype, (imm(x.src[1].dtype, 1),), x.src[1].arg), imm(x.src[1].dtype, x.src[1].tag))))), [add, cmp, jl])),
+  # TODO: need a generic way to model clobbers, idiv and flags should be handled the same way, maybe add clobber field to Register?
+  # fixup div, zero rdx again because scheduling constraint isn't being respected
+  (UPat(X86Ops.DIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:1]), [UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), RDX), nx])),
  # remove cdq from idiv
  (UPat(X86Ops.IDIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:-1]), [nx])),
  # rewrite two address instructions to two address form, if reused src wasn't coalesced insert a move
@@ -484,6 +498,11 @@ post_regalloc_matcher = PatternMatcher([

 # ***** X86 instruction encoding *****

+def to_bytes(dt:DType, v:int|float):
+  v = truncate[dt](v)
+  if dt in dtypes.floats: return struct.pack({dtypes.float16: "<e", dtypes.float32: "<f", dtypes.float64: "<d"}[dt], v)
+  return v.to_bytes(dt.itemsize, 'little', signed=dt in dtypes.sints)
+
 def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
  # get the encoding structure of the uop
  reg_uop, vvvv_uop, rm_uop, idx_uop, disp_uop, imm_uop = None, None, None, None, None, None
@@ -573,9 +592,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
  # IMM byte
  if imm_uop is not None:
    if isinstance(imm_uop.arg, Register): inst += bytes([(imm_uop.arg.index & 0b1111) << 4 | 0b0000])
-    else:
-      _imm = int.from_bytes(struct.pack({2: "<e", 4: "<f", 8: "<d"}[imm_uop.dtype.itemsize], imm_uop.arg), "little") if isinstance(imm_uop.arg, float) else imm_uop.arg
-      inst += _imm.to_bytes(imm_uop.dtype.itemsize, 'little', signed=imm_uop.dtype in dtypes.sints)
+    else: inst += to_bytes(imm_uop.dtype, imm_uop.arg)
  return inst

 # https://www.felixcloutier.com/x86/
@@ -584,7 +601,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
 # map select: 0F == 1, 0F38 == 2, 0F3A == 3
 encodings = PatternMatcher([
  # moves
-  (UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + cast(int, x.src[0].arg).to_bytes(8, 'little', signed=x.src[0].dtype in dtypes.sints)),
+  (UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + to_bytes(x.src[0].dtype, x.src[0].arg)),
  (UPat(X86Ops.MOV, name="x"), lambda x: encode(x, 0x8B)), (UPat(X86Ops.MOVi, name="x"), lambda x: encode(x, 0xC7, reg=0)),
  (UPat(X86Ops.MOVm, name="x"), lambda x: encode(x, 0x89)), (UPat(X86Ops.LEA, name="x"), lambda x: encode(x, 0x8D)),
  (UPat(X86Ops.VMOVSS, name="x"), lambda x: encode(x, 0x10, pp=2, sel=1)), (UPat(X86Ops.VMOVSSm, name="x"), lambda x: encode(x, 0x11, pp=2, sel=1)),
@@ -613,7 +630,7 @@ encodings = PatternMatcher([
  # int division
  (UPat(X86Ops.CBW), lambda: bytes([0x66, 0x98])), (UPat(X86Ops.CWD), lambda: bytes([0x66, 0x99])),
  (UPat(X86Ops.CDQ), lambda: bytes([0x99])), (UPat(X86Ops.CQO), lambda: bytes([0x48, 0x99])),
-  (UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.IDIV, dtypes.uints, name="x"), lambda x: encode(x, 0xF7, reg=6)),
+  (UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.DIV, name="x"), lambda x: encode(x, 0xF7, reg=6)),
  # scalar int binary
  (UPat(X86Ops.SHLi, name="x"), lambda x: encode(x, 0xC1, reg=4)),
  (UPat(X86Ops.SHRi, name="x"), lambda x: encode(x, 0xC1, reg=5)), (UPat(X86Ops.SARi, name="x"), lambda x: encode(x, 0xC1, reg=7)),
--- a/tinygrad/uop/__init__.py
+++ b/tinygrad/uop/__init__.py
@@ -134,8 +134,8 @@ class GroupOp:

 # NOTE: X86Ops with i suffix are variants that take an immediate, m suffix are variants that can write to memory instead of read from
 class X86Ops(FastEnum):
-  # register, not an instruction
-  DEFINE_REG = auto()
+  # register, not an instruction. FRAME_INDEX is used when the function arg is on the stack and is rewritten to IMM when stack size is known
+  DEFINE_REG = auto(); FRAME_INDEX = auto() # noqa: E702
  # const
  IMM = auto()
  # index
@@ -173,7 +173,7 @@ class X86Ops(FastEnum):
  VPBROADCASTB = auto(); VPBROADCASTW = auto(); VPBROADCASTD = auto(); VPBROADCASTQ = auto() # noqa: E702
  VBROADCASTSS = auto() # TODO: VBROADCASTSD is ymm only, add once they are supported
  # int division
-  IDIV = auto()
+  IDIV = auto(); DIV = auto() # noqa: E702
  CBW = auto(); CWD = auto(); CDQ = auto(); CQO = auto() # noqa: E702
  # int binary
  ADD = auto(); ADDi = auto(); SUB = auto(); SUBi = auto(); IMUL = auto(); IMULi = auto() # noqa: E702
@@ -216,7 +216,7 @@ class X86GroupOp:
                X86Ops.VPMOVSXBW, X86Ops.VPMOVSXBD, X86Ops.VPMOVSXBQ, X86Ops.VPMOVSXWD, X86Ops.VPMOVSXWQ, X86Ops.VPMOVSXDQ,
                X86Ops.VCVTDQ2PS, X86Ops.VCVTDQ2PD, X86Ops.VCVTTPS2DQ, X86Ops.VCVTTPD2DQ, X86Ops.VCVTTSS2SI, X86Ops.VCVTTSD2SI,
                X86Ops.VCVTPH2PS, X86Ops.VCVTPS2PD, X86Ops.VCVTPD2PS, X86Ops.CMOVNE, X86Ops.CMOVE, X86Ops.CMOVL, X86Ops.CMOVB,
-                X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.LEA,
+                X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV, X86Ops.LEA,
                X86Ops.VPBROADCASTB, X86Ops.VPBROADCASTW, X86Ops.VPBROADCASTD, X86Ops.VPBROADCASTQ, X86Ops.VBROADCASTSS}

  # X86Ops whose second src can read from memory NOTE: some of these are TwoAddress1st so the second src is actually the first
@@ -243,7 +243,8 @@ class X86GroupOp:
               X86Ops.JE, X86Ops.JNE}

  # X86Ops that write flags or can modify flags to undefined values
-  WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
-                X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.OR, X86Ops.ORi}
+  WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV,
+                X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
+                X86Ops.OR, X86Ops.ORi}

  All = set(X86Ops)