This commit is contained in:
ttomsa 2026-01-01 02:26:48 +00:00
commit 1eca96ea44
6 changed files with 56 additions and 36 deletions

View file

@ -609,7 +609,6 @@ class TestOps(unittest.TestCase):
helper_test_op([()], lambda x: x/2)
helper_test_op([()], lambda x: 2/x)
@unittest.skip("seg fault")
def test_mod(self):
a = [-4, 7, 5, 4, -7, 8, -9]
b = [2, -3, 8, -2, 3, 5, -5]
@ -2150,7 +2149,6 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride),
lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5)
@unittest.skip("seg fault")
@slow_test
def test_output_padded_conv_transpose2d(self):
for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]:
@ -2571,7 +2569,6 @@ class TestOps(unittest.TestCase):
self.helper_test_exception([shape], lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
lambda x: Tensor.avg_pool2d(x, kernel_size=(2,2), padding=(1,1,1)), expected=(RuntimeError, ValueError))
@unittest.skip("seg fault")
@slow_test
def test_avg_pool2d_padding_not_counted(self):
shape = (32,2,111,28)

View file

@ -5,7 +5,7 @@ from dataclasses import dataclass
from tinygrad.dtype import dtypes, ImageDType, DType, AddrSpace, Invalid, PtrDType
from tinygrad.uop.ops import UOp, Ops, UPat, PatternMatcher, GroupOp, identity_element
from tinygrad.uop.symbolic import uop_given_valid, parse_valid, invalid_gate
from tinygrad.helpers import getenv, flatten, AMX, prod
from tinygrad.helpers import getenv, flatten, AMX, CPU_X86, prod
from tinygrad.renderer import Renderer
# ***** image load valid simplification *****
@ -152,6 +152,9 @@ def split_load_store(ctx:Renderer|None, ls:UOp, idx:UOp):
pass
elif isinstance(buf.dtype, ImageDType):
lengths = [4]
elif ctx is not None and CPU_X86:
lengths = [4,2] if buf.dtype.base == dtypes.float32 else []
#must_divide = False
elif ctx is not None and ctx.supports_float4:
# TODO: a better way to get this than ctx
lengths = [8,4,2] if buf.dtype.base == dtypes.half and getenv("ALLOW_HALF8") else ([16,8,4,2] if AMX else [4,2])

View file

@ -6,6 +6,7 @@ from tinygrad.uop.ops import PatternMatcher, UOp, Ops, UPat, multirange_str
from tinygrad.helpers import prod, getenv, TUPLE_ORDER
def linearize(sink:UOp) -> list[UOp]:
from tinygrad.renderer.x86 import RSP
# this is a toposort with priority
lst = list(sink.toposort())
consumers: defaultdict[UOp, list[UOp]] = defaultdict(list)
@ -37,7 +38,8 @@ def linearize(sink:UOp) -> list[UOp]:
case Ops.RANGE: priority = 5 # placing RANGE is good
case Ops.END: priority = -5 # placing END is bad
# x86 op version
case X86Ops.DEFINE_REG: priority = -20
# stack pointer needs to be scheduled at the top of the kernel
case X86Ops.DEFINE_REG: priority = -21 if u.arg == RSP else -20
case X86Ops.IMM: priority = -10
case _: priority = 0 # everything else has priority 0
priorities[u] = (run_count, priority, extra)

View file

@ -5,7 +5,7 @@ from typing import Any, Generic, TypeVar, Iterator, Generator
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, CPU_X86, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
from tinygrad.renderer import Renderer
@ -347,7 +347,7 @@ def is_dtype_supported(dtype:DType, device:str|None=None) -> bool:
if device == "METAL": return not CI
if device == "CUDA": return not CI and not CUDA_PTX
if device == "NV": return not CI and not NV_PTX and not NV_NAK
if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP
if device in {"CPU"}: return not CI and platform.machine() in {"arm", "arm64", "aarch64", "x86_64", "amd64"} and not CPU_LVP and not CPU_X86
return device in {"AMD", "PYTHON", "NULL"}
if dtype in dtypes.fp8s:
if device == "CUDA": return not CI and not CUDA_PTX

View file

@ -1,6 +1,6 @@
import sys, struct
from typing import cast
from tinygrad.dtype import dtypes, PtrDType, DType
from tinygrad.dtype import dtypes, PtrDType, DType, truncate
from tinygrad.uop import Ops, X86Ops, GroupOp, X86GroupOp
from tinygrad.uop.ops import UOp, UPat, PatternMatcher
from tinygrad.renderer import Renderer
@ -154,6 +154,8 @@ x86_extra_matcher = base_extra_matcher + PatternMatcher([
# if gate in scalar int cmove is not a comparison need to add one to set the flag
(UPat.var("m", dtypes.bool).where(UPat.var("a", dtypes.ints), UPat.var("b")),
lambda m,a,b: m.ne(0).where(a,b) if m.op not in GroupOp.Comparison and a.dtype.count == 1 else None),
# TODO: do we want this? Kinda not needed if DEVECTORIZE=0. If yes make it general
(UPat(Ops.VECTORIZE, dtypes.float16, name="x"), lambda x: x.replace(dtype=dtypes.float32.vec(x.dtype.count), src=tuple(s.src[0] for s in x.src)).cast(x.dtype) if all(s.op is Ops.CAST for s in x.src) else None),
])
# ***** X86 instruction selection pre matcher *****
@ -203,8 +205,9 @@ WGPR = tuple(r for r in GPR if r != RSP)
def imm(dt:DType, v:int|float) -> UOp:
  # build an X86Ops.IMM uop of dtype dt that carries the python value v as its arg
  return UOp(X86Ops.IMM, dt, arg=v)
# Turn a CONST uop into an X86Ops.IMM uop, returns None when the uop can't be used as an immediate.
# NOTE(review): this span comes from a diff view, the uints/sints lines look like the pre-change version and the
# int64/uint64/ints lines like their replacement -- confirm against the actual file
def to_imm(c:UOp) -> UOp|None:
# only constants can become immediates
if c.op is not Ops.CONST: return None
# unsigned ints and bool that don't overflow uint32, the immediate dtype is min(dtypes.uint32, c.dtype)
if c.dtype in dtypes.uints+(dtypes.bool,) and not c.overflows(dtypes.uint32): return imm(min(dtypes.uint32, c.dtype), c.arg)
# signed ints that don't overflow int32, the immediate dtype is min(dtypes.int32, c.dtype)
if c.dtype in dtypes.sints and not c.overflows(dtypes.int32): return imm(min(dtypes.int32, c.dtype), c.arg)
# 64 bit constants become 32 bit immediates, but only when the value doesn't overflow 32 bits
if c.dtype is dtypes.int64: return imm(dtypes.int32, c.arg) if not c.overflows(dtypes.int32) else None
if c.dtype is dtypes.uint64: return imm(dtypes.uint32, c.arg) if not c.overflows(dtypes.uint32) else None
# every other int and bool keeps its own dtype
if c.dtype in dtypes.ints+(dtypes.bool,): return imm(c.dtype, c.arg)
# any other dtype is not turned into an immediate
return None
def disp(c:UOp) -> UOp:
  # displacement immediate: int8 when the value of c fits, int32 otherwise
  disp_dt = dtypes.int32 if c.overflows(dtypes.int8) else dtypes.int8
  return imm(disp_dt, c.arg)
def cmp(x:UOp):
  # compare the two srcs of x, the immediate form (CMPi) is used when src[1] can be turned into an immediate
  rhs_imm = to_imm(x.src[1])
  if rhs_imm is None: return UOp(X86Ops.CMP, src=x.src)
  return UOp(X86Ops.CMPi, src=(x.src[0], rhs_imm))
@ -239,10 +242,19 @@ def vpins(x:UOp) -> UOp:
for i,s in enumerate(x.src[1:], 1): shuf = UOp(op, x.dtype, (shuf, s, imm(dtypes.uint8, i)))
return shuf
def div(ctx:IselContext, x:UOp):
  # unsigned integer division (the isel rewrite for uints //)
  # src[0] is moved into rax and rdx is set to 0
  dividend = UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX))
  rdx_zero = UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), ctx.vreg(RDX))
  # src[1] is moved into any WGPR register that isn't rax or rdx
  divisor_regs = tuple(r for r in WGPR if r not in (RAX, RDX))
  divisor = UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(divisor_regs))
  # the DIV itself is defined in rax
  quotient = UOp(X86Ops.DIV, x.dtype, (divisor, rdx_zero, dividend), ctx.vreg(RAX))
  # the result goes through a plain MOV, presumably so the users of the value aren't tied to rax
  return UOp(X86Ops.MOV, x.dtype, (quotient,))
# Signed integer division (the isel rewrite for sints //).
# NOTE(review): this span comes from a diff view, the first return looks like the pre-change line and the three
# lines after it like its replacement -- confirm against the actual file
def idiv(ctx:IselContext, x:UOp):
# CBW/CWD/CDQ/CQO is picked by the size of the operand in bytes
cdq_op = {1: X86Ops.CBW, 2: X86Ops.CWD, 4: X86Ops.CDQ, 8: X86Ops.CQO}[x.dtype.itemsize]
# src[0] is moved into rax and fed to the cdq op, the cdq op itself is defined in rdx
cdq = UOp(cdq_op, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[0],), ctx.vreg(RAX)),), ctx.vreg(RDX))
# here src[1] only has to stay out of rax and the IDIV is returned directly
return UOp(X86Ops.IDIV, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r != RAX))), cdq), ctx.vreg(RAX))
# here src[1] has to stay out of both rax and rdx, the IDIV is defined in rax
idiv = UOp(X86Ops.IDIV, x.dtype, (UOp(X86Ops.MOV, x.dtype, (x.src[1],), ctx.vreg(tuple(r for r in WGPR if r not in (RAX, RDX)))), cdq), ctx.vreg(RAX))
# this move "cleanses" the register constraint (rax) of idiv, this is because the constraint only applies on definition and not on the uses of idiv
return UOp(X86Ops.MOV, x.dtype, (idiv,))
def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:
# fuse INDEX into the address if only used once, if there was a displacement it was already moved into the load/store to expose the base index
@ -252,16 +264,12 @@ def fuse_index(ctx:IselContext, x:UOp) -> tuple[UOp, ...]:
# Replace the src at position i of x with whatever fuse_index returns for it, returns None when nothing is fused.
def fuse_load(ctx:IselContext, x:UOp, i:int) -> UOp|None:
# if the load is used multiple times we don't fuse
# NOTE(review): the two returns below test the same condition (ctx.uses[x.src[i]] has exactly one entry and x.src[i]
# appears once in x.src), they look like the before/after lines of a diff -- confirm only one is in the actual file
return x.replace(src=x.src[:i] + fuse_index(ctx, x.src[i]) + x.src[i+1:]) if len(ctx.uses[x.src[i]]) == 1 and x.src.count(x.src[i]) == 1 else None
return x.replace(src=x.src[:i] + fuse_index(ctx, x.src[i]) + x.src[i+1:]) if len(ctx.uses[x.src[i]]) == x.src.count(x.src[i]) == 1 else None
# TODO: args on the stack
def x86_abi(ctx:IselContext, x:UOp):
  # function args are pinned to the argument registers of the platform, x.arg indexes the register list
  # NOTE: x.arg past the end of the register list raises IndexError, args that live on the stack are not handled here
  arg_regs = (RCX, RDX, GPR[8], GPR[9]) if sys.platform == "win32" else (RDI, RSI, RDX, RCX, GPR[8], GPR[9])
  return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg((arg_regs[x.arg],)))
def abi(ctx:IselContext, x:UOp):
  # function abi constraints: x.arg indexes the argument registers of the platform, args past the end of the
  # register list are read from the stack instead
  if sys.platform == "win32": arg_regs, stack_disp = (RCX, RDX, GPR[8], GPR[9]), (x.arg-3)*8+32
  else: arg_regs, stack_disp = (RDI, RSI, RDX, RCX, GPR[8], GPR[9]), (x.arg-5)*8
  if x.arg < len(arg_regs): return x.replace(op=X86Ops.DEFINE_REG, arg=ctx.vreg((arg_regs[x.arg],)))
  # the displacement is a FRAME_INDEX and not an IMM because the stack size isn't known at this point
  # NOTE(review): the 8 and the win32 +32 presumably skip the return address and the shadow space -- confirm
  return UOp(X86Ops.MOV, x.dtype, (def_reg(dtypes.uint64, RSP), UOp(Ops.NOOP), UOp(X86Ops.FRAME_INDEX, dtypes.int32, arg=stack_disp)))
dts = dtypes.ints + dtypes.masks + (dtypes.bool, dtypes.float16, dtypes.float32, dtypes.float64)
dt_16bit = tuple(dt.vec(l) for dt in dts for l in [2,1] if dt.vec(l).itemsize == 2 and dt.vec(l) not in dtypes.int16s)
@ -272,14 +280,14 @@ dt_128bit = tuple(dt.vec(l) for dt in dts for l in [16,8,4,2,1] if dt.vec(l).ite
isel_matcher = PatternMatcher([
# **** Op rewrites ****
# TODO: add callee saved registers on windows to RET
# RET, add frame pointer to it. This makes it so the prologue and epilogue are automatically setup by the register allocator
(UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
# RET, add stack pointer to it. Also add frame pointer, this makes it so the prologue and epilogue are automatically set up by the register allocator
(UPat(Ops.SINK, name="x"), lambda x: x.replace(op=X86Ops.RET, src=x.src + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP),) + (UOp(X86Ops.DEFINE_REG, dtypes.uint64, arg=RBP),))),
# TODO: RANGE and END is tricky. Both linearizer and regalloc need them so they stay as Ops. This gets into a broader issue with tinygrad
# not being able to represent control flow properly. For now they are rewritten after regalloc
# HACK: annoying hack so const doesn't get rewritten because linearizer needs it
(UPat(Ops.RANGE, name="x"), lambda ctx,x: x.replace(src=(x.src[0].replace(tag=1),) + x.src[1:], arg=ctx.vreg(WGPR)) if x.src[0].tag is None else None),
# function abi constraints
(UPat(Ops.DEFINE_GLOBAL, name="x"), x86_abi),
(UPat(Ops.DEFINE_GLOBAL, name="x"), abi),
# these are treated the same for now
(UPat((Ops.DEFINE_REG, Ops.DEFINE_LOCAL), name="x"),
lambda ctx,x: x.replace(op=X86Ops.LEA, src=(UOp(X86Ops.DEFINE_REG, x.dtype, arg=RSP), UOp(Ops.NOOP), imm(dtypes.int32, ctx.inc_stack(x.dtype.nbytes()))), arg=None)), # noqa: E501
@ -378,7 +386,8 @@ isel_matcher = PatternMatcher([
(UPat(Ops.SUB, dtypes.int64s, name="x"), lambda x: x.replace(op=X86Ops.VPSUBQ) if x.dtype.count > 1 else None),
(UPat(Ops.MUL, dtypes.int16s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLW) if x.dtype.count > 1 else None),
(UPat(Ops.MUL, dtypes.int32s, name="x"), lambda x: x.replace(op=X86Ops.VPMULLD) if x.dtype.count > 1 else None),
# scalar int binary TODO: uint idiv
# scalar int binary
((UPat(dtype=dtypes.uints) // UPat()).named("x"), div),
((UPat(dtype=dtypes.sints) // UPat()).named("x"), idiv),
((UPat.var("a", dtypes.ints) << UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHLi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHL)), # noqa: E501
((UPat.var("a", dtypes.uints) >> UPat.var("b")).named("x"), lambda a,b,x: x.replace(op=X86Ops.SHRi, src=(a, imm(dtypes.uint8, b.arg))) if b.op is Ops.CONST else x.replace(op=X86Ops.SHR)), # noqa: E501
@ -465,9 +474,11 @@ isel_matcher = PatternMatcher([
# final rewrite to match the isa spec
post_regalloc_matcher = PatternMatcher([
# alloc stack space
(UPat(X86Ops.DEFINE_REG, arg=RDI, name="x"), lambda ctx,x: (x, [UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
(UPat(X86Ops.DEFINE_REG, dtypes.uint64, arg=RSP, name="x"), lambda ctx,x: (x, [x, UOp(X86Ops.SUBi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP)]) if ctx.stack_size > 0 else None),
# dealloc stack space
(UPat(X86Ops.RET, name="x"), lambda ctx,x: (x, [UOp(X86Ops.ADDi, dtypes.uint64, (imm(dtypes.uint32, ctx.stack_size),), RSP), x]) if ctx.stack_size > 0 else None),
# rewrite FRAME_INDEX to IMM now that the stack size is known
(UPat(X86Ops.FRAME_INDEX, name="x"), lambda ctx,x: (nx:=x.replace(op=X86Ops.IMM, arg=ctx.stack_size + x.arg), [nx])),
# this is the CONST in RANGE
(UPat(Ops.CONST, name="x"), lambda x: (nx:=imm(x.dtype, x.arg), [nx])),
# rewrite RANGE to MOV reg, 0. Terrible HACK to pass the CONST to the END
@ -475,6 +486,9 @@ post_regalloc_matcher = PatternMatcher([
# rewrite END to ADD 1 -> CMPLT -> JUMP
(UPat(Ops.END, name="x"), lambda x: (jl:=x.replace(op=X86Ops.JL, src=(x.src[1], cmp:=UOp(X86Ops.CMPi,
src=(add:=UOp(X86Ops.ADDi, x.src[1].dtype, (imm(x.src[1].dtype, 1),), x.src[1].arg), imm(x.src[1].dtype, x.src[1].tag))))), [add, cmp, jl])),
# TODO: need a generic way to model clobbers, idiv and flags should be handled the same way, maybe add clobber field to Register?
# fixup div, zero rdx again because scheduling constraint isn't being respected
(UPat(X86Ops.DIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:1]), [UOp(X86Ops.MOVi, x.dtype, (imm(min(dtypes.uint32, x.dtype), 0),), RDX), nx])),
# remove cdq from idiv
(UPat(X86Ops.IDIV, name="x"), lambda x: (nx:=x.replace(src=x.src[:-1]), [nx])),
# rewrite two address instructions to two address form, if reused src wasn't coalesced insert a move
@ -484,6 +498,11 @@ post_regalloc_matcher = PatternMatcher([
# ***** X86 instruction encoding *****
def to_bytes(dt:DType, v:int|float):
  # little endian bytes of v as dtype dt, v is passed through truncate[dt] first
  val = truncate[dt](v)
  # everything that isn't a float goes through int.to_bytes, signed only for sints
  if dt not in dtypes.floats: return val.to_bytes(dt.itemsize, 'little', signed=dt in dtypes.sints)
  # only float16, float32 and float64 have a struct format here, any other float dtype raises KeyError
  float_fmt = {dtypes.float16: "<e", dtypes.float32: "<f", dtypes.float64: "<d"}
  return struct.pack(float_fmt[dt], val)
def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
# get the encoding structure of the uop
reg_uop, vvvv_uop, rm_uop, idx_uop, disp_uop, imm_uop = None, None, None, None, None, None
@ -573,9 +592,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
# IMM byte
if imm_uop is not None:
if isinstance(imm_uop.arg, Register): inst += bytes([(imm_uop.arg.index & 0b1111) << 4 | 0b0000])
else:
_imm = int.from_bytes(struct.pack({2: "<e", 4: "<f", 8: "<d"}[imm_uop.dtype.itemsize], imm_uop.arg), "little") if isinstance(imm_uop.arg, float) else imm_uop.arg
inst += _imm.to_bytes(imm_uop.dtype.itemsize, 'little', signed=imm_uop.dtype in dtypes.sints)
else: inst += to_bytes(imm_uop.dtype, imm_uop.arg)
return inst
# https://www.felixcloutier.com/x86/
@ -584,7 +601,7 @@ def encode(x:UOp, opc:int, reg:int|None=None, pp:int=0, sel:int=0, we:int=0):
# map select: 0F == 1, 0F38 == 2, 0F3A == 3
encodings = PatternMatcher([
# moves
(UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + cast(int, x.src[0].arg).to_bytes(8, 'little', signed=x.src[0].dtype in dtypes.sints)),
(UPat(X86Ops.MOVABS, name="x"), lambda x: bytes([0b0100 << 4 | 0b1 << 3 | 0b00 << 2 | x.arg.index >> 3, 0xB8 + (x.arg.index & 0b111)]) + to_bytes(x.src[0].dtype, x.src[0].arg)),
(UPat(X86Ops.MOV, name="x"), lambda x: encode(x, 0x8B)), (UPat(X86Ops.MOVi, name="x"), lambda x: encode(x, 0xC7, reg=0)),
(UPat(X86Ops.MOVm, name="x"), lambda x: encode(x, 0x89)), (UPat(X86Ops.LEA, name="x"), lambda x: encode(x, 0x8D)),
(UPat(X86Ops.VMOVSS, name="x"), lambda x: encode(x, 0x10, pp=2, sel=1)), (UPat(X86Ops.VMOVSSm, name="x"), lambda x: encode(x, 0x11, pp=2, sel=1)),
@ -613,7 +630,7 @@ encodings = PatternMatcher([
# int division
(UPat(X86Ops.CBW), lambda: bytes([0x66, 0x98])), (UPat(X86Ops.CWD), lambda: bytes([0x66, 0x99])),
(UPat(X86Ops.CDQ), lambda: bytes([0x99])), (UPat(X86Ops.CQO), lambda: bytes([0x48, 0x99])),
(UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.IDIV, dtypes.uints, name="x"), lambda x: encode(x, 0xF7, reg=6)),
(UPat(X86Ops.IDIV, name="x"), lambda x: encode(x, 0xF7, reg=7)), (UPat(X86Ops.DIV, name="x"), lambda x: encode(x, 0xF7, reg=6)),
# scalar int binary
(UPat(X86Ops.SHLi, name="x"), lambda x: encode(x, 0xC1, reg=4)),
(UPat(X86Ops.SHRi, name="x"), lambda x: encode(x, 0xC1, reg=5)), (UPat(X86Ops.SARi, name="x"), lambda x: encode(x, 0xC1, reg=7)),

View file

@ -134,8 +134,8 @@ class GroupOp:
# NOTE: X86Ops with i suffix are variants that take an immediate, m suffix are variants that can write to memory instead of read from
class X86Ops(FastEnum):
# register, not an instruction
DEFINE_REG = auto()
# register, not an instruction. FRAME_INDEX is used when the function arg is on the stack and is rewritten to IMM when stack size is known
DEFINE_REG = auto(); FRAME_INDEX = auto() # noqa: E702
# const
IMM = auto()
# index
@ -173,7 +173,7 @@ class X86Ops(FastEnum):
VPBROADCASTB = auto(); VPBROADCASTW = auto(); VPBROADCASTD = auto(); VPBROADCASTQ = auto() # noqa: E702
VBROADCASTSS = auto() # TODO: VBROADCASTSD is ymm only, add once they are supported
# int division
IDIV = auto()
IDIV = auto(); DIV = auto() # noqa: E702
CBW = auto(); CWD = auto(); CDQ = auto(); CQO = auto() # noqa: E702
# int binary
ADD = auto(); ADDi = auto(); SUB = auto(); SUBi = auto(); IMUL = auto(); IMULi = auto() # noqa: E702
@ -216,7 +216,7 @@ class X86GroupOp:
X86Ops.VPMOVSXBW, X86Ops.VPMOVSXBD, X86Ops.VPMOVSXBQ, X86Ops.VPMOVSXWD, X86Ops.VPMOVSXWQ, X86Ops.VPMOVSXDQ,
X86Ops.VCVTDQ2PS, X86Ops.VCVTDQ2PD, X86Ops.VCVTTPS2DQ, X86Ops.VCVTTPD2DQ, X86Ops.VCVTTSS2SI, X86Ops.VCVTTSD2SI,
X86Ops.VCVTPH2PS, X86Ops.VCVTPS2PD, X86Ops.VCVTPD2PS, X86Ops.CMOVNE, X86Ops.CMOVE, X86Ops.CMOVL, X86Ops.CMOVB,
X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.LEA,
X86Ops.VROUNDPS, X86Ops.VROUNDPD, X86Ops.VSQRTPS, X86Ops.VSQRTPD, X86Ops.CMPi, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV, X86Ops.LEA,
X86Ops.VPBROADCASTB, X86Ops.VPBROADCASTW, X86Ops.VPBROADCASTD, X86Ops.VPBROADCASTQ, X86Ops.VBROADCASTSS}
# X86Ops whose second src can read from memory NOTE: some of these are TwoAddress1st so the second src is actually the first
@ -243,7 +243,8 @@ class X86GroupOp:
X86Ops.JE, X86Ops.JNE}
# X86Ops that write flags or can modify flags to undefined values
WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.OR, X86Ops.ORi}
WriteFlags = {X86Ops.CMP, X86Ops.CMPi, X86Ops.ADD, X86Ops.ADDi, X86Ops.SUB, X86Ops.SUBi, X86Ops.IMUL, X86Ops.IMULi, X86Ops.IDIV, X86Ops.DIV,
X86Ops.SHL, X86Ops.SHLi, X86Ops.SHR, X86Ops.SHRi, X86Ops.SAR, X86Ops.SARi, X86Ops.AND, X86Ops.ANDi, X86Ops.XOR, X86Ops.XORi,
X86Ops.OR, X86Ops.ORi}
All = set(X86Ops)