DEV is ContextVar, setting Device.DEFAULT is deprecated (#15508)

This commit is contained in:
Christopher Milan 2026-03-30 14:10:49 -07:00 committed by GitHub
commit adbfd82d1d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 72 additions and 59 deletions

View file

@ -2,13 +2,14 @@
import os
if "NOOPT" not in os.environ: os.environ["NOOPT"] = "1"
from tinygrad import Device, nn, Tensor, dtypes
Device.DEFAULT = "CPU"
from train_gpt2 import GPT, GPTConfig
from tinygrad.helpers import dedup, flatten, getenv, GlobalCounters, to_function_name
from tinygrad.helpers import DEV, dedup, flatten, getenv, GlobalCounters, to_function_name
from tinygrad.engine.realize import get_kernel
from tinygrad.engine.memory import memory_planner
from tinygrad.uop.ops import Ops
DEV.value = "CPU"
TIMING = getenv("TIMING")
if __name__ == "__main__":

View file

@ -325,19 +325,18 @@ def eval_stable_diffusion():
# NOTE: the clip weights are the same between model.cond_stage_model and clip_encoder
eval_timesteps = list(reversed(range(1, 1000, 20)))
original_device, Device.DEFAULT = Device.DEFAULT, "CPU"
# The choice of alphas_prev[0] = alphas_cumprod[0] seems arbitrary, but it's how the mlperf ref does it:
# alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
eval_alphas_prev = model.alphas_cumprod[0:1].cat(model.alphas_cumprod[list(range(1, 1000, 20))[:-1]]).to(GPUS).realize()
inception = FidInceptionV3().load_from_pretrained(CKPTDIR / "inception" / "pt_inception-2015-12-05-6726825d.pth")
vision_cfg = {'width': 1280, 'layers': 32, 'd_head': 80, 'image_size': 224, 'patch_size': 14}
text_cfg = {'width': 1024, 'n_heads': 16, 'layers': 24, 'vocab_size': 49408, 'ctx_length': 77}
clip.gelu = gelu_erf
clip_encoder = OpenClipEncoder(1024, text_cfg, vision_cfg)
loaded = torch_load(CKPTDIR / "clip" / "open_clip_pytorch_model.bin")
loaded.update({"attn_mask": clip_encoder.attn_mask, "mean": clip_encoder.mean, "std": clip_encoder.std})
load_state_dict(clip_encoder, loaded)
Device.DEFAULT=original_device
with Context(DEV="CPU"):
# The choice of alphas_prev[0] = alphas_cumprod[0] seems arbitrary, but it's how the mlperf ref does it:
# alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
eval_alphas_prev = model.alphas_cumprod[0:1].cat(model.alphas_cumprod[list(range(1, 1000, 20))[:-1]]).to(GPUS).realize()
inception = FidInceptionV3().load_from_pretrained(CKPTDIR / "inception" / "pt_inception-2015-12-05-6726825d.pth")
vision_cfg = {'width': 1280, 'layers': 32, 'd_head': 80, 'image_size': 224, 'patch_size': 14}
text_cfg = {'width': 1024, 'n_heads': 16, 'layers': 24, 'vocab_size': 49408, 'ctx_length': 77}
clip.gelu = gelu_erf
clip_encoder = OpenClipEncoder(1024, text_cfg, vision_cfg)
loaded = torch_load(CKPTDIR / "clip" / "open_clip_pytorch_model.bin")
loaded.update({"attn_mask": clip_encoder.attn_mask, "mean": clip_encoder.mean, "std": clip_encoder.std})
load_state_dict(clip_encoder, loaded)
@TinyJit
def denoise_step(x:Tensor, x_x:Tensor, t_t:Tensor, uc_c:Tensor, sqrt_alphas_cumprod_t:Tensor, sqrt_one_minus_alphas_cumprod_t:Tensor,

View file

@ -3,7 +3,7 @@ from extra.export_model import export_model
from examples.llama3 import build_transformer, Tokenizer
from tinygrad.nn.state import get_state_dict, load_state_dict
from tinygrad import Device, Variable, Tensor, dtypes, TinyJit
from tinygrad.helpers import fetch, Context
from tinygrad.helpers import DEV, fetch, Context
from tiktoken.load import load_tiktoken_bpe, dump_tiktoken_bpe
def prepare_browser_chunks(model):
@ -115,7 +115,7 @@ if __name__=="__main__":
start_pos = Variable("start_pos", 0, max_context).bind(0)
model_input = lambda: [Tensor([[tok]]), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P]
Device.DEFAULT="CPU"
DEV.value = "CPU"
model = build_transformer(model_path, model_size="1B", quantize="int8", scale_dtype=dtypes.float32, device=Device.DEFAULT, max_context=max_context)
state_dict = get_state_dict(model)
validate_model(model, tokenizer)
@ -129,7 +129,7 @@ if __name__=="__main__":
with open(os.path.join(os.path.dirname(__file__), f"{model_name}.c"), "w") as f: f.write(cprog)
with open(os.path.join(os.path.dirname(__file__), "net_clang.js"), "w") as f: f.write(js_wrapper)
Device.DEFAULT="WEBGPU"
DEV.value = "WEBGPU"
# float16 is not yet supported for dawn/Vulkan/NVIDIA stack, see: https://issues.chromium.org/issues/42251215
# therefore for now, we used CLANG to quantize the float16 llama to int8 with float32 scales, then load to WEBGPU
model = build_transformer(model_path, model_size="1B", quantize="int8", max_context=max_context, load_weights=False)

View file

@ -4,8 +4,8 @@ from extra.f16_decompress import u32_to_f16
from examples.stable_diffusion import StableDiffusion
from tinygrad.nn.state import get_state_dict, safe_save, safe_load_metadata, torch_load, load_state_dict
from tinygrad.tensor import Tensor
from tinygrad import Device, dtypes
from tinygrad.helpers import fetch
from tinygrad import dtypes
from tinygrad.helpers import DEV, fetch
from typing import NamedTuple, Any, List
import requests
import argparse
@ -80,7 +80,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--remoteweights', action='store_true', help="Use safetensors from Huggingface, or from local")
args = parser.parse_args()
Device.DEFAULT = "WEBGPU"
DEV.value = "WEBGPU"
model = StableDiffusion()

View file

@ -4,10 +4,11 @@ from tinygrad.tensor import Tensor
from tinygrad.nn.state import safe_save
from extra.export_model import export_model
from tinygrad.device import Device
from tinygrad.helpers import DEV
from tinygrad.nn.state import safe_load, load_state_dict
if __name__ == "__main__":
Device.DEFAULT = "WEBGPU"
DEV.value = "WEBGPU"
yolo_variant = 'n'
yolo_infer = YOLOv8(w=0.25, r=2.0, d=0.33, num_classes=80)
state_dict = safe_load(get_weights_location(yolo_variant))

View file

@ -1,11 +1,11 @@
from tinygrad import nn, Tensor, Device, dtypes
from tinygrad.helpers import Timing
from tinygrad import nn, Tensor, dtypes
from tinygrad.helpers import DEV, Timing
from extra.models.llama import Transformer
from examples.llama3 import MODEL_PARAMS
if __name__ == "__main__":
Device.DEFAULT = "NULL"
DEV.value = "NULL"
Tensor.training = True
#model_size = "8B"
model_size = "405B"

View file

@ -7,7 +7,7 @@ from onnx2torch import convert
from tinygrad.nn.onnx import OnnxRunner
from tinygrad.helpers import OSX, DEBUG, fetch, getenv
from tinygrad.dtype import _to_np_dtype
from tinygrad import Tensor, Device, dtypes
from tinygrad import Tensor, Device, Context, dtypes
MODELS = {
"resnet50": "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
@ -60,16 +60,16 @@ def benchmark_model(m, devices, validate_outs=False):
# print input names
if DEBUG >= 2: print(list(runner.graph_inputs))
for device in devices:
Device.DEFAULT = device
inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
tinygrad_model = runner.to(device)
benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})
with Context(DEV=device):
inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
tinygrad_model = runner.to(device)
benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})
from tinygrad.engine.jit import TinyJit
tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
del inputs, tinygrad_model, tinygrad_jitted_model
from tinygrad.engine.jit import TinyJit
tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
del inputs, tinygrad_model, tinygrad_jitted_model
# convert model to torch
try:
@ -104,22 +104,22 @@ def benchmark_model(m, devices, validate_outs=False):
if validate_outs:
for device in devices:
rtol, atol = 2e-3, 2e-3 # tolerance for fp16 models
Device.DEFAULT = device
# force half inputs to float for numerical stability when validating
# this will rely on automatic dtype promotion for converting half weights inside the graph
if m in half_models:
inputs = {k:Tensor(inp, dtype=dtypes.float32) if inp.dtype == np.float16 else Tensor(inp) for k,inp in np_inputs.items()}
else:
inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
tinygrad_model = runner.to(device)
tinygrad_out = tinygrad_model(inputs)
with Context(DEV=device):
# force half inputs to float for numerical stability when validating
# this will rely on automatic dtype promotion for converting half weights inside the graph
if m in half_models:
inputs = {k:Tensor(inp, dtype=dtypes.float32) if inp.dtype == np.float16 else Tensor(inp) for k,inp in np_inputs.items()}
else:
inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
tinygrad_model = runner.to(device)
tinygrad_out = tinygrad_model(inputs)
ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
onnx_out = ort_sess.run(output_names, np_inputs)
onnx_out = dict([*list(zip(output_names, onnx_out))])
ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
onnx_out = ort_sess.run(output_names, np_inputs)
onnx_out = dict([*list(zip(output_names, onnx_out))])
assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
print(f"{m:16s}outputs validated on {device=} with rtol={rtol:.1e}, atol={atol:.1e}")
assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
print(f"{m:16s}outputs validated on {device=} with rtol={rtol:.1e}, atol={atol:.1e}")
if open_csv is None:
open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))

View file

@ -26,13 +26,16 @@ class TestDevice(unittest.TestCase):
def test_lowercase_canonicalizes(self):
device = Device.DEFAULT
Device.DEFAULT = device.lower()
self.assertEqual(Device.canonicalize(None), device)
Device.DEFAULT = device
with Context(DEV=device.lower()):
self.assertEqual(Device.canonicalize(None), device)
def test_set_device_default_raises(self):
with self.assertRaisesRegex(AttributeError, "setting Device.DEFAULT is deprecated"):
Device.DEFAULT = "CPU"
def test_old_device_env_raises(self):
result = subprocess.run(['python3', '-c', 'from tinygrad import Device; Device.DEFAULT'],
env={**os.environ, "CPU": "1"}, capture_output=True)
env={**os.environ, "CPU": "1", "DEV": ""}, capture_output=True)
self.assertNotEqual(result.returncode, 0)
self.assertIn(b"deprecated", result.stderr)
@ -95,6 +98,12 @@ class TestDevice(unittest.TestCase):
with patch("tinygrad.renderer.cstyle.ClangJITRenderer.__init__", side_effect=RuntimeError("broken")):
self.assertIsInstance(dev.renderer.compiler, CPULLVMCompiler)
def test_dev_contextvar(self):
orig_dev = Device.DEFAULT
with Context(DEV="CPU"): self.assertEqual(Tensor.empty(1).device, "CPU")
with Context(DEV="NULL"): self.assertEqual(Tensor.empty(1).device, "NULL")
self.assertEqual(Tensor.empty(1).device, orig_dev)
class MockCompiler(Compiler):
def __init__(self, key): super().__init__(key)
def compile(self, src) -> bytes: return src.encode()

View file

@ -5,7 +5,7 @@ from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
from tinygrad.helpers import BENCHMARKS, CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, ContextVar
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, DEV, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import EMULATE, EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, IMAGE, FLOAT16, TracingKey, size_to_str
from tinygrad.dtype import DType, PtrDType, dtypes, _to_np_dtype
if TYPE_CHECKING: from tinygrad.renderer import Renderer
@ -39,11 +39,14 @@ class _Device:
def get_available_devices(self) -> Iterator[str]:
for device in ALL_DEVICES:
with contextlib.suppress(Exception): yield self[device].device
@property
def DEFAULT(self) -> str: return DEV.value.upper() if DEV else self._select_device
@DEFAULT.setter
def DEFAULT(self, v): raise AttributeError(f'setting Device.DEFAULT is deprecated, use "with Context(DEV={v!r})" or "DEV.value = {v!r}"')
@functools.cached_property
def DEFAULT(self) -> str:
def _select_device(self) -> str:
assert (dev:=next((d for d in self._devices if d not in ["DISK", "TINYFS", "NPY"] and getenv(d) == 1), None)) is None, \
f"{dev}=1 is deprecated, use DEV={dev} instead"
if (dev:=getenv("DEV", "").upper()): return dev
try:
device = next(self.get_available_devices())
os.environ["DEV"] = device # we set this in environment for spawned children

View file

@ -176,7 +176,7 @@ class ContextVar(Generic[T]):
assert isinstance(self.value, str)
return [getattr(obj, x) if obj else x for x in self.value.split(',') if x]
DEBUG, BEAM, NOOPT = ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
DEV, DEBUG, BEAM, NOOPT = ContextVar("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)