DEV is ContextVar, setting Device.DEFAULT is deprecated (#15508)

2026-06-24 02:14:17 +00:00 · 2026-03-30 14:10:49 -07:00 · 2026-03-30 14:10:49 -07:00 · adbfd82d1d
commit adbfd82d1d
parent 9583489068
10 changed files with 72 additions and 59 deletions
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -2,13 +2,14 @@
 import os
 if "NOOPT" not in os.environ: os.environ["NOOPT"] = "1"
 from tinygrad import Device, nn, Tensor, dtypes
-Device.DEFAULT = "CPU"
 from train_gpt2 import GPT, GPTConfig
-from tinygrad.helpers import dedup, flatten, getenv, GlobalCounters, to_function_name
+from tinygrad.helpers import DEV, dedup, flatten, getenv, GlobalCounters, to_function_name
 from tinygrad.engine.realize import get_kernel
 from tinygrad.engine.memory import memory_planner
 from tinygrad.uop.ops import Ops

+DEV.value = "CPU"
+
 TIMING = getenv("TIMING")

 if __name__ == "__main__":
--- a/examples/mlperf/model_eval.py
+++ b/examples/mlperf/model_eval.py
@@ -325,19 +325,18 @@ def eval_stable_diffusion():
  # NOTE: the clip weights are the same between model.cond_stage_model and clip_encoder
  eval_timesteps = list(reversed(range(1, 1000, 20)))

-  original_device, Device.DEFAULT = Device.DEFAULT, "CPU"
-  # The choice of alphas_prev[0] = alphas_cumprod[0] seems arbitrary, but it's how the mlperf ref does it:
-  #   alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
-  eval_alphas_prev = model.alphas_cumprod[0:1].cat(model.alphas_cumprod[list(range(1, 1000, 20))[:-1]]).to(GPUS).realize()
-  inception = FidInceptionV3().load_from_pretrained(CKPTDIR / "inception" / "pt_inception-2015-12-05-6726825d.pth")
-  vision_cfg = {'width': 1280, 'layers': 32, 'd_head': 80, 'image_size': 224, 'patch_size': 14}
-  text_cfg = {'width': 1024, 'n_heads': 16, 'layers': 24, 'vocab_size': 49408, 'ctx_length': 77}
-  clip.gelu = gelu_erf
-  clip_encoder = OpenClipEncoder(1024, text_cfg, vision_cfg)
-  loaded = torch_load(CKPTDIR / "clip" / "open_clip_pytorch_model.bin")
-  loaded.update({"attn_mask": clip_encoder.attn_mask, "mean": clip_encoder.mean, "std": clip_encoder.std})
-  load_state_dict(clip_encoder, loaded)
-  Device.DEFAULT=original_device
+  with Context(DEV="CPU"):
+    # The choice of alphas_prev[0] = alphas_cumprod[0] seems arbitrary, but it's how the mlperf ref does it:
+    #   alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+    eval_alphas_prev = model.alphas_cumprod[0:1].cat(model.alphas_cumprod[list(range(1, 1000, 20))[:-1]]).to(GPUS).realize()
+    inception = FidInceptionV3().load_from_pretrained(CKPTDIR / "inception" / "pt_inception-2015-12-05-6726825d.pth")
+    vision_cfg = {'width': 1280, 'layers': 32, 'd_head': 80, 'image_size': 224, 'patch_size': 14}
+    text_cfg = {'width': 1024, 'n_heads': 16, 'layers': 24, 'vocab_size': 49408, 'ctx_length': 77}
+    clip.gelu = gelu_erf
+    clip_encoder = OpenClipEncoder(1024, text_cfg, vision_cfg)
+    loaded = torch_load(CKPTDIR / "clip" / "open_clip_pytorch_model.bin")
+    loaded.update({"attn_mask": clip_encoder.attn_mask, "mean": clip_encoder.mean, "std": clip_encoder.std})
+    load_state_dict(clip_encoder, loaded)

  @TinyJit
  def denoise_step(x:Tensor, x_x:Tensor, t_t:Tensor, uc_c:Tensor, sqrt_alphas_cumprod_t:Tensor, sqrt_one_minus_alphas_cumprod_t:Tensor,
--- a/examples/tinychat/tinychat-browser/compile.py
+++ b/examples/tinychat/tinychat-browser/compile.py
@@ -3,7 +3,7 @@ from extra.export_model import export_model
 from examples.llama3 import build_transformer, Tokenizer
 from tinygrad.nn.state import get_state_dict, load_state_dict
 from tinygrad import Device, Variable, Tensor, dtypes, TinyJit
-from tinygrad.helpers import fetch, Context
+from tinygrad.helpers import DEV, fetch, Context
 from tiktoken.load import load_tiktoken_bpe, dump_tiktoken_bpe

 def prepare_browser_chunks(model):
@@ -115,7 +115,7 @@ if __name__=="__main__":
  start_pos = Variable("start_pos", 0, max_context).bind(0)
  model_input = lambda: [Tensor([[tok]]), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P]

-  Device.DEFAULT="CPU"
+  DEV.value = "CPU"
  model = build_transformer(model_path, model_size="1B", quantize="int8", scale_dtype=dtypes.float32, device=Device.DEFAULT, max_context=max_context)
  state_dict = get_state_dict(model)
  validate_model(model, tokenizer)
@@ -129,7 +129,7 @@ if __name__=="__main__":
  with open(os.path.join(os.path.dirname(__file__), f"{model_name}.c"), "w") as f: f.write(cprog)
  with open(os.path.join(os.path.dirname(__file__), "net_clang.js"), "w") as f: f.write(js_wrapper)

-  Device.DEFAULT="WEBGPU"
+  DEV.value = "WEBGPU"
  # float16 is not yet supported for dawn/Vulkan/NVIDIA stack, see: https://issues.chromium.org/issues/42251215
  # therefore for now, we used CLANG to quantize the float16 llama to int8 with float32 scales, then load to WEBGPU
  model = build_transformer(model_path, model_size="1B", quantize="int8", max_context=max_context, load_weights=False)
--- a/examples/webgpu/stable_diffusion/compile.py
+++ b/examples/webgpu/stable_diffusion/compile.py
@@ -4,8 +4,8 @@ from extra.f16_decompress import u32_to_f16
 from examples.stable_diffusion import StableDiffusion
 from tinygrad.nn.state import get_state_dict, safe_save, safe_load_metadata, torch_load, load_state_dict
 from tinygrad.tensor import Tensor
-from tinygrad import Device, dtypes
-from tinygrad.helpers import fetch
+from tinygrad import dtypes
+from tinygrad.helpers import DEV, fetch
 from typing import NamedTuple, Any, List
 import requests
 import argparse
@@ -80,7 +80,7 @@ if __name__ == "__main__":
  parser = argparse.ArgumentParser(description='Run Stable Diffusion', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--remoteweights', action='store_true', help="Use safetensors from Huggingface, or from local")
  args = parser.parse_args()
-  Device.DEFAULT = "WEBGPU"
+  DEV.value = "WEBGPU"

  model = StableDiffusion()

--- a/examples/webgpu/yolov8/compile.py
+++ b/examples/webgpu/yolov8/compile.py
@@ -4,10 +4,11 @@ from tinygrad.tensor import Tensor
 from tinygrad.nn.state import safe_save
 from extra.export_model import export_model
 from tinygrad.device import Device
+from tinygrad.helpers import DEV
 from tinygrad.nn.state import safe_load, load_state_dict

 if __name__ == "__main__":
-    Device.DEFAULT = "WEBGPU"
+    DEV.value = "WEBGPU"
    yolo_variant = 'n'
    yolo_infer = YOLOv8(w=0.25, r=2.0, d=0.33, num_classes=80)
    state_dict = safe_load(get_weights_location(yolo_variant))
--- a/test/external/external_benchmark_llama_schedule.py
+++ b/test/external/external_benchmark_llama_schedule.py
@@ -1,11 +1,11 @@
-from tinygrad import nn, Tensor, Device, dtypes
-from tinygrad.helpers import Timing
+from tinygrad import nn, Tensor, dtypes
+from tinygrad.helpers import DEV, Timing

 from extra.models.llama import Transformer
 from examples.llama3 import MODEL_PARAMS

 if __name__ == "__main__":
-  Device.DEFAULT = "NULL"
+  DEV.value = "NULL"
  Tensor.training = True
  #model_size = "8B"
  model_size = "405B"
--- a/test/external/external_model_benchmark.py
+++ b/test/external/external_model_benchmark.py
@@ -7,7 +7,7 @@ from onnx2torch import convert
 from tinygrad.nn.onnx import OnnxRunner
 from tinygrad.helpers import OSX, DEBUG, fetch, getenv
 from tinygrad.dtype import _to_np_dtype
-from tinygrad import Tensor, Device, dtypes
+from tinygrad import Tensor, Device, Context, dtypes

 MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/validated/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
@@ -60,16 +60,16 @@ def benchmark_model(m, devices, validate_outs=False):
  # print input names
  if DEBUG >= 2: print(list(runner.graph_inputs))
  for device in devices:
-    Device.DEFAULT = device
-    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
-    tinygrad_model = runner.to(device)
-    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})
+    with Context(DEV=device):
+      inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
+      tinygrad_model = runner.to(device)
+      benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

-    from tinygrad.engine.jit import TinyJit
-    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
-    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
-    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
-    del inputs, tinygrad_model, tinygrad_jitted_model
+      from tinygrad.engine.jit import TinyJit
+      tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
+      for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
+      benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}) # noqa: F821
+      del inputs, tinygrad_model, tinygrad_jitted_model

  # convert model to torch
  try:
@@ -104,22 +104,22 @@ def benchmark_model(m, devices, validate_outs=False):
  if validate_outs:
    for device in devices:
      rtol, atol = 2e-3, 2e-3  # tolerance for fp16 models
-      Device.DEFAULT = device
-      # force half inputs to float for numerical stability when validating
-      # this will rely on automatic dtype promotion for converting half weights inside the graph
-      if m in half_models:
-        inputs = {k:Tensor(inp, dtype=dtypes.float32) if inp.dtype == np.float16 else Tensor(inp) for k,inp in np_inputs.items()}
-      else:
-        inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
-      tinygrad_model = runner.to(device)
-      tinygrad_out = tinygrad_model(inputs)
+      with Context(DEV=device):
+        # force half inputs to float for numerical stability when validating
+        # this will rely on automatic dtype promotion for converting half weights inside the graph
+        if m in half_models:
+          inputs = {k:Tensor(inp, dtype=dtypes.float32) if inp.dtype == np.float16 else Tensor(inp) for k,inp in np_inputs.items()}
+        else:
+          inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
+        tinygrad_model = runner.to(device)
+        tinygrad_out = tinygrad_model(inputs)

-      ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
-      onnx_out = ort_sess.run(output_names, np_inputs)
-      onnx_out = dict([*list(zip(output_names, onnx_out))])
+        ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
+        onnx_out = ort_sess.run(output_names, np_inputs)
+        onnx_out = dict([*list(zip(output_names, onnx_out))])

-      assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
-      print(f"{m:16s}outputs validated on {device=} with rtol={rtol:.1e}, atol={atol:.1e}")
+        assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
+        print(f"{m:16s}outputs validated on {device=} with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
--- a/test/null/test_device.py
+++ b/test/null/test_device.py
@@ -26,13 +26,16 @@ class TestDevice(unittest.TestCase):

  def test_lowercase_canonicalizes(self):
    device = Device.DEFAULT
-    Device.DEFAULT = device.lower()
-    self.assertEqual(Device.canonicalize(None), device)
-    Device.DEFAULT = device
+    with Context(DEV=device.lower()):
+      self.assertEqual(Device.canonicalize(None), device)
+
+  def test_set_device_default_raises(self):
+    with self.assertRaisesRegex(AttributeError, "setting Device.DEFAULT is deprecated"):
+      Device.DEFAULT = "CPU"

  def test_old_device_env_raises(self):
    result = subprocess.run(['python3', '-c', 'from tinygrad import Device; Device.DEFAULT'],
-                            env={**os.environ, "CPU": "1"}, capture_output=True)
+                            env={**os.environ, "CPU": "1", "DEV": ""}, capture_output=True)
    self.assertNotEqual(result.returncode, 0)
    self.assertIn(b"deprecated", result.stderr)

@@ -95,6 +98,12 @@ class TestDevice(unittest.TestCase):
    with patch("tinygrad.renderer.cstyle.ClangJITRenderer.__init__", side_effect=RuntimeError("broken")):
      self.assertIsInstance(dev.renderer.compiler, CPULLVMCompiler)

+  def test_dev_contextvar(self):
+    orig_dev = Device.DEFAULT
+    with Context(DEV="CPU"): self.assertEqual(Tensor.empty(1).device, "CPU")
+    with Context(DEV="NULL"): self.assertEqual(Tensor.empty(1).device, "NULL")
+    self.assertEqual(Tensor.empty(1).device, orig_dev)
+
 class MockCompiler(Compiler):
  def __init__(self, key): super().__init__(key)
  def compile(self, src) -> bytes: return src.encode()
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -5,7 +5,7 @@ from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
 import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
 from tinygrad.helpers import BENCHMARKS, CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
 from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, ContextVar
-from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
+from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, DEV, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
 from tinygrad.helpers import EMULATE, EMULATED_DTYPES, NULL_IR3, NULL_QCOMCL, IMAGE, FLOAT16, TracingKey, size_to_str
 from tinygrad.dtype import DType, PtrDType, dtypes, _to_np_dtype
 if TYPE_CHECKING: from tinygrad.renderer import Renderer
@@ -39,11 +39,14 @@ class _Device:
  def get_available_devices(self) -> Iterator[str]:
    for device in ALL_DEVICES:
      with contextlib.suppress(Exception): yield self[device].device
+  @property
+  def DEFAULT(self) -> str: return DEV.value.upper() if DEV else self._select_device
+  @DEFAULT.setter
+  def DEFAULT(self, v): raise AttributeError(f'setting Device.DEFAULT is deprecated, use "with Context(DEV={v!r})" or "DEV.value = {v!r}"')
  @functools.cached_property
-  def DEFAULT(self) -> str:
+  def _select_device(self) -> str:
    assert (dev:=next((d for d in self._devices if d not in ["DISK", "TINYFS", "NPY"] and getenv(d) == 1), None)) is None, \
      f"{dev}=1 is deprecated, use DEV={dev} instead"
-    if (dev:=getenv("DEV", "").upper()): return dev
    try:
      device = next(self.get_available_devices())
      os.environ["DEV"] = device   # we set this in environment for spawned children
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -176,7 +176,7 @@ class ContextVar(Generic[T]):
    assert isinstance(self.value, str)
    return [getattr(obj, x) if obj else x for x in self.value.split(',') if x]

-DEBUG, BEAM, NOOPT = ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
+DEV, DEBUG, BEAM, NOOPT = ContextVar("DEV", ""), ContextVar("DEBUG", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
 IMAGE, FLOAT16, OPENPILOT_HACKS = ContextVar("IMAGE", 0), ContextVar("FLOAT16", 0), ContextVar("OPENPILOT_HACKS", 0)
 JIT, JIT_BATCH_SIZE = ContextVar("JIT", 2 if OSX and ARCH_X86 else 1), ContextVar("JIT_BATCH_SIZE", 32)
 WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)