clean up gguf (#16160)

2026-06-24 02:14:17 +00:00 · 2026-05-13 12:16:10 +08:00 · 2026-05-13 12:16:10 +08:00 · 3c806ff406
commit 3c806ff406
parent e97f2c1114
2 changed files with 39 additions and 38 deletions
--- a/test/unit/test_gguf.py
+++ b/test/unit/test_gguf.py
@ -115,33 +115,35 @@ class TestGGUF(unittest.TestCase):
    with self.assertRaises(ValueError):
      ggml_data_to_tensor(Tensor.empty(512, dtype=dtypes.uint8), 256, 1337)

-  def test_multi_part_load(self):
-    def build(n_total, part_no, tensors):
-      # [header] [kv_data] [tensor_infos] [padding] [tensor_data_blob]
-      buf = bytearray()
-      # Header: magic "GGUF" + version=3 + n_tensors + n_kv=2
-      buf += struct.pack("<4siqq", b"GGUF", 3, len(tensors), 2)
-      # KV entries: [key_len: uint64][key bytes][type: int32][value]
-      for k, v in [("split.count", n_total), ("split.no", part_no)]:
-        kb = k.encode()
-        buf += struct.pack("<Q", len(kb)) + kb + struct.pack("<i", 4) + struct.pack("<I", v)
-      data_off = 0
-      # Tensor infos: [name_len][name][ndims][dims reversed][qtype][offset_into_data_blob]
-      for name, dims, qtype, data in tensors:
-        nb = name.encode()
-        buf += struct.pack("<Q", len(nb)) + nb + struct.pack("<I", len(dims))
-        for d in reversed(dims): buf += struct.pack("<Q", d)
-        buf += struct.pack("<i", qtype) + struct.pack("<Q", data_off)
-        data_off += len(data)
-      buf += b"\x00" * ((32 - len(buf) % 32) % 32)
-      for _, _, _, data in tensors: buf += data
-      return bytes(buf)
+  @staticmethod
+  def _build_gguf(tensors, kvs):
+    # [header] [kv_data] [tensor_infos] [padding] [tensor_data_blob]
+    buf = bytearray()
+    # Header: magic "GGUF" + version=3 + n_tensors + n_kv
+    buf += struct.pack("<4siqq", b"GGUF", 3, len(tensors), len(kvs))
+    # KV entries: [key_len: uint64][key bytes][type: int32][value]
+    for k, v in kvs:
+      kb = k.encode()
+      if isinstance(v, str): buf += struct.pack("<Q", len(kb)) + kb + struct.pack("<i", 8) + struct.pack("<Q", len(v)) + v.encode()
+      else: buf += struct.pack("<Q", len(kb)) + kb + struct.pack("<i", 4) + struct.pack("<I", v)
+    data_off = 0
+    # Tensor infos: [name_len][name][ndims][dims reversed][qtype][offset_into_data_blob]
+    for name, dims, qtype, data in tensors:
+      nb = name.encode()
+      buf += struct.pack("<Q", len(nb)) + nb + struct.pack("<I", len(dims))
+      for d in reversed(dims): buf += struct.pack("<Q", d)
+      buf += struct.pack("<i", qtype) + struct.pack("<Q", data_off)
+      data_off += len(data)
+    buf += b"\x00" * ((32 - len(buf) % 32) % 32)
+    for _, _, _, data in tensors: buf += data
+    return bytes(buf)

+  def test_multi_part_load(self):
    with tempfile.TemporaryDirectory() as d:
      d = pathlib.Path(d)
      a, b = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), np.array([5.0, 6.0], dtype=np.float32)
-      (d / "test-00001-of-00002.gguf").write_bytes(build(2, 0, [("a", (4,), 0, a.tobytes())]))
-      (d / "test-00002-of-00002.gguf").write_bytes(build(2, 1, [("b", (2,), 0, b.tobytes())]))
+      (d / "test-00001-of-00002.gguf").write_bytes(self._build_gguf([("a", (4,), 0, a.tobytes())], [("split.count", 2), ("split.no", 0)]))
+      (d / "test-00002-of-00002.gguf").write_bytes(self._build_gguf([("b", (2,), 0, b.tobytes())], [("split.count", 2), ("split.no", 1)]))
      kv, ts = gguf_load(d / "test-00001-of-00002.gguf")
      self.assertEqual(kv["split.count"], 2)
      np.testing.assert_equal(ts["a"].numpy(), a)
--- a/tinygrad/llm/gguf.py
+++ b/tinygrad/llm/gguf.py
@ -12,6 +12,14 @@ def _ggml_iq_grid(device: str, grid: tuple[int, ...], grid_shape: tuple[int, int
  values = [float((w >> (8*i)) & 0xFF) for w in grid for i in range(grid_shape[1])]
  return Tensor(values, dtype=dtypes.float32, device=device).reshape(grid_shape)

+# native types {ggml_type: dtype}
+_GGML_NATIVE = {0: dtypes.float32, 1: dtypes.float16, 24: dtypes.int8, 25: dtypes.int16,
+                26: dtypes.int32, 27: dtypes.int64, 28: dtypes.float64, 30: dtypes.bfloat16}
+
+# quant types {ggml_type: (number of elements, number of bytes)}
+_GGML_QUANT = {2:(32,18), 3:(32,20), 6:(32,22), 7:(32,24), 8:(32,34),
+               12:(256,144), 13:(256,176), 14:(256,210), 18:(256,98), 21:(256,110), 22:(256,82), 23:(256,136), 39:(32,17), 41:(128,18)}
+
 def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
  """
  Converts ggml tensor data to a tinygrad tensor.
@ -24,11 +32,7 @@ def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
  """
  # https://github.com/ggerganov/ggml/blob/323951f1bdcdfbd5b5ff3a9a7c3770e63b1a560e/include/ggml.h#L356

-  # native types
-  if (dtype := {
-    0: dtypes.float32, 1: dtypes.float16, 24: dtypes.int8,
-    25: dtypes.int16, 26: dtypes.int32, 27: dtypes.int64, 28: dtypes.float64, 30: dtypes.bfloat16,
-  }.get(ggml_type)) is not None:
+  if (dtype := _GGML_NATIVE.get(ggml_type)) is not None:
    return t[:dtype.itemsize * n].contiguous().bitcast(dtype)

  def q_to_uint8(t: Tensor, b: int) -> Tensor:
@ -36,12 +40,7 @@ def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
    shift_tensor, bitmask = Tensor.stack(*[ Tensor(2**(i*b), device=t.device, dtype=t.dtype) for i in range(8//b) ]), 0xff >> (8 - b)
    return t.unsqueeze(-1).expand((*t.shape,8//b)).div(shift_tensor, rounding_mode="trunc").bitwise_and(bitmask).transpose(-1, -2).flatten(-2)

-  # map to (number of elements, number of bytes)
-  if (nelements_nbytes := {
-    2:(32,18), 3:(32,20), 6:(32,22), 7:(32,24), 8:(32,34),
-    12:(256,144), 13:(256,176), 14:(256,210), 18:(256,98), 21:(256,110), 22:(256,82), 23:(256,136), 39:(32,17),
-    41:(128,18)
-  }.get(ggml_type)) is not None:
+  if (nelements_nbytes := _GGML_QUANT.get(ggml_type)) is not None:
    from tinygrad.runtime.autogen import ggml_common as _ggml
    blocks = t[:(n//nelements_nbytes[0])*nelements_nbytes[1]].reshape((-1, nelements_nbytes[1])).contiguous()
    if ggml_type == 2: return (q_to_uint8(blocks[:,2:], 4).bitcast(dtypes.int8) - 8) * blocks[:,:2].bitcast(dtypes.float16).cast(dtypes.float32)
@ -132,6 +131,8 @@ readers: dict[int, Callable[[io.BufferedIOBase], Any]] = { 8: read_str, 9: read_
 read_uint32, read_int32, read_uint64, read_int64 = readers[4], readers[5], readers[10], readers[11]

 def _gguf_parse(tensor: Tensor) -> tuple[dict, dict[str, Tensor]]:
+  # TODO: remove the need for copy to default device
+  tensor = tensor.to(None).realize()
  r = io.BufferedReader(TensorIO(tensor), 1_000_000)
  magic, version, n_tensors, n_kv = r.read(4), read_int32(r), read_int64(r), read_int64(r)
  if magic != b"GGUF" or version not in [2, 3]: raise ValueError("Invalid GGUF format!")
@ -169,10 +170,8 @@ def gguf_load(fn: Tensor|str|pathlib.Path) -> tuple[dict, dict[str, Tensor]]:

  NOTE: The provided tensor must be on a device that supports execution.
  """
-  # TODO: remove the need for copy to default device
-  def load(p): return _gguf_parse(p if isinstance(p, Tensor) else Tensor(p).to(None).realize())
-  kv, sd = load(fn)
+  kv, sd = _gguf_parse(fn if isinstance(fn, Tensor) else Tensor(pathlib.Path(fn)))
  if kv.get('split.count', 1) <= 1: return kv, sd
  if isinstance(fn, Tensor): raise ValueError("multi-part GGUF requires a path argument (got Tensor)")
-  for pp in _gguf_split_paths(pathlib.Path(fn), kv)[1:]: sd.update(load(pp)[1])
+  for pp in _gguf_split_paths(pathlib.Path(fn), kv)[1:]: sd.update(_gguf_parse(Tensor(pp))[1])
  return kv, sd