mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
clean up gguf (#16160)
This commit is contained in:
parent
e97f2c1114
commit
3c806ff406
2 changed files with 39 additions and 38 deletions
|
|
@ -115,33 +115,35 @@ class TestGGUF(unittest.TestCase):
|
|||
with self.assertRaises(ValueError):
|
||||
ggml_data_to_tensor(Tensor.empty(512, dtype=dtypes.uint8), 256, 1337)
|
||||
|
||||
def test_multi_part_load(self):
|
||||
def build(n_total, part_no, tensors):
|
||||
# [header] [kv_data] [tensor_infos] [padding] [tensor_data_blob]
|
||||
buf = bytearray()
|
||||
# Header: magic "GGUF" + version=3 + n_tensors + n_kv=2
|
||||
buf += struct.pack("<4siqq", b"GGUF", 3, len(tensors), 2)
|
||||
# KV entries: [key_len: uint64][key bytes][type: int32][value]
|
||||
for k, v in [("split.count", n_total), ("split.no", part_no)]:
|
||||
kb = k.encode()
|
||||
buf += struct.pack("<Q", len(kb)) + kb + struct.pack("<i", 4) + struct.pack("<I", v)
|
||||
data_off = 0
|
||||
# Tensor infos: [name_len][name][ndims][dims reversed][qtype][offset_into_data_blob]
|
||||
for name, dims, qtype, data in tensors:
|
||||
nb = name.encode()
|
||||
buf += struct.pack("<Q", len(nb)) + nb + struct.pack("<I", len(dims))
|
||||
for d in reversed(dims): buf += struct.pack("<Q", d)
|
||||
buf += struct.pack("<i", qtype) + struct.pack("<Q", data_off)
|
||||
data_off += len(data)
|
||||
buf += b"\x00" * ((32 - len(buf) % 32) % 32)
|
||||
for _, _, _, data in tensors: buf += data
|
||||
return bytes(buf)
|
||||
@staticmethod
|
||||
def _build_gguf(tensors, kvs):
  """
  Serialize a minimal GGUF v3 file image, used as a fixture by the loader tests.

  Layout: [header] [kv_data] [tensor_infos] [padding] [tensor_data_blob]

  Args:
    tensors: sequence of (name, dims, qtype, data) tuples. `data` is the raw tensor bytes; `dims` is
      written in reversed order. Each tensor's offset is the running sum of the preceding `data` lengths.
    kvs: sequence of (key, value) pairs. `str` values are written with value type 8 (string);
      every other value is packed as value type 4 (uint32).

  Returns:
    The complete file contents as `bytes`.
  """
  def lp_str(s):
    # GGUF strings are [uint64 length][bytes]. The length must count the encoded *bytes*, not the
    # characters, otherwise any non-ASCII string produces a truncated/corrupt entry.
    raw = s.encode()
    return struct.pack("<Q", len(raw)) + raw

  buf = bytearray()
  # Header: magic "GGUF" + version=3 + n_tensors + n_kv
  buf += struct.pack("<4siqq", b"GGUF", 3, len(tensors), len(kvs))
  # KV entries: [key_len: uint64][key bytes][type: int32][value]
  for k, v in kvs:
    buf += lp_str(k)
    if isinstance(v, str): buf += struct.pack("<i", 8) + lp_str(v)
    else: buf += struct.pack("<i", 4) + struct.pack("<I", v)
  data_off = 0
  # Tensor infos: [name_len][name][ndims][dims reversed][qtype][offset_into_data_blob]
  for name, dims, qtype, data in tensors:
    buf += lp_str(name) + struct.pack("<I", len(dims))
    for d in reversed(dims): buf += struct.pack("<Q", d)
    # "<" means standard sizes with no alignment padding, so this is exactly int32 followed by uint64
    buf += struct.pack("<iQ", qtype, data_off)
    data_off += len(data)
  # zero-pad so the tensor data blob starts on a 32-byte boundary
  buf += b"\x00" * (-len(buf) % 32)
  for _, _, _, data in tensors: buf += data
  return bytes(buf)
|
||||
|
||||
def test_multi_part_load(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
d = pathlib.Path(d)
|
||||
a, b = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), np.array([5.0, 6.0], dtype=np.float32)
|
||||
(d / "test-00001-of-00002.gguf").write_bytes(build(2, 0, [("a", (4,), 0, a.tobytes())]))
|
||||
(d / "test-00002-of-00002.gguf").write_bytes(build(2, 1, [("b", (2,), 0, b.tobytes())]))
|
||||
(d / "test-00001-of-00002.gguf").write_bytes(self._build_gguf([("a", (4,), 0, a.tobytes())], [("split.count", 2), ("split.no", 0)]))
|
||||
(d / "test-00002-of-00002.gguf").write_bytes(self._build_gguf([("b", (2,), 0, b.tobytes())], [("split.count", 2), ("split.no", 1)]))
|
||||
kv, ts = gguf_load(d / "test-00001-of-00002.gguf")
|
||||
self.assertEqual(kv["split.count"], 2)
|
||||
np.testing.assert_equal(ts["a"].numpy(), a)
|
||||
|
|
|
|||
|
|
@ -12,6 +12,14 @@ def _ggml_iq_grid(device: str, grid: tuple[int, ...], grid_shape: tuple[int, int
|
|||
values = [float((w >> (8*i)) & 0xFF) for w in grid for i in range(grid_shape[1])]
|
||||
return Tensor(values, dtype=dtypes.float32, device=device).reshape(grid_shape)
|
||||
|
||||
# native types {ggml_type: dtype}
|
||||
_GGML_NATIVE = {0: dtypes.float32, 1: dtypes.float16, 24: dtypes.int8, 25: dtypes.int16,
|
||||
26: dtypes.int32, 27: dtypes.int64, 28: dtypes.float64, 30: dtypes.bfloat16}
|
||||
|
||||
# quant types {ggml_type: (number of elements, number of bytes)}
|
||||
_GGML_QUANT = {2:(32,18), 3:(32,20), 6:(32,22), 7:(32,24), 8:(32,34),
|
||||
12:(256,144), 13:(256,176), 14:(256,210), 18:(256,98), 21:(256,110), 22:(256,82), 23:(256,136), 39:(32,17), 41:(128,18)}
|
||||
|
||||
def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
|
||||
"""
|
||||
Converts ggml tensor data to a tinygrad tensor.
|
||||
|
|
@ -24,11 +32,7 @@ def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
|
|||
"""
|
||||
# https://github.com/ggerganov/ggml/blob/323951f1bdcdfbd5b5ff3a9a7c3770e63b1a560e/include/ggml.h#L356
|
||||
|
||||
# native types
|
||||
if (dtype := {
|
||||
0: dtypes.float32, 1: dtypes.float16, 24: dtypes.int8,
|
||||
25: dtypes.int16, 26: dtypes.int32, 27: dtypes.int64, 28: dtypes.float64, 30: dtypes.bfloat16,
|
||||
}.get(ggml_type)) is not None:
|
||||
if (dtype := _GGML_NATIVE.get(ggml_type)) is not None:
|
||||
return t[:dtype.itemsize * n].contiguous().bitcast(dtype)
|
||||
|
||||
def q_to_uint8(t: Tensor, b: int) -> Tensor:
|
||||
|
|
@ -36,12 +40,7 @@ def ggml_data_to_tensor(t: Tensor, n: int, ggml_type: int) -> Tensor:
|
|||
shift_tensor, bitmask = Tensor.stack(*[ Tensor(2**(i*b), device=t.device, dtype=t.dtype) for i in range(8//b) ]), 0xff >> (8 - b)
|
||||
return t.unsqueeze(-1).expand((*t.shape,8//b)).div(shift_tensor, rounding_mode="trunc").bitwise_and(bitmask).transpose(-1, -2).flatten(-2)
|
||||
|
||||
# map to (number of elements, number of bytes)
|
||||
if (nelements_nbytes := {
|
||||
2:(32,18), 3:(32,20), 6:(32,22), 7:(32,24), 8:(32,34),
|
||||
12:(256,144), 13:(256,176), 14:(256,210), 18:(256,98), 21:(256,110), 22:(256,82), 23:(256,136), 39:(32,17),
|
||||
41:(128,18)
|
||||
}.get(ggml_type)) is not None:
|
||||
if (nelements_nbytes := _GGML_QUANT.get(ggml_type)) is not None:
|
||||
from tinygrad.runtime.autogen import ggml_common as _ggml
|
||||
blocks = t[:(n//nelements_nbytes[0])*nelements_nbytes[1]].reshape((-1, nelements_nbytes[1])).contiguous()
|
||||
if ggml_type == 2: return (q_to_uint8(blocks[:,2:], 4).bitcast(dtypes.int8) - 8) * blocks[:,:2].bitcast(dtypes.float16).cast(dtypes.float32)
|
||||
|
|
@ -132,6 +131,8 @@ readers: dict[int, Callable[[io.BufferedIOBase], Any]] = { 8: read_str, 9: read_
|
|||
read_uint32, read_int32, read_uint64, read_int64 = readers[4], readers[5], readers[10], readers[11]
|
||||
|
||||
def _gguf_parse(tensor: Tensor) -> tuple[dict, dict[str, Tensor]]:
|
||||
# TODO: remove the need for copy to default device
|
||||
tensor = tensor.to(None).realize()
|
||||
r = io.BufferedReader(TensorIO(tensor), 1_000_000)
|
||||
magic, version, n_tensors, n_kv = r.read(4), read_int32(r), read_int64(r), read_int64(r)
|
||||
if magic != b"GGUF" or version not in [2, 3]: raise ValueError("Invalid GGUF format!")
|
||||
|
|
@ -169,10 +170,8 @@ def gguf_load(fn: Tensor|str|pathlib.Path) -> tuple[dict, dict[str, Tensor]]:
|
|||
|
||||
NOTE: The provided tensor must be on a device that supports execution.
|
||||
"""
|
||||
# TODO: remove the need for copy to default device
|
||||
def load(p): return _gguf_parse(p if isinstance(p, Tensor) else Tensor(p).to(None).realize())
|
||||
kv, sd = load(fn)
|
||||
kv, sd = _gguf_parse(fn if isinstance(fn, Tensor) else Tensor(pathlib.Path(fn)))
|
||||
if kv.get('split.count', 1) <= 1: return kv, sd
|
||||
if isinstance(fn, Tensor): raise ValueError("multi-part GGUF requires a path argument (got Tensor)")
|
||||
for pp in _gguf_split_paths(pathlib.Path(fn), kv)[1:]: sd.update(load(pp)[1])
|
||||
for pp in _gguf_split_paths(pathlib.Path(fn), kv)[1:]: sd.update(_gguf_parse(Tensor(pp))[1])
|
||||
return kv, sd
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue