mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Compare commits
1 commit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e401546bf0 |
1 changed files with 170 additions and 42 deletions
|
|
@ -7,7 +7,7 @@ from tinygrad.viz.serve import TCPServerWithReuse, HTTPRequestHandler
|
|||
|
||||
class SimpleTokenizer:
|
||||
def __init__(self, normal_tokens:dict[str, int], special_tokens:dict[str, int], preset:str="llama3"):
|
||||
if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo"): raise ValueError(f"Invalid tokenizer preset '{preset}'")
|
||||
if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo","deepseek"): raise ValueError(f"Invalid tokenizer preset '{preset}'")
|
||||
# https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
|
||||
bs = [*range(33, 127), *range(161, 173), *range(174, 256)] # bytes that map to themselves
|
||||
self._byte_decoder = {chr(b): b for b in bs} | {chr(256+i): b for i,b in enumerate(b for b in range(256) if b not in bs)}
|
||||
|
|
@ -56,10 +56,12 @@ class SimpleTokenizer:
|
|||
def role(self, role:str):
|
||||
if self.preset == 'olmo': return self.encode("<|" + role + "|>\n") # OLMoE Instruct format
|
||||
if self.preset == 'qwen2': return self.encode("<|im_start|>" + role + "\n")
|
||||
if self.preset == 'deepseek': return self.encode("<|" + role + "|>\n")
|
||||
return self.encode("<|start_header_id|>" + role + "<|end_header_id|>\n\n")
|
||||
def end_turn(self, eos_id:int):
|
||||
if self.preset == 'olmo': return self.encode("\n")
|
||||
if self.preset == 'qwen2': return [eos_id] + self.encode("\n")
|
||||
if self.preset == 'deepseek': return [eos_id]
|
||||
return [eos_id]
|
||||
|
||||
@functools.cache
|
||||
|
|
@ -84,7 +86,9 @@ def apply_rope(x:Tensor, freqs_cis:Tensor) -> Tensor:
|
|||
|
||||
class TransformerBlock:
|
||||
def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_kv_heads:int, norm_eps:float, head_dim:int, rope_theta:float,
|
||||
max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False):
|
||||
max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False,
|
||||
scoring_func:str="softmax", routed_scaling_factor:float=1.0, n_shared_experts:int=0, shared_expert_hidden_dim:int=0,
|
||||
kv_lora_rank:int=0, qk_nope_head_dim:int=0, qk_rope_head_dim:int=0, v_head_dim:int=0):
|
||||
self.n_heads = n_heads
|
||||
self.n_kv_heads = n_kv_heads
|
||||
self.head_dim = head_dim
|
||||
|
|
@ -92,13 +96,28 @@ class TransformerBlock:
|
|||
self.max_context = max_context
|
||||
self.qk_norm = qk_norm
|
||||
|
||||
# --- MLA (Multi-head Latent Attention) config -------------------------
|
||||
self.kv_lora_rank = kv_lora_rank
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.qk_rope_head_dim = qk_rope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
|
||||
# --- attention projections (all linear, bias-free) ------------------
|
||||
q_proj_out = self.head_dim * n_heads
|
||||
kv_proj_out = self.head_dim * n_kv_heads
|
||||
self.attn_q = nn.Linear(dim, q_proj_out, bias=False)
|
||||
self.attn_k = nn.Linear(dim, kv_proj_out, bias=False)
|
||||
self.attn_v = nn.Linear(dim, kv_proj_out, bias=False)
|
||||
self.attn_output = nn.Linear(q_proj_out, dim, bias=False)
|
||||
if kv_lora_rank > 0:
|
||||
# MLA: query projects to n_heads * (nope + rope), KV uses compressed latent
|
||||
q_head_dim = qk_nope_head_dim + qk_rope_head_dim
|
||||
self.attn_q = nn.Linear(dim, n_heads * q_head_dim, bias=False)
|
||||
self.attn_kv_a_mqa = nn.Linear(dim, kv_lora_rank + qk_rope_head_dim, bias=False)
|
||||
self.attn_kv_a_norm = nn.RMSNorm(kv_lora_rank, norm_eps)
|
||||
self.attn_kv_b = nn.Linear(kv_lora_rank, n_heads * (qk_nope_head_dim + v_head_dim), bias=False)
|
||||
self.attn_output = nn.Linear(n_heads * v_head_dim, dim, bias=False)
|
||||
else:
|
||||
q_proj_out = self.head_dim * n_heads
|
||||
kv_proj_out = self.head_dim * n_kv_heads
|
||||
self.attn_q = nn.Linear(dim, q_proj_out, bias=False)
|
||||
self.attn_k = nn.Linear(dim, kv_proj_out, bias=False)
|
||||
self.attn_v = nn.Linear(dim, kv_proj_out, bias=False)
|
||||
self.attn_output = nn.Linear(q_proj_out, dim, bias=False)
|
||||
|
||||
# --- RMSNorms --------------------------------------------------------
|
||||
self.attn_norm = nn.RMSNorm(dim, norm_eps)
|
||||
|
|
@ -109,10 +128,18 @@ class TransformerBlock:
|
|||
if num_experts > 0:
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.scoring_func = scoring_func
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.ffn_gate_inp = nn.Linear(dim, num_experts, bias=False) # router
|
||||
if scoring_func == "sigmoid": self.exp_probs_b = Tensor.zeros(num_experts) # e_score_correction_bias for noaux_tc
|
||||
self.ffn_gate_exps = ExpertWeights(num_experts, dim, hidden_dim)
|
||||
self.ffn_up_exps = ExpertWeights(num_experts, dim, hidden_dim)
|
||||
self.ffn_down_exps = ExpertWeights(num_experts, hidden_dim, dim)
|
||||
# shared experts (always active, separate dense MLP)
|
||||
if n_shared_experts > 0:
|
||||
self.ffn_gate_shexp = nn.Linear(dim, shared_expert_hidden_dim, bias=False)
|
||||
self.ffn_up_shexp = nn.Linear(dim, shared_expert_hidden_dim, bias=False)
|
||||
self.ffn_down_shexp = nn.Linear(shared_expert_hidden_dim, dim, bias=False)
|
||||
else:
|
||||
self.ffn_gate = nn.Linear(dim, hidden_dim, bias=False)
|
||||
self.ffn_up = nn.Linear(dim, hidden_dim, bias=False)
|
||||
|
|
@ -120,33 +147,70 @@ class TransformerBlock:
|
|||
|
||||
@function
|
||||
def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
|
||||
x_norm = self.attn_norm(x) # (B,T,D)
|
||||
q, k, v = self.attn_q(x_norm), self.attn_k(x_norm), self.attn_v(x_norm)
|
||||
if self.qk_norm and self.qk_norm != self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
|
||||
|
||||
B, T, _ = x.shape
|
||||
q = q.reshape(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B,H,T,Hd)
|
||||
k = k.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B,KvH,T,Hd)
|
||||
v = v.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B,KvH,T,Hd)
|
||||
if self.qk_norm == self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
|
||||
x_norm = self.attn_norm(x) # (B,T,D)
|
||||
|
||||
q = apply_rope(q, self.freqs_cis[start_pos:start_pos+T])
|
||||
k = apply_rope(k, self.freqs_cis[start_pos:start_pos+T])
|
||||
if self.kv_lora_rank > 0:
|
||||
# --- MLA (Multi-head Latent Attention) path ---
|
||||
q = self.attn_q(x_norm) # (B,T, n_heads*(nope+rope))
|
||||
q = q.reshape(B, T, self.n_heads, self.qk_nope_head_dim + self.qk_rope_head_dim).transpose(1, 2) # (B,H,T,nope+rope)
|
||||
q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
|
||||
|
||||
# NOTE: we don't want to change self.cache_kv, the function API doesn't support this well
|
||||
assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v).uop)))
|
||||
k = assigned_kv[0, :, :, 0:start_pos+T, :]
|
||||
v = assigned_kv[1, :, :, 0:start_pos+T, :]
|
||||
# compress KV into latent + rope key
|
||||
compressed_kv = self.attn_kv_a_mqa(x_norm) # (B,T, kv_lora_rank + rope)
|
||||
compressed_kv, k_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
|
||||
k_pe = k_pe.reshape(B, T, 1, self.qk_rope_head_dim).transpose(1, 2) # (B,1,T,rope)
|
||||
|
||||
#self.cache_kv[:, :, :, start_pos:start_pos+T, :].assign(Tensor.stack(k, v))
|
||||
#k = self.cache_kv[0, :, :, 0:start_pos+T, :]
|
||||
#v = self.cache_kv[1, :, :, 0:start_pos+T, :]
|
||||
# decompress latent to per-head K_nope and V
|
||||
kv = self.attn_kv_b(self.attn_kv_a_norm(compressed_kv)) # (B,T, n_heads*(nope+v))
|
||||
kv = kv.reshape(B, T, self.n_heads, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2) # (B,H,T,nope+v)
|
||||
k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
|
||||
|
||||
# apply RoPE only to the rope portion of Q and K
|
||||
q_pe = apply_rope(q_pe, self.freqs_cis[start_pos:start_pos+T])
|
||||
k_pe = apply_rope(k_pe, self.freqs_cis[start_pos:start_pos+T])
|
||||
|
||||
# reassemble full Q and K: [nope, rope]
|
||||
q = q_nope.cat(q_pe, dim=-1) # (B,H,T,nope+rope)
|
||||
k = k_nope.cat(k_pe.expand(-1, self.n_heads, -1, -1), dim=-1) # (B,H,T,nope+rope)
|
||||
|
||||
# KV cache — K and V have different last dims, pad V to match K for stacking
|
||||
q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
|
||||
if self.v_head_dim < q_head_dim:
|
||||
v_padded = v.pad((0, q_head_dim - self.v_head_dim)) # pad last dim
|
||||
else:
|
||||
v_padded = v
|
||||
assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v_padded).uop)))
|
||||
k = assigned_kv[0, :, :, 0:start_pos+T, :]
|
||||
v = assigned_kv[1, :, :, 0:start_pos+T, :self.v_head_dim] # unpad V
|
||||
|
||||
mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
|
||||
attn = q.scaled_dot_product_attention(k, v, attn_mask=mask) # (B,H,T,v_head_dim)
|
||||
attn = attn.transpose(1, 2).reshape(B, T, self.n_heads * self.v_head_dim)
|
||||
else:
|
||||
# --- standard GQA/MHA path ---
|
||||
q, k, v = self.attn_q(x_norm), self.attn_k(x_norm), self.attn_v(x_norm)
|
||||
if self.qk_norm and self.qk_norm != self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
|
||||
|
||||
q = q.reshape(B, T, self.n_heads, self.head_dim).transpose(1, 2) # (B,H,T,Hd)
|
||||
k = k.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B,KvH,T,Hd)
|
||||
v = v.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2) # (B,KvH,T,Hd)
|
||||
if self.qk_norm == self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
|
||||
|
||||
q = apply_rope(q, self.freqs_cis[start_pos:start_pos+T])
|
||||
k = apply_rope(k, self.freqs_cis[start_pos:start_pos+T])
|
||||
|
||||
# NOTE: we don't want to change self.cache_kv, the function API doesn't support this well
|
||||
assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v).uop)))
|
||||
k = assigned_kv[0, :, :, 0:start_pos+T, :]
|
||||
v = assigned_kv[1, :, :, 0:start_pos+T, :]
|
||||
|
||||
# NOTE: this mask is causal_lower_right, not the causal_upper_left generated by is_casual = True
|
||||
# TODO: this if statement should be removed and it shouldn't generate extra kernels
|
||||
mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
|
||||
attn = q.scaled_dot_product_attention(k, v, attn_mask=mask, enable_gqa=True) # (B,H,T,Hd)
|
||||
attn = attn.transpose(1, 2).reshape(B, T, -1) # back to (B,T,D)
|
||||
|
||||
# NOTE: this mask is causal_lower_right, not the causal_upper_left generated by is_casual = True
|
||||
# TODO: this if statement should be removed and it shouldn't generate extra kernels
|
||||
mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
|
||||
attn = q.scaled_dot_product_attention(k, v, attn_mask=mask, enable_gqa=True) # (B,H,T,Hd)
|
||||
attn = attn.transpose(1, 2).reshape(B, T, -1) # back to (B,T,D)
|
||||
attn = self.attn_output(attn)
|
||||
return x + attn
|
||||
|
||||
|
|
@ -155,19 +219,37 @@ class TransformerBlock:
|
|||
h_norm = self.ffn_norm(h)
|
||||
if hasattr(self, 'ffn_gate_exps'):
|
||||
x = h_norm.unsqueeze(2) # (B, T, 1, D) - add expert dim for broadcasting
|
||||
probs, sel = self.ffn_gate_inp(h_norm).softmax(-1).topk(self.num_experts_per_tok) # (B, T, k) each
|
||||
if self.scoring_func == "sigmoid":
|
||||
scores = self.ffn_gate_inp(h_norm).sigmoid() # (B, T, num_experts)
|
||||
# noaux_tc: add learned bias for top-k selection, then select top-k from original scores
|
||||
scores_for_choice = scores + self.exp_probs_b.reshape(1, 1, -1)
|
||||
_, sel = scores_for_choice.topk(self.num_experts_per_tok) # (B, T, k)
|
||||
probs = scores.gather(2, sel) # (B, T, k) - gather original scores at selected indices
|
||||
else:
|
||||
probs, sel = self.ffn_gate_inp(h_norm).softmax(-1).topk(self.num_experts_per_tok) # (B, T, k) each
|
||||
if self.norm_topk_prob: probs = probs / probs.sum(axis=-1, keepdim=True)
|
||||
probs = probs * self.routed_scaling_factor
|
||||
x_down = self.ffn_down_exps(sel, self.ffn_gate_exps(sel, x).silu() * self.ffn_up_exps(sel, x)) # (B, T, k, D)
|
||||
return h + (x_down * probs.unsqueeze(-1)).sum(axis=2) # (B, T, D)
|
||||
moe_out = (x_down * probs.unsqueeze(-1)).sum(axis=2) # (B, T, D)
|
||||
# add shared expert output if present
|
||||
if hasattr(self, 'ffn_gate_shexp'):
|
||||
moe_out = moe_out + self.ffn_down_shexp(self.ffn_gate_shexp(h_norm).silu() * self.ffn_up_shexp(h_norm))
|
||||
return h + moe_out
|
||||
# TODO: remove the need for this contiguous
|
||||
gated = self.ffn_gate(h_norm).silu().contiguous() * self.ffn_up(h_norm)
|
||||
return h + self.ffn_down(gated)
|
||||
|
||||
def __call__(self, x: Tensor, start_pos: int|UOp):
|
||||
if not hasattr(self, "cache_kv"):
|
||||
# TODO: how is the dtype of this determined?
|
||||
self.cache_kv = Tensor.empty(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device)
|
||||
self.freqs_cis = precompute_freqs_cis(self.head_dim, self.max_context, self.rope_theta)
|
||||
if self.kv_lora_rank > 0:
|
||||
# MLA: K cache stores full decompressed K (nope+rope), V cache stores full decompressed V
|
||||
q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
|
||||
self.cache_kv = Tensor.empty(2, x.shape[0], self.n_heads, self.max_context, max(q_head_dim, self.v_head_dim), device=x.device)
|
||||
self.freqs_cis = precompute_freqs_cis(self.qk_rope_head_dim, self.max_context, self.rope_theta)
|
||||
else:
|
||||
# TODO: how is the dtype of this determined?
|
||||
self.cache_kv = Tensor.empty(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device)
|
||||
self.freqs_cis = precompute_freqs_cis(self.head_dim, self.max_context, self.rope_theta)
|
||||
# we pass in the weights implicitly so we unpack the GGUF on the fly
|
||||
@function(precompile=True, allow_implicit=True)
|
||||
def _run(x:Tensor, start_pos:int|UOp): return self._feed_forward(self._attention(x, start_pos)).contiguous()
|
||||
|
|
@ -175,9 +257,19 @@ class TransformerBlock:
|
|||
|
||||
class Transformer:
|
||||
def __init__(self, *, num_blocks, dim, hidden_dim, n_heads, n_kv_heads, norm_eps, vocab_size, head_dim:int, rope_theta:float,
|
||||
max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False):
|
||||
self.blk = [TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, head_dim, rope_theta, max_context, qk_norm,
|
||||
num_experts, num_experts_per_tok, norm_topk_prob) for _ in range(num_blocks)]
|
||||
max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False,
|
||||
scoring_func:str="softmax", routed_scaling_factor:float=1.0, n_shared_experts:int=0, shared_expert_hidden_dim:int=0,
|
||||
first_k_dense_replace:int=0, dense_hidden_dim:int=0,
|
||||
kv_lora_rank:int=0, qk_nope_head_dim:int=0, qk_rope_head_dim:int=0, v_head_dim:int=0):
|
||||
def make_block(i):
|
||||
is_moe = num_experts > 0 and i >= first_k_dense_replace
|
||||
return TransformerBlock(dim, hidden_dim if is_moe else (dense_hidden_dim or hidden_dim), n_heads, n_kv_heads, norm_eps,
|
||||
head_dim, rope_theta, max_context, qk_norm,
|
||||
num_experts if is_moe else 0, num_experts_per_tok if is_moe else 0, norm_topk_prob,
|
||||
scoring_func if is_moe else "softmax", routed_scaling_factor if is_moe else 1.0,
|
||||
n_shared_experts if is_moe else 0, shared_expert_hidden_dim if is_moe else 0,
|
||||
kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim)
|
||||
self.blk = [make_block(i) for i in range(num_blocks)]
|
||||
self.token_embd = nn.Embedding(vocab_size, dim)
|
||||
self.output_norm = nn.RMSNorm(dim, norm_eps)
|
||||
self.output = nn.Linear(dim, vocab_size, bias=False)
|
||||
|
|
@ -218,6 +310,35 @@ class Transformer:
|
|||
if 'attn_q.weight' in name: state_dict[name] = state_dict[name].rearrange("(n h two) d -> (n two h) d", n=n_heads, two=2)
|
||||
if 'attn_k.weight' in name: state_dict[name] = state_dict[name].rearrange("(n h two) d -> (n two h) d", n=n_kv_heads, two=2)
|
||||
|
||||
# MLA-specific GGUF metadata (deepseek2 architecture)
|
||||
kv_lora_rank = kv.get(f'{arch}.attention.kv_lora_rank', 0)
|
||||
qk_rope_head_dim = kv.get(f'{arch}.rope.dimension_count', 0) if kv_lora_rank else 0
|
||||
# key_length_mla = qk_nope_head_dim + qk_rope_head_dim
|
||||
qk_nope_head_dim = kv.get(f'{arch}.attention.key_length_mla', 0) - qk_rope_head_dim if kv_lora_rank else 0
|
||||
v_head_dim = kv.get(f'{arch}.attention.value_length_mla', 0) if kv_lora_rank else 0
|
||||
# deepseek2 MoE config
|
||||
scoring_func = "sigmoid" if 'blk.1.exp_probs_b.bias' in state_dict else "softmax"
|
||||
routed_scaling_factor = kv.get(f'{arch}.expert_weights_scale', 1.0)
|
||||
n_shared_experts = kv.get(f'{arch}.expert_shared_count', 0)
|
||||
expert_ff_len = kv.get(f'{arch}.expert_feed_forward_length', 0)
|
||||
shared_expert_hidden_dim = expert_ff_len * n_shared_experts if n_shared_experts else 0
|
||||
first_k_dense_replace = kv.get(f'{arch}.leading_dense_block_count', 0)
|
||||
|
||||
# for MLA, GGUF stores kv_b as a single tensor; we need to handle that during weight loading
|
||||
# llama.cpp conversion may split kv_b into attn_k_b + attn_v_b, or keep as attn_kv_b
|
||||
# reassemble split k_b/v_b into single kv_b for our model if needed
|
||||
if kv_lora_rank:
|
||||
for name in list(state_dict.keys()):
|
||||
if 'attn_k_b.weight' in name:
|
||||
prefix = name.replace('attn_k_b.weight', '')
|
||||
k_b = state_dict.pop(name) # (nope, kv_lora_rank, n_heads) in GGUF 3D
|
||||
v_b = state_dict.pop(f'{prefix}attn_v_b.weight') # (kv_lora_rank, v_head_dim, n_heads)
|
||||
# k_b was transposed during conversion: (qk_nope_head_dim, kv_lora_rank, n_heads)
|
||||
# undo transpose: -> (kv_lora_rank, qk_nope_head_dim, n_heads) -> reshape to 2D Linear weight
|
||||
k_b_2d = k_b.permute(2, 1, 0).reshape(-1, kv_lora_rank) # (n_heads*qk_nope_head_dim, kv_lora_rank)
|
||||
v_b_2d = v_b.permute(2, 1, 0).reshape(-1, kv_lora_rank) # (n_heads*v_head_dim, kv_lora_rank)
|
||||
state_dict[f'{prefix}attn_kv_b.weight'] = k_b_2d.cat(v_b_2d, dim=0) # (n_heads*(nope+v), kv_lora_rank)
|
||||
|
||||
model = Transformer(num_blocks=kv[f'{arch}.block_count'], dim=kv[f'{arch}.embedding_length'],
|
||||
hidden_dim=kv.get(f'{arch}.expert_feed_forward_length', kv[f'{arch}.feed_forward_length']),
|
||||
n_heads=n_heads, n_kv_heads=n_kv_heads, norm_eps=kv[f'{arch}.attention.layer_norm_rms_epsilon'],
|
||||
|
|
@ -226,7 +347,13 @@ class Transformer:
|
|||
rope_theta=kv[f'{arch}.rope.freq_base'], max_context=max_context,
|
||||
qk_norm=int(state_dict['blk.0.attn_q_norm.weight'].shape[0]) if 'blk.0.attn_q_norm.weight' in state_dict else 0,
|
||||
num_experts=kv.get(f'{arch}.expert_count', 0), num_experts_per_tok=kv.get(f'{arch}.expert_used_count', 0),
|
||||
norm_topk_prob=True if arch=='qwen3moe' else False)
|
||||
norm_topk_prob=True if arch in ('qwen3moe', 'deepseek2') else False,
|
||||
scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor,
|
||||
n_shared_experts=n_shared_experts, shared_expert_hidden_dim=shared_expert_hidden_dim,
|
||||
first_k_dense_replace=first_k_dense_replace,
|
||||
dense_hidden_dim=kv.get(f'{arch}.feed_forward_length', 0),
|
||||
kv_lora_rank=kv_lora_rank, qk_nope_head_dim=qk_nope_head_dim,
|
||||
qk_rope_head_dim=qk_rope_head_dim, v_head_dim=v_head_dim)
|
||||
nn.state.load_state_dict(model, state_dict, verbose=False, consume=True, realize=False) # NOTE: rope_freqs.weight (32,) is unused
|
||||
# NOTE: without this contiguous, it unpacks the weights from the model every time. we shouldn't need this, but for now it's faster
|
||||
if realize:
|
||||
|
|
@ -268,6 +395,7 @@ models = {
|
|||
"qwen3:8b": "https://huggingface.co/Qwen/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-Q4_K_M.gguf",
|
||||
"qwen3:30b-a3b": "https://huggingface.co/Qwen/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf",
|
||||
"olmoe": "https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF/resolve/main/olmoe-1b-7b-0924-instruct-q4_k_m.gguf",
|
||||
"moonlight": "https://huggingface.co/gabriellarson/Moonlight-16B-A3B-Instruct-GGUF/resolve/main/Moonlight-16B-A3B-Instruct-Q4_K_M.gguf",
|
||||
}
|
||||
|
||||
# *** simple OpenAI compatible server on 11434 to match ollama ***
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue