deepsek work

2026-06-24 02:14:17 +00:00 · 2026-03-25 10:53:39 +08:00
1 changed files with 170 additions and 42 deletions
--- a/tinygrad/apps/llm.py
+++ b/tinygrad/apps/llm.py
@ -7,7 +7,7 @@ from tinygrad.viz.serve import TCPServerWithReuse, HTTPRequestHandler

 class SimpleTokenizer:
  def __init__(self, normal_tokens:dict[str, int], special_tokens:dict[str, int], preset:str="llama3"):
-    if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo"): raise ValueError(f"Invalid tokenizer preset '{preset}'")
+    if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo","deepseek"): raise ValueError(f"Invalid tokenizer preset '{preset}'")
    # https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
    bs = [*range(33, 127), *range(161, 173), *range(174, 256)]  # bytes that map to themselves
    self._byte_decoder = {chr(b): b for b in bs} | {chr(256+i): b for i,b in enumerate(b for b in range(256) if b not in bs)}
@ -56,10 +56,12 @@ class SimpleTokenizer:
  def role(self, role:str):
    if self.preset == 'olmo': return self.encode("<|" + role + "|>\n")  # OLMoE Instruct format
    if self.preset == 'qwen2': return self.encode("<|im_start|>" + role + "\n")
+    if self.preset == 'deepseek': return self.encode("<|" + role + "|>\n")
    return self.encode("<|start_header_id|>" + role + "<|end_header_id|>\n\n")
  def end_turn(self, eos_id:int):
    if self.preset == 'olmo': return self.encode("\n")
    if self.preset == 'qwen2': return [eos_id] + self.encode("\n")
+    if self.preset == 'deepseek': return [eos_id]
    return [eos_id]

@functools.cache
@ -84,7 +86,9 @@ def apply_rope(x:Tensor, freqs_cis:Tensor) -> Tensor:

 class TransformerBlock:
  def __init__(self, dim:int, hidden_dim:int, n_heads:int, n_kv_heads:int, norm_eps:float, head_dim:int, rope_theta:float,
-               max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False):
+               max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False,
+               scoring_func:str="softmax", routed_scaling_factor:float=1.0, n_shared_experts:int=0, shared_expert_hidden_dim:int=0,
+               kv_lora_rank:int=0, qk_nope_head_dim:int=0, qk_rope_head_dim:int=0, v_head_dim:int=0):
    self.n_heads      = n_heads
    self.n_kv_heads   = n_kv_heads
    self.head_dim     = head_dim
@ -92,13 +96,28 @@ class TransformerBlock:
    self.max_context  = max_context
    self.qk_norm      = qk_norm

+    # --- MLA (Multi-head Latent Attention) config -------------------------
+    self.kv_lora_rank      = kv_lora_rank
+    self.qk_nope_head_dim  = qk_nope_head_dim
+    self.qk_rope_head_dim  = qk_rope_head_dim
+    self.v_head_dim        = v_head_dim
+
    # --- attention projections (all linear, bias-free) ------------------
-    q_proj_out       = self.head_dim * n_heads
-    kv_proj_out      = self.head_dim * n_kv_heads
-    self.attn_q      = nn.Linear(dim, q_proj_out,  bias=False)
-    self.attn_k      = nn.Linear(dim, kv_proj_out, bias=False)
-    self.attn_v      = nn.Linear(dim, kv_proj_out, bias=False)
-    self.attn_output = nn.Linear(q_proj_out, dim,  bias=False)
+    if kv_lora_rank > 0:
+      # MLA: query projects to n_heads * (nope + rope), KV uses compressed latent
+      q_head_dim = qk_nope_head_dim + qk_rope_head_dim
+      self.attn_q      = nn.Linear(dim, n_heads * q_head_dim, bias=False)
+      self.attn_kv_a_mqa = nn.Linear(dim, kv_lora_rank + qk_rope_head_dim, bias=False)
+      self.attn_kv_a_norm = nn.RMSNorm(kv_lora_rank, norm_eps)
+      self.attn_kv_b   = nn.Linear(kv_lora_rank, n_heads * (qk_nope_head_dim + v_head_dim), bias=False)
+      self.attn_output = nn.Linear(n_heads * v_head_dim, dim, bias=False)
+    else:
+      q_proj_out       = self.head_dim * n_heads
+      kv_proj_out      = self.head_dim * n_kv_heads
+      self.attn_q      = nn.Linear(dim, q_proj_out,  bias=False)
+      self.attn_k      = nn.Linear(dim, kv_proj_out, bias=False)
+      self.attn_v      = nn.Linear(dim, kv_proj_out, bias=False)
+      self.attn_output = nn.Linear(q_proj_out, dim,  bias=False)

    # --- RMSNorms --------------------------------------------------------
    self.attn_norm   = nn.RMSNorm(dim, norm_eps)
@ -109,10 +128,18 @@ class TransformerBlock:
    if num_experts > 0:
      self.norm_topk_prob = norm_topk_prob
      self.num_experts_per_tok = num_experts_per_tok
+      self.scoring_func = scoring_func
+      self.routed_scaling_factor = routed_scaling_factor
      self.ffn_gate_inp = nn.Linear(dim, num_experts, bias=False)  # router
+      if scoring_func == "sigmoid": self.exp_probs_b = Tensor.zeros(num_experts)  # e_score_correction_bias for noaux_tc
      self.ffn_gate_exps = ExpertWeights(num_experts, dim, hidden_dim)
      self.ffn_up_exps = ExpertWeights(num_experts, dim, hidden_dim)
      self.ffn_down_exps = ExpertWeights(num_experts, hidden_dim, dim)
+      # shared experts (always active, separate dense MLP)
+      if n_shared_experts > 0:
+        self.ffn_gate_shexp = nn.Linear(dim, shared_expert_hidden_dim, bias=False)
+        self.ffn_up_shexp   = nn.Linear(dim, shared_expert_hidden_dim, bias=False)
+        self.ffn_down_shexp = nn.Linear(shared_expert_hidden_dim, dim, bias=False)
    else:
      self.ffn_gate    = nn.Linear(dim, hidden_dim, bias=False)
      self.ffn_up      = nn.Linear(dim, hidden_dim, bias=False)
@ -120,33 +147,70 @@ class TransformerBlock:

  @function
  def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
-    x_norm = self.attn_norm(x)                       # (B,T,D)
-    q, k, v = self.attn_q(x_norm), self.attn_k(x_norm), self.attn_v(x_norm)
-    if self.qk_norm and self.qk_norm != self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
-
    B, T, _ = x.shape
-    q = q.reshape(B, T, self.n_heads,    self.head_dim).transpose(1, 2)  # (B,H,T,Hd)
-    k = k.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B,KvH,T,Hd)
-    v = v.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B,KvH,T,Hd)
-    if self.qk_norm == self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
+    x_norm = self.attn_norm(x)                       # (B,T,D)

-    q = apply_rope(q, self.freqs_cis[start_pos:start_pos+T])
-    k = apply_rope(k, self.freqs_cis[start_pos:start_pos+T])
+    if self.kv_lora_rank > 0:
+      # --- MLA (Multi-head Latent Attention) path ---
+      q = self.attn_q(x_norm)                        # (B,T, n_heads*(nope+rope))
+      q = q.reshape(B, T, self.n_heads, self.qk_nope_head_dim + self.qk_rope_head_dim).transpose(1, 2)  # (B,H,T,nope+rope)
+      q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

-    # NOTE: we don't want to change self.cache_kv, the function API doesn't support this well
-    assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v).uop)))
-    k = assigned_kv[0, :, :, 0:start_pos+T, :]
-    v = assigned_kv[1, :, :, 0:start_pos+T, :]
+      # compress KV into latent + rope key
+      compressed_kv = self.attn_kv_a_mqa(x_norm)     # (B,T, kv_lora_rank + rope)
+      compressed_kv, k_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+      k_pe = k_pe.reshape(B, T, 1, self.qk_rope_head_dim).transpose(1, 2)   # (B,1,T,rope)

-    #self.cache_kv[:, :, :, start_pos:start_pos+T, :].assign(Tensor.stack(k, v))
-    #k = self.cache_kv[0, :, :, 0:start_pos+T, :]
-    #v = self.cache_kv[1, :, :, 0:start_pos+T, :]
+      # decompress latent to per-head K_nope and V
+      kv = self.attn_kv_b(self.attn_kv_a_norm(compressed_kv))  # (B,T, n_heads*(nope+v))
+      kv = kv.reshape(B, T, self.n_heads, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2)  # (B,H,T,nope+v)
+      k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+      # apply RoPE only to the rope portion of Q and K
+      q_pe = apply_rope(q_pe, self.freqs_cis[start_pos:start_pos+T])
+      k_pe = apply_rope(k_pe, self.freqs_cis[start_pos:start_pos+T])
+
+      # reassemble full Q and K: [nope, rope]
+      q = q_nope.cat(q_pe, dim=-1)                   # (B,H,T,nope+rope)
+      k = k_nope.cat(k_pe.expand(-1, self.n_heads, -1, -1), dim=-1)  # (B,H,T,nope+rope)
+
+      # KV cache — K and V have different last dims, pad V to match K for stacking
+      q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+      if self.v_head_dim < q_head_dim:
+        v_padded = v.pad((0, q_head_dim - self.v_head_dim))  # pad last dim
+      else:
+        v_padded = v
+      assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v_padded).uop)))
+      k = assigned_kv[0, :, :, 0:start_pos+T, :]
+      v = assigned_kv[1, :, :, 0:start_pos+T, :self.v_head_dim]  # unpad V
+
+      mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
+      attn = q.scaled_dot_product_attention(k, v, attn_mask=mask)  # (B,H,T,v_head_dim)
+      attn = attn.transpose(1, 2).reshape(B, T, self.n_heads * self.v_head_dim)
+    else:
+      # --- standard GQA/MHA path ---
+      q, k, v = self.attn_q(x_norm), self.attn_k(x_norm), self.attn_v(x_norm)
+      if self.qk_norm and self.qk_norm != self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
+
+      q = q.reshape(B, T, self.n_heads,    self.head_dim).transpose(1, 2)  # (B,H,T,Hd)
+      k = k.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B,KvH,T,Hd)
+      v = v.reshape(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B,KvH,T,Hd)
+      if self.qk_norm == self.head_dim: q, k = self.attn_q_norm(q), self.attn_k_norm(k)
+
+      q = apply_rope(q, self.freqs_cis[start_pos:start_pos+T])
+      k = apply_rope(k, self.freqs_cis[start_pos:start_pos+T])
+
+      # NOTE: we don't want to change self.cache_kv, the function API doesn't support this well
+      assigned_kv = Tensor(self.cache_kv.uop.after(self.cache_kv[:, :, :, start_pos:start_pos+T, :].uop.store(Tensor.stack(k, v).uop)))
+      k = assigned_kv[0, :, :, 0:start_pos+T, :]
+      v = assigned_kv[1, :, :, 0:start_pos+T, :]
+
+      # NOTE: this mask is causal_lower_right, not the causal_upper_left generated by is_casual = True
+      # TODO: this if statement should be removed and it shouldn't generate extra kernels
+      mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
+      attn = q.scaled_dot_product_attention(k, v, attn_mask=mask, enable_gqa=True)     # (B,H,T,Hd)
+      attn = attn.transpose(1, 2).reshape(B, T, -1)                                    # back to (B,T,D)

-    # NOTE: this mask is causal_lower_right, not the causal_upper_left generated by is_casual = True
-    # TODO: this if statement should be removed and it shouldn't generate extra kernels
-    mask = Tensor.full((1, 1, T, start_pos+T), float("-inf"), dtype=x.dtype, device=x.device).triu(start_pos+1) if resolve(T != 1) else None
-    attn = q.scaled_dot_product_attention(k, v, attn_mask=mask, enable_gqa=True)     # (B,H,T,Hd)
-    attn = attn.transpose(1, 2).reshape(B, T, -1)                                    # back to (B,T,D)
    attn = self.attn_output(attn)
    return x + attn

@ -155,19 +219,37 @@ class TransformerBlock:
    h_norm = self.ffn_norm(h)
    if hasattr(self, 'ffn_gate_exps'):
      x = h_norm.unsqueeze(2)  # (B, T, 1, D) - add expert dim for broadcasting
-      probs, sel = self.ffn_gate_inp(h_norm).softmax(-1).topk(self.num_experts_per_tok)  # (B, T, k) each
+      if self.scoring_func == "sigmoid":
+        scores = self.ffn_gate_inp(h_norm).sigmoid()  # (B, T, num_experts)
+        # noaux_tc: add learned bias for top-k selection, then select top-k from original scores
+        scores_for_choice = scores + self.exp_probs_b.reshape(1, 1, -1)
+        _, sel = scores_for_choice.topk(self.num_experts_per_tok)  # (B, T, k)
+        probs = scores.gather(2, sel)  # (B, T, k) - gather original scores at selected indices
+      else:
+        probs, sel = self.ffn_gate_inp(h_norm).softmax(-1).topk(self.num_experts_per_tok)  # (B, T, k) each
      if self.norm_topk_prob: probs = probs / probs.sum(axis=-1, keepdim=True)
+      probs = probs * self.routed_scaling_factor
      x_down = self.ffn_down_exps(sel, self.ffn_gate_exps(sel, x).silu() * self.ffn_up_exps(sel, x))  # (B, T, k, D)
-      return h + (x_down * probs.unsqueeze(-1)).sum(axis=2)  # (B, T, D)
+      moe_out = (x_down * probs.unsqueeze(-1)).sum(axis=2)  # (B, T, D)
+      # add shared expert output if present
+      if hasattr(self, 'ffn_gate_shexp'):
+        moe_out = moe_out + self.ffn_down_shexp(self.ffn_gate_shexp(h_norm).silu() * self.ffn_up_shexp(h_norm))
+      return h + moe_out
    # TODO: remove the need for this contiguous
    gated  = self.ffn_gate(h_norm).silu().contiguous() * self.ffn_up(h_norm)
    return h + self.ffn_down(gated)

  def __call__(self, x: Tensor, start_pos: int|UOp):
    if not hasattr(self, "cache_kv"):
-      # TODO: how is the dtype of this determined?
-      self.cache_kv = Tensor.empty(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device)
-      self.freqs_cis = precompute_freqs_cis(self.head_dim, self.max_context, self.rope_theta)
+      if self.kv_lora_rank > 0:
+        # MLA: K cache stores full decompressed K (nope+rope), V cache stores full decompressed V
+        q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+        self.cache_kv = Tensor.empty(2, x.shape[0], self.n_heads, self.max_context, max(q_head_dim, self.v_head_dim), device=x.device)
+        self.freqs_cis = precompute_freqs_cis(self.qk_rope_head_dim, self.max_context, self.rope_theta)
+      else:
+        # TODO: how is the dtype of this determined?
+        self.cache_kv = Tensor.empty(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device)
+        self.freqs_cis = precompute_freqs_cis(self.head_dim, self.max_context, self.rope_theta)
    # we pass in the weights implicitly so we unpack the GGUF on the fly
    @function(precompile=True, allow_implicit=True)
    def _run(x:Tensor, start_pos:int|UOp): return self._feed_forward(self._attention(x, start_pos)).contiguous()
@ -175,9 +257,19 @@ class TransformerBlock:

 class Transformer:
  def __init__(self, *, num_blocks, dim, hidden_dim, n_heads, n_kv_heads, norm_eps, vocab_size, head_dim:int, rope_theta:float,
-               max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False):
-    self.blk = [TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, head_dim, rope_theta, max_context, qk_norm,
-                                 num_experts, num_experts_per_tok, norm_topk_prob) for _ in range(num_blocks)]
+               max_context:int=0, qk_norm:int=0, num_experts:int=0, num_experts_per_tok:int=0, norm_topk_prob:bool=False,
+               scoring_func:str="softmax", routed_scaling_factor:float=1.0, n_shared_experts:int=0, shared_expert_hidden_dim:int=0,
+               first_k_dense_replace:int=0, dense_hidden_dim:int=0,
+               kv_lora_rank:int=0, qk_nope_head_dim:int=0, qk_rope_head_dim:int=0, v_head_dim:int=0):
+    def make_block(i):
+      is_moe = num_experts > 0 and i >= first_k_dense_replace
+      return TransformerBlock(dim, hidden_dim if is_moe else (dense_hidden_dim or hidden_dim), n_heads, n_kv_heads, norm_eps,
+                              head_dim, rope_theta, max_context, qk_norm,
+                              num_experts if is_moe else 0, num_experts_per_tok if is_moe else 0, norm_topk_prob,
+                              scoring_func if is_moe else "softmax", routed_scaling_factor if is_moe else 1.0,
+                              n_shared_experts if is_moe else 0, shared_expert_hidden_dim if is_moe else 0,
+                              kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim)
+    self.blk = [make_block(i) for i in range(num_blocks)]
    self.token_embd  = nn.Embedding(vocab_size, dim)
    self.output_norm = nn.RMSNorm(dim, norm_eps)
    self.output = nn.Linear(dim, vocab_size, bias=False)
@ -218,6 +310,35 @@ class Transformer:
        if 'attn_q.weight' in name: state_dict[name] = state_dict[name].rearrange("(n h two) d -> (n two h) d", n=n_heads, two=2)
        if 'attn_k.weight' in name: state_dict[name] = state_dict[name].rearrange("(n h two) d -> (n two h) d", n=n_kv_heads, two=2)

+    # MLA-specific GGUF metadata (deepseek2 architecture)
+    kv_lora_rank = kv.get(f'{arch}.attention.kv_lora_rank', 0)
+    qk_rope_head_dim = kv.get(f'{arch}.rope.dimension_count', 0) if kv_lora_rank else 0
+    # key_length_mla = qk_nope_head_dim + qk_rope_head_dim
+    qk_nope_head_dim = kv.get(f'{arch}.attention.key_length_mla', 0) - qk_rope_head_dim if kv_lora_rank else 0
+    v_head_dim = kv.get(f'{arch}.attention.value_length_mla', 0) if kv_lora_rank else 0
+    # deepseek2 MoE config
+    scoring_func = "sigmoid" if 'blk.1.exp_probs_b.bias' in state_dict else "softmax"
+    routed_scaling_factor = kv.get(f'{arch}.expert_weights_scale', 1.0)
+    n_shared_experts = kv.get(f'{arch}.expert_shared_count', 0)
+    expert_ff_len = kv.get(f'{arch}.expert_feed_forward_length', 0)
+    shared_expert_hidden_dim = expert_ff_len * n_shared_experts if n_shared_experts else 0
+    first_k_dense_replace = kv.get(f'{arch}.leading_dense_block_count', 0)
+
+    # for MLA, GGUF stores kv_b as a single tensor; we need to handle that during weight loading
+    # llama.cpp conversion may split kv_b into attn_k_b + attn_v_b, or keep as attn_kv_b
+    # reassemble split k_b/v_b into single kv_b for our model if needed
+    if kv_lora_rank:
+      for name in list(state_dict.keys()):
+        if 'attn_k_b.weight' in name:
+          prefix = name.replace('attn_k_b.weight', '')
+          k_b = state_dict.pop(name)                 # (nope, kv_lora_rank, n_heads) in GGUF 3D
+          v_b = state_dict.pop(f'{prefix}attn_v_b.weight')  # (kv_lora_rank, v_head_dim, n_heads)
+          # k_b was transposed during conversion: (qk_nope_head_dim, kv_lora_rank, n_heads)
+          # undo transpose: -> (kv_lora_rank, qk_nope_head_dim, n_heads) -> reshape to 2D Linear weight
+          k_b_2d = k_b.permute(2, 1, 0).reshape(-1, kv_lora_rank)   # (n_heads*qk_nope_head_dim, kv_lora_rank)
+          v_b_2d = v_b.permute(2, 1, 0).reshape(-1, kv_lora_rank)   # (n_heads*v_head_dim, kv_lora_rank)
+          state_dict[f'{prefix}attn_kv_b.weight'] = k_b_2d.cat(v_b_2d, dim=0)  # (n_heads*(nope+v), kv_lora_rank)
+
    model = Transformer(num_blocks=kv[f'{arch}.block_count'], dim=kv[f'{arch}.embedding_length'],
                        hidden_dim=kv.get(f'{arch}.expert_feed_forward_length', kv[f'{arch}.feed_forward_length']),
                        n_heads=n_heads, n_kv_heads=n_kv_heads, norm_eps=kv[f'{arch}.attention.layer_norm_rms_epsilon'],
@ -226,7 +347,13 @@ class Transformer:
                        rope_theta=kv[f'{arch}.rope.freq_base'], max_context=max_context,
                        qk_norm=int(state_dict['blk.0.attn_q_norm.weight'].shape[0]) if 'blk.0.attn_q_norm.weight' in state_dict else 0,
                        num_experts=kv.get(f'{arch}.expert_count', 0), num_experts_per_tok=kv.get(f'{arch}.expert_used_count', 0),
-                        norm_topk_prob=True if arch=='qwen3moe' else False)
+                        norm_topk_prob=True if arch in ('qwen3moe', 'deepseek2') else False,
+                        scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor,
+                        n_shared_experts=n_shared_experts, shared_expert_hidden_dim=shared_expert_hidden_dim,
+                        first_k_dense_replace=first_k_dense_replace,
+                        dense_hidden_dim=kv.get(f'{arch}.feed_forward_length', 0),
+                        kv_lora_rank=kv_lora_rank, qk_nope_head_dim=qk_nope_head_dim,
+                        qk_rope_head_dim=qk_rope_head_dim, v_head_dim=v_head_dim)
    nn.state.load_state_dict(model, state_dict, verbose=False, consume=True, realize=False)  # NOTE: rope_freqs.weight (32,) is unused
    # NOTE: without this contiguous, it unpacks the weights from the model every time. we shouldn't need this, but for now it's faster
    if realize:
@ -268,6 +395,7 @@ models = {
  "qwen3:8b": "https://huggingface.co/Qwen/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-Q4_K_M.gguf",
  "qwen3:30b-a3b": "https://huggingface.co/Qwen/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf",
  "olmoe": "https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF/resolve/main/olmoe-1b-7b-0924-instruct-q4_k_m.gguf",
+  "moonlight": "https://huggingface.co/gabriellarson/Moonlight-16B-A3B-Instruct-GGUF/resolve/main/Moonlight-16B-A3B-Instruct-Q4_K_M.gguf",
 }

 # *** simple OpenAI compatible server on 11434 to match ollama ***