class QwenChat:
    """Minimal multi-turn chat session wrapper around a Qwen model/processor pair.

    Maintains a chat ``history`` (list of role dicts), renders it through the
    processor's chat template, and supports both blocking and streaming
    generation with optional <think> reasoning capture. Sessions can be saved
    to / restored from JSON.

    Relies on module-level names defined elsewhere in this file:
    ``SAMPLING`` (preset dict), ``split_thinking``, ``THINK_CLOSE``,
    ``TextIteratorStreamer``, ``torch``, ``threading``, ``json``.
    """

    def __init__(self, model, processor, system=None, tools=None):
        self.model, self.processor = model, processor
        self.tokenizer = processor.tokenizer
        # Chat turns in chat-template format: {"role": ..., "content": ...}.
        self.history: list[dict] = []
        if system:
            self.history.append({"role": "system", "content": system})
        self.tools = tools

    def user(self, content):
        """Append a user turn; returns self for chaining."""
        self.history.append({"role": "user", "content": content})
        return self

    def assistant(self, content, reasoning=""):
        """Append an assistant turn, attaching reasoning_content when non-empty."""
        m = {"role": "assistant", "content": content}
        if reasoning:
            m["reasoning_content"] = reasoning
        self.history.append(m)
        return self

    def tool_result(self, name, result):
        """Append a tool-result turn; non-str results are JSON-encoded."""
        self.history.append({
            "role": "tool", "name": name,
            "content": result if isinstance(result, str) else json.dumps(result),
        })
        return self

    def _inputs(self, enable_thinking, preserve_thinking):
        # Render the full history through the chat template and move the
        # resulting tensors to the model's device.
        return self.processor.apply_chat_template(
            self.history, tools=self.tools, tokenize=True,
            add_generation_prompt=True, return_dict=True, return_tensors="pt",
            enable_thinking=enable_thinking, preserve_thinking=preserve_thinking,
        ).to(self.model.device)

    def generate(self, *, enable_thinking=True, preserve_thinking=False,
                 max_new_tokens=2048, preset="thinking_general",
                 stopping_criteria=None, append_to_history=True):
        """Run one blocking generation; returns (thinking_text, answer_text)."""
        inp = self._inputs(enable_thinking, preserve_thinking)
        cfg = SAMPLING[preset]
        gk = dict(**inp, max_new_tokens=max_new_tokens, do_sample=True,
                  temperature=cfg["temperature"], top_p=cfg["top_p"], top_k=cfg["top_k"],
                  repetition_penalty=1.0,
                  pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id)
        if stopping_criteria is not None:
            gk["stopping_criteria"] = stopping_criteria
        with torch.inference_mode():
            out = self.model.generate(**gk)
        # Decode only the newly generated tokens (everything past the prompt).
        raw = self.tokenizer.decode(out[0, inp["input_ids"].shape[-1]:], skip_special_tokens=True)
        think, ans = split_thinking(raw)
        if append_to_history:
            self.assistant(ans, reasoning=think)
        return think, ans

    def stream(self, *, enable_thinking=True, preserve_thinking=False,
               max_new_tokens=2048, preset="thinking_general",
               on_thinking=None, on_answer=None):
        """Stream a generation, invoking callbacks as text arrives.

        ``on_thinking`` receives incremental thinking text (before THINK_CLOSE),
        ``on_answer`` receives incremental answer text. The final turn is
        appended to history. Returns (thinking_text, answer_text), stripped.
        """
        inp = self._inputs(enable_thinking, preserve_thinking)
        cfg = SAMPLING[preset]
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        gk = dict(**inp, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True,
                  temperature=cfg["temperature"], top_p=cfg["top_p"], top_k=cfg["top_k"],
                  pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id)
        # generate() blocks, so it runs on a worker thread while we consume
        # the streamer on this one.
        t = threading.Thread(target=self.model.generate, kwargs=gk)
        t.start()
        buf, in_think = "", enable_thinking
        think_text, answer_text = "", ""
        for piece in streamer:
            buf += piece
            if in_think:
                if THINK_CLOSE in buf:
                    # Split the buffer at the close marker: everything before
                    # is thinking, everything after starts the answer.
                    close_at = buf.index(THINK_CLOSE)
                    resid = buf[:close_at]
                    if on_thinking:
                        on_thinking(resid[len(think_text):])
                    think_text = resid
                    buf = buf[close_at + len(THINK_CLOSE):]
                    in_think = False
                    if buf and on_answer:
                        on_answer(buf)
                    answer_text = buf
                    buf = ""
                else:
                    if on_thinking:
                        on_thinking(piece)
                    think_text += piece
            else:
                if on_answer:
                    on_answer(piece)
                answer_text += piece
        t.join()
        self.assistant(answer_text.strip(), reasoning=think_text.strip())
        return think_text.strip(), answer_text.strip()

    def save(self, path):
        """Persist history and tool schema to a JSON file at ``path``."""
        with open(path, "w") as f:
            json.dump({"history": self.history, "tools": self.tools}, f, indent=2)

    @classmethod
    def load(cls, model, processor, path):
        """Restore a session previously written by ``save``."""
        with open(path) as f:
            data = json.load(f)
        c = cls(model, processor, tools=data.get("tools"))
        c.history = data["history"]
        return c
class ThinkingBudget(StoppingCriteria):
    """Stopping criterion that caps the length of the thinking segment.

    Once THINK_OPEN has appeared in the generated sequence, generation is
    stopped after ``budget`` tokens unless THINK_CLOSE shows up first.
    """

    def __init__(self, tokenizer, budget: int):
        self.budget = budget
        self.open_ids = tokenizer.encode(THINK_OPEN, add_special_tokens=False)
        self.close_ids = tokenizer.encode(THINK_CLOSE, add_special_tokens=False)
        # Token index just past THINK_OPEN, once it has been observed.
        self.start = None

    def _find(self, seq, needle):
        # Index of the first occurrence of list ``needle`` inside list ``seq``,
        # or None. Naive scan; sequences here are short enough that this is fine.
        n = len(needle)
        for i in range(len(seq) - n + 1):
            if seq[i:i + n] == needle:
                return i
        return None

    def __call__(self, input_ids, scores, **kwargs):
        seq = input_ids[0].tolist()
        if self.start is None:
            # Still waiting for the thinking block to open.
            idx = self._find(seq, self.open_ids)
            if idx is not None:
                self.start = idx + len(self.open_ids)
            return False
        # Thinking already closed naturally: never stop on budget grounds.
        if self._find(seq[self.start:], self.close_ids) is not None:
            return False
        return (len(seq) - self.start) >= self.budget
# Matches one JSON object wrapped in <tool_call>...</tool_call> tags (DOTALL so
# the payload may span lines); group(1) is the raw JSON text.
# NOTE(review): the scraped source lost backslashes and the tag text
# (r"s*({.*?})s* "); reconstructed as the standard Qwen tool-call wrapper —
# confirm against the model's actual emitted format.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.S)
def run_calculate(expr: str) -> str:
    """Evaluate a basic arithmetic expression and return a JSON string.

    Returns ``{"result": value}`` on success or ``{"error": message}`` on
    failure (illegal characters or an evaluation error such as division by
    zero).
    """
    # Character whitelist: digits, arithmetic operators, parens, '%', '.'
    # and spaces — identifiers/attribute access can never reach eval().
    if any(c not in "0123456789+-*/().% " for c in expr):
        return json.dumps({"error": "illegal chars"})
    try:
        # NOTE(review): eval() on filtered input with builtins stripped; the
        # whitelist blocks names, but prefer a real expression parser if the
        # grammar ever grows.
        return json.dumps({"result": eval(expr, {"__builtins__": {}}, {})})
    except Exception as e:
        return json.dumps({"error": str(e)})
# Tiny in-memory "documentation corpus": lowercase topic keyword -> snippet.
_DOCS = {
    "qwen3.6": "Qwen3.6-35B-A3B is a 35B MoE with 3B active params and 262k native context.",
    "deltanet": "Gated DeltaNet is a linear-attention variant used in Qwen3.6's hybrid layers.",
    "moe": "Qwen3.6 uses 256 experts with 8 routed + 1 shared per token.",
}

def run_search_docs(q):
    """Return a JSON string of doc snippets whose keyword appears in the query."""
    needle = q.lower()
    matches = [snippet for keyword, snippet in _DOCS.items() if keyword in needle]
    if not matches:
        matches = ["no hits"]
    return json.dumps({"results": matches})
def run_get_time():
    """Return the current UTC time as a JSON string: {"iso": "<ISO-8601>Z"}."""
    import datetime as dt
    # datetime.utcnow() is deprecated (naive result); take an aware UTC now
    # and drop the tzinfo so isoformat() keeps the original "...Z" shape
    # instead of emitting "+00:00".
    now = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    return json.dumps({"iso": now.isoformat() + "Z"})
# Dispatch table: tool name -> callable taking the parsed argument dict.
# Keys must match the function names advertised in TOOLS_SCHEMA.
TOOL_FNS = {
    "calculate": lambda args: run_calculate(args["expression"]),
    "search_docs": lambda args: run_search_docs(args["query"]),
    "get_time": lambda args: run_get_time(),
}
# OpenAI-style function schemas advertised to the model for tool calling.
# Each "name" here must have a matching handler registered in TOOL_FNS.
TOOLS_SCHEMA = [
    {"type":"function","function":{"name":"calculate","description":"Evaluate arithmetic.",
        "parameters":{"type":"object","properties":{"expression":{"type":"string"}},"required":["expression"]}}},
    {"type":"function","function":{"name":"search_docs","description":"Search internal docs.",
        "parameters":{"type":"object","properties":{"query":{"type":"string"}},"required":["query"]}}},
    {"type":"function","function":{"name":"get_time","description":"Get current UTC time.",
        "parameters":{"type":"object","properties":{}}}},
]
Subscribe to Updates
Get the latest tech insights from TechnologiesDigest.com on AI, innovation, and the future of digital technology.
Trending
- The AI engineering stack we built internally — and the platform we ship
- Identiv expands ID-Secure NFC tags with tamper evidence and encrypted authentication for connected packaging
- A Coding Implementation on Qwen 3.6-35B-A3B Covering Multimodal Inference, Thinking Control, Tool Calling, MoE Routing, RAG, and Session Persistence
- Ripple wants the XRP Ledger to be quantum-proof by 2028. Here is its plan
- The Gentlemen ransomware now uses SystemBC for bot-powered attacks
- Got bugs? Here's how to catch the errors in your scientific software
- CX Exchange 2026: State Department's Matt Pierce on iteratively meeting record demand for U.S. passports
- Motorola Moto G (2026) review: Why I'd pick this $200 phone over competing models



