Commit b929f8b

WIP for context management ratio
1 parent 16ac72a · commit b929f8b

File tree: 3 files changed, +99 -50 lines

  asimov/caches/cache.py
  asimov/services/inference_clients.py
  asimov/utils/token_counter.py

asimov/caches/cache.py

Lines changed: 5 additions & 6 deletions

@@ -38,21 +38,20 @@ async def apply_key_modifications(self, key: str) -> str:
 
     @asynccontextmanager
     async def with_prefix(self, prefix: str):
-        old_prefix = await self.get_prefix()
-        self._prefix.set(prefix)
+        token = self._prefix.set(prefix)
         try:
             yield self
         finally:
-            self._prefix.set(old_prefix)
+            self._prefix.reset(token)
 
     @asynccontextmanager
     async def with_suffix(self, suffix: str):
-        old_suffix = await self.get_suffix()
-        self._suffix.set(suffix)
+        token = self._suffix.set(suffix)
        try:
             yield self
         finally:
-            self._suffix.set(old_suffix)
+            self._suffix.reset(token)
+
 
     def __getitem__(self, key: str):
         return self.get(key)
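
Side note on the change above: ContextVar.set() returns a Token that records the previous value, and reset(token) restores exactly that prior state, so nested or interleaved with_prefix blocks unwind correctly where re-setting a manually saved value could clobber another task's prefix. A minimal standalone sketch of the pattern (module-level names here are illustrative, not the cache class itself):

import asyncio
from contextlib import asynccontextmanager
from contextvars import ContextVar

_prefix: ContextVar[str] = ContextVar("prefix", default="")

@asynccontextmanager
async def with_prefix(prefix: str):
    # set() returns a Token remembering the value this context held before
    token = _prefix.set(prefix)
    try:
        yield
    finally:
        # reset() rolls back to exactly the state before this set(),
        # which stays correct even when contexts are nested
        _prefix.reset(token)

async def main():
    async with with_prefix("outer:"):
        async with with_prefix("inner:"):
            assert _prefix.get() == "inner:"
        assert _prefix.get() == "outer:"   # restored by reset(token)
    assert _prefix.get() == ""             # back to the default

asyncio.run(main())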

asimov/services/inference_clients.py

Lines changed: 46 additions & 44 deletions

@@ -21,6 +21,7 @@
 import google.auth
 
 from asimov.asimov_base import AsimovBase
+from asimov.utils.token_counter import approx_tokens_from_serialized_messages
 from asimov.graph import NonRetryableException
 
 logger = logging.getLogger(__name__)

@@ -160,7 +161,7 @@ async def tool_chain(
                 Awaitable[tuple[str, List[Tuple[Callable, Dict[str, Any]]], Hashable]],
             ]
         ] = None,
-        fifo_context: bool = False,
+        fifo_ratio: Optional[float] = None,
     ):
         mode = None
         if mode_swap_callback:

@@ -208,6 +209,46 @@ async def tool_chain(
 
                 last_mode_cached_message[mode] = len(serialized_messages) - 1
 
+            tokens = approx_tokens_from_serialized_messages(serialized_messages)
+
+            if fifo_ratio and (tokens / 200000) > fifo_ratio:
+                logger.info(
+                    f"ContextLengthExceeded ({e}), tossing early messages and retrying"
+                )
+                # If we hit context length, remove a handful of assistant,user message pairs from the middle
+                # A handful so that we can hopefully get at least a couple cache hits with this
+                # truncated history before having to drop messages again.
+
+                # We want the earliest thing we remove to be an assistant message (requesting the next tool call), which have odd indices
+                start_remove = int(len(serialized_messages) / 3)
+                if start_remove % 2 != 1:
+                    start_remove += 1
+                # And the last thing we remove should be a user message (with tool response), which have even indices
+                end_remove = int(len(serialized_messages) * 2 / 3)
+                if end_remove % 2 != 0:
+                    end_remove -= 1
+                logger.debug(
+                    f"Removing messages {start_remove} through {end_remove} from serialized messages"
+                )
+                end_remove += 1  # inclusive
+                serialized_messages = (
+                    serialized_messages[:start_remove]
+                    + serialized_messages[end_remove:]
+                )
+                for mode in last_mode_cached_message.keys():
+                    # Delete markers if they are in the removed range
+                    if (
+                        start_remove
+                        <= last_mode_cached_message[mode]
+                        < end_remove
+                    ):
+                        del last_mode_cached_message[mode]
+                    # And adjust indices of anything that got "slid" back
+                    elif last_mode_cached_message[mode] >= end_remove:
+                        last_mode_cached_message[mode] -= (
+                            end_remove - start_remove
+                        )
+
             for retry in range(1, 5):
                 try:
                     resp = await self._tool_chain_stream(

@@ -229,49 +270,10 @@ async def tool_chain(
                     await self._trace(serialized_messages, resp)
                     break
                 except ContextLengthExceeded as e:
-                    if fifo_context:
-                        logger.info(
-                            f"ContextLengthExceeded ({e}), tossing early messages and retrying"
-                        )
-                        # If we hit context length, remove a handful of assistant,user message pairs from the middle
-                        # A handful so that we can hopefully get at least a couple cache hits with this
-                        # truncated history before having to drop messages again.
-
-                        # We want the earliest thing we remove to be an assistant message (requesting the next tool call), which have odd indices
-                        start_remove = int(len(serialized_messages) / 3)
-                        if start_remove % 2 != 1:
-                            start_remove += 1
-                        # And the last thing we remove should be a user message (with tool response), which have even indices
-                        end_remove = int(len(serialized_messages) * 2 / 3)
-                        if end_remove % 2 != 0:
-                            end_remove -= 1
-                        logger.debug(
-                            f"Removing messages {start_remove} through {end_remove} from serialized messages"
-                        )
-                        end_remove += 1  # inclusive
-                        serialized_messages = (
-                            serialized_messages[:start_remove]
-                            + serialized_messages[end_remove:]
-                        )
-                        for mode in last_mode_cached_message.keys():
-                            # Delete markers if they are in the removed range
-                            if (
-                                start_remove
-                                <= last_mode_cached_message[mode]
-                                < end_remove
-                            ):
-                                del last_mode_cached_message[mode]
-                            # And adjust indices of anything that got "slid" back
-                            elif last_mode_cached_message[mode] >= end_remove:
-                                last_mode_cached_message[mode] -= (
-                                    end_remove - start_remove
-                                )
-                        continue
-                    else:
-                        logger.info(
-                            "Non-retryable exception hit (context length), bailing"
-                        )
-                        return serialized_messages
+                    logger.info(
+                        "Non-retryable exception hit (context length), bailing"
+                    )
+                    return serialized_messages
                 except NonRetryableException as e:
                     logger.info(f"Non-retryable exception hit ({e}), bailing")
                     raise
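
The fifo_ratio check above makes the trimming proactive: with the 200,000-token window assumed in the diff, a ratio of 0.75 would start dropping history once the estimate passes roughly 150,000 tokens, instead of waiting for the provider to raise ContextLengthExceeded. A minimal standalone sketch of the middle-third trim, keeping the assistant/user alternation described in the diff comments (the helper name trim_middle_third is hypothetical, not part of the library):

from typing import Any, Dict, List

def trim_middle_third(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop roughly the middle third of a tool-calling transcript.

    Assumes the layout used in the diff: even indices hold user/tool-result
    messages and odd indices hold assistant messages, so the removed slice
    starts on an assistant turn and ends on a user turn, keeping the
    remaining history alternating cleanly.
    """
    start_remove = len(messages) // 3
    if start_remove % 2 != 1:          # first removed message: assistant (odd index)
        start_remove += 1
    end_remove = len(messages) * 2 // 3
    if end_remove % 2 != 0:            # last removed message: user/tool result (even index)
        end_remove -= 1
    end_remove += 1                    # make the upper bound inclusive
    return messages[:start_remove] + messages[end_remove:]

# e.g. a 12-message history keeps indices 0-4 and 9-11, dropping 5-8
history = [{"i": i} for i in range(12)]
print([m["i"] for m in trim_middle_third(history)])

Removing one contiguous middle block, rather than popping single messages, leaves the untouched prefix stable, so prompt-cache hits can survive a few more iterations before the next trim, which is what the "couple cache hits" comment in the diff is after.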

asimov/utils/token_counter.py

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+# utils/approx_token_count.py
+from typing import Any, Dict, List
+import math
+
+AVG_CHARS_PER_TOKEN = 4  # heuristic - you can tweak if your data skews long/short
+TOKENS_PER_MSG = 4  # ChatML fixed overhead (role, separators, etc.)
+TOKENS_PER_NAME = -1  # spec quirk: "name" field shaves one token
+END_OF_REQ_TOKENS = 2  # every request implicitly ends with: <assistant|ANSWER>
+
+def approx_tokens_from_serialized_messages(
+    serialized_messages: List[Dict[str, Any]],
+    avg_chars_per_token: int = AVG_CHARS_PER_TOKEN,
+) -> int:
+    """
+    Fast, model-agnostic token estimate for a ChatML message array.
+
+    Parameters
+    ----------
+    serialized_messages : list[dict]
+        Your [{role, content:[{type,text}]}] structure.
+    avg_chars_per_token : int, optional
+        How many characters you assume map to one token (default 4).
+
+    Returns
+    -------
+    int
+        Estimated prompt token count.
+    """
+    total_tokens = 0
+
+    for msg in serialized_messages:
+        total_tokens += TOKENS_PER_MSG
+
+        # role string itself
+        total_tokens += math.ceil(len(msg["role"]) / avg_chars_per_token)
+
+        if "name" in msg:
+            total_tokens += TOKENS_PER_NAME
+
+        for part in msg["content"]:
+            if part["type"] == "text":
+                total_tokens += math.ceil(len(part["text"]) / avg_chars_per_token)
+            else:
+                # non-text parts: fall back to raw length heuristic
+                total_tokens += math.ceil(len(str(part)) / avg_chars_per_token)
+
+    total_tokens += END_OF_REQ_TOKENS
+    return max(total_tokens, 0)
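
A hedged sketch of how the estimator and the new fifo_ratio knob fit together. The import path comes from the diff; CONTEXT_WINDOW = 200_000 mirrors the hard-coded divisor above, and the 0.75 threshold and sample messages are illustrative values used only in this example:

from asimov.utils.token_counter import approx_tokens_from_serialized_messages

CONTEXT_WINDOW = 200_000   # window size assumed by the check in the diff
fifo_ratio = 0.75          # hypothetical caller-supplied threshold

serialized_messages = [
    {"role": "user", "content": [{"type": "text", "text": "Summarize the build log."}]},
    {"role": "assistant", "content": [{"type": "text", "text": "Calling the log tool..."}]},
]

tokens = approx_tokens_from_serialized_messages(serialized_messages)

# Mirror the proactive check from the diff: trim history once the estimated
# prompt size crosses fifo_ratio of the window, instead of waiting for a
# ContextLengthExceeded error from the provider.
if fifo_ratio and (tokens / CONTEXT_WINDOW) > fifo_ratio:
    print(f"~{tokens} tokens, above {fifo_ratio:.0%} of the window: would trim middle messages")
else:
    print(f"~{tokens} tokens, under the threshold: no trimming needed")

At roughly 4 characters per token the estimate is deliberately coarse; it only needs to be close enough to trigger trimming before the tokenizer-enforced limit is actually hit.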
