Commit 83d2367

Merge pull request #1059 from airweave-ai/fix/code-chunker-token-fallback
Fix: Code chunker fallback to hard token limit
2 parents ac670a7 + fa42946 commit 83d2367

File tree

1 file changed: +23 -20 lines changed

  • backend/airweave/platform/chunkers/code.py


backend/airweave/platform/chunkers/code.py

Lines changed: 23 additions & 20 deletions
@@ -1,4 +1,4 @@
-"""Code chunker using AST-based parsing with SentenceChunker safety net."""
+"""Code chunker using AST-based parsing with TokenChunker safety net."""
 
 from typing import Any, Dict, List, Optional
 
@@ -13,19 +13,18 @@ class CodeChunker(BaseChunker):
 
     Two-stage approach (internal implementation detail):
     1. CodeChunker: Chunks at logical code boundaries (functions, classes, methods)
-    2. SentenceChunker: Safety net to split any chunks exceeding token limit
+    2. TokenChunker fallback: Force-splits any oversized chunks at token boundaries
 
     The chunker is shared across all syncs in the pod to avoid reloading
     the Magika language detection model for every sync job.
 
     Note: Even with AST-based splitting, single large AST nodes (massive functions
-    without children) can exceed chunk_size, so we use SentenceChunker as safety net.
+    without children) can exceed chunk_size, so we use TokenChunker as safety net.
     """
 
     # Configuration constants
     MAX_TOKENS_PER_CHUNK = 8192  # OpenAI hard limit (safety net)
     CHUNK_SIZE = 2048  # Target chunk size (can be exceeded by large AST nodes)
-    OVERLAP_TOKENS = 128  # Token overlap for safety net
     TOKENIZER = "cl100k_base"  # For accurate OpenAI token counting
 
     # Singleton instance
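
As an aside, the two-stage idea described in the docstring above can be sketched as a standalone snippet. This is a simplified illustration, not the code in this file: the enforce_hard_limit helper and stage1_texts input are hypothetical, and only the chonkie TokenChunker and tiktoken calls that appear later in this diff are assumed to exist.

# Hedged sketch of the two-stage idea: stage 1 produces logically split chunks,
# stage 2 force-splits anything still over the hard token limit.
import tiktoken
from chonkie import TokenChunker

MAX_TOKENS_PER_CHUNK = 8192  # hard limit (safety net)

tokenizer = tiktoken.get_encoding("cl100k_base")
fallback = TokenChunker(tokenizer=tokenizer, chunk_size=MAX_TOKENS_PER_CHUNK, chunk_overlap=0)

def enforce_hard_limit(stage1_texts):
    """Hypothetical stage-2 helper: keep small chunks, force-split oversized ones."""
    final = []
    for text in stage1_texts:
        if len(tokenizer.encode(text)) <= MAX_TOKENS_PER_CHUNK:
            final.append(text)  # already within the limit, keep as-is
        else:
            # TokenChunker splits at token boundaries, so each piece respects the limit.
            final.extend(chunk.text for chunk in fallback.chunk(text))
    return final
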
@@ -44,7 +43,7 @@ def __init__(self):
             return
 
         self._code_chunker = None  # Lazy init
-        self._sentence_chunker = None  # Lazy init (safety net)
+        self._token_chunker = None  # Lazy init (emergency fallback)
         self._tiktoken_tokenizer = None  # Lazy init
         self._initialized = True
 
@@ -56,7 +55,7 @@ def __init__(self):
     def _ensure_chunkers(self):
         """Lazy initialization of chunker models.
 
-        Initializes CodeChunker (AST parsing) + SentenceChunker (safety net).
+        Initializes CodeChunker (AST parsing) + TokenChunker (safety net).
 
         Raises:
             SyncFailureError: If model loading fails (infrastructure error)
@@ -67,7 +66,7 @@ def _ensure_chunkers(self):
         try:
             import tiktoken
             from chonkie import CodeChunker as ChonkieCodeChunker
-            from chonkie import SentenceChunker
+            from chonkie import TokenChunker
 
             # Initialize tiktoken tokenizer for accurate OpenAI token counting
             self._tiktoken_tokenizer = tiktoken.get_encoding(self.TOKENIZER)
@@ -81,18 +80,18 @@ def _ensure_chunkers(self):
                 include_nodes=False,
             )
 
-            # Initialize SentenceChunker for safety net
-            # Needed because large functions/classes without children can exceed CHUNK_SIZE
-            self._sentence_chunker = SentenceChunker(
+            # Initialize TokenChunker for fallback
+            # Splits at exact token boundaries when code chunking produces oversized chunks
+            # GUARANTEES chunks ≤ MAX_TOKENS_PER_CHUNK (uses same tokenizer for encode/decode)
+            self._token_chunker = TokenChunker(
                 tokenizer=self._tiktoken_tokenizer,
                 chunk_size=self.MAX_TOKENS_PER_CHUNK,
-                chunk_overlap=self.OVERLAP_TOKENS,
-                min_sentences_per_chunk=1,
+                chunk_overlap=0,
             )
 
             logger.info(
                 f"Loaded CodeChunker (auto-detect, target: {self.CHUNK_SIZE}) + "
-                f"SentenceChunker safety net (hard_limit: {self.MAX_TOKENS_PER_CHUNK})"
+                f"TokenChunker fallback (hard_limit: {self.MAX_TOKENS_PER_CHUNK})"
             )
 
         except Exception as e:
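
Because the fallback counts tokens with the same cl100k_base encoding used for the limit itself, the new comment claims every emitted chunk stays at or under MAX_TOKENS_PER_CHUNK. One way to convince yourself is a quick, hypothetical check outside this module; nothing here is part of the file being changed.

# Hypothetical sanity check of the hard-limit guarantee (not part of this file).
import tiktoken
from chonkie import TokenChunker

enc = tiktoken.get_encoding("cl100k_base")
fallback = TokenChunker(tokenizer=enc, chunk_size=8192, chunk_overlap=0)

huge_text = "def handler(event):\n    return event\n" * 50_000  # well over 8192 tokens
counts = [len(enc.encode(chunk.text)) for chunk in fallback.chunk(huge_text)]
print(max(counts))  # expected to stay <= 8192, per the comment in the diff above
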
@@ -102,7 +101,7 @@ async def chunk_batch(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
         """Chunk a batch of code texts with two-stage approach.
 
         Stage 1: CodeChunker chunks at AST boundaries (functions, classes)
-        Stage 2: SentenceChunker splits any chunks exceeding MAX_TOKENS_PER_CHUNK
+        Stage 2: TokenChunker force-splits any chunks exceeding MAX_TOKENS_PER_CHUNK (hard limit)
 
         Uses run_in_thread_pool because Chonkie is synchronous (avoids blocking event loop).
 
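
The docstring's run_in_thread_pool refers to the project's own helper. The general idea of keeping a synchronous Chonkie call off the event loop can be sketched with the standard library's asyncio.to_thread, used below purely as a stand-in assumption for that helper.

# Hedged sketch: offload a synchronous, CPU-bound chunk_batch call so it does
# not block the event loop. asyncio.to_thread stands in for run_in_thread_pool.
import asyncio
from typing import Any, List

async def chunk_batch_off_loop(chunker: Any, texts: List[str]) -> List[List[Any]]:
    return await asyncio.to_thread(chunker.chunk_batch, texts)
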
@@ -146,7 +145,7 @@ async def chunk_batch(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
     def _apply_safety_net_batched(
         self, code_results: List[List[Any]]
     ) -> List[List[Dict[str, Any]]]:
-        """Split oversized chunks using batched sentence chunking.
+        """Split oversized chunks using TokenChunker fallback.
 
         Same implementation as SemanticChunker - collects oversized chunks,
         batch processes them, then reconstructs results.
@@ -168,14 +167,18 @@ def _apply_safety_net_batched(
                     oversized_texts.append(chunk.text)
                     oversized_map[pos] = (doc_idx, chunk_idx)
 
-        # Batch process all oversized chunks with SentenceChunker
+        # Batch process all oversized chunks with TokenChunker fallback
+        # TokenChunker enforces hard limit in one pass (no recursion needed)
         split_results_by_position = {}
         if oversized_texts:
             logger.debug(
                 f"Safety net: splitting {len(oversized_texts)} oversized code chunks "
-                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens"
+                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens with TokenChunker"
             )
-            split_results = self._sentence_chunker.chunk_batch(oversized_texts)
+
+            # Use TokenChunker to split at exact token boundaries
+            # GUARANTEED to produce chunks ≤ MAX_TOKENS_PER_CHUNK in one pass
+            split_results = self._token_chunker.chunk_batch(oversized_texts)
             split_results_by_position = dict(enumerate(split_results))
 
         # Reconstruct final results
@@ -206,8 +209,8 @@ def _apply_safety_net_batched(
 
         if oversized_texts:
             logger.debug(
-                f"Safety net split {len(oversized_texts)} code chunks "
-                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens"
+                f"TokenChunker fallback split {len(oversized_texts)} code chunks "
+                f"that exceeded {self.MAX_TOKENS_PER_CHUNK} tokens"
             )
 
         return final_results
