1- """Code chunker using AST-based parsing with SentenceChunker safety net."""
1+ """Code chunker using AST-based parsing with TokenChunker safety net."""
22
33from typing import Any , Dict , List , Optional
44
@@ -13,19 +13,18 @@ class CodeChunker(BaseChunker):
 
     Two-stage approach (internal implementation detail):
     1. CodeChunker: Chunks at logical code boundaries (functions, classes, methods)
-    2. SentenceChunker: Safety net to split any chunks exceeding token limit
+    2. TokenChunker fallback: Force-splits any oversized chunks at token boundaries
 
     The chunker is shared across all syncs in the pod to avoid reloading
     the Magika language detection model for every sync job.
 
     Note: Even with AST-based splitting, single large AST nodes (massive functions
-    without children) can exceed chunk_size, so we use SentenceChunker as a safety net.
+    without children) can exceed chunk_size, so we use TokenChunker as a safety net.
     """
 
     # Configuration constants
     MAX_TOKENS_PER_CHUNK = 8192  # OpenAI hard limit (safety net)
     CHUNK_SIZE = 2048  # Target chunk size (can be exceeded by large AST nodes)
-    OVERLAP_TOKENS = 128  # Token overlap for safety net
     TOKENIZER = "cl100k_base"  # For accurate OpenAI token counting
 
     # Singleton instance
@@ -44,7 +43,7 @@ def __init__(self):
             return
 
         self._code_chunker = None  # Lazy init
-        self._sentence_chunker = None  # Lazy init (safety net)
+        self._token_chunker = None  # Lazy init (emergency fallback)
         self._tiktoken_tokenizer = None  # Lazy init
         self._initialized = True
 
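Aside: the hunk above relies on a singleton-plus-lazy-init pattern, which is what lets the pod share one chunker (and one Magika model) across all sync jobs. A minimal standalone sketch of that pattern, using hypothetical names rather than anything from this diff:

class SharedChunker:
    """Hypothetical minimal version of the singleton + lazy-init pattern."""

    _instance = None

    def __new__(cls):
        # Every construction returns the same process-wide object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if self._initialized:  # Guard: skip re-init on repeated construction
            return
        self._model = None  # Lazy: the heavy model loads on first use only
        self._initialized = True

assert SharedChunker() is SharedChunker()  # one instance, loaded once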
@@ -56,7 +55,7 @@ def __init__(self):
     def _ensure_chunkers(self):
         """Lazy initialization of chunker models.
 
-        Initializes CodeChunker (AST parsing) + SentenceChunker (safety net).
+        Initializes CodeChunker (AST parsing) + TokenChunker (safety net).
 
         Raises:
             SyncFailureError: If model loading fails (infrastructure error)
@@ -67,7 +66,7 @@ def _ensure_chunkers(self):
         try:
             import tiktoken
             from chonkie import CodeChunker as ChonkieCodeChunker
-            from chonkie import SentenceChunker
+            from chonkie import TokenChunker
 
             # Initialize tiktoken tokenizer for accurate OpenAI token counting
             self._tiktoken_tokenizer = tiktoken.get_encoding(self.TOKENIZER)
@@ -81,18 +80,18 @@ def _ensure_chunkers(self):
                 include_nodes=False,
             )
 
-            # Initialize SentenceChunker for safety net
-            # Needed because large functions/classes without children can exceed CHUNK_SIZE
-            self._sentence_chunker = SentenceChunker(
+            # Initialize TokenChunker for fallback
+            # Splits at exact token boundaries when code chunking produces oversized chunks
+            # GUARANTEES chunks ≤ MAX_TOKENS_PER_CHUNK (uses same tokenizer for encode/decode)
+            self._token_chunker = TokenChunker(
                 tokenizer=self._tiktoken_tokenizer,
                 chunk_size=self.MAX_TOKENS_PER_CHUNK,
-                chunk_overlap=self.OVERLAP_TOKENS,
-                min_sentences_per_chunk=1,
+                chunk_overlap=0,
             )
 
             logger.info(
                 f"Loaded CodeChunker (auto-detect, target: {self.CHUNK_SIZE}) + "
-                f"SentenceChunker safety net (hard_limit: {self.MAX_TOKENS_PER_CHUNK})"
+                f"TokenChunker fallback (hard_limit: {self.MAX_TOKENS_PER_CHUNK})"
             )
 
         except Exception as e:
@@ -102,7 +101,7 @@ async def chunk_batch(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
         """Chunk a batch of code texts with two-stage approach.
 
         Stage 1: CodeChunker chunks at AST boundaries (functions, classes)
-        Stage 2: SentenceChunker splits any chunks exceeding MAX_TOKENS_PER_CHUNK
+        Stage 2: TokenChunker force-splits any chunks exceeding MAX_TOKENS_PER_CHUNK (hard limit)
 
         Uses run_in_thread_pool because Chonkie is synchronous (avoids blocking the event loop).
 
@@ -146,7 +145,7 @@ async def chunk_batch(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
     def _apply_safety_net_batched(
         self, code_results: List[List[Any]]
     ) -> List[List[Dict[str, Any]]]:
-        """Split oversized chunks using batched sentence chunking.
+        """Split oversized chunks using the TokenChunker fallback.
 
         Same implementation as SemanticChunker - collects oversized chunks,
         batch-processes them, then reconstructs results.
@@ -168,14 +167,18 @@ def _apply_safety_net_batched(
                     oversized_texts.append(chunk.text)
                     oversized_map[pos] = (doc_idx, chunk_idx)
 
-        # Batch process all oversized chunks with SentenceChunker
+        # Batch process all oversized chunks with TokenChunker fallback
+        # TokenChunker enforces the hard limit in one pass (no recursion needed)
         split_results_by_position = {}
         if oversized_texts:
             logger.debug(
                 f"Safety net: splitting {len(oversized_texts)} oversized code chunks "
-                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens"
+                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens with TokenChunker"
             )
-            split_results = self._sentence_chunker.chunk_batch(oversized_texts)
+
+            # Use TokenChunker to split at exact token boundaries
+            # GUARANTEED to produce chunks ≤ MAX_TOKENS_PER_CHUNK in one pass
+            split_results = self._token_chunker.chunk_batch(oversized_texts)
             split_results_by_position = dict(enumerate(split_results))
 
         # Reconstruct final results
@@ -206,8 +209,8 @@ def _apply_safety_net_batched(
 
         if oversized_texts:
             logger.debug(
-                f"Safety net split {len(oversized_texts)} code chunks "
-                f"exceeding {self.MAX_TOKENS_PER_CHUNK} tokens"
+                f"TokenChunker fallback split {len(oversized_texts)} code chunks "
+                f"that exceeded {self.MAX_TOKENS_PER_CHUNK} tokens"
             )
 
         return final_results
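Taken together, the change amounts to the following flow. A minimal end-to-end sketch, assuming the chonkie constructor parameters as they appear in this diff (language auto-detection via Magika assumed to be the default; verify names against the installed chonkie version):

import tiktoken
from chonkie import CodeChunker, TokenChunker

MAX_TOKENS_PER_CHUNK = 8192
CHUNK_SIZE = 2048

tokenizer = tiktoken.get_encoding("cl100k_base")
code_chunker = CodeChunker(
    language="auto",  # assumed: Magika-based detection, as the docstring describes
    tokenizer_or_token_counter=tokenizer,
    chunk_size=CHUNK_SIZE,
    include_nodes=False,
)
token_chunker = TokenChunker(
    tokenizer=tokenizer,
    chunk_size=MAX_TOKENS_PER_CHUNK,
    chunk_overlap=0,
)

def chunk_code(text: str) -> list[str]:
    safe = []
    for chunk in code_chunker.chunk(text):  # Stage 1: AST boundaries
        if chunk.token_count > MAX_TOKENS_PER_CHUNK:
            # Stage 2: hard token-boundary split for oversized AST nodes
            safe.extend(piece.text for piece in token_chunker.chunk(chunk.text))
        else:
            safe.append(chunk.text)
    return safe

The production code batches stage 2 (chunk_batch over all oversized chunks across documents) instead of splitting per chunk as this sketch does; the invariant is the same.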