Coverage for intelligence_toolkit/AI/text_splitter.py: 100%
13 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
4import semchunk
5import tiktoken
7from .defaults import CHUNK_SIZE, DEFAULT_ENCODING, DEFAULT_LLM_MODEL
class TextSplitter:
    """Split text into chunks bounded by a token budget.

    The token count is measured with a tiktoken encoding chosen for the
    given model name; the chunking itself is delegated to a chunker built
    by ``semchunk.chunkerify``.
    """

    def __init__(self, chunk_size: int = CHUNK_SIZE, model: str = DEFAULT_LLM_MODEL):
        """Build the chunker for ``model`` with the given ``chunk_size``.

        Args:
            chunk_size: Size limit handed to ``semchunk.chunkerify``.
            model: Model name used to look up the tiktoken encoding. When
                tiktoken does not recognise it, ``DEFAULT_ENCODING`` is
                used instead.
        """
        self.chunk_size = chunk_size
        self._chunk = semchunk.chunkerify(
            self._resolve_encoding(model),
            chunk_size,
        )

    @staticmethod
    def _resolve_encoding(model):
        """Return the tiktoken encoding for ``model``, or the default one."""
        try:
            return tiktoken.encoding_for_model(model)
        except KeyError:
            # tiktoken signals an unknown model name with KeyError; fall back
            # to the project-wide default encoding rather than failing.
            return tiktoken.get_encoding(DEFAULT_ENCODING)

    def split(self, text: str):
        """Run the configured chunker over ``text`` and return its result.

        NOTE(review): the return value is whatever the semchunk chunker
        produces for a single string -- presumably a list of text chunks;
        confirm against the semchunk documentation.
        """
        return self._chunk(text)