Coverage for intelligence_toolkit/AI/text_splitter.py: 100%

13 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4import semchunk 

5import tiktoken 

6 

7from .defaults import CHUNK_SIZE, DEFAULT_ENCODING, DEFAULT_LLM_MODEL 

8 

9 

class TextSplitter:
    """Split text into chunks using a semchunk chunker backed by tiktoken.

    The tokenizer is looked up from the given model name; when tiktoken
    does not recognise the model, the encoding named by DEFAULT_ENCODING
    is used instead.
    """

    def __init__(self, chunk_size: int = CHUNK_SIZE, model: str = DEFAULT_LLM_MODEL):
        """Build the chunker for ``model`` with the requested ``chunk_size``.

        Args:
            chunk_size: Size limit handed to ``semchunk.chunkerify``
                (presumably measured in tokens of the selected encoding;
                confirm against the semchunk documentation).
            model: Model name used to select the tiktoken encoding.
        """
        try:
            tokenizer = tiktoken.encoding_for_model(model)
        except KeyError:
            # Model name not known to tiktoken: fall back to the default encoding.
            tokenizer = tiktoken.get_encoding(DEFAULT_ENCODING)

        self.chunk_size = chunk_size
        self._chunk = semchunk.chunkerify(tokenizer, chunk_size)

    def split(self, text: str):
        """Return the result of running the configured chunker on ``text``.

        NOTE(review): the return type is whatever the semchunk chunker yields
        for a single string (presumably a list of string chunks); confirm
        before adding a return annotation.
        """
        return self._chunk(text)