|
a |
|
b/aitrika/utils/text_parser.py |
|
|
1 |
from llama_index.core import Document |
|
|
2 |
from llama_index.core.node_parser import SimpleNodeParser |
|
|
3 |
from typing import List |
|
|
4 |
from aitrika.config.config import LLMConfig |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
# Module-level LLMConfig instance; generate_documents reads CHUNK_SIZE and CHUNK_OVERLAP from it.
config = LLMConfig() |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
def generate_documents(content: str) -> List:
    """
    Generate input documents for LlamaIndex.

    The whole text is wrapped in a single ``Document`` whose ID is the first
    line of ``content``; that document is then split into chunks using the
    chunk size and overlap taken from the module-level ``config``.

    Args:
        content (str): Text

    Returns:
        List: List of chunks (nodes) produced by the parser from the document
    """
    parser = SimpleNodeParser(
        chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP
    )

    # The document identifier field is ``id_`` (alias ``doc_id``); the
    # previous ``id=`` keyword is not a Document field, so the first line
    # never actually became the document ID.
    first_line = content.partition("\n")[0]
    # A blank first line would give an empty/meaningless ID, so in that case
    # let the library generate its default ID instead.
    doc_kwargs = {"id_": first_line} if first_line.strip() else {}
    doc = Document(text=content, **doc_kwargs)

    return parser.get_nodes_from_documents([doc])