Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions src/datacustomcode/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None:


def add_py_folder(entrypoint: str) -> None:
    """Add the py-files subfolder and the entrypoint directory to sys.path.

    This ensures:
    1. py-files/ (next to the entrypoint) is available for additional dependencies
    2. The entrypoint directory is available for local module imports

    Both entries are placed at the front of sys.path, with the entrypoint
    directory first. Calling this more than once for the same entrypoint does
    not accumulate duplicate entries.

    Args:
        entrypoint: Path to the entrypoint file. A relative path is resolved
            against the current working directory.
    """
    default_py_folder = "py-files"
    entrypoint_dir = Path.cwd().joinpath(entrypoint).parent
    py_folder = entrypoint_dir.joinpath(default_py_folder)

    # Listed in insertion order: each entry is pushed to index 0, so the last
    # one listed (the entrypoint directory) ends up with the highest priority.
    search_paths = []
    if py_folder.exists():
        search_paths.append(str(py_folder))
    search_paths.append(str(entrypoint_dir))

    for search_path in search_paths:
        # Drop stale copies first so repeated calls do not grow sys.path.
        # Import resolution is unaffected: the front entry wins either way.
        while search_path in sys.path:
            sys.path.remove(search_path)
        sys.path.insert(0, search_path)
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging

from utility import extract_citations, split_text_into_chunks

from datacustomcode.function import Runtime
from datacustomcode.function.feature_types.chunking import (
ChunkType,
Expand All @@ -15,80 +17,6 @@
# Default maximum chunk size. Presumably a character count used when the
# request does not specify one - TODO confirm against its use in function().
DEFAULT_MAX_CHUNK_SIZE = 50


def split_text_into_chunks(text: str, max_size: int, overlap: int = 20):
    """Split text into chunks with overlap, trying to break at natural boundaries.

    Each chunk window spans at most ``max_size`` characters. Within a window,
    the break is placed just after the last boundary located past the window's
    midpoint, trying in order of preference:
    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (. ! ?), whichever of the three occurs last
    4. Word boundaries (space)
    5. Hard cut if no good boundary found

    Args:
        text: Text to split
        max_size: Maximum characters per chunk; must be positive
        overlap: Number of characters to overlap between chunks; must not be
            negative. It is capped at half of ``max_size`` so the window
            always moves forward instead of crawling one character at a time.

    Returns:
        List of text chunks. Text that already fits in ``max_size`` is
        returned unchanged as the only chunk. Otherwise every chunk except
        the final one has surrounding whitespace stripped.

    Raises:
        ValueError: If ``max_size`` is not positive or ``overlap`` is negative
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    text_length = len(text)
    if text_length <= max_size:
        return [text]

    # A boundary only counts when it lies past the midpoint of the window, so
    # every chunk covers more than half of max_size. Capping the overlap at
    # half of max_size therefore guarantees real forward progress.
    min_offset = max_size * 0.5
    effective_overlap = min(overlap, max_size // 2)

    # Separator groups in order of preference. All separators within a group
    # have the same length and compete on position (the latest one wins).
    separator_groups = (("\n\n",), ("\n",), (". ", "! ", "? "), (" ",))

    chunks = []
    start = 0

    while start < text_length:
        end = start + max_size

        if end >= text_length:
            # Last chunk: take everything that is left
            chunks.append(text[start:])
            break

        window = text[start:end]
        break_point = end  # Hard cut unless a natural boundary is found
        for separators in separator_groups:
            position, separator = max((window.rfind(sep), sep) for sep in separators)
            if position > min_offset:
                # Break just after the separator
                break_point = start + position + len(separator)
                break

        chunks.append(text[start:break_point].strip())

        # Step back by the overlap, but always advance at least one character
        start = max(break_point - effective_overlap, start + 1)

    return chunks


def function(
request: SearchIndexChunkingV1Request, runtime: Runtime
) -> SearchIndexChunkingV1Response:
Expand Down Expand Up @@ -121,11 +49,7 @@ def function(

# Create chunk outputs
for chunk_text in text_chunks:
# Create citations from source_dmo_fields if available
citations = {}
if metadata and metadata.source_dmo_fields:
for key, value in metadata.source_dmo_fields.items():
citations[key] = str(value)
citations = extract_citations(metadata)

chunk_output = SearchIndexChunkingV1Output(
chunk_type=ChunkType.TEXT,
Expand Down
104 changes: 104 additions & 0 deletions src/datacustomcode/templates/function/chunking/payload/utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Utility functions for text chunking operations."""

import logging
from typing import (
Dict,
List,
Optional,
)

from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata

logger = logging.getLogger(__name__)


def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]:
    """Split text into chunks with overlap, trying to break at natural boundaries.

    Each chunk window spans at most ``max_size`` characters. Within a window,
    the break is placed just after the last boundary located past the window's
    midpoint, trying in order of preference:
    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (. ! ?), whichever of the three occurs last
    4. Word boundaries (space)
    5. Hard cut if no good boundary found

    Args:
        text: Text to split
        max_size: Maximum characters per chunk; must be positive
        overlap: Number of characters to overlap between chunks; must not be
            negative. It is capped at half of ``max_size`` so the window
            always moves forward instead of crawling one character at a time.

    Returns:
        List of text chunks. Text that already fits in ``max_size`` is
        returned unchanged as the only chunk. Otherwise every chunk except
        the final one has surrounding whitespace stripped.

    Raises:
        ValueError: If ``max_size`` is not positive or ``overlap`` is negative
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    text_length = len(text)
    if text_length <= max_size:
        return [text]

    # A boundary only counts when it lies past the midpoint of the window, so
    # every chunk covers more than half of max_size. Capping the overlap at
    # half of max_size therefore guarantees real forward progress.
    min_offset = max_size * 0.5
    effective_overlap = min(overlap, max_size // 2)

    chunks: List[str] = []
    start = 0

    while start < text_length:
        end = start + max_size

        if end >= text_length:
            # Last chunk: take everything that is left
            chunks.append(text[start:])
            break

        offset = _find_break_offset(text[start:end], min_offset)
        # Fall back to a hard cut when no natural boundary qualifies
        break_point = end if offset is None else start + offset

        chunks.append(text[start:break_point].strip())

        # Step back by the overlap, but always advance at least one character
        start = max(break_point - effective_overlap, start + 1)

    return chunks


def _find_break_offset(window: str, min_offset: float) -> Optional[int]:
    """Return the offset just past the best natural boundary in ``window``.

    Separator groups are tried in order of preference (paragraph, line,
    sentence, word). Within a group the separator occurring last wins. A
    boundary qualifies only if it starts after ``min_offset``.

    Args:
        window: The candidate chunk text
        min_offset: Position a boundary must lie beyond to be accepted

    Returns:
        Offset into ``window`` at which to break, or None if no boundary
        qualifies
    """
    # All separators within a group have the same length.
    for separators in (("\n\n",), ("\n",), (". ", "! ", "? "), (" ",)):
        position, separator = max((window.rfind(sep), sep) for sep in separators)
        if position > min_offset:
            return position + len(separator)
    return None


def extract_citations(
    metadata: Optional[SearchIndexChunkingV1Metadata],
) -> Dict[str, str]:
    """Build the citation mapping for a document from its source DMO fields.

    Args:
        metadata: Document metadata, or None when the document has none

    Returns:
        Dictionary mapping each source DMO field name to its value converted
        with ``str()``. Empty when the metadata or its source DMO fields are
        missing or empty.
    """
    source_fields = metadata.source_dmo_fields if metadata else None
    if not source_fields:
        return {}
    return {field_name: str(raw) for field_name, raw in source_fields.items()}
42 changes: 42 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,48 @@ def test_run_entrypoint_with_dependencies():
sys.path.remove(module_dir)


def test_add_py_folder_enables_local_imports():
    """Test that add_py_folder adds entrypoint directory to sys.path."""
    from datacustomcode.run import add_py_folder

    # Snapshot interpreter-wide state before anything can fail, so the
    # cleanup below never touches an unbound name and hides the real error.
    original_path = sys.path.copy()
    # A "utility" module cached by another test would make the import below
    # return that stale module; set it aside and restore it afterwards.
    previous_utility = sys.modules.pop("utility", None)

    try:
        # The temporary directory is removed automatically, even on failure
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a utility module in the temp directory
            utility_path = os.path.join(temp_dir, "utility.py")
            with open(utility_path, "w") as f:
                f.write("TEST_VALUE = 'local_module_works'\n")

            # Create an entrypoint file
            entrypoint_path = os.path.join(temp_dir, "entrypoint.py")
            with open(entrypoint_path, "w") as f:
                f.write("# Test entrypoint\n")

            # Call add_py_folder with relative path from current directory
            relative_entrypoint = os.path.relpath(entrypoint_path)
            add_py_folder(relative_entrypoint)

            # Verify we can now import the utility module
            import utility

            assert hasattr(
                utility, "TEST_VALUE"
            ), "utility module should have TEST_VALUE"
            assert (
                utility.TEST_VALUE == "local_module_works"
            ), f"Expected 'local_module_works', got {utility.TEST_VALUE}"

    finally:
        # Restore sys.path in place and drop the module imported by this test
        sys.path[:] = original_path
        sys.modules.pop("utility", None)
        if previous_utility is not None:
            sys.modules["utility"] = previous_utility


class TestDataspaceScenarios:
"""Test dataspace functionality in run_entrypoint."""

Expand Down
Loading