From fccfdc135a8322fd383b6ddd6a7e35f5ba9302c0 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:18:58 +0530 Subject: [PATCH 1/5] Add entrypoint directory to sys.path to support local module imports --- src/datacustomcode/run.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 6322270..0d9a110 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None: def add_py_folder(entrypoint: str): + """Add py-files subfolder and entrypoint directory to sys.path. + + This ensures: + 1. py-files/ is available for additional dependencies + 2. The entrypoint directory is available for local module imports + """ default_py_folder = "py-files" # Hardcoded folder name cwd = Path.cwd().joinpath(entrypoint) - py_folder = cwd.parent.joinpath(default_py_folder) + entrypoint_dir = cwd.parent + py_folder = entrypoint_dir.joinpath(default_py_folder) + + # Add py-files folder if it exists + if py_folder.exists(): + sys.path.insert(0, str(py_folder)) - sys.path.append(str(py_folder)) + # Add entrypoint directory to allow local module imports (e.g., utility.py) + sys.path.insert(0, str(entrypoint_dir)) From c1015c06fa8ae226adf2d5a19b475f4c2de686aa Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:27:56 +0530 Subject: [PATCH 2/5] Add test for local module imports via add_py_folder --- tests/test_run.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_run.py b/tests/test_run.py index dcb8225..1f17a27 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -237,6 +237,48 @@ def test_run_entrypoint_with_dependencies(): sys.path.remove(module_dir) +def test_add_py_folder_enables_local_imports(): + """Test that add_py_folder adds entrypoint directory to sys.path for local imports.""" + from datacustomcode.run import add_py_folder + + # Create a temporary directory structure + temp_dir = tempfile.mkdtemp() + + try: + # Create a utility module in the temp directory + utility_path = os.path.join(temp_dir, "utility.py") + with open(utility_path, "w") as f: + f.write("TEST_VALUE = 'local_module_works'\n") + + # Create an entrypoint file + entrypoint_path = os.path.join(temp_dir, "entrypoint.py") + with open(entrypoint_path, "w") as f: + f.write("# Test entrypoint\n") + + # Save original sys.path + original_path = sys.path.copy() + + # Call add_py_folder with relative path from current directory + relative_entrypoint = os.path.relpath(entrypoint_path) + add_py_folder(relative_entrypoint) + + # verify we can now import the utility module + import utility + + assert hasattr(utility, "TEST_VALUE"), "utility module should have TEST_VALUE" + assert ( + utility.TEST_VALUE == "local_module_works" + ), f"Expected 'local_module_works', got {utility.TEST_VALUE}" + + finally: + # Cleanup + sys.path = original_path + if "utility" in sys.modules: + del sys.modules["utility"] + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + class TestDataspaceScenarios: """Test dataspace functionality in run_entrypoint.""" From 16c5ab490336027e8eb7f85fb7500f4a83298d14 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:33:04 +0530 Subject: [PATCH 3/5] Update comments --- src/datacustomcode/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 0d9a110..006055c 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -207,7 +207,7 @@ def add_py_folder(entrypoint: str): 1. py-files/ is available for additional dependencies 2. The entrypoint directory is available for local module imports """ - default_py_folder = "py-files" # Hardcoded folder name + default_py_folder = "py-files" cwd = Path.cwd().joinpath(entrypoint) entrypoint_dir = cwd.parent py_folder = entrypoint_dir.joinpath(default_py_folder) @@ -216,5 +216,5 @@ def add_py_folder(entrypoint: str): if py_folder.exists(): sys.path.insert(0, str(py_folder)) - # Add entrypoint directory to allow local module imports (e.g., utility.py) + # Add entrypoint directory to allow local module imports sys.path.insert(0, str(entrypoint_dir)) From a24a51cf3ae20de3aa735a4bd736d2e0cfa86822 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:36:38 +0530 Subject: [PATCH 4/5] Fix lint error --- tests/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_run.py b/tests/test_run.py index 1f17a27..1eace88 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -238,7 +238,7 @@ def test_run_entrypoint_with_dependencies(): def test_add_py_folder_enables_local_imports(): - """Test that add_py_folder adds entrypoint directory to sys.path for local imports.""" + """Test that add_py_folder adds entrypoint directory to sys.path.""" from datacustomcode.run import add_py_folder # Create a temporary directory structure From e17397ab3f05d3a2ef4edb90c181ce35810ffc0d Mon Sep 17 00:00:00 2001 From: Diksha Date: Fri, 5 Jun 2026 13:05:02 +0530 Subject: [PATCH 5/5] Update example --- .../function/chunking/payload/entrypoint.py | 82 +------------- .../function/chunking/payload/utility.py | 104 ++++++++++++++++++ 2 files changed, 107 insertions(+), 79 deletions(-) create mode 100644 src/datacustomcode/templates/function/chunking/payload/utility.py diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index 8200e0f..81dcb32 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -1,5 +1,7 @@ import logging +from utility import extract_citations, split_text_into_chunks + from datacustomcode.function import Runtime from datacustomcode.function.feature_types.chunking import ( ChunkType, @@ -15,80 +17,6 @@ DEFAULT_MAX_CHUNK_SIZE = 50 -def split_text_into_chunks(text: str, max_size: int, overlap: int = 20): - """Split text into chunks with overlap, trying to break at natural boundaries. - - Tries to break at natural boundaries in order of preference: - 1. Paragraph boundaries (\\n\\n) - 2. Line boundaries (\\n) - 3. Sentence boundaries (. ! ?) - 4. Word boundaries (space) - 5. Hard cut if no good boundary found - - Args: - text: Text to split - max_size: Maximum characters per chunk - overlap: Number of characters to overlap between chunks - - Returns: - List of text chunks - """ - if len(text) <= max_size: - return [text] - - chunks = [] - start = 0 - - while start < len(text): - # Determine end position for this chunk - end = start + max_size - - if end >= len(text): - # Last chunk - chunks.append(text[start:]) - break - - # Try to find a good breaking point (in order of preference) - chunk_text = text[start:end] - break_point = None - - # Try to break at paragraph boundary (\n\n) - last_paragraph = chunk_text.rfind("\n\n") - if last_paragraph > max_size * 0.5: # Only if it's past halfway - break_point = start + last_paragraph + 2 # +2 to skip the \n\n - - # Try to break at line boundary (\n) - if break_point is None: - last_newline = chunk_text.rfind("\n") - if last_newline > max_size * 0.5: - break_point = start + last_newline + 1 - - # Try to break at sentence boundary (. ! ?) - if break_point is None: - for punct in [". ", "! ", "? "]: - last_sentence = chunk_text.rfind(punct) - if last_sentence > max_size * 0.5: - break_point = start + last_sentence + len(punct) - break - - # Try to break at word boundary (space) - if break_point is None: - last_space = chunk_text.rfind(" ") - if last_space > max_size * 0.5: - break_point = start + last_space + 1 - - # If no good breaking point, just hard cut - if break_point is None: - break_point = end - - chunks.append(text[start:break_point].strip()) - - # Move start position with overlap - start = max(break_point - overlap, start + 1) - - return chunks - - def function( request: SearchIndexChunkingV1Request, runtime: Runtime ) -> SearchIndexChunkingV1Response: @@ -121,11 +49,7 @@ def function( # Create chunk outputs for chunk_text in text_chunks: - # Create citations from source_dmo_fields if available - citations = {} - if metadata and metadata.source_dmo_fields: - for key, value in metadata.source_dmo_fields.items(): - citations[key] = str(value) + citations = extract_citations(metadata) chunk_output = SearchIndexChunkingV1Output( chunk_type=ChunkType.TEXT, diff --git a/src/datacustomcode/templates/function/chunking/payload/utility.py b/src/datacustomcode/templates/function/chunking/payload/utility.py new file mode 100644 index 0000000..06c3dfd --- /dev/null +++ b/src/datacustomcode/templates/function/chunking/payload/utility.py @@ -0,0 +1,104 @@ +"""Utility functions for text chunking operations.""" + +import logging +from typing import ( + Dict, + List, + Optional, +) + +from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata + +logger = logging.getLogger(__name__) + + +def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]: + """Split text into chunks with overlap, trying to break at natural boundaries. + + Tries to break at natural boundaries in order of preference: + 1. Paragraph boundaries (\\n\\n) + 2. Line boundaries (\\n) + 3. Sentence boundaries (. ! ?) + 4. Word boundaries (space) + 5. Hard cut if no good boundary found + + Args: + text: Text to split + max_size: Maximum characters per chunk + overlap: Number of characters to overlap between chunks + + Returns: + List of text chunks + """ + if len(text) <= max_size: + return [text] + + chunks = [] + start = 0 + + while start < len(text): + # Determine end position for this chunk + end = start + max_size + + if end >= len(text): + # Last chunk + chunks.append(text[start:]) + break + + # Try to find a good breaking point (in order of preference) + chunk_text = text[start:end] + break_point = None + + # Try to break at paragraph boundary (\n\n) + last_paragraph = chunk_text.rfind("\n\n") + if last_paragraph > max_size * 0.5: # Only if it's past halfway + break_point = start + last_paragraph + 2 # +2 to skip the \n\n + + # Try to break at line boundary (\n) + if break_point is None: + last_newline = chunk_text.rfind("\n") + if last_newline > max_size * 0.5: + break_point = start + last_newline + 1 + + # Try to break at sentence boundary (. ! ?) + if break_point is None: + for punct in [". ", "! ", "? "]: + last_sentence = chunk_text.rfind(punct) + if last_sentence > max_size * 0.5: + break_point = start + last_sentence + len(punct) + break + + # Try to break at word boundary (space) + if break_point is None: + last_space = chunk_text.rfind(" ") + if last_space > max_size * 0.5: + break_point = start + last_space + 1 + + # If no good breaking point, just hard cut + if break_point is None: + break_point = end + + chunks.append(text[start:break_point].strip()) + + # Move start position with overlap + start = max(break_point - overlap, start + 1) + + return chunks + + +def extract_citations( + metadata: Optional[SearchIndexChunkingV1Metadata], +) -> Dict[str, str]: + """Extract citations from document metadata. + + Args: + metadata: Document metadata containing source DMO fields + + Returns: + Dictionary of citation key-value pairs + """ + citations = {} + if metadata and metadata.source_dmo_fields: + for key, value in metadata.source_dmo_fields.items(): + citations[key] = str(value) + return citations