Skip to content

Commit ef67865

Browse files
authored
Merge pull request #115 from diksha-sf/u/diksha2/run
@W-22468107 - support local module imports in datacustomcode run
2 parents f1211ae + f27020d commit ef67865

6 files changed

Lines changed: 217 additions & 165 deletions

File tree

poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ click = "^8.1.8"
101101
loguru = "^0.7.3"
102102
numpy = "*"
103103
pandas = "*"
104-
pipreqs = "*"
105104
pydantic = "2.13.1"
106105
pyspark = "3.5.1"
107106
python = ">=3.10,<3.12"
@@ -115,6 +114,7 @@ build = "*"
115114
coverage = ">=7.0.0,<8.0.0"
116115
ipykernel = "^6.29.5"
117116
mypy = "*"
117+
pipreqs = "*"
118118
poetry-dynamic-versioning = {extras = ["plugin"], version = "^1.8.2"}
119119
pre-commit = "*"
120120
pytest = "*"

src/datacustomcode/run.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None:
201201

202202

203203
def add_py_folder(entrypoint: str) -> None:
    """Put the entrypoint's directory and its py-files subfolder on sys.path.

    After the call:
    1. The directory containing ``entrypoint`` is the first sys.path entry,
       so modules sitting next to the entrypoint can be imported.
    2. ``<entrypoint dir>/py-files`` is the second entry, but only when that
       folder exists on disk.

    Calling this more than once for the same entrypoint does not pile up
    duplicate entries: an existing copy is removed before the path is put
    back at the front.

    Args:
        entrypoint: Path to the entrypoint file, resolved against the
            current working directory.
    """
    entrypoint_dir = Path.cwd().joinpath(entrypoint).parent
    py_folder = entrypoint_dir.joinpath("py-files")

    # Each path is inserted at index 0 in turn, so the last one listed here
    # (the entrypoint directory) ends up ahead of py-files.
    search_dirs = []
    if py_folder.exists():
        search_dirs.append(py_folder)
    search_dirs.append(entrypoint_dir)

    for directory in search_dirs:
        entry = str(directory)
        # Drop earlier copies so repeated calls keep sys.path free of duplicates.
        while entry in sys.path:
            sys.path.remove(entry)
        sys.path.insert(0, entry)
Lines changed: 51 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -1,173 +1,67 @@
11
import logging
2-
from typing import List
3-
from uuid import uuid4
42

5-
from datacustomcode.einstein_predictions.types import (
6-
PredictionColumBuilder,
7-
PredictionRequestBuilder,
8-
PredictionType,
9-
)
3+
from utility import extract_citations, split_text_into_chunks
4+
105
from datacustomcode.function import Runtime
11-
from datacustomcode.llm_gateway.types.generate_text_request_builder import (
12-
GenerateTextRequestBuilder,
6+
from datacustomcode.function.feature_types.chunking import (
7+
ChunkType,
8+
SearchIndexChunkingV1Output,
9+
SearchIndexChunkingV1Request,
10+
SearchIndexChunkingV1Response,
1311
)
1412

1513
logger = logging.getLogger(__name__)
14+
logging.basicConfig(level=logging.INFO)
1615

16+
# Default max chunk size (can be overridden if contract adds max_characters field)
17+
DEFAULT_MAX_CHUNK_SIZE = 50
1718

18-
def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
19-
"""
20-
Split text into chunks of approximately chunk_size characters.
21-
Tries to split at sentence boundaries when possible.
19+
20+
def function(
    request: SearchIndexChunkingV1Request, runtime: Runtime
) -> SearchIndexChunkingV1Response:
    """Chunk documents into smaller pieces for search indexing.

    Every document in ``request.input`` is split with
    ``split_text_into_chunks`` and each piece becomes a
    ``SearchIndexChunkingV1Output`` of type ``ChunkType.TEXT`` whose text is
    the piece with surrounding whitespace stripped. Sequence numbers start
    at 1 and keep counting across documents.

    Progress is reported through the module logger rather than ``print`` so
    it follows the logging configuration set up at import time.

    Args:
        request: SearchIndexChunkingV1Request with input documents.
        runtime: Runtime context (unused but required by contract).

    Returns:
        SearchIndexChunkingV1Response with chunked output.
    """
    # Use default max chunk size; the overlap is forwarded unchanged as the
    # ``overlap`` argument of split_text_into_chunks.
    max_chunk_size = DEFAULT_MAX_CHUNK_SIZE
    chunk_overlap = 20

    logger.info("Received %d documents to chunk", len(request.input))

    chunks = []
    seq_no = 1

    for doc_number, doc in enumerate(request.input, start=1):
        text = doc.text
        metadata = doc.metadata

        logger.info(
            "Processing document %d: %d characters", doc_number, len(text)
        )

        pieces = split_text_into_chunks(
            text, max_chunk_size, overlap=chunk_overlap
        )
        for piece in pieces:
            # Citations are derived again for every chunk instead of being
            # hoisted out of the loop, matching the per-chunk call pattern
            # and avoiding one object being shared by all outputs.
            citations = extract_citations(metadata)
            chunk_body = piece.strip()

            chunks.append(
                SearchIndexChunkingV1Output(
                    chunk_type=ChunkType.TEXT,
                    text=chunk_body,
                    seq_no=seq_no,
                    citations=citations,
                )
            )

            # Report the length of the text actually emitted (post-strip).
            logger.info("Chunk %d: %d chars", seq_no, len(chunk_body))
            seq_no += 1

    logger.info("Generated %d chunks total", len(chunks))

    return SearchIndexChunkingV1Response(output=chunks)

0 commit comments

Comments
 (0)