|
1 | 1 | import logging |
2 | | -from typing import List |
3 | | -from uuid import uuid4 |
4 | 2 |
|
5 | | -from datacustomcode.einstein_predictions.types import ( |
6 | | - PredictionColumBuilder, |
7 | | - PredictionRequestBuilder, |
8 | | - PredictionType, |
9 | | -) |
| 3 | +from utility import extract_citations, split_text_into_chunks |
| 4 | + |
10 | 5 | from datacustomcode.function import Runtime |
11 | | -from datacustomcode.llm_gateway.types.generate_text_request_builder import ( |
12 | | - GenerateTextRequestBuilder, |
| 6 | +from datacustomcode.function.feature_types.chunking import ( |
| 7 | + ChunkType, |
| 8 | + SearchIndexChunkingV1Output, |
| 9 | + SearchIndexChunkingV1Request, |
| 10 | + SearchIndexChunkingV1Response, |
13 | 11 | ) |
14 | 12 |
|
15 | 13 | logger = logging.getLogger(__name__) |
| 14 | +logging.basicConfig(level=logging.INFO) |
16 | 15 |
|
| 16 | +# Default max chunk size (can be overridden if contract adds max_characters field) |
| 17 | +DEFAULT_MAX_CHUNK_SIZE = 50 |
17 | 18 |
|
18 | | -def chunk_text(text: str, chunk_size: int = 1000) -> List[str]: |
19 | | - """ |
20 | | - Split text into chunks of approximately chunk_size characters. |
21 | | - Tries to split at sentence boundaries when possible. |
| 19 | + |
| 20 | +def function( |
| 21 | + request: SearchIndexChunkingV1Request, runtime: Runtime |
| 22 | +) -> SearchIndexChunkingV1Response: |
| 23 | + """Chunk documents into smaller pieces for search indexing. |
| 24 | +
|
| 25 | + Args: |
| 26 | + request: SearchIndexChunkingV1Request with input documents |
| 27 | + runtime: Runtime context (unused but required by contract) |
| 28 | +
|
| 29 | + Returns: |
| 30 | + SearchIndexChunkingV1Response with chunked output |
22 | 31 | """ |
23 | | - if not text: |
24 | | - return [] |
| 32 | + print(f"Received {len(request.input)} documents to chunk") |
25 | 33 |
|
26 | 34 | chunks = [] |
27 | | - current_chunk = "" |
28 | | - |
29 | | - # Split text into sentences (simple split by period) |
30 | | - sentences = text.split(". ") |
31 | | - |
32 | | - for sentence in sentences: |
33 | | - if len(current_chunk) + len(sentence) <= chunk_size: |
34 | | - current_chunk += sentence + ". " |
35 | | - else: |
36 | | - if current_chunk: |
37 | | - chunks.append(current_chunk.strip()) |
38 | | - current_chunk = sentence + ". " |
39 | | - |
40 | | - if current_chunk: |
41 | | - chunks.append(current_chunk.strip()) |
42 | | - |
43 | | - return chunks |
44 | | - |
45 | | - |
46 | | -def make_einstein_prediction(runtime: Runtime) -> None: |
47 | | - column = ( |
48 | | - PredictionColumBuilder() |
49 | | - .set_column_name("col1") |
50 | | - .set_string_values(["str1", "str2"]) |
51 | | - .build() |
52 | | - ) |
53 | | - prediction_request = ( |
54 | | - PredictionRequestBuilder() |
55 | | - .set_prediction_type(PredictionType.REGRESSION) |
56 | | - .set_model_api_name("regressionModel") |
57 | | - .set_prediction_columns([column]) |
58 | | - .build() |
59 | | - ) |
60 | | - |
61 | | - prediction_response = runtime.einstein_predictions.predict(prediction_request) |
62 | | - logger.info( |
63 | | - f"Einstein prediction results - success: [{prediction_response.is_success}] " |
64 | | - f"response data: {prediction_response.data}" |
65 | | - ) |
66 | | - |
67 | | - |
68 | | -def generate_text(runtime: Runtime, prompt: str, model: str = "sfdc_ai__DefaultGPT52"): |
69 | | - builder = GenerateTextRequestBuilder() |
70 | | - llm_request = builder.set_prompt(prompt).set_model(model).build() |
71 | | - llm_response = runtime.llm_gateway.generate_text(llm_request) |
72 | | - logger.info( |
73 | | - f"LLM Gateway generate text results - success: [{llm_response.is_success}] " |
74 | | - f"response data: {llm_response.data}" |
75 | | - ) |
76 | | - |
77 | | - |
78 | | -def function(request: dict, runtime: Runtime) -> dict: |
79 | | - logger.info("Inside Function") |
80 | | - logger.info(request) |
81 | | - |
82 | | - items = request["input"] |
83 | | - output_chunks = [] |
84 | | - current_seq_no = 1 # Start sequence number from 1 |
| 35 | + seq_no = 1 |
85 | 36 |
|
86 | | - """ |
87 | | - You can use your AI models configured in Salesforce to generate texts |
88 | | - or predict an outcome. See README.md for how to test locally before |
89 | | - deploying to Data Cloud. |
| 37 | + # Use default max chunk size |
| 38 | + max_chunk_size = DEFAULT_MAX_CHUNK_SIZE |
90 | 39 |
|
91 | | - Example: |
| 40 | + # Process each document |
| 41 | + for doc_idx, doc in enumerate(request.input): |
| 42 | + text = doc.text |
| 43 | + metadata = doc.metadata |
92 | 44 |
|
93 | | - >>> generated_text = generate_text(runtime, "Generate a greeting message") |
94 | | - ... prediction = make_einstein_prediction(runtime) |
| 45 | + print(f"Processing document {doc_idx + 1}: {len(text)} characters") |
95 | 46 |
|
96 | | - """ |
| 47 | + # Split the text using our simple chunking algorithm |
| 48 | + text_chunks = split_text_into_chunks(text, max_chunk_size, overlap=20) |
| 49 | + |
| 50 | + # Create chunk outputs |
| 51 | + for chunk_text in text_chunks: |
| 52 | + citations = extract_citations(metadata) |
| 53 | + |
| 54 | + chunk_output = SearchIndexChunkingV1Output( |
| 55 | + chunk_type=ChunkType.TEXT, |
| 56 | + text=chunk_text.strip(), |
| 57 | + seq_no=seq_no, |
| 58 | + citations=citations, |
| 59 | + ) |
| 60 | + chunks.append(chunk_output) |
| 61 | + |
| 62 | + print(f"Chunk {seq_no}: {len(chunk_text)} chars") |
| 63 | + seq_no += 1 |
| 64 | + |
| 65 | + print(f"Generated {len(chunks)} chunks total") |
97 | 66 |
|
98 | | - for item in items: |
99 | | - # Item is DocElement as dict |
100 | | - logger.info(f"Processing item: {item}") |
101 | | - |
102 | | - text = item.get("text", "") |
103 | | - metadata = item.get("metadata", {}) |
104 | | - |
105 | | - # Create chunks from the text |
106 | | - text_chunks = chunk_text(text, chunk_size=100) # Using a larger chunk size |
107 | | - |
108 | | - # Create chunk dictionaries for each text chunk |
109 | | - for chunk_content in text_chunks: |
110 | | - chunk_dict = { |
111 | | - "text": chunk_content, |
112 | | - "metadata": metadata, |
113 | | - "seq_no": current_seq_no, |
114 | | - "chunk_type": "text", |
115 | | - "chunk_id": str(uuid4()), |
116 | | - "tag_metadata": {}, |
117 | | - "citations": {}, |
118 | | - "source_record": item, |
119 | | - } |
120 | | - output_chunks.append(chunk_dict) |
121 | | - current_seq_no += 1 # Increment sequence number for next chunk |
122 | | - |
123 | | - logger.info("Completed chunking") |
124 | | - response = { |
125 | | - "output": output_chunks, |
126 | | - "status": {"status_type": "success", "status_message": "Chunking completed"}, |
127 | | - } |
128 | | - logger.info(response) |
129 | | - return response |
130 | | - |
131 | | - |
132 | | -# Test the function |
133 | | -if __name__ == "__main__": |
134 | | - # Configure logging |
135 | | - logging.basicConfig(level=logging.INFO) |
136 | | - |
137 | | - # Create test data with two DocElements |
138 | | - test_request = { |
139 | | - "input": [ |
140 | | - { |
141 | | - "text": ( |
142 | | - """This is the first sentence of the first document, which is |
143 | | - intentionally made longer to test chunking. """ |
144 | | - """Here is the second sentence of the first document, which is also |
145 | | - quite long and should ensure that the chunking function splits |
146 | | - this text into two chunks when the chunk size is set to 100.""" |
147 | | - ), |
148 | | - "metadata": {"source": "test1", "type": "document"}, |
149 | | - }, |
150 | | - { |
151 | | - "text": ( |
152 | | - """This is the first sentence of the second document, and it is |
153 | | - also extended to be longer than usual for testing purposes. """ |
154 | | - """The second sentence of the second document is similarly lengthy, |
155 | | - so that the chunking function will again create two chunks for |
156 | | - this document.""" |
157 | | - ), |
158 | | - "metadata": {"source": "test2", "type": "document"}, |
159 | | - }, |
160 | | - ] |
161 | | - } |
162 | | - |
163 | | - # Run the function |
164 | | - result = function(test_request, Runtime()) |
165 | | - |
166 | | - # Print the results in a more readable format |
167 | | - print("\nChunking Results:") |
168 | | - print("----------------") |
169 | | - for chunk in result["output"]: |
170 | | - print(f"\nChunk #{chunk['seq_no']}:") |
171 | | - print(f"Text: {chunk['text'][:100]}...") # Print first 100 chars of each chunk |
172 | | - print(f"Source: {chunk['metadata']['source']}") |
173 | | - print(f"Chunk ID: {chunk['chunk_id']}") |
| 67 | + return SearchIndexChunkingV1Response(output=chunks) |
0 commit comments