Bench-U/systems/DocETL/api.py at main · BIT-DataLab/Bench-U · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import ast
import asyncio
import hashlib
import json
import os
import re
import time
from typing import Any, Dict, List, Optional

from litellm import (
    APIConnectionError,
    ModelResponse,
    RateLimitError,
    ServiceUnavailableError,
    completion,
    embedding,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Function
from rich import print as rprint
from rich.console import Console, Group
from rich.panel import Panel
from rich.text import Text

from docetl.utils import completion_cost

from .cache import cache, cache_key, freezeargs
from .llm import (
    InvalidOutputError,
    LLMResult,
    approx_count_tokens,
    timeout,
    truncate_messages,
)
from .validation import (
    convert_dict_schema_to_list_schema,
    convert_val,
    get_user_input_for_schema,
    safe_eval,
    strict_render,
)

import logging
import litellm

# 配置日志记录器
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

BASIC_MODELS = ["gpt-4o-mini", "gpt-4o"]


def is_deepseek_r1(model: str) -> bool:
    model = model.lower()
    return "deepseek-r1" in model or "deepseek-reasoner" in model


def is_snowflake(model: str) -> bool:
    model = model.lower()
    return "snowflake" in model


class APIWrapper(object):
    def __init__(self, runner):
        self.runner = runner
        self.default_lm_api_base = runner.config.get("default_lm_api_base", None)
        self.default_embedding_api_base = runner.config.get(
            "default_embedding_api_base", None
        )

    @freezeargs
    def gen_embedding(self, model: str, input: List[str]) -> List[float]:
        """
        A cached wrapper around litellm.embedding function.

        This function uses LRU (Least Recently Used) cache to store and retrieve
        embeddings for given inputs. It can significantly speed up repeated calls
        with the same model and input.

        Args:
            model (str): The name of the embedding model to use.
            input (str): The input text to generate an embedding for.

        Returns:
            List[float]: The embedding vector as a list of floats.

        Note:
            The cache size is set to 1000. Adjust this value based on your memory
            constraints and usage patterns.
        """
        # Create a unique key for the cache
        key = hashlib.md5(f"{model}_{input}".encode()).hexdigest()
        input = json.loads(input)

        # If the model starts with "gpt" and there is no openai key, prefix the model with "azure"
        if (
            model.startswith("text-embedding")
            and not os.environ.get("OPENAI_API_KEY")
            and self.runner.config.get("from_docwrangler", False)
        ):
            model = "azure/" + model

        with cache as c:
            # Try to get the result from cache
            result = c.get(key)
            if result is None:
                # If not in cache, compute the embedding
                if not isinstance(input[0], str):
                    input = [json.dumps(item) for item in input]

                input = [item if item else "None" for item in input]

                # FIXME: Should we use a different limit for embedding?
                self.runner.blocking_acquire("embedding_call", weight=1)
                if self.runner.is_cancelled:
                    raise asyncio.CancelledError("Operation was cancelled")

                extra_kwargs = {}
                if self.default_embedding_api_base:
                    extra_kwargs["api_base"] = self.default_embedding_api_base

                result = embedding(model=model, input=input, **extra_kwargs)
                # Cache the result
                c.set(key, result)

        return result

    def call_llm_batch(
        self,
        model: str,
        op_type: str,
        messages: List[Dict[str, str]],
        output_schema: Dict[str, str],
        verbose: bool = False,
        timeout_seconds: int = 120,
        max_retries_per_timeout: int = 2,
        bypass_cache: bool = False,
        litellm_completion_kwargs: Dict[str, Any] = {},
        op_config: Dict[str, Any] = {},
    ) -> LLMResult:
        # Turn the output schema into a list of schemas
        output_schema = convert_dict_schema_to_list_schema(output_schema)

        # Invoke the LLM call
        return self.call_llm(
            model,
            op_type,
            messages,
            output_schema,
            verbose=verbose,
            timeout_seconds=timeout_seconds,
            max_retries_per_timeout=max_retries_per_timeout,
            bypass_cache=bypass_cache,
            litellm_completion_kwargs=litellm_completion_kwargs,
            op_config=op_config,
        )

    def _cached_call_llm(
        self,
        cache_key: str,
        model: str,
        op_type: str,
        messages: List[Dict[str, str]],
        output_schema: Dict[str, str],
        tools: Optional[str] = None,
        scratchpad: Optional[str] = None,
        validation_config: Optional[Dict[str, Any]] = None,
        gleaning_config: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
        bypass_cache: bool = False,
        initial_result: Optional[Any] = None,
        litellm_completion_kwargs: Dict[str, Any] = {},
        op_config: Dict[str, Any] = {},
    ) -> LLMResult:
        """
        Cached version of the call_llm function.

        This function serves as a cached wrapper around _call_llm_with_cache. It uses
        the @freezeargs decorator to ensure immutable arguments and @functools.lru_cache
        for caching results.

        Args:
            cache_key (str): A unique key for caching.
            model (str): The model name.
            op_type (str): The operation type.
            messages (List[Dict[str, str]]): The messages to send to the LLM.
            output_schema (Dict[str, str]): The output schema dictionary.
            tools (Optional[str]): The tools to pass to the LLM.
            scratchpad (Optional[str]): The scratchpad to use for the operation.
            validation_config (Optional[Dict[str, Any]]): The validation configuration.
            gleaning_config (Optional[Dict[str, Any]]): The gleaning configuration.
            verbose (bool): Whether to print verbose output.
            bypass_cache (bool): Whether to bypass the cache.
            initial_result (Optional[Any]): The initial result to use for the operation, if exists.
            op_config (Dict[str, Any]): The operation configuration.
        Returns:
            LLMResult: The response from _call_llm_with_cache.
        """
        if (
            model.startswith("gpt")
            and not os.environ.get("OPENAI_API_KEY")
            and self.runner.config.get("from_docwrangler", False)
        ):
            model = "azure/" + model

        total_cost = 0.0
        validated = False
        with cache as c:
            response = c.get(cache_key)
            if response is not None and not bypass_cache:
                validated = True
            else:
                if not initial_result:
                    response = self._call_llm_with_cache(
                        model,
                        op_type,
                        messages,
                        output_schema,
                        tools,
                        scratchpad,
                        litellm_completion_kwargs,
                        op_config=op_config,
                    )
                    total_cost += completion_cost(response)
                else:
                    response = initial_result

                if gleaning_config:
                    # Retry gleaning prompt + regular LLM
                    num_gleaning_rounds = gleaning_config.get("num_rounds", 2)

                    parsed_output = (
                        self.parse_llm_response(response, output_schema, tools)[0]
                        if isinstance(response, ModelResponse)
                        else response
                    )

                    validator_messages = (
                        [
                            {
                                "role": "system",
                                "content": f"You are a helpful assistant, intelligently processing data. This is a {op_type} operation.",
                            }
                        ]
                        + messages
                        + [{"role": "assistant", "content": json.dumps(parsed_output)}]
                    )

                    for rnd in range(num_gleaning_rounds):
                        # Break early if gleaning condition is not met
                        if not self.should_glean(gleaning_config, parsed_output):
                            break
                        # Prepare validator prompt
                        validator_prompt = strict_render(
                            gleaning_config["validation_prompt"],
                            {"output": parsed_output},
                        )
                        self.runner.blocking_acquire("llm_call", weight=1)
                        # Approx the number of tokens in the messages
                        approx_num_tokens = approx_count_tokens(
                            validator_messages
                            + [{"role": "user", "content": validator_prompt}]
                        )
                        self.runner.blocking_acquire(
                            "llm_tokens", weight=approx_num_tokens
                        )

                        # Get params for should refine
                        should_refine_params = {
                            "type": "object",
                            "properties": {
                                "should_refine": {"type": "boolean"},
                                "improvements": {"type": "string"},
                            },
                            "required": ["should_refine", "improvements"],
                        }
                        if "gemini" not in model:
                            should_refine_params["additionalProperties"] = False

                        # Add extra kwargs
                        extra_kwargs = {}
                        if self.default_lm_api_base:
                            extra_kwargs["api_base"] = self.default_lm_api_base
                        if is_snowflake(model):
                            extra_kwargs["allowed_openai_params"] = [
                                "tools",
                                "tool_choice",
                            ]

                        validator_response = completion(
                            model=gleaning_config.get("model", model),
                            messages=truncate_messages(
                                validator_messages
                                + [{"role": "user", "content": validator_prompt}],
                                model,
                            ),
                            tools=[
                                {
                                    "type": "function",
                                    "function": {
                                        "name": "should_refine_answer",
                                        "description": "Determine if the output should be refined based on the validation feedback",
                                        "strict": True,
                                        "parameters": should_refine_params,
                                        "additionalProperties": False,
                                    },
                                }
                            ],
                            tool_choice="required",
                            **litellm_completion_kwargs,
                            **extra_kwargs,
                        )
                        total_cost += completion_cost(validator_response)

                        # Parse the validator response
                        suggestion = json.loads(
                            validator_response.choices[0]
                            .message.tool_calls[0]
                            .function.arguments
                        )
                        if not suggestion["should_refine"]:
                            break

                        if verbose:
                            self.runner.console.log(
                                f"Validator improvements (gleaning round {rnd + 1}): {suggestion['improvements']}"
                            )

                        # Prompt for improvement
                        improvement_prompt = f"""Based on the validation feedback:

                        ```
                        {suggestion['improvements']}
                        ```

                        Please improve your previous response. Ensure that the output adheres to the required schema and addresses any issues raised in the validation."""
                        messages.append({"role": "user", "content": improvement_prompt})

                        # Call LLM again
                        response = self._call_llm_with_cache(
                            model,
                            op_type,
                            messages,
                            output_schema,
                            tools,
                            scratchpad,
                            litellm_completion_kwargs,
                            op_config=op_config,
                        )
                        parsed_output = self.parse_llm_response(
                            response, output_schema, tools
                        )[0]
                        validator_messages[-1] = {
                            "role": "assistant",
                            "content": json.dumps(parsed_output),
                        }

                        total_cost += completion_cost(response)

                    validated = True

                # If there's validation, handle it here
                elif validation_config:
                    num_tries = validation_config.get("num_retries", 2) + 1
                    validation_fn = validation_config.get("validation_fn")
                    val_rule = validation_config.get("val_rule")

                    # Try validation
                    i = 0
                    validation_result = False
                    while not validation_result and i < num_tries:
                        parsed_output, validation_result = validation_fn(response)
                        if validation_result:
                            validated = True
                            break

                        # Append the validation result to messages
                        messages.append(
                            {
                                "role": "assistant",
                                "content": json.dumps(parsed_output),
                            }
                        )
                        messages.append(
                            {
                                "role": "user",
                                "content": f"Your output {parsed_output} failed my validation rule: {str(val_rule)}\n\nPlease try again.",
                            }
                        )
                        self.runner.console.log(
                            f"[bold red]Validation failed:[/bold red] {val_rule}\n"
                            f"\t[yellow]Output:[/yellow] {parsed_output}\n"
                            f"\t({i + 1}/{num_tries})"
                        )
                        i += 1

                        response = self._call_llm_with_cache(
                            model,
                            op_type,
                            messages,
                            output_schema,
                            tools,
                            scratchpad,
                            litellm_completion_kwargs,
                            op_config=op_config,
                        )
                        total_cost += completion_cost(response)

                else:
                    # No validation, so we assume the result is valid
                    validated = True

                # Only set the cache if the result tool calls or output is not empty
                if validated:
                    c.set(cache_key, response)

        return LLMResult(response=response, total_cost=total_cost, validated=validated)

    def call_llm(
        self,
        model: str,
        op_type: str,
        messages: List[Dict[str, str]],
        output_schema: Dict[str, str],
        tools: Optional[List[Dict[str, str]]] = None,
        scratchpad: Optional[str] = None,
        timeout_seconds: int = 120,
        max_retries_per_timeout: int = 2,
        validation_config: Optional[Dict[str, Any]] = None,
        gleaning_config: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
        bypass_cache: bool = False,
        initial_result: Optional[Any] = None,
        litellm_completion_kwargs: Dict[str, Any] = {},
        op_config: Dict[str, Any] = {},
    ) -> LLMResult:
        """
        Wrapper function that uses caching for LLM calls.

        This function generates a cache key and calls the cached version of call_llm.
        It retries the call if it times out after 60 seconds.

        Args:
            model (str): The model name.
            op_type (str): The operation type.
            messages (List[Dict[str, str]]): The messages to send to the LLM.
            output_schema (Dict[str, str]): The output schema dictionary.
            tools (Optional[List[Dict[str, str]]]): The tools to pass to the LLM.
            scratchpad (Optional[str]): The scratchpad to use for the operation.
            timeout_seconds (int): The timeout for the LLM call.
            max_retries_per_timeout (int): The maximum number of retries per timeout.
            bypass_cache (bool): Whether to bypass the cache.
            initial_result (Optional[Any]): The initial result to use for the operation, if exists.
        Returns:
            LLMResult: The result from the cached LLM call.

        Raises:
            TimeoutError: If the call times out after retrying.
        """
        key = cache_key(
            model,
            op_type,
            messages,
            output_schema,
            scratchpad,
            self.runner.config.get("system_prompt", {}),
            op_config,
        )

        max_retries = max_retries_per_timeout
        attempt = 0
        rate_limited_attempt = 0
        while attempt <= max_retries:
            try:
                output = timeout(timeout_seconds)(self._cached_call_llm)(
                    key,
                    model,
                    op_type,
                    messages,
                    output_schema,
                    json.dumps(tools) if tools else None,
                    scratchpad,
                    validation_config=validation_config,
                    gleaning_config=gleaning_config,
                    verbose=verbose,
                    bypass_cache=bypass_cache,
                    initial_result=initial_result,
                    litellm_completion_kwargs=litellm_completion_kwargs,
                    op_config=op_config,
                )
                # Log input and output if verbose
                if verbose:
                    # Truncate messages to 500 chars
                    messages_str = str(messages)
                    truncated_messages = (
                        messages_str[:500] + "..."
                        if len(messages_str) > 500
                        else messages_str
                    )

                    # Log with nice formatting
                    self.runner.console.print(
                        Panel(
                            Group(
                                Text("Input:", style="bold cyan"),
                                Text(truncated_messages),
                                Text("\nOutput:", style="bold cyan"),
                                Text(str(output)),
                            ),
                            title="[bold green]LLM Call Details[/bold green]",
                            border_style="green",
                        )
                    )

                return output
            except RateLimitError:
                # TODO: this is a really hacky way to handle rate limits
                # we should implement a more robust retry mechanism
                backoff_time = 4 * (2**rate_limited_attempt)  # Exponential backoff
                max_backoff = 120  # Maximum backoff time of 60 seconds
                sleep_time = min(backoff_time, max_backoff)
                self.runner.console.log(
                    f"[yellow]Rate limit hit. Retrying in {sleep_time:.2f} seconds...[/yellow]"
                )
                time.sleep(sleep_time)
                rate_limited_attempt += 1
            except APIConnectionError as e:
                self.runner.console.log(
                    f"[bold red]API connection error. Retrying...[/bold red] {e}"
                )
                time.sleep(1)
            except ServiceUnavailableError:
                self.runner.console.log(
                    "[bold red]Service unavailable. Retrying...[/bold red]"
                )
                time.sleep(1)
            except TimeoutError:
                if attempt == max_retries:
                    self.runner.console.log(
                        f"[bold red]LLM call timed out after {max_retries + 1} attempts[/bold red]"
                    )
                    # TODO: HITL
                    return LLMResult(response=None, total_cost=0.0, validated=False)
                attempt += 1

    def _call_llm_with_cache(
        self,
        model: str,
        op_type: str,
        messages: List[Dict[str, str]],
        output_schema: Dict[str, str],
        tools: Optional[str] = None,
        scratchpad: Optional[str] = None,
        litellm_completion_kwargs: Dict[str, Any] = {},
        op_config: Dict[str, Any] = {},
    ) -> Any:
        """
        Make an LLM call with caching.

        This function prepares the necessary parameters and makes a call to the LLM
        using the provided model, operation type, prompt, and output schema.

        Args:
            model (str): The model name.
            op_type (str): The operation type.
            messages (List[Dict[str, str]]): The messages to send to the LLM.
            output_schema (Dict[str, str]): The output schema dictionary.
            tools (Optional[str]): The tools to pass to the LLM.
            scratchpad (Optional[str]): The scratchpad to use for the operation.
        Returns:
            str: The response from the LLM.
        """
        props = {key: convert_val(value) for key, value in output_schema.items()}
        use_tools = True

        if (
            len(props) == 1
            and list(props.values())[0].get("type") == "string"
            and scratchpad is None
            and ("sagemaker" in model or is_deepseek_r1(model))
        ):
            use_tools = False

        if tools is None and use_tools:
            if scratchpad is not None:
                props["updated_scratchpad"] = {"type": "string"}

            parameters = {"type": "object", "properties": props}
            parameters["required"] = list(props.keys())

            # TODO: this is a hack to get around the fact that gemini doesn't support additionalProperties
            if "gemini" not in model and "claude" not in model:
                parameters["additionalProperties"] = False

            if is_snowflake(model):
                tools = [
                    {
                        "tool_spec": {
                            "type": "generic",
                            "name": "send_output",
                            "description": "Send output back to the user",
                            "input_schema": parameters,
                        }
                    }
                ]
            else:
                tools = [
                    {
                        "type": "function",
                        "function": {
                            "name": "send_output",
                            "description": "Send output back to the user",
                            "parameters": parameters,
                        },
                    }
                ]
            if "claude" not in model:
                tools[0]["additionalProperties"] = False
                tools[0]["strict"] = True

            tool_choice = {"type": "function", "function": {"name": "send_output"}}

        elif tools is not None:
            tools = json.loads(tools)
            tool_choice = (
                "required"
                if any(tool.get("required", False) for tool in tools)
                else "auto"
            )
            tools = [
                {"type": "function", "function": tool["function"]} for tool in tools
            ]

        else:
            tools = None
            tool_choice = None

        persona = self.runner.config.get("system_prompt", {}).get(
            "persona", "a helpful assistant"
        )
        dataset_description = self.runner.config.get("system_prompt", {}).get(
            "dataset_description", "a collection of unstructured documents"
        )
        parethetical_op_instructions = (
            "many inputs:one output" if op_type == "reduce" else "one input:one output"
        )

        # Different system prompts based on model type
        base_prompt = f"You are a {persona}, helping the user make sense of their data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as precisely and exhaustively (i.e., high recall) as possible."

        if "sagemaker" in model or is_deepseek_r1(model):
            system_prompt = base_prompt
        else:
            system_prompt = (
                base_prompt
                + " The result should be a structured output that you will send back to the user, with the `send_output` function. Do not influence your answers too much based on the `send_output` function parameter names; just use them to send the result back to the user."
            )

        if scratchpad:
            system_prompt += f"""

You are incrementally processing data across multiple batches. You will see:
1. The current batch of data to process
2. The intermediate output so far (what you returned last time)
3. A scratchpad for tracking additional state: {scratchpad}

IMPORTANT: Only use the scratchpad if your task specifically requires tracking items that appear multiple times across batches. If you only need to track distinct/unique items, leave the scratchpad empty and set updated_scratchpad to null.

The intermediate output contains the result that directly answers the user's task, for **all** the data processed so far, including the current batch. You must return this via the send_output function.

Example task that NEEDS scratchpad - counting words that appear >2 times:
- Call send_output with: {{"frequent_words": ["the", "and"]}} # Words seen 3+ times - this is your actual result
- Set updated_scratchpad to: {{"pending": {{"cat": 2, "dog": 1}}}} # Must track words seen 1-2 times

Example task that does NOT need scratchpad - collecting unique locations:
- Call send_output with: {{"locations": ["New York", "Paris"]}} # Just the unique items
- Set updated_scratchpad to: null # No need to track counts since we only want distinct items

As you process each batch:
1. Use both the previous output and scratchpad (if needed) to inform your processing
2. Call send_output with your result that combines the current batch with previous output
3. Set updated_scratchpad only if you need to track counts/frequencies between batches

If you use the scratchpad, keep it concise (~500 chars) and easily parsable using:
- Key-value pairs
- JSON-like format
- Simple counters/tallies

Your main result must be sent via send_output. The updated_scratchpad is only for tracking state between batches, and should be null unless you specifically need to track frequencies."""

        # Truncate messages if they exceed the model's context length
        messages_with_system_prompt = truncate_messages(
            [
                {
                    "role": "system",
                    "content": system_prompt,
                },
            ] + messages,
            model,
        )

        self.runner.blocking_acquire("llm_call", weight=1)

        # Approx the number of tokens in the messages
        approx_num_tokens = approx_count_tokens(messages)
        logger.info(f"Input tokens: {approx_num_tokens}")
        self.runner.blocking_acquire("llm_tokens", weight=approx_num_tokens)
        if self.runner.is_cancelled:
            raise asyncio.CancelledError("Operation was cancelled")

        extra_litellm_kwargs = {}
        extra_litellm_kwargs.update(litellm_completion_kwargs)
        if "n" in op_config.get("output", {}).keys():
            extra_litellm_kwargs["n"] = op_config["output"]["n"]
        if is_snowflake(model):
            extra_litellm_kwargs["allowed_openai_params"] = ["tools", "tool_choice"]
        if self.default_lm_api_base:
            extra_litellm_kwargs["api_base"] = self.default_lm_api_base

        if tools is not None:
            try:
                response = completion(
                    model=model,
                    messages=messages_with_system_prompt,
                    tools=tools,
                    tool_choice=tool_choice,
                    **extra_litellm_kwargs,
                )
            except Exception as e:
                # Check that there's a prefix for the model name if it's not a basic model
                if model not in BASIC_MODELS:
                    if "/" not in model:
                        raise ValueError(
                            f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}"
                        )
                raise e
        else:
            try:
                response = completion(
                    model=model,
                    messages=messages_with_system_prompt,
                    **extra_litellm_kwargs,
                )
            except Exception as e:
                # Check that there's a prefix for the model name if it's not a basic model
                if model not in BASIC_MODELS:
                    if "/" not in model:
                        raise ValueError(
                            f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}"
                        )
                raise e

        output_content = response.choices[0].message.content
        output_tokens = litellm.token_counter(output_content)
        logger.info(f"Output tokens: {output_tokens}")  # Output output token number

        return response

    def parse_llm_response(
        self,
        response: Any,
        schema: Dict[str, Any] = {},
        tools: Optional[List[Dict[str, str]]] = None,
        manually_fix_errors: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Parse the response from a language model.
        This function extracts the tool calls from the LLM response and returns the arguments
        """
        try:
            if not response:
                raise InvalidOutputError("No response from LLM", [{}], schema, [], [])

            # Go through each choice
            results = []
            for index in range(len(response.choices)):
                results.extend(
                    self._parse_llm_response_helper(response, schema, tools, index)
                )
            return results
        except InvalidOutputError as e:
            if manually_fix_errors:
                rprint(
                    f"[bold red]Could not parse LLM output:[/bold red] {e.message}\n"
                    f"\tExpected Schema: {e.expected_schema}\n"
                    f"\tPlease manually set this output."
                )
                rprint(
                    f"\n[bold yellow]LLM-Generated Response:[/bold yellow]\n{response}"
                )
                output = get_user_input_for_schema(schema)

                return [output]
            else:
                raise e

    def _parse_llm_response_helper(
        self,
        response: Any,
        schema: Dict[str, Any] = {},
        tools: Optional[List[Dict[str, str]]] = None,
        index: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Parse the response from a language model.

        This function extracts the tool calls from the LLM response and returns the arguments
        of any 'send_output' function calls as a list of dictionaries.

        Args:
            response (Any): The response object from the language model.
            schema (Optional[Dict[str, Any]]): The schema that was passed to the LLM.
            tools (Optional[List[Dict[str, str]]]): The tools that were passed to the LLM.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the parsed output.

        Raises:
            InvalidOutputError: If the response is not valid.
        """
        if is_snowflake(response.model):
            tool_calls = (
                [
                    ChatCompletionMessageToolCall(
                        function=Function(
                            name=content.get("tool_use", {}).get("name"),
                            arguments=content.get("tool_use", {}).get("input"),
                        )
                    )
                    for content in response.choices[index].message.content_list
                    if content.get("type") == "tool_use"
                ]
                if hasattr(response.choices[index].message, "content_list")
                else []
            )
        else:
            tool_calls = (
                response.choices[index].message.tool_calls
                if "tool_calls" in dir(response.choices[index].message)
                else []
            )

        # Check if there are no tools and the schema has a single key-value pair
        if not tools and len(schema) == 1 and not tool_calls:
            key = next(iter(schema))
            content = response.choices[index].message.content

            # Handle deepseek-r1 models' think tags
            if is_deepseek_r1(response.model):
                result = {}
                # Extract think content if present
                think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
                if think_match:
                    result["think"] = think_match.group(1).strip()
                    # Get the remaining content after </think>
                    main_content = re.split(r"</think>", content, maxsplit=1)[
                        -1
                    ].strip()
                    result[key] = main_content
                else:
                    # If no think tags, just use the content as is
                    result[key] = content
                return [result]

            # For other models, continue with existing behavior
            return [{key: content}]

        # Parse the response based on the provided tools
        if tools:
            # If custom tools are provided, parse accordingly
            tool_calls = response.choices[index].message.tool_calls
            results = []
            for tool_call in tool_calls:
                for tool in tools:
                    if tool_call.function.name == tool["function"]["name"]:
                        try:
                            function_args = (
                                json.loads(tool_call.function.arguments)
                                if isinstance(tool_call.function.arguments, str)
                                else tool_call.function.arguments
                            )
                        except json.JSONDecodeError:
                            return [{}]
                        # Execute the function defined in the tool's code
                        local_scope = {}
                        exec(tool["code"].strip(), globals(), local_scope)
                        function_result = local_scope[tool["function"]["name"]](
                            **function_args
                        )
                        function_args.update(function_result)
                        results.append(function_args)
            return results
        else:
            if not tool_calls:
                raise InvalidOutputError(
                    "No tool calls in LLM response", [{}], schema, response.choices, []
                )

            outputs = []
            for tool_call in tool_calls:
                if response.choices[index].finish_reason == "content_filter":
                    raise InvalidOutputError(
                        "Content filter triggered by LLM provider.",
                        "",
                        schema,
                        response.choices,
                        tools,
                    )
                try:
                    output_dict = (
                        json.loads(tool_call.function.arguments)
                        if isinstance(tool_call.function.arguments, str)
                        else tool_call.function.arguments
                    )
                    # Augment output_dict with empty values for any keys in the schema that are not in output_dict
                    for key in schema:
                        if key not in output_dict:
                            output_dict[key] = "Not found"

                    if "ollama" in response.model or "sagemaker" in response.model:
                        for key, value in output_dict.items():
                            if not isinstance(value, str):
                                continue
                            try:
                                output_dict[key] = ast.literal_eval(value)
                            except Exception:
                                try:
                                    if value.startswith("["):
                                        output_dict[key] = ast.literal_eval(value + "]")
                                    else:
                                        output_dict[key] = value
                                except Exception:
                                    pass
                    outputs.append(output_dict)
                except json.JSONDecodeError:
                    raise InvalidOutputError(
                        "Could not decode LLM JSON response",
                        [tool_call.function.arguments],
                        schema,
                        response.choices,
                        tools,
                    )
                except Exception as e:
                    raise InvalidOutputError(
                        f"Error parsing LLM response: {e}",
                        [tool_call.function.arguments],
                        schema,
                        response.choices,
                        tools,
                    )

            return outputs

        # message = response.choices[0].message
        # return [json.loads(message.content)]

    def validate_output(self, operation: Dict, output: Dict, console: Console) -> bool:
        """
        Validate the output against the specified validation rules in the operation.

        Args:
            operation (Dict): The operation dictionary containing validation rules.
            output (Dict): The output to be validated.
            console (Console): The console object for logging.

        Returns:
            bool: True if all validations pass, False otherwise.
        """
        if "validate" not in operation:
            return True
        for validation in operation["validate"]:
            try:
                if not safe_eval(validation, output):
                    console.log(f"[bold red]Validation failed:[/bold red] {validation}")
                    console.log(f"[yellow]Output:[/yellow] {output}")
                    return False
            except Exception as e:
                console.log(f"[bold red]Validation error:[/bold red] {str(e)}")
                console.log(f"[yellow]Output:[/yellow] {output}")
                return False
        return True

    def should_glean(self, gleaning_config: Optional[Dict[str, Any]], output: Dict[str, Any]) -> bool:
        """Determine whether to execute a gleaning round based on an optional conditional expression.

        If ``gleaning_config`` contains an ``"if"`` key, its value is treated as a Python
        boolean expression that will be evaluated with the current ``output`` bound to the
        name ``output`` using :pyfunc:`safe_eval`. When the expression evaluates to
        ``True`` the gleaning round proceeds. If it evaluates to ``False`` (or raises an
        exception) the gleaning loop should terminate early.

        If no ``"if"`` key is present the method defaults to returning ``True`` so that
        gleaning proceeds normally.
        """
        # No gleaning_config or no conditional -> always glean
        if not gleaning_config or "if" not in gleaning_config:
            return True

        condition = gleaning_config.get("if")
        if not isinstance(condition, str):
            raise ValueError(f"Invalid gleaning condition (should be a string): {condition}")