AgentLab/src/agentlab/agents/visual_agent/visual_agent_prompts.py at a5a8ef4e58ef9407b0e0845bc9cdd2bf598b49ff · ServiceNow/AgentLab · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""
Prompt builder for GenericAgent

It is based on the dynamic_prompting module from the agentlab package.
"""

import logging
from dataclasses import dataclass
import bgym

from browsergym.core.action.base import AbstractActionSet

from agentlab.agents import dynamic_prompting as dp
from agentlab.llm.llm_utils import BaseMessage, HumanMessage, image_to_jpg_base64_url


@dataclass
class PromptFlags(dp.Flags):
    """
    A class to represent various flags used to control features in an application.
    """

    obs: dp.ObsFlags = None
    action: dp.ActionFlags = None
    use_thinking: bool = True
    use_concrete_example: bool = False
    use_abstract_example: bool = True
    enable_chat: bool = False
    extra_instructions: str | None = None


class SystemPrompt(dp.PromptElement):
    _prompt = """\
You are an agent trying to solve a web task based on the content of the page and
user instructions. You can interact with the page and explore, and send messages to the user. Each time you
submit an action it will be sent to the browser and you will receive a new page."""


def make_instructions(obs: dict, from_chat: bool, extra_instructions: str | None):
    """Convenient wrapper to extract instructions from either goal or chat"""
    if from_chat:
        instructions = dp.ChatInstructions(
            obs["chat_messages"], extra_instructions=extra_instructions
        )
    else:
        if sum([msg["role"] == "user" for msg in obs.get("chat_messages", [])]) > 1:
            logging.warning(
                "Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`."
            )
        instructions = dp.GoalInstructions(
            obs["goal_object"], extra_instructions=extra_instructions
        )
    return instructions


class History(dp.PromptElement):
    """
    Format the actions and thoughts of previous steps."""

    def __init__(self, actions, thoughts) -> None:
        super().__init__()
        prompt_elements = []
        for i, (action, thought) in enumerate(zip(actions, thoughts)):
            prompt_elements.append(
                f"""
## Step {i}
### Thoughts:
{thought}
### Action:
{action}
"""
            )
        self._prompt = "\n".join(prompt_elements) + "\n"


class Observation(dp.PromptElement):
    """Observation of the current step.

    Contains the html, the accessibility tree and the error logs.
    """

    def __init__(self, obs, flags: dp.ObsFlags) -> None:
        super().__init__()
        self.flags = flags
        self.obs = obs

        # for a multi-tab browser, we need to show the current tab
        self.tabs = dp.Tabs(
            obs,
            visible=lambda: flags.use_tabs,
            prefix="## ",
        )

        # if an error is present, we need to show it
        self.error = dp.Error(
            obs["last_action_error"],
            visible=lambda: flags.use_error_logs and obs["last_action_error"],
            prefix="## ",
        )

    @property
    def _prompt(self) -> str:
        return f"""
# Observation of current step:
{self.tabs.prompt}{self.error.prompt}

"""

    def add_screenshot(self, prompt: BaseMessage) -> BaseMessage:
        if self.flags.use_screenshot:
            if self.flags.use_som:
                screenshot = self.obs["screenshot_som"]
                prompt.add_text(
                    "\n## Screenshot:\nHere is a screenshot of the page, it is annotated with bounding boxes and corresponding bids:"
                )
            else:
                screenshot = self.obs["screenshot"]
                prompt.add_text("\n## Screenshot:\nHere is a screenshot of the page:")
            img_url = image_to_jpg_base64_url(screenshot)
            prompt.add_image(img_url, detail=self.flags.openai_vision_detail)
        return prompt


class MainPrompt(dp.PromptElement):

    def __init__(
        self,
        action_set: AbstractActionSet,
        obs: dict,
        actions: list[str],
        thoughts: list[str],
        flags: PromptFlags,
    ) -> None:
        super().__init__()
        self.flags = flags
        self.history = History(actions, thoughts)
        self.instructions = make_instructions(obs, flags.enable_chat, flags.extra_instructions)
        self.obs = Observation(obs, self.flags.obs)

        self.action_prompt = dp.ActionPrompt(action_set, action_flags=flags.action)
        self.think = dp.Think(visible=lambda: flags.use_thinking)

    @property
    def _prompt(self) -> HumanMessage:
        prompt = HumanMessage(self.instructions.prompt)
        prompt.add_text(
            f"""\
{self.obs.prompt}\
{self.history.prompt}\
{self.action_prompt.prompt}\
{self.think.prompt}\
"""
        )

        if self.flags.use_abstract_example:
            prompt.add_text(
                f"""
# Abstract Example

Here is an abstract version of the answer with description of the content of
each tag. Make sure you follow this structure, but replace the content with your
answer:
{self.think.abstract_ex}\
{self.action_prompt.abstract_ex}\
"""
            )

        if self.flags.use_concrete_example:
            prompt.add_text(
                f"""
# Concrete Example

Here is a concrete example of how to format your answer.
Make sure to follow the template with proper tags:
{self.think.concrete_ex}\
{self.action_prompt.concrete_ex}\
"""
            )
        return self.obs.add_screenshot(prompt)

    def _parse_answer(self, text_answer):
        ans_dict = {}
        ans_dict.update(self.think.parse_answer(text_answer))
        ans_dict.update(self.action_prompt.parse_answer(text_answer))
        return ans_dict