-
Notifications
You must be signed in to change notification settings - Fork 795
Expand file tree
/
Copy pathattack_config.py
More file actions
228 lines (180 loc) · 9.58 KB
/
Copy pathattack_config.py
File metadata and controls
228 lines (180 loc) · 9.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
from dataclasses import dataclass, field
from pathlib import Path
from pyrit.common.deprecation import print_deprecation_message
from pyrit.executor.core import StrategyConverterConfig
from pyrit.models import JsonSchemaDefinition, SeedPrompt
from pyrit.prompt_target import PromptTarget
from pyrit.score import Scorer, TrueFalseScorer
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default first message sent to the adversarial chat when there is no objective-target
# response yet. It is a template that is rendered with ``{{ objective }}``. Used as the
# default value of ``AttackAdversarialConfig.first_message``.
DEFAULT_ADVERSARIAL_FIRST_MESSAGE = "Generate your first message to achieve: {{ objective }}"

# Default template for the per-turn message handed to the adversarial chat. It renders the
# objective target's latest text response. Templates may also reference ``{{ objective }}``,
# ``{{ score.score_value }}`` / ``{{ score.score_rationale }}``, and any data-type bucket on
# ``message`` (e.g. ``{{ message.image_path.converted_value }}``). Used as the default value
# of ``AttackAdversarialConfig.adversarial_prompt_template``.
DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE = "{{ message.text.converted_value }}"
def resolve_adversarial_json_schema(
    *,
    system_prompt: SeedPrompt | None,
    first_message: SeedPrompt | None,
) -> JsonSchemaDefinition | None:
    """
    Resolve the single adversarial-chat response JSON schema from a pair of prompts.

    The schema may be declared on either the adversarial system prompt or the first message
    (via ``response_json_schema`` / ``response_json_schema_name`` in YAML), but not both —
    declaring it twice is ambiguous about which one drives the response shape.

    Args:
        system_prompt: The resolved adversarial system-prompt SeedPrompt, or None.
        first_message: The resolved adversarial first-message SeedPrompt, or None.

    Returns:
        The declared schema, or None when neither prompt declares one.

    Raises:
        ValueError: If both prompts declare a ``response_json_schema``.
    """
    system_schema = system_prompt.response_json_schema if system_prompt is not None else None
    first_message_schema = first_message.response_json_schema if first_message is not None else None

    if system_schema is not None and first_message_schema is not None:
        raise ValueError(
            "Both the adversarial system prompt and first message declare a response_json_schema; "
            "set the schema on only one of them."
        )

    # "Declared" means "is not None" everywhere in this function, so pick the result the same
    # way. Using ``or`` here would drop a declared-but-falsy schema (e.g. an empty mapping)
    # and report it as undeclared.
    return system_schema if system_schema is not None else first_message_schema
@dataclass
class AttackAdversarialConfig:
    """
    Settings for attacks that drive an adversarial chat target.

    Groups the adversarial chat target together with the prompts used to steer it: the
    system prompt, the first message, and the per-turn prompt template.
    """

    _DEFAULT_SEED_PROMPT = ""

    # The adversarial chat target used by the attack.
    target: PromptTarget

    # YAML file holding the system prompt for the adversarial chat target.
    # Deprecated: pass ``system_prompt`` (an inline string or SeedPrompt) instead.
    system_prompt_path: str | Path | None = None

    # Message sent to the adversarial chat before any objective-target response exists
    # (supports the {{ objective }} template variable). None is allowed for strategies
    # that do not use a first message.
    first_message: str | SeedPrompt | None = DEFAULT_ADVERSARIAL_FIRST_MESSAGE

    # Per-turn template that turns the objective target's latest response into the text
    # given to the adversarial chat. It receives ``objective``, ``score``, and a
    # data-type-bucketed ``message`` view (e.g. {{ message.text.converted_value }}).
    adversarial_prompt_template: str | SeedPrompt | None = DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE

    # Adversarial chat system prompt, given as an inline Jinja template string or as a
    # SeedPrompt. Wins over ``system_prompt_path`` when both are set.
    system_prompt: str | SeedPrompt | None = None

    def __post_init__(self) -> None:
        """Warn when the deprecated ``system_prompt_path`` field is set."""
        if self.system_prompt_path is None:
            return

        print_deprecation_message(
            old_item="AttackAdversarialConfig.system_prompt_path",
            new_item="AttackAdversarialConfig.system_prompt",
            removed_in="0.17.0",
        )

        if self.system_prompt is None:
            return

        # Both were supplied: tell the caller which one is actually used.
        logger.warning(
            "Both 'system_prompt' and 'system_prompt_path' are set on AttackAdversarialConfig; "
            "'system_prompt' takes precedence and 'system_prompt_path' is ignored."
        )

    def get_json_schema(self) -> JsonSchemaDefinition | None:
        """
        Return the adversarial-chat response JSON schema declared on this config.

        Only ``system_prompt`` and ``first_message`` values that are ``SeedPrompt``
        instances are inspected for ``response_json_schema``. Inline strings and
        ``system_prompt_path`` carry no schema and are ignored here; for those, the schema
        is resolved from the effective system prompt at attack-construction time.

        Returns:
            The declared schema, or None when neither prompt declares one.

        Raises:
            ValueError: If both ``system_prompt`` and ``first_message`` declare a schema.
        """
        system_seed: SeedPrompt | None = None
        if isinstance(self.system_prompt, SeedPrompt):
            system_seed = self.system_prompt

        first_seed: SeedPrompt | None = None
        if isinstance(self.first_message, SeedPrompt):
            first_seed = self.first_message

        return resolve_adversarial_json_schema(system_prompt=system_seed, first_message=first_seed)
def resolve_adversarial_system_prompt(
    *,
    config: AttackAdversarialConfig,
    default_system_prompt_path: str | Path,
    required_parameters: list[str],
    error_message: str | None = None,
) -> SeedPrompt:
    """
    Pick the adversarial system-prompt ``SeedPrompt`` a strategy should use.

    Sources are consulted in this order:
        1. ``config.system_prompt`` (inline string or SeedPrompt), if provided.
        2. ``config.system_prompt_path`` (deprecated), if provided.
        3. ``default_system_prompt_path``.

    An inline string is trusted: it is wrapped in a Jinja ``SeedPrompt`` whose declared
    parameters are a copy of ``required_parameters``. A ``SeedPrompt`` passed in explicitly
    is checked against ``required_parameters`` here; YAML files are loaded through
    ``SeedPrompt.from_yaml_with_required_parameters`` with the same requirements.

    Args:
        config: The adversarial configuration to resolve the system prompt from.
        default_system_prompt_path: YAML path used when the config sets neither an inline
            prompt nor a path.
        required_parameters: Parameter names the resolved template must support.
        error_message: Optional custom error message for validation failures.

    Returns:
        The resolved adversarial system-prompt SeedPrompt.

    Raises:
        ValueError: If an explicitly provided SeedPrompt is missing required parameters.
    """
    provided = config.system_prompt

    # No inline prompt: load from the (deprecated) configured path or the strategy default.
    if provided is None:
        return SeedPrompt.from_yaml_with_required_parameters(
            template_path=config.system_prompt_path or default_system_prompt_path,
            required_parameters=required_parameters,
            error_message=error_message,
        )

    # Inline strings are trusted — declare all required params so Jinja rendering works.
    if not isinstance(provided, SeedPrompt):
        return SeedPrompt(
            value=provided,
            is_jinja_template=True,
            parameters=list(required_parameters),
        )

    # An explicit SeedPrompt must itself declare every required parameter.
    declared_names = provided.parameters or []
    absent = [name for name in required_parameters if name not in declared_names]
    if absent:
        raise ValueError(error_message or f"Adversarial system prompt is missing required parameters: {absent}")
    return provided
@dataclass
class AttackScoringConfig:
    """
    Scoring configuration for evaluating attack effectiveness.

    This class defines the scoring components used to evaluate attack effectiveness,
    detect refusals, and perform auxiliary scoring operations.
    """

    # Primary scorer for evaluating attack effectiveness
    objective_scorer: TrueFalseScorer | None = None

    # Refusal scorer for detecting refusals or non-compliance
    refusal_scorer: TrueFalseScorer | None = None

    # Additional scorers for auxiliary metrics or custom evaluations
    auxiliary_scorers: list[Scorer] = field(default_factory=list)

    # Whether to use scoring results as feedback for iterative attacks
    use_score_as_feedback: bool = True

    def __post_init__(self) -> None:
        """
        Validate configuration values.

        Raises:
            ValueError: If the objective or refusal scorers are not of type TrueFalseScorer.
        """
        # "Provided" means "not None". A truthiness test would let a falsy value of the
        # wrong type (e.g. an empty list or 0) slip past the type check unvalidated.
        if self.objective_scorer is not None and not isinstance(self.objective_scorer, TrueFalseScorer):
            raise ValueError("Objective scorer must be a TrueFalseScorer")

        if self.refusal_scorer is not None and not isinstance(self.refusal_scorer, TrueFalseScorer):
            raise ValueError("Refusal scorer must be a TrueFalseScorer")
@dataclass
class AttackConverterConfig(StrategyConverterConfig):
    """
    Prompt-converter configuration for attacks.

    Describes the converter configurations that transform prompts during the attack
    process, for both requests and responses. This subclass declares no fields of its
    own; everything comes from ``StrategyConverterConfig``.
    """