Skip to content

Commit 2ee73ea

Browse files
ValbuenaVC and Victor Valbuena authored
FEAT: Jailbreak Scenario Expansion (#1340)
Co-authored-by: Victor Valbuena <vvalbuena@microsoft.com>
1 parent 947d43a commit 2ee73ea

3 files changed

Lines changed: 296 additions & 55 deletions

File tree

pyrit/datasets/jailbreak/text_jailbreak.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,12 @@ def __init__(
104104
self.template.value = self.template.render_template_value_silent(**kwargs)
105105

106106
@classmethod
107-
def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
107+
def get_jailbreak_templates(cls, num_templates: Optional[int] = None) -> List[str]:
108108
"""
109109
Retrieve jailbreak template names from the JAILBREAK_TEMPLATES_PATH, optionally limited to a random selection.
110110
111111
Args:
112-
n (int, optional): Number of jailbreak templates to return. None to get all.
112+
num_templates (int, optional): Number of jailbreak templates to return. None to get all.
113113
114114
Returns:
115115
List[str]: List of jailbreak template file names.
@@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
122122
if not jailbreak_template_names:
123123
raise ValueError("No jailbreak templates found in the jailbreak directory")
124124

125-
if n:
126-
if n > len(jailbreak_template_names):
125+
if num_templates:
126+
if num_templates > len(jailbreak_template_names):
127127
raise ValueError(
128-
f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
128+
f"Attempted to pull {num_templates} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
129129
)
130-
jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
130+
jailbreak_template_names = random.choices(jailbreak_template_names, k=num_templates)
131131
return jailbreak_template_names
132132

133133
def get_jailbreak_system_prompt(self) -> str:

pyrit/scenario/scenarios/airt/jailbreak.py

Lines changed: 125 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,26 @@
33

44
import os
55
from pathlib import Path
6-
from typing import List, Optional
6+
from typing import List, Optional, Union
77

88
from pyrit.common import apply_defaults
99
from pyrit.datasets import TextJailBreak
1010
from pyrit.executor.attack.core.attack_config import (
1111
AttackConverterConfig,
1212
AttackScoringConfig,
1313
)
14+
from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
1415
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
16+
from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths
17+
from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack
1518
from pyrit.models import SeedAttackGroup
1619
from pyrit.prompt_converter import TextJailbreakConverter
1720
from pyrit.prompt_normalizer import PromptConverterConfiguration
1821
from pyrit.prompt_target import OpenAIChatTarget
1922
from pyrit.scenario.core.atomic_attack import AtomicAttack
2023
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
2124
from pyrit.scenario.core.scenario import Scenario
22-
from pyrit.scenario.core.scenario_strategy import (
23-
ScenarioStrategy,
24-
)
25+
from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
2526
from pyrit.score import (
2627
SelfAskRefusalScorer,
2728
TrueFalseInverterScorer,
@@ -31,13 +32,41 @@
3132

3233
class JailbreakStrategy(ScenarioStrategy):
3334
"""
34-
Strategy for single-turn jailbreak attacks.
35+
Strategy for jailbreak attacks.
36+
37+
The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
38+
expose an obvious way of using this scenario without worrying about additional tweaks and changes
39+
to the prompt.
3540
36-
There is currently only one, running all jailbreaks.
41+
COMPLEX strategies use additional techniques to enhance the jailbreak like modifying the
42+
system prompt or probing the target model for an additional vulnerability (e.g. the SkeletonKeyAttack).
43+
They are meant to provide a sense of how well a jailbreak generalizes to slight changes in the delivery
44+
method.
3745
"""
3846

47+
# Aggregate members (special markers that expand to strategies with matching tags)
3948
ALL = ("all", {"all"})
40-
PYRIT = ("pyrit", {"pyrit"})
49+
SIMPLE = ("simple", {"simple"})
50+
COMPLEX = ("complex", {"complex"})
51+
52+
# Simple strategies
53+
PromptSending = ("prompt_sending", {"simple"})
54+
55+
# Complex strategies
56+
ManyShot = ("many_shot", {"complex"})
57+
SkeletonKey = ("skeleton", {"complex"})
58+
RolePlay = ("role_play", {"complex"})
59+
60+
@classmethod
def get_aggregate_tags(cls) -> set[str]:
    """
    Return every tag that acts as an aggregate marker for this strategy enum.

    Returns:
        set[str]: The base class's aggregate tags combined with "simple" and "complex".
    """
    # Start from whatever the parent class already treats as aggregates,
    # then layer the scenario-specific grouping tags on top.
    inherited_tags = super().get_aggregate_tags()
    scenario_tags = {"simple", "complex"}
    return inherited_tags | scenario_tags
4170

4271

4372
class Jailbreak(Scenario):
@@ -67,9 +96,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
6796
Get the default strategy used when no strategies are specified.
6897
6998
Returns:
70-
ScenarioStrategy: JailbreakStrategy.ALL.
99+
ScenarioStrategy: JailbreakStrategy.SIMPLE.
71100
"""
72-
return JailbreakStrategy.ALL
101+
return JailbreakStrategy.SIMPLE
73102

74103
@classmethod
75104
def required_datasets(cls) -> list[str]:
@@ -93,7 +122,9 @@ def __init__(
93122
objective_scorer: Optional[TrueFalseScorer] = None,
94123
include_baseline: bool = False,
95124
scenario_result_id: Optional[str] = None,
96-
n_jailbreaks: Optional[int] = 3,
125+
num_templates: Optional[int] = None,
126+
num_attempts: int = 1,
127+
jailbreak_names: List[str] = [],
97128
) -> None:
98129
"""
99130
Initialize the jailbreak scenario.
@@ -104,13 +135,45 @@ def __init__(
104135
include_baseline (bool): Whether to include a baseline atomic attack that sends all
105136
objectives without modifications. Defaults to False.
106137
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
107-
n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
138+
num_templates (Optional[int]): Choose num_templates random jailbreaks rather than using all of them.
139+
num_attempts (int): Number of times to try each jailbreak. Defaults to 1.
140+
jailbreak_names (List[str]): List of jailbreak names from the template list under datasets
141+
to use.
142+
143+
Raises:
144+
ValueError: If both jailbreak_names and num_templates are provided, as random selection
145+
is incompatible with a predetermined list.
146+
ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed
147+
templates.
148+
108149
"""
150+
if jailbreak_names and num_templates:
151+
raise ValueError(
152+
"Please provide only one of `num_templates` (random selection) or `jailbreak_names` (specific selection)."
153+
)
154+
109155
if not objective_scorer:
110156
objective_scorer = self._get_default_objective_scorer()
111157
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)
112158

113-
self._n = n_jailbreaks
159+
self._num_templates = num_templates
160+
self._num_attempts = num_attempts
161+
162+
# Note that num_templates and jailbreak_names are mutually exclusive.
163+
# If self._num_templates is None, then this returns all discoverable jailbreak templates.
164+
# If self._num_templates has some value, then all_templates is a subset of all available
165+
# templates, but jailbreak_names is guaranteed to be [], so diff is the empty set.
166+
all_templates = TextJailBreak.get_jailbreak_templates(num_templates=self._num_templates)
167+
168+
# Example: if jailbreak_names is {'a', 'b', 'c'}, and all_templates is {'b', 'c', 'd'},
169+
# then diff = {'a'}, which raises the error as 'a' was not discovered in all_templates.
170+
diff = set(jailbreak_names) - set(all_templates)
171+
if len(diff) > 0:
172+
raise ValueError(f"Error: could not find templates `{diff}`!")
173+
174+
# If jailbreak_names has some value, then `if jailbreak_names` passes, and self._jailbreaks
175+
# is set to jailbreak_names. Otherwise we use all_templates.
176+
self._jailbreaks = jailbreak_names if jailbreak_names else all_templates
114177

115178
super().__init__(
116179
name="Jailbreak",
@@ -146,6 +209,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
146209
)
147210
return refusal_scorer
148211

212+
def _get_default_adversarial_target(self) -> OpenAIChatTarget:
    """
    Build the default adversarial chat target from environment configuration.

    The endpoint, API key and model name are each looked up in the corresponding
    AZURE_OPENAI_GPT4O_UNSAFE_CHAT_* environment variable; a variable that is not
    set is forwarded as None.

    Returns:
        OpenAIChatTarget: Target built from those settings with a temperature of 1.2.
    """
    # Resolve each connection setting up front so the constructor call stays short.
    endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT")
    api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY")
    model_name = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL")

    adversarial_target = OpenAIChatTarget(
        endpoint=endpoint,
        api_key=api_key,
        model_name=model_name,
        temperature=1.2,
    )
    return adversarial_target
225+
149226
def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
150227
"""
151228
Resolve seed groups from dataset configuration.
@@ -161,23 +238,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
161238

162239
return list(seed_groups)
163240

164-
def _get_all_jailbreak_templates(self) -> List[str]:
165-
"""
166-
Retrieve all available jailbreak templates.
167-
168-
Returns:
169-
List[str]: List of jailbreak template file names.
170-
"""
171-
if not self._n:
172-
return TextJailBreak.get_all_jailbreak_templates()
173-
else:
174-
return TextJailBreak.get_all_jailbreak_templates(n=self._n)
175-
176-
async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
241+
async def _get_atomic_attack_from_strategy_async(
242+
self, *, strategy: str, jailbreak_template_name: str
243+
) -> AtomicAttack:
177244
"""
178245
Create an atomic attack for a specific strategy and jailbreak template.
179246
180247
Args:
248+
strategy (str): JailbreakStrategy to use.
181249
jailbreak_template_name (str): Name of the jailbreak template file.
182250
183251
Returns:
@@ -202,12 +270,28 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
202270
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
203271
)
204272

205-
# Create the attack
206-
attack = PromptSendingAttack(
207-
objective_target=self._objective_target,
208-
attack_scoring_config=self._scorer_config,
209-
attack_converter_config=converter_config,
210-
)
273+
attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None
274+
args = {
275+
"objective_target": self._objective_target,
276+
"attack_scoring_config": self._scorer_config,
277+
"attack_converter_config": converter_config,
278+
}
279+
match strategy:
280+
case "many_shot":
281+
attack = ManyShotJailbreakAttack(**args)
282+
case "prompt_sending":
283+
attack = PromptSendingAttack(**args)
284+
case "skeleton":
285+
attack = SkeletonKeyAttack(**args)
286+
case "role_play":
287+
args["adversarial_chat"] = self._get_default_adversarial_target()
288+
args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
289+
attack = RolePlayAttack(**args)
290+
case _:
291+
raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")
292+
293+
if not attack:
294+
raise ValueError(f"Attack cannot be None!")
211295

212296
# Extract template name without extension for the atomic attack name
213297
template_name = Path(jailbreak_template_name).stem
@@ -230,11 +314,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
230314
# Retrieve seed prompts based on selected strategies
231315
self._seed_groups = self._resolve_seed_groups()
232316

233-
# Get all jailbreak template names
234-
jailbreak_template_names = self._get_all_jailbreak_templates()
317+
strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
318+
composites=self._scenario_composites, strategy_type=JailbreakStrategy
319+
)
235320

236-
for template_name in jailbreak_template_names:
237-
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
238-
atomic_attacks.append(atomic_attack)
321+
for strategy in strategies:
322+
for template_name in self._jailbreaks:
323+
for _ in range(0, self._num_attempts):
324+
atomic_attack = await self._get_atomic_attack_from_strategy_async(
325+
strategy=strategy, jailbreak_template_name=template_name
326+
)
327+
atomic_attacks.append(atomic_attack)
239328

240329
return atomic_attacks

0 commit comments

Comments
 (0)