33
44import os
55from pathlib import Path
6- from typing import List , Optional
6+ from typing import List , Optional , Union
77
88from pyrit .common import apply_defaults
99from pyrit .datasets import TextJailBreak
1010from pyrit .executor .attack .core .attack_config import (
1111 AttackConverterConfig ,
1212 AttackScoringConfig ,
1313)
14+ from pyrit .executor .attack .single_turn .many_shot_jailbreak import ManyShotJailbreakAttack
1415from pyrit .executor .attack .single_turn .prompt_sending import PromptSendingAttack
16+ from pyrit .executor .attack .single_turn .role_play import RolePlayAttack , RolePlayPaths
17+ from pyrit .executor .attack .single_turn .skeleton_key import SkeletonKeyAttack
1518from pyrit .models import SeedAttackGroup
1619from pyrit .prompt_converter import TextJailbreakConverter
1720from pyrit .prompt_normalizer import PromptConverterConfiguration
1821from pyrit .prompt_target import OpenAIChatTarget
1922from pyrit .scenario .core .atomic_attack import AtomicAttack
2023from pyrit .scenario .core .dataset_configuration import DatasetConfiguration
2124from pyrit .scenario .core .scenario import Scenario
22- from pyrit .scenario .core .scenario_strategy import (
23- ScenarioStrategy ,
24- )
25+ from pyrit .scenario .core .scenario_strategy import ScenarioCompositeStrategy , ScenarioStrategy
2526from pyrit .score import (
2627 SelfAskRefusalScorer ,
2728 TrueFalseInverterScorer ,
3132
class JailbreakStrategy(ScenarioStrategy):
    """
    Strategy for jailbreak attacks.

    The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
    expose an obvious way of using this scenario without worrying about additional tweaks and changes
    to the prompt.

    COMPLEX strategies use additional techniques to enhance the jailbreak, like modifying the
    system prompt or probing the target model for an additional vulnerability (e.g. the
    SkeletonKeyAttack). They are meant to provide a sense of how well a jailbreak generalizes to
    slight changes in the delivery method.

    Each member is a ``(value, tags)`` pair. The string values of the concrete members
    ("prompt_sending", "many_shot", "skeleton", "role_play") are the keys the scenario matches on
    when it builds the attack for a strategy, so they must stay in sync with that dispatch.
    """

    # Aggregate members (special markers that expand to strategies with matching tags)
    ALL = ("all", {"all"})
    SIMPLE = ("simple", {"simple"})
    COMPLEX = ("complex", {"complex"})

    # Simple strategies
    PromptSending = ("prompt_sending", {"simple"})

    # Complex strategies
    ManyShot = ("many_shot", {"complex"})
    SkeletonKey = ("skeleton", {"complex"})
    RolePlay = ("role_play", {"complex"})

    @classmethod
    def get_aggregate_tags(cls) -> set[str]:
        """
        Get the set of tags that represent aggregate categories.

        Returns:
            set[str]: Set of tags that are aggregate markers.
        """
        # Include base class aggregates ("all") and add the scenario-specific ones.
        return super().get_aggregate_tags() | {"simple", "complex"}
4170
4271
4372class Jailbreak (Scenario ):
@@ -67,9 +96,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
6796 Get the default strategy used when no strategies are specified.
6897
6998 Returns:
70- ScenarioStrategy: JailbreakStrategy.ALL .
99+ ScenarioStrategy: JailbreakStrategy.SIMPLE .
71100 """
72- return JailbreakStrategy .ALL
101+ return JailbreakStrategy .SIMPLE
73102
74103 @classmethod
75104 def required_datasets (cls ) -> list [str ]:
@@ -93,7 +122,9 @@ def __init__(
93122 objective_scorer : Optional [TrueFalseScorer ] = None ,
94123 include_baseline : bool = False ,
95124 scenario_result_id : Optional [str ] = None ,
96- n_jailbreaks : Optional [int ] = 3 ,
125+ num_templates : Optional [int ] = None ,
126+ num_attempts : int = 1 ,
127+ jailbreak_names : List [str ] = [],
97128 ) -> None :
98129 """
99130 Initialize the jailbreak scenario.
@@ -104,13 +135,45 @@ def __init__(
104135 include_baseline (bool): Whether to include a baseline atomic attack that sends all
105136 objectives without modifications. Defaults to False.
106137 scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
107- n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
138+ num_templates (Optional[int]): Choose num_templates random jailbreaks rather than using all of them.
139+ num_attempts (int): Number of times to try each jailbreak.
140+ jailbreak_names (List[str]): Names of the jailbreak templates to use, taken from the
141+ template list under datasets.
142+
143+ Raises:
144+ ValueError: If both jailbreak_names and num_templates are provided, as random selection
145+ is incompatible with a predetermined list.
146+ ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed
147+ templates.
148+
108149 """
150+ if jailbreak_names and num_templates :
151+ raise ValueError (
152+ "Please provide only one of `num_templates` (random selection) or `jailbreak_names` (specific selection)."
153+ )
154+
109155 if not objective_scorer :
110156 objective_scorer = self ._get_default_objective_scorer ()
111157 self ._scorer_config = AttackScoringConfig (objective_scorer = objective_scorer )
112158
113- self ._n = n_jailbreaks
159+ self ._num_templates = num_templates
160+ self ._num_attempts = num_attempts
161+
162+ # Note that num_templates and jailbreak_names are mutually exclusive.
163+ # If self._num_templates is None, then this returns all discoverable jailbreak templates.
164+ # If self._num_templates has some value, then all_templates is a subset of all available
165+ # templates, but jailbreak_names is guaranteed to be [], so diff = {}.
166+ all_templates = TextJailBreak .get_jailbreak_templates (num_templates = self ._num_templates )
167+
168+ # Example: if jailbreak_names is {'a', 'b', 'c'}, and all_templates is {'b', 'c', 'd'},
169+ # then diff = {'a'}, which raises the error as 'a' was not discovered in all_templates.
170+ diff = set (jailbreak_names ) - set (all_templates )
171+ if len (diff ) > 0 :
172+ raise ValueError (f"Error: could not find templates `{ diff } `!" )
173+
174+ # If jailbreak_names has some value, then `if jailbreak_names` passes, and self._jailbreaks
175+ # is set to jailbreak_names. Otherwise we use all_templates.
176+ self ._jailbreaks = jailbreak_names if jailbreak_names else all_templates
114177
115178 super ().__init__ (
116179 name = "Jailbreak" ,
@@ -146,6 +209,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
146209 )
147210 return refusal_scorer
148211
def _get_default_adversarial_target(self) -> OpenAIChatTarget:
    """
    Create and retrieve the default adversarial target.

    The endpoint, API key and model name are read from the
    ``AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT``, ``AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY`` and
    ``AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL`` environment variables. A variable that is not set
    is passed to the target as ``None``.

    Returns:
        OpenAIChatTarget: Default adversarial target using an unfiltered endpoint.
    """
    return OpenAIChatTarget(
        endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
        model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
        temperature=1.2,
    )
225+
149226 def _resolve_seed_groups (self ) -> List [SeedAttackGroup ]:
150227 """
151228 Resolve seed groups from dataset configuration.
@@ -161,23 +238,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
161238
162239 return list (seed_groups )
163240
164- def _get_all_jailbreak_templates (self ) -> List [str ]:
165- """
166- Retrieve all available jailbreak templates.
167-
168- Returns:
169- List[str]: List of jailbreak template file names.
170- """
171- if not self ._n :
172- return TextJailBreak .get_all_jailbreak_templates ()
173- else :
174- return TextJailBreak .get_all_jailbreak_templates (n = self ._n )
175-
176- async def _get_atomic_attack_from_jailbreak_async (self , * , jailbreak_template_name : str ) -> AtomicAttack :
241+ async def _get_atomic_attack_from_strategy_async (
242+ self , * , strategy : str , jailbreak_template_name : str
243+ ) -> AtomicAttack :
177244 """
178245 Create an atomic attack for a specific jailbreak template.
179246
180247 Args:
248+ strategy (str): JailbreakStrategy to use.
181249 jailbreak_template_name (str): Name of the jailbreak template file.
182250
183251 Returns:
@@ -202,12 +270,28 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
202270 request_converters = PromptConverterConfiguration .from_converters (converters = [jailbreak_converter ])
203271 )
204272
205- # Create the attack
206- attack = PromptSendingAttack (
207- objective_target = self ._objective_target ,
208- attack_scoring_config = self ._scorer_config ,
209- attack_converter_config = converter_config ,
210- )
273+ attack : Optional [Union [ManyShotJailbreakAttack , PromptSendingAttack , RolePlayAttack , SkeletonKeyAttack ]] = None
274+ args = {
275+ "objective_target" : self ._objective_target ,
276+ "attack_scoring_config" : self ._scorer_config ,
277+ "attack_converter_config" : converter_config ,
278+ }
279+ match strategy :
280+ case "many_shot" :
281+ attack = ManyShotJailbreakAttack (** args )
282+ case "prompt_sending" :
283+ attack = PromptSendingAttack (** args )
284+ case "skeleton" :
285+ attack = SkeletonKeyAttack (** args )
286+ case "role_play" :
287+ args ["adversarial_chat" ] = self ._get_default_adversarial_target ()
288+ args ["role_play_definition_path" ] = RolePlayPaths .PERSUASION_SCRIPT .value
289+ attack = RolePlayAttack (** args )
290+ case _:
291+ raise ValueError (f"Unknown JailbreakStrategy `{ strategy } `." )
292+
293+ if not attack :
294+ raise ValueError (f"Attack cannot be None!" )
211295
212296 # Extract template name without extension for the atomic attack name
213297 template_name = Path (jailbreak_template_name ).stem
@@ -230,11 +314,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
230314 # Retrieve seed prompts based on selected strategies
231315 self ._seed_groups = self ._resolve_seed_groups ()
232316
233- # Get all jailbreak template names
234- jailbreak_template_names = self ._get_all_jailbreak_templates ()
317+ strategies = ScenarioCompositeStrategy .extract_single_strategy_values (
318+ composites = self ._scenario_composites , strategy_type = JailbreakStrategy
319+ )
235320
236- for template_name in jailbreak_template_names :
237- atomic_attack = await self ._get_atomic_attack_from_jailbreak_async (jailbreak_template_name = template_name )
238- atomic_attacks .append (atomic_attack )
321+ for strategy in strategies :
322+ for template_name in self ._jailbreaks :
323+ for _ in range (0 , self ._num_attempts ):
324+ atomic_attack = await self ._get_atomic_attack_from_strategy_async (
325+ strategy = strategy , jailbreak_template_name = template_name
326+ )
327+ atomic_attacks .append (atomic_attack )
239328
240329 return atomic_attacks
0 commit comments