Skip to content

Commit 23b862d

Browse files
ValbuenaVCVictor ValbuenaCopilot
authored
FIX: Integration test fixes (#1897)
Co-authored-by: Victor Valbuena <vvalbuena@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent f58a218 commit 23b862d

9 files changed

Lines changed: 568 additions & 68 deletions

doc/code/scenarios/0_scenarios.ipynb

Lines changed: 344 additions & 4 deletions
Large diffs are not rendered by default.

doc/code/scenarios/0_scenarios.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,10 @@
102102
)
103103
from pyrit.score.true_false.true_false_scorer import TrueFalseScorer
104104
from pyrit.setup import initialize_pyrit_async
105+
from pyrit.setup.initializers.components import ScenarioTechniqueInitializer
105106

106107
await initialize_pyrit_async(memory_db_type="InMemory") # type: ignore [top-level-await]
108+
await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]
107109

108110

109111
class MyStrategy(ScenarioStrategy):

doc/code/scenarios/1_common_scenario_parameters.ipynb

Lines changed: 71 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,21 @@
4343
"name": "stdout",
4444
"output_type": "stream",
4545
"text": [
46-
"No new upgrade operations detected.\n"
46+
"[pyrit:alembic] No new upgrade operations detected.\n"
4747
]
4848
},
4949
{
5050
"name": "stderr",
5151
"output_type": "stream",
5252
"text": [
53-
"Skipping scorer main: required target not found in TargetRegistry\n"
53+
"Skipping target 'platform_openai_chat': PLATFORM_OPENAI_CHAT_GPT4O_MODEL is not set. All declared env vars (endpoint, key, model) must be present for this target to register.\n"
54+
]
55+
},
56+
{
57+
"name": "stderr",
58+
"output_type": "stream",
59+
"text": [
60+
"Skipping target 'azure_foundry_phi4': AZURE_FOUNDRY_PHI4_MODEL is not set. All declared env vars (endpoint, key, model) must be present for this target to register.\n"
5461
]
5562
},
5663
{
@@ -68,7 +75,9 @@
6875
"from pyrit.registry import TargetRegistry\n",
6976
"from pyrit.scenario.scenarios.foundry import FoundryStrategy, RedTeamAgent\n",
7077
"from pyrit.setup import initialize_from_config_async\n",
78+
"from pyrit.setup.initializers.components import ScenarioTechniqueInitializer\n",
7179
"\n",
80+
"await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]\n",
7281
"await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n",
7382
"\n",
7483
"objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name(\"openai_chat\")"
@@ -242,7 +251,7 @@
242251
{
243252
"data": {
244253
"application/vnd.jupyter.widget-view+json": {
245-
"model_id": "8734281af4a940bd9995f232ce52c16c",
254+
"model_id": "e386860005e440a98009fc5cd1b55e86",
246255
"version_major": 2,
247256
"version_minor": 0
248257
},
@@ -283,26 +292,29 @@
283292
"\n",
284293
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
285294
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
286-
"\u001b[36m • Scorer Type: SelfAskTrueFalseScorer\u001b[0m\n",
295+
"\u001b[36m • Scorer Type: TrueFalseInverterScorer\u001b[0m\n",
287296
"\u001b[36m • scorer_type: true_false\u001b[0m\n",
288297
"\u001b[36m • score_aggregator: OR_\u001b[0m\n",
289-
"\u001b[36m • model_name: gpt-40\u001b[0m\n",
290-
"\u001b[36m • temperature: 0.9\u001b[0m\n",
298+
"\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n",
299+
"\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n",
300+
"\u001b[36m • scorer_type: true_false\u001b[0m\n",
301+
"\u001b[36m • score_aggregator: OR_\u001b[0m\n",
302+
"\u001b[36m • model_name: gpt-4o-japan-nilfilter\u001b[0m\n",
291303
"\n",
292304
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
293-
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
294-
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
295-
"\u001b[36m • F1 Score: 0.7560\u001b[0m\n",
296-
"\u001b[36m • Precision: 0.8759\u001b[0m\n",
297-
"\u001b[31m • Recall: 0.6649\u001b[0m\n",
298-
"\u001b[36m • Average Score Time: 1.64s\u001b[0m\n",
305+
"\u001b[36m • Accuracy: 89.37%\u001b[0m\n",
306+
"\u001b[36m • Accuracy Std Error: ±0.0155\u001b[0m\n",
307+
"\u001b[36m • F1 Score: 0.8918\u001b[0m\n",
308+
"\u001b[36m • Precision: 0.8782\u001b[0m\n",
309+
"\u001b[32m • Recall: 0.9058\u001b[0m\n",
310+
"\u001b[36m • Average Score Time: 0.59s\u001b[0m\n",
299311
"\n",
300312
"\u001b[1m\u001b[36m▼ Overall Statistics\u001b[0m\n",
301313
"\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n",
302314
"\u001b[1m 📈 Summary\u001b[0m\n",
303315
"\u001b[32m • Total Strategies: 21\u001b[0m\n",
304316
"\u001b[32m • Total Attack Results: 42\u001b[0m\n",
305-
"\u001b[32m • Overall Success Rate: 11%\u001b[0m\n",
317+
"\u001b[32m • Overall Success Rate: 0%\u001b[0m\n",
306318
"\u001b[32m • Unique Objectives: 2\u001b[0m\n",
307319
"\n",
308320
"\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n",
@@ -316,11 +328,11 @@
316328
"\u001b[33m • Number of Results: 2\u001b[0m\n",
317329
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
318330
"\n",
319-
"\u001b[1m 🔸 Group: ascii_art\u001b[0m\n",
331+
"\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n",
320332
"\u001b[33m • Number of Results: 2\u001b[0m\n",
321333
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
322334
"\n",
323-
"\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n",
335+
"\u001b[1m 🔸 Group: ascii_art\u001b[0m\n",
324336
"\u001b[33m • Number of Results: 2\u001b[0m\n",
325337
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
326338
"\n",
@@ -330,11 +342,11 @@
330342
"\n",
331343
"\u001b[1m 🔸 Group: base64\u001b[0m\n",
332344
"\u001b[33m • Number of Results: 2\u001b[0m\n",
333-
"\u001b[31m • Success Rate: 100%\u001b[0m\n",
345+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
334346
"\n",
335347
"\u001b[1m 🔸 Group: binary\u001b[0m\n",
336348
"\u001b[33m • Number of Results: 2\u001b[0m\n",
337-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
349+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
338350
"\n",
339351
"\u001b[1m 🔸 Group: caesar\u001b[0m\n",
340352
"\u001b[33m • Number of Results: 2\u001b[0m\n",
@@ -364,11 +376,11 @@
364376
"\u001b[33m • Number of Results: 2\u001b[0m\n",
365377
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
366378
"\n",
367-
"\u001b[1m 🔸 Group: rot13\u001b[0m\n",
379+
"\u001b[1m 🔸 Group: suffix_append\u001b[0m\n",
368380
"\u001b[33m • Number of Results: 2\u001b[0m\n",
369381
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
370382
"\n",
371-
"\u001b[1m 🔸 Group: suffix_append\u001b[0m\n",
383+
"\u001b[1m 🔸 Group: rot13\u001b[0m\n",
372384
"\u001b[33m • Number of Results: 2\u001b[0m\n",
373385
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
374386
"\n",
@@ -378,7 +390,7 @@
378390
"\n",
379391
"\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n",
380392
"\u001b[33m • Number of Results: 2\u001b[0m\n",
381-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
393+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
382394
"\n",
383395
"\u001b[1m 🔸 Group: unicode_substitution\u001b[0m\n",
384396
"\u001b[33m • Number of Results: 2\u001b[0m\n",
@@ -390,7 +402,7 @@
390402
"\n",
391403
"\u001b[1m 🔸 Group: jailbreak\u001b[0m\n",
392404
"\u001b[33m • Number of Results: 2\u001b[0m\n",
393-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
405+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
394406
"\n",
395407
"\u001b[36m====================================================================================================\u001b[0m\n",
396408
"\n"
@@ -405,7 +417,7 @@
405417
" dataset_config=dataset_config,\n",
406418
")\n",
407419
"baseline_result = await baseline_scenario.run_async() # type: ignore\n",
408-
"await output_scenario_async(baseline_result)"
420+
"await output_scenario_async(baseline_result) # type: ignore [top-level-await]"
409421
]
410422
},
411423
{
@@ -458,64 +470,59 @@
458470
"\n",
459471
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
460472
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
461-
"\u001b[36m • Scorer Type: SelfAskTrueFalseScorer\u001b[0m\n",
473+
"\u001b[36m • Scorer Type: TrueFalseInverterScorer\u001b[0m\n",
462474
"\u001b[36m • scorer_type: true_false\u001b[0m\n",
463475
"\u001b[36m • score_aggregator: OR_\u001b[0m\n",
464-
"\u001b[36m • model_name: gpt-40\u001b[0m\n",
465-
"\u001b[36m • temperature: 0.9\u001b[0m\n",
476+
"\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n",
477+
"\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n",
478+
"\u001b[36m • scorer_type: true_false\u001b[0m\n",
479+
"\u001b[36m • score_aggregator: OR_\u001b[0m\n",
480+
"\u001b[36m • model_name: gpt-4o-japan-nilfilter\u001b[0m\n",
466481
"\n",
467482
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
468-
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
469-
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
470-
"\u001b[36m • F1 Score: 0.7560\u001b[0m\n",
471-
"\u001b[36m • Precision: 0.8759\u001b[0m\n",
472-
"\u001b[31m • Recall: 0.6649\u001b[0m\n",
473-
"\u001b[36m • Average Score Time: 1.64s\u001b[0m\n",
483+
"\u001b[36m • Accuracy: 89.37%\u001b[0m\n",
484+
"\u001b[36m • Accuracy Std Error: ±0.0155\u001b[0m\n",
485+
"\u001b[36m • F1 Score: 0.8918\u001b[0m\n",
486+
"\u001b[36m • Precision: 0.8782\u001b[0m\n",
487+
"\u001b[32m • Recall: 0.9058\u001b[0m\n",
488+
"\u001b[36m • Average Score Time: 0.59s\u001b[0m\n",
474489
"\n",
475490
"\u001b[1m\u001b[36m▼ Overall Statistics\u001b[0m\n",
476491
"\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n",
477492
"\u001b[1m 📈 Summary\u001b[0m\n",
478493
"\u001b[32m • Total Strategies: 21\u001b[0m\n",
479494
"\u001b[32m • Total Attack Results: 42\u001b[0m\n",
480-
"\u001b[32m • Overall Success Rate: 11%\u001b[0m\n",
495+
"\u001b[32m • Overall Success Rate: 0%\u001b[0m\n",
481496
"\u001b[32m • Unique Objectives: 2\u001b[0m\n",
482497
"\n",
483498
"\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n",
484499
"\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n",
485500
"\n",
486-
"\u001b[1m 🔸 Group: base64\u001b[0m\n",
487-
"\u001b[33m • Number of Results: 2\u001b[0m\n",
488-
"\u001b[31m • Success Rate: 100%\u001b[0m\n",
489-
"\n",
490-
"\u001b[1m 🔸 Group: binary\u001b[0m\n",
491-
"\u001b[33m • Number of Results: 2\u001b[0m\n",
492-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
493-
"\n",
494-
"\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n",
501+
"\u001b[1m 🔸 Group: baseline\u001b[0m\n",
495502
"\u001b[33m • Number of Results: 2\u001b[0m\n",
496-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
503+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
497504
"\n",
498-
"\u001b[1m 🔸 Group: jailbreak\u001b[0m\n",
505+
"\u001b[1m 🔸 Group: ansi_attack\u001b[0m\n",
499506
"\u001b[33m • Number of Results: 2\u001b[0m\n",
500-
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
507+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
501508
"\n",
502-
"\u001b[1m 🔸 Group: baseline\u001b[0m\n",
509+
"\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n",
503510
"\u001b[33m • Number of Results: 2\u001b[0m\n",
504511
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
505512
"\n",
506-
"\u001b[1m 🔸 Group: ansi_attack\u001b[0m\n",
513+
"\u001b[1m 🔸 Group: ascii_art\u001b[0m\n",
507514
"\u001b[33m • Number of Results: 2\u001b[0m\n",
508515
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
509516
"\n",
510-
"\u001b[1m 🔸 Group: ascii_art\u001b[0m\n",
517+
"\u001b[1m 🔸 Group: atbash\u001b[0m\n",
511518
"\u001b[33m • Number of Results: 2\u001b[0m\n",
512519
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
513520
"\n",
514-
"\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n",
521+
"\u001b[1m 🔸 Group: base64\u001b[0m\n",
515522
"\u001b[33m • Number of Results: 2\u001b[0m\n",
516523
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
517524
"\n",
518-
"\u001b[1m 🔸 Group: atbash\u001b[0m\n",
525+
"\u001b[1m 🔸 Group: binary\u001b[0m\n",
519526
"\u001b[33m • Number of Results: 2\u001b[0m\n",
520527
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
521528
"\n",
@@ -547,18 +554,22 @@
547554
"\u001b[33m • Number of Results: 2\u001b[0m\n",
548555
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
549556
"\n",
550-
"\u001b[1m 🔸 Group: rot13\u001b[0m\n",
557+
"\u001b[1m 🔸 Group: suffix_append\u001b[0m\n",
551558
"\u001b[33m • Number of Results: 2\u001b[0m\n",
552559
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
553560
"\n",
554-
"\u001b[1m 🔸 Group: suffix_append\u001b[0m\n",
561+
"\u001b[1m 🔸 Group: rot13\u001b[0m\n",
555562
"\u001b[33m • Number of Results: 2\u001b[0m\n",
556563
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
557564
"\n",
558565
"\u001b[1m 🔸 Group: string_join\u001b[0m\n",
559566
"\u001b[33m • Number of Results: 2\u001b[0m\n",
560567
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
561568
"\n",
569+
"\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n",
570+
"\u001b[33m • Number of Results: 2\u001b[0m\n",
571+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
572+
"\n",
562573
"\u001b[1m 🔸 Group: unicode_substitution\u001b[0m\n",
563574
"\u001b[33m • Number of Results: 2\u001b[0m\n",
564575
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
@@ -567,6 +578,10 @@
567578
"\u001b[33m • Number of Results: 2\u001b[0m\n",
568579
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
569580
"\n",
581+
"\u001b[1m 🔸 Group: jailbreak\u001b[0m\n",
582+
"\u001b[33m • Number of Results: 2\u001b[0m\n",
583+
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
584+
"\n",
570585
"\u001b[36m====================================================================================================\u001b[0m\n",
571586
"\n"
572587
]
@@ -617,7 +632,7 @@
617632
{
618633
"data": {
619634
"application/vnd.jupyter.widget-view+json": {
620-
"model_id": "103ae439a5554be79c786a8bcc9c1524",
635+
"model_id": "9f07786563cc4128ba51a5e47eb53a6f",
621636
"version_major": 2,
622637
"version_minor": 0
623638
},
@@ -675,7 +690,7 @@
675690
"\u001b[1m 📈 Summary\u001b[0m\n",
676691
"\u001b[32m • Total Strategies: 2\u001b[0m\n",
677692
"\u001b[32m • Total Attack Results: 4\u001b[0m\n",
678-
"\u001b[36m • Overall Success Rate: 25%\u001b[0m\n",
693+
"\u001b[33m • Overall Success Rate: 50%\u001b[0m\n",
679694
"\u001b[32m • Unique Objectives: 2\u001b[0m\n",
680695
"\n",
681696
"\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n",
@@ -687,7 +702,7 @@
687702
"\n",
688703
"\u001b[1m 🔸 Group: base64\u001b[0m\n",
689704
"\u001b[33m • Number of Results: 2\u001b[0m\n",
690-
"\u001b[32m • Success Rate: 0%\u001b[0m\n",
705+
"\u001b[33m • Success Rate: 50%\u001b[0m\n",
691706
"\n",
692707
"\u001b[36m====================================================================================================\u001b[0m\n",
693708
"\n"
@@ -710,15 +725,13 @@
710725
" scenario_strategies=[FoundryStrategy.Base64],\n",
711726
" dataset_config=dataset_config,\n",
712727
")\n",
728+
"\n",
713729
"custom_result = await custom_scenario.run_async() # type: ignore\n",
714730
"await output_scenario_async(custom_result)"
715731
]
716732
}
717733
],
718734
"metadata": {
719-
"jupytext": {
720-
"main_language": "python"
721-
},
722735
"language_info": {
723736
"codemirror_mode": {
724737
"name": "ipython",
@@ -729,7 +742,7 @@
729742
"name": "python",
730743
"nbconvert_exporter": "python",
731744
"pygments_lexer": "ipython3",
732-
"version": "3.12.12"
745+
"version": "3.13.13"
733746
}
734747
},
735748
"nbformat": 4,

doc/code/scenarios/1_common_scenario_parameters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232
from pyrit.registry import TargetRegistry
3333
from pyrit.scenario.scenarios.foundry import FoundryStrategy, RedTeamAgent
3434
from pyrit.setup import initialize_from_config_async
35+
from pyrit.setup.initializers.components import ScenarioTechniqueInitializer
3536

37+
await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]
3638
await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore
3739

3840
objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name("openai_chat")
@@ -118,7 +120,7 @@
118120
dataset_config=dataset_config,
119121
)
120122
baseline_result = await baseline_scenario.run_async() # type: ignore
121-
await output_scenario_async(baseline_result)
123+
await output_scenario_async(baseline_result) # type: ignore [top-level-await]
122124

123125
# %% [markdown]
124126
# ### Sorting the Per-Group Breakdown by Success Rate
@@ -170,5 +172,6 @@
170172
scenario_strategies=[FoundryStrategy.Base64],
171173
dataset_config=dataset_config,
172174
)
175+
173176
custom_result = await custom_scenario.run_async() # type: ignore
174177
await output_scenario_async(custom_result)

0 commit comments

Comments
 (0)