|
7 | 7 | import logging |
8 | 8 | import uuid |
9 | 9 | from pathlib import Path |
10 | | -from typing import Any, Callable, Optional, Union |
| 10 | +from typing import Any, Callable, Optional |
11 | 11 |
|
12 | 12 | from pyrit.common.apply_defaults import REQUIRED_VALUE, apply_defaults |
13 | 13 | from pyrit.common.path import EXECUTOR_RED_TEAM_PATH |
@@ -357,41 +357,20 @@ async def _generate_next_prompt_async(self, context: MultiTurnAttackContext[Any] |
357 | 357 | # Generate prompt using adversarial chat |
358 | 358 | logger.debug(f"Generating prompt for turn {context.executed_turns + 1}") |
359 | 359 |
|
360 | | - # Prepare prompt for the adversarial chat |
361 | | - attack_message = await self._build_adversarial_prompt(context) |
362 | | - |
363 | | - # Build the message for the adversarial chat. |
364 | | - # For file/media responses, construct a multimodal message with both |
365 | | - # the textual feedback and the actual media (image/video) so the |
366 | | - # adversarial chat (e.g. GPT-4o) can see what the target generated. |
367 | | - if isinstance(prompt_result, tuple): |
368 | | - feedback_text, media_piece = prompt_result |
369 | | - # Use a shared conversation_id so Message validation passes |
370 | | - shared_conversation_id = str(uuid.uuid4()) |
371 | | - pieces = [ |
372 | | - MessagePiece( |
373 | | - original_value=feedback_text, |
374 | | - role="user", |
375 | | - conversation_id=shared_conversation_id, |
376 | | - ) |
377 | | - ] |
378 | | - if media_piece is not None: |
379 | | - pieces.append( |
380 | | - MessagePiece( |
381 | | - original_value=media_piece.converted_value, |
382 | | - role="user", |
383 | | - original_value_data_type=media_piece.converted_value_data_type, |
384 | | - conversation_id=shared_conversation_id, |
385 | | - ) |
386 | | - ) |
387 | | - prompt_message = Message(message_pieces=pieces) |
| 360 | + # Build the message for the adversarial chat |
| 361 | + prompt_message = await self._build_adversarial_prompt(context) |
| 362 | + |
| 363 | + # Log the message being sent |
| 364 | + if prompt_message.is_multimodal(): |
| 365 | + text_piece = prompt_message.get_first_piece_by_data_type("text") |
| 366 | + media_pieces = [p for p in prompt_message.message_pieces if p.converted_value_data_type != "text"] |
| 367 | + feedback_text = text_piece.converted_value if text_piece else "No text content" |
| 368 | + media_info = f"{len(media_pieces)} media piece(s)" if media_pieces else "no media" |
388 | 369 | logger.debug( |
389 | | - f"Sending multimodal prompt to adversarial chat: {feedback_text[:50]}... " |
390 | | - f"+ {media_piece.converted_value_data_type if media_piece else 'no'} media" |
| 370 | + f"Sending multimodal prompt to adversarial chat: {feedback_text[:50]}... + {media_info}" |
391 | 371 | ) |
392 | 372 | else: |
393 | | - prompt_text = prompt_result |
394 | | - prompt_message = Message.from_prompt(prompt=prompt_text, role="user") |
| 373 | + prompt_text = prompt_message.get_first_piece().converted_value |
395 | 374 | logger.debug(f"Sending prompt to adversarial chat: {prompt_text[:50]}...") |
396 | 375 |
|
397 | 376 | with execution_context( |
@@ -420,33 +399,35 @@ async def _generate_next_prompt_async(self, context: MultiTurnAttackContext[Any] |
420 | 399 | async def _build_adversarial_prompt( |
421 | 400 | self, |
422 | 401 | context: MultiTurnAttackContext[Any], |
423 | | - ) -> Union[str, tuple[str, Optional[MessagePiece]]]: |
| 402 | + ) -> Message: |
424 | 403 | """ |
425 | | - Build a prompt for the adversarial chat based on the last response. |
| 404 | + Build a prompt message for the adversarial chat based on the last response. |
426 | 405 |
|
427 | | - For text responses, returns a plain string. For file/media responses (images, video, etc.), |
428 | | - returns a tuple of (feedback_text, media_piece) so the caller can construct a multimodal |
429 | | - message that includes the actual generated media alongside the textual feedback. |
| 406 | + For text responses, creates a simple text message. For file/media responses (images, video, etc.), |
| 407 | + creates a multimodal message that includes both the textual feedback and the actual generated |
| 408 | + media so the adversarial chat can see what the target produced. |
430 | 409 |
|
431 | 410 | Args: |
432 | 411 | context (MultiTurnAttackContext): The attack context containing the current state and configuration. |
433 | 412 |
|
434 | 413 | Returns: |
435 | | - Union[str, tuple[str, Optional[MessagePiece]]]: Either a plain text prompt string, |
436 | | - or a tuple of (feedback_text, media_piece) when the target returned media content. |
| 414 | + Message: A message ready to be sent to the adversarial chat. |
437 | 415 | """ |
438 | 416 | # If no last response, return the seed prompt (rendered with objective if template exists) |
439 | 417 | if not context.last_response: |
440 | | - return self._adversarial_chat_seed_prompt.render_template_value_silent(objective=context.objective) |
| 418 | + prompt_text = self._adversarial_chat_seed_prompt.render_template_value_silent(objective=context.objective) |
| 419 | + return Message.from_prompt(prompt=prompt_text, role="user") |
441 | 420 |
|
442 | 421 | # Get the last assistant piece from the response |
443 | 422 | response_piece = context.last_response.get_piece() |
444 | 423 |
|
445 | | - # Text/error responses return str; file responses return tuple[str, Optional[MessagePiece]] |
| 424 | + # Build message based on response type (text vs file/media) |
446 | 425 | if response_piece.converted_value_data_type in ("text", "error"): |
447 | | - return self._handle_adversarial_text_response(context=context) |
448 | | - |
449 | | - return self._handle_adversarial_file_response(context=context) |
| 426 | + feedback_text = self._handle_adversarial_text_response(context=context) |
| 427 | + return self._build_text_message(feedback_text) |
| 428 | + else: |
| 429 | + feedback_text, media_piece = self._handle_adversarial_file_response(context=context) |
| 430 | + return self._build_multimodal_message(feedback_text, media_piece) |
450 | 431 |
|
451 | 432 | def _handle_adversarial_text_response(self, *, context: MultiTurnAttackContext[Any]) -> str: |
452 | 433 | """ |
@@ -538,6 +519,49 @@ def _handle_adversarial_file_response( |
538 | 519 |
|
539 | 520 | return (feedback, response_piece) |
540 | 521 |
|
| 522 | + def _build_text_message(self, feedback_text: str) -> Message: |
| 523 | + """ |
| 524 | + Build a simple text message for the adversarial chat. |
| 525 | +
|
| 526 | + Args: |
| 527 | + feedback_text (str): The text content for the message. |
| 528 | +
|
| 529 | + Returns: |
| 530 | + Message: A text message ready to be sent to the adversarial chat. |
| 531 | + """ |
| 532 | + return Message.from_prompt(prompt=feedback_text, role="user") |
| 533 | + |
| 534 | + def _build_multimodal_message(self, feedback_text: str, media_piece: Optional[MessagePiece]) -> Message: |
| 535 | + """ |
| 536 | + Build a multimodal message for the adversarial chat containing both text and media. |
| 537 | +
|
| 538 | + Args: |
| 539 | + feedback_text (str): The textual feedback to include. |
| 540 | + media_piece (Optional[MessagePiece]): The media piece from the target response, if any. |
| 541 | +
|
| 542 | + Returns: |
| 543 | + Message: A multimodal message ready to be sent to the adversarial chat. |
| 544 | + """ |
| 545 | + # Use a shared conversation_id so Message validation passes |
| 546 | + shared_conversation_id = str(uuid.uuid4()) |
| 547 | + pieces = [ |
| 548 | + MessagePiece( |
| 549 | + original_value=feedback_text, |
| 550 | + role="user", |
| 551 | + conversation_id=shared_conversation_id, |
| 552 | + ) |
| 553 | + ] |
| 554 | + if media_piece is not None: |
| 555 | + pieces.append( |
| 556 | + MessagePiece( |
| 557 | + original_value=media_piece.converted_value, |
| 558 | + role="user", |
| 559 | + original_value_data_type=media_piece.converted_value_data_type, |
| 560 | + conversation_id=shared_conversation_id, |
| 561 | + ) |
| 562 | + ) |
| 563 | + return Message(message_pieces=pieces) |
| 564 | + |
541 | 565 | async def _send_prompt_to_objective_target_async( |
542 | 566 | self, |
543 | 567 | *, |
|
0 commit comments