TransluceAI
diff --git a/‎.github/workflows/typecheck.yml‎
Lines changed: 42 additions & 0 deletions b/‎.github/workflows/typecheck.yml‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 35 additions & 0 deletions b/‎README.md‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎datasets/transluce_cbrn.jsonl‎
Lines changed: 48 additions & 0 deletions b/‎datasets/transluce_cbrn.jsonl‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎jailbreaking_frontier_models/async_utils.py‎
Lines changed: 212 additions & 0 deletions b/‎jailbreaking_frontier_models/async_utils.py‎
Lines changed: 212 additions & 0 deletions
diff --git a/‎jailbreaking_frontier_models/client_utils.py‎
Lines changed: 91 additions & 0 deletions b/‎jailbreaking_frontier_models/client_utils.py‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎jailbreaking_frontier_models/example_reward_fn_computation.py‎ b/‎jailbreaking_frontier_models/example_reward_fn_computation.py‎
@@ -0,0 +1,42 @@
+name: Type Check
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+  workflow_dispatch:
+
+jobs:
+  typecheck:
+    runs-on: ubuntu-latest
+    
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+    
+    - name: Install uv
+      uses: astral-sh/setup-uv@v3
+      with:
+        enable-cache: true
+        cache-dependency-glob: "uv.lock"
+    
+    - name: Set up Python
+      run: uv python install 3.12
+    
+    - name: Install dependencies
+      run: |
+        uv sync --all-extras --dev
+    
+    - name: Run pyright
+      run: |
+        uv run pyright
+    
+    - name: Upload type checking results
+      if: failure()
+      uses: actions/upload-artifact@v4
+      with:
+        name: typecheck-results
+        path: |
+          **/pyrightconfig.json
+          **/.pyright/
@@ -0,0 +1,35 @@
+# Jailbreaking Frontier Models
+
+This is the code accompanying the blog post ["Automatically Jailbreaking Frontier Language Models with Investigator Agents"](jailbreaking-frontier-models).
+
+Note: We do not include the RL training code, as it is tightly coupled with our internal research tooling. However, we implement all datasets and reward functions, so that they can be used to train jailbreaking agents and reproduce our experiments.
+
+## Example reproduction
+
+### Install dependencies
+
+```bash
+uv sync
+source .venv/bin/activate
+```
+
+### Run a test script demonstrating the reward function computation
+
+```bash
+# assumes that you have an OpenAI-compatible endpoint (e.g. a vLLM or SGLang server) running at localhost:8080
+
+
+```
+
+## Dataset
+
+The dataset is located at `datasets/transluce_cbrn.jsonl`. It is a `jsonl` file in which each line is a JSON object with the following format:
+
+```json
+{
+    "behavior_id": "<unique id for the behavior>",
+    "behavior": "<user prompt for the behavior to elicit>",
+    "category": "<one of drug/chemical/biological/radiological/nuclear/explosive>",
+    "optimizer_target": "<start of an affirmative response, like \"Sure...\" -- this is not used by our investigator agent, but might be useful for other jailbreaking techniques>"
+}
+```
@@ -0,0 +1,212 @@
+from __future__ import annotations
+
+import typing
+from typing import Awaitable, Callable, ParamSpec, TypeVar
+
+import anyio
+import anyio.abc
+
+Args = typing.TypeVarTuple("Args")  # positional-argument types of a wrapped task function
+T = typing.TypeVar("T")  # module-level result type; the generic class/function below declare their own T
+U = typing.TypeVar("U")  # NOTE(review): no use is visible in this part of the file
+
+
+class FutureFinishedWithExceptionError(Exception):
+    """An error raised when a future finished with an exception."""
+
+
+class FutureNotSetError(Exception):
+    """An error raised when a future is not set."""
+
+
+class SimpleFuture[T]:
+    """A simple anyio-based future.
+
+    Futures can hold a single result, which is provided by an async task. Following anyio (rather
+    than asyncio) conventions, every task is associated with a task group. Futures allow us to
+    more easily pass around "boxes" that will eventually contain results.
+
+    Note that if you just want to wait for a set of tasks to complete, you can just use an anyio
+    task group, and store the result in a mutable container like a list or dict; you don't need
+    futures the same way you do in asyncio. This class is most useful when either:
+    - you only have a single result of each type and don't want to deal with creating a list with
+      one element, or want the more convenient typechecked interface of SimpleFuture,
+    - you want to pass a box between two tasks, so that one task can set the result and another
+      can wait for it to be set, without requiring a more heavyweight mechanism like a queue.
+
+    A basic low-level pattern you can use to run a task and get its result later:
+
+    ```python
+    async with anyio.create_task_group() as tg:
+        future = SimpleFuture()
+
+        async def go(*args):
+            future.set_result(await some_task_fn(*args))
+
+        tg.start_soon(go, *args)
+
+        # do something else asynchronously while the task runs
+
+    # once you exit the task group, the future will have a result
+    value = future.get()
+    ```
+
+    This pattern is useful enough that there is a wrapper so that you can instead do:
+
+    ```python
+    async with anyio.create_task_group() as tg:
+        future = future_from_start_soon(tg, some_task_fn, *args)
+        # do something else asynchronously while the task runs
+
+    # once you exit the task group, the future will have a result
+    value = future.get()
+    ```
+
+    You can also use `await future.wait_for_result()` to wait for the future to complete and get
+    the result.
+    """
+
+    def __init__(self):
+        self.event = anyio.Event()
+        self.result = None
+        self.exception = None
+
+    def set_result(self, result: T):
+        """Set the result of the future."""
+
+        assert not self.event.is_set(), "Result already set"
+        self.result = result
+        self.event.set()
+
+    def set_from_task(
+        self,
+        task_fn: Callable[[*Args], typing.Awaitable[T]],
+        set_exception: bool = True,
+        catch_exception: bool = False,
+    ) -> Callable[[*Args], typing.Awaitable[None]]:
+        """Wrap a callable to set the result of the future from its result.
+
+        This function can be used to wrap a task that returns a value into a task that sets a
+        future. It is most useful in combination with `TaskGroup.start_soon`, e.g.
+
+        ```python
+        future = SimpleFuture()
+        async with anyio.create_task_group() as tg:
+            # runs `await some_task_fn(*args)` and sets the future to the result
+            tg.start_soon(future.set_from_task(some_task_fn), *args)
+
+            # or, same pattern but with keyword arguments
+            tg.start_soon(future.set_from_task(functools.partial(some_task_fn, **kwargs)))
+        ```
+
+        Args:
+            task_fn: A callable that returns an awaitable.
+            set_exception: Whether to set the future to the exception if the task raises an
+                exception. If False, the exception will not be set.
+            catch_exception: Whether to catch exceptions and set the future to the exception. If
+                False (the default), exceptions will be propagated up to the task group. If True,
+                exceptions will be caught and will not cancel the task group. `set_exception` must
+                be True if `catch_exception` is True.
+
+        Returns:
+            A callable that can be used to start a task that sets the future to the result of
+            `task_fn`.
+        """
+        if catch_exception and not set_exception:
+            raise ValueError("set_exception must be True if catch_exception is True")
+
+        async def wrapped_task_fn(*args: *Args):
+            try:
+                result = await task_fn(*args)
+            except Exception as e:
+                if set_exception:
+                    self.set_exception(e)
+                    if not catch_exception:
+                        raise
+                else:
+                    raise
+            else:
+                self.set_result(result)
+
+        return wrapped_task_fn
+
+    def set_exception(self, exception: Exception):
+        """Set the result of the future to an exception."""
+        assert not self.event.is_set(), "Result already set"
+        self.exception = exception
+        self.event.set()
+
+    def has_result(self) -> bool:
+        """Check if the future has a result."""
+        return self.event.is_set()
+
+    async def wait_for_result(self) -> T:
+        """Wait for the future to complete and return the result.
+
+        If the future has an exception, it will be raised.
+        """
+        await self.event.wait()
+        if self.exception:
+            raise FutureFinishedWithExceptionError(
+                f"Future finished with an exception (of type {type(self.exception).__name__})!"
+            ) from self.exception
+        assert self.result is not None
+        return self.result
+
+    def get(self) -> T:
+        """Synchronously get the result of the future.
+
+        If the future has an exception, it will be raised.
+        """
+        if not self.has_result():
+            raise FutureNotSetError("Result not set")
+        if self.exception:
+            raise self.exception
+        assert self.result is not None
+        return self.result
+
+
+def future_from_start_soon[
+    T, *Args
+](
+    task_group: anyio.abc.TaskGroup,
+    task_fn: Callable[[*Args], Awaitable[T]],
+    *args: *Args,
+    catch_exception: bool = False,
+) -> SimpleFuture[T]:
+    """Create a future from the result of starting a coroutine in a task group.
+
+    This is a convenience function for creating a future and starting a task in a task group.
+    It can be used like this:
+
+    ```python
+    async with anyio.create_task_group() as tg:
+        future = future_from_start_soon(tg, some_task_fn, *args)
+    ```
+
+    Args:
+        task_group: The task group to start the task in. This is the task group that owns the
+            task and will be cancelled if the task raises an exception (unless
+            `catch_exception` is True).
+        task_fn: The coroutine to start.
+        *args: The arguments to pass to the coroutine.
+        catch_exception: Whether to catch exceptions and set the future to the exception. If
+            False (the default), exceptions will be propagated up to the task group. If True,
+            exceptions will be caught and stored in the future instead, and the task group
+            will not be cancelled.
+
+    Returns:
+        A future that will eventually contain the result of the coroutine, or an exception if
+        the coroutine raises an exception and `catch_exception` is True.
+    """
+    result = SimpleFuture[T]()
+    task_group.start_soon(
+        result.set_from_task(task_fn, set_exception=True, catch_exception=catch_exception),
+        *args,
+    )
+    return result
+
+
+P = ParamSpec("P")  # full parameter list of the wrapped function
+T = TypeVar("T")  # its return type; NOTE(review): rebinds the module-level T declared near the top of this file
+
@@ -0,0 +1,91 @@
+from openai import AsyncOpenAI
+from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
+from jailbreaking_frontier_models.logprobs import LogProbs
+from transformers import PreTrainedTokenizerBase
+from tenacity import AsyncRetrying, before_sleep_log, stop_after_attempt, wait_random_exponential
+import logging
+
+logger = logging.getLogger(__name__)
+
+async def get_token_logprobs(
+    client: AsyncOpenAI,
+    tokenizer: PreTrainedTokenizerBase,
+    model: str,
+    input_token_ids: list[int] | None = None,
+    input_messages: list[ChatCompletionMessageParam] | None = None,
+    output_token_ids: list[int] | None = None,
+    output_text: str | None = None,
+) -> LogProbs:
+    """Get token-level log probabilities for a response.
+
+    Args:
+        client: OpenAI client instance
+        model: The model to use
+        input_messages: The messages to use
+        limiter_fn: The limiter function to use
+
+    Returns:
+        LogProbs object containing prompt and response tokens with their logprobs
+
+    WARNING: For vllm, logprobs are not affected by temperature (they assume temperature=1.0). Also, you can't use top_p with logprobs.
+    """
+
+    assert (input_token_ids is not None or input_messages is not None) and not (
+        input_token_ids is not None and input_messages is not None
+    ), "Must provide either input_token_ids or input_messages, but not both"
+    assert (output_token_ids is not None or output_text is not None) and not (
+        output_token_ids is not None and output_text is not None
+    ), "Must provide either output_token_ids or output_text, but not both"
+
+    if input_messages is not None:
+        conversation_tokens = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True)  # type: ignore
+    elif input_token_ids is not None:
+        conversation_tokens = input_token_ids
+    else:
+        raise ValueError("Must provide either input_messages or input_token_ids")
+
+    assert output_text is not None, "output_text must be provided"
+
+    full_text = tokenizer.decode(conversation_tokens) + output_text
+
+    async for attempt in AsyncRetrying(
+        wait=wait_random_exponential(multiplier=1, min=1, max=60),
+        stop=stop_after_attempt(50),
+        reraise=True,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    ):
+        with attempt:
+            output = await client.completions.create(
+                model=model,
+                prompt=full_text,
+                max_tokens=1,
+                logprobs=1,
+                echo=True,
+            )
+
+            # Cut off the last tokens, since we sample max_tokens=1 (required for sglang)
+            token_strs = output.choices[0].logprobs.tokens[:-1]  # type: ignore
+            token_logprobs = output.choices[0].logprobs.token_logprobs[:-1]  # type: ignore
+
+            if token_strs is None or token_logprobs is None:
+                raise ValueError("Failed to get logprobs from model")
+
+            suffix_idx = len(conversation_tokens)
+
+            prompt_token_strs = token_strs[:suffix_idx]
+            prompt_token_logprobs = list(
+                token_logprobs[:suffix_idx]
+            )  
+            response_token_strs = token_strs[suffix_idx:]
+            response_token_logprobs = list(
+                token_logprobs[suffix_idx:]
+            ) 
+
+            return LogProbs(
+                prompt_token_strs=prompt_token_strs,
+                prompt_token_logprobs=prompt_token_logprobs,  # type: ignore
+                response_token_strs=response_token_strs,
+                response_token_logprobs=response_token_logprobs,  # type: ignore
+            )
+
+    raise RuntimeError("Failed to get logprobs from model")