Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,12 @@ @misc{odin2024
% Journal version of the paper (PLOS ONE 20(1), article e0314658); the URL is the DOI resolver link.
@article{inie2025summon,
  title   = {Summon a Demon and Bind it: A Grounded Theory of {LLM} Red Teaming},
  author  = {Nanna Inie and Jonathan Stray and Leon Derczynski},
  journal = {PLOS ONE},
  volume  = {20},
  number  = {1},
  pages   = {e0314658},
  year    = {2025},
  url     = {https://doi.org/10.1371/journal.pone.0314658},
}

@misc{vantaylor2024socialbias,
Expand Down Expand Up @@ -633,17 +636,19 @@ @article{rottger2025msts
url = {https://arxiv.org/abs/2501.10057},
}

% Conference version (ICML 2024, published by PMLR); cited as proceedings rather than as an arXiv preprint.
@inproceedings{zong2024vlguard,
  title     = {Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models},
  author    = {Yongshuo Zong and Ondrej Bohdal and Tingyang Yu and Yongxin Yang and Timothy Hospedales},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning (ICML)},
  pages     = {62867--62891},
  year      = {2024},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v235/zong24a.html},
}

@article{lopez2024pyrit,
title = {{PyRIT}: A Framework for Security Risk Identification and Red Teaming in Generative {AI} Systems},
author = {Gary D. Lopez Munoz and Amanda J. Minnich and Roman Lutz and Richard Lundeen and Raja Sekhar Rao Dheekonda and Nina Chikanov and Bolor-Erdene Jagdagdorj and Martin Pouliot and Shiven Chawla and Whitney Maxwell and Blake Bullwinkel and Katherine Pratt and Joris de Gruyter and Charlotte Siska and Pete Bryan and Tori Westerhoff and Chang Kawaguchi and Christian Seifert and Ram Shankar Siva Kumar and Yonatan Zunger},
author = {Gary D. {Lopez Munoz} and Amanda J. Minnich and Roman Lutz and Richard Lundeen and Raja Sekhar Rao Dheekonda and Nina Chikanov and Bolor-Erdene Jagdagdorj and Martin Pouliot and Shiven Chawla and Whitney Maxwell and Blake Bullwinkel and Katherine Pratt and Joris de Gruyter and Charlotte Siska and Pete Bryan and Tori Westerhoff and Chang Kawaguchi and Christian Seifert and Ram Shankar Siva Kumar and Yonatan Zunger},
journal = {arXiv preprint arXiv:2410.02828},
year = {2024},
url = {https://arxiv.org/abs/2410.02828},
Expand All @@ -667,12 +672,13 @@ @inproceedings{wang2025siuo
note = {Introduces the {SIUO} (Safe Inputs but Unsafe Output) benchmark},
}

% Conference paper (ICLR 2025) with individual authors listed; the URL points to the arXiv record.
@inproceedings{darkbench2025,
  title     = {{DarkBench}: Benchmarking Dark Patterns in Large Language Models},
  author    = {Esben Kran and Hieu Minh Nguyen and Akash Kundu and Sami Jawhar and Jinsuk Park and Mateusz Maria Jurewicz},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2025},
  url       = {https://arxiv.org/abs/2503.10728},
  note      = {Oral presentation at ICLR 2025},
}

@misc{embracethered2025sneakybits,
Expand Down
Loading