|
| 1 | +"use client" |
| 2 | + |
| 3 | +import Image from "next/image"; |
| 4 | +import CoverImage from "../components"; |
| 5 | +import AgentvsStoryteller_intro from '@/public/projects/agent_vs_storyteller/agent_vs_storyteller_intro.png' |
| 6 | +import sotopia_pi from '@/public/projects/sotopia_rl.png' |
| 7 | +import model from '@/public/projects/sotopia_pi/model.png' |
| 8 | +import safety_agent_1 from '@/public/projects/sotopia_pi/safety_agent_1.png' |
| 9 | +import safety_agent_2 from '@/public/projects/sotopia_pi/safety_agent_2.png' |
| 10 | +import model_step_1 from '@/public/projects/sotopia_pi/model_step_1.png' |
| 11 | +import model_step_2 from '@/public/projects/sotopia_pi/model_step_2.png' |
| 12 | +import model_step_3 from '@/public/projects/sotopia_pi/model_step_3.png' |
| 13 | +import model_eval from '@/public/projects/sotopia_pi/model_eval.png' |
| 14 | +import model_perf from '@/public/projects/sotopia_pi/model_perf.png' |
| 15 | +import pomdp_sotopia from '@/public/projects/sotopia_rl/pomdp_sotopia.png' |
| 16 | +import sotopia_rl_pipeline from '@/public/projects/sotopia_rl/sotopia_rl_pipeline.png' |
| 17 | +import sotopia_rl_case_study from '@/public/projects/sotopia_rl/sotopia_rl_case_study.png' |
| 18 | +import sotopia_rl_case_study_2 from '@/public/projects/sotopia_rl/sotopia_rl_case_study_2.png' |
| 19 | +import sotopia_rl_performance from '@/public/projects/sotopia_rl/sotopia_rl_performance.png' |
| 20 | +import goal_results from '@/public/projects/agent_vs_storyteller/goal_results.png' |
| 21 | +import { |
| 22 | + HoverCard, |
| 23 | + HoverCardContent, |
| 24 | + HoverCardTrigger, |
| 25 | +} from "@/components/ui/hover-card" |
| 26 | +import Link from "next/link"; |
| 27 | +import { Abhaya_Libre } from "next/font/google"; |
| 28 | +import { Detail } from "@/components/ListDetail/Detail"; |
| 29 | +import React from "react"; |
| 30 | + |
| 31 | +import { |
| 32 | + NavigationMenu, |
| 33 | + NavigationMenuContent, |
| 34 | + NavigationMenuIndicator, |
| 35 | + NavigationMenuItem, |
| 36 | + NavigationMenuLink, |
| 37 | + NavigationMenuList, |
| 38 | + NavigationMenuTrigger, |
| 39 | + NavigationMenuViewport, |
| 40 | + navigationMenuTriggerStyle, |
| 41 | +} from "@/components/ui/navigation-menu" |
| 42 | +/** Section wrapper: a twelve-column grid <div>. Caller props are spread after className, so a passed className replaces the default. */ |
| 43 | +function SectionContent(divProps: React.JSX.IntrinsicAttributes & React.ClassAttributes<HTMLDivElement> & React.HTMLAttributes<HTMLDivElement>) { |
| 44 | +  const gridClasses = "w-full gap-8 grid grid-cols-12 font-inter py-10 px-6 sm:px-12"; |
| 45 | +  return <div className={gridClasses} {...divProps} />; |
| 46 | +} |
| 47 | +/** Section heading rendered as an <h1> with col-span-12; caller props are spread after className, so a passed className replaces the default. */ |
| 48 | +function SectionTitle(props: React.JSX.IntrinsicAttributes & React.ClassAttributes<HTMLHeadingElement> & React.HTMLAttributes<HTMLHeadingElement>) { |
| 49 | +  return <h1 className="text-4xl col-span-12 font-grotesk text-black dark:text-white dark:font-light dark:drop-shadow-[0_1px_15px_#ccfbf1]" {...props} /> |
| 50 | +} |
| 51 | +/** Section body paragraph rendered as a <p> with col-span-12 and prose styling; caller props are spread after className, so a passed className replaces the default. */ |
| 52 | +function SectionText(props: React.JSX.IntrinsicAttributes & React.ClassAttributes<HTMLParagraphElement> & React.HTMLAttributes<HTMLParagraphElement>) { |
| 53 | +  return <p className="col-span-12 prose mt-4 text-lg text-slate-800 dark:text-gray-200 font-space" {...props} /> |
| 54 | +} |
| 55 | +/** Paper header: title, linked author list with role/affiliation markers, the marker legend, and the numbered affiliation key. */ |
| 56 | +function Title() { |
| 57 | +  return ( |
| 58 | +    <> |
| 59 | +      <h3 className="mb-4 text-4xl lg:text-5xl leading-tight"> |
| 60 | +        <span className="font-display inline">Sotopia-RL</span>: Reward Design for Social Intelligence. |
| 61 | +      </h3> |
| 62 | +      <div className="mb-2 text-md lg:text-lg"> |
| 63 | +        <div><Link className="underline" href="https://haofeiyu.me/">Haofei Yu</Link>*</div> |
| 64 | +        <div><Link className="underline" href="https://www.linkedin.com/in/zhengyang-jason-qi/">Zhengyang Qi</Link>*<sup>1</sup></div> |
| 65 | +        <div><Link className="underline" href="https://www.linkedin.com/in/zhaoyining/">Yining Zhao</Link>*<sup>1</sup></div> |
| 66 | +        <div><Link className="underline" href="https://sites.uci.edu/kolbynottingham/">Kolby Nottingham</Link><sup>2</sup></div> |
| 67 | +        <div><Link className="underline" href="https://keyangds.github.io/">Keyang Xuan</Link><sup>1</sup></div> |
| 68 | +        <div><Link className="underline" href="https://www.majumderb.com/">Bodhisattwa Prasad Majumder</Link><sup>3</sup></div> |
| 69 | +        <div><Link className="underline" href="https://zhuhao.me">Hao Zhu</Link>†<sup>5</sup></div> |
| 70 | +        <div><Link className="underline" href="https://pliang279.github.io/">Paul Pu Liang</Link>†<sup>6</sup></div> |
| 71 | +        <div><Link className="underline" href="https://cs.stanford.edu/~jiaxuan/">Jiaxuan You</Link>†<sup>1</sup></div> |
| 72 | +        {/* NOTE(review): the first author carries no affiliation marker and no author uses marker 4 — confirm against the paper. */} |
| 73 | +      </div> |
| 74 | +      {/* Legend for the * and † markers used in the author list */} |
| 75 | +      <div className="mb-2 text-sm lg:text-md"> |
| 76 | +        * Leading authors. |
| 77 | +        † Equal advising. |
| 78 | +      </div> |
| 79 | +      {/* Affiliation key; the numbers correspond to the superscripts above */} |
| 80 | +      <span className="inline-block pr-1"><sup>1</sup>University of Illinois Urbana-Champaign,</span> |
| 81 | +      <span className="inline-block pr-1"><sup>2</sup>University of California Irvine,</span> |
| 82 | +      <span className="inline-block pr-1"><sup>3</sup>Allen Institute for Artificial Intelligence,</span> |
| 83 | +      <span className="inline-block pr-1"><sup>4</sup>Carnegie Mellon University,</span> |
| 84 | +      <span className="inline-block pr-1"><sup>5</sup>Stanford University,</span> |
| 85 | +      <span className="inline-block pr-1"><sup>6</sup>Massachusetts Institute of Technology</span> |
| 86 | +    </> |
| 87 | +  ); |
| 88 | +} |
| 89 | +/** Lead paragraph summarising the paper, rendered as a single large-type <p>. */ |
| 90 | +function Abstract() { |
| 91 | +  const leadClasses = "antialiased mb-4 text-2xl lg:text-3xl leading-relaxed"; |
| 92 | +  const summary = "How can social agents learn to generate high-quality utterances in social interactions? We propose Sotopia-RL, an RL framework that tackles partial observability and multi-dimensionality in social interactions by assigning fine-grained, utterance-level rewards. We reach the state-of-the-art performance on Sotopia with Qwen2.5-7B-Instruct as the base model."; |
| 93 | +  return ( |
| 94 | +    <p className={leadClasses}>{summary}</p> |
| 95 | +  ); |
| 96 | +} |
| 97 | +/** Sotopia-RL project page: page label, title with cover image, resource links and abstract, motivation, method, evaluation with case studies, and citation. */ |
| 98 | +export default function Index() { |
| 99 | +  const scrollContainerRef = React.useRef(null); // handed to Detail.Container below; not read anywhere else in this component |
| 100 | +  return ( |
| 101 | +    <Detail.Container data-cy="home-intro" ref={scrollContainerRef}> |
| 102 | +      <Detail.ContentContainer className="mx-auto max-w-2xl mt-8 lg:max-w-5xl"> |
| 103 | +        {/* Page label */} |
| 104 | +        <SectionContent> |
| 105 | +          <h1 className="text-xl md:text-2xl font-bold tracking-tighter leading-tight md:pr-8">Research.</h1> |
| 106 | +        </SectionContent> |
| 107 | + |
| 108 | +        {/* Paper title and authors next to the cover image */} |
| 109 | +        <SectionContent> |
| 110 | +          <div className="col-span-12 md:col-span-6"> |
| 111 | +            <Title></Title> |
| 112 | +          </div> |
| 113 | +          <div className="col-span-12 md:col-span-6"> |
| 114 | +            <div className="mb-2 2xl:mb-4"> |
| 115 | +              <CoverImage title={"Sotopia-RL"} src={sotopia_pi}/> |
| 116 | +            </div> |
| 117 | +          </div> |
| 118 | +        </SectionContent> |
| 119 | + |
| 120 | +        {/* Resource links and abstract */} |
| 121 | +        <SectionContent> |
| 122 | +          <div className="col-span-12 text-center">Check out our <Link href="https://arxiv.org/pdf/2508.03905"><span className="underline font-bold">paper</span></Link>, {' '} |
| 123 | +            <Link href="https://github.com/sotopia-lab/sotopia-rl"><span className="underline font-bold">code</span></Link>, {' '} |
| 124 | +            <Link href="https://huggingface.co/ulab-ai/sotopia-rl-qwen-2.5-7B-grpo"><span className="underline font-bold">policy model</span></Link>, {' '} <Link href="https://huggingface.co/ulab-ai/sotopia-rl-qwen2.5-7B-rm"><span className="underline font-bold">reward model</span></Link>, {' '} |
| 125 | +            <Link href="https://huggingface.co/datasets/ulab-ai/sotopia-rl-reward-annotation"><span className="underline font-bold">data</span></Link>, {' '} |
| 126 | +            and interactive demo!</div> |
| 127 | +          <div className="col-span-12"><Abstract></Abstract></div> |
| 128 | +          <hr className="col-span-12" /> |
| 129 | +        </SectionContent> |
| 130 | + |
| 131 | +        {/* Motivation: why reward design matters for social agents */} |
| 132 | +        <SectionContent> |
| 133 | +          <SectionTitle>Why do we need reward design for social intelligence?</SectionTitle> |
| 134 | +          <SectionText> |
| 135 | +            Social agents naturally learn skills through interactions. However, two key features of social interactions (partial observability and multi-dimensionality) make it challenging to train social agents with reinforcement learning (RL). |
| 136 | +          </SectionText> |
| 137 | +          <SectionText> |
| 138 | +            <span className="font-display">Partial Observability</span>. In social interactions, agents operate under partial observability—they only observe the dialogue history, not hidden factors like intentions, emotions, or social norms that drive outcomes. This makes credit assignment difficult: even low-quality utterances may appear in successful conversations, and high-quality ones may go unrewarded. For RL, this creates high-variance reward signals that hinder stable policy learning compared to tasks like math or coding. |
| 139 | +          </SectionText> |
| 140 | + |
| 141 | +          <SectionText> |
| 142 | +            <span className="font-display">Multi-Dimensionality</span>. Rapport building and knowledge seeking can indirectly contribute to the final goal achievement of social interactions. Single-dimensional reward on goal completion score oversimplifies this complexity, encouraging agents to exploit narrow signals while ignoring diverse social behaviors. For RL, this increases the risk of reward hacking or overfitting to spurious features, making it harder to generalize and develop socially intelligent strategies that align with human expectations. |
| 143 | +          </SectionText> |
| 144 | + |
| 145 | +          <SectionText> |
| 146 | +            Overall, we aim to design an RL framework for social agents that makes training efficient and effective, making social agents perform well under diverse social scenarios. |
| 147 | +          </SectionText> |
| 148 | +        </SectionContent> |
| 149 | + |
| 150 | +        <Image src={pomdp_sotopia} className="w-10/12 my-4 mx-auto" alt="Sotopia POMDP illustration" /> |
| 151 | + |
| 152 | + |
| 153 | + |
| 154 | +        {/* Method overview: the three training stages */} |
| 155 | +        <SectionContent> |
| 156 | +          <SectionTitle>How does Sotopia-RL work?</SectionTitle> |
| 157 | +          <SectionText> |
| 158 | +            SOTOPIA-RL consists of three stages: (1) reward design, (2) reward model training, and (3) policy training. |
| 159 | +          </SectionText> |
| 160 | +          <SectionText> |
| 161 | +            <span className="font-display">Reward Design</span>. To build better offline reward labels for RL training, we expand the reward signal along two axes. First, we shift from coarse episode-level feedback to fine-grained <span className="underline decoration-double">utterance-level</span> credit, capturing the temporal granularity of social interactions. Second, we enrich the signal from a single-dimensional score to a <span className="underline decoration-double">multi-dimensional</span> evaluation that includes goal completion, relationship-building, and knowledge-sharing, as provided by SOTOPIA-EVAL. Multi-dimensional scores are normalized and aggregated into a scalar reward, producing robust and socially grounded supervision for RL. |
| 162 | +          </SectionText> |
| 163 | + |
| 164 | +          <SectionText> |
| 165 | +            <span className="font-display">RM Training</span>. In the second stage, we train a reward model to predict the quality of an utterance given the conversation history. Supervised by offline reward labels, the model learns to approximate utterance-level feedback via mean squared error loss, enabling utterance-level online reward inference during policy training. |
| 166 | +          </SectionText> |
| 167 | +          <SectionText> |
| 168 | +            <span className="font-display">Policy Training</span>. Finally, we fine-tune an LLM-based policy model using GRPO. Starting with behavior cloning to ensure fluency and coherence, we continue with single-turn online RL. At each turn, the reward model provides immediate feedback for the utterance, guiding the agent toward socially effective behaviors through interaction. |
| 169 | +          </SectionText> |
| 170 | +          <Image src={sotopia_rl_pipeline} className="my-4 col-span-12" alt="Sotopia-RL pipeline" /> |
| 171 | +        </SectionContent> |
| 172 | +        {/* Evaluation results followed by case studies */} |
| 173 | +        <SectionContent> |
| 174 | +          <SectionTitle> |
| 175 | +            Performance evaluation |
| 176 | +          </SectionTitle> |
| 177 | + |
| 178 | +          <SectionText> |
| 179 | +            We evaluate our model by simulating the interaction between RL-agent and RL-partner (behavior cloning based agent). {' '} |
| 180 | +            This is then evaluated by both GPT-4o rating and human rating. |
| 181 | +          </SectionText> |
| 182 | +          <Image src={model_eval} className="col-span-12 w-2/3 my-4 lg:w-1/3 lg:my-0 mx-auto" alt="Evaluation" /> |
| 183 | + |
| 184 | +          <SectionText> |
| 185 | +            On the hard <span className="font-display">Sotopia</span> tasks, {' '} |
| 186 | +            <span className="font-display">Sotopia-RL</span> improves the social goal completion ability of the behavior cloned model (Qwen-2.5-7B) on both LLM-based evaluation and human evaluations. Training with Sotopia-RL also provides better performance compared with <span className="font-display">Sotopia-π</span>. {' '} |
| 187 | +            Additionally, designing RL rewards across multiple dimensions—such as relationship maintenance, knowledge sharing, and goal completion—leads to better model performance than optimizing for goal completion alone. This is potentially because it encourages more diverse social behaviors and regularizes the training process against overfitting. |
| 188 | +          </SectionText> |
| 189 | + |
| 190 | +          <Image src={sotopia_rl_performance} className="col-span-12 w-full my-4 lg:w-2/3 lg:my-0 mx-auto" alt="Performance" /> |
| 191 | + |
| 192 | +          <SectionTitle> |
| 193 | +            Sotopia-RL builds a more intelligent social agent |
| 194 | +          </SectionTitle> |
| 195 | +          <SectionText> |
| 196 | +            As shown in the examples, the model trained with Sotopia-RL is more persuasive in proposing collaborative solutions compared with behavior cloning baselines. {' '} |
| 197 | +          </SectionText> |
| 198 | +          <Image src={sotopia_rl_case_study} className="col-span-12 w-2/3 my-4 lg:w-2/3 lg:my-0 mx-auto" alt="Sotopia-RL case study 1"/> |
| 199 | + |
| 200 | +          <SectionText> |
| 201 | +            Moreover, in the following example, the model trained with Sotopia-RL is able to acknowledge the social goal of both sides and offer a solution-oriented perspective. |
| 202 | +          </SectionText> |
| 203 | +          <Image src={sotopia_rl_case_study_2} className="col-span-12 w-2/3 my-4 lg:w-2/3 lg:my-0 mx-auto" alt="Sotopia-RL case study 2"/> |
| 204 | + |
| 205 | + |
| 206 | +        </SectionContent> |
| 207 | +        {/* BibTeX citation; whitespace-pre keeps the line breaks embedded in the string */} |
| 208 | +        <SectionContent> |
| 209 | +          <SectionTitle> |
| 210 | +            Citation |
| 211 | +          </SectionTitle> |
| 212 | +          <code className="col-span-12 block whitespace-pre overflow-x-scroll bg-slate-100 bg-clip-border p-3"> |
| 213 | +            {'@misc{yu2025sotopiarlrewarddesignsocial,\n' + |
| 214 | +            ' title={Sotopia-RL: Reward Design for Social Intelligence},\n' + |
| 215 | +            ' author={Haofei Yu and Zhengyang Qi and Yining Zhao and Kolby Nottingham and Keyang Xuan and Bodhisattwa Prasad Majumder and Hao Zhu and Paul Pu Liang and Jiaxuan You},\n' + |
| 216 | +            ' year={2025},\n' + |
| 217 | +            ' eprint={2508.03905},\n' + |
| 218 | +            ' archivePrefix={arXiv},\n' + |
| 219 | +            ' primaryClass={cs.CL},\n' + |
| 220 | +            ' url={https://arxiv.org/abs/2508.03905}\n' + |
| 221 | +            '}'} |
| 222 | +          </code> |
| 223 | + |
| 224 | + |
| 225 | +        </SectionContent> |
| 226 | + |
| 227 | +      </Detail.ContentContainer> |
| 228 | +    </Detail.Container> |
| 229 | +  ); |
| 230 | +} |
0 commit comments