Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions apps/evaluations/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from .models import (
AppliedTag,
DatasetAutoPopulationRule,
DatasetIngestionEntry,
EvaluationConfig,
EvaluationDataset,
EvaluationMessage,
Expand Down Expand Up @@ -82,3 +84,26 @@ class AppliedTagAdmin(ReadonlyAdminMixin, admin.ModelAdmin):
list_display = ("id", "evaluation_result", "rule", "tag", "team")
list_filter = ("team",)
search_fields = ("rule__evaluator__name",)


@admin.register(DatasetAutoPopulationRule)
class DatasetAutoPopulationRuleAdmin(ReadonlyAdminMixin, admin.ModelAdmin):
    """Admin changelist configuration for ``DatasetAutoPopulationRule``.

    Combined with ``ReadonlyAdminMixin``; the attributes below only control
    what the changelist page shows, filters and searches.
    """

    # Free-text search traverses the foreign keys to the dataset and
    # source-experiment names.
    search_fields = ("dataset__name", "source_experiment__name")

    # Sidebar filters.
    list_filter = ("team", "evaluation_mode", "is_enabled", "last_run_status")

    # Changelist columns, left to right.
    list_display = (
        "id", "dataset", "source_experiment", "evaluation_mode",
        "is_enabled", "last_run_at", "last_run_status", "team",
    )


@admin.register(DatasetIngestionEntry)
class DatasetIngestionEntryAdmin(ReadonlyAdminMixin, admin.ModelAdmin):
    """Admin changelist configuration for ``DatasetIngestionEntry``.

    Combined with ``ReadonlyAdminMixin``; the attributes below only control
    what the changelist page shows, filters and searches.
    """

    # Free-text search goes through the rule to the target dataset's name.
    search_fields = ("rule__dataset__name",)

    # Sidebar filter: one choice per rule.
    list_filter = ("rule",)

    # Changelist columns, left to right.
    # NOTE(review): ``source_message`` is a nullable foreign key. Django's
    # automatic changelist ``select_related()`` does not follow nullable FKs,
    # so this column may cost one query per row - consider an explicit
    # ``list_select_related`` if the page turns out to be slow; confirm first.
    list_display = (
        "id", "rule", "evaluation_message",
        "source_session", "source_message", "created_at",
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Generated by Django 5.2.13 on 2026-05-06 12:51

import apps.utils.models
import django.db.models.deletion
import django.utils.timezone
from django.db import migrations, models


class Migration(migrations.Migration):
    """Schema changes for dataset auto-population and scoped ("delta") evaluation runs.

    Adds ``auto_run_on_append`` to ``EvaluationConfig``, adds ``scoped_messages``
    and a ``delta`` run type to ``EvaluationRun``, and creates the
    ``DatasetAutoPopulationRule`` and ``DatasetIngestionEntry`` tables together
    with their indexes and partial unique constraints.
    """

    # The new foreign keys point at chat, experiments and teams models, hence
    # the cross-app dependencies.
    dependencies = [
        ('chat', '0024_delete_orphaned_chats'),
        ('evaluations', '0014_add_tag_rules_and_applied_tags'),
        ('experiments', '0134_expsession_team_lastact_idx'),
        ('teams', '0009_merge_pipeline_admin_into_experiment_admin'),
    ]

    operations = [
        # Opt-in flag on the evaluation config; defaults to False.
        migrations.AddField(
            model_name='evaluationconfig',
            name='auto_run_on_append',
            field=models.BooleanField(default=False, help_text='When enabled, an evaluation run is automatically enqueued for newly appended dataset messages. Each auto-run only evaluates the new rows, but still incurs LLM costs proportional to the number of evaluators and new messages.'),
        ),
        # Optional subset of messages a run is restricted to.
        migrations.AddField(
            model_name='evaluationrun',
            name='scoped_messages',
            field=models.ManyToManyField(blank=True, help_text='When set, this run evaluates only these messages instead of the full dataset. Used by delta runs triggered by dataset appends.', related_name='scoped_evaluation_runs', to='evaluations.evaluationmessage'),
        ),
        # Declares the run type choices as full / preview / delta.
        migrations.AlterField(
            model_name='evaluationrun',
            name='type',
            field=models.CharField(choices=[('full', 'Full'), ('preview', 'Preview'), ('delta', 'Delta')], db_index=True, default='full', max_length=20),
        ),
        # Rule table: one row per (dataset, source experiment) auto-population rule,
        # including the bookkeeping columns for the most recent run.
        migrations.CreateModel(
            name='DatasetAutoPopulationRule',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('evaluation_mode', models.CharField(choices=[('message', 'Message'), ('session', 'Session')], help_text="Must match the target dataset's evaluation_mode.", max_length=10)),
                ('filter_query', models.TextField(blank=True, default='', help_text='Query-string-encoded FilterParams (same format used by manual filter import). Empty means no filter beyond the source experiment scope.')),
                ('is_enabled', models.BooleanField(default=True)),
                ('last_ingested_at', models.DateTimeField(default=django.utils.timezone.now, help_text="High-water mark: the periodic task only considers sources newer than this timestamp (minus a small safety margin) on each run. Initialised to the rule's creation time so existing history is not backfilled.")),
                ('last_run_at', models.DateTimeField(blank=True, null=True)),
                ('last_run_status', models.CharField(blank=True, choices=[('success', 'Success'), ('error', 'Error'), ('no_op', 'No-op')], default='', max_length=20)),
                ('last_error', models.TextField(blank=True, default='')),
                ('consecutive_failure_count', models.PositiveIntegerField(default=0)),
                ('dataset', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='auto_population_rules', to='evaluations.evaluationdataset')),
                ('source_experiment', models.ForeignKey(help_text='The experiment whose sessions/messages are sampled by this rule.', on_delete=django.db.models.deletion.CASCADE, related_name='dataset_auto_population_rules', to='experiments.experiment')),
                ('team', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='teams.team', verbose_name='Team')),
            ],
            bases=(models.Model, apps.utils.models.VersioningMixin),
        ),
        # Provenance table: links an appended evaluation message back to the rule
        # and the source session/message it came from. Every FK cascades on delete.
        migrations.CreateModel(
            name='DatasetIngestionEntry',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('evaluation_message', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='ingestion_entries', to='evaluations.evaluationmessage')),
                ('rule', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='ingestion_entries', to='evaluations.datasetautopopulationrule')),
                ('source_message', models.ForeignKey(blank=True, help_text='Set for message-mode ingestion; null for session-mode ingestion.', null=True, on_delete=django.db.models.deletion.CASCADE, related_name='ingestion_entries', to='chat.chatmessage')),
                ('source_session', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='ingestion_entries', to='experiments.experimentsession')),
            ],
            bases=(models.Model, apps.utils.models.VersioningMixin),
        ),
        # Composite index on (is_enabled, last_run_at) for the rule table.
        migrations.AddIndex(
            model_name='datasetautopopulationrule',
            index=models.Index(fields=['is_enabled', 'last_run_at'], name='evaluations_is_enab_afd336_idx'),
        ),
        # Composite index on (rule, created_at) for the ingestion-entry table.
        migrations.AddIndex(
            model_name='datasetingestionentry',
            index=models.Index(fields=['rule', 'created_at'], name='evaluations_rule_id_b74853_idx'),
        ),
        # Partial unique constraint for rows that have a source_message:
        # a rule may reference a given message at most once.
        migrations.AddConstraint(
            model_name='datasetingestionentry',
            constraint=models.UniqueConstraint(condition=models.Q(('source_message__isnull', False)), fields=('rule', 'source_message'), name='unique_ingestion_per_rule_message'),
        ),
        # Partial unique constraint for rows without a source_message:
        # a rule may reference a given session at most once.
        migrations.AddConstraint(
            model_name='datasetingestionentry',
            constraint=models.UniqueConstraint(condition=models.Q(('source_message__isnull', True)), fields=('rule', 'source_session'), name='unique_ingestion_per_rule_session'),
        ),
    ]
127 changes: 127 additions & 0 deletions apps/evaluations/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ class EvaluationRunStatus(models.TextChoices):
class EvaluationRunType(models.TextChoices):
    """Kinds of evaluation run, as (stored value, display label) pairs."""

    FULL = ("full", "Full")
    PREVIEW = ("preview", "Preview")
    DELTA = ("delta", "Delta")


class AutoPopulationRunStatus(models.TextChoices):
    """Outcome values for an auto-population run, as (stored value, display label) pairs."""

    SUCCESS = ("success", "Success")
    ERROR = ("error", "Error")
    NO_OP = ("no_op", "No-op")


class DatasetCreationStatus(models.TextChoices):
Expand Down Expand Up @@ -295,6 +302,14 @@ class EvaluationConfig(BaseTeamModel):
default=ExperimentVersionSelection.SPECIFIC,
help_text=("Type of version selection: specific, latest_working, or latest_published"),
)
auto_run_on_append = models.BooleanField(
default=False,
help_text=(
"When enabled, an evaluation run is automatically enqueued for newly appended "
"dataset messages. Each auto-run only evaluates the new rows, but still incurs "
"LLM costs proportional to the number of evaluators and new messages."
),
)

def __str__(self):
return f"EvaluationConfig ({self.name})"
Expand Down Expand Up @@ -357,6 +372,15 @@ class EvaluationRun(BaseTeamModel):
type = models.CharField(
max_length=20, choices=EvaluationRunType.choices, default=EvaluationRunType.FULL, db_index=True
)
scoped_messages = models.ManyToManyField(
EvaluationMessage,
blank=True,
related_name="scoped_evaluation_runs",
help_text=(
"When set, this run evaluates only these messages instead of the full dataset. "
"Used by delta runs triggered by dataset appends."
),
)
job_id = models.CharField(max_length=255, blank=True)
error_message = models.TextField(blank=True)

Expand Down Expand Up @@ -518,3 +542,106 @@ class Meta:

def __str__(self):
return f"AppliedTag(result={self.evaluation_result_id}, rule={self.rule_id}, tag={self.tag_id})"


class DatasetAutoPopulationRule(BaseTeamModel):
    """A rule that periodically appends matching sessions/messages from a source experiment to a dataset."""

    # NOTE(review): presumably the number of consecutive failures after which a
    # rule is switched off; the code that enforces it is not in this class - confirm.
    AUTO_DISABLE_FAILURE_THRESHOLD = 3

    # Target dataset; deleting the dataset deletes its rules (CASCADE).
    dataset = models.ForeignKey(EvaluationDataset, on_delete=models.CASCADE, related_name="auto_population_rules")
    # Source of the sessions/messages; deleting the experiment deletes the rule (CASCADE).
    source_experiment = models.ForeignKey(
        "experiments.Experiment",
        on_delete=models.CASCADE,
        related_name="dataset_auto_population_rules",
        help_text="The experiment whose sessions/messages are sampled by this rule.",
    )
    # Stored on the rule itself; nothing in this class enforces that it matches
    # the dataset's mode, so that check has to live elsewhere.
    evaluation_mode = models.CharField(
        max_length=10,
        choices=EvaluationMode.choices,
        help_text="Must match the target dataset's evaluation_mode.",
    )
    # Optional filter, kept as raw text; defaults to the empty string.
    filter_query = models.TextField(
        blank=True,
        default="",
        help_text=(
            "Query-string-encoded FilterParams (same format used by manual filter import). "
            "Empty means no filter beyond the source experiment scope."
        ),
    )
    # New rules start out enabled.
    is_enabled = models.BooleanField(default=True)
    # The default is the callable ``timezone.now``, so each new row gets the time
    # at which it is created rather than a value fixed at import time.
    last_ingested_at = models.DateTimeField(
        default=timezone.now,
        help_text=(
            "High-water mark: the periodic task only considers sources newer than "
            "this timestamp (minus a small safety margin) on each run. Initialised to the rule's "
            "creation time so existing history is not backfilled."
        ),
    )
    # Bookkeeping for the most recent run. All four start "empty"
    # (NULL / "" / "" / 0); presumably they are written by the periodic task - confirm.
    last_run_at = models.DateTimeField(null=True, blank=True)
    last_run_status = models.CharField(
        max_length=20,
        choices=AutoPopulationRunStatus.choices,
        blank=True,
        default="",
    )
    last_error = models.TextField(blank=True, default="")
    consecutive_failure_count = models.PositiveIntegerField(default=0)

    class Meta:
        # Composite index on (is_enabled, last_run_at).
        indexes = [
            models.Index(fields=["is_enabled", "last_run_at"]),
        ]

    def __str__(self) -> str:
        """Return a short label built from the raw dataset and experiment ids."""
        return f"AutoPopulationRule(dataset={self.dataset_id}, experiment={self.source_experiment_id})"

    def get_absolute_url(self):
        """Return the URL of the dataset this rule belongs to."""
        return self.dataset.get_absolute_url()


class DatasetIngestionEntry(BaseModel):
    """Provenance record: a specific source session/message was appended to a dataset by a specific rule.

    The unique constraints prevent the same source from being appended twice by the same rule, even
    across crashes or worker overlap. Message-mode rules use ``source_message`` as the unique key;
    session-mode rules use ``source_session`` (with ``source_message`` left null).
    """

    # The rule that produced this entry; entries are removed with the rule (CASCADE).
    rule = models.ForeignKey(DatasetAutoPopulationRule, on_delete=models.CASCADE, related_name="ingestion_entries")
    # The dataset row that was created for the source; removed with it (CASCADE).
    evaluation_message = models.ForeignKey(
        EvaluationMessage, on_delete=models.CASCADE, related_name="ingestion_entries"
    )
    # Always required, in both modes.
    source_session = models.ForeignKey(
        ExperimentSession,
        on_delete=models.CASCADE,
        related_name="ingestion_entries",
    )
    # Nullable: its presence or absence selects which unique constraint below applies.
    source_message = models.ForeignKey(
        ChatMessage,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name="ingestion_entries",
        help_text="Set for message-mode ingestion; null for session-mode ingestion.",
    )

    class Meta:
        # Two partial unique constraints with complementary conditions on
        # ``source_message``, so every row is covered by exactly one of them.
        constraints = [
            # Rows with a message: (rule, source_message) must be unique.
            models.UniqueConstraint(
                fields=["rule", "source_message"],
                condition=models.Q(source_message__isnull=False),
                name="unique_ingestion_per_rule_message",
            ),
            # Rows without a message: (rule, source_session) must be unique.
            models.UniqueConstraint(
                fields=["rule", "source_session"],
                condition=models.Q(source_message__isnull=True),
                name="unique_ingestion_per_rule_session",
            ),
        ]
        # Composite index on (rule, created_at).
        indexes = [
            models.Index(fields=["rule", "created_at"]),
        ]

    def __str__(self) -> str:
        """Return a short label built from the raw rule and evaluation-message ids."""
        return f"DatasetIngestionEntry(rule={self.rule_id}, eval_message={self.evaluation_message_id})"
8 changes: 8 additions & 0 deletions apps/teams/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ class Flags(FlagInfo, Enum):

EVALS = ("flag_evaluations", "Chatbot Evaluations (beta)", "evals", [], True)

AUTO_POPULATE_EVAL_DATASETS = (
"flag_auto_populate_eval_datasets",
"Auto-populate evaluation datasets from filter rules and auto-run linked evaluations",
"evals",
["flag_evaluations"],
True,
)

MCP = ("flag_mcp", "MCP tool support for chatbots (alpha)")

NOTIFICATIONS = ("flag_notifications", "User notifications", "", [], False, True)
Expand Down
2 changes: 2 additions & 0 deletions openspec/changes/auto-populate-eval-datasets/.openspec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-05-06
Loading
Loading