diff --git a/.github/workflows/build-push-images.yml b/.github/workflows/build-push-images.yml index 9a491385..edfac190 100644 --- a/.github/workflows/build-push-images.yml +++ b/.github/workflows/build-push-images.yml @@ -19,6 +19,7 @@ jobs: - chat - image-analysis - flux-image-gen + - omni permissions: contents: read id-token: write # needed for signing the images with GitHub OIDC Token diff --git a/charts/azimuth-omni-backend/.helmignore b/charts/azimuth-omni-backend/.helmignore new file mode 100644 index 00000000..1924f397 --- /dev/null +++ b/charts/azimuth-omni-backend/.helmignore @@ -0,0 +1,33 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + +# Others +README.md +Dockerfile +*kubeconfig.y[a]ml +venv/ +__pycache__/ +images/ +*.secret +ci/ diff --git a/charts/azimuth-omni-backend/Chart.yaml b/charts/azimuth-omni-backend/Chart.yaml new file mode 100644 index 00000000..9ae99ec5 --- /dev/null +++ b/charts/azimuth-omni-backend/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +name: azimuth-llm-omni-backend +description: | + In-cluster vLLM backends for the Omni multimodal interface + (text-to-text / chat, text-to-speech, text-to-image). 
+maintainers: + - name: "Victor HANG" + email: victor@stackhpc.com + +type: application + +# The version and appVersion are updated by the chart build script +version: 0.1.0 +appVersion: local + +icon: https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg diff --git a/charts/azimuth-omni-backend/ci/test-values.yaml b/charts/azimuth-omni-backend/ci/test-values.yaml new file mode 100644 index 00000000..3131f547 --- /dev/null +++ b/charts/azimuth-omni-backend/ci/test-values.yaml @@ -0,0 +1,13 @@ +# CI: only exercise the TTT backend with the smallest possible model. +ttt: + enabled: true + huggingface: + model: HuggingFaceTB/SmolLM2-135M-Instruct + api: + monitoring: + enabled: false + gpus: 0 +tts: + enabled: false +tti: + enabled: false diff --git a/charts/azimuth-omni-backend/templates/NOTES.txt b/charts/azimuth-omni-backend/templates/NOTES.txt new file mode 100644 index 00000000..e11a1eca --- /dev/null +++ b/charts/azimuth-omni-backend/templates/NOTES.txt @@ -0,0 +1,17 @@ +Azimuth Omni backends provide one or more in-cluster vLLM model deployments +(text-to-text / chat, text-to-speech, text-to-image) for the Omni interface. + +Enabled backends in this release: +{{- range $alias := list "ttt" "tts" "tti" }} +{{- $sub := index $.Values $alias }} +{{- if and $sub $sub.enabled }} + - {{ $alias }}: {{ $sub.huggingface.model }} (in-cluster at http://{{ $.Release.Name }}-{{ $alias }}.{{ $.Release.Namespace }}.svc) +{{- else }} + - {{ $alias }}: disabled (toggle with `{{ $alias }}.enabled=true`) +{{- end }} +{{- end }} + +Each enabled backend downloads its model weights from HuggingFace on first +start, which can take a while. 
+ +Release notes: https://github.com/stackhpc/azimuth-llm/releases diff --git a/charts/azimuth-omni-backend/templates/_backend.tpl b/charts/azimuth-omni-backend/templates/_backend.tpl new file mode 100644 index 00000000..78d026f1 --- /dev/null +++ b/charts/azimuth-omni-backend/templates/_backend.tpl @@ -0,0 +1,338 @@ +{{/* +Render a full vLLM backend (Deployment + Service + optional +ingress/httproute/zenith/pdb/servicemonitor) for one modality. + +Usage: + {{- include "azimuth-omni-backend.backend" (dict "root" . "alias" "ttt") }} + +Per-modality config is read from `.Values.<alias>`. +*/}} + +{{/* +Selector labels, distinct per alias so deployments don't select each +other's pods. +*/}} +{{- define "azimuth-omni-backend.backend-selectorLabels" -}} +{{- $alias := .alias -}} +{{- $root := .root -}} +app.kubernetes.io/name: {{ include "azimuth-omni-backend.name" $root }} +app.kubernetes.io/instance: {{ $root.Release.Name }} +app.kubernetes.io/component: backend +azimuth-omni.stackhpc.com/modality: {{ $alias }} +{{- end }} + +{{/* +Common labels for a backend resource. +*/}} +{{- define "azimuth-omni-backend.backend-labels" -}} +helm.sh/chart: {{ include "azimuth-omni-backend.chart" .root }} +{{ include "azimuth-omni-backend.backend-selectorLabels" . }} +{{- if .root.Chart.AppVersion }} +app.kubernetes.io/version: {{ .root.Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .root.Release.Service }} +{{- end }} + +{{/* +Conditional chat-template arg. 
+*/}} +{{- define "azimuth-omni-backend.chatTemplate" -}} +{{- $cfg := .cfg -}} +{{- if $cfg.chatTemplate }} +- --chat-template +- {{ quote $cfg.chatTemplate }} +{{- else if contains "WizardCoder" $cfg.huggingface.model }} +- --chat-template +- {{ quote "{% for message in messages %}{% if message['role'] == 'system' %}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' }}{% endif %}{% if message['role'] == 'assistant' %}{{ '### Response:\n' }}{% endif %}{{ message['content'].strip() }}{% if not loop.last %}{{ '\n\n' }}{% endif %}{% if message['role'] == 'user' and loop.last %}{{ '### Response:\n' }}{% endif %}{% endfor %}" }} +{{- end -}} +{{- end }} + +{{/* +Renders every resource for one modality. +*/}} +{{- define "azimuth-omni-backend.backend" -}} +{{- $root := .root -}} +{{- $alias := .alias -}} +{{- $cfg := index $root.Values $alias -}} +{{- if not $cfg }}{{- fail (printf "azimuth-omni-backend: missing values block for backend %q" $alias) }}{{- end }} +{{- if not $cfg.enabled -}} + +{{- else -}} +{{- $name := printf "%s-%s" $root.Release.Name $alias -}} +{{- $labelArgs := dict "root" $root "alias" $alias -}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $name }} + labels: + {{- include "azimuth-omni-backend.backend-labels" $labelArgs | nindent 4 }} +spec: + replicas: {{ default 1 $cfg.api.replicas }} + selector: + matchLabels: + {{- include "azimuth-omni-backend.backend-selectorLabels" $labelArgs | nindent 6 }} + strategy: + {{- $cfg.api.updateStrategy | toYaml | nindent 4 }} + template: + metadata: + labels: + {{- include "azimuth-omni-backend.backend-selectorLabels" $labelArgs | nindent 8 }} + spec: + containers: + - name: api + {{- if eq ($cfg.api.gpus | int) 0 }} + image: "ghcr.io/stackhpc/vllm-cpu:{{ $cfg.api.image.version }}" + {{- else if $cfg.api.intelXPUsEnabled }} + image: "ghcr.io/stackhpc/vllm-xpu:{{ $cfg.api.image.version }}" + {{- else }} + image: "{{ $cfg.api.image.containerImage }}:{{ $cfg.api.image.version 
}}" + {{- end }} + {{- if eq $cfg.api.image.containerImage "vllm/vllm-omni" }} + command: + - vllm + - serve + {{- end }} + ports: + - name: api + containerPort: 8000 + volumeMounts: + - name: data + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + args: + {{- if eq $cfg.api.image.containerImage "vllm/vllm-omni" }} + - --omni + {{- end }} + {{- if semverCompare "=0-0" + repository: "file://../azimuth-omni-backend/" + condition: backend.enabled diff --git a/charts/azimuth-omni/azimuth-ui.schema.yaml b/charts/azimuth-omni/azimuth-ui.schema.yaml new file mode 100644 index 00000000..85f3fe26 --- /dev/null +++ b/charts/azimuth-omni/azimuth-ui.schema.yaml @@ -0,0 +1,87 @@ +controls: + /backend/ttt/enabled: + type: SwitchControl + /backend/ttt/huggingface/model: + type: TextControl + /backend/ttt/huggingface/token: + type: TextControl + secret: true + /backend/ttt/api/modelMaxContextLength: + type: IntegerControl + minimum: 100 + required: false + + /backend/tts/enabled: + type: SwitchControl + /backend/tts/huggingface/model: + type: TextControl + /backend/tts/huggingface/token: + type: TextControl + secret: true + /backend/tts/api/modelMaxContextLength: + type: IntegerControl + minimum: 100 + required: false + + /backend/tti/enabled: + type: SwitchControl + /backend/tti/huggingface/model: + type: TextControl + /backend/tti/huggingface/token: + type: TextControl + secret: true + /backend/tti/api/modelMaxContextLength: + type: IntegerControl + minimum: 100 + required: false + + # When a backend is in-cluster the omni UI auto-derives model_name from + # the matching huggingface.model in the configmap template; mirror it in + # the UI for visibility. 
+ /ui/appSettings/ttt/model_name: + type: MirrorControl + path: /backend/ttt/huggingface/model + visuallyHidden: true + /ui/appSettings/tts/model_name: + type: MirrorControl + path: /backend/tts/huggingface/model + visuallyHidden: true + /ui/appSettings/tti/model_name: + type: MirrorControl + path: /backend/tti/huggingface/model + visuallyHidden: true + +sortOrder: + - /ui/appSettings/page_title + - /ui/appSettings/page_description + + - /backend/ttt/enabled + - /backend/ttt/huggingface/model + - /backend/ttt/huggingface/token + - /backend/ttt/api/azimuthNodeGroupSelector + - /backend/ttt/api/image/containerImage + - /backend/ttt/api/image/version + - /backend/ttt/api/gpus + - /backend/ttt/api/modelMaxContextLength + - /ui/appSettings/ttt/system_prompt + - /ui/appSettings/ttt/backend_url + + - /backend/tts/enabled + - /backend/tts/huggingface/model + - /backend/tts/huggingface/token + - /backend/tts/api/azimuthNodeGroupSelector + - /backend/tts/api/image/containerImage + - /backend/tts/api/image/version + - /backend/tts/api/gpus + - /backend/tts/api/modelMaxContextLength + - /ui/appSettings/tts/backend_url + + - /backend/tti/enabled + - /backend/tti/huggingface/model + - /backend/tti/huggingface/token + - /backend/tti/api/azimuthNodeGroupSelector + - /backend/tti/api/image/containerImage + - /backend/tti/api/image/version + - /backend/tti/api/gpus + - /backend/tti/api/modelMaxContextLength + - /ui/appSettings/tti/backend_url diff --git a/charts/azimuth-omni/ci/test-values.yaml b/charts/azimuth-omni/ci/test-values.yaml new file mode 100644 index 00000000..ab582e1a --- /dev/null +++ b/charts/azimuth-omni/ci/test-values.yaml @@ -0,0 +1,26 @@ +# CI: only exercise the TTT backend with the smallest possible model. 
+backend: + enabled: true + ttt: + enabled: true + huggingface: + model: HuggingFaceTB/SmolLM2-135M-Instruct + api: + monitoring: + enabled: false + gpus: 0 + tts: + enabled: false + tti: + enabled: false +ui: + service: + zenith: + enabled: false + appSettings: + # model_name is auto-derived from backend.ttt.huggingface.model + ttt: + params: + max_tokens: 32 + temperature: 0.1 + top_p: 0.15 diff --git a/charts/azimuth-omni/templates/NOTES.txt b/charts/azimuth-omni/templates/NOTES.txt new file mode 100644 index 00000000..b4cb98d2 --- /dev/null +++ b/charts/azimuth-omni/templates/NOTES.txt @@ -0,0 +1,23 @@ +Azimuth Omni provides a single multimodal web interface (chat, text-to-speech, text-to-image) backed by one or more vLLM model deployments. + +{{- if .Values.backend.enabled }} +In-cluster backends (from the azimuth-llm-omni-backend subchart) in this release: +{{- range $alias := list "ttt" "tts" "tti" }} +{{- $sub := index $.Values.backend $alias }} +{{- if and $sub $sub.enabled }} + - {{ $alias }}: {{ $sub.huggingface.model }} (in-cluster at http://{{ $.Release.Name }}-{{ $alias }}.{{ $.Release.Namespace }}.svc) +{{- else }} + - {{ $alias }}: disabled (toggle with `backend.{{ $alias }}.enabled=true` or point `ui.appSettings.{{ $alias }}.backend_url` at an external backend) +{{- end }} +{{- end }} + +Each enabled backend downloads its model weights from HuggingFace on first start, which can take a while. +{{- else }} +In-cluster backends are disabled (`backend.enabled=false`). The UI will only +expose tabs for modalities whose `ui.appSettings.<alias>.backend_url` + +`model_name` point at an external backend. +{{- end }} + +If `ui.service.zenith.enabled` is true the omni UI is exposed through Zenith; otherwise enable `ui.ingress` or `ui.httpRoute` to expose it via standard Kubernetes networking. 
+ +Release notes: https://github.com/stackhpc/azimuth-llm/releases diff --git a/charts/azimuth-omni/templates/_helpers.tpl b/charts/azimuth-omni/templates/_helpers.tpl new file mode 100644 index 00000000..be2e1b28 --- /dev/null +++ b/charts/azimuth-omni/templates/_helpers.tpl @@ -0,0 +1,58 @@ +{{/* +Chart name. +*/}} +{{- define "azimuth-omni.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Fully qualified app name. +*/}} +{{- define "azimuth-omni.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Chart label. +*/}} +{{- define "azimuth-omni.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels. +*/}} +{{- define "azimuth-omni.labels" -}} +helm.sh/chart: {{ include "azimuth-omni.chart" . }} +{{ include "azimuth-omni.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +App selector labels. +*/}} +{{- define "azimuth-omni.selectorLabels" -}} +app.kubernetes.io/name: {{ include "azimuth-omni.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +UI component selector labels. +*/}} +{{- define "azimuth-omni.ui-selectorLabels" -}} +{{ include "azimuth-omni.selectorLabels" . 
}} +app.kubernetes.io/component: ui +{{- end }} + diff --git a/charts/azimuth-omni/templates/app-config-map.yml b/charts/azimuth-omni/templates/app-config-map.yml new file mode 100644 index 00000000..0c1155d5 --- /dev/null +++ b/charts/azimuth-omni/templates/app-config-map.yml @@ -0,0 +1,35 @@ +{{- if .Values.ui.enabled -}} +{{/* +Build the omni overrides.yml. For each modality (ttt/tts/tti): if its backend +is enabled, inject backend_url + model_name (user-supplied values win). If the +backend is disabled but the user gave a backend_url + model_name, keep that +block. Otherwise drop the modality so its tab does not appear. +*/}} +{{- $appSettings := deepCopy .Values.ui.appSettings -}} +{{- $backend := default (dict) .Values.backend -}} +{{- $backendEnabled := and $backend (ne $backend.enabled false) -}} +{{- range $alias := list "ttt" "tts" "tti" }} + {{- $cfg := index $backend $alias -}} + {{- $userBlock := default (dict) (index $appSettings $alias) -}} + {{- if and $backendEnabled $cfg $cfg.enabled }} + {{- $defaults := dict + "backend_url" (printf "http://%s-%s.%s.svc" $.Release.Name $alias $.Release.Namespace) + "model_name" $cfg.huggingface.model -}} + {{- $merged := merge $userBlock $defaults -}} + {{- $_ := set $appSettings $alias $merged -}} + {{- else }} + {{- if not (and (hasKey $userBlock "backend_url") (hasKey $userBlock "model_name")) }} + {{- $_ := unset $appSettings $alias -}} + {{- end }} + {{- end }} +{{- end }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-omni-web-app + labels: + {{- include "azimuth-omni.labels" . 
| nindent 4 }} +data: + overrides.yml: | + {{- $appSettings | toYaml | nindent 4 }} +{{- end -}} diff --git a/charts/azimuth-omni/templates/deployment.yml b/charts/azimuth-omni/templates/deployment.yml new file mode 100644 index 00000000..8e9feb12 --- /dev/null +++ b/charts/azimuth-omni/templates/deployment.yml @@ -0,0 +1,62 @@ +{{- if .Values.ui.enabled -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-omni-ui + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + {{- include "azimuth-omni.ui-selectorLabels" . | nindent 6 }} + strategy: + {{- .Values.ui.updateStrategy | toYaml | nindent 4 }} + template: + metadata: + labels: + {{- include "azimuth-omni.ui-selectorLabels" . | nindent 8 }} + # Restart deployment when settings config map changes + # https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments + annotations: + checksum/config: {{ include (print $.Template.BasePath "/app-config-map.yml") . | sha256sum }} + spec: + containers: + - name: omni-ui + {{- with .Values.ui.image }} + image: {{ printf "%s:%s" .repository (default $.Chart.AppVersion .tag) }} + {{- if .imagePullPolicy }} + imagePullPolicy: {{ .imagePullPolicy }} + {{- end -}} + {{- end }} + ports: + - name: ui + containerPort: 7860 + volumeMounts: + - name: app + mountPath: /etc/web-app + env: + - name: PYTHONUNBUFFERED + value: "1" + tty: true # Make stdout from python visible in k8s logs + readinessProbe: + tcpSocket: + port: 7860 + periodSeconds: 5 + volumes: + - name: app + configMap: + name: {{ .Release.Name }}-omni-web-app + {{- with $.Values.ui.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $.Values.ui.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $.Values.ui.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/azimuth-omni/templates/httproute.yml b/charts/azimuth-omni/templates/httproute.yml new file mode 100644 index 00000000..26ac1a1c --- /dev/null +++ b/charts/azimuth-omni/templates/httproute.yml @@ -0,0 +1,29 @@ +{{- if and .Values.ui.enabled .Values.ui.httpRoute.enabled -}} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ printf "%s-omni-ui" .Release.Name }} + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} + {{- with .Values.ui.httpRoute.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- with .Values.ui.httpRoute.parentRefs }} + parentRefs: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.ui.httpRoute.hostnames }} + hostnames: + {{- toYaml . | nindent 4 }} + {{- end }} + rules: + {{- range .Values.ui.httpRoute.rules }} + - {{ with .matches }}matches: + {{- toYaml . | nindent 8 }} + {{ end }}backendRefs: + - name: {{ $.Release.Name }}-omni-ui + port: 80 + {{- end }} +{{- end }} diff --git a/charts/azimuth-omni/templates/ingress.yml b/charts/azimuth-omni/templates/ingress.yml new file mode 100644 index 00000000..62fe9529 --- /dev/null +++ b/charts/azimuth-omni/templates/ingress.yml @@ -0,0 +1,43 @@ +{{- if and .Values.ui.enabled .Values.ui.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ printf "%s-omni-ui" .Release.Name }} + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} + {{- with .Values.ui.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- with .Values.ui.ingress.className }} + ingressClassName: {{ . }} + {{- end }} + {{- if .Values.ui.ingress.tls }} + tls: + {{- range .Values.ui.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ui.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- with .pathType }} + pathType: {{ . }} + {{- end }} + backend: + service: + name: {{ $.Release.Name }}-omni-ui + port: + number: 80 + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/azimuth-omni/templates/pod-disruption-budget.yml b/charts/azimuth-omni/templates/pod-disruption-budget.yml new file mode 100644 index 00000000..59d7b520 --- /dev/null +++ b/charts/azimuth-omni/templates/pod-disruption-budget.yml @@ -0,0 +1,18 @@ +{{- if and .Values.ui.enabled .Values.ui.pdb.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-omni-ui + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} +spec: + {{- with .Values.ui.pdb.minAvailable }} + minAvailable: {{ . }} + {{- end }} + {{- with .Values.ui.pdb.maxUnavailable }} + maxUnavailable: {{ . }} + {{- end }} + selector: + matchLabels: + {{- include "azimuth-omni.ui-selectorLabels" . | nindent 6 }} +{{- end }} diff --git a/charts/azimuth-omni/templates/service.yml b/charts/azimuth-omni/templates/service.yml new file mode 100644 index 00000000..4ff01615 --- /dev/null +++ b/charts/azimuth-omni/templates/service.yml @@ -0,0 +1,17 @@ +{{- if .Values.ui.enabled -}} +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-omni-ui + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} +spec: + ports: + - name: ui + port: 80 + protocol: TCP + targetPort: ui + type: {{ .Values.ui.service.type }} + selector: + {{- include "azimuth-omni.ui-selectorLabels" . 
| nindent 4 }} +{{- end -}} diff --git a/charts/azimuth-omni/templates/ui-zenith-client.yml b/charts/azimuth-omni/templates/ui-zenith-client.yml new file mode 100644 index 00000000..0f8acca7 --- /dev/null +++ b/charts/azimuth-omni/templates/ui-zenith-client.yml @@ -0,0 +1,16 @@ +{{- if .Values.ui.enabled -}} +{{- if .Values.ui.service.zenith.enabled -}} +apiVersion: zenith.stackhpc.com/v1alpha1 +kind: Client +metadata: + name: {{ .Release.Name }}-omni-ui + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} +spec: + reservationName: {{ .Release.Name }}-omni-ui + upstream: + serviceName: {{ .Release.Name }}-omni-ui + auth: + skip: {{ .Values.ui.service.zenith.skipAuth }} +{{- end -}} +{{- end -}} diff --git a/charts/azimuth-omni/templates/ui-zenith-reservation.yml b/charts/azimuth-omni/templates/ui-zenith-reservation.yml new file mode 100644 index 00000000..1ff7e2b3 --- /dev/null +++ b/charts/azimuth-omni/templates/ui-zenith-reservation.yml @@ -0,0 +1,18 @@ +{{- if .Values.ui.enabled -}} +{{- if .Values.ui.service.zenith.enabled -}} +apiVersion: zenith.stackhpc.com/v1alpha1 +kind: Reservation +metadata: + name: {{ .Release.Name }}-omni-ui + labels: + {{- include "azimuth-omni.labels" . | nindent 4 }} + annotations: + azimuth.stackhpc.com/service-label: {{ quote .Values.ui.service.zenith.label }} + azimuth.stackhpc.com/service-icon-url: {{ .Values.ui.service.zenith.iconUrl }} + {{- with .Values.ui.service.zenith.description }} + azimuth.stackhpc.com/service-description: {{ quote . 
}} + {{- end }} +spec: + credentialSecretName: {{ .Release.Name }}-omni-ui-zenith-credential +{{- end -}} +{{- end -}} diff --git a/charts/azimuth-omni/values.schema.json b/charts/azimuth-omni/values.schema.json new file mode 100644 index 00000000..8d15bb77 --- /dev/null +++ b/charts/azimuth-omni/values.schema.json @@ -0,0 +1,174 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "definitions": { + "backend": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "title": "Deploy in-cluster backend", + "description": "If true, a vLLM Deployment + Service is created to serve this modality. Disable to omit the tab or to point the omni UI at an external (off-cluster) backend.", + "default": false + }, + "huggingface": { + "type": "object", + "properties": { + "model": { + "type": "string", + "title": "Model", + "description": "The [HuggingFace model](https://huggingface.co/models) to deploy for this modality." + }, + "token": { + "type": ["string", "null"], + "title": "Access Token", + "description": "A HuggingFace [access token](https://huggingface.co/docs/hub/security-tokens). Required for [gated models](https://huggingface.co/docs/hub/en/models-gated)." + } + } + }, + "api": { + "type": "object", + "properties": { + "image": { + "type": "object", + "properties": { + "containerImage": { + "type": "string", + "title": "vLLM Container Image", + "description": "Container to use as API backend. 'vllm/vllm-openai' for text/vision/image models, 'vllm/vllm-omni' for TTS/multimodal-omni models.", + "default": "vllm/vllm-openai", + "enum": [ + "vllm/vllm-openai", + "vllm/vllm-omni" + ] + }, + "version": { + "type": "string", + "title": "Backend vLLM version", + "description": "Tag from https://github.com/vllm-project/vllm/tags (or vllm-omni/tags).", + "default": "v0.11.0" + } + } + }, + "gpus": { + "type": "integer", + "title": "GPUs", + "description": "Number of GPUs to request per backend pod. 
Set to 0 to fall back to a CPU vLLM image (testing only).", + "minimum": 0, + "default": 1 + }, + "modelMaxContextLength": { + "title": "Model Context Length", + "description": "Override the maximum context length, if the model's default is unsuitable." + }, + "azimuthNodeGroupSelector": { + "type": "string", + "title": "Node Group", + "description": "(Optional) Require that the model runs on a particular node group." + } + } + } + } + } + }, + "properties": { + "backend": { + "type": "object", + "title": "In-cluster vLLM backends", + "description": "Settings forwarded to the azimuth-llm-omni-backend subchart.", + "properties": { + "enabled": { + "type": "boolean", + "title": "Deploy in-cluster backends", + "description": "If false, no in-cluster backends are deployed and the UI must target external backends.", + "default": true + }, + "ttt": { + "title": "Text-to-Text backend (chat)", + "allOf": [{ "$ref": "#/definitions/backend" }] + }, + "tts": { + "title": "Text-to-Speech backend", + "allOf": [{ "$ref": "#/definitions/backend" }] + }, + "tti": { + "title": "Text-to-Image backend", + "allOf": [{ "$ref": "#/definitions/backend" }] + } + } + }, + "ui": { + "type": "object", + "properties": { + "appSettings": { + "type": "object", + "properties": { + "page_title": { + "type": "string", + "title": "Page Title", + "description": "The title shown at the top of the omni interface.", + "default": "Omni Interface" + }, + "page_description": { + "type": "string", + "title": "Page Description", + "description": "Subtitle markdown shown under the page title.", + "default": "A unified interface for multimodal AI." + }, + "ttt": { + "type": "object", + "title": "Chat (TTT) UI settings", + "properties": { + "model_name": { + "type": "string", + "title": "Model name", + "description": "Model identifier sent to the chat backend. Mirrors backend.ttt.huggingface.model when ttt is in-cluster." 
+ }, + "system_prompt": { + "type": "string", + "title": "System prompt", + "description": "Initial system message. `{date}` is substituted at request time." + }, + "backend_url": { + "type": "string", + "title": "External backend URL", + "description": "Only needed when backend.ttt.enabled is false. OpenAI-compatible base URL without /v1." + } + } + }, + "tts": { + "type": "object", + "title": "TTS UI settings", + "properties": { + "model_name": { + "type": "string", + "title": "Model name" + }, + "backend_url": { + "type": "string", + "title": "External backend URL", + "description": "Only needed when backend.tts.enabled is false." + } + } + }, + "tti": { + "type": "object", + "title": "Image (TTI) UI settings", + "properties": { + "model_name": { + "type": "string", + "title": "Model name" + }, + "backend_url": { + "type": "string", + "title": "External backend URL", + "description": "Only needed when backend.tti.enabled is false." + } + } + } + } + } + } + } + } +} diff --git a/charts/azimuth-omni/values.yaml b/charts/azimuth-omni/values.yaml new file mode 100644 index 00000000..6097038f --- /dev/null +++ b/charts/azimuth-omni/values.yaml @@ -0,0 +1,108 @@ +# In-cluster vLLM backends, provided by the azimuth-llm-omni-backend subchart +# aliased here under `backend`. Values set here pass through to that subchart +# (see charts/azimuth-omni-backend/values.yaml for per-modality options). +# Set backend.enabled: false to use external backends via ui.appSettings.<alias>. +backend: + enabled: true + # Text-to-text backend (chat / multimodal understanding). + ttt: + enabled: true + # Text-to-speech backend. + tts: + enabled: false + # Text-to-image backend. + tti: + enabled: false + +# Omni UI - the web app that talks to the enabled backends above (or to +# external URLs you provide). 
+ui: + enabled: true + image: + repository: ghcr.io/stackhpc/azimuth-llm-omni-ui + # Defaults to chart's appVersion + tag: + imagePullPolicy: + + # Settings written to /etc/web-app/overrides.yml inside the UI container. + # Format matches web-apps/omni/defaults.yml. For each enabled backend the + # chart fills in backend_url and model_name, so you only need to set them + # when pointing at an external backend. + appSettings: + page_title: Omni Interface + page_description: A unified interface for multimodal AI. + # Use local system fonts by default to avoid GDPR issues with Gradio's + # default fonts (which require fetching from the Google fonts API). + theme_params: + font: + - sans-serif + - Arial + font_mono: + - sans-serif + - Arial + + # Per-backend UI/inference config. To point a tab at an out-of-cluster + # backend, disable backend.<alias>.enabled and set backend_url + model_name + # here. Other keys (system_prompt, params, ui, ...) are forwarded as-is. + ttt: + system_prompt: "You are a helpful AI assistant. Today's date is {date}." + params: + max_tokens: 1024 + temperature: 0.7 + top_p: 0.9 + tts: + params: + voice: casual_male + response_format: wav + # ui: + # voice_choices: ["casual_male", "casual_female"] + # format_choices: ["wav", "mp3", "ogg", "aac", "flac"] + tti: + params: + size: 1024x1024 + ui: + size_choices: ["1024x1024", "768x1024", "1024x768"] + show_negative_prompt: true + + service: + type: ClusterIP + zenith: + enabled: true + skipAuth: false + label: Omni Interface + iconUrl: https://raw.githubusercontent.com/gradio-app/gradio/5524e590577769b0444a5332b8d444aafb0c5c12/js/app/public/static/img/logo.svg + description: | + A unified multimodal web interface (chat / text-to-speech / text-to-image). 
+ ingress: + enabled: false + className: "" + annotations: {} + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + httpRoute: + enabled: false + annotations: {} + parentRefs: + - name: my-gateway + namespace: default + sectionName: https + hostnames: + - chart-example.local + rules: + - matches: + - path: + type: PathPrefix + value: / + updateStrategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + nodeSelector: {} + tolerations: [] + affinity: {} + pdb: + enabled: false diff --git a/scripts/perf-test/stress.py b/scripts/perf-test/stress.py index 6af6426f..425f132e 100644 --- a/scripts/perf-test/stress.py +++ b/scripts/perf-test/stress.py @@ -16,11 +16,12 @@ prompts = [ "Hi, how are you?", "What's the weather like with you?", - "Who's the best footballer of all time?" + "Who's the best footballer of all time?", ] client_count = 3 -request_count = 5 # Requests per client +request_count = 5 # Requests per client + def make_requests(client_id: int): client = Client(url) @@ -32,7 +33,12 @@ def make_requests(client_id: int): timings.append(time.time() - start_time) return timings -results = list(Parallel(n_jobs=client_count)(delayed(make_requests)(i) for i in range(1, client_count+1))) + +results = list( + Parallel(n_jobs=client_count)( + delayed(make_requests)(i) for i in range(1, client_count + 1) + ) +) all_timings = [] for client_timings in results: all_timings += client_timings diff --git a/web-apps/chat/Dockerfile b/web-apps/chat/Dockerfile index 5b7be0c8..abb137dd 100644 --- a/web-apps/chat/Dockerfile +++ b/web-apps/chat/Dockerfile @@ -1,6 +1,11 @@ -FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim - -RUN apt-get update && apt-get install -y libssl3=3.0.19-1~deb12u2 openssl=3.0.19-1~deb12u2 && rm -rf /var/lib/apt/lists/* +FROM ghcr.io/astral-sh/uv:python3.11-trixie-slim + +RUN apt-get update && apt-get install -y \ + libssl3t64=3.5.6-1~deb13u2 \ + openssl=3.5.6-1~deb13u2 \ + 
libsqlite3-0=3.46.1-7+deb13u1 \ + libgnutls30t64=3.8.9-3+deb13u4 \ + && rm -rf /var/lib/apt/lists/* ARG DIR=chat diff --git a/web-apps/chat/app.py b/web-apps/chat/app.py index 82fff79b..1c84ded2 100644 --- a/web-apps/chat/app.py +++ b/web-apps/chat/app.py @@ -62,6 +62,7 @@ class PossibleSystemPromptException(Exception): streaming=True, ) + def inference(latest_message, history): # Allow mutating global variable global BACKEND_INITIALISED @@ -69,16 +70,18 @@ def inference(latest_message, history): try: context = [] - model_instruction = settings.model_instruction.replace("{date}", f"{date.today()}") + model_instruction = settings.model_instruction.replace( + "{date}", f"{date.today()}" + ) if INCLUDE_SYSTEM_PROMPT: context.append(SystemMessage(content=model_instruction)) elif history and len(history) > 0: # Mimic system prompt by prepending it to first human message - history[0]['content'] = f"{model_instruction}\n\n{history[0]['content']}" + history[0]["content"] = f"{model_instruction}\n\n{history[0]['content']}" for message in history: - role = message['role'] - content = message['content'] + role = message["role"] + content = message["content"] if role == "user": context.append(HumanMessage(content=content)) else: @@ -102,10 +105,10 @@ def inference(latest_message, history): # The "think" tags mark the chatbot's reasoning. Remove the content # and replace with "Thinking..." until the closing tag is found. content = chunk.content - if '' in content or thinking: + if "" in content or thinking: thinking = True response = "Thinking..." 
- if '' in content: + if "" in content: thinking = False response = "" else: @@ -175,7 +178,7 @@ def inference_wrapper(*args): js=settings.custom_javascript, title=settings.page_title, ) as demo: - gr.Markdown('# ' + settings.page_title) + gr.Markdown("# " + settings.page_title) gr.ChatInterface( inference_wrapper, type="messages", @@ -187,10 +190,10 @@ def inference_wrapper(*args): sanitize_html=True, autoscroll=False, latex_delimiters=[ - {"left": "$$", "right": "$$", "display": True }, - {"left": "$", "right": "$", "display": False } - ], - ), + {"left": "$$", "right": "$$", "display": True}, + {"left": "$", "right": "$", "display": False}, + ], + ), ) diff --git a/web-apps/chat/gradio-client-test.py b/web-apps/chat/gradio-client-test.py index 723852dc..1943da6b 100644 --- a/web-apps/chat/gradio-client-test.py +++ b/web-apps/chat/gradio-client-test.py @@ -6,7 +6,7 @@ gradio_host = sys.argv[1] retries = 60 -for n in range(1, retries+1): +for n in range(1, retries + 1): try: client = Client(gradio_host) result = client.predict("Hi", api_name="/chat") diff --git a/web-apps/chat/requirements.txt b/web-apps/chat/requirements.txt index a448a54f..013d67a8 100644 --- a/web-apps/chat/requirements.txt +++ b/web-apps/chat/requirements.txt @@ -1,7 +1,7 @@ gradio<6 gradio_client openai -langchain<1.0 +langchain>=0.3,<1.0 langchain_openai pydantic structlog diff --git a/web-apps/chat/test.py b/web-apps/chat/test.py index 29bb5738..05e1a480 100644 --- a/web-apps/chat/test.py +++ b/web-apps/chat/test.py @@ -7,6 +7,7 @@ url = os.environ.get("GRADIO_URL", "http://localhost:7860") client = Client(url) + class TestSuite(unittest.TestCase): def test_gradio_api(self): @@ -19,5 +20,6 @@ def test_gradio_api(self): # # mock_response.assert_called_once_with("Hi", []) # self.assertEqual(result, "Mocked") + if __name__ == "__main__": unittest.main() diff --git a/web-apps/flux-image-gen/Dockerfile b/web-apps/flux-image-gen/Dockerfile index c530f4c0..a3a703e8 100644 --- 
a/web-apps/flux-image-gen/Dockerfile +++ b/web-apps/flux-image-gen/Dockerfile @@ -1,13 +1,50 @@ FROM ghcr.io/astral-sh/uv:python3.11-trixie -# https://stackoverflow.com/questions/55313610/importerror-libgl-so-1-cannot-open-shared-object-file-no-such-file-or-directo +ARG IMAGEMAGICK_VERSION=8:7.1.1.43+dfsg1-1+deb13u9 +ARG LIBUNBOUND_VERSION=1.22.0-2+deb13u3 +ARG KRB5_VERSION=1.21.3-5+deb13u1 +ARG LIBGCRYPT_VERSION=1.11.0-7+deb13u1 RUN apt-get update && \ - apt-get install -y ffmpeg libsm6 libxext6 && \ + apt-get install -y --no-install-recommends \ + ffmpeg \ + libsm6 \ + libxext6 \ + "imagemagick=${IMAGEMAGICK_VERSION}" \ + "imagemagick-7-common=${IMAGEMAGICK_VERSION}" \ + "imagemagick-7.q16=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-7-arch-config=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-7-headers=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-7.q16-10=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-7.q16-10-extra=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-7.q16-dev=${IMAGEMAGICK_VERSION}" \ + "libmagickcore-dev=${IMAGEMAGICK_VERSION}" \ + "libmagickwand-7-headers=${IMAGEMAGICK_VERSION}" \ + "libmagickwand-7.q16-10=${IMAGEMAGICK_VERSION}" \ + "libmagickwand-7.q16-dev=${IMAGEMAGICK_VERSION}" \ + "libmagickwand-dev=${IMAGEMAGICK_VERSION}" \ + "libunbound8=${LIBUNBOUND_VERSION}" \ + "krb5-multidev=${KRB5_VERSION}" \ + "libgssapi-krb5-2=${KRB5_VERSION}" \ + "libgssrpc4t64=${KRB5_VERSION}" \ + "libk5crypto3=${KRB5_VERSION}" \ + "libkadm5clnt-mit12=${KRB5_VERSION}" \ + "libkadm5srv-mit12=${KRB5_VERSION}" \ + "libkdb5-10t64=${KRB5_VERSION}" \ + "libkrb5-3=${KRB5_VERSION}" \ + "libkrb5-dev=${KRB5_VERSION}" \ + "libkrb5support0=${KRB5_VERSION}" \ + "libgcrypt20=${LIBGCRYPT_VERSION}" && \ + apt-get clean && \ rm -rf /var/lib/apt/lists/* ARG DIR=flux-image-gen +RUN uv pip install --system --no-cache-dir --upgrade \ + pip \ + setuptools \ + wheel + COPY $DIR/requirements.txt requirements.txt RUN uv pip install --system --no-cache-dir -r requirements.txt diff --git 
a/web-apps/flux-image-gen/api_server.py b/web-apps/flux-image-gen/api_server.py index 1d89e3f8..7ba8880c 100644 --- a/web-apps/flux-image-gen/api_server.py +++ b/web-apps/flux-image-gen/api_server.py @@ -31,10 +31,12 @@ class ImageGenInput(BaseModel): prompt: str add_sampling_metadata: bool + @app.get("/") def health_check(): return "Server is running" + @app.get("/model") async def get_model(): return {"model": model} @@ -61,7 +63,9 @@ async def generate_image(input: ImageGenInput): add_sampling_metadata=input.add_sampling_metadata, ) if not image: - return JSONResponse({"error": {"message": msg, "seed": seed}}, status_code=400) + return JSONResponse( + {"error": {"message": msg, "seed": seed}}, status_code=400 + ) # Convert image to bytes response buffer = io.BytesIO() image.save(buffer, format="jpeg") diff --git a/web-apps/flux-image-gen/gradio_ui.py b/web-apps/flux-image-gen/gradio_ui.py index 658b5e4b..1cd84ef6 100644 --- a/web-apps/flux-image-gen/gradio_ui.py +++ b/web-apps/flux-image-gen/gradio_ui.py @@ -16,13 +16,13 @@ class Model(BaseModel): name: str address: HttpUrl + class AppSettings(BaseModel): models: List[Model] example_prompt: str = "Yoda riding a skateboard." 
title: str = "Flux Image Generation Demo" - settings_path = pathlib.Path("/etc/gradio-app/gradio_config.yaml") if not settings_path.exists(): print("No settings overrides found at", settings_path) @@ -38,7 +38,14 @@ class AppSettings(BaseModel): # Disable analytics for GDPR compliance os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" -def save_image(model_name: str, prompt: str, seed: int, add_sampling_metadata: bool, image: Image.Image): + +def save_image( + model_name: str, + prompt: str, + seed: int, + add_sampling_metadata: bool, + image: Image.Image, +): filename = f"output/gradio/{uuid.uuid4()}.jpg" os.makedirs(os.path.dirname(filename), exist_ok=True) exif_data = Image.Exif() @@ -95,22 +102,43 @@ async def generate_image( return image, seed, filename, None + with gr.Blocks(title=settings.title) as demo: gr.Markdown(f"# {settings.title}") with gr.Row(): with gr.Column(): - model = gr.Dropdown(MODEL_NAMES, value=MODEL_NAMES[0], label="Model", interactive=len(MODEL_NAMES) > 1) + model = gr.Dropdown( + MODEL_NAMES, + value=MODEL_NAMES[0], + label="Model", + interactive=len(MODEL_NAMES) > 1, + ) prompt = gr.Textbox(label="Prompt", value=settings.example_prompt) with gr.Accordion("Advanced Options", open=False): # TODO: Make min/max slide values configurable width = gr.Slider(128, 8192, 1360, step=16, label="Width") height = gr.Slider(128, 8192, 768, step=16, label="Height") - num_steps = gr.Slider(1, 50, 4 if model.value == "flux-schnell" else 50, step=1, label="Number of steps") - guidance = gr.Slider(1.0, 10.0, 3.5, step=0.1, label="Guidance", interactive=not model.value == "flux-schnell") + num_steps = gr.Slider( + 1, + 50, + 4 if model.value == "flux-schnell" else 50, + step=1, + label="Number of steps", + ) + guidance = gr.Slider( + 1.0, + 10.0, + 3.5, + step=0.1, + label="Guidance", + interactive=not model.value == "flux-schnell", + ) seed = gr.Textbox("-1", label="Seed (-1 for random)") - add_sampling_metadata = gr.Checkbox(label="Add sampling parameters to 
metadata?", value=True) + add_sampling_metadata = gr.Checkbox( + label="Add sampling parameters to metadata?", value=True + ) generate_btn = gr.Button("Generate") @@ -122,7 +150,16 @@ async def generate_image( generate_btn.click( fn=generate_image, - inputs=[model, width, height, num_steps, guidance, seed, prompt, add_sampling_metadata], + inputs=[ + model, + width, + height, + num_steps, + guidance, + seed, + prompt, + add_sampling_metadata, + ], outputs=[output_image, seed_output, download_btn, warning_text], ) demo.launch(enable_monitoring=False) diff --git a/web-apps/flux-image-gen/image_gen.py b/web-apps/flux-image-gen/image_gen.py index 28585d1b..d0cffe37 100644 --- a/web-apps/flux-image-gen/image_gen.py +++ b/web-apps/flux-image-gen/image_gen.py @@ -16,14 +16,18 @@ NSFW_THRESHOLD = 0.85 + def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool): t5 = load_t5(device, max_length=256 if is_schnell else 512) clip = load_clip(device) model = load_flow_model(name, device="cpu" if offload else device) ae = load_ae(name, device="cpu" if offload else device) - nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device) + nsfw_classifier = pipeline( + "image-classification", model="Falconsai/nsfw_image_detection", device=device + ) return model, ae, t5, clip, nsfw_classifier + class FluxGenerator: def __init__(self, model_name: str, device: str, offload: bool): self.device = torch.device(device) @@ -69,10 +73,14 @@ def generate_image( if init_image is not None: if isinstance(init_image, np.ndarray): - init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 255.0 + init_image = ( + torch.from_numpy(init_image).permute(2, 0, 1).float() / 255.0 + ) init_image = init_image.unsqueeze(0) init_image = init_image.to(self.device) - init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width)) + init_image = torch.nn.functional.interpolate( + init_image, (opts.height, 
opts.width) + ) if self.offload: self.ae.encoder.to(self.device) init_image = self.ae.encode(init_image.to()) @@ -137,7 +145,7 @@ def generate_image( x = rearrange(x[0], "c h w -> h w c") img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy()) - nsfw_score = [x["score"] for x in self.nsfw_classifier(img) if x["label"] == "nsfw"][0] # type: ignore + nsfw_score = [x["score"] for x in self.nsfw_classifier(img) if x["label"] == "nsfw"][0] # type: ignore if nsfw_score < NSFW_THRESHOLD: exif_data = Image.Exif() @@ -152,4 +160,8 @@ def generate_image( return img, str(opts.seed), None else: - return None, str(opts.seed), "Your generated image may contain NSFW content." + return ( + None, + str(opts.seed), + "Your generated image may contain NSFW content.", + ) diff --git a/web-apps/flux-image-gen/requirements.txt b/web-apps/flux-image-gen/requirements.txt index 069c65c0..d29104a2 100644 --- a/web-apps/flux-image-gen/requirements.txt +++ b/web-apps/flux-image-gen/requirements.txt @@ -1,4 +1,6 @@ flux[gradio] @ git+https://github.com/black-forest-labs/flux@478338d fastapi[standard] httpx +urllib3>=2.7.0 +idna>=3.15 # ../utils diff --git a/web-apps/flux-image-gen/test_client.py b/web-apps/flux-image-gen/test_client.py index 21d8a693..7e4b9738 100644 --- a/web-apps/flux-image-gen/test_client.py +++ b/web-apps/flux-image-gen/test_client.py @@ -5,16 +5,16 @@ model = os.environ.get("FLUX_MODEL", "flux-schnell") client = Client(address) web_page, seed, file_name, err = client.predict( - model_name=model, - # width=1360, - width=3888, - # height=768, - height=2544, - num_steps=4, - guidance=3.5, - seed="-1", - prompt="Yoda riding a skateboard", - add_sampling_metadata=True, - api_name="/generate_image" + model_name=model, + # width=1360, + width=3888, + # height=768, + height=2544, + num_steps=4, + guidance=3.5, + seed="-1", + prompt="Yoda riding a skateboard", + add_sampling_metadata=True, + api_name="/generate_image", ) -print('Result saved to:', file_name) +print("Result 
saved to:", file_name) diff --git a/web-apps/image-analysis/Dockerfile b/web-apps/image-analysis/Dockerfile index a5d6d169..78ff66f7 100644 --- a/web-apps/image-analysis/Dockerfile +++ b/web-apps/image-analysis/Dockerfile @@ -1,8 +1,11 @@ -FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim - -RUN apt-get update && \ - apt-get install -y --only-upgrade libssl3=3.0.19-1~deb12u2 openssl=3.0.19-1~deb12u2 && \ - rm -rf /var/lib/apt/lists/* +FROM ghcr.io/astral-sh/uv:python3.11-trixie-slim + +RUN apt-get update && apt-get install -y \ + libssl3t64=3.5.6-1~deb13u2 \ + openssl=3.5.6-1~deb13u2 \ + libsqlite3-0=3.46.1-7+deb13u1 \ + libgnutls30t64=3.8.9-3+deb13u4 \ + && rm -rf /var/lib/apt/lists/* ARG DIR=image-analysis diff --git a/web-apps/omni/Dockerfile b/web-apps/omni/Dockerfile new file mode 100644 index 00000000..fb07391c --- /dev/null +++ b/web-apps/omni/Dockerfile @@ -0,0 +1,26 @@ +FROM ghcr.io/astral-sh/uv:python3.11-trixie-slim + +RUN apt-get update && apt-get install -y \ + libssl3t64=3.5.6-1~deb13u2 \ + openssl=3.5.6-1~deb13u2 \ + libsqlite3-0=3.46.1-7+deb13u1 \ + libgnutls30t64=3.8.9-3+deb13u4 \ + && rm -rf /var/lib/apt/lists/* + +ARG DIR=omni + +COPY $DIR/requirements.txt requirements.txt +RUN sed -i s$../utils$./utils$ requirements.txt +COPY utils utils +RUN uv pip install --system --no-cache-dir -r requirements.txt + +COPY purge-google-fonts.sh purge-google-fonts.sh +RUN bash purge-google-fonts.sh + +WORKDIR /app + +COPY $DIR/*.py . + +COPY $DIR/defaults.yml . 
+ +ENTRYPOINT ["python3", "app.py"] diff --git a/web-apps/omni/app.py b/web-apps/omni/app.py new file mode 100644 index 00000000..fa035f72 --- /dev/null +++ b/web-apps/omni/app.py @@ -0,0 +1,602 @@ +"""Azimuth Omni: ttt-first UI with optional tts/tti backends.""" + +import base64 +import gradio as gr +import httpx +import io +import tempfile +import threading +import time +import utils + +from datetime import date +from openai import OpenAI +from pathlib import Path +from PIL import Image +from pydantic import BaseModel, ConfigDict +from scipy.io import wavfile +from typing import Dict, List, Any, Optional +from urllib.parse import urljoin + +log = utils.get_logger() +log.info(f"Gradio version: {gr.__version__}") + + +# Param classes hold the known UI defaults. Extra keys in a backend's params +# are kept and forwarded to the backend via the OpenAI SDK extra_body. + + +class ChatParams(BaseModel): + max_tokens: int = 1024 + temperature: float = 0.7 + top_p: float = 0.9 + model_config = ConfigDict(extra="allow") + + +class TTSParams(BaseModel): + voice: str = "casual_male" + response_format: str = "wav" + model_config = ConfigDict(extra="allow") + + +class ImageGenParams(BaseModel): + size: str = "1024x1024" + style: Optional[str] = None + quality: Optional[str] = None + model_config = ConfigDict(extra="allow") + + +# Per-backend UI config: dropdown choices and slider ranges. All optional. 
+
+
+class ChatUI(BaseModel):
+    """UI options for the chat tab; no known fields yet, extra keys are kept."""
+
+    model_config = ConfigDict(extra="allow")
+
+
+class TTSUI(BaseModel):
+    """UI options for the text-to-speech tab."""
+
+    # When unset, the voice is entered through a free-text box instead.
+    voice_choices: Optional[List[str]] = None
+    format_choices: List[str] = ["wav", "mp3", "ogg", "aac", "flac"]
+    model_config = ConfigDict(extra="allow")
+
+
+class ImageUI(BaseModel):
+    """UI options for the image-generation tab."""
+
+    size_choices: List[str] = [
+        "1024x1024",
+        "1024x1792",
+        "1792x1024",
+        "512x512",
+        "256x256",
+    ]
+    # The style/quality dropdowns are only shown when choices are configured.
+    style_choices: Optional[List[str]] = None
+    quality_choices: Optional[List[str]] = None
+    show_negative_prompt: bool = True
+    model_config = ConfigDict(extra="allow")
+
+
+class BackendConfig(BaseModel):
+    """Connection and request settings for one backend (ttt, tts or tti)."""
+
+    backend_url: str
+    model_name: str
+    system_prompt: Optional[str] = None
+    params: Dict[str, Any] = {}
+    ui: Dict[str, Any] = {}
+    # OpenAI client request timeout (seconds). Falls back to AppConfig.
+    request_timeout_s: Optional[float] = None
+    model_config = ConfigDict(extra="allow")
+
+
+class AppConfig(BaseModel):
+    """App-wide tunables: health probing, concurrency and request timeout."""
+
+    probe_timeout_s: float = 2.0
+    probe_interval_s: float = 10.0
+    concurrency_limit: int = 5
+    # Default OpenAI client timeout (seconds) for inference requests.
+    request_timeout_s: float = 1800.0
+    model_config = ConfigDict(extra="allow")
+
+
+class AppSettings(BaseModel):
+    """Top-level settings; a backend left as None is not shown in the UI."""
+
+    host_address: str = "0.0.0.0"
+    page_title: str = "Omni Interface"
+    page_description: str = ""
+    app: AppConfig = AppConfig()
+    ttt: Optional[BackendConfig] = None
+    tts: Optional[BackendConfig] = None
+    tti: Optional[BackendConfig] = None
+    theme_params: Dict[str, Any] = {}
+    theme_params_extended: Dict[str, Any] = {}
+    css_overrides: Optional[str] = None
+    custom_javascript: Optional[str] = None
+    model_config = ConfigDict(protected_namespaces=(), extra="allow")
+
+
+def load_settings() -> dict:
+    """Merge defaults.yml with overrides (k8s mount path, then local fallback).
+
+    Only the first overrides file that exists is used. The merge is shallow:
+    a top-level key in the overrides (e.g. ``app`` or ``tts``) replaces the
+    whole default value for that key; nested mappings are not combined.
+
+    Returns:
+        The merged settings mapping, ready to be passed to ``AppSettings``.
+    """
+    # NOTE(review): presumably utils.load_yaml returns None for an empty
+    # document (as yaml.safe_load does) - confirm. `or {}` keeps an empty
+    # overrides file from raising TypeError in the merge below.
+    defaults = utils.load_yaml("./defaults.yml") or {}
+    for candidate in ("/etc/web-app/overrides.yml", "./overrides.yml"):
+        if Path(candidate).exists():
+            overrides = utils.load_yaml(candidate) or {}
+            return {**defaults, **overrides}
+    return defaults
+
+
+settings = AppSettings(**load_settings())
+
+
+BACKEND_NAMES = ("ttt", "tts", "tti")
+
+# One OpenAI client and one health-probe URL per configured backend.
+clients: Dict[str, OpenAI] = {}
+probe_urls: Dict[str, str] = {}
+for name in BACKEND_NAMES:
+    cfg: Optional[BackendConfig] = getattr(settings, name)
+    if cfg is None:
+        continue
+    # Exactly one trailing slash so urljoin appends instead of replacing the
+    # last path segment of backend_url.
+    base = cfg.backend_url.rstrip("/") + "/"
+    timeout_s = (
+        cfg.request_timeout_s
+        if cfg.request_timeout_s is not None
+        else settings.app.request_timeout_s
+    )
+    clients[name] = OpenAI(
+        base_url=urljoin(base, "v1"),
+        api_key="not-needed",
+        timeout=timeout_s,
+    )
+    probe_urls[name] = urljoin(base, "v1/models")
+    log.info(f" {name}: client request_timeout_s={timeout_s}")
+
+enabled = list(clients.keys())
+log.info(f"Enabled backends: {enabled}")
+if not enabled:
+    raise RuntimeError(
+        f"No backends configured. Set at least one of: {', '.join(BACKEND_NAMES)}"
+    )
+
+
+# A background thread probes GET /v1/models for every backend and writes the
+# results to `health`. UI refresh and inference guards read from it.
+
+PROBE_TIMEOUT_S = settings.app.probe_timeout_s
+PROBE_INTERVAL_S = settings.app.probe_interval_s
+
+_probe_http = httpx.Client(timeout=PROBE_TIMEOUT_S)
+health: Dict[str, bool] = {name: False for name in enabled}
+
+
+def _probe_once() -> None:
+    """Probe every enabled backend once and record the result in `health`."""
+    for name in enabled:
+        try:
+            health[name] = _probe_http.get(probe_urls[name]).is_success
+        except Exception as e:
+            # Deliberately broad: httpx.InvalidURL (malformed backend_url) is
+            # not an httpx.HTTPError, and any uncaught exception here would
+            # either abort start-up (priming call below) or silently end the
+            # probe thread and leave `health` stale.
+            health[name] = False
+            log.debug(f"Health probe failed for {name}: {e}")
+
+
+def _probe_loop() -> None:
+    """Re-probe all backends forever, sleeping PROBE_INTERVAL_S in between."""
+    while True:
+        time.sleep(PROBE_INTERVAL_S)
+        _probe_once()
+
+
+# Prime synchronously so the first page load sees real values.
+_probe_once()
+threading.Thread(target=_probe_loop, name="health-probe", daemon=True).start()
+
+
+def _status_markdown(name: str) -> str:
+    """Return the status banner markdown for backend `name` from `health`."""
+    url = getattr(settings, name).backend_url
+    if health[name]:
+        return f"**Status:** reachable - `{url}`"
+    return (
+        f"**Status:** unreachable - `{url}` "
+        f"(retrying every {int(PROBE_INTERVAL_S)}s; inputs disabled)"
+    )
+
+
+def file_to_base64(file_path: str) -> tuple[str, str]:
+    """Convert a file to base64, return (data_uri, mime_type).
+
+    The MIME type is chosen from the file suffix; unknown suffixes map to
+    ``application/octet-stream``.
+    """
+    path = Path(file_path)
+    suffix = path.suffix.lower()
+    mime_types = {
+        ".png": "image/png",
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".gif": "image/gif",
+        ".webp": "image/webp",
+        ".bmp": "image/bmp",
+        ".wav": "audio/wav",
+        ".mp3": "audio/mpeg",
+        ".ogg": "audio/ogg",
+        ".flac": "audio/flac",
+        ".m4a": "audio/mp4",
+        ".mp4": "video/mp4",
+        ".webm": "video/webm",
+        ".avi": "video/x-msvideo",
+        ".mov": "video/quicktime",
+    }
+    mime_type = mime_types.get(suffix, "application/octet-stream")
+    with open(file_path, "rb") as f:
+        data = base64.b64encode(f.read()).decode()
+    return f"data:{mime_type};base64,{data}", mime_type
+
+
+# MIME types whose subtype is not the audio format name the API expects in
+# `input_audio.format` (e.g. "audio/mpeg" must be sent as "mp3").
+_AUDIO_FORMAT_BY_MIME = {
+    "audio/mpeg": "mp3",
+    "audio/mp4": "m4a",
+}
+
+
+def build_message_content(text: str, files: List[str]) -> List[Dict]:
+    """Build an OpenAI-style multimodal content list from text and file paths.
+
+    Images and videos are sent as data URIs, audio as raw base64 plus a
+    format name. Files with any other MIME type are skipped. An empty text
+    part is returned when there is nothing else to send.
+    """
+    content = []
+    if text:
+        content.append({"type": "text", "text": text})
+    for fp in files:
+        uri, mt = file_to_base64(fp)
+        if mt.startswith("image/"):
+            content.append({"type": "image_url", "image_url": {"url": uri}})
+        elif mt.startswith("audio/"):
+            content.append(
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Strip the "data:<mime>;base64," prefix.
+                        "data": uri.split(",")[1],
+                        "format": _AUDIO_FORMAT_BY_MIME.get(mt, mt.split("/")[1]),
+                    },
+                }
+            )
+        elif mt.startswith("video/"):
+            content.append({"type": "video_url", "video_url": {"url": uri}})
+    return content or [{"type": "text", "text": ""}]
+
+
+_CHAT_NATIVE_KEYS = {"max_tokens", "temperature", "top_p"}
+
+
+def _split_native(params: Dict[str, Any], native: set) -> tuple[Dict, Dict]:
+    """Partition params into (SDK-native kwargs, extra_body for the rest)."""
+    native_kwargs = {k: v for k, v in params.items() if k in native}
+    extra = {k: v for k, v in params.items() if k not in native}
+    return native_kwargs, extra
+
+
+def chat_inference(message, history):
+    """Stream a chat completion for `message` given the prior `history`.
+
+    Yields the accumulated response text after every received chunk. When
+    the backend is unhealthy or the request fails, a single explanatory
+    message is yielded instead of raising.
+    """
+    cfg = settings.ttt
+    client = clients["ttt"]
+    raw_params = {**ChatParams().model_dump(), **cfg.params}
+    native_kwargs, extra_body = _split_native(raw_params, _CHAT_NATIVE_KEYS)
+
+    if not health["ttt"]:
+        yield (
+            "Chat backend is currently unreachable. The status banner above "
+            "will update once it comes back online."
+        )
+        return
+
+    try:
+        messages = []
+        if cfg.system_prompt:
+            sp = cfg.system_prompt.replace("{date}", str(date.today()))
+            messages.append({"role": "system", "content": sp})
+
+        for msg in history:
+            content = msg.get("content", "")
+            if isinstance(content, dict) and "path" in content:
+                # History entry that refers to an uploaded file.
+                content = build_message_content("", [content["path"]])
+            elif not isinstance(content, (str, list)):
+                continue
+            messages.append({"role": msg.get("role", "user"), "content": content})
+
+        if isinstance(message, dict):
+            text = message.get("text", "") or ""
+            # `or []`: tolerate an explicit None as well as a missing key.
+            files = message.get("files") or []
+            content = build_message_content(text, files)
+            messages.append({"role": "user", "content": content})
+        else:
+            messages.append({"role": "user", "content": str(message)})
+
+        create_kwargs: Dict[str, Any] = dict(
+            model=cfg.model_name,
+            messages=messages,
+            stream=True,
+            **native_kwargs,
+        )
+        if extra_body:
+            create_kwargs["extra_body"] = extra_body
+        stream = client.chat.completions.create(**create_kwargs)
+
+        response = ""
+        for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.content:
+                response += chunk.choices[0].delta.content
+                yield response
+
+    except Exception as e:
+        log.error(f"Chat error: {e}")
+        yield f"Error: {e}"
+
+
+_TTS_NATIVE_KEYS = {"voice", "response_format", "speed"}
+
+
+def tts_inference(text: str, voice: str, response_format: str):
+    """Synthesize `text` to speech with the tts backend.
+
+    Returns:
+        ``(sample_rate, samples)`` for wav output, otherwise the path of a
+        temporary file holding the encoded audio.
+
+    Raises:
+        gr.Error: if the backend is unhealthy or the request fails.
+    """
+    cfg = settings.tts
+    client = clients["tts"]
+
+    if not health["tts"]:
+        raise gr.Error("TTS backend is currently unreachable.")
+
+    raw_params = {**TTSParams().model_dump(), **cfg.params}
+    # Live UI values take precedence over configured defaults.
+    raw_params["response_format"] = response_format
+    if voice:
+        raw_params["voice"] = voice
+    else:
+        raw_params.pop("voice", None)
+    native_kwargs, extra_body = _split_native(raw_params, _TTS_NATIVE_KEYS)
+
+    try:
+        create_kwargs: Dict[str, Any] = dict(
+            model=cfg.model_name,
+            input=text,
+            **native_kwargs,
+        )
+        if extra_body:
+            create_kwargs["extra_body"] = extra_body
+        response = client.audio.speech.create(**create_kwargs)
+        audio_bytes = response.read()
+
+        if response_format == "wav":
+            sr, audio = wavfile.read(io.BytesIO(audio_bytes))
+            return (sr, audio)
+        # delete=False: the file must outlive this call so the returned path
+        # can still be read by the caller.
+        with tempfile.NamedTemporaryFile(
+            suffix=f".{response_format}", delete=False
+        ) as f:
+            f.write(audio_bytes)
+            return f.name
+
+    except Exception as e:
+        log.error(f"TTS error: {e}")
+        raise gr.Error(f"TTS error: {e}") from e
+
+
+_IMAGE_NATIVE_KEYS = {"size", "style", "quality", "n", "response_format"}
+
+
+def image_inference(
+    prompt: str,
+    negative_prompt: Optional[str] = None,
+    size: Optional[str] = None,
+    style: Optional[str] = None,
+    quality: Optional[str] = None,
+):
+    """Generate one image for `prompt` with the tti backend.
+
+    Returns:
+        The decoded image as a PIL ``Image``.
+
+    Raises:
+        gr.Error: if the backend is unhealthy, the request fails, or the
+            response carries no base64 image data.
+    """
+    cfg = settings.tti
+    client = clients["tti"]
+
+    if not health["tti"]:
+        raise gr.Error("Image backend is currently unreachable.")
+
+    raw_params: Dict[str, Any] = {**cfg.params}
+    # Live UI values override config; None/empty means don't send.
+    for key, val in (("size", size), ("style", style), ("quality", quality)):
+        if val:
+            raw_params[key] = val
+    raw_params.setdefault("response_format", "b64_json")
+    if negative_prompt:
+        raw_params["negative_prompt"] = negative_prompt
+
+    native_kwargs, extra_body = _split_native(raw_params, _IMAGE_NATIVE_KEYS)
+
+    try:
+        create_kwargs: Dict[str, Any] = dict(
+            model=cfg.model_name,
+            prompt=prompt,
+            **native_kwargs,
+        )
+        if extra_body:
+            create_kwargs["extra_body"] = extra_body
+        response = client.images.generate(**create_kwargs)
+        if response.data and response.data[0].b64_json:
+            return Image.open(io.BytesIO(base64.b64decode(response.data[0].b64_json)))
+        raise gr.Error("No image data received from model")
+
+    except gr.Error:
+        # Already user-facing; do not wrap it a second time below.
+        raise
+    except Exception as e:
+        log.error(f"Image generation error: {e}")
+        raise gr.Error(f"Image generation error: {e}") from e
+
+
+theme = gr.themes.Default(**settings.theme_params)
+if settings.theme_params_extended:
+    theme.set(**settings.theme_params_extended)
+
+blocks_kwargs = {
+    "fill_height": True,
+    "title": settings.page_title,
+    "theme": theme,
+    "css": settings.css_overrides,
+    "js": settings.custom_javascript,
+}
+launch_kwargs = {"server_name": settings.host_address}
+
+
+with gr.Blocks(**blocks_kwargs) as demo:
+    gr.Markdown(f"# {settings.page_title}")
+    if settings.page_description:
+        gr.Markdown(settings.page_description)
+
+    # {backend_name: (status_markdown, [widgets to toggle when unreachable])}
+    health_widgets: Dict[str, tuple] = {}
+
+    with gr.Tabs():
+        if settings.ttt:
+            with gr.Tab("Chat"):
+                gr.Markdown(f"**Model:** `{settings.ttt.model_name}`")
+                chat_status = gr.Markdown(_status_markdown("ttt"))
+
+                chatbot = gr.Chatbot(
+                    type="messages",
+                    height="65vh",
+                    resizable=True,
+                    sanitize_html=True,
+                    autoscroll=True,
+                    show_copy_button=True,
+                    allow_tags=False,
+                    latex_delimiters=[
+                        {"left": "$$", "right": "$$", "display": True},
+                        {"left": "$", "right": "$", "display": False},
+                    ],
+                )
+                textbox = gr.MultimodalTextbox(
+                    file_types=["image", "audio", "video"],
+                    file_count="multiple",
+                    placeholder="Type a message or upload files...",
+                    show_label=False,
+                )
+                gr.ChatInterface(
+                    fn=chat_inference,
+                    type="messages",
+                    multimodal=True,
+                    chatbot=chatbot,
+                    textbox=textbox,
+                    analytics_enabled=False,
+                )
+                health_widgets["ttt"] = (chat_status, [textbox])
+
+        if settings.tts:
+            tts_defaults = TTSParams(**settings.tts.params)
+            tts_ui = TTSUI(**(settings.tts.ui or {}))
+
+            with gr.Tab("Text-to-Speech"):
+                gr.Markdown(f"**Model:** `{settings.tts.model_name}`")
+                tts_status = gr.Markdown(_status_markdown("tts"))
+
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        tts_input = gr.Textbox(
+                            label="Text to Speak",
+                            placeholder="Enter the text you want to convert to speech...",
+                            lines=5,
+                        )
+                        tts_output = gr.Audio(
+                            label="Generated Audio",
+                            show_download_button=True,
+                        )
+                    with gr.Column(scale=1):
+                        # Dropdown when choices are configured, free text
+                        # otherwise.
+                        if tts_ui.voice_choices:
+                            tts_voice = gr.Dropdown(
+                                choices=tts_ui.voice_choices,
+                                value=(
+                                    tts_defaults.voice
+                                    if tts_defaults.voice in tts_ui.voice_choices
+                                    else tts_ui.voice_choices[0]
+                                ),
+                                label="Voice",
+                            )
+                        else:
+                            tts_voice = gr.Textbox(
+                                value=tts_defaults.voice,
+                                label="Voice",
+                                info="Voice name supported by the model",
+                            )
+                        tts_format = gr.Dropdown(
+                            choices=tts_ui.format_choices,
+                            value=(
+                                tts_defaults.response_format
+                                if tts_defaults.response_format in tts_ui.format_choices
+                                else tts_ui.format_choices[0]
+                            ),
+                            label="Format",
+                        )
+                        tts_btn = gr.Button("Generate Speech", variant="primary")
+
+                tts_btn.click(
+                    tts_inference, [tts_input, tts_voice, tts_format], tts_output
+                )
+                health_widgets["tts"] = (
+                    tts_status,
+                    [tts_input, tts_voice, tts_format, tts_btn],
+                )
+
+        if settings.tti:
+            img_defaults = ImageGenParams(**settings.tti.params)
+            img_ui = ImageUI(**(settings.tti.ui or {}))
+
+            with gr.Tab("Image Generation"):
+                gr.Markdown(f"**Model:** `{settings.tti.model_name}`")
+                image_status = gr.Markdown(_status_markdown("tti"))
+
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        img_prompt = gr.Textbox(
+                            label="Prompt", placeholder="Describe the image...", lines=3
+                        )
+                        img_negative = gr.Textbox(
+                            label="Negative Prompt (optional)",
+                            lines=2,
+                            visible=img_ui.show_negative_prompt,
+                        )
+                        img_output = gr.Image(label="Generated Image", height=512)
+                    with gr.Column(scale=1):
+                        img_size = gr.Dropdown(
+                            choices=img_ui.size_choices,
+                            value=(
+                                img_defaults.size
+                                if img_defaults.size in img_ui.size_choices
+                                else (
+                                    img_ui.size_choices[0]
+                                    if img_ui.size_choices
+                                    else None
+                                )
+                            ),
+                            label="Size",
+                        )
+                        # style/quality hidden unless configured (DALL-E-3-specific).
+                        img_style = gr.Dropdown(
+                            choices=img_ui.style_choices or [],
+                            value=img_defaults.style,
+                            label="Style",
+                            visible=bool(img_ui.style_choices),
+                        )
+                        img_quality = gr.Dropdown(
+                            choices=img_ui.quality_choices or [],
+                            value=img_defaults.quality,
+                            label="Quality",
+                            visible=bool(img_ui.quality_choices),
+                        )
+                        img_btn = gr.Button("Generate Image", variant="primary")
+
+                img_btn.click(
+                    image_inference,
+                    [img_prompt, img_negative, img_size, img_style, img_quality],
+                    img_output,
+                )
+                health_widgets["tti"] = (
+                    image_status,
+                    [
+                        img_prompt,
+                        img_negative,
+                        img_size,
+                        img_style,
+                        img_quality,
+                        img_btn,
+                    ],
+                )
+
+    # Order must match refresh_health: status_md then inputs, per backend.
+    health_outputs = [
+        w
+        for name in enabled
+        for w in (health_widgets[name][0], *health_widgets[name][1])
+    ]
+
+    def refresh_health() -> List[Any]:
+        """Return one update per widget in `health_outputs`, in that order."""
+        updates: List[Any] = []
+        for name in enabled:
+            _, inputs = health_widgets[name]
+            updates.append(gr.update(value=_status_markdown(name)))
+            updates.extend(gr.update(interactive=health[name]) for _ in inputs)
+        return updates
+
+    # Timer is per-session and only drives the UI; probing is global.
+    demo.load(refresh_health, inputs=None, outputs=health_outputs)
+    gr.Timer(PROBE_INTERVAL_S).tick(refresh_health, inputs=None, outputs=health_outputs)
+
+
+if __name__ == "__main__":
+    for name in enabled:
+        cfg = getattr(settings, name)
+        log.info(f" {name}: model={cfg.model_name} url={cfg.backend_url}")
+    demo.queue(default_concurrency_limit=settings.app.concurrency_limit).launch(
+        **launch_kwargs
+    )
diff --git a/web-apps/omni/defaults.yml b/web-apps/omni/defaults.yml
new file mode 100644
index 00000000..8c06df4d
--- /dev/null
+++ b/web-apps/omni/defaults.yml
@@ -0,0 +1,64 @@
+host_address: 0.0.0.0
+
+page_title: Omni Interface
+page_description: A unified interface for multimodal AI.
+
+# Global app tunables
+app:
+  probe_timeout_s: 2.0
+  probe_interval_s: 10.0
+  concurrency_limit: 5
+  # Default OpenAI client request timeout (seconds) for inference calls.
+  request_timeout_s: 1800.0
+
+# Each backend is optional; only configured backends show in the UI.
+#
+# Schema per backend:
+#   backend_url:   OpenAI-compatible base URL (without /v1)
+#   model_name:    Model identifier accepted by the backend
+#   system_prompt: (ttt only) optional system message; {date} is substituted
+#   params:        Defaults sent on every request. Unknown keys go via extra_body.
+#   ui:            Optional dropdown choices and slider ranges (see below).
+#   request_timeout_s: Optional per-backend override of app.request_timeout_s.
+
+# TTT (text-to-text) backend - for text chat and multimodal understanding
+# ttt:
+#   backend_url: http://localhost:8000
+#   model_name: Qwen/Qwen2.5-Omni-7B
+#   system_prompt: "You are a helpful AI assistant. Today's date is {date}."
+#   params:
+#     max_tokens: 1024
+#     temperature: 0.7
+#     top_p: 0.9
+#     # Any extras (forwarded via extra_body):
+#     # repetition_penalty: 1.05
+#     # seed: 42
+
+# TTS backend - for text-to-speech
+tts:
+  backend_url: http://localhost:8000
+  model_name: mistralai/Voxtral-4B-TTS-2603
+  params:
+    voice: casual_male
+    response_format: wav
+  # ui:
+  #   voice_choices: ["casual_male", "casual_female"]
+  #   format_choices: ["wav", "mp3", "ogg", "aac", "flac"]
+
+# TTI (text-to-image) generation backend
+# tti:
+#   backend_url: http://localhost:8002
+#   model_name: Tongyi-MAI/Z-Image-Turbo
+#   params:
+#     size: 1024x1024
+#   ui:
+#     size_choices: ["1024x1024", "768x1024", "1024x768"]
+#     # style_choices: ["vivid", "natural"]
+#     # quality_choices: ["standard", "hd"]
+#     show_negative_prompt: true
+
+# Gradio theme (css_overrides / custom_javascript: null means not set)
+theme_params: {}
+theme_params_extended: {}
+css_overrides: null
+custom_javascript: null
diff --git a/web-apps/omni/requirements.txt b/web-apps/omni/requirements.txt
new file mode 100644
index 00000000..ffe1c299
--- /dev/null
+++ b/web-apps/omni/requirements.txt
@@ -0,0 +1,10 @@ +gradio<6 +gradio_client +httpx +openai +pydantic +structlog +pillow +numpy +scipy +../utils diff --git a/web-apps/utils/utils.py b/web-apps/utils/utils.py index cb99776b..cf8db30a 100644 --- a/web-apps/utils/utils.py +++ b/web-apps/utils/utils.py @@ -24,8 +24,10 @@ def get_logger(): structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(log_level)) return structlog.get_logger() + log = get_logger() + class LLMParams(BaseModel): """ Parameters for vLLM API requests. For details see