|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# |
| 4 | +# Copyright 2026. |
| 5 | +# |
| 6 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +# you may not use this file except in compliance with the License. |
| 8 | +# You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, software |
| 13 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +# See the License for the specific language governing permissions and |
| 16 | +# limitations under the License. |
| 17 | + |
| 18 | +"""Vector database configuration builder for the OpenStack Lightspeed operator. |
| 19 | +
|
| 20 | +Runs as the second init container (`vector-database-config-build`), after |
| 21 | +`vector_database_collect.sh`. It loads operator-provided base configs, walks every |
| 22 | +vector DB directory left by the collect step, and writes merged configs back to |
| 23 | +the shared volume. |
| 24 | +
|
| 25 | +Input layout (under --vector-db-path, produced by vector_database_collect.sh): |
| 26 | + {vector-db-path}/ |
| 27 | + └── <image_uuid_dir>/ (random directory name from collect script) |
| 28 | + ├── vector_db/ |
| 29 | + │ ├── <vector-db-name>/ |
| 30 | + │ │ ├── llama-stack.yaml |
| 31 | + │ │ └── faiss_store.db |
| 32 | + │ └── ocp_X.YZ/ (optional, when OCP RAG is enabled) |
| 33 | + │ ├── llama-stack.yaml |
| 34 | + │ └── faiss_store.db |
| 35 | + └── embeddings_model/ |
| 36 | +
|
| 37 | +Output (written to --vector-db-path, same basenames as the base configs): |
| 38 | + {vector-db-path}/ |
| 39 | + ├── ogx_config.yaml |
| 40 | + ├── lightspeed-stack.yaml |
| 41 | + └── <collect-dir>/ (collected data preserved) |
| 42 | +
|
| 43 | +Processing: |
| 44 | + 1. For each subdirectory of */vector_db/, read its llama-stack.yaml. |
| 45 | + 2. For each detected llama-stack.yaml file, extract its data and inject |
| 46 | + the relevant entries into the output ogx_config.yaml and lightspeed-stack.yaml |
| 47 | + files. |
| 48 | + 3. Write the merged YAML next to the collected data. |
| 49 | +
|
| 50 | +Warning: This script only injects values into existing config structures. |
| 51 | +Base configs MUST contain the following sections; otherwise this script will fail: |
| 52 | +- OGX config: registered_resources.{models,vector_stores}, storage.backends, |
| 53 | + providers.{inference,vector_io} |
| 54 | +- Lightspeed Stack config: byok_rag, rag.inline |
| 55 | +
|
| 56 | +Arguments: |
| 57 | + --vector-db-path Shared volume path (input collected data, output configs) |
| 58 | + --ogx-config-path Path to the base OGX configuration file |
| 59 | + --lightspeed-stack-path Path to the base Lightspeed Stack configuration file |
| 60 | +""" |
| 61 | + |
| 62 | +import argparse |
| 63 | +from pathlib import Path |
| 64 | +from typing import Any, Iterable, Optional, Callable |
| 65 | +import logging |
| 66 | +import sys |
| 67 | + |
| 68 | +import yaml |
| 69 | + |
# str.format() template for the directory holding one vector database
# instance. The doubled braces are format escapes, so the rendered value keeps
# a literal "${env.VECTOR_DB_DATA_PATH}" placeholder that is meant to be
# resolved later from an environment variable (typically set by the operator).
VECTOR_DB_DIR_TEMPLATE = "${{env.VECTOR_DB_DATA_PATH}}/{uuid}/vector_db/{vector_db_name}"

# str.format() template for the directory holding an embedding model. The
# rendered "${env.VECTOR_DB_DATA_PATH}" placeholder is meant to be resolved by
# OGX from an environment variable.
EMBEDDING_MODEL_DIR_TEMPLATE = "${{env.VECTOR_DB_DATA_PATH}}/{uuid}/embeddings_model"

# str.format() template for the file in which the vector db data are stored
# (the vector DB directory template plus the store file name).
VECTOR_DB_DATA_PATH_TEMPLATE = VECTOR_DB_DIR_TEMPLATE + "/faiss_store.db"

# Name of the OGX configuration file shipped with the mounted vector database
# data. It is still called 'llama-stack.yaml' for backward compatibility,
# because the Llama Stack project was only recently renamed to OGX.
OGX_CONFIG_SOURCE_FILE_NAME = "llama-stack.yaml"
| 89 | + |
| 90 | + |
| 91 | +# -- Shared functions -------------------------------------------------------- |
def load_yaml_file(yaml_file_path: Path) -> dict[str, Any]:
    """Load a YAML file and return its top-level mapping.

    An empty document is returned as an empty dict. Every failure mode is
    logged and terminates the process with exit status 1: a missing file, any
    other read/decode error, a YAML syntax error, or a document whose top
    level is not a mapping.

    :param yaml_file_path: Path of the YAML file to read
    :return: Parsed YAML content as a dict
    """
    try:
        with open(yaml_file_path, "r", encoding="utf-8") as f:
            yaml_data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logging.error("YAML file not found: %s", yaml_file_path)
        sys.exit(1)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        # Mirror write_yaml_file: report unreadable or malformed files
        # instead of letting a raw traceback escape.
        logging.error("Failed to load YAML from %s: %s", yaml_file_path, e)
        sys.exit(1)

    # Callers index the result by section name, so a list or scalar document
    # would only fail later with a far less helpful error.
    if not isinstance(yaml_data, dict):
        logging.error(
            "YAML file %s must contain a mapping at the top level, got %s",
            yaml_file_path,
            type(yaml_data).__name__,
        )
        sys.exit(1)

    return yaml_data
| 100 | + |
| 101 | + |
def add_unique(lst: list, item: Any, key: Optional[str] = None) -> None:
    """Append ``item`` to ``lst`` unless an equivalent entry already exists.

    :param lst: List to modify in-place
    :param item: Item to add
    :param key: When truthy, two entries are considered equal if their
                ``.get(key)`` values match. Otherwise plain ``in`` membership
                (item equality) decides.
    """
    if key:
        already_present = False
        for existing in lst:
            if existing.get(key) == item.get(key):
                already_present = True
                break
    else:
        already_present = item in lst

    if not already_present:
        lst.append(item)
| 116 | + |
| 117 | + |
def write_yaml_file(yaml_data: dict[str, Any], dest_path: Path) -> None:
    """Dump ``yaml_data`` as block-style YAML into ``dest_path``.

    Missing parent directories are created first. Keys are written in their
    existing order (no sorting). Any OS or YAML error is logged and ends the
    process with exit status 1.

    :param yaml_data: Data to serialize
    :param dest_path: Destination file path
    """
    target_dir = dest_path.parent
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        with dest_path.open("w", encoding="utf-8") as stream:
            yaml.dump(yaml_data, stream, default_flow_style=False, sort_keys=False)
    except (OSError, yaml.YAMLError) as err:
        logging.error("Failed to write YAML to %s: %s", dest_path, err)
        sys.exit(1)
| 127 | + |
| 128 | + |
def iterate_vector_db_data_dir(vector_db_data_dir_path: Path) -> Iterable[Path]:
    """Yield every directory found directly inside any ``*/vector_db/`` folder.

    Each child of ``vector_db_data_dir_path`` is checked for a ``vector_db``
    subdirectory; children without one (including plain files) are skipped.
    Entries are visited in sorted order at both levels, because
    ``Path.iterdir()`` yields in arbitrary order and the merged configs should
    not change between runs over the same data.

    :param vector_db_data_dir_path: Root directory holding the collected data
    :return: Iterator over the individual vector DB directories
    """
    for image_uuid_dir in sorted(vector_db_data_dir_path.iterdir()):
        vector_db_path = image_uuid_dir.joinpath("vector_db")

        if not vector_db_path.is_dir():
            continue

        for folder in sorted(vector_db_path.iterdir()):
            if folder.is_dir():
                yield folder
| 140 | + |
| 141 | + |
def config_build(
    vector_db_parent_dir: Path,
    config_target_path: Path,
    config_populate_fn: Callable[[Path, dict[str, Any]], dict[str, Any]],
) -> None:
    """Merge all collected vector DB configs into one base config and write it.

    The base config is loaded from ``config_target_path``, passed through
    ``config_populate_fn`` once per vector DB directory found under
    ``vector_db_parent_dir``, and finally written to ``vector_db_parent_dir``
    under the base config's file name.

    :param vector_db_parent_dir: Shared volume holding the collected data;
                                 also the output directory
    :param config_target_path: Path to the base config file to populate
    :param config_populate_fn: Callback receiving the source config path and
                               the current target config, returning the
                               updated target config
    """
    config_target = load_yaml_file(config_target_path)
    for vector_db_dir in iterate_vector_db_data_dir(vector_db_parent_dir):
        ogx_config_source_path = vector_db_dir.joinpath(OGX_CONFIG_SOURCE_FILE_NAME)
        try:
            config_target = config_populate_fn(ogx_config_source_path, config_target)
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover sections that exist but are null
            # or of the wrong type (e.g. "registered_resources:" left empty),
            # which would otherwise escape as a raw traceback.
            logging.error(
                "Error processing config %s: missing or malformed required "
                "section in source or target file (%s)",
                ogx_config_source_path,
                e,
            )
            sys.exit(1)

    # The output keeps the base config's file name, placed on the shared volume.
    config_product_path = vector_db_parent_dir.joinpath(config_target_path.name)
    write_yaml_file(config_target, config_product_path)
| 162 | + |
| 163 | + |
| 164 | +# ---------------------------------------------------------------------------- |
| 165 | + |
| 166 | + |
| 167 | +# -- OGX functions ----------------------------------------------------------- |
def ogx_process(
    ogx_config_source_path: Path, ogx_config_target: dict[str, Any]
) -> dict[str, Any]:
    """Populate the target OGX config with vector DB data from a source OGX config.

    The image UUID and the vector DB name are derived from the source path,
    which is expected to look like
    ``<root>/<uuid>/vector_db/<vector-db-name>/llama-stack.yaml``. Only the
    first entry of each source list (models, vector_stores,
    providers.inference, providers.vector_io) is used.

    :param ogx_config_source_path: Path to the collected source config file
    :param ogx_config_target: Target OGX config; modified in place
    :return: The same ``ogx_config_target`` object, after population
    :raises KeyError: If a required section is missing in source or target
    :raises IndexError: If a required source list is empty or the path has
                        too few components
    """
    # Function-scope import keeps this block self-contained.
    from copy import deepcopy

    ogx_config_source = load_yaml_file(ogx_config_source_path)

    # E.g.: /data/<uuid>/vector_db/os_product_docs/llama-stack.yaml -> <uuid>
    image_uuid = ogx_config_source_path.parts[-4]

    # E.g.: /data/<uuid>/vector_db/os_product_docs/llama-stack.yaml -> os_product_docs
    vector_db_name = ogx_config_source_path.parts[-2]

    vector_db_file = VECTOR_DB_DATA_PATH_TEMPLATE.format(
        uuid=image_uuid, vector_db_name=vector_db_name
    )
    embedding_model_dir = EMBEDDING_MODEL_DIR_TEMPLATE.format(uuid=image_uuid)

    # Entries are deep-copied before being edited so nested structures of the
    # loaded source are never mutated or shared with the target.

    # Populate registered_resources.models
    src_model = deepcopy(ogx_config_source["registered_resources"]["models"][0])
    src_model["provider_model_id"] = embedding_model_dir
    tgt_models = ogx_config_target["registered_resources"]["models"]
    add_unique(tgt_models, src_model, "model_id")

    # Populate registered_resources.vector_stores
    embedding_model = f"{src_model['provider_id']}/{embedding_model_dir}"
    src_vstore = deepcopy(
        ogx_config_source["registered_resources"]["vector_stores"][0]
    )
    src_vstore["embedding_model"] = embedding_model
    tgt_vstores = ogx_config_target["registered_resources"]["vector_stores"]
    add_unique(tgt_vstores, src_vstore)

    # Populate storage.backends; the key is unique per image and vector DB
    storage_backend_key = f"kv_rag_{image_uuid}_{vector_db_name}"
    storage_backend = deepcopy(ogx_config_source["storage"]["backends"]["kv_rag"])
    storage_backend["db_path"] = vector_db_file
    ogx_config_target["storage"]["backends"][storage_backend_key] = storage_backend

    # Populate providers.inference
    src_inference = deepcopy(ogx_config_source["providers"]["inference"][0])
    tgt_inferences = ogx_config_target["providers"]["inference"]
    add_unique(tgt_inferences, src_inference)

    # Populate providers.vector_io, pointing it at the backend registered above
    src_vector_io = deepcopy(ogx_config_source["providers"]["vector_io"][0])
    src_vector_io["config"]["persistence"]["backend"] = storage_backend_key
    tgt_vector_ios = ogx_config_target["providers"]["vector_io"]
    add_unique(tgt_vector_ios, src_vector_io)

    return ogx_config_target
| 214 | + |
| 215 | + |
| 216 | +# ---------------------------------------------------------------------------- |
| 217 | + |
| 218 | + |
| 219 | +# -- Lightspeed Stack functions ---------------------------------------------- |
def lstack_process(
    ogx_config_source_path: Path, lstack_config_target: dict[str, Any]
) -> dict[str, Any]:
    """Add the RAG entries described by a source OGX config to the Lightspeed Stack config.

    The ID of the first vector store registered in the source config is
    appended to ``byok_rag`` (as a full entry) and to ``rag.inline`` (as a
    plain ID), unless an equal item is already present.

    :param ogx_config_source_path: Path to the collected source OGX config
    :param lstack_config_target: Lightspeed Stack config; modified in place
    :return: The same ``lstack_config_target`` object
    """
    source_config = load_yaml_file(ogx_config_source_path)
    store_id = source_config["registered_resources"]["vector_stores"][0][
        "vector_store_id"
    ]

    byok_entry = {
        "rag_id": store_id,
        "vector_db_id": store_id,
        # A multiplier of 1.0 gives every BYOK source the same weight.
        "score_multiplier": 1.0,
        # Lightspeed Stack currently insists on a "db_path" value even when
        # OGX runs in server mode. "NONE" is a placeholder that only exists
        # to satisfy that requirement; drop it once the requirement is gone.
        "db_path": "NONE",
    }
    add_unique(lstack_config_target["byok_rag"], byok_entry)

    inline_rags = lstack_config_target["rag"]["inline"]
    add_unique(inline_rags, store_id)

    return lstack_config_target
| 248 | + |
| 249 | + |
| 250 | +# ---------------------------------------------------------------------------- |
| 251 | + |
| 252 | + |
def parse_arguments() -> argparse.Namespace:
    """Define the command-line interface and parse the process arguments.

    All three options are required and are converted to ``pathlib.Path``.

    :return: Namespace with ``vector_db_path``, ``ogx_config_path`` and
             ``lightspeed_stack_path``
    """
    cli = argparse.ArgumentParser(
        description=(
            "Build vector database configuration files by merging collected "
            "vector DB data with base configs"
        )
    )

    # (option flag, help text) pairs; every option shares type and required.
    options = (
        (
            "--vector-db-path",
            "Path (as pathlib.Path) to the mounted vector DB data volume and output destination",
        ),
        (
            "--ogx-config-path",
            "Path (as pathlib.Path) to the base OGX configuration file",
        ),
        (
            "--lightspeed-stack-path",
            "Path (as pathlib.Path) to the base Lightspeed Stack configuration file",
        ),
    )
    for flag, help_text in options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    return cli.parse_args()
| 281 | + |
| 282 | + |
def main() -> None:
    """Build the OGX config, then the Lightspeed Stack config, from the CLI arguments."""
    args = parse_arguments()
    shared_volume = args.vector_db_path

    # Each base config is paired with the function that populates it; the OGX
    # config is built first, the Lightspeed Stack config second.
    build_plan = (
        (args.ogx_config_path, ogx_process),
        (args.lightspeed_stack_path, lstack_process),
    )
    for base_config_path, populate_fn in build_plan:
        config_build(shared_volume, base_config_path, populate_fn)


if __name__ == "__main__":
    main()
0 commit comments