Skip to content

Commit 31c6b9b

Browse files
authored
fix(k8s): resolve K8s upstream in add_exapp to prevent cache overwrite (#101)
Signed-off-by: Oleksander Piskun <oleksandr2088@icloud.com>
1 parent f90fe9f commit 31c6b9b

8 files changed

Lines changed: 231 additions & 2 deletions

.github/workflows/tests-deploy-k8s-clusterip.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ jobs:
179179
K8S_EXPOSE_TYPE: clusterip
180180
run: python3 apps/app_api/tests/test_occ_commands_k8s.py
181181

182+
- name: Verify HaRP routing across enable lifecycle (ClusterIP)
183+
env:
184+
K8S_EXPOSE_TYPE: clusterip
185+
run: pip install --quiet requests && python3 harp/tests/test_k8s_routing.py
186+
182187
- name: Collect HaRP logs
183188
if: always()
184189
run: docker logs appapi-harp > harp.log 2>&1

.github/workflows/tests-deploy-k8s-loadbalancer.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ jobs:
185185
K8S_EXPOSE_TYPE: loadbalancer
186186
run: python3 apps/app_api/tests/test_occ_commands_k8s.py
187187

188+
- name: Verify HaRP routing across enable lifecycle (LoadBalancer)
189+
env:
190+
K8S_EXPOSE_TYPE: loadbalancer
191+
run: pip install --quiet requests && python3 harp/tests/test_k8s_routing.py
192+
188193
- name: Collect HaRP logs
189194
if: always()
190195
run: docker logs appapi-harp > harp.log 2>&1

.github/workflows/tests-deploy-k8s-manual.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,12 @@ jobs:
181181
MANUAL_CLUSTER_IP: ${{ env.MANUAL_CLUSTER_IP }}
182182
run: python3 apps/app_api/tests/test_occ_commands_k8s.py
183183

184+
- name: Verify HaRP routing across enable lifecycle (manual)
185+
env:
186+
K8S_EXPOSE_TYPE: manual
187+
MANUAL_CLUSTER_IP: ${{ env.MANUAL_CLUSTER_IP }}
188+
run: pip install --quiet requests && python3 harp/tests/test_k8s_routing.py
189+
184190
- name: Collect HaRP logs
185191
if: always()
186192
run: docker logs appapi-harp > harp.log 2>&1

.github/workflows/tests-deploy-k8s.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ jobs:
179179
K8S_EXPOSE_TYPE: nodeport
180180
run: python3 apps/app_api/tests/test_occ_commands_k8s.py
181181

182+
- name: Verify HaRP routing across enable lifecycle (NodePort)
183+
env:
184+
K8S_EXPOSE_TYPE: nodeport
185+
run: pip install --quiet requests && python3 harp/tests/test_k8s_routing.py
186+
182187
- name: Collect HaRP logs
183188
if: always()
184189
run: docker logs appapi-harp > harp.log 2>&1

haproxy_agent.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -720,10 +720,19 @@ async def add_exapp(request: web.Request):
720720
data = await request.json()
721721
app_id = request.match_info["app_id"].lower()
722722
try:
723-
EXAPP_CACHE[app_id] = ExApp.model_validate(data)
724-
_EXAPP_NEGATIVE_CACHE.pop(app_id, None)
723+
exapp_record = ExApp.model_validate(data)
725724
except ValidationError:
726725
raise web.HTTPBadRequest() from None
726+
727+
# AppAPI sends the logical host (the appId). For K8s it must be replaced with
728+
# the real upstream or SPOA would NXDOMAIN. Mirror _fetch_exapp_record's cache-miss path.
729+
k8s_upstream = await _k8s_resolve_exapp_upstream(app_id)
730+
if k8s_upstream:
731+
exapp_record.host, exapp_record.port = k8s_upstream
732+
exapp_record.resolved_host = ""
733+
734+
EXAPP_CACHE[app_id] = exapp_record
735+
_EXAPP_NEGATIVE_CACHE.pop(app_id, None)
727736
return web.HTTPNoContent()
728737

729738

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ lint.extend-per-file-ignores."development/**/*.py" = [
1616
"D",
1717
"S",
1818
]
19+
lint.extend-per-file-ignores."tests/**/*.py" = [
20+
"D",
21+
"S",
22+
]
1923

2024
[tool.isort]
2125
profile = "black"

tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
2+
# SPDX-License-Identifier: AGPL-3.0-or-later

tests/test_k8s_routing.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
2+
# SPDX-License-Identifier: AGPL-3.0-or-later
3+
"""K8s ExApp routing integration test.
4+
5+
Verifies that HaRP keeps routing requests to a K8s-deployed ExApp correctly across the full enable/disable lifecycle.
6+
Catches the cache-overwrite bug where AppAPI's `harpExAppUpdate(true)` push (POST /exapp_storage/{appId})
7+
ran at the end of `enableExApp` and replaced the K8s upstream that `/k8s/exapp/expose` had correctly
8+
resolved earlier with the logical Docker-style host (the appId).
9+
10+
Test flow:
11+
1. Register `app-skeleton-python` on the K8s daemon
12+
(deploy -> expose -> init -> enable -> harpExAppUpdate(true) chain has fully completed by the time the call returns)
13+
2. Hit /exapps/<appid>/public via HaRP. The /public route is declared PUBLIC in app-skeleton-python's info.xml,
14+
so no AppAPI auth headers are needed; we just need HaRP to route successfully
15+
3. Disable, then re-enable. Re-enable also runs harpExAppUpdate(true)
16+
4. Hit /public again — must still return 200
17+
18+
Prerequisites (provided by the K8s test workflows):
19+
- Nextcloud + AppAPI installed, k3s running, HaRP container with K8s backend up, k8s_test daemon registered as default
20+
- app-skeleton-python image pre-pulled into k3s
21+
- NODE_IP env var pointing to the k3s node
22+
23+
Env:
24+
NODE_IP (required) k3s node IP exposing HaRP and NodePort Services
25+
HP_SHARED_KEY HaRP shared key (only used in failure messages)
26+
K8S_EXPOSE_TYPE informational; matches the workflow under test
27+
ROUTING_TEST_RETRIES override the retry count for routing checks
28+
ROUTING_TEST_RETRY_INTERVAL override the per-retry sleep (seconds)
29+
"""
30+
31+
import json
32+
import os
33+
import sys
34+
import time
35+
from subprocess import run
36+
37+
import requests
38+
39+
EXPOSE_TYPE = os.environ.get("K8S_EXPOSE_TYPE", "nodeport")
40+
NODE_IP = os.environ.get("NODE_IP", "")
41+
HP_SHARED_KEY = os.environ.get("HP_SHARED_KEY", "test_shared_key_12345")
42+
MANUAL_CLUSTER_IP = os.environ.get("MANUAL_CLUSTER_IP", "10.43.200.200")
43+
ROUTING_RETRIES = int(os.environ.get("ROUTING_TEST_RETRIES", "15"))
44+
ROUTING_INTERVAL = float(os.environ.get("ROUTING_TEST_RETRY_INTERVAL", "1.0"))
45+
46+
DAEMON = "k8s_test"
47+
APP_ID = "app-skeleton-python"
48+
NAMESPACE = "nextcloud-exapps"
49+
APP_PORT = 23000
50+
SKELETON_XML_URL = "https://raw.githubusercontent.com/nextcloud/app-skeleton-python/main/appinfo/info.xml"
51+
52+
53+
def _harp_url() -> str:
54+
if not NODE_IP:
55+
sys.exit("NODE_IP env var required (export from kubectl get node ...)")
56+
return f"http://{NODE_IP}:8780"
57+
58+
59+
def occ(args: list[str], *, check: bool = False, timeout: int | None = None):
60+
cmd = ["php", "occ", "--no-warnings", *args]
61+
return run(cmd, capture_output=True, check=check, timeout=timeout)
62+
63+
64+
def http_get_via_harp(path: str, *, timeout: float = 10.0) -> requests.Response:
65+
"""Hit HaRP's HTTP frontend (port 8780) directly, bypassing nginx."""
66+
return requests.get(f"{_harp_url()}/exapps/{APP_ID}{path}", timeout=timeout)
67+
68+
69+
def assert_routing_works(label: str) -> None:
70+
"""Make a routed request via HaRP and assert the ExApp pod responded.
71+
72+
`/public` is a PUBLIC route in app-skeleton-python's info.xml, so it needs no AppAPI auth
73+
"""
74+
last: str | None = None
75+
for attempt in range(ROUTING_RETRIES):
76+
try:
77+
r = http_get_via_harp("/public")
78+
except requests.RequestException as e:
79+
last = f"request error: {e}"
80+
else:
81+
if r.status_code == 200:
82+
print(f" [{label}] routing OK (attempt {attempt + 1})")
83+
return
84+
last = f"HTTP {r.status_code}: {r.text[:200]!r}"
85+
time.sleep(ROUTING_INTERVAL)
86+
87+
raise AssertionError(
88+
f"[{label}] HaRP routing to '{APP_ID}' failed after {ROUTING_RETRIES} attempts: "
89+
f"{last}\n"
90+
f"A 404 here typically means HaRP's cached `host` for the ExApp is the appId rather "
91+
f"than a K8s upstream (NodePort/ClusterIP/LB), and the SPOA agent's resolve_ip() returned NXDOMAIN."
92+
f"See the comment on haproxy_agent.py:add_exapp."
93+
)
94+
95+
96+
def ensure_manual_service() -> None:
97+
"""Pre-create the operator-managed Service for manual expose tests.
98+
99+
For expose_type=manual, HaRP doesn't create a K8s Service — the operator is expected to.
100+
The k8s_test daemon is registered with a fixed upstream_host (MANUAL_CLUSTER_IP),
101+
so the Service we create here uses the same ClusterIP. No-op for other expose types.
102+
"""
103+
if EXPOSE_TYPE != "manual":
104+
return
105+
svc_name = f"nc-app-{APP_ID}"
106+
manifest = json.dumps({
107+
"apiVersion": "v1",
108+
"kind": "Service",
109+
"metadata": {"name": svc_name, "namespace": NAMESPACE},
110+
"spec": {
111+
"clusterIP": MANUAL_CLUSTER_IP,
112+
"selector": {"app": svc_name},
113+
"ports": [{"name": "http", "port": APP_PORT, "targetPort": APP_PORT}],
114+
},
115+
})
116+
r = run(
117+
["kubectl", "-n", NAMESPACE, "apply", "-f", "-"],
118+
input=manifest.encode(), capture_output=True,
119+
)
120+
if r.returncode != 0:
121+
sys.exit(
122+
f"Failed to pre-create operator Service '{svc_name}': "
123+
f"{r.stderr.decode(errors='replace')}"
124+
)
125+
126+
127+
def cleanup() -> None:
128+
occ(
129+
["app_api:app:unregister", APP_ID, "--rm-data", "--silent", "--force"],
130+
timeout=180,
131+
)
132+
if EXPOSE_TYPE == "manual":
133+
run(
134+
["kubectl", "-n", NAMESPACE, "delete", "service",
135+
f"nc-app-{APP_ID}", "--ignore-not-found=true"],
136+
capture_output=True,
137+
)
138+
139+
140+
def register_app() -> None:
141+
print(f"Registering {APP_ID} on daemon '{DAEMON}'...")
142+
r = occ(
143+
[
144+
"app_api:app:register", APP_ID, DAEMON,
145+
"--info-xml", SKELETON_XML_URL,
146+
"--wait-finish",
147+
],
148+
timeout=600,
149+
)
150+
if r.returncode != 0:
151+
sys.exit(
152+
f"Register failed (exit {r.returncode}):\n"
153+
f" stdout: {r.stdout.decode(errors='replace')}\n"
154+
f" stderr: {r.stderr.decode(errors='replace')}"
155+
)
156+
157+
158+
def disable_then_enable() -> None:
159+
print("Disabling...")
160+
r = occ(["app_api:app:disable", APP_ID], timeout=180)
161+
if r.returncode != 0:
162+
sys.exit(f"Disable failed: {r.stdout.decode(errors='replace')}")
163+
164+
print("Re-enabling...")
165+
r = occ(["app_api:app:enable", APP_ID], timeout=300)
166+
if r.returncode != 0:
167+
sys.exit(f"Enable failed: {r.stdout.decode(errors='replace')}")
168+
169+
170+
def main() -> None:
171+
print(f"K8s routing test (expose_type={EXPOSE_TYPE}, harp={_harp_url()})")
172+
173+
cleanup() # any leftover from a previous test run
174+
ensure_manual_service()
175+
register_app()
176+
177+
try:
178+
# After register:
179+
# expose populated cache correctly, then init=100% triggered enableExApp -> harpExAppUpdate(true).
180+
assert_routing_works("after-register")
181+
182+
# Re-enable goes through harpExAppUpdate(true) again,
183+
# exercising the same cache write path without a preceding expose call.
184+
disable_then_enable()
185+
assert_routing_works("after-re-enable")
186+
187+
print("\nPASS: HaRP keeps routing correct across the enable lifecycle.")
188+
finally:
189+
cleanup()
190+
191+
192+
if __name__ == "__main__":
193+
main()

0 commit comments

Comments
 (0)