Skip to content

Commit ec14444

Browse files
authored
Merge pull request microsoft#66 from xlab-uiuc/astro-observe
Fix observability APIs for astronomy shop
2 parents 7f0edd8 + da31cbc commit ec14444

3 files changed

Lines changed: 63 additions & 35 deletions

File tree

aiopslab/observer/trace_api.py

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,18 @@ def __init__(self, namespace: str):
2222
self.namespace = namespace
2323
self.stop_event = threading.Event()
2424

25-
# NOTE: it may not be jaeger-out for other apps
26-
node_port = self.get_nodeport("jaeger", namespace)
27-
if node_port:
28-
self.base_url = f"http://localhost:{node_port}"
29-
else:
30-
self.base_url = "http://localhost:16686"
25+
if self.namespace == "astronomy-shop":
26+
# No NodePort in astronomy shop
27+
self.base_url = "http://localhost:16686/jaeger/ui"
3128
self.start_port_forward()
29+
else:
30+
# Other namespaces may expose a NodePort
31+
node_port = self.get_nodeport("jaeger", namespace)
32+
if node_port:
33+
self.base_url = f"http://localhost:{node_port}"
34+
else:
35+
self.base_url = "http://localhost:16686"
36+
self.start_port_forward()
3237

3338
def get_nodeport(self, service_name, namespace):
3439
"""Fetch the NodePort for the given service."""
@@ -74,19 +79,38 @@ def print_output(self, stream):
7479
def is_port_in_use(self, port):
7580
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
7681
return s.connect_ex(("127.0.0.1", port)) == 0
82+
83+
def get_jaeger_pod_name(self):
84+
try:
85+
result = subprocess.check_output(
86+
["kubectl", "get", "pods", "-n", self.namespace,
87+
"-l", "app.kubernetes.io/name=jaeger",
88+
"-o", "jsonpath={.items[0].metadata.name}"],
89+
text=True
90+
)
91+
return result.strip()
92+
except subprocess.CalledProcessError as e:
93+
print("Error getting Jaeger pod name:", e)
94+
raise
7795

7896
def start_port_forward(self):
79-
"""Starts kubectl port-forward command to access Jaeger service."""
97+
"""Starts kubectl port-forward command to access Jaeger service or pod."""
8098
for attempt in range(3):
8199
if self.is_port_in_use(16686):
82100
print(
83-
f"Port 16686 is already in use. Attempt {attempt + 1} of {3}. Retrying in {3} seconds..."
101+
f"Port 16686 is already in use. Attempt {attempt + 1} of 3. Retrying in 3 seconds..."
84102
)
85103
time.sleep(3)
86104
continue
87105

88-
# command = "kubectl port-forward svc/jaeger 16686:16686 -n hotel-reservation"
89-
command = f"kubectl port-forward svc/jaeger 16686:16686 -n {self.namespace}"
106+
# Use pod port-forwarding for astronomy-shop only
107+
if self.namespace == "astronomy-shop":
108+
pod_name = self.get_jaeger_pod_name()
109+
command = f"kubectl port-forward pod/{pod_name} 16686:16686 -n {self.namespace}"
110+
else:
111+
command = f"kubectl port-forward svc/jaeger 16686:16686 -n {self.namespace}"
112+
113+
print("Starting port-forward with command:", command)
90114
self.port_forward_process = subprocess.Popen(
91115
command,
92116
shell=True,
@@ -103,17 +127,17 @@ def start_port_forward(self):
103127
)
104128
thread_out.start()
105129
thread_err.start()
106-
time.sleep(3) # Wait a bit for the port-forward to establish
107130

108-
if (
109-
self.port_forward_process.poll() is None
110-
): # Check if the process is still running
131+
time.sleep(3) # Let port-forward initialize
132+
133+
if self.port_forward_process.poll() is None:
111134
print("Port forwarding established successfully.")
112135
break
113136
else:
114137
print("Port forwarding failed. Retrying...")
115138
else:
116-
print("Failed to establish port forwarding after multiple attempts.")
139+
print("Failed to establish port forwarding after 3 attempts.")
140+
117141
# TODO: modify this command for other microservices
118142
# command = "kubectl port-forward svc/jaeger 16686:16686 -n hotel-reservation"
119143
# self.port_forward_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
@@ -151,13 +175,14 @@ def cleanup(self):
151175
def get_services(self) -> list:
152176
"""Fetch a list of services from the tracing API."""
153177
url = f"{self.base_url}/api/services"
154-
response = requests.get(url)
155-
if response.status_code == 200:
156-
data = response.json()
157-
# print(f"data: {response}")
158-
return data.get("data", [])
159-
else:
160-
print(f"Failed to get services: {response.status_code}")
178+
headers = {"Accept": "application/json"} if self.namespace == "astronomy-shop" else {}
179+
180+
try:
181+
response = requests.get(url, headers=headers)
182+
response.raise_for_status()
183+
return response.json().get("data", [])
184+
except Exception as e:
185+
print(f"Failed to get services: {e}")
161186
return []
162187

163188
def get_traces(
@@ -171,15 +196,14 @@ def get_traces(
171196
Fetch traces for a specific service between start_time and end_time.
172197
If limit is not specified, all available traces are fetched.
173198
"""
174-
# Calculate the lookback in milliseconds.
175199
lookback = int((datetime.now() - start_time).total_seconds())
176-
177200
url = f"{self.base_url}/api/traces?service={service_name}&lookback={lookback}s"
178201
if limit is not None:
179202
url += f"&limit={limit}"
180203

204+
headers = {"Accept": "application/json"} if self.namespace == "astronomy-shop" else {}
181205
try:
182-
response = requests.get(url)
206+
response = requests.get(url, headers=headers)
183207
response.raise_for_status()
184208
return response.json().get("data", [])
185209
except requests.RequestException as e:

aiopslab/orchestrator/actions/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def get_logs(namespace: str, service: str) -> str:
3939
user_service_pod = kubectl.get_pod_name(
4040
namespace, f"io.kompose.service={service}"
4141
)
42+
elif namespace == "astronomy-shop":
43+
user_service_pod = kubectl.get_pod_name(
44+
namespace, f"app.kubernetes.io/name={service}"
45+
)
4246
elif namespace == "default" and "wrk2-job" in service:
4347
user_service_pod = kubectl.get_pod_name(namespace, f"job-name=wrk2-job")
4448
else:

aiopslab/orchestrator/problems/registry.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -201,16 +201,16 @@ def __init__(self):
201201
"wrong_bin_usage-analysis-1": WrongBinUsageAnalysis,
202202
"wrong_bin_usage-mitigation-1": WrongBinUsageMitigation,
203203
# K8S operator misoperation
204-
"operator_overload_replicas-detection-1": K8SOperatorOverloadReplicasDetection,
205-
"operator_overload_replicas-localization-1": K8SOperatorOverloadReplicasLocalization,
206-
"operator_non_existent_storage-detection-1": K8SOperatorNonExistentStorageDetection,
207-
"operator_non_existent_storage-localization-1": K8SOperatorNonExistentStorageLocalization,
208-
"operator_invalid_affinity_toleration-detection-1": K8SOperatorInvalidAffinityTolerationDetection,
209-
"operator_invalid_affinity_toleration-localization-1": K8SOperatorInvalidAffinityTolerationLocalization,
210-
"operator_security_context_fault-detection-1": K8SOperatorSecurityContextFaultDetection,
211-
"operator_security_context_fault-localization-1": K8SOperatorSecurityContextFaultLocalization,
212-
"operator_wrong_update_strategy-detection-1": K8SOperatorWrongUpdateStrategyDetection,
213-
"operator_wrong_update_strategy-localization-1": K8SOperatorWrongUpdateStrategyLocalization,
204+
# "operator_overload_replicas-detection-1": K8SOperatorOverloadReplicasDetection,
205+
# "operator_overload_replicas-localization-1": K8SOperatorOverloadReplicasLocalization,
206+
# "operator_non_existent_storage-detection-1": K8SOperatorNonExistentStorageDetection,
207+
# "operator_non_existent_storage-localization-1": K8SOperatorNonExistentStorageLocalization,
208+
# "operator_invalid_affinity_toleration-detection-1": K8SOperatorInvalidAffinityTolerationDetection,
209+
# "operator_invalid_affinity_toleration-localization-1": K8SOperatorInvalidAffinityTolerationLocalization,
210+
# "operator_security_context_fault-detection-1": K8SOperatorSecurityContextFaultDetection,
211+
# "operator_security_context_fault-localization-1": K8SOperatorSecurityContextFaultLocalization,
212+
# "operator_wrong_update_strategy-detection-1": K8SOperatorWrongUpdateStrategyDetection,
213+
# "operator_wrong_update_strategy-localization-1": K8SOperatorWrongUpdateStrategyLocalization,
214214
}
215215

216216
def get_problem_instance(self, problem_id: str):

0 commit comments

Comments
 (0)