Skip to content

Commit 62a3ef1

Browse files
committed
fix: attempt to setup log stream first then fall back to the log group then the stream
1 parent b492379 commit 62a3ef1

2 files changed

Lines changed: 90 additions & 14 deletions

File tree

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/aws/metrics/_cloudwatch_log_client.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,18 @@ def _send_log_batch(self, batch: LogEventBatch) -> None:
303303
logger.info("Log group or stream not found, creating resources and retrying")
304304
with suppress_instrumentation():
305305
try:
306-
# Create log group first
307-
self._create_log_group_if_needed()
308-
# Then create log stream
309-
self._create_log_stream_if_needed()
306+
# Try creating the log stream first — the log group
307+
# may already exist (e.g. managed by IaC)
308+
try:
309+
self._create_log_stream_if_needed()
310+
except ClientError as stream_error:
311+
stream_error_code = stream_error.response.get("Error", {}).get("Code")
312+
if stream_error_code == "ResourceNotFoundException":
313+
# Log group doesn't exist either, create both
314+
self._create_log_group_if_needed()
315+
self._create_log_stream_if_needed()
316+
else:
317+
raise
310318

311319
# Retry the PutLogEvents call
312320
response = self.logs_client.put_log_events(**put_log_events_input)

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/aws/metrics/test_cloudwatch_log_client.py

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,11 @@ def test_send_log_event_method_exists(self):
272272
self.fail(f"send_log_event raised an exception: {error}")
273273

274274
def test_send_log_batch_with_resource_not_found(self):
275-
"""Test lazy creation when put_log_events fails with ResourceNotFoundException."""
275+
"""Test lazy creation when put_log_events fails with ResourceNotFoundException.
276+
277+
The recovery path tries creating the log stream first. If that succeeds
278+
(log group already exists), log group creation is skipped entirely.
279+
"""
276280
batch = self.log_client._create_event_batch()
277281
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
278282

@@ -283,7 +287,7 @@ def test_send_log_batch_with_resource_not_found(self):
283287
{"nextSequenceToken": "12345"},
284288
]
285289

286-
# Mock the create methods
290+
# Mock the create methods — stream creation succeeds (group exists)
287291
mock_create_group = Mock()
288292
mock_create_stream = Mock()
289293
self.log_client._create_log_group_if_needed = mock_create_group
@@ -292,13 +296,47 @@ def test_send_log_batch_with_resource_not_found(self):
292296
# Should not raise an exception and should create resources
293297
self.log_client._send_log_batch(batch)
294298

295-
# Verify that creation methods were called
296-
mock_create_group.assert_called_once()
299+
# Stream creation attempted first; group creation NOT needed
300+
mock_create_group.assert_not_called()
297301
mock_create_stream.assert_called_once()
298302

299303
# Verify put_log_events was called twice (initial attempt + retry)
300304
self.assertEqual(mock_put.call_count, 2)
301305

306+
def test_send_log_batch_with_resource_not_found_creates_group_and_stream(self):
307+
"""Test that both log group and stream are created when neither exists.
308+
309+
When CreateLogStream fails with ResourceNotFoundException, it means the
310+
log group doesn't exist either, so both are created.
311+
"""
312+
batch = self.log_client._create_event_batch()
313+
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
314+
315+
# Mock put_log_events to fail first, then succeed
316+
mock_put = self.log_client.logs_client.put_log_events
317+
mock_put.side_effect = [
318+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "PutLogEvents"),
319+
{"nextSequenceToken": "12345"},
320+
]
321+
322+
# First stream creation fails (no group), second succeeds
323+
mock_create_stream = Mock(
324+
side_effect=[
325+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "CreateLogStream"),
326+
None,
327+
]
328+
)
329+
mock_create_group = Mock()
330+
self.log_client._create_log_group_if_needed = mock_create_group
331+
self.log_client._create_log_stream_if_needed = mock_create_stream
332+
333+
self.log_client._send_log_batch(batch)
334+
335+
# Both should be called: stream (fail), group, stream (succeed)
336+
mock_create_group.assert_called_once()
337+
self.assertEqual(mock_create_stream.call_count, 2)
338+
self.assertEqual(mock_put.call_count, 2)
339+
302340
def test_send_log_batch_with_other_error(self):
303341
"""Test that non-ResourceNotFoundException errors are re-raised."""
304342
batch = self.log_client._create_event_batch()
@@ -313,6 +351,37 @@ def test_send_log_batch_with_other_error(self):
313351
with self.assertRaises(ClientError):
314352
self.log_client._send_log_batch(batch)
315353

354+
def test_send_log_batch_recovers_when_log_group_managed_externally(self):
355+
"""Test that send_log_batch succeeds when the log group exists but
356+
CreateLogGroup would be denied (IaC-managed log group scenario).
357+
358+
Since the stream is created first and the group already exists,
359+
CreateLogGroup is never called at all.
360+
"""
361+
batch = self.log_client._create_event_batch()
362+
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
363+
364+
# First put_log_events fails (stream doesn't exist yet), retry succeeds
365+
self.log_client.logs_client.put_log_events.side_effect = [
366+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "PutLogEvents"),
367+
{"nextSequenceToken": "12345"},
368+
]
369+
370+
# CreateLogGroup would be denied — but it should never be called
371+
self.log_client.logs_client.create_log_group.side_effect = ClientError(
372+
{"Error": {"Code": "AccessDeniedException"}}, "CreateLogGroup"
373+
)
374+
375+
# Should NOT raise — stream creation succeeds, group creation skipped
376+
self.log_client._send_log_batch(batch)
377+
378+
# CreateLogGroup was never attempted
379+
self.log_client.logs_client.create_log_group.assert_not_called()
380+
# CreateLogStream was called
381+
self.log_client.logs_client.create_log_stream.assert_called_once()
382+
# put_log_events was retried
383+
self.assertEqual(self.log_client.logs_client.put_log_events.call_count, 2)
384+
316385
def test_create_log_stream_if_needed_success(self):
317386
"""Test log stream creation when needed."""
318387
# This method should not raise an exception
@@ -671,16 +740,15 @@ def test_send_log_batch_retry_uses_suppress_instrumentation(self, mock_suppress)
671740
# Verify suppress_instrumentation was called:
672741
# 1. Initial _send_log_batch context
673742
# 2. Nested context in the retry block
674-
# 3. _create_log_group_if_needed context
675-
# 4. _create_log_stream_if_needed context
676-
self.assertEqual(mock_suppress.call_count, 4)
743+
# 3. _create_log_stream_if_needed context (stream-first, succeeds so no group creation)
744+
self.assertEqual(mock_suppress.call_count, 3)
677745
# Each context should have been properly entered and exited
678-
self.assertEqual(mock_context.__enter__.call_count, 4)
679-
self.assertEqual(mock_context.__exit__.call_count, 4)
746+
self.assertEqual(mock_context.__enter__.call_count, 3)
747+
self.assertEqual(mock_context.__exit__.call_count, 3)
680748

681749
# Verify AWS calls happened
682750
self.assertEqual(self.log_client.logs_client.put_log_events.call_count, 2)
683-
self.log_client.logs_client.create_log_group.assert_called_once()
751+
self.log_client.logs_client.create_log_group.assert_not_called()
684752
self.log_client.logs_client.create_log_stream.assert_called_once()
685753

686754
@patch("amazon.opentelemetry.distro.exporter.aws.metrics._cloudwatch_log_client.suppress_instrumentation")

0 commit comments

Comments
 (0)