Skip to content

Commit d9bbe80

Browse files
authored
Fix: _send_log_batch fails permanently when log group is managed by IaC (#689)
Fixes #688 # Problem When PutLogEvents fails with ResourceNotFoundException, the recovery path in _send_log_batch unconditionally calls _create_log_group_if_needed() before _create_log_stream_if_needed(). If the IAM role lacks logs:CreateLogGroup (because the log group is pre-created by Terraform/IaC), the AccessDeniedException propagates and CreateLogStream is never attempted. The EMF metrics exporter then fails permanently, logging errors every ~60s. # Fix Reorder the recovery path to try creating the log stream first. If that succeeds (log group already exists), log group creation is skipped entirely. Only if CreateLogStream itself fails with ResourceNotFoundException (meaning the group genuinely doesn't exist) does it fall back to creating both. # Changes _cloudwatch_log_client.py: Reordered resource creation in _send_log_batch — stream-first, group only if needed. test_cloudwatch_log_client.py: Updated existing tests to reflect stream-first ordering, added tests for the IaC-managed log group scenario and the both-missing scenario.
2 parents 1baa1c2 + 53e99fe commit d9bbe80

2 files changed

Lines changed: 90 additions & 14 deletions

File tree

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/aws/metrics/_cloudwatch_log_client.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,18 @@ def _send_log_batch(self, batch: LogEventBatch) -> None:
303303
logger.info("Log group or stream not found, creating resources and retrying")
304304
with suppress_instrumentation():
305305
try:
306-
# Create log group first
307-
self._create_log_group_if_needed()
308-
# Then create log stream
309-
self._create_log_stream_if_needed()
306+
# Try creating the log stream first — the log group
307+
# may already exist (e.g. managed by IaC)
308+
try:
309+
self._create_log_stream_if_needed()
310+
except ClientError as stream_error:
311+
stream_error_code = stream_error.response.get("Error", {}).get("Code")
312+
if stream_error_code == "ResourceNotFoundException":
313+
# Log group doesn't exist either, create both
314+
self._create_log_group_if_needed()
315+
self._create_log_stream_if_needed()
316+
else:
317+
raise
310318

311319
# Retry the PutLogEvents call
312320
response = self.logs_client.put_log_events(**put_log_events_input)

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/aws/metrics/test_cloudwatch_log_client.py

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,11 @@ def test_send_log_event_method_exists(self):
272272
self.fail(f"send_log_event raised an exception: {error}")
273273

274274
def test_send_log_batch_with_resource_not_found(self):
275-
"""Test lazy creation when put_log_events fails with ResourceNotFoundException."""
275+
"""Test lazy creation when put_log_events fails with ResourceNotFoundException.
276+
277+
The recovery path tries creating the log stream first. If that succeeds
278+
(log group already exists), log group creation is skipped entirely.
279+
"""
276280
batch = self.log_client._create_event_batch()
277281
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
278282

@@ -283,7 +287,7 @@ def test_send_log_batch_with_resource_not_found(self):
283287
{"nextSequenceToken": "12345"},
284288
]
285289

286-
# Mock the create methods
290+
# Mock the create methods — stream creation succeeds (group exists)
287291
mock_create_group = Mock()
288292
mock_create_stream = Mock()
289293
self.log_client._create_log_group_if_needed = mock_create_group
@@ -292,13 +296,47 @@ def test_send_log_batch_with_resource_not_found(self):
292296
# Should not raise an exception and should create resources
293297
self.log_client._send_log_batch(batch)
294298

295-
# Verify that creation methods were called
296-
mock_create_group.assert_called_once()
299+
# Stream creation attempted first; group creation NOT needed
300+
mock_create_group.assert_not_called()
297301
mock_create_stream.assert_called_once()
298302

299303
# Verify put_log_events was called twice (initial attempt + retry)
300304
self.assertEqual(mock_put.call_count, 2)
301305

306+
def test_send_log_batch_with_resource_not_found_creates_group_and_stream(self):
307+
"""Test that both log group and stream are created when neither exists.
308+
309+
When CreateLogStream fails with ResourceNotFoundException, it means the
310+
log group doesn't exist either, so both are created.
311+
"""
312+
batch = self.log_client._create_event_batch()
313+
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
314+
315+
# Mock put_log_events to fail first, then succeed
316+
mock_put = self.log_client.logs_client.put_log_events
317+
mock_put.side_effect = [
318+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "PutLogEvents"),
319+
{"nextSequenceToken": "12345"},
320+
]
321+
322+
# First stream creation fails (no group), second succeeds
323+
mock_create_stream = Mock(
324+
side_effect=[
325+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "CreateLogStream"),
326+
None,
327+
]
328+
)
329+
mock_create_group = Mock()
330+
self.log_client._create_log_group_if_needed = mock_create_group
331+
self.log_client._create_log_stream_if_needed = mock_create_stream
332+
333+
self.log_client._send_log_batch(batch)
334+
335+
# Both should be called: stream (fail), group, stream (succeed)
336+
mock_create_group.assert_called_once()
337+
self.assertEqual(mock_create_stream.call_count, 2)
338+
self.assertEqual(mock_put.call_count, 2)
339+
302340
def test_send_log_batch_with_other_error(self):
303341
"""Test that non-ResourceNotFoundException errors are re-raised."""
304342
batch = self.log_client._create_event_batch()
@@ -313,6 +351,37 @@ def test_send_log_batch_with_other_error(self):
313351
with self.assertRaises(ClientError):
314352
self.log_client._send_log_batch(batch)
315353

354+
def test_send_log_batch_recovers_when_log_group_managed_externally(self):
355+
"""Test that send_log_batch succeeds when the log group exists but
356+
CreateLogGroup would be denied (IaC-managed log group scenario).
357+
358+
Since the stream is created first and the group already exists,
359+
CreateLogGroup is never called at all.
360+
"""
361+
batch = self.log_client._create_event_batch()
362+
batch.add_event({"message": "test message", "timestamp": int(time.time() * 1000)}, 10)
363+
364+
# First put_log_events fails (stream doesn't exist yet), retry succeeds
365+
self.log_client.logs_client.put_log_events.side_effect = [
366+
ClientError({"Error": {"Code": "ResourceNotFoundException"}}, "PutLogEvents"),
367+
{"nextSequenceToken": "12345"},
368+
]
369+
370+
# CreateLogGroup would be denied — but it should never be called
371+
self.log_client.logs_client.create_log_group.side_effect = ClientError(
372+
{"Error": {"Code": "AccessDeniedException"}}, "CreateLogGroup"
373+
)
374+
375+
# Should NOT raise — stream creation succeeds, group creation skipped
376+
self.log_client._send_log_batch(batch)
377+
378+
# CreateLogGroup was never attempted
379+
self.log_client.logs_client.create_log_group.assert_not_called()
380+
# CreateLogStream was called
381+
self.log_client.logs_client.create_log_stream.assert_called_once()
382+
# put_log_events was retried
383+
self.assertEqual(self.log_client.logs_client.put_log_events.call_count, 2)
384+
316385
def test_create_log_stream_if_needed_success(self):
317386
"""Test log stream creation when needed."""
318387
# This method should not raise an exception
@@ -671,16 +740,15 @@ def test_send_log_batch_retry_uses_suppress_instrumentation(self, mock_suppress)
671740
# Verify suppress_instrumentation was called:
672741
# 1. Initial _send_log_batch context
673742
# 2. Nested context in the retry block
674-
# 3. _create_log_group_if_needed context
675-
# 4. _create_log_stream_if_needed context
676-
self.assertEqual(mock_suppress.call_count, 4)
743+
# 3. _create_log_stream_if_needed context (stream-first, succeeds so no group creation)
744+
self.assertEqual(mock_suppress.call_count, 3)
677745
# Each context should have been properly entered and exited
678-
self.assertEqual(mock_context.__enter__.call_count, 4)
679-
self.assertEqual(mock_context.__exit__.call_count, 4)
746+
self.assertEqual(mock_context.__enter__.call_count, 3)
747+
self.assertEqual(mock_context.__exit__.call_count, 3)
680748

681749
# Verify AWS calls happened
682750
self.assertEqual(self.log_client.logs_client.put_log_events.call_count, 2)
683-
self.log_client.logs_client.create_log_group.assert_called_once()
751+
self.log_client.logs_client.create_log_group.assert_not_called()
684752
self.log_client.logs_client.create_log_stream.assert_called_once()
685753

686754
@patch("amazon.opentelemetry.distro.exporter.aws.metrics._cloudwatch_log_client.suppress_instrumentation")

0 commit comments

Comments
 (0)