diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93f0cab4fc..6177babe1a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([#4360](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4360))
 - `opentelemetry-instrumentation-aiohttp-server`: Use `canonical` attribute of the `Resource` as a span name
   ([#3896](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3896))
+- `opentelemetry-instrumentation-flask`: Wrap the `wsgi_app` call in `try`/`finally` so the `http.server.active_requests` gauge is decremented when a request raises, preventing long-term gauge drift
+  ([#4433](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4433))
 
 ### Breaking changes
 
diff --git a/instrumentation/opentelemetry-instrumentation-flask/src/opentelemetry/instrumentation/flask/__init__.py b/instrumentation/opentelemetry-instrumentation-flask/src/opentelemetry/instrumentation/flask/__init__.py
index 02c6ea693f..269df59f94 100644
--- a/instrumentation/opentelemetry-instrumentation-flask/src/opentelemetry/instrumentation/flask/__init__.py
+++ b/instrumentation/opentelemetry-instrumentation-flask/src/opentelemetry/instrumentation/flask/__init__.py
@@ -416,48 +416,50 @@ def _start_response(status, response_headers, *args, **kwargs):
                     response_hook(span, status, response_headers)
             return start_response(status, response_headers, *args, **kwargs)
 
-        result = wsgi_app(wrapped_app_environ, _start_response)
-
-        # Note: Streaming response context cleanup is now handled in the Flask teardown function
-        # (_wrapped_teardown_request) to ensure proper cleanup following Logfire's recommendations
-        # for OpenTelemetry generator context management
-
-        if should_trace:
-            duration_s = default_timer() - start
-            # Get the span from wrapped_app_environ and re-create context manually
-            # to pass to histogram for exemplars generation
-            span = wrapped_app_environ.get(_ENVIRON_SPAN_KEY)
-            metrics_context = trace.set_span_in_context(span)
-
-            if duration_histogram_old:
-                duration_attrs_old = otel_wsgi._parse_duration_attrs(
-                    attributes, _StabilityMode.DEFAULT
-                )
+        try:
+            result = wsgi_app(wrapped_app_environ, _start_response)
 
-                if request_route:
-                    # http.target to be included in old semantic conventions
-                    duration_attrs_old[HTTP_TARGET] = str(request_route)
-                duration_histogram_old.record(
-                    max(round(duration_s * 1000), 0),
-                    duration_attrs_old,
-                    context=metrics_context,
-                )
-            if duration_histogram_new:
-                duration_attrs_new = otel_wsgi._parse_duration_attrs(
-                    attributes, _StabilityMode.HTTP
-                )
+            # Note: Streaming response context cleanup is now handled in the Flask teardown function
+            # (_wrapped_teardown_request) to ensure proper cleanup following Logfire's recommendations
+            # for OpenTelemetry generator context management
+
+            if should_trace:
+                duration_s = default_timer() - start
+                # Get the span from wrapped_app_environ and re-create context manually
+                # to pass to histogram for exemplars generation
+                span = wrapped_app_environ.get(_ENVIRON_SPAN_KEY)
+                metrics_context = trace.set_span_in_context(span)
+
+                if duration_histogram_old:
+                    duration_attrs_old = otel_wsgi._parse_duration_attrs(
+                        attributes, _StabilityMode.DEFAULT
+                    )
+
+                    if request_route:
+                        # http.target to be included in old semantic conventions
+                        duration_attrs_old[HTTP_TARGET] = str(request_route)
+                    duration_histogram_old.record(
+                        max(round(duration_s * 1000), 0),
+                        duration_attrs_old,
+                        context=metrics_context,
+                    )
+                if duration_histogram_new:
+                    duration_attrs_new = otel_wsgi._parse_duration_attrs(
+                        attributes, _StabilityMode.HTTP
+                    )
 
-                if request_route:
-                    duration_attrs_new[HTTP_ROUTE] = str(request_route)
+                    if request_route:
+                        duration_attrs_new[HTTP_ROUTE] = str(request_route)
 
-                duration_histogram_new.record(
-                    max(duration_s, 0),
-                    duration_attrs_new,
-                    context=metrics_context,
-                )
+                    duration_histogram_new.record(
+                        max(duration_s, 0),
+                        duration_attrs_new,
+                        context=metrics_context,
+                    )
 
-        active_requests_counter.add(-1, active_requests_count_attrs)
-        return result
+            return result
+        finally:
+            active_requests_counter.add(-1, active_requests_count_attrs)
 
 
 def _should_trace(excluded_urls) -> bool:
     return bool(
diff --git a/instrumentation/opentelemetry-instrumentation-flask/tests/test_programmatic.py b/instrumentation/opentelemetry-instrumentation-flask/tests/test_programmatic.py
index 039ae9db83..eb3d4f6e1f 100644
--- a/instrumentation/opentelemetry-instrumentation-flask/tests/test_programmatic.py
+++ b/instrumentation/opentelemetry-instrumentation-flask/tests/test_programmatic.py
@@ -836,6 +836,41 @@ def test_duration_histogram_new_record_with_context_new_semconv(self):
                 span_arg.context.span_id, finished_span.context.span_id
             )
 
+    def test_active_requests_counter_decremented_on_error(self):
+        """Regression test for https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4431.
+
+        http.server.active_requests must be decremented even when the view
+        raises an exception, so it does not permanently leak.
+        """
+        self.client.get("/hello/123")
+
+        resp = self.client.get("/hello/500")
+        self.assertEqual(500, resp.status_code)
+
+        metrics_list = self.memory_metrics_reader.get_metrics_data()
+        active_requests_value = None
+        for resource_metric in metrics_list.resource_metrics:
+            for scope_metric in resource_metric.scope_metrics:
+                for metric in scope_metric.metrics:
+                    if metric.name != "http.server.active_requests":
+                        continue
+                    points = [
+                        p.value
+                        for p in metric.data.data_points
+                        if isinstance(p, NumberDataPoint)
+                    ]
+                    if points:
+                        active_requests_value = points[-1]
+
+        self.assertIsNotNone(
+            active_requests_value,
+            "http.server.active_requests metric not found",
+        )
+        self.assertEqual(
+            0,
+            active_requests_value,
+            f"active_requests counter leaked: expected 0 but got {active_requests_value}",
+        )
+
 
 class TestProgrammaticHooks(InstrumentationTest, WsgiTestBase):
     def setUp(self):
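
For context when reviewing: the sketch below (not part of the patch) distills the failure mode the `try`/`finally` fixes. `ActiveRequestsMiddleware`, `_raising_app`, and the plain integer counter are hypothetical stand-ins for the instrumentation's wrapped `wsgi_app` and its `UpDownCounter`. Before this change the decrement ran after the WSGI call on the happy path only, so every request that raised left the counter incremented forever.

```python
# Minimal sketch, standard library only: the decrement must sit in a
# ``finally`` block so it runs on both return and raise. All names here
# are illustrative, not the real instrumentation API.


class ActiveRequestsMiddleware:
    def __init__(self, wsgi_app):
        self.wsgi_app = wsgi_app
        self.active_requests = 0  # stand-in for the UpDownCounter

    def __call__(self, environ, start_response):
        self.active_requests += 1  # +1 on request entry
        try:
            return self.wsgi_app(environ, start_response)
        finally:
            # Runs whether the wrapped app returns or raises, so the
            # counter cannot drift upward on failing requests.
            self.active_requests -= 1


def _raising_app(environ, start_response):
    raise ValueError("boom")  # simulates a view raising mid-request


app = ActiveRequestsMiddleware(_raising_app)
try:
    app({}, lambda status, headers: None)
except ValueError:
    pass
assert app.active_requests == 0  # decremented despite the exception
```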