Skip to content

Commit bdccc54

Browse files
authored
Merge pull request #211 from fairagro/improve_error_handling
Improve error handling
2 parents a713823 + 0125ce0 commit bdccc54

9 files changed

Lines changed: 411 additions & 143 deletions

File tree

middleware/http_session/__init__.py

Lines changed: 106 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
HttpSessionConfig.
44
"""
55

6+
import asyncio
67
from typing import Annotated, NamedTuple, Optional, Type
78
from types import TracebackType
89
from urllib.parse import urlparse
910
from pathlib import PurePath
10-
from aiohttp import ClientSession, TCPConnector, ClientTimeout
11+
from aiohttp import ClientError, ClientSession, TCPConnector, ClientTimeout
12+
from opentelemetry import trace
13+
from opentelemetry.semconv.attributes import url_attributes
1114
import aiofiles
1215
import chardet
1316

@@ -17,11 +20,33 @@ class HttpSessionConfig(NamedTuple):
1720
A simple configuration class for the HttpSession class.
1821
"""
1922

20-
connection_limit: Annotated[int, "maximum number of conconcurrent http connections"]
21-
receive_timeout: Annotated[int, "timeout in seconds until an http connection is established"]
23+
connection_limit: Annotated[int,
24+
"maximum number of conconcurrent http connections"]
25+
receive_timeout: Annotated[int,
26+
"timeout in seconds until an http connection is established"]
2227
connect_timeout: Annotated[int, "timeout in seconds for an http download"]
2328

2429

30+
class HttpSessionFetchError(Exception):
31+
"""Base class for fetching errors."""
32+
33+
34+
class HttpSessionArgumentError(HttpSessionFetchError):
35+
"""A input argument is not supported (aka validation error)"""
36+
37+
38+
class HttpSessionTechnicalError(HttpSessionFetchError):
39+
"""URL could not be reached at all (network, DNS, timeout)."""
40+
41+
42+
class HttpSessionResponseError(HttpSessionFetchError):
43+
"""Server/file responded with error (4xx/5xx, file not found, etc.)."""
44+
45+
46+
class HttpSessionDecodeError(HttpSessionFetchError):
47+
"""Response could not be decoded with detected encoding."""
48+
49+
2550
class HttpSession(ClientSession):
2651
"""
2752
A wrapper around aiohttp.ClientSession that adds the method get_decoded_url for automatic
@@ -44,12 +69,17 @@ def __init__(self, config: HttpSessionConfig) -> None:
4469
-------
4570
None
4671
"""
47-
connector = TCPConnector(limit=config.connection_limit)
48-
timeout = ClientTimeout(
49-
total=None, # disable total timeout as this might trigger if we are out of connections
50-
sock_read=config.receive_timeout,
51-
sock_connect=config.connect_timeout)
52-
super().__init__(connector=connector, timeout=timeout, raise_for_status=True)
72+
try:
73+
connector = TCPConnector(limit=config.connection_limit)
74+
timeout = ClientTimeout(
75+
# disable total timeout which might trigger if we are out of connections
76+
total=None,
77+
sock_read=config.receive_timeout,
78+
sock_connect=config.connect_timeout)
79+
super().__init__(connector=connector, timeout=timeout, raise_for_status=True)
80+
except Exception as e:
81+
raise HttpSessionArgumentError(
82+
"Could not create HttpSession object") from e
5383

5484
async def __aenter__(self) -> "HttpSession":
5585
"""
@@ -60,7 +90,7 @@ async def __aenter__(self) -> "HttpSession":
6090
HttpSession
6191
The session itself.
6292
"""
63-
await super().__aenter__() # just in case there might be any side-effects
93+
await super().__aenter__() # just in case there might be any side-effects
6494
return self
6595

6696
async def __aexit__(
@@ -102,25 +132,70 @@ async def get_decoded_url(self, url: str) -> str:
102132
The decoded content of the URL.
103133
"""
104134

105-
parsed_url = urlparse(url)
106-
if parsed_url.scheme in ["http", "https"]:
107-
async with self.get(url) as response:
108-
encoded_content = await response.read()
109-
encoding = str(chardet.detect(encoded_content)['encoding'])
110-
content = encoded_content.decode(encoding)
111-
elif parsed_url.scheme == "file":
112-
# We need to deal with the following situation:
113-
# urlparse('file://test') => netloc = 'test', path = '', joined = 'test'
114-
# urlparse('file:///test') => netloc = '', path = '/test', joined = '\test'
115-
# urlparse('file://./test') => netloc = '.', path = '/test', joined = '\test'
116-
# In the last case the path is relative, so the result is wrong. Thus this code:
117-
base_path = PurePath(parsed_url.netloc)
118-
if base_path == PurePath('.'):
119-
path = base_path / parsed_url.path.lstrip("/").lstrip("\\")
135+
with trace.get_tracer(__name__).start_as_current_span(
136+
"HttpSession.get_decoded_url") as otel_span:
137+
otel_span.set_attribute(url_attributes.URL_FULL, url)
138+
139+
parsed_url = urlparse(url) # does not raise
140+
if parsed_url.scheme in ["http", "https"]:
141+
try:
142+
async with self.get(url) as response:
143+
# treat 5xx like a technical network error
144+
if 500 <= response.status < 600:
145+
otel_span.add_event(
146+
"server reponse 5xx, raising HttpSessionTechnicalError")
147+
raise HttpSessionTechnicalError(
148+
f"Server error {response.status} for {url}"
149+
)
150+
# treat 4xx as response error
151+
if 400 <= response.status < 500:
152+
otel_span.add_event(
153+
"server reponse 4xx, raising HttpSessionResponseError")
154+
raise HttpSessionResponseError(
155+
f"Server error {response.status} for {url}"
156+
)
157+
encoded_content = await response.read()
158+
except (ClientError, asyncio.TimeoutError) as e:
159+
otel_span.record_exception(e)
160+
otel_span.add_event(
161+
"caught network-related exception, raising HttpSessionTechnicalError")
162+
raise HttpSessionTechnicalError(
163+
f"Cannot fetch {url}: {e}") from e
164+
elif parsed_url.scheme == "file":
165+
try:
166+
# We need to deal with the following situation:
167+
# urlparse('file://test') => netloc = 'test', path = '', joined = 'test'
168+
# urlparse('file:///test') => netloc = '', path = '/test', joined = '\test'
169+
# urlparse('file://./test') => netloc = '.', path = '/test', joined = '\test'
170+
# In the last case the path is relative, so the result is wrong. Thus this code:
171+
base_path = PurePath(parsed_url.netloc)
172+
if base_path == PurePath('.'):
173+
path = base_path / parsed_url.path.lstrip("/").lstrip("\\")
174+
else:
175+
path = base_path / parsed_url.path
176+
async with aiofiles.open(path, 'rb') as f:
177+
encoded_content = await f.read()
178+
except Exception as e:
179+
otel_span.record_exception(e)
180+
otel_span.add_event(
181+
"caught exception when trying to read file, "
182+
"raising HttpSessionResponseError")
183+
raise HttpSessionResponseError(
184+
f"Cannot read file {url}: {e}") from e
120185
else:
121-
path = base_path / parsed_url.path
122-
async with aiofiles.open(path, 'r') as f:
123-
content = await f.read()
124-
else:
125-
raise ValueError(f"Unsupported scheme: {parsed_url.scheme}")
126-
return content
186+
otel_span.add_event(
187+
"found unsupported URL protocol, raising HttpSessionArgumentError")
188+
raise HttpSessionArgumentError(
189+
f"Unsupported URL scheme: {parsed_url.scheme} in URL {url}")
190+
191+
try:
192+
encoding = str(chardet.detect(encoded_content)['encoding']) or 'utf-8'
193+
content = encoded_content.decode(encoding)
194+
except Exception as e:
195+
otel_span.record_exception(e)
196+
otel_span.add_event(
197+
"caught exception during decoding, raising HttpSessionDecodeError")
198+
raise HttpSessionDecodeError(
199+
f"cannot decode URL content from {url}: {e}") from e
200+
201+
return content

middleware/http_session/test/test_http_session.py

Lines changed: 160 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,55 +3,180 @@
33
"""
44

55
import unittest
6-
from unittest.mock import AsyncMock, patch
7-
8-
from middleware.http_session import HttpSession, HttpSessionConfig
6+
from unittest.mock import AsyncMock, patch, MagicMock
7+
from aiohttp import ClientError
8+
from middleware.http_session import (
9+
HttpSession,
10+
HttpSessionTechnicalError,
11+
HttpSessionResponseError,
12+
HttpSessionDecodeError,
13+
HttpSessionArgumentError
14+
)
915

1016

1117
class TestHttpSession(unittest.IsolatedAsyncioTestCase):
1218
"""
1319
The main test class
1420
"""
1521

16-
async def test_get_decoded_url(self):
22+
async def asyncSetUp(self):
23+
self.session = HttpSession(config=MagicMock())
24+
25+
@patch("middleware.http_session.trace.get_tracer")
26+
@patch("middleware.http_session.chardet.detect", return_value={"encoding": "utf-8"})
27+
@patch("middleware.http_session.HttpSession.get")
28+
async def test_http_success(self, mock_get, _mock_chardet, _mock_tracer):
29+
"""
30+
test good http path
31+
"""
32+
mock_response = AsyncMock()
33+
mock_response.status = 200
34+
mock_response.read = AsyncMock(return_value=b"hello world")
35+
mock_get.return_value = MagicMock(
36+
__aenter__=AsyncMock(return_value=mock_response),
37+
__aexit__=AsyncMock(return_value=None)
38+
)
39+
40+
content = await self.session.get_decoded_url("http://example.com")
41+
self.assertEqual(content, "hello world")
42+
43+
@patch("middleware.http_session.trace.get_tracer")
44+
@patch("middleware.http_session.HttpSession.get")
45+
async def test_http_5xx_raises_technical_error(self, mock_get, _mock_tracer):
46+
"""
47+
test http 5xx raises technical error
48+
"""
49+
mock_response = AsyncMock()
50+
mock_response.status = 502
51+
mock_get.return_value = MagicMock(
52+
__aenter__=AsyncMock(return_value=mock_response),
53+
__aexit__=AsyncMock(return_value=None)
54+
)
55+
56+
with self.assertRaises(HttpSessionTechnicalError):
57+
await self.session.get_decoded_url("http://example.com")
58+
59+
@patch("middleware.http_session.trace.get_tracer")
60+
@patch("middleware.http_session.HttpSession.get")
61+
async def test_http_4xx_raises_response_error(self, mock_get, _mock_tracer):
62+
"""
63+
test http 4xx raises response error
64+
"""
65+
mock_response = AsyncMock()
66+
mock_response.status = 404
67+
mock_get.return_value = MagicMock(
68+
__aenter__=AsyncMock(return_value=mock_response),
69+
__aexit__=AsyncMock(return_value=None)
70+
)
71+
72+
with self.assertRaises(HttpSessionResponseError):
73+
await self.session.get_decoded_url("http://example.com")
74+
75+
@patch("middleware.http_session.trace.get_tracer")
76+
@patch("middleware.http_session.HttpSession.get")
77+
async def test_http_network_exception(self, mock_get, _mock_tracer):
78+
"""
79+
test network exception
80+
"""
81+
mock_get.return_value = MagicMock(
82+
__aenter__=AsyncMock(side_effect=ClientError("boom")),
83+
__aexit__=AsyncMock(return_value=None)
84+
)
85+
86+
with self.assertRaises(HttpSessionTechnicalError):
87+
await self.session.get_decoded_url("http://example.com")
88+
89+
@patch("middleware.http_session.trace.get_tracer")
90+
@patch("middleware.http_session.aiofiles.open")
91+
@patch("middleware.http_session.chardet.detect", return_value={"encoding": "utf-8"})
92+
async def test_file_success(self, _mock_chardet, mock_aiofiles, _mock_tracer):
93+
"""
94+
test good file path
95+
"""
96+
mock_file = AsyncMock()
97+
mock_file.read = AsyncMock(return_value=b"file content")
98+
mock_aiofiles.return_value.__aenter__.return_value = mock_file
99+
100+
content = await self.session.get_decoded_url("file:///tmp/test.txt")
101+
self.assertEqual(content, "file content")
102+
103+
@patch("middleware.http_session.trace.get_tracer")
104+
async def test_unsupported_scheme_raises_argument_error(self, _mock_tracer):
105+
"""
106+
test unsupported scheme raises argument error
107+
"""
108+
with self.assertRaises(HttpSessionArgumentError):
109+
await self.session.get_decoded_url("ftp://example.com")
110+
111+
@patch("middleware.http_session.trace.get_tracer")
112+
@patch("middleware.http_session.chardet.detect", return_value={"encoding": "utf-8"})
113+
@patch("middleware.http_session.HttpSession.get")
114+
async def test_decode_error_raises_decode_error(self, mock_get, _mock_chardet, _mock_tracer):
17115
"""
18-
This test was created automatically with the help of Codeium.
19-
It's trivial. Better tests of HttpSession.get_decoded_url would
20-
take the HttpSessionConfig into account. But this is far form
21-
trivial...
116+
test decode error raises decode error
22117
"""
23-
url = "https://example.com"
24-
content = b'Hello, \xe4\xb8\x96\xe7\x95\x8c!' # corresponds to "Hello, 世界!"
25-
encoding = "utf-8"
26-
session_config = HttpSessionConfig(**{
27-
'connection_limit': 1,
28-
'receive_timeout': 10,
29-
'connect_timeout': 10
30-
})
118+
mock_response = AsyncMock()
119+
mock_response.status = 200
120+
mock_response.read = AsyncMock(return_value=b"\xff\xff")
121+
mock_get.return_value = MagicMock(
122+
__aenter__=AsyncMock(return_value=mock_response),
123+
__aexit__=AsyncMock(return_value=None)
124+
)
125+
126+
with self.assertRaises(HttpSessionDecodeError):
127+
await self.session.get_decoded_url("http://example.com")
128+
129+
130+
# import unittest
131+
# from unittest.mock import AsyncMock, patch
132+
133+
# from middleware.http_session import HttpSession, HttpSessionConfig
134+
135+
136+
# class TestHttpSession(unittest.IsolatedAsyncioTestCase):
137+
# """
138+
# The main test class
139+
# """
140+
141+
# async def test_get_decoded_url(self):
142+
# """
143+
# This test was created automatically with the help of Codeium.
144+
# It's trivial. Better tests of HttpSession.get_decoded_url would
145+
# take the HttpSessionConfig into account. But this is far form
146+
# trivial...
147+
# """
148+
# url = "https://example.com"
149+
# content = b'Hello, \xe4\xb8\x96\xe7\x95\x8c!' # corresponds to "Hello, 世界!"
150+
# encoding = "utf-8"
151+
# session_config = HttpSessionConfig(**{
152+
# 'connection_limit': 1,
153+
# 'receive_timeout': 10,
154+
# 'connect_timeout': 10
155+
# })
31156

32-
# Mock the response object
33-
response = AsyncMock()
34-
response.read.return_value = content
157+
# # Mock the response object
158+
# response = AsyncMock()
159+
# response.read.return_value = content
35160

36-
# Patch the chardet.detect function to return the expected encoding
37-
with patch("chardet.detect") as mock_detect:
38-
mock_detect.return_value = {"encoding": encoding}
161+
# # Patch the chardet.detect function to return the expected encoding
162+
# with patch("chardet.detect") as mock_detect:
163+
# mock_detect.return_value = {"encoding": encoding}
39164

40-
# Create an instance of the class
41-
async with HttpSession(session_config) as session:
165+
# # Create an instance of the class
166+
# async with HttpSession(session_config) as session:
42167

43-
# Patch the get method to return the mocked response
44-
with patch.object(session, "get") as mock_get:
45-
mock_get.return_value.__aenter__.return_value = response
168+
# # Patch the get method to return the mocked response
169+
# with patch.object(session, "get") as mock_get:
170+
# mock_get.return_value.__aenter__.return_value = response
46171

47-
# Call the function under test
48-
decoded_content = await session.get_decoded_url(url)
172+
# # Call the function under test
173+
# decoded_content = await session.get_decoded_url(url)
49174

50-
# Assert the decoded content
51-
self.assertEqual(decoded_content, content.decode(encoding))
175+
# # Assert the decoded content
176+
# self.assertEqual(decoded_content, content.decode(encoding))
52177

53-
# Assert that the get method was called with the correct URL
54-
mock_get.assert_called_once_with(url)
178+
# # Assert that the get method was called with the correct URL
179+
# mock_get.assert_called_once_with(url)
55180

56-
# Assert that the chardet.detect function was called with the correct content
57-
mock_detect.assert_called_once_with(content)
181+
# # Assert that the chardet.detect function was called with the correct content
182+
# mock_detect.assert_called_once_with(content)

0 commit comments

Comments
 (0)