Skip to content

Commit 235ef4c

Browse files
authored
audit and docs update of logging (#54)
1 parent db6d8aa commit 235ef4c

9 files changed

Lines changed: 52 additions & 17 deletions

File tree

CHANGELOG.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@
5959
`InvalidDocument` no longer has a .message attribute
6060

6161
Exceptions now invoke `super().__init__(args)`
62+
63+
`tldextract` is now required for installation and used by default.
6264

6365
New Functionality
6466

README.rst

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,32 +75,48 @@ Logging
7575

7676
This package utilizes extensive logging to help developers pinpoint problems.
7777

78-
* ``log.debug``
78+
* ``log.debug`` (10)
7979
This log level is mostly used to handle library maintenance and
8080
troubleshooting, aka "Library Debugging". Library Debugging is verbose, but
8181
is nested under ``if __debug__:`` statements, so it is compiled away when
8282
PYTHONOPTIMIZE is set.
8383
Several sections of logic useful to developers will also emit logging
8484
statements at the ``debug`` level, regardless of PYTHONOPTIMIZE.
8585

86-
* ``log.info``
87-
Currently unused
86+
* ``log.info`` (20)
87+
This log level is only used during package initialization to notify if
88+
the ``tldextract`` package is being utilized or not.
8889

89-
* ``log.warning``
90+
* ``log.warning`` (30)
9091
Currently unused
9192

92-
* ``log.error``
93-
This log level is mostly used to alert developers of errors that were
93+
* ``log.error`` (40)
94+
This log level records each URL for which a parse is attempted.
95+
96+
This log level is mostly used to alert users of errors that were
9497
encountered during url fetching and document parsing, and often emits a log
9598
statement just before an Exception is raised. The log statements will contain
9699
at least the exception type, and may contain the active URL and additional
97100
debugging information, if any of that information is available.
101+
102+
URLs that trigger error logging should be collected and run on a secondary
103+
system that utilizes ``log.debug`` without PYTHONOPTIMIZE.
98104

99-
* ``log.critical``
105+
106+
* ``log.critical`` (50)
100107
Currently unused
101108

102109

103-
It is STRONGLY recommended to keep Python's logging at ``debug``.
110+
It is STRONGLY recommended to keep Python's logging at ``debug`` and not run
111+
PYTHONOPTIMIZE if you are new to this package.
112+
113+
For experienced users, running under PYTHONOPTIMIZE (omitting debug logging) is
114+
designed to make the system run as fast as possible. The intent of
115+
``log.error`` is to present you with a feed of URLs as they are processed, and
116+
show any errors that arise. Any URLs that trigger errors should then be run on a
117+
second system that enables debug logging to pinpoint the error. This allows one
118+
to split a deployment into production and R&D/troubleshooting, to maximize
119+
the throughput of the production system.
104120

105121

106122
Optional Integrations

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
"BeautifulSoup4>4.13.0,<4.14.0",
3131
"requests>=2.19.1",
3232
"requests-toolbelt>=0.8.0",
33+
"tldextract<6.0.0", # `.registered_domain` deprecation
3334
"typing_extensions",
3435
]
3536

@@ -38,10 +39,11 @@
3839

3940
tests_require = [
4041
"httpbin",
42+
"flake8",
43+
"flake8-import-order>=0.19.2",
4144
"pytest",
4245
"pytest-httpbin",
4346
"responses",
44-
"tldextract<6.0.0", # `.registered_domain` deprecation
4547
"types-beautifulsoup4",
4648
"types-requests",
4749
"werkzeug<2.1.0", # httpbin compat issue

src/metadata_parser/__init__.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252

5353
if TYPE_CHECKING:
5454
from bs4 import Tag as _bs4_Tag
55+
5556
from .typing import TYPE_ENCODER
5657
from .typing import TYPE_REQUESTS_TIMEOUT
5758
from .typing import TYPE_URL_FETCH
@@ -86,6 +87,10 @@
8687

8788
USE_TLDEXTRACT = True
8889

90+
log.info("`tldextract` support enabled.")
91+
else:
92+
log.info("`tldextract` support disabled.")
93+
8994
# ------------------------------------------------------------------------------
9095

9196

@@ -189,7 +194,7 @@ def is_parsed_valid_url(
189194
parsed = parsed.decode()
190195
assert isinstance(parsed, ParseResult)
191196
if __debug__:
192-
log.debug("is_parsed_valid_url = %s", parsed)
197+
log.debug("is_parsed_valid_url(parsed=%s", parsed)
193198
if not all((parsed.scheme, parsed.netloc)):
194199
if __debug__:
195200
log.debug(" FALSE - missing `scheme` or `netloc`")
@@ -528,6 +533,9 @@ def log(
528533
logger: Callable[..., None] = log.error,
529534
) -> None:
530535
"""
536+
Invoked to log troubleshooting information when an error is encountered.
537+
By default this goes to `log.error`.
538+
531539
:param prefix: Prefix for logging, defaults to "ResponseHistory"
532540
:type prefix: str
533541
:param logger: default `log.error`
@@ -1476,7 +1484,9 @@ def make_soup(self, html, **kwargs_bs):
14761484
doc = BeautifulSoup(html, "lxml", **kwargs_bs)
14771485
except Exception as exc: # noqa: F841
14781486
if __debug__:
1479-
log.debug("`BeautifulSoup` could not parse with `lxml`")
1487+
log.error(
1488+
"`BeautifulSoup` could not parse with `lxml`; attempting `html.parser`"
1489+
)
14801490
doc = BeautifulSoup(html, "html.parser", **kwargs_bs)
14811491
return doc
14821492

@@ -1566,7 +1576,7 @@ def parse(
15661576
pass
15671577
except Exception as exc:
15681578
if __debug__:
1569-
log.debug("Ran into a serious error parsing `og`: %s", exc)
1579+
log.error("Ran into a serious error parsing `og`: %s", exc)
15701580
pass
15711581

15721582
twitters = doc_searchpath.find_all("meta", attrs={"name": RE_prefix_twitter})

src/metadata_parser/exceptions.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
if TYPE_CHECKING:
66
import requests
7+
78
from . import MetadataParser
89
from .typing import TYPES_RESPONSE
910

src/metadata_parser/requests_extensions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import _socket # noqa: I201
22

33
# stdlib
4-
import cgi # noqa: I100
4+
import cgi # noqa: I202
55
import logging
66
import socket
77
from typing import Optional
@@ -18,9 +18,9 @@
1818
from .utils import DummyResponse
1919
from .utils import safe_sample
2020

21-
2221
if TYPE_CHECKING:
2322
from requests.structures import CaseInsensitiveDict
23+
2424
from .typing import TYPES_PEERNAME
2525
from .typing import TYPES_RESPONSE
2626

src/metadata_parser/typing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
from typing_extensions import Protocol # py38
1212

1313
if TYPE_CHECKING:
14-
import requests
1514
from urllib.parse import ParseResult
15+
16+
import requests
17+
1618
from . import DummyResponse
1719
from . import ResponseHistory
1820

src/metadata_parser/utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from . import config
2525
from .regex import RE_rfc3986_valid_characters
2626

27-
2827
if TYPE_CHECKING:
2928
from urllib.parse import ParseResult
3029

@@ -156,7 +155,7 @@ def fix_unicode_url(
156155
candidate[_idx] = parsed[_idx]
157156
candidate[_idx] = url_quote(url_unquote(candidate[_idx]))
158157
except Exception as exc:
159-
log.debug("fix_unicode_url failure: %s | %s | %s", url, encoding, exc)
158+
log.error("fix_unicode_url failure: %s | %s | %s", url, encoding, exc)
160159
return url
161160
_url = urlunparse(candidate)
162161
return _url

tests/test_document_parsing.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,9 @@ def test_default_encoder(self):
439439

440440
class _TestDocumentParsingCore:
441441

442+
assertIn: Callable
443+
assertEqual: Callable
444+
442445
def _MakeOne(self, filename):
443446
"""lazy cache of files as needed"""
444447
global CACHED_FILESYSTEM_DOCUMENTS

0 commit comments

Comments
 (0)