From 4fa19b7504854c61304f193a207b9a89b56d765c Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 23 May 2025 13:49:34 -0400 Subject: [PATCH 1/4] update testharness improve typing remove deprecations adjust tests for deprecations --- .github/workflows/python-package.yml | 9 +- CHANGELOG.txt | 9 + setup.py | 2 +- src/metadata_parser/__init__.py | 242 ++++++++++++++------------- tests/test_document_parsing.py | 105 ++++++------ tests/test_ip_tracking.py | 7 +- tox.ini | 2 +- 7 files changed, 206 insertions(+), 170 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4be57c3..22c1e89 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,20 +15,15 @@ jobs: strategy: matrix: os: - - "ubuntu-latest" + - "ubuntu-22.04" python-version: + - "3.7" - "3.8" - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" - include: - # ubuntu-latest[22.04] does not have: py36 - - os: "ubuntu-20.04" - python-version: "3.6" - - os: "ubuntu-20.04" - python-version: "3.7" steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 65ed432..fdbe5e8 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,6 +1,15 @@ 1.0 1.0 will be a complete api overhaul + +0.13.0rc0 + * drop py36; no test options due to github deprecation of ubuntu20.04 + * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a + ValueError if a string other than "all" is submitted. The only valid + string is "all", otherwise a list of string - excluding "all" - must be + submitted. Warnings of this have been emitted for several years. 
+ * __init__(`search_head_only`) now defaults to False + 0.12.3 * pin "BeautifulSoup4<4.15.0" * See `https://git.launchpad.net/beautifulsoup/tree/CHANGELOG` diff --git a/setup.py b/setup.py index 5d1daaa..9cb3195 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ "BeautifulSoup4<4.15.0", "requests>=2.19.1", "requests-toolbelt>=0.8.0", + "typing_extensions", ] if sys.version_info.major == 2: requires.append("backports.html") @@ -59,7 +60,6 @@ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py index 04df982..4ea70b6 100644 --- a/src/metadata_parser/__init__.py +++ b/src/metadata_parser/__init__.py @@ -10,6 +10,7 @@ import re import socket # peername hack, see below import typing +from typing import Any from typing import Callable from typing import Dict from typing import Iterable @@ -33,6 +34,7 @@ import requests from requests.structures import CaseInsensitiveDict from requests_toolbelt.utils.deprecated import get_encodings_from_content +from typing_extensions import Literal # py38 if TYPE_CHECKING: from bs4 import Tag as _bs4_Tag @@ -46,7 +48,7 @@ # ============================================================================== -__VERSION__ = "0.12.3" +__VERSION__ = "0.13.0rc0" # ------------------------------------------------------------------------------ @@ -88,7 +90,9 @@ def warn_user(message: str) -> None: TYPES_RESPONSE = Union["DummyResponse", requests.Response] TYPES_PEERNAME = Tuple[str, int] # (ip, port) TYPE_URL_FETCH = Tuple[str, str, "ResponseHistory"] - +TYPE_REQUESTS_TIMEOUT = Optional[ + Union[int, float, Tuple[int, int], Tuple[float, float]] +] # ------------------------------------------------------------------------------ @@ -352,15 
+356,15 @@ def _get_socket() -> Optional[socket.socket]: i += 1 try: if i == 1: - sock = resp.raw._connection.sock + sock = resp.raw._connection.sock # type: ignore[union-attr] elif i == 2: - sock = resp.raw._connection.sock.socket + sock = resp.raw._connection.sock.socket # type: ignore[union-attr] elif i == 3: - sock = resp.raw._fp.fp._sock + sock = resp.raw._fp.fp._sock # type: ignore[union-attr] elif i == 4: - sock = resp.raw._fp.fp._sock.socket + sock = resp.raw._fp.fp._sock.socket # type: ignore[union-attr] elif i == 5: - sock = resp.raw._fp.fp.raw._sock + sock = resp.raw._fp.fp.raw._sock # type: ignore[union-attr] else: break if not isinstance(sock, _compatible_sockets): @@ -582,7 +586,7 @@ def is_parsed_valid_relative(parsed: ParseResult) -> bool: def parsed_to_relative( parsed: ParseResult, - parsed_fallback: Optional[str] = None, + parsed_fallback: Optional[ParseResult] = None, ) -> str: """turns a parsed url into a full relative path""" assert isinstance(parsed, ParseResult) @@ -611,7 +615,7 @@ def parsed_to_relative( def fix_unicode_url( url: str, encoding: Optional[str] = None, - urlparser: Callable = urlparse, + urlparser: Callable[[str], ParseResult] = urlparse, ) -> str: """ some cms systems will put unicode in their canonical url @@ -650,8 +654,8 @@ def is_url_valid( url: str, require_public_netloc: Optional[bool] = None, allow_localhosts: Optional[bool] = None, - urlparser: Callable = urlparse, -) -> bool: + urlparser: Callable[[str], ParseResult] = urlparse, +) -> Union[Literal[False], ParseResult]: """ tries to parse a url. if valid returns `ParseResult` (boolean eval is True); if invalid returns `False` @@ -678,7 +682,7 @@ def url_to_absolute_url( url_fallback: Optional[str] = None, require_public_netloc: Optional[bool] = None, allow_localhosts: Optional[bool] = None, - urlparser: Callable = urlparse, + urlparser: Callable[[str], ParseResult] = urlparse, ) -> Optional[str]: """ returns an "absolute url" if we have one. 
@@ -714,6 +718,9 @@ def url_to_absolute_url( if url_test is None and url_fallback is not None: return url_fallback + if TYPE_CHECKING: + assert url_test is not None + parsed = urlparser(url_test) # if we passed in a url, we can't remount it onto another domain @@ -749,6 +756,8 @@ def url_to_absolute_url( # this can happen if someone puts in "" for the canonical # but this can also happen if we have different domains... if url_fallback: + if TYPE_CHECKING: + assert parsed_fallback is not None if (parsed_fallback.scheme == parsed.scheme) or ( parsed_fallback.netloc == parsed.netloc ): @@ -771,6 +780,8 @@ def url_to_absolute_url( # ok, the URL isn't valid # can we re-assemble it if url_fallback: + if TYPE_CHECKING: + assert parsed_fallback is not None if is_parsed_valid_url( parsed_fallback, require_public_netloc=require_public_netloc, @@ -791,6 +802,8 @@ def url_to_absolute_url( class InvalidDocument(Exception): + message: str + def __init__(self, message: str = ""): self.message = message @@ -799,6 +812,11 @@ def __str__(self) -> str: class NotParsable(Exception): + raised: Optional[requests.exceptions.RequestException] + code: Optional[int] + metadataParser: Optional["MetadataParser"] + response: Optional[TYPES_RESPONSE] + def __init__( self, message: str = "", @@ -856,6 +874,11 @@ class RedirectDetected(Exception): ``response``: actual response object """ + location: str + code: Optional[int] + response: Optional[TYPES_RESPONSE] + metadataParser: Optional["MetadataParser"] + def __init__( self, location: str = "", @@ -945,7 +968,7 @@ def __init__(self, resp: TYPES_RESPONSE): def log( self, prefix: str = "ResponseHistory", - logger: Callable = log.error, + logger: Callable[..., None] = log.error, ) -> None: """ :param prefix: Prefix for logging, defaults to "ResponseHistory" @@ -1014,11 +1037,10 @@ class ParsedResult(object): None # only stashing `ResponseHistory` if we have it ) _version: int = 1 # version tracking - default_encoder: Optional[Callable] = None - + 
default_encoder: Optional[Callable[[str], str]] = None og_minimum_requirements: List = ["title", "type", "image", "url"] twitter_sections: List = ["card", "title", "site", "description"] - strategy: Union[List, str] = ["og", "dc", "meta", "page", "twitter"] + strategy: Union[List[str], str] = ["og", "dc", "meta", "page", "twitter"] _get_metadata__last_strategy: Optional[str] = None @@ -1074,21 +1096,13 @@ def _add_discovered( def _coerce_validate_strategy( self, - strategy: Union[list, str, None] = None, + strategy: Union[List[str], str, None] = None, ) -> Union[List, str]: """normalize a strategy into a valid option""" if strategy: if isinstance(strategy, str): if strategy != "all": - warn_user( - """If `strategy` is not a `list`, it should be 'all'.""" - """This is coerced into a list, but will be enforced.""" - ) - if strategy not in self.strategy: - raise ValueError("invalid strategy: %s" % strategy) - strategy = [ - strategy, - ] + raise ValueError("If `strategy` is not a `list`, it must be 'all'.") elif isinstance(strategy, list): _invalids = [] for _candidate in strategy: @@ -1107,7 +1121,7 @@ def get_metadata( self, field: str, strategy: Union[list, str, None] = None, - encoder: Optional[Callable] = None, + encoder: Optional[Callable[[str], str]] = None, ) -> Union[str, Dict[str, Union[str, Dict]], None]: """ LEGACY. DEPRECATED. DO NOT USE THIS. @@ -1157,8 +1171,9 @@ def get_metadata( function or "raw" """ warn_future( - """`get_metadata` returns a string and is being deprecated""" - """in favor of `get_metadatas` which returns a list.""" + """`ParsedResult.get_metadata` returns a string and is deprecated""" + """in favor of `get_metadatas` which returns a list. 
""" + """This will be removed in the next minor or major release.""" ) strategy = self._coerce_validate_strategy(strategy) self._get_metadata__last_strategy = None @@ -1217,8 +1232,8 @@ def _lookup(store: str) -> Optional[Union[str, Dict]]: def get_metadatas( self, field: str, - strategy: Union[list, str, None] = None, - encoder: Optional[Callable] = None, + strategy: Union[List[str], str, None] = None, + encoder: Optional[Callable[[str], str]] = None, ) -> Optional[Union[Dict, List]]: """ looks for the field in various stores. defaults to the core @@ -1349,65 +1364,69 @@ class MetadataParser(object): this can be necessary on development machines """ - url = None - url_actual = None - strategy = None - LEN_MAX_TITLE = 255 - only_parse_file_extensions = None - allow_localhosts = None - require_public_netloc = None - force_doctype = None - requests_timeout = None + url: Optional[str] = None + url_actual: Optional[str] = None + strategy: Union[List[str], str, None] = None + LEN_MAX_TITLE: int = 255 + only_parse_file_extensions: Optional[List[str]] = None + allow_localhosts: Optional[bool] = None + require_public_netloc: Optional[bool] = None + force_doctype: Optional[bool] = None + requests_timeout: TYPE_REQUESTS_TIMEOUT = None peername: Optional[TYPES_PEERNAME] = None - is_redirect = None - is_redirect_unique = None - is_redirect_same_host = None - - force_parse = None - force_parse_invalid_content_type = None - only_parse_http_ok = None - requests_session = None - derive_encoding = None - default_encoding = None - default_encoder: Optional[Callable] = None - support_malformed = None + is_redirect: Optional[bool] = None + is_redirect_unique: Optional[bool] = None + is_redirect_same_host: Optional[bool] = None + + force_parse: Optional[bool] = None + force_parse_invalid_content_type: Optional[bool] = None + only_parse_http_ok: Optional[bool] = None + requests_session: Optional[requests.Session] = None + derive_encoding: Optional[bool] = None + default_encoding: 
Optional[str] = None + default_encoder: Optional[Callable[[str], str]] = None + support_malformed: Optional[bool] = None + + urlparse: Callable[[str], ParseResult] # this has a per-parser default tuple # it can be upgraded manually - schemeless_fields_upgradeable = SCHEMELESS_FIELDS_UPGRADEABLE - schemeless_fields_disallow = SCHEMELESS_FIELDS_DISALLOW + schemeless_fields_upgradeable: Tuple[str, ...] = SCHEMELESS_FIELDS_UPGRADEABLE + schemeless_fields_disallow: Tuple[str, ...] = SCHEMELESS_FIELDS_DISALLOW - _content_types_parse = ("text/html",) - _content_types_noparse = ("application/json",) + _content_types_parse: Tuple[str, ...] = ("text/html",) + _content_types_noparse: Tuple[str, ...] = ("application/json",) + + response: Optional[TYPES_RESPONSE] def __init__( self, url: Optional[str] = None, html: Optional[str] = None, - strategy: Union[list, str, None] = None, - url_data=None, - url_headers=None, + strategy: Union[List[str], str, None] = None, + url_data: Optional[Dict[str, Any]] = None, + url_headers: Optional[Dict[str, str]] = None, force_parse: bool = False, ssl_verify: bool = True, - only_parse_file_extensions=None, - force_parse_invalid_content_type=False, - require_public_netloc=True, - allow_localhosts=None, - force_doctype=False, - requests_timeout=None, - raise_on_invalid=False, - search_head_only=None, - allow_redirects=True, - requests_session=None, - only_parse_http_ok=True, - defer_fetch=False, - derive_encoding=True, - html_encoding=None, - default_encoding=None, - default_encoder: Optional[Callable] = None, - retry_dropped_without_headers=None, - support_malformed=None, - cached_urlparser=True, + only_parse_file_extensions: Optional[List[str]] = None, + force_parse_invalid_content_type: bool = False, + require_public_netloc: bool = True, + allow_localhosts: Optional[bool] = None, + force_doctype: bool = False, + requests_timeout: TYPE_REQUESTS_TIMEOUT = None, + raise_on_invalid: bool = False, + search_head_only: bool = False, + allow_redirects: 
bool = True, + requests_session: Optional[requests.Session] = None, + only_parse_http_ok: bool = True, + defer_fetch: bool = False, + derive_encoding: bool = True, + html_encoding: Optional[str] = None, + default_encoding: Optional[str] = None, + default_encoder: Optional[Callable[[str], str]] = None, + retry_dropped_without_headers: Optional[bool] = None, + support_malformed: Optional[bool] = None, + cached_urlparser: Union[bool, int, Callable[[str], ParseResult]] = True, ): """ creates a new `MetadataParser` instance. @@ -1466,11 +1485,10 @@ def __init__( if True, will raise an InvalidDocument exception if the response does not look like a proper html document `search_head_only` - default: None - if `None` will default to True and emit a deprecation warning. + default: False if `True`, will only search the document head for meta information. `search_head_only=True` is the legacy behavior, but missed too many - bad html implementations. This will be set to `False` in the future. + bad html implementations. 
`allow_redirects` default: True passed onto `fetch_url`, which will pass it onto requests.get @@ -1504,7 +1522,7 @@ def __init__( options: True: use a instance of UrlParserCacheable(maxitems=30) : INT: use a instance of UrlParserCacheable(maxitems=cached_urlparser) : None/False/0 - use native urlparse - : other truthy values - use as a custom urlparse + : callable - use as a custom urlparse """ if __debug__: log.debug("MetadataParser.__init__(%s)", url) @@ -1513,16 +1531,17 @@ def __init__( self.parsed_result = ParsedResult() if cached_urlparser: if cached_urlparser is True: - cached_urlparser = UrlParserCacheable() # a cache - self._cached_urlparser = cached_urlparser # stash it - self.urlparse = cached_urlparser.urlparse + _cached_urlparser = UrlParserCacheable() # a cache + self._cached_urlparser = _cached_urlparser # stash it + self.urlparse = _cached_urlparser.urlparse elif isinstance(cached_urlparser, int): - cached_urlparser = UrlParserCacheable( + _cached_urlparser = UrlParserCacheable( maxitems=cached_urlparser ) # a cache - self._cached_urlparser = cached_urlparser # stash it - self.urlparse = cached_urlparser.urlparse + self._cached_urlparser = _cached_urlparser # stash it + self.urlparse = _cached_urlparser.urlparse else: + # TODO - raise value error if not callable self.urlparse = cached_urlparser else: self.urlparse = urlparse @@ -1541,12 +1560,6 @@ def __init__( self.force_parse = force_parse self.force_parse_invalid_content_type = force_parse_invalid_content_type self.only_parse_http_ok = only_parse_http_ok - if search_head_only is None: - warn_future( - """`search_head_only` was not provided and defaulting to `True` """ - """Future versions will default to `False`.""" - ) - search_head_only = True self.search_head_only = search_head_only self.raise_on_invalid = raise_on_invalid self.requests_session = requests_session @@ -1651,7 +1664,7 @@ def get_metadata( self, field: str, strategy: Union[list, str, None] = None, - encoder: Optional[Callable] = 
None, + encoder: Optional[Callable[[str], str]] = None, ) -> Union[str, Dict[str, Union[str, Dict]], None]: # deprecating in 1.0; operate on the result instead warn_future( @@ -1664,8 +1677,8 @@ def get_metadata( def get_metadatas( self, field, - strategy: Union[list, str, None] = None, - encoder: Optional[Callable] = None, + strategy: Union[List[str], str, None] = None, + encoder: Optional[Callable[[str], str]] = None, ) -> Optional[Union[Dict, List]]: # deprecating in 1.0; operate on the result instead warn_future( @@ -1697,18 +1710,18 @@ def _response_encoding(self) -> Optional[str]: def fetch_url( self, - url_data=None, - url_headers=None, - force_parse=None, - force_parse_invalid_content_type=None, - allow_redirects=None, - ssl_verify=None, - requests_timeout=None, - requests_session=None, - only_parse_http_ok=None, - derive_encoding=None, - default_encoding=None, - retry_dropped_without_headers=None, + url_data: Optional[Dict[str, Any]] = None, # ???: required + url_headers: Optional[Union[CaseInsensitiveDict, Dict[str, Any]]] = None, + force_parse: Optional[bool] = None, # `None` will use `self.force_parse` + force_parse_invalid_content_type: Optional[bool] = None, + allow_redirects: Optional[bool] = None, + ssl_verify: Optional[bool] = None, + requests_timeout: TYPE_REQUESTS_TIMEOUT = None, + requests_session: Optional[requests.Session] = None, + only_parse_http_ok: Optional[bool] = None, + derive_encoding: Optional[bool] = None, + default_encoding: Optional[str] = None, + retry_dropped_without_headers: Optional[bool] = None, ) -> TYPE_URL_FETCH: """ fetches the url and returns a tuple of (html, html_encoding). 
@@ -1758,6 +1771,7 @@ def fetch_url( else self.only_parse_http_ok ) if not force_parse and self.only_parse_file_extensions is not None: + assert self.url parsed = self.urlparse(self.url) path = parsed.path if path: @@ -1971,6 +1985,8 @@ def _run_in_session(_requests_session: requests.Session): except requests.exceptions.RequestException as error: if hasattr(error, "response") and (error.response is not None): + if TYPE_CHECKING: + assert error.response is not None self.response = error.response try: assert self.response is not None # mypy @@ -1990,7 +2006,9 @@ def _run_in_session(_requests_session: requests.Session): raised=error, metadataParser=self, ) - + if TYPE_CHECKING: + assert html is not None + assert html_encoding is not None return (html, html_encoding, response_history) def absolute_url(self, link: Optional[str] = None) -> Optional[str]: @@ -2566,7 +2584,7 @@ def get_discrete_url( def get_metadata_link( self, field: str, - strategy: Union[list, str, None] = None, + strategy: Union[List[str], str, None] = None, allow_encoded_uri: bool = False, require_public_global: bool = True, ) -> Optional[str]: @@ -2577,7 +2595,7 @@ def get_metadata_link( kwargs: strategy=None - ('all') or iterable ['og', 'dc', 'meta', 'page', 'twitter', ] + 'all' or List ['og', 'dc', 'meta', 'page', 'twitter', ] allow_encoded_uri=False require_public_global=True diff --git a/tests/test_document_parsing.py b/tests/test_document_parsing.py index a1e5d23..8516f04 100644 --- a/tests/test_document_parsing.py +++ b/tests/test_document_parsing.py @@ -722,22 +722,28 @@ def test_complex_html(self): dc_mixed_candidates[_key], dcTestMixedCandidates1aExpected[_key] ) # but we need to test get_metadata and get_metadatas + with self.assertRaises(ValueError) as cm: + parsed.get_metadata("TestMixedCandidates1a", strategy="dc") + self.assertEqual( + cm.exception.args[0], "If `strategy` is not a `list`, it must be 'all'." 
+ ) + self.assertEqual( - parsed.get_metadata("TestMixedCandidates1a", strategy="dc"), "Friendship" + parsed.get_metadata("TestMixedCandidates1a", strategy=["dc"]), "Friendship" ) self.assertEqual( - parsed.get_metadatas("TestMixedCandidates1a", strategy="dc"), + parsed.get_metadatas("TestMixedCandidates1a", strategy=["dc"]), [dcTestMixedCandidates1aExpected], ) self.assertEqual( parsed.get_metadata( - "TestMixedCandidates1a", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer ), "FRIENDSHIP", ) self.assertEqual( parsed.get_metadatas( - "TestMixedCandidates1a", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "FRIENDSHIP"}], ) @@ -760,21 +766,21 @@ def test_complex_html(self): ) # but we need to test get_metadata and get_metadatas self.assertEqual( - parsed.get_metadata("TestMixedCandidates1b", strategy="dc"), "158.25" + parsed.get_metadata("TestMixedCandidates1b", strategy=["dc"]), "158.25" ) self.assertEqual( - parsed.get_metadatas("TestMixedCandidates1b", strategy="dc"), + parsed.get_metadatas("TestMixedCandidates1b", strategy=["dc"]), [dcTestMixedCandidates1bExpected], ) self.assertEqual( parsed.get_metadata( - "TestMixedCandidates1b", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer ), "158.25", ) self.assertEqual( parsed.get_metadatas( - "TestMixedCandidates1b", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "158.25", "SCHEME": "DDC"}], ) @@ -809,21 +815,21 @@ def test_complex_html(self): # but we need to test get_metadata and get_metadatas self.assertEqual( - parsed.get_metadata("TestMixedCandidates2a", strategy="dc"), "Friendship" + parsed.get_metadata("TestMixedCandidates2a", strategy=["dc"]), "Friendship" ) self.assertEqual( - 
parsed.get_metadatas("TestMixedCandidates2a", strategy="dc"), + parsed.get_metadatas("TestMixedCandidates2a", strategy=["dc"]), dcTestMixedCandidates2aExpected, ) self.assertEqual( parsed.get_metadata( - "TestMixedCandidates2a", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer ), "FRIENDSHIP", ) self.assertEqual( parsed.get_metadatas( - "TestMixedCandidates2a", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "158.25", "SCHEME": "DDC"}, {"CONTENT": "FRIENDSHIP"}], ) @@ -854,30 +860,30 @@ def test_complex_html(self): ) # but we need to test get_metadata and get_metadatas self.assertEqual( - parsed.get_metadata("TestMixedCandidates2b", strategy="dc"), "Friendship" + parsed.get_metadata("TestMixedCandidates2b", strategy=["dc"]), "Friendship" ) self.assertEqual( - parsed.get_metadatas("TestMixedCandidates2b", strategy="dc"), + parsed.get_metadatas("TestMixedCandidates2b", strategy=["dc"]), dcTestMixedCandidates2bExpected, ) self.assertEqual( parsed.get_metadata( - "TestMixedCandidates2b", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer ), "FRIENDSHIP", ) self.assertEqual( parsed.get_metadatas( - "TestMixedCandidates2b", strategy="dc", encoder=encoder_capitalizer + "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "FRIENDSHIP"}, {"CONTENT": "158.25", "SCHEME": "DDC"}], ) # ok, mixedfield tests: # TestMixedField0 - self.assertEqual(parsed.get_metadata("TestMixedField0", strategy="dc"), None) + self.assertEqual(parsed.get_metadata("TestMixedField0", strategy=["dc"]), None) self.assertEqual( - parsed.get_metadata("TestMixedField0", strategy="meta"), + parsed.get_metadata("TestMixedField0", strategy=["meta"]), "meta:TestMixedField0", ) self.assertEqual( @@ -886,13 +892,13 @@ def test_complex_html(self): ) self.assertEqual( 
parsed.get_metadata( - "TestMixedField0", strategy="dc", encoder=encoder_capitalizer + "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer ), None, ) self.assertEqual( parsed.get_metadata( - "TestMixedField0", strategy="meta", encoder=encoder_capitalizer + "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer ), "META:TESTMIXEDFIELD0", ) @@ -902,9 +908,9 @@ def test_complex_html(self): ), {"meta": "META:TESTMIXEDFIELD0"}, ) - self.assertEqual(parsed.get_metadatas("TestMixedField0", strategy="dc"), None) + self.assertEqual(parsed.get_metadatas("TestMixedField0", strategy=["dc"]), None) self.assertEqual( - parsed.get_metadatas("TestMixedField0", strategy="meta"), + parsed.get_metadatas("TestMixedField0", strategy=["meta"]), ["meta:TestMixedField0"], ) self.assertEqual( @@ -913,13 +919,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadatas( - "TestMixedField0", strategy="dc", encoder=encoder_capitalizer + "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer ), None, ) self.assertEqual( parsed.get_metadatas( - "TestMixedField0", strategy="meta", encoder=encoder_capitalizer + "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer ), ["META:TESTMIXEDFIELD0"], ) @@ -932,10 +938,11 @@ def test_complex_html(self): # TestMixedField1 self.assertEqual( - parsed.get_metadata("TestMixedField1", strategy="dc"), "dc:TestMixedField1" + parsed.get_metadata("TestMixedField1", strategy=["dc"]), + "dc:TestMixedField1", ) self.assertEqual( - parsed.get_metadata("TestMixedField1", strategy="meta"), + parsed.get_metadata("TestMixedField1", strategy=["meta"]), "meta:TestMixedField1", ) self.assertEqual( @@ -944,13 +951,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadata( - "TestMixedField1", strategy="dc", encoder=encoder_capitalizer + "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer ), "DC:TESTMIXEDFIELD1", ) self.assertEqual( parsed.get_metadata( - "TestMixedField1", strategy="meta", 
encoder=encoder_capitalizer + "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer ), "META:TESTMIXEDFIELD1", ) @@ -961,11 +968,11 @@ def test_complex_html(self): {"meta": "META:TESTMIXEDFIELD1", "dc": "DC:TESTMIXEDFIELD1"}, ) self.assertEqual( - parsed.get_metadatas("TestMixedField1", strategy="dc"), + parsed.get_metadatas("TestMixedField1", strategy=["dc"]), [{"content": "dc:TestMixedField1"}], ) self.assertEqual( - parsed.get_metadatas("TestMixedField1", strategy="meta"), + parsed.get_metadatas("TestMixedField1", strategy=["meta"]), ["meta:TestMixedField1"], ) self.assertEqual( @@ -977,13 +984,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadatas( - "TestMixedField1", strategy="dc", encoder=encoder_capitalizer + "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "DC:TESTMIXEDFIELD1"}], ) self.assertEqual( parsed.get_metadatas( - "TestMixedField1", strategy="meta", encoder=encoder_capitalizer + "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer ), ["META:TESTMIXEDFIELD1"], ) @@ -998,10 +1005,11 @@ def test_complex_html(self): ) # TestMixedField2 self.assertEqual( - parsed.get_metadata("TestMixedField2", strategy="dc"), "dc:TestMixedField2" + parsed.get_metadata("TestMixedField2", strategy=["dc"]), + "dc:TestMixedField2", ) self.assertEqual( - parsed.get_metadata("TestMixedField2", strategy="meta"), + parsed.get_metadata("TestMixedField2", strategy=["meta"]), "meta:TestMixedField2", ) self.assertEqual( @@ -1010,13 +1018,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadata( - "TestMixedField2", strategy="dc", encoder=encoder_capitalizer + "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer ), "DC:TESTMIXEDFIELD2", ) self.assertEqual( parsed.get_metadata( - "TestMixedField2", strategy="meta", encoder=encoder_capitalizer + "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer ), "META:TESTMIXEDFIELD2", ) @@ -1027,14 +1035,14 @@ def 
test_complex_html(self): {"meta": "META:TESTMIXEDFIELD2", "dc": "DC:TESTMIXEDFIELD2"}, ) self.assertEqual( - parsed.get_metadatas("TestMixedField2", strategy="dc"), + parsed.get_metadatas("TestMixedField2", strategy=["dc"]), [ {"content": "dc:TestMixedField2"}, {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"}, ], ) self.assertEqual( - parsed.get_metadatas("TestMixedField2", strategy="meta"), + parsed.get_metadatas("TestMixedField2", strategy=["meta"]), ["meta:TestMixedField2"], ) self.assertEqual( @@ -1049,7 +1057,7 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadatas( - "TestMixedField2", strategy="dc", encoder=encoder_capitalizer + "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer ), [ {"CONTENT": "DC:TESTMIXEDFIELD2"}, @@ -1058,7 +1066,7 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadatas( - "TestMixedField2", strategy="meta", encoder=encoder_capitalizer + "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer ), ["META:TESTMIXEDFIELD2"], ) @@ -1077,10 +1085,11 @@ def test_complex_html(self): # TestMixedField3 self.assertEqual( - parsed.get_metadata("TestMixedField3", strategy="dc"), "dc:TestMixedField3" + parsed.get_metadata("TestMixedField3", strategy=["dc"]), + "dc:TestMixedField3", ) self.assertEqual( - parsed.get_metadata("TestMixedField3", strategy="meta"), + parsed.get_metadata("TestMixedField3", strategy=["meta"]), "meta:TestMixedField3", ) self.assertEqual( @@ -1089,13 +1098,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadata( - "TestMixedField3", strategy="dc", encoder=encoder_capitalizer + "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer ), "DC:TESTMIXEDFIELD3", ) self.assertEqual( parsed.get_metadata( - "TestMixedField3", strategy="meta", encoder=encoder_capitalizer + "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer ), "META:TESTMIXEDFIELD3", ) @@ -1106,11 +1115,11 @@ def test_complex_html(self): {"meta": 
"META:TESTMIXEDFIELD3", "dc": "DC:TESTMIXEDFIELD3"}, ) self.assertEqual( - parsed.get_metadatas("TestMixedField3", strategy="dc"), + parsed.get_metadatas("TestMixedField3", strategy=["dc"]), [{"content": "dc:TestMixedField3"}], ) self.assertEqual( - parsed.get_metadatas("TestMixedField3", strategy="meta"), + parsed.get_metadatas("TestMixedField3", strategy=["meta"]), ["meta:TestMixedField3"], ) self.assertEqual( @@ -1122,13 +1131,13 @@ def test_complex_html(self): ) self.assertEqual( parsed.get_metadatas( - "TestMixedField3", strategy="dc", encoder=encoder_capitalizer + "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer ), [{"CONTENT": "DC:TESTMIXEDFIELD3"}], ) self.assertEqual( parsed.get_metadatas( - "TestMixedField3", strategy="meta", encoder=encoder_capitalizer + "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer ), ["META:TESTMIXEDFIELD3"], ) diff --git a/tests/test_ip_tracking.py b/tests/test_ip_tracking.py index b2840ea..b34f075 100644 --- a/tests/test_ip_tracking.py +++ b/tests/test_ip_tracking.py @@ -11,6 +11,11 @@ class TestIpLookups(unittest.TestCase): """""" def test_ip_lookup(self): - url = "http://example.com/" + """ + this is using the live internet + + todo: use httpbin + """ + url = "https://example.com/" page = metadata_parser.MetadataParser(url=url) self.assertTrue(page.peername) diff --git a/tox.ini b/tox.ini index 6fb6c6d..052fa76 100644 --- a/tox.ini +++ b/tox.ini @@ -2,7 +2,7 @@ envlist = lint, mypy, - py36,py37,py38,py39,py310,py311,py312,py313 + py37,py38,py39,py310,py311,py312,py313 [testenv] commands = From 3469f8e7f06553568866ca2e0e612c237c7183de Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 23 May 2025 14:18:53 -0400 Subject: [PATCH 2/4] bump beautifulsoup to 4.14.x branch ignore internal pytest warnings --- pytest.ini | 5 +++++ setup.py | 2 +- src/metadata_parser/__init__.py | 14 +++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 pytest.ini diff --git 
a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..db7cd78 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] + +filterwarnings = + ignore:MetadataParser. + ignore:`ParsedResult.get_metadata` returns a string diff --git a/setup.py b/setup.py index 9cb3195..3c48d58 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ long_description = fp.read() requires = [ - "BeautifulSoup4<4.15.0", + "BeautifulSoup4>4.13.0,<4.14.0", "requests>=2.19.1", "requests-toolbelt>=0.8.0", "typing_extensions", diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py index 4ea70b6..4453b42 100644 --- a/src/metadata_parser/__init__.py +++ b/src/metadata_parser/__init__.py @@ -1171,7 +1171,7 @@ def get_metadata( function or "raw" """ warn_future( - """`ParsedResult.get_metadata` returns a string and is deprecated""" + """`ParsedResult.get_metadata` returns a string and is deprecated """ """in favor of `get_metadatas` which returns a list. """ """This will be removed in the next minor or major release.""" ) @@ -2129,7 +2129,7 @@ def parse( return doc_searchpath = doc.html.head # bs4.element.Tag - ogs = doc_searchpath.findAll("meta", attrs={"property": RE_prefix_opengraph}) + ogs = doc_searchpath.find_all("meta", attrs={"property": RE_prefix_opengraph}) for og in ogs: try: parsed_result._add_discovered( @@ -2144,7 +2144,7 @@ def parse( log.debug("Ran into a serious error parsing `og`: %s", exc) pass - twitters = doc_searchpath.findAll("meta", attrs={"name": RE_prefix_twitter}) + twitters = doc_searchpath.find_all("meta", attrs={"name": RE_prefix_twitter}) for twitter in twitters: try: # for the deprecated "twitter:(label|data)" meta tags, we must use a 'value' attr @@ -2196,7 +2196,7 @@ def parse( pass # is there an image_src? 
- images = doc.findAll("link", attrs={"rel": RE_prefix_rel_img_src}) + images = doc.find_all("link", attrs={"rel": RE_prefix_rel_img_src}) if images: # we only use the first image on the page image = images[0] @@ -2218,7 +2218,7 @@ def parse( pass # figure out the canonical url - canonicals = doc.findAll("link", attrs={"rel": RE_canonical}) + canonicals = doc.find_all("link", attrs={"rel": RE_canonical}) if canonicals: # only use the first? canonical = canonicals[0] @@ -2240,7 +2240,7 @@ def parse( pass # is there a shortlink? - shortlinks = doc.findAll("link", attrs={"rel": RE_shortlink}) + shortlinks = doc.find_all("link", attrs={"rel": RE_shortlink}) for shortlink in shortlinks: if shortlink.has_attr("href"): _link = shortlink["href"] @@ -2260,7 +2260,7 @@ def parse( pass # pull out all the metadata - meta = doc_searchpath.findAll(name="meta") + meta = doc_searchpath.find_all(name="meta") for m in meta: try: k = None # metadata key From 71ce964484a4a40b48914cad974d991e3c4ffa0c Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 23 May 2025 19:22:01 -0400 Subject: [PATCH 3/4] standardizing UrlParserCacheable, cached_urlparser, and cached_urlparser_maxitems --- CHANGELOG.txt | 12 ++ src/metadata_parser/__init__.py | 66 ++++++++-- tests/test_document_parsing.py | 212 +++++++++++++++++++++++++++++--- 3 files changed, 263 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index fdbe5e8..0c436ae 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -9,6 +9,18 @@ string is "all", otherwise a list of string - excluding "all" - must be submitted. Warnings of this have been emitted for several years. * __init__(`search_head_only`) now defaults to False + * `UrlParserCacheable` has been extended to accept a `urlparser` argument. + This defaults to `urlparse` and expects the same signature.
+ * __init__(`cached_urlparser`) has new deprecations to standardize the API + submitting an Int to set max_items is deprecated; instead: + cached_urlparser=True + cached_urlparser_maxitems=int + submitting 0 is deprecated; instead: + cached_urlparser=False + or + cached_urlparser_maxitems=0 + cached_urlparser=False + * __init__(`cached_urlparser_maxitems`) has been added 0.12.3 * pin "BeautifulSoup4<4.15.0" diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py index 4453b42..ee0a4f7 100644 --- a/src/metadata_parser/__init__.py +++ b/src/metadata_parser/__init__.py @@ -35,6 +35,7 @@ from requests.structures import CaseInsensitiveDict from requests_toolbelt.utils.deprecated import get_encodings_from_content from typing_extensions import Literal # py38 +from typing_extensions import Protocol # py38 if TYPE_CHECKING: from bs4 import Tag as _bs4_Tag @@ -987,7 +988,11 @@ def log( ) -class UrlParserCacheable(object): +class _UrlParserCacheable(Protocol): + urlparse: Callable[[str], ParseResult] + + +class UrlParserCacheable(_UrlParserCacheable): """ class for caching calls to urlparse @@ -996,14 +1001,20 @@ class for caching calls to urlparse cache: collections.OrderedDict maxitems: int + urlparser: Callable[[str], ParseResult] - def __init__(self, maxitems: int = 30): + def __init__( + self, + maxitems: int = 30, + urlparser: Callable[[str], ParseResult] = urlparse, + ): """ :param maxitems: maximum items to cache, default 30 :type maxitems: int, optional """ self.cache = collections.OrderedDict() self.maxitems = maxitems + self.urlparser = urlparser def urlparse(self, url: str) -> ParseResult: """ @@ -1011,7 +1022,7 @@ def urlparse(self, url: str) -> ParseResult: :type url: str """ if url not in self.cache: - self.cache[url] = urlparse(url) + self.cache[url] = self.urlparser(url) if len(self.cache) > self.maxitems: self.cache.popitem(last=False) return self.cache[url] @@ -1388,6 +1399,7 @@ class MetadataParser(object): support_malformed: 
Optional[bool] = None urlparse: Callable[[str], ParseResult] + _cached_urlparser: Optional[_UrlParserCacheable] # this has a per-parser default tuple # it can be upgraded manually @@ -1427,6 +1439,7 @@ def __init__( retry_dropped_without_headers: Optional[bool] = None, support_malformed: Optional[bool] = None, cached_urlparser: Union[bool, int, Callable[[str], ParseResult]] = True, + cached_urlparser_maxitems: Optional[int] = None, ): """ creates a new `MetadataParser` instance. @@ -1521,27 +1534,56 @@ def __init__( default: True options: True: use a instance of UrlParserCacheable(maxitems=30) : INT: use a instance of UrlParserCacheable(maxitems=cached_urlparser) - : None/False/0 - use native urlparse + DEPRECATED in v0.13.0 + instead, set `cached_urlparser=True, cached_urlparser_maxitems=maxitems` + : None/False - use native urlparse : callable - use as a custom urlparse + `cached_urlparser_maxitems` + default: None + options: int: sets maxitems """ if __debug__: log.debug("MetadataParser.__init__(%s)", url) if url is not None: url = url.strip() self.parsed_result = ParsedResult() + if cached_urlparser_maxitems: + if cached_urlparser is not True: + raise ValueError( + "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) + if cached_urlparser == 0: + warn_future( + "Supplying `0` to `cached_urlparser` to set maxitems is deprecated. " + "This will be removed in the next major or minor release." + "Supply `cached_urlparser=False` instead." + ) + cached_urlparser = False if cached_urlparser: + if isinstance(cached_urlparser, int): + # build a default parser with maxitems + warn_future( + "Supplying an int to `cached_urlparser` to set maxitems is deprecated. " + "This will be removed in the next major or minor release." + "Supply `cached_urlparser=True, cached_urlparser_maxitems=int` instead."
+ ) + # coerce args for the next block + cached_urlparser_maxitems = cached_urlparser + cached_urlparser = True if cached_urlparser is True: - _cached_urlparser = UrlParserCacheable() # a cache - self._cached_urlparser = _cached_urlparser # stash it - self.urlparse = _cached_urlparser.urlparse - elif isinstance(cached_urlparser, int): - _cached_urlparser = UrlParserCacheable( - maxitems=cached_urlparser - ) # a cache + # build a default parser + if cached_urlparser_maxitems is not None: + _cached_urlparser = UrlParserCacheable( + maxitems=cached_urlparser_maxitems + ) + else: + _cached_urlparser = UrlParserCacheable() self._cached_urlparser = _cached_urlparser # stash it self.urlparse = _cached_urlparser.urlparse else: - # TODO - raise value error if not callable + if not callable(cached_urlparser): + raise ValueError("`cached_urlparser` must be a callable") + self._cached_urlparser = None self.urlparse = cached_urlparser else: self.urlparse = urlparse diff --git a/tests/test_document_parsing.py b/tests/test_document_parsing.py index 8516f04..094822c 100644 --- a/tests/test_document_parsing.py +++ b/tests/test_document_parsing.py @@ -2,6 +2,7 @@ import os from typing import Dict import unittest +import warnings # local import metadata_parser @@ -215,7 +216,7 @@ def _docs_test(test_names): return errors -def _docs_test_parser(test_names, cached_urlparser): +def _docs_test_parser(test_names, cached_urlparser, cached_urlparser_maxitems=None): errors = [] for test in test_names: tests = [] @@ -223,6 +224,8 @@ def _docs_test_parser(test_names, cached_urlparser): kwargs = {} if cached_urlparser != "*no-kwarg": kwargs["cached_urlparser"] = cached_urlparser + if cached_urlparser_maxitems is not None: + kwargs["cached_urlparser_maxitems"] = cached_urlparser_maxitems parsed = metadata_parser.MetadataParser( url=url, html=docs[test]["doc"], **kwargs ) @@ -1354,12 +1357,13 @@ def test_charsets(self): self.assertEqual(c_parsed.metadata["meta"]["charset"], "UTF-8") -class 
TestCustomUrlparser(unittest.TestCase): +class Test_UrlParserCacheable(unittest.TestCase): """ - python -m unittest tests.document_parsing.TestCustomUrlparser + python -m unittest tests.document_parsing.Test_UrlParserCacheable """ - def test_default__get_discrete_url__good_relative(self): + def test__default(self): + """MetadataParser()""" errors = _docs_test_parser( [ "good-canonical-relative", @@ -1371,7 +1375,8 @@ def test_default__get_discrete_url__good_relative(self): if errors: raise ValueError(errors) - def test_true__get_discrete_url__good_relative(self): + def test__True(self): + """MetadataParser(cached_urlparser=True)""" errors = _docs_test_parser( [ "good-canonical-relative", @@ -1383,52 +1388,229 @@ def test_true__get_discrete_url__good_relative(self): if errors: raise ValueError(errors) - def test_int__get_discrete_url__good_relative(self): + def test__Int_1(self): + """MetadataParser(cached_urlparser=1)""" + with warnings.catch_warnings(record=True) as warned: + warnings.simplefilter("always") + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + 1, + ) + if errors: + raise ValueError(errors) + assert len(warned) >= 1 + _found = False + for w in warned: + if isinstance(w.message, FutureWarning): + if w.message.args[0].startswith( + "Supplying an int to `cached_urlparser` to set maxitems is deprecated." + ): + _found = True + assert ( + "Supply `cached_urlparser=True, cached_urlparser_maxitems=int` instead." 
+ in w.message.args[0] + ) + assert _found is True + + def test__Int_0(self): + """MetadataParser(cached_urlparser=0)""" + with warnings.catch_warnings(record=True) as warned: + warnings.simplefilter("always") + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + 0, + ) + if errors: + raise ValueError(errors) + assert len(warned) >= 1 + _found = False + for w in warned: + if isinstance(w.message, FutureWarning): + if w.message.args[0].startswith( + "Supplying `0` to `cached_urlparser` to set maxitems is deprecated." + ): + _found = True + assert ( + "Supply `cached_urlparser=False` instead" + in w.message.args[0] + ) + assert _found is True + + def test__None(self): errors = _docs_test_parser( [ "good-canonical-relative", "good-canonical-relative_alt", "good-og-relative_alt", ], - 1, + None, ) if errors: raise ValueError(errors) - def test_none__get_discrete_url__good_relative(self): + def test__False(self): errors = _docs_test_parser( [ "good-canonical-relative", "good-canonical-relative_alt", "good-og-relative_alt", ], - None, + False, ) if errors: raise ValueError(errors) - def test_false__get_discrete_url__good_relative(self): + def test__CustomParser(self): + custom_parser_obj = metadata_parser.UrlParserCacheable() + custom_parser = custom_parser_obj.urlparse errors = _docs_test_parser( [ "good-canonical-relative", "good-canonical-relative_alt", "good-og-relative_alt", ], - False, + custom_parser, ) if errors: raise ValueError(errors) - def test_instance__get_discrete_url__good_relative(self): - custom_parser_obj = metadata_parser.UrlParserCacheable() - custom_parser = custom_parser_obj.urlparse + +class Test_UrlParserCacheable_MaxItems(unittest.TestCase): + + def test__default(self): + """MetadataParser()""" errors = _docs_test_parser( [ "good-canonical-relative", "good-canonical-relative_alt", "good-og-relative_alt", ], - custom_parser, + "*no-kwarg", + cached_urlparser_maxitems=1, )
if errors: raise ValueError(errors) + + def test__True(self): + # this should pass + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + True, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + + def test__False(self): + # this should fail + with self.assertRaises(ValueError) as cm: + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + False, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + assert isinstance(cm.exception, ValueError) + assert ( + cm.exception.args[0] + == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) + + def test__Int_1(self): + # this should fail + with self.assertRaises(ValueError) as cm: + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + 1, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + assert isinstance(cm.exception, ValueError) + assert ( + cm.exception.args[0] + == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) + + def test__Int_0(self): + # this should fail + with self.assertRaises(ValueError) as cm: + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + 0, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + assert isinstance(cm.exception, ValueError) + assert ( + cm.exception.args[0] + == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) + + def test__None(self): + # this should fail + with self.assertRaises(ValueError) as cm: + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + None, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + assert isinstance(cm.exception, ValueError) +
assert ( + cm.exception.args[0] + == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) + + def test__CustomParser(self): + # this should fail + custom_parser_obj = metadata_parser.UrlParserCacheable() + custom_parser = custom_parser_obj.urlparse + with self.assertRaises(ValueError) as cm: + errors = _docs_test_parser( + [ + "good-canonical-relative", + "good-canonical-relative_alt", + "good-og-relative_alt", + ], + custom_parser, + cached_urlparser_maxitems=1, + ) + if errors: + raise ValueError(errors) + assert isinstance(cm.exception, ValueError) + assert ( + cm.exception.args[0] + == "`cached_urlparser_maxitems` requires `cached_urlparser=True`" + ) From fa031ba0cbdb14808acd08419c8a956ec81f410f Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Fri, 23 May 2025 21:23:00 -0400 Subject: [PATCH 4/4] clear version --- CHANGELOG.txt | 7 ++++--- src/metadata_parser/__init__.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 0c436ae..3991205 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,8 +1,8 @@ -1.0 - 1.0 will be a complete api overhaul +1.0 (unreleased) + 1.0 will include an api overhaul and remove all deprecations -0.13.0rc0 +0.13.0 * drop py36; no test options due to github deprecation of ubuntu20.04 * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a ValueError if a string other than "all" is submitted. 
The only valid @@ -21,6 +21,7 @@ cached_urlparser_maxitems=0 cached_urlparser=False * __init__(`cached_urlparser_maxitems`) has been added + * the next release is likely to be 1.0 0.12.3 * pin "BeautifulSoup4<4.15.0" diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py index ee0a4f7..07d16fb 100644 --- a/src/metadata_parser/__init__.py +++ b/src/metadata_parser/__init__.py @@ -49,7 +49,7 @@ # ============================================================================== -__VERSION__ = "0.13.0rc0" +__VERSION__ = "0.13.0" # ------------------------------------------------------------------------------