From 4fa19b7504854c61304f193a207b9a89b56d765c Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 23 May 2025 13:49:34 -0400
Subject: [PATCH 1/4] update test harness; improve typing; remove deprecations;
 adjust tests for deprecations

---
 .github/workflows/python-package.yml |   9 +-
 CHANGELOG.txt                        |   9 +
 setup.py                             |   2 +-
 src/metadata_parser/__init__.py      | 242 ++++++++++++++-------------
 tests/test_document_parsing.py       | 105 ++++++------
 tests/test_ip_tracking.py            |   7 +-
 tox.ini                              |   2 +-
 7 files changed, 206 insertions(+), 170 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4be57c3..22c1e89 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,20 +15,15 @@ jobs:
     strategy:
       matrix:
         os:
-          - "ubuntu-latest"
+          - "ubuntu-22.04"
         python-version:
+          - "3.7"
           - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
           - "3.13"
-        include:
-          # ubuntu-latest[22.04] does not have: py36
-          - os: "ubuntu-20.04"
-            python-version: "3.6"
-          - os: "ubuntu-20.04"
-            python-version: "3.7"
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 65ed432..fdbe5e8 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,6 +1,15 @@
 1.0
     1.0 will be a complete api overhaul
 
+
+0.13.0rc0
+    * drop py36; it can no longer be tested since GitHub retired ubuntu-20.04
+    * `_coerce_validate_strategy` (invoked by `get_metadata`/`get_metadatas`)
+        now raises a ValueError if a string other than "all" is submitted.
+        The only valid string is "all"; otherwise a list of strings - excluding
+        "all" - must be submitted. Warnings have been emitted for several years.
+    * `MetadataParser(search_head_only=...)` now defaults to `False` (was effectively `True`)
+
 0.12.3
     * pin "BeautifulSoup4<4.15.0"
         * See `https://git.launchpad.net/beautifulsoup/tree/CHANGELOG`
diff --git a/setup.py b/setup.py
index 5d1daaa..9cb3195 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@
     "BeautifulSoup4<4.15.0",
     "requests>=2.19.1",
     "requests-toolbelt>=0.8.0",
+    "typing_extensions",
 ]
 if sys.version_info.major == 2:
     requires.append("backports.html")
@@ -59,7 +60,6 @@
         "Intended Audience :: Developers",
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py
index 04df982..4ea70b6 100644
--- a/src/metadata_parser/__init__.py
+++ b/src/metadata_parser/__init__.py
@@ -10,6 +10,7 @@
 import re
 import socket  # peername hack, see below
 import typing
+from typing import Any
 from typing import Callable
 from typing import Dict
 from typing import Iterable
@@ -33,6 +34,7 @@
 import requests
 from requests.structures import CaseInsensitiveDict
 from requests_toolbelt.utils.deprecated import get_encodings_from_content
+from typing_extensions import Literal  # py38
 
 if TYPE_CHECKING:
     from bs4 import Tag as _bs4_Tag
@@ -46,7 +48,7 @@
 # ==============================================================================
 
 
-__VERSION__ = "0.12.3"
+__VERSION__ = "0.13.0rc0"
 
 
 # ------------------------------------------------------------------------------
@@ -88,7 +90,9 @@ def warn_user(message: str) -> None:
 TYPES_RESPONSE = Union["DummyResponse", requests.Response]
 TYPES_PEERNAME = Tuple[str, int]  # (ip, port)
 TYPE_URL_FETCH = Tuple[str, str, "ResponseHistory"]
-
+TYPE_REQUESTS_TIMEOUT = Optional[
+    Union[int, float, Tuple[int, int], Tuple[float, float]]
+]
 
 # ------------------------------------------------------------------------------
 
@@ -352,15 +356,15 @@ def _get_socket() -> Optional[socket.socket]:
             i += 1
             try:
                 if i == 1:
-                    sock = resp.raw._connection.sock
+                    sock = resp.raw._connection.sock  # type: ignore[union-attr]
                 elif i == 2:
-                    sock = resp.raw._connection.sock.socket
+                    sock = resp.raw._connection.sock.socket  # type: ignore[union-attr]
                 elif i == 3:
-                    sock = resp.raw._fp.fp._sock
+                    sock = resp.raw._fp.fp._sock  # type: ignore[union-attr]
                 elif i == 4:
-                    sock = resp.raw._fp.fp._sock.socket
+                    sock = resp.raw._fp.fp._sock.socket  # type: ignore[union-attr]
                 elif i == 5:
-                    sock = resp.raw._fp.fp.raw._sock
+                    sock = resp.raw._fp.fp.raw._sock  # type: ignore[union-attr]
                 else:
                     break
                 if not isinstance(sock, _compatible_sockets):
@@ -582,7 +586,7 @@ def is_parsed_valid_relative(parsed: ParseResult) -> bool:
 
 def parsed_to_relative(
     parsed: ParseResult,
-    parsed_fallback: Optional[str] = None,
+    parsed_fallback: Optional[ParseResult] = None,
 ) -> str:
     """turns a parsed url into a full relative path"""
     assert isinstance(parsed, ParseResult)
@@ -611,7 +615,7 @@ def parsed_to_relative(
 def fix_unicode_url(
     url: str,
     encoding: Optional[str] = None,
-    urlparser: Callable = urlparse,
+    urlparser: Callable[[str], ParseResult] = urlparse,
 ) -> str:
     """
     some cms systems will put unicode in their canonical url
@@ -650,8 +654,8 @@ def is_url_valid(
     url: str,
     require_public_netloc: Optional[bool] = None,
     allow_localhosts: Optional[bool] = None,
-    urlparser: Callable = urlparse,
-) -> bool:
+    urlparser: Callable[[str], ParseResult] = urlparse,
+) -> Union[Literal[False], ParseResult]:
     """
     tries to parse a url. if valid returns `ParseResult`
     (boolean eval is True); if invalid returns `False`
@@ -678,7 +682,7 @@ def url_to_absolute_url(
     url_fallback: Optional[str] = None,
     require_public_netloc: Optional[bool] = None,
     allow_localhosts: Optional[bool] = None,
-    urlparser: Callable = urlparse,
+    urlparser: Callable[[str], ParseResult] = urlparse,
 ) -> Optional[str]:
     """
     returns an "absolute url" if we have one.
@@ -714,6 +718,9 @@ def url_to_absolute_url(
     if url_test is None and url_fallback is not None:
         return url_fallback
 
+    if TYPE_CHECKING:
+        assert url_test is not None
+
     parsed = urlparser(url_test)
 
     # if we passed in a url, we can't remount it onto another domain
@@ -749,6 +756,8 @@ def url_to_absolute_url(
         # this can happen if someone puts in "" for the canonical
         # but this can also happen if we have different domains...
         if url_fallback:
+            if TYPE_CHECKING:
+                assert parsed_fallback is not None
             if (parsed_fallback.scheme == parsed.scheme) or (
                 parsed_fallback.netloc == parsed.netloc
             ):
@@ -771,6 +780,8 @@ def url_to_absolute_url(
         # ok, the URL isn't valid
         # can we re-assemble it
         if url_fallback:
+            if TYPE_CHECKING:
+                assert parsed_fallback is not None
             if is_parsed_valid_url(
                 parsed_fallback,
                 require_public_netloc=require_public_netloc,
@@ -791,6 +802,8 @@ def url_to_absolute_url(
 
 
 class InvalidDocument(Exception):
+    message: str
+
     def __init__(self, message: str = ""):
         self.message = message
 
@@ -799,6 +812,11 @@ def __str__(self) -> str:
 
 
 class NotParsable(Exception):
+    raised: Optional[requests.exceptions.RequestException]
+    code: Optional[int]
+    metadataParser: Optional["MetadataParser"]
+    response: Optional[TYPES_RESPONSE]
+
     def __init__(
         self,
         message: str = "",
@@ -856,6 +874,11 @@ class RedirectDetected(Exception):
     ``response``: actual response object
     """
 
+    location: str
+    code: Optional[int]
+    response: Optional[TYPES_RESPONSE]
+    metadataParser: Optional["MetadataParser"]
+
     def __init__(
         self,
         location: str = "",
@@ -945,7 +968,7 @@ def __init__(self, resp: TYPES_RESPONSE):
     def log(
         self,
         prefix: str = "ResponseHistory",
-        logger: Callable = log.error,
+        logger: Callable[..., None] = log.error,
     ) -> None:
         """
         :param prefix: Prefix for logging, defaults to "ResponseHistory"
@@ -1014,11 +1037,10 @@ class ParsedResult(object):
         None  # only stashing `ResponseHistory` if we have it
     )
     _version: int = 1  # version tracking
-    default_encoder: Optional[Callable] = None
-
+    default_encoder: Optional[Callable[[str], str]] = None
     og_minimum_requirements: List = ["title", "type", "image", "url"]
     twitter_sections: List = ["card", "title", "site", "description"]
-    strategy: Union[List, str] = ["og", "dc", "meta", "page", "twitter"]
+    strategy: Union[List[str], str] = ["og", "dc", "meta", "page", "twitter"]
 
     _get_metadata__last_strategy: Optional[str] = None
 
@@ -1074,21 +1096,13 @@ def _add_discovered(
 
     def _coerce_validate_strategy(
         self,
-        strategy: Union[list, str, None] = None,
+        strategy: Union[List[str], str, None] = None,
     ) -> Union[List, str]:
         """normalize a strategy into a valid option"""
         if strategy:
             if isinstance(strategy, str):
                 if strategy != "all":
-                    warn_user(
-                        """If `strategy` is not a `list`, it should be 'all'."""
-                        """This is coerced into a list, but will be enforced."""
-                    )
-                    if strategy not in self.strategy:
-                        raise ValueError("invalid strategy: %s" % strategy)
-                    strategy = [
-                        strategy,
-                    ]
+                    raise ValueError("If `strategy` is not a `list`, it must be 'all'.")
             elif isinstance(strategy, list):
                 _invalids = []
                 for _candidate in strategy:
@@ -1107,7 +1121,7 @@ def get_metadata(
         self,
         field: str,
         strategy: Union[list, str, None] = None,
-        encoder: Optional[Callable] = None,
+        encoder: Optional[Callable[[str], str]] = None,
     ) -> Union[str, Dict[str, Union[str, Dict]], None]:
         """
         LEGACY. DEPRECATED.  DO NOT USE THIS.
@@ -1157,8 +1171,9 @@ def get_metadata(
           function or "raw"
         """
         warn_future(
-            """`get_metadata` returns a string and is being deprecated"""
-            """in favor of `get_metadatas` which returns a list."""
+            """`ParsedResult.get_metadata` returns a string and is deprecated"""
+            """in favor of `get_metadatas` which returns a list. """
+            """This will be removed in the next minor or major release."""
         )
         strategy = self._coerce_validate_strategy(strategy)
         self._get_metadata__last_strategy = None
@@ -1217,8 +1232,8 @@ def _lookup(store: str) -> Optional[Union[str, Dict]]:
     def get_metadatas(
         self,
         field: str,
-        strategy: Union[list, str, None] = None,
-        encoder: Optional[Callable] = None,
+        strategy: Union[List[str], str, None] = None,
+        encoder: Optional[Callable[[str], str]] = None,
     ) -> Optional[Union[Dict, List]]:
         """
         looks for the field in various stores.  defaults to the core
@@ -1349,65 +1364,69 @@ class MetadataParser(object):
         this can be necessary on development machines
     """
 
-    url = None
-    url_actual = None
-    strategy = None
-    LEN_MAX_TITLE = 255
-    only_parse_file_extensions = None
-    allow_localhosts = None
-    require_public_netloc = None
-    force_doctype = None
-    requests_timeout = None
+    url: Optional[str] = None
+    url_actual: Optional[str] = None
+    strategy: Union[List[str], str, None] = None
+    LEN_MAX_TITLE: int = 255
+    only_parse_file_extensions: Optional[List[str]] = None
+    allow_localhosts: Optional[bool] = None
+    require_public_netloc: Optional[bool] = None
+    force_doctype: Optional[bool] = None
+    requests_timeout: TYPE_REQUESTS_TIMEOUT = None
     peername: Optional[TYPES_PEERNAME] = None
-    is_redirect = None
-    is_redirect_unique = None
-    is_redirect_same_host = None
-
-    force_parse = None
-    force_parse_invalid_content_type = None
-    only_parse_http_ok = None
-    requests_session = None
-    derive_encoding = None
-    default_encoding = None
-    default_encoder: Optional[Callable] = None
-    support_malformed = None
+    is_redirect: Optional[bool] = None
+    is_redirect_unique: Optional[bool] = None
+    is_redirect_same_host: Optional[bool] = None
+
+    force_parse: Optional[bool] = None
+    force_parse_invalid_content_type: Optional[bool] = None
+    only_parse_http_ok: Optional[bool] = None
+    requests_session: Optional[requests.Session] = None
+    derive_encoding: Optional[bool] = None
+    default_encoding: Optional[str] = None
+    default_encoder: Optional[Callable[[str], str]] = None
+    support_malformed: Optional[bool] = None
+
+    urlparse: Callable[[str], ParseResult]
 
     # this has a per-parser default tuple
     # it can be upgraded manually
-    schemeless_fields_upgradeable = SCHEMELESS_FIELDS_UPGRADEABLE
-    schemeless_fields_disallow = SCHEMELESS_FIELDS_DISALLOW
+    schemeless_fields_upgradeable: Tuple[str, ...] = SCHEMELESS_FIELDS_UPGRADEABLE
+    schemeless_fields_disallow: Tuple[str, ...] = SCHEMELESS_FIELDS_DISALLOW
 
-    _content_types_parse = ("text/html",)
-    _content_types_noparse = ("application/json",)
+    _content_types_parse: Tuple[str, ...] = ("text/html",)
+    _content_types_noparse: Tuple[str, ...] = ("application/json",)
+
+    response: Optional[TYPES_RESPONSE]
 
     def __init__(
         self,
         url: Optional[str] = None,
         html: Optional[str] = None,
-        strategy: Union[list, str, None] = None,
-        url_data=None,
-        url_headers=None,
+        strategy: Union[List[str], str, None] = None,
+        url_data: Optional[Dict[str, Any]] = None,
+        url_headers: Optional[Dict[str, str]] = None,
         force_parse: bool = False,
         ssl_verify: bool = True,
-        only_parse_file_extensions=None,
-        force_parse_invalid_content_type=False,
-        require_public_netloc=True,
-        allow_localhosts=None,
-        force_doctype=False,
-        requests_timeout=None,
-        raise_on_invalid=False,
-        search_head_only=None,
-        allow_redirects=True,
-        requests_session=None,
-        only_parse_http_ok=True,
-        defer_fetch=False,
-        derive_encoding=True,
-        html_encoding=None,
-        default_encoding=None,
-        default_encoder: Optional[Callable] = None,
-        retry_dropped_without_headers=None,
-        support_malformed=None,
-        cached_urlparser=True,
+        only_parse_file_extensions: Optional[List[str]] = None,
+        force_parse_invalid_content_type: bool = False,
+        require_public_netloc: bool = True,
+        allow_localhosts: Optional[bool] = None,
+        force_doctype: bool = False,
+        requests_timeout: TYPE_REQUESTS_TIMEOUT = None,
+        raise_on_invalid: bool = False,
+        search_head_only: bool = False,
+        allow_redirects: bool = True,
+        requests_session: Optional[requests.Session] = None,
+        only_parse_http_ok: bool = True,
+        defer_fetch: bool = False,
+        derive_encoding: bool = True,
+        html_encoding: Optional[str] = None,
+        default_encoding: Optional[str] = None,
+        default_encoder: Optional[Callable[[str], str]] = None,
+        retry_dropped_without_headers: Optional[bool] = None,
+        support_malformed: Optional[bool] = None,
+        cached_urlparser: Union[bool, int, Callable[[str], ParseResult]] = True,
     ):
         """
         creates a new `MetadataParser` instance.
@@ -1466,11 +1485,10 @@ def __init__(
                 if True, will raise an InvalidDocument exception if the response
                 does not look like a proper html document
             `search_head_only`
-                default: None
-                if `None` will default to True and emit a deprecation warning.
+                default: False
                 if `True`, will only search the document head for meta information.
                 `search_head_only=True` is the legacy behavior, but missed too many
-                bad html implementations. This will be set to `False` in the future.
+                bad html implementations.
             `allow_redirects`
                 default: True
                 passed onto `fetch_url`, which will pass it onto requests.get
@@ -1504,7 +1522,7 @@ def __init__(
                 options: True: use a instance of UrlParserCacheable(maxitems=30)
                        : INT: use a instance of UrlParserCacheable(maxitems=cached_urlparser)
                        : None/False/0 - use native urlparse
-                       : other truthy values - use as a custom urlparse
+                       : callable - use as a custom urlparse
         """
         if __debug__:
             log.debug("MetadataParser.__init__(%s)", url)
@@ -1513,16 +1531,17 @@ def __init__(
         self.parsed_result = ParsedResult()
         if cached_urlparser:
             if cached_urlparser is True:
-                cached_urlparser = UrlParserCacheable()  # a cache
-                self._cached_urlparser = cached_urlparser  # stash it
-                self.urlparse = cached_urlparser.urlparse
+                _cached_urlparser = UrlParserCacheable()  # a cache
+                self._cached_urlparser = _cached_urlparser  # stash it
+                self.urlparse = _cached_urlparser.urlparse
             elif isinstance(cached_urlparser, int):
-                cached_urlparser = UrlParserCacheable(
+                _cached_urlparser = UrlParserCacheable(
                     maxitems=cached_urlparser
                 )  # a cache
-                self._cached_urlparser = cached_urlparser  # stash it
-                self.urlparse = cached_urlparser.urlparse
+                self._cached_urlparser = _cached_urlparser  # stash it
+                self.urlparse = _cached_urlparser.urlparse
             else:
+                # TODO: raise ValueError if `cached_urlparser` is not callable
                 self.urlparse = cached_urlparser
         else:
             self.urlparse = urlparse
@@ -1541,12 +1560,6 @@ def __init__(
         self.force_parse = force_parse
         self.force_parse_invalid_content_type = force_parse_invalid_content_type
         self.only_parse_http_ok = only_parse_http_ok
-        if search_head_only is None:
-            warn_future(
-                """`search_head_only` was not provided and defaulting to `True` """
-                """Future versions will default to `False`."""
-            )
-            search_head_only = True
         self.search_head_only = search_head_only
         self.raise_on_invalid = raise_on_invalid
         self.requests_session = requests_session
@@ -1651,7 +1664,7 @@ def get_metadata(
         self,
         field: str,
         strategy: Union[list, str, None] = None,
-        encoder: Optional[Callable] = None,
+        encoder: Optional[Callable[[str], str]] = None,
     ) -> Union[str, Dict[str, Union[str, Dict]], None]:
         # deprecating in 1.0; operate on the result instead
         warn_future(
@@ -1664,8 +1677,8 @@ def get_metadata(
     def get_metadatas(
         self,
         field,
-        strategy: Union[list, str, None] = None,
-        encoder: Optional[Callable] = None,
+        strategy: Union[List[str], str, None] = None,
+        encoder: Optional[Callable[[str], str]] = None,
     ) -> Optional[Union[Dict, List]]:
         # deprecating in 1.0; operate on the result instead
         warn_future(
@@ -1697,18 +1710,18 @@ def _response_encoding(self) -> Optional[str]:
 
     def fetch_url(
         self,
-        url_data=None,
-        url_headers=None,
-        force_parse=None,
-        force_parse_invalid_content_type=None,
-        allow_redirects=None,
-        ssl_verify=None,
-        requests_timeout=None,
-        requests_session=None,
-        only_parse_http_ok=None,
-        derive_encoding=None,
-        default_encoding=None,
-        retry_dropped_without_headers=None,
+        url_data: Optional[Dict[str, Any]] = None,  # ???: required
+        url_headers: Optional[Union[CaseInsensitiveDict, Dict[str, Any]]] = None,
+        force_parse: Optional[bool] = None,  # `None` will use `self.force_parse`
+        force_parse_invalid_content_type: Optional[bool] = None,
+        allow_redirects: Optional[bool] = None,
+        ssl_verify: Optional[bool] = None,
+        requests_timeout: TYPE_REQUESTS_TIMEOUT = None,
+        requests_session: Optional[requests.Session] = None,
+        only_parse_http_ok: Optional[bool] = None,
+        derive_encoding: Optional[bool] = None,
+        default_encoding: Optional[str] = None,
+        retry_dropped_without_headers: Optional[bool] = None,
     ) -> TYPE_URL_FETCH:
         """
         fetches the url and returns a tuple of (html, html_encoding).
@@ -1758,6 +1771,7 @@ def fetch_url(
             else self.only_parse_http_ok
         )
         if not force_parse and self.only_parse_file_extensions is not None:
+            assert self.url
             parsed = self.urlparse(self.url)
             path = parsed.path
             if path:
@@ -1971,6 +1985,8 @@ def _run_in_session(_requests_session: requests.Session):
 
         except requests.exceptions.RequestException as error:
             if hasattr(error, "response") and (error.response is not None):
+                if TYPE_CHECKING:
+                    assert error.response is not None
                 self.response = error.response
                 try:
                     assert self.response is not None  # mypy
@@ -1990,7 +2006,9 @@ def _run_in_session(_requests_session: requests.Session):
                 raised=error,
                 metadataParser=self,
             )
-
+        if TYPE_CHECKING:
+            assert html is not None
+            assert html_encoding is not None
         return (html, html_encoding, response_history)
 
     def absolute_url(self, link: Optional[str] = None) -> Optional[str]:
@@ -2566,7 +2584,7 @@ def get_discrete_url(
     def get_metadata_link(
         self,
         field: str,
-        strategy: Union[list, str, None] = None,
+        strategy: Union[List[str], str, None] = None,
         allow_encoded_uri: bool = False,
         require_public_global: bool = True,
     ) -> Optional[str]:
@@ -2577,7 +2595,7 @@ def get_metadata_link(
 
         kwargs:
             strategy=None
-                ('all') or iterable ['og', 'dc', 'meta', 'page', 'twitter', ]
+                'all' or List ['og', 'dc', 'meta', 'page', 'twitter', ]
             allow_encoded_uri=False
             require_public_global=True
 
diff --git a/tests/test_document_parsing.py b/tests/test_document_parsing.py
index a1e5d23..8516f04 100644
--- a/tests/test_document_parsing.py
+++ b/tests/test_document_parsing.py
@@ -722,22 +722,28 @@ def test_complex_html(self):
                 dc_mixed_candidates[_key], dcTestMixedCandidates1aExpected[_key]
             )
         # but we need to test get_metadata and get_metadatas
+        with self.assertRaises(ValueError) as cm:
+            parsed.get_metadata("TestMixedCandidates1a", strategy="dc")
+        self.assertEqual(
+            cm.exception.args[0], "If `strategy` is not a `list`, it must be 'all'."
+        )
+
         self.assertEqual(
-            parsed.get_metadata("TestMixedCandidates1a", strategy="dc"), "Friendship"
+            parsed.get_metadata("TestMixedCandidates1a", strategy=["dc"]), "Friendship"
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedCandidates1a", strategy="dc"),
+            parsed.get_metadatas("TestMixedCandidates1a", strategy=["dc"]),
             [dcTestMixedCandidates1aExpected],
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedCandidates1a", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "FRIENDSHIP",
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedCandidates1a", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates1a", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "FRIENDSHIP"}],
         )
@@ -760,21 +766,21 @@ def test_complex_html(self):
             )
         # but we need to test get_metadata and get_metadatas
         self.assertEqual(
-            parsed.get_metadata("TestMixedCandidates1b", strategy="dc"), "158.25"
+            parsed.get_metadata("TestMixedCandidates1b", strategy=["dc"]), "158.25"
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedCandidates1b", strategy="dc"),
+            parsed.get_metadatas("TestMixedCandidates1b", strategy=["dc"]),
             [dcTestMixedCandidates1bExpected],
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedCandidates1b", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "158.25",
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedCandidates1b", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates1b", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "158.25", "SCHEME": "DDC"}],
         )
@@ -809,21 +815,21 @@ def test_complex_html(self):
         # but we need to test get_metadata and get_metadatas
 
         self.assertEqual(
-            parsed.get_metadata("TestMixedCandidates2a", strategy="dc"), "Friendship"
+            parsed.get_metadata("TestMixedCandidates2a", strategy=["dc"]), "Friendship"
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedCandidates2a", strategy="dc"),
+            parsed.get_metadatas("TestMixedCandidates2a", strategy=["dc"]),
             dcTestMixedCandidates2aExpected,
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedCandidates2a", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "FRIENDSHIP",
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedCandidates2a", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates2a", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "158.25", "SCHEME": "DDC"}, {"CONTENT": "FRIENDSHIP"}],
         )
@@ -854,30 +860,30 @@ def test_complex_html(self):
                 )
         # but we need to test get_metadata and get_metadatas
         self.assertEqual(
-            parsed.get_metadata("TestMixedCandidates2b", strategy="dc"), "Friendship"
+            parsed.get_metadata("TestMixedCandidates2b", strategy=["dc"]), "Friendship"
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedCandidates2b", strategy="dc"),
+            parsed.get_metadatas("TestMixedCandidates2b", strategy=["dc"]),
             dcTestMixedCandidates2bExpected,
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedCandidates2b", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "FRIENDSHIP",
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedCandidates2b", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedCandidates2b", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "FRIENDSHIP"}, {"CONTENT": "158.25", "SCHEME": "DDC"}],
         )
 
         # ok, mixedfield tests:
         # TestMixedField0
-        self.assertEqual(parsed.get_metadata("TestMixedField0", strategy="dc"), None)
+        self.assertEqual(parsed.get_metadata("TestMixedField0", strategy=["dc"]), None)
         self.assertEqual(
-            parsed.get_metadata("TestMixedField0", strategy="meta"),
+            parsed.get_metadata("TestMixedField0", strategy=["meta"]),
             "meta:TestMixedField0",
         )
         self.assertEqual(
@@ -886,13 +892,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField0", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
             ),
             None,
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField0", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
             ),
             "META:TESTMIXEDFIELD0",
         )
@@ -902,9 +908,9 @@ def test_complex_html(self):
             ),
             {"meta": "META:TESTMIXEDFIELD0"},
         )
-        self.assertEqual(parsed.get_metadatas("TestMixedField0", strategy="dc"), None)
+        self.assertEqual(parsed.get_metadatas("TestMixedField0", strategy=["dc"]), None)
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField0", strategy="meta"),
+            parsed.get_metadatas("TestMixedField0", strategy=["meta"]),
             ["meta:TestMixedField0"],
         )
         self.assertEqual(
@@ -913,13 +919,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField0", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField0", strategy=["dc"], encoder=encoder_capitalizer
             ),
             None,
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField0", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField0", strategy=["meta"], encoder=encoder_capitalizer
             ),
             ["META:TESTMIXEDFIELD0"],
         )
@@ -932,10 +938,11 @@ def test_complex_html(self):
 
         # TestMixedField1
         self.assertEqual(
-            parsed.get_metadata("TestMixedField1", strategy="dc"), "dc:TestMixedField1"
+            parsed.get_metadata("TestMixedField1", strategy=["dc"]),
+            "dc:TestMixedField1",
         )
         self.assertEqual(
-            parsed.get_metadata("TestMixedField1", strategy="meta"),
+            parsed.get_metadata("TestMixedField1", strategy=["meta"]),
             "meta:TestMixedField1",
         )
         self.assertEqual(
@@ -944,13 +951,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField1", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "DC:TESTMIXEDFIELD1",
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField1", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
             ),
             "META:TESTMIXEDFIELD1",
         )
@@ -961,11 +968,11 @@ def test_complex_html(self):
             {"meta": "META:TESTMIXEDFIELD1", "dc": "DC:TESTMIXEDFIELD1"},
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField1", strategy="dc"),
+            parsed.get_metadatas("TestMixedField1", strategy=["dc"]),
             [{"content": "dc:TestMixedField1"}],
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField1", strategy="meta"),
+            parsed.get_metadatas("TestMixedField1", strategy=["meta"]),
             ["meta:TestMixedField1"],
         )
         self.assertEqual(
@@ -977,13 +984,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField1", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField1", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "DC:TESTMIXEDFIELD1"}],
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField1", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField1", strategy=["meta"], encoder=encoder_capitalizer
             ),
             ["META:TESTMIXEDFIELD1"],
         )
@@ -998,10 +1005,11 @@ def test_complex_html(self):
         )
         # TestMixedField2
         self.assertEqual(
-            parsed.get_metadata("TestMixedField2", strategy="dc"), "dc:TestMixedField2"
+            parsed.get_metadata("TestMixedField2", strategy=["dc"]),
+            "dc:TestMixedField2",
         )
         self.assertEqual(
-            parsed.get_metadata("TestMixedField2", strategy="meta"),
+            parsed.get_metadata("TestMixedField2", strategy=["meta"]),
             "meta:TestMixedField2",
         )
         self.assertEqual(
@@ -1010,13 +1018,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField2", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "DC:TESTMIXEDFIELD2",
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField2", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
             ),
             "META:TESTMIXEDFIELD2",
         )
@@ -1027,14 +1035,14 @@ def test_complex_html(self):
             {"meta": "META:TESTMIXEDFIELD2", "dc": "DC:TESTMIXEDFIELD2"},
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField2", strategy="dc"),
+            parsed.get_metadatas("TestMixedField2", strategy=["dc"]),
             [
                 {"content": "dc:TestMixedField2"},
                 {"content": "dc:TestMixedField2.ddc", "scheme": "ddc"},
             ],
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField2", strategy="meta"),
+            parsed.get_metadatas("TestMixedField2", strategy=["meta"]),
             ["meta:TestMixedField2"],
         )
         self.assertEqual(
@@ -1049,7 +1057,7 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField2", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField2", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [
                 {"CONTENT": "DC:TESTMIXEDFIELD2"},
@@ -1058,7 +1066,7 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField2", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField2", strategy=["meta"], encoder=encoder_capitalizer
             ),
             ["META:TESTMIXEDFIELD2"],
         )
@@ -1077,10 +1085,11 @@ def test_complex_html(self):
 
         # TestMixedField3
         self.assertEqual(
-            parsed.get_metadata("TestMixedField3", strategy="dc"), "dc:TestMixedField3"
+            parsed.get_metadata("TestMixedField3", strategy=["dc"]),
+            "dc:TestMixedField3",
         )
         self.assertEqual(
-            parsed.get_metadata("TestMixedField3", strategy="meta"),
+            parsed.get_metadata("TestMixedField3", strategy=["meta"]),
             "meta:TestMixedField3",
         )
         self.assertEqual(
@@ -1089,13 +1098,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField3", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
             ),
             "DC:TESTMIXEDFIELD3",
         )
         self.assertEqual(
             parsed.get_metadata(
-                "TestMixedField3", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
             ),
             "META:TESTMIXEDFIELD3",
         )
@@ -1106,11 +1115,11 @@ def test_complex_html(self):
             {"meta": "META:TESTMIXEDFIELD3", "dc": "DC:TESTMIXEDFIELD3"},
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField3", strategy="dc"),
+            parsed.get_metadatas("TestMixedField3", strategy=["dc"]),
             [{"content": "dc:TestMixedField3"}],
         )
         self.assertEqual(
-            parsed.get_metadatas("TestMixedField3", strategy="meta"),
+            parsed.get_metadatas("TestMixedField3", strategy=["meta"]),
             ["meta:TestMixedField3"],
         )
         self.assertEqual(
@@ -1122,13 +1131,13 @@ def test_complex_html(self):
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField3", strategy="dc", encoder=encoder_capitalizer
+                "TestMixedField3", strategy=["dc"], encoder=encoder_capitalizer
             ),
             [{"CONTENT": "DC:TESTMIXEDFIELD3"}],
         )
         self.assertEqual(
             parsed.get_metadatas(
-                "TestMixedField3", strategy="meta", encoder=encoder_capitalizer
+                "TestMixedField3", strategy=["meta"], encoder=encoder_capitalizer
             ),
             ["META:TESTMIXEDFIELD3"],
         )
diff --git a/tests/test_ip_tracking.py b/tests/test_ip_tracking.py
index b2840ea..b34f075 100644
--- a/tests/test_ip_tracking.py
+++ b/tests/test_ip_tracking.py
@@ -11,6 +11,11 @@ class TestIpLookups(unittest.TestCase):
     """"""
 
     def test_ip_lookup(self):
-        url = "http://example.com/"
+        """
+        this is using the live internet
+
+        todo: use httpbin
+        """
+        url = "https://example.com/"
         page = metadata_parser.MetadataParser(url=url)
         self.assertTrue(page.peername)
diff --git a/tox.ini b/tox.ini
index 6fb6c6d..052fa76 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,7 +2,7 @@
 envlist =
 	lint,
 	mypy,
-	py36,py37,py38,py39,py310,py311,py312,py313
+	py37,py38,py39,py310,py311,py312,py313
 
 [testenv]
 commands =

From 3469f8e7f06553568866ca2e0e612c237c7183de Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 23 May 2025 14:18:53 -0400
Subject: [PATCH 2/4] bump beautifulsoup to 4.14.x branch ignore internal
 pytest warnings

---
 pytest.ini                      |  5 +++++
 setup.py                        |  2 +-
 src/metadata_parser/__init__.py | 14 +++++++-------
 3 files changed, 13 insertions(+), 8 deletions(-)
 create mode 100644 pytest.ini

diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..db7cd78
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+
+filterwarnings =
+    ignore:MetadataParser.
+    ignore:`ParsedResult.get_metadata` returns a string
diff --git a/setup.py b/setup.py
index 9cb3195..3c48d58 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
     long_description = fp.read()
 
 requires = [
-    "BeautifulSoup4<4.15.0",
+    "BeautifulSoup4>4.13.0,<4.14.0",
     "requests>=2.19.1",
     "requests-toolbelt>=0.8.0",
     "typing_extensions",
diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py
index 4ea70b6..4453b42 100644
--- a/src/metadata_parser/__init__.py
+++ b/src/metadata_parser/__init__.py
@@ -1171,7 +1171,7 @@ def get_metadata(
           function or "raw"
         """
         warn_future(
-            """`ParsedResult.get_metadata` returns a string and is deprecated"""
+            """`ParsedResult.get_metadata` returns a string and is deprecated """
             """in favor of `get_metadatas` which returns a list. """
             """This will be removed in the next minor or major release."""
         )
@@ -2129,7 +2129,7 @@ def parse(
                 return
             doc_searchpath = doc.html.head  # bs4.element.Tag
 
-        ogs = doc_searchpath.findAll("meta", attrs={"property": RE_prefix_opengraph})
+        ogs = doc_searchpath.find_all("meta", attrs={"property": RE_prefix_opengraph})
         for og in ogs:
             try:
                 parsed_result._add_discovered(
@@ -2144,7 +2144,7 @@ def parse(
                     log.debug("Ran into a serious error parsing `og`: %s", exc)
                 pass
 
-        twitters = doc_searchpath.findAll("meta", attrs={"name": RE_prefix_twitter})
+        twitters = doc_searchpath.find_all("meta", attrs={"name": RE_prefix_twitter})
         for twitter in twitters:
             try:
                 # for the deprecated "twitter:(label|data)" meta tags, we must use a 'value' attr
@@ -2196,7 +2196,7 @@ def parse(
                 pass
 
         # is there an image_src?
-        images = doc.findAll("link", attrs={"rel": RE_prefix_rel_img_src})
+        images = doc.find_all("link", attrs={"rel": RE_prefix_rel_img_src})
         if images:
             # we only use the first image on the page
             image = images[0]
@@ -2218,7 +2218,7 @@ def parse(
                 pass
 
         # figure out the canonical url
-        canonicals = doc.findAll("link", attrs={"rel": RE_canonical})
+        canonicals = doc.find_all("link", attrs={"rel": RE_canonical})
         if canonicals:
             # only use the first?
             canonical = canonicals[0]
@@ -2240,7 +2240,7 @@ def parse(
                 pass
 
         # is there a shortlink?
-        shortlinks = doc.findAll("link", attrs={"rel": RE_shortlink})
+        shortlinks = doc.find_all("link", attrs={"rel": RE_shortlink})
         for shortlink in shortlinks:
             if shortlink.has_attr("href"):
                 _link = shortlink["href"]
@@ -2260,7 +2260,7 @@ def parse(
                 pass
 
         # pull out all the metadata
-        meta = doc_searchpath.findAll(name="meta")
+        meta = doc_searchpath.find_all(name="meta")
         for m in meta:
             try:
                 k = None  # metadata key

From 71ce964484a4a40b48914cad974d991e3c4ffa0c Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 23 May 2025 19:22:01 -0400
Subject: [PATCH 3/4] standardizing UrlParserCacheable, cached_urlparser, and
 cached_urlparser_maxitems

---
 CHANGELOG.txt                   |  12 ++
 src/metadata_parser/__init__.py |  66 ++++++++--
 tests/test_document_parsing.py  | 212 +++++++++++++++++++++++++++++---
 3 files changed, 263 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index fdbe5e8..0c436ae 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -9,6 +9,18 @@
         string is "all", otherwise a list of string - excluding "all" - must be
         submitted. Warnings of this have been emitted for several years.
     * __init__(`search_head_only`) now defaults to False
+    * `UrlParserCacheable` has been extended to accept a `urlparser` argument.
+      This defaults to `urlparse` and expects the same signature.
+    * __init__(`cached_urlparser`) has new deprecations to standardize the API
+        submitting an Int to set max_items is deprecated; instead:
+            cached_urlparser=True
+            cached_urlparser_maxitems=int
+        submitting 0 is deprecated; instead:
+            cached_urlparser=False
+                or
+            cached_urlparser_maxitems=0
+            cached_urlparser=False
+    * __init__(`cached_urlparser_maxitems`) has been added
 
 0.12.3
     * pin "BeautifulSoup4<4.15.0"
diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py
index 4453b42..ee0a4f7 100644
--- a/src/metadata_parser/__init__.py
+++ b/src/metadata_parser/__init__.py
@@ -35,6 +35,7 @@
 from requests.structures import CaseInsensitiveDict
 from requests_toolbelt.utils.deprecated import get_encodings_from_content
 from typing_extensions import Literal  # py38
+from typing_extensions import Protocol  # py38
 
 if TYPE_CHECKING:
     from bs4 import Tag as _bs4_Tag
@@ -987,7 +988,11 @@ def log(
                 )
 
 
-class UrlParserCacheable(object):
+class _UrlParserCacheable(Protocol):
+    urlparse: Callable[[str], ParseResult]
+
+
+class UrlParserCacheable(_UrlParserCacheable):
     """
     class for caching calls to urlparse
 
@@ -996,14 +1001,20 @@ class for caching calls to urlparse
 
     cache: collections.OrderedDict
     maxitems: int
+    urlparser: Callable[[str], ParseResult]
 
-    def __init__(self, maxitems: int = 30):
+    def __init__(
+        self,
+        maxitems: int = 30,
+        urlparser: Callable[[str], ParseResult] = urlparse,
+    ):
         """
         :param maxitems: maximum items to cache, default 30
         :type maxitems: int, optional
         """
         self.cache = collections.OrderedDict()
         self.maxitems = maxitems
+        self.urlparser = urlparser
 
     def urlparse(self, url: str) -> ParseResult:
         """
@@ -1011,7 +1022,7 @@ def urlparse(self, url: str) -> ParseResult:
         :type url: str
         """
         if url not in self.cache:
-            self.cache[url] = urlparse(url)
+            self.cache[url] = self.urlparser(url)
             if len(self.cache) > self.maxitems:
                 self.cache.popitem(last=False)
         return self.cache[url]
@@ -1388,6 +1399,7 @@ class MetadataParser(object):
     support_malformed: Optional[bool] = None
 
     urlparse: Callable[[str], ParseResult]
+    _cached_urlparser: Optional[_UrlParserCacheable]
 
     # this has a per-parser default tuple
     # it can be upgraded manually
@@ -1427,6 +1439,7 @@ def __init__(
         retry_dropped_without_headers: Optional[bool] = None,
         support_malformed: Optional[bool] = None,
         cached_urlparser: Union[bool, int, Callable[[str], ParseResult]] = True,
+        cached_urlparser_maxitems: Optional[int] = None,
     ):
         """
         creates a new `MetadataParser` instance.
@@ -1521,27 +1534,56 @@ def __init__(
                 default: True
                 options: True: use a instance of UrlParserCacheable(maxitems=30)
                        : INT: use a instance of UrlParserCacheable(maxitems=cached_urlparser)
-                       : None/False/0 - use native urlparse
+                            DEPRECATED in v0.13.0
+                            instead, set `cached_urlparser=True, cached_urlparser_maxitems=maxitems`
+                       : None/False - use native urlparse
                        : callable - use as a custom urlparse
+            `cached_urlparser_maxitems`
+                default: None
+                options: int: sets maxitems
         """
         if __debug__:
             log.debug("MetadataParser.__init__(%s)", url)
         if url is not None:
             url = url.strip()
         self.parsed_result = ParsedResult()
+        if cached_urlparser_maxitems:
+            if cached_urlparser is not True:
+                raise ValueError(
+                    "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+                )
+        if cached_urlparser == 0:
+            warn_future(
+                "Supplying `0` to `cached_urlparser` to set maxitems is deprecated. "
+                "This will be removed in the next major or minor release."
+                "Supply `cached_urlparser=False` instead."
+            )
+            cached_urlparser = False
         if cached_urlparser:
+            if isinstance(cached_urlparser, int):
+                # build a default parser with maxitems
+                warn_future(
+                    "Supplying an int to `cached_urlparser` to set maxitems is deprecated. "
+                    "This will be removed in the next major or minor release."
+                    "Supply `cached_urlparser=True, cached_urlparser_maxitems=int` instead."
+                )
+                # coerce args for the next block
+                cached_urlparser_maxitems = cached_urlparser
+                cached_urlparser = True
             if cached_urlparser is True:
-                _cached_urlparser = UrlParserCacheable()  # a cache
-                self._cached_urlparser = _cached_urlparser  # stash it
-                self.urlparse = _cached_urlparser.urlparse
-            elif isinstance(cached_urlparser, int):
-                _cached_urlparser = UrlParserCacheable(
-                    maxitems=cached_urlparser
-                )  # a cache
+                # build a default parser
+                if cached_urlparser_maxitems is not None:
+                    _cached_urlparser = UrlParserCacheable(
+                        maxitems=cached_urlparser_maxitems
+                    )
+                else:
+                    _cached_urlparser = UrlParserCacheable()
                 self._cached_urlparser = _cached_urlparser  # stash it
                 self.urlparse = _cached_urlparser.urlparse
             else:
-                # TODO - raise value error if not callable
+                if not callable(cached_urlparser):
+                    raise ValueError("`cached_urlparser` must be a callable")
+                self._cached_urlparser = None
                 self.urlparse = cached_urlparser
         else:
             self.urlparse = urlparse
diff --git a/tests/test_document_parsing.py b/tests/test_document_parsing.py
index 8516f04..094822c 100644
--- a/tests/test_document_parsing.py
+++ b/tests/test_document_parsing.py
@@ -2,6 +2,7 @@
 import os
 from typing import Dict
 import unittest
+import warnings
 
 # local
 import metadata_parser
@@ -215,7 +216,7 @@ def _docs_test(test_names):
     return errors
 
 
-def _docs_test_parser(test_names, cached_urlparser):
+def _docs_test_parser(test_names, cached_urlparser, cached_urlparser_maxitems=None):
     errors = []
     for test in test_names:
         tests = []
@@ -223,6 +224,8 @@ def _docs_test_parser(test_names, cached_urlparser):
         kwargs = {}
         if cached_urlparser != "*no-kwarg":
             kwargs["cached_urlparser"] = cached_urlparser
+        if cached_urlparser_maxitems is not None:
+            kwargs["cached_urlparser_maxitems"] = cached_urlparser_maxitems
         parsed = metadata_parser.MetadataParser(
             url=url, html=docs[test]["doc"], **kwargs
         )
@@ -1354,12 +1357,13 @@ def test_charsets(self):
         self.assertEqual(c_parsed.metadata["meta"]["charset"], "UTF-8")
 
 
-class TestCustomUrlparser(unittest.TestCase):
+class Test_UrlParserCacheable(unittest.TestCase):
     """
-    python -m unittest tests.document_parsing.TestCustomUrlparser
+    python -m unittest tests.test_document_parsing.Test_UrlParserCacheable
     """
 
-    def test_default__get_discrete_url__good_relative(self):
+    def test__default(self):
+        """MetadataParser()"""
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
@@ -1371,7 +1375,8 @@ def test_default__get_discrete_url__good_relative(self):
         if errors:
             raise ValueError(errors)
 
-    def test_true__get_discrete_url__good_relative(self):
+    def test__True(self):
+        """MetadataParser(cached_urlparser=True)"""
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
@@ -1383,52 +1388,229 @@ def test_true__get_discrete_url__good_relative(self):
         if errors:
             raise ValueError(errors)
 
-    def test_int__get_discrete_url__good_relative(self):
+    def test__Int_1(self):
+        """MetadataParser(cached_urlparser=1)"""
+        with warnings.catch_warnings(record=True) as warned:
+            warnings.simplefilter("always")
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                1,
+            )
+            if errors:
+                raise ValueError(errors)
+            assert len(warned) >= 1
+            _found = False
+            for w in warned:
+                if isinstance(w.message, FutureWarning):
+                    if w.message.args[0].startswith(
+                        "Supplying an int to `cached_urlparser` to set maxitems is deprecated."
+                    ):
+                        _found = True
+                        assert (
+                            "Supply `cached_urlparser=True, cached_urlparser_maxitems=int` instead."
+                            in w.message.args[0]
+                        )
+            assert _found is True
+
+    def test__Int_0(self):
+        """MetadataParser(cached_urlparser=0)"""
+        with warnings.catch_warnings(record=True) as warned:
+            warnings.simplefilter("always")
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                0,
+            )
+            if errors:
+                raise ValueError(errors)
+            assert len(warned) >= 1
+            _found = False
+            for w in warned:
+                if isinstance(w.message, FutureWarning):
+                    if w.message.args[0].startswith(
+                        "Supplying `0` to `cached_urlparser` to set maxitems is deprecated."
+                    ):
+                        _found = True
+                        assert (
+                            "Supply `cached_urlparser=False` instead"
+                            in w.message.args[0]
+                        )
+            assert _found is True
+
+    def test__None(self):
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
                 "good-canonical-relative_alt",
                 "good-og-relative_alt",
             ],
-            1,
+            None,
         )
         if errors:
             raise ValueError(errors)
 
-    def test_none__get_discrete_url__good_relative(self):
+    def test__False(self):
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
                 "good-canonical-relative_alt",
                 "good-og-relative_alt",
             ],
-            None,
+            False,
         )
         if errors:
             raise ValueError(errors)
 
-    def test_false__get_discrete_url__good_relative(self):
+    def test__CustomParser(self):
+        custom_parser_obj = metadata_parser.UrlParserCacheable()
+        custom_parser = custom_parser_obj.urlparse
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
                 "good-canonical-relative_alt",
                 "good-og-relative_alt",
             ],
-            False,
+            custom_parser,
         )
         if errors:
             raise ValueError(errors)
 
-    def test_instance__get_discrete_url__good_relative(self):
-        custom_parser_obj = metadata_parser.UrlParserCacheable()
-        custom_parser = custom_parser_obj.urlparse
+
+class Test_UrlParserCacheable_MaxItems(unittest.TestCase):
+
+    def test__default(self):
+        """MetadataParser(cached_urlparser_maxitems=1)"""
         errors = _docs_test_parser(
             [
                 "good-canonical-relative",
                 "good-canonical-relative_alt",
                 "good-og-relative_alt",
             ],
-            custom_parser,
+            "*no-kwarg",
+            cached_urlparser_maxitems=1,
         )
         if errors:
             raise ValueError(errors)
+
+    def test__True(self):
+        # this should pass: maxitems is valid with `cached_urlparser=True`
+        errors = _docs_test_parser(
+            [
+                "good-canonical-relative",
+                "good-canonical-relative_alt",
+                "good-og-relative_alt",
+            ],
+            True,
+            cached_urlparser_maxitems=1,
+        )
+        if errors:
+            raise ValueError(errors)
+
+    def test__False(self):
+        # this should fail
+        with self.assertRaises(ValueError) as cm:
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                False,
+                cached_urlparser_maxitems=1,
+            )
+            if errors:
+                raise ValueError(errors)
+        assert isinstance(cm.exception, ValueError)
+        assert (
+            cm.exception.args[0]
+            == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+        )
+
+    def test__Int_1(self):
+        # this should fail
+        with self.assertRaises(ValueError) as cm:
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                1,
+                cached_urlparser_maxitems=1,
+            )
+            if errors:
+                raise ValueError(errors)
+        assert isinstance(cm.exception, ValueError)
+        assert (
+            cm.exception.args[0]
+            == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+        )
+
+    def test__Int_0(self):
+        # this should fail
+        with self.assertRaises(ValueError) as cm:
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                0,
+                cached_urlparser_maxitems=1,
+            )
+            if errors:
+                raise ValueError(errors)
+        assert isinstance(cm.exception, ValueError)
+        assert (
+            cm.exception.args[0]
+            == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+        )
+
+    def test__None(self):
+        # this should fail
+        with self.assertRaises(ValueError) as cm:
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                None,
+                cached_urlparser_maxitems=1,
+            )
+            if errors:
+                raise ValueError(errors)
+        assert isinstance(cm.exception, ValueError)
+        assert (
+            cm.exception.args[0]
+            == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+        )
+
+    def test__CustomParser(self):
+        # this should fail
+        custom_parser_obj = metadata_parser.UrlParserCacheable()
+        custom_parser = custom_parser_obj.urlparse
+        with self.assertRaises(ValueError) as cm:
+            errors = _docs_test_parser(
+                [
+                    "good-canonical-relative",
+                    "good-canonical-relative_alt",
+                    "good-og-relative_alt",
+                ],
+                custom_parser,
+                cached_urlparser_maxitems=1,
+            )
+            if errors:
+                raise ValueError(errors)
+        assert isinstance(cm.exception, ValueError)
+        assert (
+            cm.exception.args[0]
+            == "`cached_urlparser_maxitems` requires `cached_urlparser=True`"
+        )

From fa031ba0cbdb14808acd08419c8a956ec81f410f Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Fri, 23 May 2025 21:23:00 -0400
Subject: [PATCH 4/4] clear version

---
 CHANGELOG.txt                   | 7 ++++---
 src/metadata_parser/__init__.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 0c436ae..3991205 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,8 +1,8 @@
-1.0
-    1.0 will be a complete api overhaul
+1.0 (unreleased)
+    1.0 will include an api overhaul and remove all deprecations
 
 
-0.13.0rc0
+0.13.0
     * drop py36; no test options due to github deprecation of ubuntu20.04
     * `_coerce_validate_strategy` (invoked by `get_metadatas`) will now raise a
         ValueError if a string other than "all" is submitted.  The only valid
@@ -21,6 +21,7 @@
             cached_urlparser_maxitems=0
             cached_urlparser=False
     * __init__(`cached_urlparser_maxitems`) has been added
+    * the next release is likely to be 1.0
 
 0.12.3
     * pin "BeautifulSoup4<4.15.0"
diff --git a/src/metadata_parser/__init__.py b/src/metadata_parser/__init__.py
index ee0a4f7..07d16fb 100644
--- a/src/metadata_parser/__init__.py
+++ b/src/metadata_parser/__init__.py
@@ -49,7 +49,7 @@
 # ==============================================================================
 
 
-__VERSION__ = "0.13.0rc0"
+__VERSION__ = "0.13.0"
 
 
 # ------------------------------------------------------------------------------