Skip to content

Commit e07c4e0

Browse files
authored
Merge pull request #568 from danieldotnl/feature/552-strategy-pattern
Refactor Scraper class using Strategy pattern (#552)
2 parents bace136 + 292f846 commit e07c4e0

5 files changed

Lines changed: 459 additions & 74 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Value extraction from parsed HTML content."""
2+
from __future__ import annotations
3+
4+
from typing import TYPE_CHECKING
5+
6+
from bs4 import Tag
7+
8+
if TYPE_CHECKING:
9+
from .selector import Selector
10+
11+
12+
class ValueExtractor:
    """Pulls string values out of BeautifulSoup elements.

    One extractor instance is configured with the separator used to join
    the results of list selectors.
    """

    def __init__(self, separator: str = ","):
        """Store the separator used by :meth:`extract_list`."""
        self._separator = separator

    def extract_single(self, element: Tag, selector: Selector) -> str:
        """Return the value of one element, honoring the selector's attribute."""
        if selector.attribute is None:
            return self._extract_tag_value(element, selector)
        return element[selector.attribute]

    def extract_list(self, elements: list[Tag], selector: Selector) -> str:
        """Return the values of all *elements* joined into one string."""
        if selector.attribute is None:
            values = [
                self._extract_tag_value(element, selector) for element in elements
            ]
        else:
            values = [element[selector.attribute] for element in elements]
        return self._separator.join(values)

    @staticmethod
    def _extract_tag_value(tag: Tag, selector: Selector) -> str:
        """Return a tag's value according to ``selector.extract``.

        style/script/template tags expose their raw payload via ``.string``
        (empty string when absent); for everything else the extract mode
        chooses between visible text, inner HTML, or the full tag markup.
        Unknown modes fall back to visible text.
        """
        if tag.name in ("style", "script", "template"):
            return tag.string or ""
        mode = selector.extract
        if mode == "content":
            return "".join(map(str, tag.contents))
        if mode == "tag":
            return str(tag)
        # "text" and any unrecognized mode both yield the visible text.
        return tag.text
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Content parsers for multiscrape using the Strategy pattern."""
2+
from __future__ import annotations
3+
4+
import logging
5+
from abc import ABC, abstractmethod
6+
from typing import Any
7+
8+
from bs4 import BeautifulSoup
9+
10+
_LOGGER = logging.getLogger(__name__)
11+
12+
13+
class ContentParser(ABC):
    """Abstract strategy interface for response-content parsers.

    Concrete strategies answer two questions: whether they recognize a
    piece of content, and how to turn it into a parsed structure.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable parser name used in log messages."""

    @abstractmethod
    def can_parse(self, content: str) -> bool:
        """Return True when this strategy recognizes *content*."""

    @abstractmethod
    async def parse(self, content: str, hass: Any) -> Any:
        """Turn *content* into a parsed structure (or None)."""
28+
29+
30+
class HtmlParser(ContentParser):
    """Strategy that feeds non-JSON content to BeautifulSoup."""

    def __init__(self, parser_name: str = "lxml"):
        """Remember which BeautifulSoup backend (e.g. ``lxml``) to use."""
        self._parser_name = parser_name

    @property
    def name(self) -> str:
        """Return e.g. ``html (lxml)`` for log output."""
        return f"html ({self._parser_name})"

    def can_parse(self, content: str) -> bool:
        """Accept anything that does not look like a JSON document.

        Empty/None payloads are claimed too, so the factory always
        finds a parser.
        """
        stripped = content.lstrip() if content else ""
        return not stripped or stripped[0] not in ("{", "[")

    async def parse(self, content: str, hass: Any) -> BeautifulSoup:
        """Build the soup in an executor job; parsing blocks the loop otherwise."""
        return await hass.async_add_executor_job(
            BeautifulSoup, content, self._parser_name
        )
54+
55+
56+
class JsonDetector(ContentParser):
    """Strategy that recognizes JSON payloads without parsing them.

    JSON responses are handled purely through value templates downstream,
    so this class only answers the detection question.
    """

    @property
    def name(self) -> str:
        """Return the fixed name ``json`` for log output."""
        return "json"

    def can_parse(self, content: str) -> bool:
        """True when the first non-whitespace character starts a JSON value."""
        stripped = content.lstrip() if content else ""
        if not stripped:
            return False
        return stripped[0] in ("{", "[")

    async def parse(self, content: str, hass: Any) -> None:
        """Intentionally a no-op: JSON is never turned into a queryable soup."""
        return None
72+
73+
74+
class ParserFactory:
    """Picks the first registered strategy that accepts the content."""

    def __init__(self, parser_name: str):
        """Register strategies; JSON detection runs before HTML parsing."""
        self._parsers: list[ContentParser] = [
            JsonDetector(),
            HtmlParser(parser_name),
        ]

    def get_parser(self, content: str) -> ContentParser:
        """Return the first parser whose ``can_parse`` accepts *content*."""
        chosen = next(
            (parser for parser in self._parsers if parser.can_parse(content)),
            None,
        )
        if chosen is not None:
            return chosen
        # Unreachable in practice: HtmlParser claims everything non-JSON,
        # including the empty string, so the last strategy always matches.
        return self._parsers[-1]

custom_components/multiscrape/scraper.py

Lines changed: 52 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from bs4 import BeautifulSoup
55

66
from .const import CONF_PARSER, CONF_SEPARATOR
7+
from .extractors import ValueExtractor
8+
from .parsers import JsonDetector, ParserFactory
79
from .scrape_context import ScrapeContext
810

911
DEFAULT_TIMEOUT = 10
@@ -26,7 +28,7 @@ def create_scraper(config_name, config, hass, file_manager):
2628

2729

2830
class Scraper:
29-
"""Class for handling the retrieval and scraping of data."""
31+
"""Orchestrates parsing and value extraction."""
3032

3133
def __init__(
3234
self,
@@ -42,10 +44,11 @@ def __init__(
4244
self._hass = hass
4345
self._file_manager = file_manager
4446
self._config_name = config_name
45-
self._parser = parser
47+
self._parser_factory = ParserFactory(parser)
48+
self._extractor = ValueExtractor(separator)
4649
self._soup: BeautifulSoup = None
4750
self._data = None
48-
self._separator = separator
51+
self._is_json = False
4952
self.reset()
5053

5154
@property
@@ -57,6 +60,7 @@ def reset(self):
5760
"""Reset the scraper object."""
5861
self._data = None
5962
self._soup = None
63+
self._is_json = False
6064

6165
@property
6266
def formatted_content(self):
@@ -68,45 +72,41 @@ def formatted_content(self):
6872
async def set_content(self, content):
6973
"""Set the content to be scraped."""
7074
self._data = content
75+
parser = self._parser_factory.get_parser(content)
7176

72-
# Try to detect JSON more robustly
73-
content_stripped = content.lstrip() if content else ""
74-
if content_stripped and content_stripped[0] in ["{", "["]:
77+
if isinstance(parser, JsonDetector):
7578
_LOGGER.debug(
7679
"%s # Response seems to be json. Skip parsing with BeautifulSoup.",
7780
self._config_name,
7881
)
79-
else:
80-
try:
81-
_LOGGER.debug(
82-
"%s # Loading the content in BeautifulSoup.",
83-
self._config_name,
84-
)
85-
self._soup = await self._hass.async_add_executor_job(
86-
BeautifulSoup, self._data, self._parser
87-
)
88-
89-
if self._file_manager:
90-
await self._async_file_log("page_soup", self._soup.prettify())
91-
92-
except Exception as ex:
93-
self.reset()
94-
_LOGGER.error(
95-
"%s # Unable to parse response with BeautifulSoup: %s",
96-
self._config_name,
97-
ex,
98-
)
99-
raise
82+
self._is_json = True
83+
return
84+
85+
try:
86+
_LOGGER.debug(
87+
"%s # Loading the content in BeautifulSoup.",
88+
self._config_name,
89+
)
90+
self._soup = await parser.parse(content, self._hass)
91+
92+
if self._file_manager:
93+
await self._async_file_log("page_soup", self._soup.prettify())
94+
95+
except Exception as ex:
96+
self.reset()
97+
_LOGGER.error(
98+
"%s # Unable to parse response with BeautifulSoup: %s",
99+
self._config_name,
100+
ex,
101+
)
102+
raise
100103

101104
def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None = None):
102105
"""Scrape based on given selector the data."""
103106
if context is None:
104107
context = ScrapeContext.empty()
105108

106-
# This is required as this function is called separately for sensors and attributes
107-
log_prefix = f"{self._config_name} # {sensor}"
108-
if attribute:
109-
log_prefix = log_prefix + f"# {attribute}"
109+
log_prefix = self._make_log_prefix(sensor, attribute)
110110

111111
if selector.just_value:
112112
_LOGGER.debug("%s # Applying value_template only.", log_prefix)
@@ -115,43 +115,12 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
115115
)
116116
return selector.value_template._parse_result(result)
117117

118-
# Check if content is JSON
119-
content_stripped = self._data.lstrip() if self._data else ""
120-
if content_stripped and content_stripped[0] in ["{", "["]:
118+
if self._is_json:
121119
raise ValueError(
122120
"JSON cannot be scraped. Please provide a value template to parse JSON response."
123121
)
124122

125-
if selector.is_list:
126-
tags = self._soup.select(selector.list)
127-
_LOGGER.debug("%s # List selector selected tags: %s",
128-
log_prefix, tags)
129-
if selector.attribute is not None:
130-
_LOGGER.debug(
131-
"%s # Try to find attributes: %s",
132-
log_prefix,
133-
selector.attribute,
134-
)
135-
values = [tag[selector.attribute] for tag in tags]
136-
else:
137-
values = [self.extract_tag_value(tag, selector) for tag in tags]
138-
value = self._separator.join(values)
139-
_LOGGER.debug("%s # List selector csv: %s", log_prefix, value)
140-
141-
else:
142-
tag = self._soup.select_one(selector.element)
143-
_LOGGER.debug("%s # Tag selected: %s", log_prefix, tag)
144-
if tag is None:
145-
raise ValueError("Could not find a tag for given selector")
146-
147-
if selector.attribute is not None:
148-
_LOGGER.debug(
149-
"%s # Try to find attribute: %s", log_prefix, selector.attribute
150-
)
151-
value = tag[selector.attribute]
152-
else:
153-
value = self.extract_tag_value(tag, selector)
154-
_LOGGER.debug("%s # Selector result: %s", log_prefix, value)
123+
value = self._extract_value(selector, log_prefix)
155124

156125
if value is not None and selector.value_template is not None:
157126
_LOGGER.debug(
@@ -167,17 +136,26 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
167136
)
168137
return value
169138

170-
def extract_tag_value(self, tag, selector):
171-
"""Extract value from a tag."""
172-
if tag.name in ("style", "script", "template"):
173-
return tag.string
139+
def _extract_value(self, selector, log_prefix):
140+
"""Delegate extraction to ValueExtractor."""
141+
if selector.is_list:
142+
tags = self._soup.select(selector.list)
143+
_LOGGER.debug("%s # List selector selected tags: %s",
144+
log_prefix, tags)
145+
return self._extractor.extract_list(tags, selector)
174146
else:
175-
if selector.extract == "text":
176-
return tag.text
177-
elif selector.extract == "content":
178-
return ''.join(map(str, tag.contents))
179-
elif selector.extract == "tag":
180-
return str(tag)
147+
tag = self._soup.select_one(selector.element)
148+
_LOGGER.debug("%s # Tag selected: %s", log_prefix, tag)
149+
if tag is None:
150+
raise ValueError("Could not find a tag for given selector")
151+
return self._extractor.extract_single(tag, selector)
152+
153+
def _make_log_prefix(self, sensor, attribute):
154+
"""Create log prefix for messages."""
155+
prefix = f"{self._config_name} # {sensor}"
156+
if attribute:
157+
prefix = prefix + f"# {attribute}"
158+
return prefix
181159

182160
async def _async_file_log(self, content_name, content):
183161
try:

0 commit comments

Comments
 (0)