Skip to content

Commit e07c4e0

Browse files
authored
Merge pull request #568 from danieldotnl/feature/552-strategy-pattern
Refactor Scraper class using Strategy pattern (#552)
2 parents bace136 + 292f846 commit e07c4e0

5 files changed

Lines changed: 459 additions & 74 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Value extraction from parsed HTML content."""
2+
from __future__ import annotations
3+
4+
from typing import TYPE_CHECKING
5+
6+
from bs4 import Tag
7+
8+
if TYPE_CHECKING:
9+
from .selector import Selector
10+
11+
12+
class ValueExtractor:
    """Pulls string values out of BeautifulSoup elements.

    One extractor instance is configured with the separator used to join
    the results of list selectors.
    """

    def __init__(self, separator: str = ","):
        """Store the separator used by :meth:`extract_list`."""
        self._separator = separator

    def extract_single(self, element: Tag, selector: Selector) -> str:
        """Return the value of one element, honoring the selector's attribute."""
        if selector.attribute is None:
            return self._extract_tag_value(element, selector)
        return element[selector.attribute]

    def extract_list(self, elements: list[Tag], selector: Selector) -> str:
        """Return the values of all *elements* joined into one string."""
        if selector.attribute is None:
            values = [
                self._extract_tag_value(element, selector) for element in elements
            ]
        else:
            values = [element[selector.attribute] for element in elements]
        return self._separator.join(values)

    @staticmethod
    def _extract_tag_value(tag: Tag, selector: Selector) -> str:
        """Return a tag's value according to ``selector.extract``.

        style/script/template tags expose their raw payload via ``.string``
        (empty string when absent); for everything else the extract mode
        chooses between visible text, inner HTML, or the full tag markup.
        Unknown modes fall back to visible text.
        """
        if tag.name in ("style", "script", "template"):
            return tag.string or ""
        mode = selector.extract
        if mode == "content":
            return "".join(map(str, tag.contents))
        if mode == "tag":
            return str(tag)
        # "text" and any unrecognized mode both yield the visible text.
        return tag.text
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Content parsers for multiscrape using the Strategy pattern."""
2+
from __future__ import annotations
3+
4+
import logging
5+
from abc import ABC, abstractmethod
6+
from typing import Any
7+
8+
from bs4 import BeautifulSoup
9+
10+
_LOGGER = logging.getLogger(__name__)
11+
12+
13+
class ContentParser(ABC):
    """Abstract strategy interface for response-content parsers.

    Concrete strategies answer two questions: whether they recognize a
    piece of content, and how to turn it into a parsed structure.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable parser name used in log messages."""

    @abstractmethod
    def can_parse(self, content: str) -> bool:
        """Return True when this strategy recognizes *content*."""

    @abstractmethod
    async def parse(self, content: str, hass: Any) -> Any:
        """Turn *content* into a parsed structure (or None)."""
28+
29+
30+
class HtmlParser(ContentParser):
    """Strategy that feeds non-JSON content to BeautifulSoup."""

    def __init__(self, parser_name: str = "lxml"):
        """Remember which BeautifulSoup backend (e.g. ``lxml``) to use."""
        self._parser_name = parser_name

    @property
    def name(self) -> str:
        """Return e.g. ``html (lxml)`` for log output."""
        return f"html ({self._parser_name})"

    def can_parse(self, content: str) -> bool:
        """Accept anything that does not look like a JSON document.

        Empty/None payloads are claimed too, so the factory always
        finds a parser.
        """
        stripped = content.lstrip() if content else ""
        return not stripped or stripped[0] not in ("{", "[")

    async def parse(self, content: str, hass: Any) -> BeautifulSoup:
        """Build the soup in an executor job; parsing blocks the loop otherwise."""
        return await hass.async_add_executor_job(
            BeautifulSoup, content, self._parser_name
        )
54+
55+
56+
class JsonDetector(ContentParser):
    """Strategy that recognizes JSON payloads without parsing them.

    JSON responses are handled purely through value templates downstream,
    so this class only answers the detection question.
    """

    @property
    def name(self) -> str:
        """Return the fixed name ``json`` for log output."""
        return "json"

    def can_parse(self, content: str) -> bool:
        """True when the first non-whitespace character starts a JSON value."""
        stripped = content.lstrip() if content else ""
        if not stripped:
            return False
        return stripped[0] in ("{", "[")

    async def parse(self, content: str, hass: Any) -> None:
        """Intentionally a no-op: JSON is never turned into a queryable soup."""
        return None
72+
73+
74+
class ParserFactory:
    """Picks the first registered strategy that accepts the content."""

    def __init__(self, parser_name: str):
        """Register strategies; JSON detection runs before HTML parsing."""
        self._parsers: list[ContentParser] = [
            JsonDetector(),
            HtmlParser(parser_name),
        ]

    def get_parser(self, content: str) -> ContentParser:
        """Return the first parser whose ``can_parse`` accepts *content*."""
        chosen = next(
            (parser for parser in self._parsers if parser.can_parse(content)),
            None,
        )
        if chosen is not None:
            return chosen
        # Unreachable in practice: HtmlParser claims everything non-JSON,
        # including the empty string, so the last strategy always matches.
        return self._parsers[-1]

custom_components/multiscrape/scraper.py

Lines changed: 52 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from bs4 import BeautifulSoup
55

66
from .const import CONF_PARSER, CONF_SEPARATOR
7+
from .extractors import ValueExtractor
8+
from .parsers import JsonDetector, ParserFactory
79
from .scrape_context import ScrapeContext
810

911
DEFAULT_TIMEOUT = 10
@@ -26,7 +28,7 @@ def create_scraper(config_name, config, hass, file_manager):
2628

2729

2830
class Scraper:
29-
"""Class for handling the retrieval and scraping of data."""
31+
"""Orchestrates parsing and value extraction."""
3032

3133
def __init__(
3234
self,
@@ -42,10 +44,11 @@ def __init__(
4244
self._hass = hass
4345
self._file_manager = file_manager
4446
self._config_name = config_name
45-
self._parser = parser
47+
self._parser_factory = ParserFactory(parser)
48+
self._extractor = ValueExtractor(separator)
4649
self._soup: BeautifulSoup = None
4750
self._data = None
48-
self._separator = separator
51+
self._is_json = False
4952
self.reset()
5053

5154
@property
@@ -57,6 +60,7 @@ def reset(self):
5760
"""Reset the scraper object."""
5861
self._data = None
5962
self._soup = None
63+
self._is_json = False
6064

6165
@property
6266
def formatted_content(self):
@@ -68,45 +72,41 @@ def formatted_content(self):
6872
async def set_content(self, content):
6973
"""Set the content to be scraped."""
7074
self._data = content
75+
parser = self._parser_factory.get_parser(content)
7176

72-
# Try to detect JSON more robustly
73-
content_stripped = content.lstrip() if content else ""
74-
if content_stripped and content_stripped[0] in ["{", "["]:
77+
if isinstance(parser, JsonDetector):
7578
_LOGGER.debug(
7679
"%s # Response seems to be json. Skip parsing with BeautifulSoup.",
7780
self._config_name,
7881
)
79-
else:
80-
try:
81-
_LOGGER.debug(
82-
"%s # Loading the content in BeautifulSoup.",
83-
self._config_name,
84-
)
85-
self._soup = await self._hass.async_add_executor_job(
86-
BeautifulSoup, self._data, self._parser
87-
)
88-
89-
if self._file_manager:
90-
await self._async_file_log("page_soup", self._soup.prettify())
91-
92-
except Exception as ex:
93-
self.reset()
94-
_LOGGER.error(
95-
"%s # Unable to parse response with BeautifulSoup: %s",
96-
self._config_name,
97-
ex,
98-
)
99-
raise
82+
self._is_json = True
83+
return
84+
85+
try:
86+
_LOGGER.debug(
87+
"%s # Loading the content in BeautifulSoup.",
88+
self._config_name,
89+
)
90+
self._soup = await parser.parse(content, self._hass)
91+
92+
if self._file_manager:
93+
await self._async_file_log("page_soup", self._soup.prettify())
94+
95+
except Exception as ex:
96+
self.reset()
97+
_LOGGER.error(
98+
"%s # Unable to parse response with BeautifulSoup: %s",
99+
self._config_name,
100+
ex,
101+
)
102+
raise
100103

101104
def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None = None):
102105
"""Scrape based on given selector the data."""
103106
if context is None:
104107
context = ScrapeContext.empty()
105108

106-
# This is required as this function is called separately for sensors and attributes
107-
log_prefix = f"{self._config_name} # {sensor}"
108-
if attribute:
109-
log_prefix = log_prefix + f"# {attribute}"
109+
log_prefix = self._make_log_prefix(sensor, attribute)
110110

111111
if selector.just_value:
112112
_LOGGER.debug("%s # Applying value_template only.", log_prefix)
@@ -115,43 +115,12 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
115115
)
116116
return selector.value_template._parse_result(result)
117117

118-
# Check if content is JSON
119-
content_stripped = self._data.lstrip() if self._data else ""
120-
if content_stripped and content_stripped[0] in ["{", "["]:
118+
if self._is_json:
121119
raise ValueError(
122120
"JSON cannot be scraped. Please provide a value template to parse JSON response."
123121
)
124122

125-
if selector.is_list:
126-
tags = self._soup.select(selector.list)
127-
_LOGGER.debug("%s # List selector selected tags: %s",
128-
log_prefix, tags)
129-
if selector.attribute is not None:
130-
_LOGGER.debug(
131-
"%s # Try to find attributes: %s",
132-
log_prefix,
133-
selector.attribute,
134-
)
135-
values = [tag[selector.attribute] for tag in tags]
136-
else:
137-
values = [self.extract_tag_value(tag, selector) for tag in tags]
138-
value = self._separator.join(values)
139-
_LOGGER.debug("%s # List selector csv: %s", log_prefix, value)
140-
141-
else:
142-
tag = self._soup.select_one(selector.element)
143-
_LOGGER.debug("%s # Tag selected: %s", log_prefix, tag)
144-
if tag is None:
145-
raise ValueError("Could not find a tag for given selector")
146-
147-
if selector.attribute is not None:
148-
_LOGGER.debug(
149-
"%s # Try to find attribute: %s", log_prefix, selector.attribute
150-
)
151-
value = tag[selector.attribute]
152-
else:
153-
value = self.extract_tag_value(tag, selector)
154-
_LOGGER.debug("%s # Selector result: %s", log_prefix, value)
123+
value = self._extract_value(selector, log_prefix)
155124

156125
if value is not None and selector.value_template is not None:
157126
_LOGGER.debug(
@@ -167,17 +136,26 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
167136
)
168137
return value
169138

170-
def extract_tag_value(self, tag, selector):
171-
"""Extract value from a tag."""
172-
if tag.name in ("style", "script", "template"):
173-
return tag.string
139+
def _extract_value(self, selector, log_prefix):
140+
"""Delegate extraction to ValueExtractor."""
141+
if selector.is_list:
142+
tags = self._soup.select(selector.list)
143+
_LOGGER.debug("%s # List selector selected tags: %s",
144+
log_prefix, tags)
145+
return self._extractor.extract_list(tags, selector)
174146
else:
175-
if selector.extract == "text":
176-
return tag.text
177-
elif selector.extract == "content":
178-
return ''.join(map(str, tag.contents))
179-
elif selector.extract == "tag":
180-
return str(tag)
147+
tag = self._soup.select_one(selector.element)
148+
_LOGGER.debug("%s # Tag selected: %s", log_prefix, tag)
149+
if tag is None:
150+
raise ValueError("Could not find a tag for given selector")
151+
return self._extractor.extract_single(tag, selector)
152+
153+
def _make_log_prefix(self, sensor, attribute):
154+
"""Create log prefix for messages."""
155+
prefix = f"{self._config_name} # {sensor}"
156+
if attribute:
157+
prefix = prefix + f"# {attribute}"
158+
return prefix
181159

182160
async def _async_file_log(self, content_name, content):
183161
try:

0 commit comments

Comments
 (0)