44from bs4 import BeautifulSoup
55
66from .const import CONF_PARSER , CONF_SEPARATOR
7+ from .extractors import ValueExtractor
8+ from .parsers import JsonDetector , ParserFactory
79from .scrape_context import ScrapeContext
810
911DEFAULT_TIMEOUT = 10
@@ -26,7 +28,7 @@ def create_scraper(config_name, config, hass, file_manager):
2628
2729
2830class Scraper :
29- """Class for handling the retrieval and scraping of data ."""
31+ """Orchestrates parsing and value extraction ."""
3032
3133 def __init__ (
3234 self ,
@@ -42,10 +44,11 @@ def __init__(
4244 self ._hass = hass
4345 self ._file_manager = file_manager
4446 self ._config_name = config_name
45- self ._parser = parser
47+ self ._parser_factory = ParserFactory (parser )
48+ self ._extractor = ValueExtractor (separator )
4649 self ._soup : BeautifulSoup = None
4750 self ._data = None
48- self ._separator = separator
51+ self ._is_json = False
4952 self .reset ()
5053
5154 @property
@@ -57,6 +60,7 @@ def reset(self):
5760 """Reset the scraper object."""
5861 self ._data = None
5962 self ._soup = None
63+ self ._is_json = False
6064
6165 @property
6266 def formatted_content (self ):
@@ -68,45 +72,41 @@ def formatted_content(self):
async def set_content(self, content):
    """Set the content to be scraped.

    Stores the raw content, asks the parser factory for a parser and,
    unless that parser is a ``JsonDetector``, parses the content into
    ``self._soup``. For JSON content only ``self._is_json`` is set and no
    soup is built.

    Args:
        content: Raw response body; passed unchanged to the parser factory
            and to the selected parser.

    Raises:
        Exception: Any error raised while parsing or while writing the
            ``page_soup`` file log. The scraper is reset and the error is
            logged before it is re-raised.
    """
    # Drop parse state left over from a previous call. Without this, a JSON
    # response followed by an HTML one (with no reset() in between) leaves
    # _is_json set, so scrape() would reject the freshly parsed HTML; a JSON
    # response after an HTML one would keep the stale soup around.
    self._soup = None
    self._is_json = False

    self._data = content
    parser = self._parser_factory.get_parser(content)

    if isinstance(parser, JsonDetector):
        _LOGGER.debug(
            "%s # Response seems to be json. Skip parsing with BeautifulSoup.",
            self._config_name,
        )
        self._is_json = True
        return

    try:
        _LOGGER.debug(
            "%s # Loading the content in BeautifulSoup.",
            self._config_name,
        )
        self._soup = await parser.parse(content, self._hass)

        if self._file_manager:
            await self._async_file_log("page_soup", self._soup.prettify())

    except Exception as ex:
        # Leave no half-initialised state behind before propagating.
        self.reset()
        _LOGGER.error(
            "%s # Unable to parse response with BeautifulSoup: %s",
            self._config_name,
            ex,
        )
        raise
100103
101104 def scrape (self , selector , sensor , attribute = None , context : ScrapeContext | None = None ):
102105 """Scrape based on given selector the data."""
103106 if context is None :
104107 context = ScrapeContext .empty ()
105108
106- # This is required as this function is called separately for sensors and attributes
107- log_prefix = f"{ self ._config_name } # { sensor } "
108- if attribute :
109- log_prefix = log_prefix + f"# { attribute } "
109+ log_prefix = self ._make_log_prefix (sensor , attribute )
110110
111111 if selector .just_value :
112112 _LOGGER .debug ("%s # Applying value_template only." , log_prefix )
@@ -115,43 +115,12 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
115115 )
116116 return selector .value_template ._parse_result (result )
117117
118- # Check if content is JSON
119- content_stripped = self ._data .lstrip () if self ._data else ""
120- if content_stripped and content_stripped [0 ] in ["{" , "[" ]:
118+ if self ._is_json :
121119 raise ValueError (
122120 "JSON cannot be scraped. Please provide a value template to parse JSON response."
123121 )
124122
125- if selector .is_list :
126- tags = self ._soup .select (selector .list )
127- _LOGGER .debug ("%s # List selector selected tags: %s" ,
128- log_prefix , tags )
129- if selector .attribute is not None :
130- _LOGGER .debug (
131- "%s # Try to find attributes: %s" ,
132- log_prefix ,
133- selector .attribute ,
134- )
135- values = [tag [selector .attribute ] for tag in tags ]
136- else :
137- values = [self .extract_tag_value (tag , selector ) for tag in tags ]
138- value = self ._separator .join (values )
139- _LOGGER .debug ("%s # List selector csv: %s" , log_prefix , value )
140-
141- else :
142- tag = self ._soup .select_one (selector .element )
143- _LOGGER .debug ("%s # Tag selected: %s" , log_prefix , tag )
144- if tag is None :
145- raise ValueError ("Could not find a tag for given selector" )
146-
147- if selector .attribute is not None :
148- _LOGGER .debug (
149- "%s # Try to find attribute: %s" , log_prefix , selector .attribute
150- )
151- value = tag [selector .attribute ]
152- else :
153- value = self .extract_tag_value (tag , selector )
154- _LOGGER .debug ("%s # Selector result: %s" , log_prefix , value )
123+ value = self ._extract_value (selector , log_prefix )
155124
156125 if value is not None and selector .value_template is not None :
157126 _LOGGER .debug (
@@ -167,17 +136,26 @@ def scrape(self, selector, sensor, attribute=None, context: ScrapeContext | None
167136 )
168137 return value
169138
170- def extract_tag_value (self , tag , selector ):
171- """Extract value from a tag."""
172- if tag .name in ("style" , "script" , "template" ):
173- return tag .string
def _extract_value(self, selector, log_prefix):
    """Select the tag(s) for ``selector`` and delegate extraction to ValueExtractor.

    Args:
        selector: Selector object; ``is_list`` chooses between the list
            query (``selector.list``) and the single-element query
            (``selector.element``).
        log_prefix: Prefix placed in front of every debug message.

    Returns:
        Whatever ``extract_list`` / ``extract_single`` of the extractor
        returns for the selected tag(s).

    Raises:
        ValueError: If no parsed document is available, or if the
            single-element selector matches no tag.
    """
    # Fail with a clear message instead of an AttributeError on None when
    # content was never set or parsing failed and the scraper was reset.
    if self._soup is None:
        raise ValueError("No parsed content available to scrape")

    if selector.is_list:
        tags = self._soup.select(selector.list)
        _LOGGER.debug("%s # List selector selected tags: %s", log_prefix, tags)
        return self._extractor.extract_list(tags, selector)

    tag = self._soup.select_one(selector.element)
    _LOGGER.debug("%s # Tag selected: %s", log_prefix, tag)
    if tag is None:
        raise ValueError("Could not find a tag for given selector")
    return self._extractor.extract_single(tag, selector)
152+
153+ def _make_log_prefix (self , sensor , attribute ):
154+ """Create log prefix for messages."""
155+ prefix = f"{ self ._config_name } # { sensor } "
156+ if attribute :
157+ prefix = prefix + f"# { attribute } "
158+ return prefix
181159
182160 async def _async_file_log (self , content_name , content ):
183161 try :
0 commit comments