Skip to content

Commit 04feebc

Browse files
committed
Use regex substitution to interpolate data from regexes into name/model/version values
1 parent 8ea04c9 commit 04feebc

27 files changed

Lines changed: 5264 additions & 5383 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ cp $upstream/regexes/*.yml $pdd/device_detector/regexes/upstream/
182182
cp $upstream/Tests/fixtures/* $pdd/device_detector/tests/fixtures/upstream/
183183
cp $upstream/Tests/Parser/Client/fixtures/* $pdd/device_detector/tests/parser/fixtures/upstream/client/
184184
cp $upstream/Tests/Parser/Device/fixtures/* $pdd/device_detector/tests/parser/fixtures/upstream/device/
185+
186+
./configure_regex_interpolators.sh
185187
```
186188

187189
After copying the fixtures, review the diffs and restore changes that were made, especially for app types, so tests pass.

configure_regex_interpolators.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash

# Replace the $1 / $2 ... regex interpolation placeholders in the regex
# YAML files with \g<1>, \g<2> ... so the values can be used directly as
# templates for native regex substitution.
for i in 1 2 3 4 5; do
    # -Z / -0 pass file names NUL-delimited so unusual paths survive, and
    # -r skips sed entirely when grep finds nothing. That removes the need
    # to discard stderr, so genuine sed failures are reported.
    grep -rlZ --include='*.yml' "\$$i" device_detector/regexes \
        | xargs -0 -r sed -i "s#\$$i#\\\g<$i>#g"
done

# Look for the one known value that the blanket replacement above leaves in
# a form that has to be corrected by hand (see the message printed below).
MALFORMED=$(grep -rl --include='*.yml' "eZee'Tab\\\g<1>" device_detector/regexes)

# The expansion must be quoted: unquoted, the test breaks as soon as grep
# reports more than one file or a path containing whitespace.
if [ -n "$MALFORMED" ]; then
    echo "Invalid yaml value found in '${MALFORMED}'"
    echo "Manually convert 'eZee'Tab\g<1>' to 'eZee'Tab\\\g<1>'"
fi

device_detector/parser/device/base.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
from ..parser import Parser
2-
from device_detector.parser.extractors import ModelExtractor
1+
from ..parser import Parser, perform_substitutions
32
from device_detector.enums import DeviceType
43

54
MOBILE_DEVICE_TYPES = {
@@ -2140,24 +2139,16 @@ def extract_model(self) -> None:
21402139
"""
21412140
user_agent = self.user_agent
21422141
for model in self.ua_data.pop('models', []):
2143-
if not (matched := model['regex'].search(user_agent)):
2144-
continue
2142+
if model_matched := model['regex'].search(user_agent):
2143+
self.ua_data |= {k: v.strip() for k, v in model.items() if k != 'regex'}
2144+
self.ua_data['model'] = perform_model_substitutions(
2145+
model['model'], model_matched, ' '
2146+
)
2147+
return
21452148

2146-
self.matched_regex = matched
2147-
self.ua_data |= {
2148-
k: v.strip().replace('_', ' ') for k, v in model.items() if k != 'regex'
2149-
}
2150-
2151-
# Must return after first match! Later patterns could match
2152-
# again and clobber the earlier, correct, values.
2153-
# i.e. Sony Ericsson should override Sony
2154-
break
2155-
2156-
if 'model' in self.ua_data and self.matched_regex:
2157-
if groups := self.matched_regex.groups():
2158-
self.ua_data['model'] = ModelExtractor(self.ua_data, groups).extract()
2159-
2160-
return None
2149+
if name := self.ua_data.get('model', ''):
2150+
if "\\g<" in name:
2151+
self.ua_data['model'] = perform_model_substitutions(name, self.matched_regex, ' ')
21612152

21622153
def dtype(self) -> DeviceType | str:
21632154
return self.DEVICE_TYPE
@@ -2173,7 +2164,7 @@ def set_details(self) -> None:
21732164
"""
21742165
Set device data from UA or Client Hints.
21752166
"""
2176-
if self.matched_regex:
2167+
if self.ua_data:
21772168
self.extract_model()
21782169

21792170
self.ua_data |= {
@@ -2184,6 +2175,25 @@ def set_details(self) -> None:
21842175
return super().set_details()
21852176

21862177

2178+
def perform_model_substitutions(substring: str, regex_match, underscore_substitute: str) -> str:
    """
    Resolve a model template and normalize the resulting model name.

    The template is first expanded by ``perform_substitutions``; the result
    is then cleaned up:

    * an empty result, or the bare word ``Build``, yields ``''``
    * a trailing `` TD`` is dropped (D510_TD / ETON-T730D_TD)
    * a trailing `` Build`` is dropped (Tbook 16 Power(M5F8) Build)
    * a doubled closing parenthesis at the end loses its last character

    :param substring: model template, possibly containing regex placeholders
    :param regex_match: match object handed through to ``perform_substitutions``
    :param underscore_substitute: replacement for underscores, handed through
        to ``perform_substitutions``
    :return: the normalized model name, or ``''`` when nothing usable remains
    """
    model = perform_substitutions(substring, regex_match, underscore_substitute)
    if not model or model == 'Build':
        return ''

    # Suffixes are removed in this fixed order: ' TD' first, then ' Build'.
    model = model.removesuffix(' TD').removesuffix(' Build')

    return model[:-1] if model.endswith('))') else model
2195+
2196+
21872197
__all__ = (
21882198
'BaseDeviceParser',
21892199
'DEVICE_BRANDS',

device_detector/parser/extractors.py

Lines changed: 2 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -21,95 +21,6 @@
2121
)
2222

2323

24-
class DataExtractor:
25-
"""
26-
Regex will define a string value or 1-based index
27-
position of the desired metadata
28-
29-
- regex: '(?:Apple-)?(?:iPhone|iPad|iPod)(?:.*Mac OS X.*Version/(\\d+\\.\\d+)|; Opera)?'
30-
name: 'iOS'
31-
version: '$1'
32-
"""
33-
34-
__slots__ = (
35-
'metadata',
36-
'groups',
37-
'user_agent',
38-
'details',
39-
'_app_id_pretty_names',
40-
)
41-
42-
# metadata value to extract / return
43-
# subclasses must override
44-
key = ''
45-
46-
def __init__(self, metadata: dict, groups: tuple):
47-
"""
48-
:param metadata: dict of regex and associated metadata
49-
{'regex': <regex1>, 'name': 'iOS', 'version': '$1'}
50-
{'regex': <regex2>, 'name': 'Windows', 'version': '10'}
51-
:param groups: Tuple of groups from regex
52-
('Debian', None)
53-
('iOS', '8_2')
54-
"""
55-
self.metadata = metadata
56-
self.groups = groups
57-
58-
def get_value_from_regex(self, value: str) -> str:
59-
"""
60-
Model / Name values may be in format of
61-
$<int> or <NamePrefix> $<int>
62-
63-
'Xino Z$1 X$2
64-
65-
Replace %<int> section replaced with {} for format string
66-
67-
'Xino Z$1 X$2 -> 'Xino Z{} X{}'
68-
69-
Return interpolated string with value from regex group
70-
"""
71-
chars = []
72-
indices = []
73-
index_int_next = False
74-
75-
for char in value:
76-
if char != '$':
77-
if index_int_next:
78-
indices.append(int(char) - 1)
79-
index_int_next = False
80-
else:
81-
chars.append(char)
82-
else:
83-
chars.append('{}')
84-
index_int_next = True
85-
86-
# collect regex group values, substituting empty string for None
87-
group_values = []
88-
for pos in indices:
89-
try:
90-
if not self.groups[pos]:
91-
group_values.append('')
92-
else:
93-
group_values.append(self.groups[pos])
94-
except IndexError:
95-
return ''
96-
97-
fmt_string = ''.join(chars)
98-
return fmt_string.format(*group_values).strip()
99-
100-
def extract(self) -> str:
101-
value = str(self.metadata.get(self.key, ''))
102-
if value and '$' in value:
103-
return self.get_value_from_regex(value)
104-
return value
105-
106-
def __str__(self) -> str:
107-
return f'{self.__class__.__name__} Extractor'
108-
109-
def __repr__(self) -> str:
110-
return f'{self.__class__.__name__}({self.metadata}, {self.groups})'
111-
112-
11324
class ApplicationIDExtractor(RegexLoader):
11425
"""
11526
Extract App Store IDs such as:
@@ -195,42 +106,6 @@ def __repr__(self) -> str:
195106
return f'{self.__class__.__name__}({self.user_agent!r})'
196107

197108

198-
class NameExtractor(DataExtractor):
199-
key = 'name'
200-
201-
202-
class ModelExtractor(DataExtractor):
203-
key = 'model'
204-
205-
def extract(self) -> str:
206-
value = super().extract()
207-
if not value:
208-
return value
209-
210-
if value == 'Build':
211-
return ''
212-
213-
# normalize D510_TD / ETON-T730D_TD
214-
if value.endswith('_TD'):
215-
value = value[:-3]
216-
return value.replace('_', ' ').strip()
217-
218-
219-
class VersionExtractor(DataExtractor):
220-
key = 'version'
221-
222-
def extract(self) -> str:
223-
value = super().extract()
224-
if not value:
225-
return value
226-
227-
return value.replace('_', '.').strip('.')
228-
229-
230-
__all__ = (
109+
__all__ = [
231110
'ApplicationIDExtractor',
232-
'DataExtractor',
233-
'ModelExtractor',
234-
'NameExtractor',
235-
'VersionExtractor',
236-
)
111+
]

device_detector/parser/parser.py

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,9 @@
33
except ImportError:
44
from typing_extensions import Self
55

6+
from regex._regex_core import error as RegexError
67
from ..lazy_regex import RegexLazyIgnore
78
from .client_hints import ClientHints
8-
from .extractors import (
9-
NameExtractor,
10-
VersionExtractor,
11-
)
129
from ..yaml_loader import RegexLoader, app_pretty_names_types_data
1310

1411
# Match regexes that ONLY values like:
@@ -137,30 +134,40 @@ def extract_version(self) -> None:
137134
Extract the version if UA Yaml files specify version regexes.
138135
See oss.yml for example file structure.
139136
"""
140-
137+
user_agent = self.user_agent
141138
for version in self.ua_data.pop('versions', []):
142-
if version['regex'].search(self.user_agent):
143-
self.ua_data['version'] = version['version']
139+
if version_regex_match := version['regex'].search(user_agent):
140+
self.ua_data['version'] = perform_substitutions(
141+
version['version'], version_regex_match, '.'
142+
)
144143
return
145144

145+
self._set_data_from_field('version', '.')
146+
146147
def set_details(self) -> None:
147148
"""
148149
Override this method on subclasses.
149150
150151
Update fields with interpolated values from regex data
151152
"""
152-
groups = self.matched_regex and self.matched_regex.groups() or None
153-
if groups:
154-
if 'name' in self.ua_data:
155-
self.ua_data['name'] = NameExtractor(self.ua_data, groups).extract()
156-
157-
if 'version' in self.ua_data:
158-
self.ua_data['version'] = VersionExtractor(self.ua_data, groups).extract()
153+
if self.matched_regex:
154+
self._set_data_from_field('name', '.')
159155

160156
# no version should be considered valid if the name can't be parsed
161157
if not self.ua_data.get('name') and self.ua_data.get('version'):
162158
self.ua_data['version'] = ''
163159

160+
def _set_data_from_field(self, field: str, separator: str):
    """
    Expand regex placeholders stored in ``self.ua_data[field]``.

    Nothing happens when the field is missing or empty, or when its value
    contains no ``\\g<`` placeholder. Otherwise the value is replaced with
    the result of ``perform_substitutions`` applied to ``self.matched_regex``.

    :param field: key in ``self.ua_data`` to update in place
    :param separator: passed to ``perform_substitutions`` as the underscore
        replacement
    """
    template = self.ua_data.get(field, '')
    if not template or "\\g<" not in template:
        return

    self.ua_data[field] = perform_substitutions(template, self.matched_regex, separator)
170+
164171
def name(self) -> str:
165172
return self.ua_data.get('name', '')
166173

@@ -198,8 +205,24 @@ def __repr__(self) -> str:
198205
return f'{klass}({self.user_agent!r}, {self.ua_data!r})'
199206

200207

208+
def perform_substitutions(substring: str, regex_match, separator: str) -> str:
    r"""
    Substitute the captured value from the regex for the regex placeholder.

    The text matched by ``regex_match`` is run back through the pattern's
    ``sub`` with ``substring`` as the replacement template, so ``\g<N>``
    placeholders are filled from the capture groups.

    :param substring: replacement template, e.g. ``'\g<1>'``
    :param regex_match: match object produced by the pattern, or ``None``
        when no regex matched
    :param separator: string that replaces every underscore in the result
    :return: the expanded value with underscores replaced and surrounding
        spaces / dots stripped; ``substring`` unchanged when there is no
        match or the template cannot be processed
    """
    # Callers pass ``self.matched_regex`` straight through, and it is not
    # guaranteed to be set; fall back the same way as for a bad template.
    if regex_match is None:
        return substring

    try:
        # group(0) is the complete matched text.
        value = regex_match.re.sub(substring, regex_match.group(0))
    except RegexError:
        return substring

    # A placeholder that survives as literal text at the end of the value
    # (e.g. from an escaped backslash in the template) is cut off.
    if value.endswith(('\\g<1>', '\\g<2>')):
        value = value[: value.rfind('\\g<')]

    return value.replace('_', separator).strip(' .')
221+
222+
201223
__all__ = (
202224
'Parser',
225+
'perform_substitutions',
203226
'IPHONE_ONLY_UA',
204227
'ENDSWITH_DARWIN',
205228
)

device_detector/regexes/local/client/antivirus.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
- regex: ^cis_([\d\.]+)_
1616
name: Comodo Internet Security
17-
version: '$1'
17+
version: '\g<1>'
1818

1919
- regex: AV/,,ffl$
2020
name: Avast
@@ -26,11 +26,11 @@
2626

2727
- regex: AV/(\d+[\.\d]+)
2828
name: Avast
29-
version: '$1'
29+
version: '\g<1>'
3030

3131
- regex: CCleaner, ?(\d+[\.\d]+)
3232
name: CCleaner
33-
version: '$1'
33+
version: '\g<1>'
3434

3535
- regex: panda ?security ?nano
3636
name: Panda Security Nano

device_detector/regexes/local/client/browsers.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
# Avastium Browser - Avast browser based on Chrome
99
- regex: 'Avastium Chrome(?:\/(\d+[\.\d]+))'
1010
name: 'Avast SafeZone'
11-
version: '$1'
11+
version: '\g<1>'
1212
engine:
1313
default: 'WebKit'
1414

1515
- regex: 'avastium[ \/]\(?(\d+[\.\d]+)'
1616
name: 'Avast SafeZone'
17-
version: '$1'
17+
version: '\g<1>'
1818
engine:
1919
default: 'WebKit'

0 commit comments

Comments
 (0)