-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproduct_patterns.py
More file actions
77 lines (75 loc) · 2.59 KB
/
product_patterns.py
File metadata and controls
77 lines (75 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Regular-expression source strings for URL fragments, bucketed by category.
# Each key maps to an ordered list of patterns; none of them is anchored, so
# how strictly they match depends on the regex call made by the consumer
# (not visible in this file).
# NOTE(review): judging by the name, URLs matching these are presumably
# skipped as non-product pages -- confirm against the caller.
DEAD_END_PATTERNS = {
    # Sign-in / registration, account, basket and sign-out paths.
    'account_auth': [
        r'/login', r'/signin', r'/sign-in',
        r'/register', r'/signup', r'/sign-up',
        r'/account', r'/profile', r'/my-account', r'/user', r'/member',
        r'/checkout', r'/cart', r'/basket', r'/bag',
        r'/wishlist', r'/favorites',
        r'/logout', r'/signout', r'/sign-out',
    ],
    # Terms, privacy and policy paths.
    'legal_info': [
        r'/terms', r'/privacy', r'/policy', r'/legal', r'/disclaimer',
        r'/cookies', r'/gdpr', r'/compliance',
        r'/terms-of-service', r'/privacy-policy',
        r'/return-policy', r'/shipping-policy',
    ],
    # Corporate, news and help paths.
    'company_info': [
        r'/about', r'/contact', r'/careers', r'/jobs', r'/investors',
        r'/press', r'/media', r'/news', r'/blog',
        r'/help', r'/support', r'/faq', r'/customer-service',
        r'/team', r'/company',
    ],
    # Endpoint-style paths followed by file-extension patterns.
    # NOTE(review): the extension patterns have no end anchor, so r'\.js'
    # also matches inside '.json' unless the consumer uses a full match.
    'api_technical': [
        r'/api/', r'/ajax/', r'/json/', r'/xml/', r'/rss/', r'/feed/',
        r'/webhook', r'/callback', r'/oauth', r'/auth/', r'/token',
        r'\.css', r'\.js', r'\.json', r'\.xml', r'\.txt', r'\.pdf',
        r'\.jpg', r'\.jpeg', r'\.png', r'\.gif', r'\.svg', r'\.ico',
        r'\.woff', r'\.ttf', r'\.eot',
    ],
    # Administration / back-office paths.
    'admin_backend': [
        r'/admin', r'/dashboard', r'/cms', r'/wp-admin',
        r'/backend', r'/manage', r'/control-panel', r'/administrator',
    ],
    # Search, filtering and comparison paths.
    'search_filter': [
        r'/search', r'/filter', r'/sort', r'/compare',
        r'/reviews-only', r'/questions', r'/q&a',
        r'/specifications-only',
    ],
    # Action-style paths (cart, sharing, order tracking, subscriptions, ...).
    'non_product_actions': [
        r'/add-to-cart', r'/buy-now', r'/quick-view', r'/share',
        r'/email-friend', r'/track-order', r'/order-status',
        r'/download', r'/subscribe', r'/unsubscribe',
        r'/reviews',
    ],
}
# Ordered list of regular-expression source strings describing URL shapes.
# The order is kept exactly as authored: several later entries are broader
# than earlier ones, so a first-match consumer would be order-sensitive.
# NOTE(review): judging by the name, these presumably identify product
# detail pages -- confirm against the code that applies them.
PRODUCT_URL_PATTERNS = [
    # Paths ending in a 10-character [A-Z0-9] identifier.
    r'/dp/[A-Z0-9]{10}',
    r'/gp/product/[A-Z0-9]{10}',
    r'/exec/obidos/ASIN/[A-Z0-9]{10}',
    r'/product-reviews/[A-Z0-9]{10}',
    r'/[^/]+/dp/[A-Z0-9]{10}',

    # Short prefixes followed by a numeric id.
    r'/itm/[0-9]+',
    r'/p/[0-9]+',
    r'/i/[0-9]+',

    # <prefix>/<slug>/<numeric id> shapes.
    r'/deals/[^/]+/[0-9]+',
    r'/ip/[^/]+/[0-9]+',
    r'/product/[^/]+/[0-9]+',
    r'/grocery/ip/[^/]+/[0-9]+',

    # <prefix>/<slug>/-/A-<numeric id> shapes.
    r'/p/[^/]+/-/A-[0-9]+',
    r'/product/[^/]+/-/A-[0-9]+',

    # "listing" paths with a numeric id.
    r'/listing/[0-9]+',
    r'/[^/]+/listing/[0-9]+',

    # "products" paths, optionally nested under a collection.
    r'/products/[^/?]+',
    r'/collections/[^/]+/products/[^/?]+',

    # Paths ending in <numeric id>.html.
    r'/item/[0-9]+\.html',
    r'/store/product/[^/]+/[0-9]+\.html',
    r'/product-detail/[^/]+_[0-9]+\.html',
    r'/p/[^/]+/[0-9]+\.html',

    # Generic patterns
    r'/product[s]?/[^/?]+',
    r'/item[s]?/[^/?]+',
    r'/p/[^/?]+',
    r'/goods/[^/?]+',
    r'/detail/[^/?]+',
    r'/product-[0-9]+',
    r'/item-[0-9]+',
    r'/[^/]+-p[0-9]+',
    r'/sku[/-][0-9A-Za-z]+',
    r'/catalog/product/view/id/[0-9]+',

    # Shapes that carry the id in the query string.
    r'/[^/]+\.html\?.*product.*id=\d+',
    r'/product_info\.php\?products_id=\d+',
]