Skip to content

Commit bd72fe0

Browse files
authored
Merge pull request #118 from codeforboston/deep-link-reconstruction
Link Reconstruction
2 parents 5eb8894 + 8351041 commit bd72fe0

4 files changed

Lines changed: 226 additions & 1 deletion

File tree

abstractions.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class AdoptablePet:
2525
sex: str | None = None
2626
size_group: str | None = None
2727
pet_id: str | None = None
28+
rescue_id: str | None = None # shelter's own animal id (RescueGroups "rescueId")
2829

2930

3031
class PetSource(ABC):

adoption_sources/pet_links.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""Reconstruct deep links to an individual pet's adoption page.
2+
3+
The RescueGroups API gives us an org's *landing* page (e.g.
4+
``https://sterlingshelter.org/``), not a link to the specific animal. Some orgs
5+
embed the RescueGroups "web toolkit" (``toolkit.rescuegroups.org/j/3/.../toolkit.js``),
6+
which renders a single animal when the page URL carries a hash fragment of the
7+
form::
8+
9+
<pet-finder page>#action_0=pet&animalID_0=<animal id>
10+
11+
``animalID_0`` is the RescueGroups animal id -- the exact value the API returns
12+
as ``animal["id"]`` and we store as ``AdoptablePet.pet_id`` -- so we can rebuild
13+
the deep link without scraping. (``petIndex_0`` normally drives next/prev nav within
14+
a result list; our templates pass ``petIndex_0=-1`` so the widget shows just that animal.)
15+
16+
We only reconstruct for orgs we have a verified link template for (toolkit
17+
shelters, plus MSPCA's own ``/pets/<id>/`` pages); every other org falls back to whatever URL the API provided.
18+
"""
19+
20+
from typing import Iterable
21+
from urllib.parse import urlparse
22+
23+
# Domain -> (template, id_key). Each org's pet page is reachable from one of the
24+
# ids we get from the API:
25+
# * "pet_id" -- the RescueGroups numeric animal id (toolkit shelters).
26+
# * "rescue_id_lower" -- the shelter's own animal id (RescueGroups "rescueId"),
27+
# lowercased (MSPCA's /pets/a######/ urls).
28+
# The template uses a single ``{id}`` placeholder filled with that id.
29+
#
30+
# Sterling & SmallDog embed the RescueGroups toolkit v3; the trailing
31+
# ``petIndex_0=-1`` is the toolkit's "standalone pet, not part of a browsed list"
32+
# sentinel -- without it the widget can show the full list instead of the animal.
33+
# Hash fragment shared by the RescueGroups-toolkit shelters: it asks the widget
# for a single pet, filled in via ``{id}``, with ``petIndex_0=-1`` appended.
_TOOLKIT_PET_FRAGMENT = "#action_0=pet&animalID_0={id}&petIndex_0=-1"

# Bare org domain -> (url_template, id_key). ``id_key`` names which id is
# substituted for the template's single ``{id}`` placeholder.
PET_FINDER_TEMPLATES: dict[str, tuple[str, str]] = {
    "sterlingshelter.org": (
        "https://sterlingshelter.org/pet-finder/" + _TOOLKIT_PET_FRAGMENT,
        "pet_id",
    ),
    "smalldogrescuene.org": (
        "https://www.smalldogrescuene.org/adoptable-dogs/" + _TOOLKIT_PET_FRAGMENT,
        "pet_id",
    ),
    "mspca.org": (
        "https://www.mspca.org/pets/{id}/",
        "rescue_id_lower",
    ),
}
47+
48+
49+
def _domain_of(url: str | None) -> str | None:
50+
"""Return the lowercased host of ``url`` without a leading ``www.``."""
51+
if not url:
52+
return None
53+
netloc = urlparse(url.strip()).netloc.lower()
54+
if not netloc:
55+
return None
56+
# Drop any user:pass@ and :port, then a leading www.
57+
netloc = netloc.rsplit("@", 1)[-1].split(":", 1)[0]
58+
return netloc[4:] if netloc.startswith("www.") else netloc
59+
60+
61+
def _template_for_domain(domain: str | None) -> tuple[str, str] | None:
    """Find the ``(template, id_key)`` entry that applies to ``domain``.

    A known org matches when ``domain`` equals its registered domain or is a
    subdomain of it (e.g. ``adopt.sterlingshelter.org``). Entries are tried in
    the mapping's insertion order and the first match is returned.

    Returns:
        The matching entry, or ``None`` for an empty/``None`` domain or when no
        known org matches.
    """
    if not domain:
        return None
    matching_entries = (
        org_entry
        for org_domain, org_entry in PET_FINDER_TEMPLATES.items()
        if domain == org_domain or domain.endswith("." + org_domain)
    )
    return next(matching_entries, None)
69+
70+
71+
def is_supported_org(url: str | None) -> bool:
    """Report whether ``url`` belongs to a shelter with a deep-link template."""
    host = _domain_of(url)
    matched_entry = _template_for_domain(host)
    return matched_entry is not None
74+
75+
76+
def reconstruct_adoption_url(
77+
candidate_urls: Iterable[str | None],
78+
pet_id: str | None,
79+
rescue_id: str | None = None,
80+
) -> str | None:
81+
"""Build a deep link to a specific pet, or ``None`` if we can't.
82+
83+
Args:
84+
candidate_urls: URLs from the API that might reveal the org's domain
85+
(adoption URL, org adoption URL, org website, ...). Checked in order;
86+
the first whose domain matches a known org wins.
87+
pet_id: The RescueGroups numeric animal id (``AdoptablePet.pet_id``).
88+
rescue_id: The shelter's own animal id (``AdoptablePet.rescue_id`` /
89+
RescueGroups "rescueId"), used by orgs like MSPCA.
90+
91+
Returns:
92+
A reconstructed deep link, or ``None`` when no candidate domain is known
93+
or the id that org's template needs is missing.
94+
"""
95+
ids = {
96+
"pet_id": pet_id or None,
97+
"rescue_id_lower": rescue_id.lower() if rescue_id else None,
98+
}
99+
for url in candidate_urls:
100+
entry = _template_for_domain(_domain_of(url))
101+
if not entry:
102+
continue
103+
template, id_key = entry
104+
id_value = ids.get(id_key)
105+
if id_value:
106+
return template.format(id=id_value)
107+
# Domain matched but we lack the id it needs -> fall back (try next url).
108+
return None

adoption_sources/rescue_groups.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from urllib3.util.retry import Retry
1616

1717
from abstractions import AdoptablePet, PetSource
18+
from adoption_sources.pet_links import reconstruct_adoption_url
1819
from config import CITY_NAME, CITY_STATE, POSTAL_CODE
1920

2021
logger = logging.getLogger(__name__)
@@ -164,12 +165,28 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
164165
.get("id")
165166
)
166167
org_attrs = orgs_by_id.get(org_id, {}) if org_id else {}
168+
url_candidates = (
169+
attrs.get("adoptionUrl"),
170+
org_attrs.get("adoptionUrl"),
171+
org_attrs.get("url"),
172+
)
167173
adoption_url = next(
168-
(u for u in (attrs.get("adoptionUrl"), org_attrs.get("adoptionUrl"), org_attrs.get("url"))
174+
(u for u in url_candidates
169175
if u and u.strip().rstrip("/") not in ("http:", "https:", "http://", "https://")),
170176
None
171177
)
172178

179+
# Shelter's own animal id (e.g. MSPCA's "A468573"); some orgs' deep
180+
# links are keyed on this rather than the RescueGroups id.
181+
rescue_id = attrs.get("rescueId")
182+
183+
# For shelters we have a template for, rebuild a deep link to this
184+
# specific pet; otherwise keep the org landing page from above.
185+
adoption_url = (
186+
reconstruct_adoption_url(url_candidates, animal_id, rescue_id)
187+
or adoption_url
188+
)
189+
173190
# Get best available image
174191
image_url = self._get_image_url(attrs)
175192

@@ -189,6 +206,7 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
189206
sex=attrs.get("sex"),
190207
size_group=attrs.get("sizeGroup"),
191208
pet_id=animal_id,
209+
rescue_id=rescue_id,
192210
)
193211
except Exception as e:
194212
logger.warning(f"Failed to parse animal {animal.get('id', 'unknown')}: {e}")

tests/test_pet_links.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import unittest
2+
3+
from adoption_sources.pet_links import _domain_of, reconstruct_adoption_url
4+
from adoption_sources.rescue_groups import SourceRescueGroups
5+
6+
7+
class DomainOfTests(unittest.TestCase):
    """Checks how ``_domain_of`` reduces a URL to a bare host."""

    def test_strips_www_and_scheme(self):
        host = _domain_of("https://www.sterlingshelter.org/")
        self.assertEqual(host, "sterlingshelter.org")

    def test_keeps_subdomain_other_than_www(self):
        host = _domain_of("https://adopt.sterlingshelter.org/x")
        self.assertEqual(host, "adopt.sterlingshelter.org")

    def test_drops_port(self):
        host = _domain_of("https://sterlingshelter.org:8443/pet")
        self.assertEqual(host, "sterlingshelter.org")

    def test_none_and_empty(self):
        # None, the empty string and scheme-less text all yield no domain.
        for unusable in (None, "", "not a url"):
            self.assertIsNone(_domain_of(unusable))
21+
22+
23+
class ReconstructAdoptionUrlTests(unittest.TestCase):
    """Checks the deep links ``reconstruct_adoption_url`` builds per shelter."""

    def test_sterling_deep_link(self):
        link = reconstruct_adoption_url(["https://sterlingshelter.org/"], "22506352")
        expected = "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1"
        self.assertEqual(link, expected)

    def test_smalldog_deep_link(self):
        link = reconstruct_adoption_url(["https://www.smalldogrescuene.org/"], "999")
        expected = "https://www.smalldogrescuene.org/adoptable-dogs/#action_0=pet&animalID_0=999&petIndex_0=-1"
        self.assertEqual(link, expected)

    def test_matches_via_subdomain(self):
        link = reconstruct_adoption_url(["https://adopt.sterlingshelter.org/foo"], "1")
        expected = "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=1&petIndex_0=-1"
        self.assertEqual(link, expected)

    def test_first_matching_candidate_wins(self):
        # None and the unknown domain are skipped; the known one is used.
        candidates = [None, "https://rescuegroups.org/foo", "https://sterlingshelter.org/"]
        link = reconstruct_adoption_url(candidates, "42")
        expected = "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=42&petIndex_0=-1"
        self.assertEqual(link, expected)

    def test_mspca_uses_lowercased_rescue_id(self):
        link = reconstruct_adoption_url(
            ["http://www.mspca.org/boston"], "22301016", rescue_id="A467410"
        )
        self.assertEqual(link, "https://www.mspca.org/pets/a467410/")

    def test_mspca_without_rescue_id_returns_none(self):
        # MSPCA's template is keyed on rescue_id; the RescueGroups pet_id alone
        # is not enough to build a link.
        link = reconstruct_adoption_url(["http://www.mspca.org/boston"], "22301016")
        self.assertIsNone(link)

    def test_unknown_domain_returns_none(self):
        link = reconstruct_adoption_url(["https://www.example.org/adoption-search/"], "5")
        self.assertIsNone(link)

    def test_missing_pet_id_returns_none(self):
        # Both None and the empty string count as "no id".
        for absent_id in (None, ""):
            link = reconstruct_adoption_url(["https://sterlingshelter.org/"], absent_id)
            self.assertIsNone(link)
67+
68+
69+
class ParseAnimalIntegrationTests(unittest.TestCase):
    """End-to-end checks that ``SourceRescueGroups._parse_animal`` applies deep links."""

    def setUp(self):
        self.source = SourceRescueGroups(api_key="dummy")

    def _animal(self):
        """Return a minimal API animal record linked to org ``org1`` (no rescueId)."""
        attributes = {"name": "Ketchup", "breedString": "Lab Mix"}
        org_refs = [{"type": "orgs", "id": "org1"}]
        return {
            "type": "animals",
            "id": "22506352",
            "attributes": attributes,
            "relationships": {"orgs": {"data": org_refs}},
        }

    def test_toolkit_org_gets_deep_link(self):
        orgs = {"org1": {"city": "Sterling", "state": "MA", "url": "https://sterlingshelter.org/"}}
        pet = self.source._parse_animal(self._animal(), orgs)
        expected = "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1"
        self.assertEqual(pet.adoption_url, expected)

    def test_non_toolkit_org_keeps_landing_url(self):
        # The fixture animal carries no rescueId, so MSPCA's rescue-id-keyed
        # template cannot be filled and the org's landing page is kept.
        orgs = {"org1": {"city": "Boston", "state": "MA", "url": "https://www.mspca.org/"}}
        pet = self.source._parse_animal(self._animal(), orgs)
        self.assertEqual(pet.adoption_url, "https://www.mspca.org/")
95+
96+
97+
if __name__ == "__main__":
98+
unittest.main()

0 commit comments

Comments
 (0)