Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions abstractions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class AdoptablePet:
sex: str | None = None
size_group: str | None = None
pet_id: str | None = None
rescue_id: str | None = None # shelter's own animal id (RescueGroups "rescueId")


class PetSource(ABC):
Expand Down
108 changes: 108 additions & 0 deletions adoption_sources/pet_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Reconstruct deep links to an individual pet's adoption page.

The RescueGroups API gives us an org's *landing* page (e.g.
``https://sterlingshelter.org/``), not a link to the specific animal. Some orgs
embed the RescueGroups "web toolkit" (``toolkit.rescuegroups.org/j/3/.../toolkit.js``),
which renders a single animal when the page URL carries a hash fragment of the
form::

<pet-finder page>#action_0=pet&animalID_0=<animal id>

``animalID_0`` is the RescueGroups animal id -- the exact value the API returns
as ``animal["id"]`` and we store as ``AdoptablePet.pet_id`` -- so we can rebuild
the deep link without scraping. (``petIndex_0`` normally drives next/prev nav
within a result list; our templates append ``petIndex_0=-1`` to mark the pet as
standalone, since without it the widget can show the full list instead.)

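Not every supported org uses the toolkit: MSPCA's pet pages are keyed on the
shelter's own animal id (RescueGroups "rescueId", lowercased) instead. We only
reconstruct for orgs we have verified and listed in ``PET_FINDER_TEMPLATES``;
every other org falls back to whatever URL the API provided.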
"""

from typing import Iterable
from urllib.parse import quote, urlparse

# Deep-link recipes keyed by org domain; each value is ``(template, id_key)``.
# ``id_key`` names which API-provided id fills the template's single ``{id}``
# placeholder:
#   * "pet_id"          -- the RescueGroups numeric animal id (toolkit shelters).
#   * "rescue_id_lower" -- the shelter's own animal id (RescueGroups "rescueId"),
#                          lowercased (MSPCA's /pets/a######/ urls).
#
# Sterling & SmallDog embed the RescueGroups toolkit v3. Their templates end in
# ``petIndex_0=-1``, the toolkit's "standalone pet, not part of a browsed list"
# sentinel -- without it the widget can show the full list instead of the animal.
PET_FINDER_TEMPLATES: dict[str, tuple[str, str]] = {
    org_domain: (url_template, id_key)
    for org_domain, url_template, id_key in (
        (
            "sterlingshelter.org",
            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0={id}&petIndex_0=-1",
            "pet_id",
        ),
        (
            "smalldogrescuene.org",
            "https://www.smalldogrescuene.org/adoptable-dogs/#action_0=pet&animalID_0={id}&petIndex_0=-1",
            "pet_id",
        ),
        (
            "mspca.org",
            "https://www.mspca.org/pets/{id}/",
            "rescue_id_lower",
        ),
    )
}


def _domain_of(url: str | None) -> str | None:
"""Return the lowercased host of ``url`` without a leading ``www.``."""
if not url:
return None
netloc = urlparse(url.strip()).netloc.lower()
if not netloc:
return None
# Drop any user:pass@ and :port, then a leading www.
netloc = netloc.rsplit("@", 1)[-1].split(":", 1)[0]
return netloc[4:] if netloc.startswith("www.") else netloc


def _template_for_domain(domain: str | None) -> tuple[str, str] | None:
if not domain:
return None
# Exact match or any subdomain of a known org (e.g. adopt.sterlingshelter.org).
for known, entry in PET_FINDER_TEMPLATES.items():
if domain == known or domain.endswith("." + known):
return entry
return None


def is_supported_org(url: str | None) -> bool:
    """Report whether ``url`` points at a shelter we hold a deep-link template for.

    The URL's host is extracted with ``_domain_of`` and looked up with
    ``_template_for_domain``; the result is ``True`` only when that lookup
    finds an entry.
    """
    org_domain = _domain_of(url)
    template_entry = _template_for_domain(org_domain)
    return template_entry is not None


def _clean_id(raw: object) -> str | None:
    """Return ``raw`` as a stripped string, or ``None`` if it is empty/blank."""
    text = str(raw).strip() if raw else ""
    return text or None


def reconstruct_adoption_url(
    candidate_urls: Iterable[str | None],
    pet_id: str | None,
    rescue_id: str | None = None,
) -> str | None:
    """Build a deep link to a specific pet, or ``None`` if we can't.

    Args:
        candidate_urls: URLs from the API that might reveal the org's domain
            (adoption URL, org adoption URL, org website, ...). Checked in
            order; the first one whose domain matches a known org *and* whose
            required id is available wins.
        pet_id: The RescueGroups numeric animal id (``AdoptablePet.pet_id``).
        rescue_id: The shelter's own animal id (``AdoptablePet.rescue_id`` /
            RescueGroups "rescueId"), used by orgs like MSPCA.

    Returns:
        A reconstructed deep link, or ``None`` when no candidate domain is known
        or the id that org's template needs is missing or blank.
    """
    # Ids come straight from the API: trim stray whitespace and treat a blank
    # value as missing, so we never emit a link like ".../pets/ /".
    cleaned_rescue_id = _clean_id(rescue_id)
    ids = {
        "pet_id": _clean_id(pet_id),
        "rescue_id_lower": cleaned_rescue_id.lower() if cleaned_rescue_id else None,
    }
    for url in candidate_urls:
        entry = _template_for_domain(_domain_of(url))
        if not entry:
            continue
        template, id_key = entry
        id_value = ids.get(id_key)
        if id_value:
            # Percent-encode so an id containing "/", "&", "#" or spaces cannot
            # alter the path or the toolkit's hash-fragment parameters.
            # Plain alphanumeric ids pass through unchanged.
            return template.format(id=quote(id_value, safe=""))
        # Domain matched but we lack the id it needs -> fall back (try next url).
    return None
20 changes: 19 additions & 1 deletion adoption_sources/rescue_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import requests

from abstractions import AdoptablePet, PetSource
from adoption_sources.pet_links import reconstruct_adoption_url
from config import CITY_NAME, CITY_STATE, POSTAL_CODE

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -140,12 +141,28 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
.get("id")
)
org_attrs = orgs_by_id.get(org_id, {}) if org_id else {}
url_candidates = (
attrs.get("adoptionUrl"),
org_attrs.get("adoptionUrl"),
org_attrs.get("url"),
)
adoption_url = next(
(u for u in (attrs.get("adoptionUrl"), org_attrs.get("adoptionUrl"), org_attrs.get("url"))
(u for u in url_candidates
if u and u.strip().rstrip("/") not in ("http:", "https:", "http://", "https://")),
None
)

# Shelter's own animal id (e.g. MSPCA's "A468573"); some orgs' deep
# links are keyed on this rather than the RescueGroups id.
rescue_id = attrs.get("rescueId")

# For shelters we have a template for, rebuild a deep link to this
# specific pet; otherwise keep the org landing page from above.
adoption_url = (
reconstruct_adoption_url(url_candidates, animal_id, rescue_id)
or adoption_url
)

# Get best available image
image_url = self._get_image_url(attrs)

Expand All @@ -165,6 +182,7 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
sex=attrs.get("sex"),
size_group=attrs.get("sizeGroup"),
pet_id=animal_id,
rescue_id=rescue_id,
)
except Exception as e:
logger.warning(f"Failed to parse animal {animal.get('id', 'unknown')}: {e}")
Expand Down
98 changes: 98 additions & 0 deletions tests/test_pet_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import unittest

from adoption_sources.pet_links import _domain_of, reconstruct_adoption_url
from adoption_sources.rescue_groups import SourceRescueGroups


class DomainOfTests(unittest.TestCase):
    """Unit tests for ``_domain_of`` host extraction."""

    def test_strips_www_and_scheme(self):
        # Scheme and a leading "www." are both removed.
        host = _domain_of("https://www.sterlingshelter.org/")
        self.assertEqual(host, "sterlingshelter.org")

    def test_keeps_subdomain_other_than_www(self):
        # Only "www." is special-cased; other subdomains survive intact.
        host = _domain_of("https://adopt.sterlingshelter.org/x")
        self.assertEqual(host, "adopt.sterlingshelter.org")

    def test_drops_port(self):
        host = _domain_of("https://sterlingshelter.org:8443/pet")
        self.assertEqual(host, "sterlingshelter.org")

    def test_none_and_empty(self):
        # Inputs with no extractable host all map to None.
        for hostless in (None, "", "not a url"):
            with self.subTest(url=hostless):
                self.assertIsNone(_domain_of(hostless))


class ReconstructAdoptionUrlTests(unittest.TestCase):
    """Unit tests for ``reconstruct_adoption_url`` across the known org templates."""

    def test_sterling_deep_link(self):
        link = reconstruct_adoption_url(["https://sterlingshelter.org/"], "22506352")
        self.assertEqual(
            link,
            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1",
        )

    def test_smalldog_deep_link(self):
        link = reconstruct_adoption_url(["https://www.smalldogrescuene.org/"], "999")
        self.assertEqual(
            link,
            "https://www.smalldogrescuene.org/adoptable-dogs/#action_0=pet&animalID_0=999&petIndex_0=-1",
        )

    def test_matches_via_subdomain(self):
        # A subdomain of a known org resolves to that org's template.
        link = reconstruct_adoption_url(["https://adopt.sterlingshelter.org/foo"], "1")
        self.assertEqual(
            link,
            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=1&petIndex_0=-1",
        )

    def test_first_matching_candidate_wins(self):
        # Unknown domain is skipped; the known one is used.
        candidates = [None, "https://rescuegroups.org/foo", "https://sterlingshelter.org/"]
        link = reconstruct_adoption_url(candidates, "42")
        self.assertEqual(
            link,
            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=42&petIndex_0=-1",
        )

    def test_mspca_uses_lowercased_rescue_id(self):
        link = reconstruct_adoption_url(
            ["http://www.mspca.org/boston"], "22301016", rescue_id="A467410"
        )
        self.assertEqual(link, "https://www.mspca.org/pets/a467410/")

    def test_mspca_without_rescue_id_returns_none(self):
        # MSPCA's template needs rescue_id, not the RescueGroups pet_id.
        link = reconstruct_adoption_url(["http://www.mspca.org/boston"], "22301016")
        self.assertIsNone(link)

    def test_unknown_domain_returns_none(self):
        link = reconstruct_adoption_url(["https://www.example.org/adoption-search/"], "5")
        self.assertIsNone(link)

    def test_missing_pet_id_returns_none(self):
        # Both None and the empty string count as "no id".
        for absent_id in (None, ""):
            with self.subTest(pet_id=absent_id):
                self.assertIsNone(
                    reconstruct_adoption_url(["https://sterlingshelter.org/"], absent_id)
                )


class ParseAnimalIntegrationTests(unittest.TestCase):
    """The deep link should be applied end-to-end in SourceRescueGroups."""

    def setUp(self):
        self.source = SourceRescueGroups(api_key="dummy")

    def _animal(self):
        """Build a minimal API animal record linked to org ``org1``.

        The attributes deliberately carry no ``rescueId`` and no
        ``adoptionUrl``, so any URL must come from the org record.
        """
        attributes = {"name": "Ketchup", "breedString": "Lab Mix"}
        org_link = {"type": "orgs", "id": "org1"}
        return {
            "type": "animals",
            "id": "22506352",
            "attributes": attributes,
            "relationships": {"orgs": {"data": [org_link]}},
        }

    def test_toolkit_org_gets_deep_link(self):
        orgs = {
            "org1": {"city": "Sterling", "state": "MA", "url": "https://sterlingshelter.org/"},
        }
        pet = self.source._parse_animal(self._animal(), orgs)
        expected = (
            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1"
        )
        self.assertEqual(pet.adoption_url, expected)

    def test_non_toolkit_org_keeps_landing_url(self):
        # MSPCA's template is keyed on the shelter's rescueId, which this
        # fixture animal does not have, so the org landing page is kept.
        orgs = {
            "org1": {"city": "Boston", "state": "MA", "url": "https://www.mspca.org/"},
        }
        pet = self.source._parse_animal(self._animal(), orgs)
        self.assertEqual(pet.adoption_url, "https://www.mspca.org/")


# Allow running this test module directly as a script, in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()