Merge pull request #118 from codeforboston/deep-link-reconstruction

prajwalbang · web-flow · commit bd72fe01b111 · 2026-06-09T20:35:39.000-04:00
Link Reconstruction
diff --git a/abstractions.py b/abstractions.py
@@ -25,6 +25,7 @@ class AdoptablePet:
     sex: str | None = None
     size_group: str | None = None
     pet_id: str | None = None
+    rescue_id: str | None = None  # shelter's own animal id (RescueGroups "rescueId")
 
 
 class PetSource(ABC):
diff --git a/adoption_sources/pet_links.py b/adoption_sources/pet_links.py
@@ -0,0 +1,108 @@
+"""Reconstruct deep links to an individual pet's adoption page.
+
+The RescueGroups API gives us an org's *landing* page (e.g.
+``https://sterlingshelter.org/``), not a link to the specific animal. Some orgs
+embed the RescueGroups "web toolkit" (``toolkit.rescuegroups.org/j/3/.../toolkit.js``),
+which renders a single animal when the page URL carries a hash fragment of the
+form::
+
+    <pet-finder page>#action_0=pet&animalID_0=<animal id>&petIndex_0=-1
+
+``animalID_0`` is the RescueGroups animal id -- the exact value the API returns
+as ``animal["id"]`` and we store as ``AdoptablePet.pet_id`` -- so we can rebuild
+the deep link without scraping. ``petIndex_0=-1`` marks the pet as standalone
+(not part of a browsed result list) so the widget renders that animal rather
+than the list. Orgs that do not use the toolkit can still be supported when
+their pet pages are keyed on an id the API gives us (e.g. MSPCA, which uses the
+shelter's own ``rescueId``, lowercased).
+
+We only reconstruct for orgs listed in ``PET_FINDER_TEMPLATES``; every other org
+falls back to whatever URL the API provided.
+"""
+
+from typing import Iterable
+from urllib.parse import urlparse
+
+# Domain -> (template, id_key). Keys are bare lowercase hosts without a leading
+# "www." (the form ``_domain_of`` returns); subdomains of a key also match.
+# Each org's pet page is reachable from one of the ids we get from the API:
+#   * "pet_id"          -- the RescueGroups numeric animal id (toolkit shelters).
+#   * "rescue_id_lower" -- the shelter's own animal id (RescueGroups "rescueId"),
+#                          lowercased (MSPCA's /pets/a######/ urls).
+# The template uses a single ``{id}`` placeholder filled with that id.
+#
+# Sterling & SmallDog embed the RescueGroups toolkit v3; the trailing
+# ``petIndex_0=-1`` is the toolkit's "standalone pet, not part of a browsed list"
+# sentinel -- without it the widget can show the full list instead of the animal.
+PET_FINDER_TEMPLATES: dict[str, tuple[str, str]] = {
+    "sterlingshelter.org": (
+        "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0={id}&petIndex_0=-1",
+        "pet_id",
+    ),
+    "smalldogrescuene.org": (
+        "https://www.smalldogrescuene.org/adoptable-dogs/#action_0=pet&animalID_0={id}&petIndex_0=-1",
+        "pet_id",
+    ),
+    "mspca.org": (
+        "https://www.mspca.org/pets/{id}/",
+        "rescue_id_lower",
+    ),
+}
+
+
+def _domain_of(url: str | None) -> str | None:
+    """Return the lowercased host of ``url`` without a leading ``www.``.
+
+    Userinfo (``user:pass@``) and port are dropped. Returns ``None`` when
+    ``url`` is empty, has no host (e.g. no scheme), or cannot be parsed.
+    """
+    if not url:
+        return None
+    try:
+        # ``hostname`` lowercases and strips user:pass@, :port and the brackets
+        # around an IPv6 literal, which manual splitting on ":" would mangle.
+        host = urlparse(url.strip()).hostname
+    except ValueError:
+        # Malformed authority (e.g. an unclosed "[") -- treat as "no domain"
+        # so one bad candidate url cannot abort link reconstruction.
+        return None
+    return host.removeprefix("www.") if host else None
+
+
+def _template_for_domain(domain: str | None) -> tuple[str, str] | None:
+    """Return the ``(template, id_key)`` entry for ``domain``, or ``None``."""
+    # Empty/None domains never match. A known org matches itself and any of
+    # its subdomains (e.g. adopt.sterlingshelter.org); the first entry wins.
+    for org_domain, spec in PET_FINDER_TEMPLATES.items():
+        if domain and (domain == org_domain or domain.endswith(f".{org_domain}")):
+            return spec
+    return None
+
+
+def is_supported_org(url: str | None) -> bool:
+    """Report whether ``url`` belongs to a shelter with a deep-link template."""
+    entry = _template_for_domain(_domain_of(url))
+    return entry is not None
+
+
+def reconstruct_adoption_url(
+    candidate_urls: Iterable[str | None],
+    pet_id: str | None,
+    rescue_id: str | None = None,
+) -> str | None:
+    """Build a deep link to a specific pet, or ``None`` if we can't.
+
+    Args:
+        candidate_urls: URLs from the API that might reveal the org's domain
+            (adoption URL, org adoption URL, org website, ...). Checked in order;
+            the first known org whose required id is available wins.
+        pet_id: The RescueGroups numeric animal id (``AdoptablePet.pet_id``).
+        rescue_id: The shelter's own animal id (``AdoptablePet.rescue_id`` /
+            RescueGroups "rescueId"), used by orgs like MSPCA.
+
+    Returns:
+        A reconstructed deep link, or ``None`` when no candidate domain is known
+        or the id that org's template needs is missing or blank.
+    """
+    # Ids come straight from the API: coerce to text and trim so a numeric or
+    # whitespace-padded value neither raises nor leaks spaces into the url.
+    pet_key = "" if pet_id is None else str(pet_id).strip()
+    rescue_key = "" if rescue_id is None else str(rescue_id).strip().lower()
+    ids = {"pet_id": pet_key, "rescue_id_lower": rescue_key}
+    for url in candidate_urls:
+        entry = _template_for_domain(_domain_of(url))
+        if not entry:
+            continue
+        template, id_key = entry
+        id_value = ids.get(id_key)
+        if id_value:
+            return template.format(id=id_value)
+        # Domain matched but we lack the id it needs -> fall back (try next url).
+    return None
diff --git a/adoption_sources/rescue_groups.py b/adoption_sources/rescue_groups.py
@@ -15,6 +15,7 @@
 from urllib3.util.retry import Retry
 
 from abstractions import AdoptablePet, PetSource
+from adoption_sources.pet_links import reconstruct_adoption_url
 from config import CITY_NAME, CITY_STATE, POSTAL_CODE
 
 logger = logging.getLogger(__name__)
@@ -164,12 +165,28 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
                 .get("id")
             )
             org_attrs = orgs_by_id.get(org_id, {}) if org_id else {}
+            url_candidates = (
+                attrs.get("adoptionUrl"),
+                org_attrs.get("adoptionUrl"),
+                org_attrs.get("url"),
+            )
             adoption_url = next(
-                (u for u in (attrs.get("adoptionUrl"), org_attrs.get("adoptionUrl"), org_attrs.get("url"))
+                (u for u in url_candidates
                  if u and u.strip().rstrip("/") not in ("http:", "https:", "http://", "https://")),
                 None
             )
 
+            # Shelter's own animal id (e.g. MSPCA's "A468573"); some orgs' deep
+            # links are keyed on this rather than the RescueGroups id.
+            rescue_id = attrs.get("rescueId")
+
+            # For shelters we have a template for, rebuild a deep link to this
+            # specific pet; otherwise keep the org landing page from above.
+            adoption_url = (
+                reconstruct_adoption_url(url_candidates, animal_id, rescue_id)
+                or adoption_url
+            )
+
             # Get best available image
             image_url = self._get_image_url(attrs)
 
@@ -189,6 +206,7 @@ def _parse_animal(self, animal: dict, orgs_by_id: dict) -> AdoptablePet | None:
                 sex=attrs.get("sex"),
                 size_group=attrs.get("sizeGroup"),
                 pet_id=animal_id,
+                rescue_id=rescue_id,
             )
         except Exception as e:
             logger.warning(f"Failed to parse animal {animal.get('id', 'unknown')}: {e}")
diff --git a/tests/test_pet_links.py b/tests/test_pet_links.py
@@ -0,0 +1,98 @@
+import unittest
+
+from adoption_sources.pet_links import _domain_of, reconstruct_adoption_url
+from adoption_sources.rescue_groups import SourceRescueGroups
+
+
+class DomainOfTests(unittest.TestCase):
+    """``_domain_of`` reduces a URL to its bare host, or ``None`` if it has none."""
+
+    def test_strips_www_and_scheme(self):
+        host = _domain_of("https://www.sterlingshelter.org/")
+        self.assertEqual(host, "sterlingshelter.org")
+
+    def test_keeps_subdomain_other_than_www(self):
+        host = _domain_of("https://adopt.sterlingshelter.org/x")
+        self.assertEqual(host, "adopt.sterlingshelter.org")
+
+    def test_drops_port(self):
+        host = _domain_of("https://sterlingshelter.org:8443/pet")
+        self.assertEqual(host, "sterlingshelter.org")
+
+    def test_none_and_empty(self):
+        # None, empty text and scheme-less text all lack a parseable host.
+        for unparseable in (None, "", "not a url"):
+            self.assertIsNone(_domain_of(unparseable))
+
+
+class ReconstructAdoptionUrlTests(unittest.TestCase):
+    """Deep links follow each org's template; anything else yields ``None``."""
+
+    def test_sterling_deep_link(self):
+        link = reconstruct_adoption_url(["https://sterlingshelter.org/"], "22506352")
+        self.assertEqual(
+            link,
+            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1",
+        )
+
+    def test_smalldog_deep_link(self):
+        link = reconstruct_adoption_url(["https://www.smalldogrescuene.org/"], "999")
+        self.assertEqual(
+            link,
+            "https://www.smalldogrescuene.org/adoptable-dogs/#action_0=pet&animalID_0=999&petIndex_0=-1",
+        )
+
+    def test_matches_via_subdomain(self):
+        link = reconstruct_adoption_url(["https://adopt.sterlingshelter.org/foo"], "1")
+        self.assertEqual(
+            link,
+            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=1&petIndex_0=-1",
+        )
+
+    def test_first_matching_candidate_wins(self):
+        # Unknown domain is skipped; the known one is used.
+        candidates = [None, "https://rescuegroups.org/foo", "https://sterlingshelter.org/"]
+        link = reconstruct_adoption_url(candidates, "42")
+        self.assertEqual(
+            link,
+            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=42&petIndex_0=-1",
+        )
+
+    def test_mspca_uses_lowercased_rescue_id(self):
+        link = reconstruct_adoption_url(
+            ["http://www.mspca.org/boston"], "22301016", rescue_id="A467410"
+        )
+        self.assertEqual(link, "https://www.mspca.org/pets/a467410/")
+
+    def test_mspca_without_rescue_id_returns_none(self):
+        # MSPCA's template needs rescue_id, not the RescueGroups pet_id.
+        link = reconstruct_adoption_url(["http://www.mspca.org/boston"], "22301016")
+        self.assertIsNone(link)
+
+    def test_unknown_domain_returns_none(self):
+        link = reconstruct_adoption_url(["https://www.example.org/adoption-search/"], "5")
+        self.assertIsNone(link)
+
+    def test_missing_pet_id_returns_none(self):
+        # Both a missing and an empty id leave the template unfillable.
+        for absent_id in (None, ""):
+            self.assertIsNone(
+                reconstruct_adoption_url(["https://sterlingshelter.org/"], absent_id)
+            )
+
+
+class ParseAnimalIntegrationTests(unittest.TestCase):
+    """The deep link should be applied end-to-end in SourceRescueGroups."""
+
+    def setUp(self):
+        self.source = SourceRescueGroups(api_key="dummy")
+
+    def _animal(self, **extra_attrs):
+        """Return a minimal API animal payload; ``extra_attrs`` extend its attributes."""
+        return {
+            "type": "animals",
+            "id": "22506352",
+            "attributes": {"name": "Ketchup", "breedString": "Lab Mix", **extra_attrs},
+            "relationships": {"orgs": {"data": [{"type": "orgs", "id": "org1"}]}},
+        }
+
+    def test_toolkit_org_gets_deep_link(self):
+        orgs = {"org1": {"city": "Sterling", "state": "MA", "url": "https://sterlingshelter.org/"}}
+        pet = self.source._parse_animal(self._animal(), orgs)
+        self.assertEqual(
+            pet.adoption_url,
+            "https://sterlingshelter.org/pet-finder/#action_0=pet&animalID_0=22506352&petIndex_0=-1",
+        )
+
+    def test_mspca_with_rescue_id_gets_deep_link(self):
+        # MSPCA's template is keyed on the shelter's own id, lowercased.
+        orgs = {"org1": {"city": "Boston", "state": "MA", "url": "https://www.mspca.org/"}}
+        pet = self.source._parse_animal(self._animal(rescueId="A467410"), orgs)
+        self.assertEqual(pet.adoption_url, "https://www.mspca.org/pets/a467410/")
+        self.assertEqual(pet.rescue_id, "A467410")
+
+    def test_mspca_without_rescue_id_keeps_landing_url(self):
+        # MSPCA has a template, but it needs rescueId; this payload has none,
+        # so the org landing page from the API must be kept.
+        orgs = {"org1": {"city": "Boston", "state": "MA", "url": "https://www.mspca.org/"}}
+        pet = self.source._parse_animal(self._animal(), orgs)
+        self.assertEqual(pet.adoption_url, "https://www.mspca.org/")
+
+
+if __name__ == "__main__":
+    unittest.main()