From 4b9ff241ddade0cfa6f3841ab26f60560c22ea57 Mon Sep 17 00:00:00 2001
From: William Palin <bill@free.law>
Date: Thu, 7 May 2026 14:02:00 -0400
Subject: [PATCH] fix(nyappdiv): Fix nyappdiv_2nd html parsing

---
 .../united_states/state/nyappdiv_2nd.py       | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
diff --git a/juriscraper/opinions/united_states/state/nyappdiv_2nd.py b/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
index 2f7a493af..c113c5fa2 100644
--- a/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
+++ b/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
@@ -6,6 +6,10 @@
 # Date: 2014-07-04
 from datetime import date
 
+import nh3
+from lxml import etree
+from lxml.html import fromstring, tostring
+
 from juriscraper.opinions.united_states.state import nyappdiv_1st
 
 
@@ -13,3 +17,47 @@ class Site(nyappdiv_1st.Site):
     first_opinion_date = date(2003, 9, 25)
     days_interval = 30
     court = "App Div, 2d Dept"
+
+    @staticmethod
+    def cleanup_content(content: bytes) -> bytes:
+        """Normalize downloaded HTML so unchanged opinions hash identically
+
+        Removes <a> tags (doctor strips them anyway) and the content of
+        nav/footer/header/script/style, then keeps only <main> when present.
+        Input that is not UTF-8 HTML, or cleans to nothing, is returned as is.
+
+        :param content: downloaded content `r.content`
+        :return: content without hash altering elements
+        """
+        try:
+            html_str = content.decode("utf-8")
+        except UnicodeDecodeError:
+            return content
+
+        if not nh3.is_html(html_str):
+            return content
+
+        # boilerplate that changes between requests is dropped with its
+        # content; nh3 requires these tags to be absent from `tags`
+        clean_content_tags = {"script", "style", "nav", "footer", "header"}
+        allowed = set(nh3.ALLOWED_TAGS) - clean_content_tags - {"a"}
+        allowed.add("main")
+        cleaned = nh3.clean(
+            html_str,
+            tags=allowed,
+            clean_content_tags=clean_content_tags,
+        )
+        if not cleaned.strip():  # lxml raises ParserError on empty input
+            return content
+
+        tree = fromstring(cleaned)
+        main = tree.xpath('//main[@id="main"]') or tree.xpath("//main")
+        if main:
+            new_tree = etree.Element("html")
+            body = etree.SubElement(new_tree, "body")
+            main[0].tail = None  # append() would carry trailing text along
+            body.append(main[0])
+            tree = new_tree
+
+        normalized_html = tostring(tree, encoding="unicode", method="html")
+        return normalized_html.encode()