From 4b9ff241ddade0cfa6f3841ab26f60560c22ea57 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 7 May 2026 14:02:00 -0400
Subject: [PATCH] fix(nyappdiv): Fix nyappdiv_2nd html parsing

---
 .../united_states/state/nyappdiv_2nd.py       | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/juriscraper/opinions/united_states/state/nyappdiv_2nd.py b/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
index 2f7a493af..c113c5fa2 100644
--- a/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
+++ b/juriscraper/opinions/united_states/state/nyappdiv_2nd.py
@@ -6,6 +6,10 @@
 # Date: 2014-07-04
 from datetime import date
 
+import nh3
+from lxml import etree
+from lxml.html import fromstring, tostring
+
 from juriscraper.opinions.united_states.state import nyappdiv_1st
 
 
@@ -13,3 +17,59 @@ class Site(nyappdiv_1st.Site):
     first_opinion_date = date(2003, 9, 25)
     days_interval = 30
     court = "App Div, 2d Dept"
+
+    @staticmethod
+    def cleanup_content(content: bytes) -> bytes:
+        """Remove hash altering timestamps to prevent duplicates
+
+        Previously we've been more targeted about removing a href's but
+        doctor will strip them out anyway so we should just clean our html
+        content here.
+
+        Content that is not valid UTF-8, is not detected as HTML, or that
+        lxml cannot parse after sanitizing is returned unchanged.
+
+        :param content: downloaded content `r.content`
+        :return: content without hash altering elements
+        """
+        try:
+            html_str = content.decode("utf-8")
+        except UnicodeDecodeError:
+            return content
+
+        if not nh3.is_html(html_str):
+            return content
+
+        # remove <a> tags; allow <main> so we can extract the opinion;
+        # fully drop nav/footer/header/script/style content (changes between
+        # requests and pollutes hashes)
+        clean_content_tags = {"script", "style", "nav", "footer", "header"}
+        allowed = set(nh3.ALLOWED_TAGS) - clean_content_tags
+        allowed.discard("a")
+        allowed.add("main")
+
+        cleaned = nh3.clean(
+            html_str,
+            tags=allowed,
+            clean_content_tags=clean_content_tags,
+        )
+
+        try:
+            tree = fromstring(cleaned)
+        except etree.ParserError:
+            # nothing survived sanitizing (lxml rejects empty documents)
+            return content
+
+        main = tree.xpath('//main[@id="main"]') or tree.xpath("//main")
+        if main:
+            opinion = main[0]
+            # lxml moves an element's tail text along with it; drop it so
+            # text that followed the <main> element cannot leak into the hash
+            opinion.tail = None
+            new_tree = etree.Element("html")
+            body = etree.SubElement(new_tree, "body")
+            body.append(opinion)
+            tree = new_tree
+
+        normalized_html = tostring(tree, encoding="unicode", method="html")
+        return normalized_html.encode("utf-8")