scraper.py

"""Scrape a Government of Canada "Reference standards" page into a CSV.

For each standard listed on the page, the scraper records the title,
description, effective date, stewards, and machine-readable format links.
"""

import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, Tag


def _find_standards_items(soup):
    """Locate the per-standard containers, trying several page layouts."""
    # Preferred layout: a service-info section (or its grid-div equivalent)
    # whose child divs each describe one standard.
    section = soup.find("section", class_="gc-srvinfo")
    if not section:
        section = soup.find("div", class_="row wb-eqht-grd")
    if section:
        item_divs = section.find_all("div", class_="col-md-6")
        if item_divs:
            return item_divs
        return section.find_all("div", recursive=False)

    # Fallback layout: walk every <h3> in the main content area and keep the
    # ones whose link points at a reference-standard page (English or French).
    content_root = soup.find("main") or soup.find(id="wb-cont") or soup.body
    items = []
    if content_root:
        for h3 in content_root.find_all("h3"):
            link = h3.find("a", href=True)
            if not link:
                continue
            href = link["href"]
            if "data-reference-standard" not in href and "norme" not in href:
                continue
            container = h3.find_parent(["div", "section", "article"]) or h3
            items.append(container)
    return items


def scrape_standards_page(url, output_filename):
    """
    Scrape a GC "Reference standards" page and write a CSV with the columns:
    Title, Description, Effective date, Stewards, Machine-readable formats.
    """
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        print(f"Failed to retrieve {url}, status code: {resp.status_code}")
        return
    soup = BeautifulSoup(resp.content, "html.parser")

    item_divs = _find_standards_items(soup)
    if not item_divs:
        print(f"Could not find standards section on {url}")
        return

    rows = []

    # Keyword lists for label matching (English and French). Both curly and
    # straight apostrophes are included because pages use either form.
    effective_keywords = [
        "effective date", "effective",
        "date d’entrée en vigueur", "date d’entrée",
        "date d'entrée en vigueur", "date d'entrée",
        "date",
    ]
    steward_keywords = ["steward", "stewards", "responsable", "responsables"]
    machine_keywords = [
        "machine-readable formats", "machine-readable", "machine",
        "formats lisibles par machine", "format lisible par machine",
        "formats lisibles", "format lisible",
    ]

    def match_label(label_text):
        # Stewards and machine formats are checked before effective dates
        # because the bare "date" keyword would otherwise match too broadly.
        lab = label_text.lower()
        for k in steward_keywords:
            if k in lab:
                return "stewards"
        for k in machine_keywords:
            if k in lab:
                return "machine"
        for k in effective_keywords:
            if k in lab:
                return "effective"
        return None

    for item in item_divs:
        h3 = item.find("h3")
        title = h3.get_text(" ", strip=True) if h3 else "Unknown Title"

        # Description: prefer the paragraph right after the heading, otherwise
        # fall back to the first paragraph in the container.
        description = ""
        if h3:
            desc_p = h3.find_next_sibling("p")
            if desc_p:
                description = desc_p.get_text(" ", strip=True)
        if not description:
            p_tags = item.find_all("p")
            if p_tags:
                description = p_tags[0].get_text(" ", strip=True)

        # Metadata lives in the first element that contains <strong> labels.
        meta_p = None
        for p in item.find_all("p"):
            if p.find("strong"):
                meta_p = p
                break
        if not meta_p and h3:
            maybe = h3.find_next_sibling("p")
            if maybe and maybe.find("strong"):
                meta_p = maybe
        # Fallback: check other containers like lists or divs with strong tags.
        if not meta_p:
            for candidate in item.find_all(["div", "ul", "ol", "dl"]):
                if candidate.find("strong"):
                    meta_p = candidate
                    break

        effective = ""
        stewards = ""
        machine = ""
        if meta_p:
            for strong in meta_p.find_all("strong"):
                label = strong.get_text(" ", strip=True).rstrip(":").strip()
                field = match_label(label)
                # Collect the text (and any links) between this label and the
                # next <strong>, resolving relative hrefs against the page URL.
                parts = []
                for sib in strong.next_siblings:
                    if isinstance(sib, Tag) and sib.name == "strong":
                        break
                    if isinstance(sib, NavigableString):
                        txt = str(sib).strip()
                        if txt:
                            parts.append(txt)
                    elif isinstance(sib, Tag) and sib.name == "a" and sib.get("href"):
                        href = urljoin(url, sib["href"])
                        parts.append(f"{sib.get_text(strip=True)} ({href})")
                val = " ".join(parts).strip()
                if field == "effective":
                    effective = val
                elif field == "stewards":
                    stewards = val
                elif field == "machine":
                    machine = val

        rows.append([title, description, effective, stewards, machine])

    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Description", "Effective date",
                         "Stewards", "Machine-readable formats"])
        writer.writerows(rows)
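

# Minimal usage sketch. The URL below is a placeholder, not taken from this
# repository: point it at the actual GC "Reference standards" page you want
# to scrape, and choose whatever output filename suits you.
if __name__ == "__main__":
    scrape_standards_page(
        "https://www.canada.ca/en/.../reference-standards.html",  # hypothetical
        "reference_standards.csv",
    )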