|
| 1 | +from pathlib import Path |
| 2 | +from typing import List, Dict, Any |
| 3 | +from xml.dom import minidom |
| 4 | +from xml.etree.ElementTree import Element, SubElement, tostring |
| 5 | + |
| 6 | +from reflex import constants |
| 7 | +from reflex.utils import prerequisites |
| 8 | +from reflex.config import get_config |
| 9 | + |
| 10 | + |
# Static-assets folder inside the web directory (resolved against the current
# working directory); the generated sitemap.xml is stored here.
_sitemap_folder_path: Path = Path.cwd().joinpath(
    prerequisites.get_web_dir(), constants.Dirs.STATIC
)

# Full path of the sitemap file inside the static folder.
_sitemap_file_path: Path = _sitemap_folder_path.joinpath("sitemap.xml")
| 18 | + |
| 19 | + |
def check_sitemap_file_exists() -> bool:
    """Check if the sitemap file exists.

    Returns:
        bool: True if both the sitemap folder and the sitemap.xml file inside
            it exist, False otherwise.
    """
    # Logical `and` (rather than bitwise `&`) short-circuits, so the file is
    # only probed when the containing folder is actually present.
    return _sitemap_folder_path.exists() and _sitemap_file_path.exists()
| 27 | + |
| 28 | + |
def read_sitemap_file() -> str:
    """Return the text stored in the sitemap file.

    Returns:
        str: The complete contents of sitemap.xml.
    """
    # No explicit encoding is passed, so the platform default text encoding is used.
    return _sitemap_file_path.read_text()
| 37 | + |
| 38 | + |
def generate_xml(links: List[Dict[str, Any]]) -> str:
    """Generate an XML sitemap from a list of links.

    Args:
        links: A list of dictionaries where each dictionary contains
            'loc' (URL of the page), 'changefreq' (frequency of changes), and
            'priority' (priority of the page).

    Returns:
        str: A pretty-printed XML string representing the sitemap.

    Raises:
        KeyError: If a link dictionary lacks 'loc', 'changefreq' or 'priority'.
    """
    # The sitemap protocol namespace is an exact identifier that uses the
    # "http" scheme; an "https" spelling is a different (unknown) namespace.
    urlset = Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for link in links:
        url = SubElement(urlset, "url")
        SubElement(url, "loc").text = link["loc"]
        SubElement(url, "changefreq").text = link["changefreq"]
        # The priority may be numeric, but element text has to be a string.
        SubElement(url, "priority").text = str(link["priority"])

    # Round-trip through minidom only to obtain indented, readable output.
    rough_string = tostring(urlset, "utf-8")
    return minidom.parseString(rough_string).toprettyxml(indent=" ")
| 61 | + |
| 62 | + |
def generate_links_for_sitemap(app_instance) -> List[Dict[str, Any]]:
    """Generate a list of links for which sitemaps are generated.

    This function loops through the registered routes in the app and generates a list of
    links with their respective sitemap properties such as location (URL), change frequency,
    and priority. Dynamic routes and the 404 page are excluded from the sitemap.

    Args:
        app_instance: The instance of the App class from app.py. It must provide
            ``get_pages()`` and a ``site_map_properties`` mapping.

    Returns:
        List: A list of dictionaries where each dictionary contains the 'loc' (URL of the page), 'priority' and
            'changefreq' of each route.
    """
    # The deploy URL is identical for every page, so the config is read once.
    # A trailing slash is stripped because every route below starts with "/".
    # NOTE(review): an unset (None) deploy_url is treated as empty - confirm
    # against the Config definition.
    deploy_url = (get_config().deploy_url or "").rstrip("/")

    links: List[Dict[str, Any]] = []

    # Collect links of pages that are not dynamically created.
    for route in app_instance.get_pages():
        # Ignore dynamic routes and 404
        if ("[" in route and "]" in route) or route == "404":
            continue

        # Handle the index route
        if route == "index":
            route = "/"

        if not route.startswith("/"):
            route = f"/{route}"

        sitemap_changefreq = constants.DefaultPage.SITEMAP_CHANGEFREQ  # default value
        sitemap_priority = constants.DefaultPage.SITEMAP_PRIORITY  # default value

        # Use the per-route sitemap properties stored on the app, when present.
        if route in app_instance.site_map_properties:
            properties = app_instance.site_map_properties[route]
            sitemap_priority = properties["priority"]
            sitemap_changefreq = properties["changefreq"]

        # A priority equal to the default is taken to mean "not set by the
        # user"; derive one from the route depth instead (deeper pages rank
        # lower, never below 0.5). Rounding keeps float noise out of the XML.
        if sitemap_priority == constants.DefaultPage.SITEMAP_PRIORITY:
            depth = route.count("/")
            sitemap_priority = round(max(0.5, 1.0 - (depth * 0.1)), 1)

        links.append(
            {
                "loc": f"{deploy_url}{route}",
                "changefreq": sitemap_changefreq,
                "priority": sitemap_priority,
            }
        )
    return links
| 116 | + |
| 117 | + |
def generate_static_sitemap(links: List[dict[str, str]]) -> None:
    """Build the sitemap XML for the given links and store it in sitemap.xml.

    Intended call sites, per the original notes:
        1. Every time the web app is deployed onto the server.
        2. When the user (or crawler) requests the sitemap.xml file.

    Args:
        links: The list of urls for which the sitemap is to be generated.
    """
    xml_text = generate_xml(links)

    # Make sure the destination folder (and any missing parents) exists.
    _sitemap_folder_path.mkdir(parents=True, exist_ok=True)

    # This runs only when no previous sitemap.xml was retrieved, so an
    # already existing file can safely be overwritten.
    _sitemap_file_path.write_text(xml_text)
| 135 | + |
| 136 | + |
def remove_sitemap_file() -> None:
    """Remove the sitemap file, if a regeneration is needed.

    Generally for testing the generation of the sitemap.xml file, we need to remove the automatically generated file
    when the app is initialized. A missing file is not an error.
    """
    # Delete directly instead of checking first: this avoids the gap between
    # an existence check and the unlink in which the file could disappear.
    _sitemap_file_path.unlink(missing_ok=True)
0 commit comments