|
| 1 | +import re |
| 2 | +import os |
| 3 | +from collections import defaultdict |
| 4 | +import pprint |
| 5 | +import csv |
| 6 | + |
| 7 | +# Function to recursively search for filenames in a directory, ignoring extensions |
def find_files(base_dir, filename_without_ext):
    """Recursively find files under ``base_dir`` whose name, minus extension, matches.

    Args:
        base_dir: Directory to walk.
        filename_without_ext: File name to look for, without its extension.

    Returns:
        A list of POSIX-style paths relative to ``base_dir`` with the
        extension dropped, e.g. ``"guides/setup"``. A file located directly
        in ``base_dir`` is returned as just its name (no ``"./"`` prefix).
        Each path appears once, in the order ``os.walk`` discovers it.
        An empty ``filename_without_ext`` yields an empty list.
    """
    # An empty name can never identify a file; bail out instead of walking.
    if not filename_without_ext:
        return []

    matches = []
    for root, _, files in os.walk(base_dir):
        relative_dir = os.path.relpath(root, base_dir)
        for file in files:
            # os.path.splitext keeps dotfiles such as ".gitignore" intact
            # (rsplit would reduce them to an empty name).
            if os.path.splitext(file)[0] != filename_without_ext:
                continue
            if relative_dir == os.curdir:
                # Files at the top of base_dir must not carry a "./" prefix.
                suggestion = filename_without_ext
            else:
                suggestion = os.path.join(relative_dir, filename_without_ext)
            suggestion = suggestion.replace("\\", "/")  # Ensure POSIX-style paths
            # "foo.md" and "foo.mdx" in one directory map to the same suggestion.
            if suggestion not in matches:
                matches.append(suggestion)
    return matches
| 19 | + |
# Map each broken link (with its anchor removed) to the source files that reference it
broken_links_map = defaultdict(list)

# Read the log file as a whole, because the second warning format spans several lines.
# The encoding is stated explicitly so decoding does not depend on the platform
# locale, and undecodable bytes are replaced instead of aborting the run.
with open('build.log', 'r', encoding='utf-8', errors='replace') as file:
    log_content = file.read()

# Regular expressions to match the two broken-link warning formats:
#   pattern1 - a single-line warning carrying the link and its source file
#   pattern2 - a source page path followed by one or more "-> linking to" lines
pattern1 = re.compile(r"\[WARNING\] Docs markdown link couldn't be resolved: \((.*?)\) in source file \"(.*?)\"")
pattern2 = re.compile(r"Broken link on source page path = (.*?):\n(?:\s*-> linking to (.*)\n?)+", re.MULTILINE)

# Extract broken links and their source files for the first pattern
for match in pattern1.finditer(log_content):
    broken_link, source_file = match.groups()
    broken_link = re.sub(r'#.*$', '', broken_link)  # Remove the anchor from the broken link
    broken_links_map[broken_link].append(source_file)

# Extract broken links and their source files for the second pattern
matches = pattern2.finditer(log_content)
for match in matches:
    source_path = match.group(1).strip()
    # A repeated capture group only retains its last hit, so the whole matched
    # text is re-scanned to collect every linked target.
    links = re.findall(r"-> linking to (.*)", match.group(0))
    for broken_link in links:
        broken_link = re.sub(r'#.*$', '', broken_link)  # Remove the anchor from the broken link
        broken_links_map[broken_link.strip()].append(source_path)

# Debugging: Print the extracted broken links map
print("Extracted broken links map:")
pprint.pprint(dict(broken_links_map))
| 49 | + |
# Directory tree that is searched for replacement targets (assumed to be 'docs/')
base_dir = 'docs/'

# broken link -> candidate replacement paths found under base_dir
suggestions_map = defaultdict(list)
# Broken links for which no replacement could be proposed
no_suggestions = []

# Links starting with any of these prefixes are left out of the report entirely
ignored_prefixes = ['/components/library', '/tags/']
# Matches source paths ending in "design-decisions", with an optional trailing slash
design_decisions_suffix = re.compile(r'design-decisions/?$')

for broken_link, source_files in broken_links_map.items():
    if broken_link.startswith(tuple(ignored_prefixes)):
        continue

    # A link referenced from any design-decisions page is recorded as having
    # no suggestion, and no file search is attempted for it.
    referenced_from_design_decisions = False
    for source_file in source_files:
        if design_decisions_suffix.search(source_file):
            referenced_from_design_decisions = True
            break
    if referenced_from_design_decisions:
        no_suggestions.append(broken_link)
        continue

    # Reduce the link to its last path component, minus any extension
    last_component = os.path.basename(broken_link.strip("/"))
    filename_without_ext, _extension = os.path.splitext(last_component)
    print(f"Searching for matches for broken link: {broken_link} (filename: {filename_without_ext})")

    potential_matches = find_files(base_dir, filename_without_ext)
    if not potential_matches:
        no_suggestions.append(broken_link)
    else:
        suggestions_map[broken_link].extend(potential_matches)
| 78 | + |
# Echo each broken link with its suggestions, collecting (old, new) pairs along the way
suggestions_list = []
for broken_link, suggestions in suggestions_map.items():
    print(f"Broken link: {broken_link}")
    for suggestion in suggestions:
        print(f" Suggested replacement: /{suggestion}")
    suggestions_list.extend([(broken_link, f"/{suggestion}") for suggestion in suggestions])

# Write the report as a TSV file: the suggestion table first, then a blank
# separator row, then the links that have no suggestion (one per row).
with open('suggested_links.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(['Old Link', 'New Suggested Link'])
    writer.writerows(suggestions_list)
    writer.writerow([])  # Blank line for separation
    writer.writerow(['Links with No Suggestions'])  # Header for no suggestions section
    writer.writerows([link] for link in no_suggestions)
| 97 | + |
# Dump the full broken-link -> source-files map for reference
print("\nMap of broken links and their source files:")
pprint.pprint(dict(broken_links_map))

# List the links that ended up without a suggestion, one per line
if not no_suggestions:
    print("\nAll broken links have suggestions.")
else:
    print("\nLinks with no suggestions found:")
    print(*no_suggestions, sep="\n")

# Totals: every distinct broken link seen in the log, and those lacking a suggestion
total_unique_broken_links = len(broken_links_map)
total_no_suggestions = len(no_suggestions)
print(f"\nTotal unique broken links: {total_unique_broken_links}")
print(f"Total unique broken links without any suggestions: {total_no_suggestions}")
0 commit comments