-
Notifications
You must be signed in to change notification settings - Fork 198
fix: Sanitize filenames and allow optional kebab case #260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ def __str__(self) -> str: ... | |
| logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR) | ||
|
|
||
|
|
||
| def generate_permalink(file_path: Union[Path, str, PathLike]) -> str: | ||
| def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str: | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Most modifications in this file are simply lint fixes, applied automatically on save in my IDE. The main changes are:
Doing so allows this function to be reused, keeping things DRY, rather than repeating almost the entire function for use when generating titles.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. re: lint fixes. As long as ruff is happy. |
||
| """Generate a stable permalink from a file path. | ||
|
|
||
| Args: | ||
|
|
@@ -51,53 +51,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str: | |
| # Convert Path to string if needed | ||
| path_str = Path(str(file_path)).as_posix() | ||
|
|
||
| # Remove extension | ||
| base = os.path.splitext(path_str)[0] | ||
| # Remove extension (for now, possibly) | ||
| (base, extension) = os.path.splitext(path_str) | ||
|
|
||
| # Check if we have CJK characters that should be preserved | ||
| # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols), | ||
| # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols), | ||
| # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms) | ||
| has_cjk_chars = any( | ||
| '\u4e00' <= char <= '\u9fff' or | ||
| '\u3000' <= char <= '\u303f' or | ||
| '\u3400' <= char <= '\u4dbf' or | ||
| '\uff00' <= char <= '\uffef' | ||
| "\u4e00" <= char <= "\u9fff" | ||
| or "\u3000" <= char <= "\u303f" | ||
| or "\u3400" <= char <= "\u4dbf" | ||
| or "\uff00" <= char <= "\uffef" | ||
| for char in base | ||
| ) | ||
|
|
||
| if has_cjk_chars: | ||
| # For text with CJK characters, selectively transliterate only Latin accented chars | ||
| result = "" | ||
| for char in base: | ||
| if ('\u4e00' <= char <= '\u9fff' or | ||
| '\u3000' <= char <= '\u303f' or | ||
| '\u3400' <= char <= '\u4dbf'): | ||
| if ( | ||
| "\u4e00" <= char <= "\u9fff" | ||
| or "\u3000" <= char <= "\u303f" | ||
| or "\u3400" <= char <= "\u4dbf" | ||
| ): | ||
| # Preserve CJK ideographs and symbols | ||
| result += char | ||
| elif ('\uff00' <= char <= '\uffef'): | ||
| elif "\uff00" <= char <= "\uffef": | ||
| # Remove Chinese fullwidth punctuation entirely (like ,!?) | ||
| continue | ||
| else: | ||
| # Transliterate Latin accented characters to ASCII | ||
| result += unidecode(char) | ||
|
|
||
| # Insert hyphens between CJK and Latin character transitions | ||
| # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK | ||
| result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result) | ||
| result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result) | ||
|
|
||
| result = re.sub( | ||
| r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])", r"\1-\2", result | ||
| ) | ||
| result = re.sub( | ||
| r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])", r"\1-\2", result | ||
| ) | ||
|
|
||
| # Insert dash between camelCase | ||
| result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result) | ||
|
|
||
| # Convert ASCII letters to lowercase, preserve CJK | ||
| lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result) | ||
|
|
||
| # Replace underscores with hyphens | ||
| text_with_hyphens = lower_text.replace("_", "-") | ||
|
|
||
| # Remove apostrophes entirely (don't replace with hyphens) | ||
| text_no_apostrophes = text_with_hyphens.replace("'", "") | ||
|
|
||
| # Replace unsafe chars with hyphens, but preserve CJK characters | ||
| clean_text = re.sub( | ||
| r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes | ||
|
|
@@ -129,7 +135,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str: | |
| segments = clean_text.split("/") | ||
| clean_segments = [s.strip("-") for s in segments] | ||
|
|
||
| return "/".join(clean_segments) | ||
| return_val = "/".join(clean_segments) | ||
|
|
||
| # Append file extension back, if necessary | ||
| if not split_extension and extension: | ||
| return_val += extension | ||
|
|
||
| return return_val | ||
|
|
||
|
|
||
| def setup_logging( | ||
|
|
@@ -229,79 +241,79 @@ def normalize_newlines(multiline: str) -> str: | |
| Returns: | ||
| A string with normalized newlines native to the platform. | ||
| """ | ||
| return re.sub(r'\r\n?|\n', os.linesep, multiline) | ||
| return re.sub(r"\r\n?|\n", os.linesep, multiline) | ||
|
|
||
|
|
||
| def normalize_file_path_for_comparison(file_path: str) -> str: | ||
| """Normalize a file path for conflict detection. | ||
|
|
||
| This function normalizes file paths to help detect potential conflicts: | ||
| - Converts to lowercase for case-insensitive comparison | ||
| - Normalizes Unicode characters | ||
| - Handles path separators consistently | ||
|
|
||
| Args: | ||
| file_path: The file path to normalize | ||
|
|
||
| Returns: | ||
| Normalized file path for comparison purposes | ||
| """ | ||
| import unicodedata | ||
|
|
||
| # Convert to lowercase for case-insensitive comparison | ||
| normalized = file_path.lower() | ||
|
|
||
| # Normalize Unicode characters (NFD normalization) | ||
| normalized = unicodedata.normalize('NFD', normalized) | ||
| normalized = unicodedata.normalize("NFD", normalized) | ||
|
|
||
| # Replace path separators with forward slashes | ||
| normalized = normalized.replace('\\', '/') | ||
| normalized = normalized.replace("\\", "/") | ||
|
|
||
| # Remove multiple slashes | ||
| normalized = re.sub(r'/+', '/', normalized) | ||
| normalized = re.sub(r"/+", "/", normalized) | ||
|
|
||
| return normalized | ||
|
|
||
|
|
||
| def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]: | ||
| """Detect potential conflicts between a file path and existing paths. | ||
|
|
||
| This function checks for various types of conflicts: | ||
| - Case sensitivity differences | ||
| - Unicode normalization differences | ||
| - Path separator differences | ||
| - Permalink generation conflicts | ||
|
|
||
| Args: | ||
| file_path: The file path to check | ||
| existing_paths: List of existing file paths to check against | ||
|
|
||
| Returns: | ||
| List of existing paths that might conflict with the given file path | ||
| """ | ||
| conflicts = [] | ||
|
|
||
| # Normalize the input file path | ||
| normalized_input = normalize_file_path_for_comparison(file_path) | ||
| input_permalink = generate_permalink(file_path) | ||
|
|
||
| for existing_path in existing_paths: | ||
| # Skip identical paths | ||
| if existing_path == file_path: | ||
| continue | ||
|
|
||
| # Check for case-insensitive path conflicts | ||
| normalized_existing = normalize_file_path_for_comparison(existing_path) | ||
| if normalized_input == normalized_existing: | ||
| conflicts.append(existing_path) | ||
| continue | ||
|
|
||
| # Check for permalink conflicts | ||
| existing_permalink = generate_permalink(existing_path) | ||
| if input_permalink == existing_permalink: | ||
| conflicts.append(existing_path) | ||
| continue | ||
|
|
||
| return conflicts | ||
|
|
||
|
|
||
|
|
@@ -336,13 +348,13 @@ def validate_project_path(path: str, project_path: Path) -> bool: | |
|
|
||
| def ensure_timezone_aware(dt: datetime) -> datetime: | ||
| """Ensure a datetime is timezone-aware using system timezone. | ||
|
|
||
| If the datetime is naive, convert it to timezone-aware using the system's local timezone. | ||
| If it's already timezone-aware, return it unchanged. | ||
|
|
||
| Args: | ||
| dt: The datetime to ensure is timezone-aware | ||
|
|
||
| Returns: | ||
| A timezone-aware datetime | ||
| """ | ||
|
|
@@ -351,4 +363,4 @@ def ensure_timezone_aware(dt: datetime) -> datetime: | |
| return dt.astimezone() | ||
| else: | ||
| # Already timezone-aware | ||
| return dt | ||
| return dt | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we'll need to do the same for project names. A project named "Hi/There" will fail to be found by the remove_project tool. Even if escaped, the project name when passed to the FastApi endpoint will confuse FastApi and return 404.