|
19 | 19 | import os.path |
20 | 20 | import shutil |
21 | 21 | import os |
22 | | -import re |
23 | 22 | import stat |
24 | 23 | import platform |
25 | 24 | import uuid |
@@ -539,147 +538,6 @@ def canonical_path(path: str, platform=None): |
539 | 538 | return path |
540 | 539 |
|
541 | 540 |
|
542 | | -def encode_filesystem_name(input_str: str): |
543 | | - """Encodes an arbitrary unicode string to a generic filesystem-compatible |
544 | | - non-unicode filename. |
545 | | -
|
546 | | - The result after encoding will only contain the standard ascii lowercase |
547 | | - letters (a-z), the digits (0-9), or periods, underscores, or dashes |
548 | | - (".", "_", or "-"). No uppercase letters will be used, for |
549 | | - compatibility with case-insensitive filesystems. |
550 | | -
|
551 | | - The rules for the encoding are: |
552 | | -
|
553 | | - 1. Any lowercase letter, digit, period, or dash (a-z, 0-9, ., or -) is |
554 | | - encoded as-is. |
555 | | -
|
556 | | - 2. Any underscore is encoded as a double-underscore (``__``) |
557 | | -
|
558 | | - 3. Any uppercase ascii letter (A-Z) is encoded as an underscore followed |
559 | | - by the corresponding lowercase letter (ie, "A" => "_a") |
560 | | -
|
561 | | - 4. All other characters are encoded using their UTF-8 encoded unicode |
562 | | - representation, in the following format: ``_NHH...``, where: |
563 | | -
|
564 | | - * N represents the number of bytes needed for the UTF-8 encoding, |
565 | | - except with N=0 for one-byte representation (the exception for N=1 |
566 | | - is made both because it means that for "standard" ascii characters |
567 | | - in the range 0-127, their encoding will be _0xx, where xx is their |
568 | | - ascii hex code; and because it mirrors the ways UTF-8 encoding |
569 | | - itself works, where the number of bytes needed for the character can |
570 | | - be determined by counting the number of leading "1"s in the binary |
571 | | - representation of the character, except that if it is a 1-byte |
572 | | - sequence, there are 0 leading 1's). |
573 | | - * HH represents the bytes of the corresponding UTF-8 encoding, in |
574 | | - hexadecimal (using lower-case letters) |
575 | | -
|
576 | | - As an example, the character ``*``, whose (hex) UTF-8 representation |
577 | | - is 2A, would be encoded as "_02a", while the "euro" symbol, which |
578 | | - has a UTF-8 representation of E2 82 AC, would be encoded as |
579 | | - "_3e282ac". (Note that, strictly speaking, the "N" part of the |
580 | | - encoding is redundant information, since it is essentially encoded |
581 | | - in the UTF-8 representation itself, but it makes the resulting |
582 | | - string more human-readable, and easier to decode). |
583 | | -
|
584 | | - As an example, the string "Foo_Bar (fun).txt" would get encoded as ``_foo___bar_020_028fun_029.txt``. |
585 | | - """ |
586 | | - # TODO: Test this |
587 | | - if isinstance(input_str, str): |
588 | | - input_str = input_str.encode(encoding="utf-8") |
589 | | - |
590 | | - as_is = u'abcdefghijklmnopqrstuvwxyz0123456789.-' |
591 | | - uppercase = u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
592 | | - result = [] |
593 | | - for char in input_str: |
594 | | - if char in as_is: |
595 | | - result.append(char) |
596 | | - elif char == u'_': |
597 | | - result.append('__') |
598 | | - elif char in uppercase: |
599 | | - result.append('_%s' % char.lower()) |
600 | | - else: |
601 | | - utf8 = char.encode('utf8') |
602 | | - N = len(utf8) |
603 | | - if N == 1: |
604 | | - N = 0 |
605 | | - HH = ''.join('%x' % ord(c) for c in utf8) |
606 | | - result.append('_%d%s' % (N, HH)) |
607 | | - return ''.join(result) |
608 | | - |
609 | | - |
610 | | -_FILESYSTEM_TOKEN_RE = re.compile(r'(?P<as_is>[a-z0-9.-])|(?P<underscore>__)|_(?P<uppercase>[a-z])|_(?P<N>[0-9])') |
611 | | -_HEX_RE = re.compile('[0-9a-f]+$') |
612 | | - |
613 | | - |
614 | | -def decode_filesystem_name(filename: str): |
615 | | - """Decodes a filename encoded using the rules given in encode_filesystem_name |
616 | | - to a unicode string. |
617 | | - """ |
618 | | - result = [] |
619 | | - remain = filename |
620 | | - i = 0 |
621 | | - while remain: |
622 | | - # use match, to ensure it matches from the start of the string... |
623 | | - match = _FILESYSTEM_TOKEN_RE.match(remain) |
624 | | - if not match: |
625 | | - raise ValueError("incorrectly encoded filesystem name %r" |
626 | | - " (bad index: %d - %r)" % (filename, i, |
627 | | - remain[:2])) |
628 | | - match_str = match.group(0) |
629 | | - match_len = len(match_str) |
630 | | - i += match_len |
631 | | - remain = remain[match_len:] |
632 | | - match_dict = match.groupdict() |
633 | | - if match_dict['as_is']: |
634 | | - result.append(unicode(match_str)) |
635 | | - elif match_dict['underscore']: |
636 | | - result.append(u'_') |
637 | | - elif match_dict['uppercase']: |
638 | | - result.append(unicode(match_dict['uppercase'].upper())) |
639 | | - elif match_dict['N']: |
640 | | - N = int(match_dict['N']) |
641 | | - if N == 0: |
642 | | - N = 1 |
643 | | - # hex-encoded, so need to grab 2*N chars |
644 | | - bytes_len = 2 * N |
645 | | - i += bytes_len |
646 | | - bytes = remain[:bytes_len] |
647 | | - remain = remain[bytes_len:] |
648 | | - |
649 | | - # need this check to ensure that we don't end up eval'ing |
650 | | - # something nasty... |
651 | | - if not _HEX_RE.match(bytes): |
652 | | - raise ValueError("Bad utf8 encoding in name %r" |
653 | | - " (bad index: %d - %r)" % (filename, i, bytes)) |
654 | | - |
655 | | - bytes_repr = ''.join('\\x%s' % bytes[i:i + 2] |
656 | | - for i in xrange(0, bytes_len, 2)) |
657 | | - bytes_repr = "'%s'" % bytes_repr |
658 | | - result.append(eval(bytes_repr).decode('utf8')) |
659 | | - else: |
660 | | - raise ValueError("Unrecognized match type in filesystem name %r" |
661 | | - " (bad index: %d - %r)" % (filename, i, remain[:2])) |
662 | | - |
663 | | - return u''.join(result) |
664 | | - |
665 | | - |
666 | | -def test_encode_decode() -> None: |
667 | | - def do_test(orig, expected_encoded) -> None: |
668 | | - print('=' * 80) |
669 | | - print(orig) |
670 | | - encoded = encode_filesystem_name(orig) |
671 | | - print(encoded) |
672 | | - assert encoded == expected_encoded |
673 | | - decoded = decode_filesystem_name(encoded) |
674 | | - print(decoded) |
675 | | - assert decoded == orig |
676 | | - |
677 | | - do_test("Foo_Bar (fun).txt", '_foo___bar_020_028fun_029.txt') |
678 | | - |
679 | | - # u'\u20ac' == Euro symbol |
680 | | - do_test(u"\u20ac3 ~= $4.06", '_3e282ac3_020_07e_03d_020_0244.06') |
681 | | - |
682 | | - |
683 | 541 | def walk_up_dirs(path: str): |
684 | 542 | """Yields absolute directories starting with the given path, and iterating |
685 | 543 | up through all its parents, until it reaches a root directory""" |
|
0 commit comments