diff --git a/tests/json/chinese.json b/tests/json/chinese.json new file mode 100644 index 00000000..8dee5ffa --- /dev/null +++ b/tests/json/chinese.json @@ -0,0 +1,6 @@ +{ + "title": "测试文件", + "content": "这是一个中文测试", + "greeting": "你好世界", + "items": ["一", "二", "三"] +} diff --git a/tests/json/chinese.txt b/tests/json/chinese.txt new file mode 100644 index 00000000..179f9890 --- /dev/null +++ b/tests/json/chinese.txt @@ -0,0 +1 @@ +这是一个中文测试 你好世界 一 二 三 测试文件 \ No newline at end of file diff --git a/tests/json/cyrillic.json b/tests/json/cyrillic.json new file mode 100644 index 00000000..5681a765 --- /dev/null +++ b/tests/json/cyrillic.json @@ -0,0 +1,9 @@ +{ + "name": "Тестовый файл", + "description": "Это тест на кириллице", + "nested": { + "value": "Значение", + "greeting": "Привет мир" + }, + "items": ["один", "два", "три"] +} diff --git a/tests/json/cyrillic.txt b/tests/json/cyrillic.txt new file mode 100644 index 00000000..47e8804a --- /dev/null +++ b/tests/json/cyrillic.txt @@ -0,0 +1 @@ +Это тест на кириллице один два три Тестовый файл Привет мир Значение \ No newline at end of file diff --git a/tests/json/emoji.json b/tests/json/emoji.json new file mode 100644 index 00000000..0b86de1a --- /dev/null +++ b/tests/json/emoji.json @@ -0,0 +1,5 @@ +{ + "message": "Hello 😀 World", + "status": "✅ success", + "items": ["🚀", "🎉", "❤️"] +} diff --git a/tests/json/emoji.txt b/tests/json/emoji.txt new file mode 100644 index 00000000..eec2cd91 --- /dev/null +++ b/tests/json/emoji.txt @@ -0,0 +1 @@ +🚀 🎉 ❤️ Hello 😀 World ✅ success \ No newline at end of file diff --git a/tests/json/mixed_scripts.json b/tests/json/mixed_scripts.json new file mode 100644 index 00000000..a5f72894 --- /dev/null +++ b/tests/json/mixed_scripts.json @@ -0,0 +1,7 @@ +{ + "english": "Hello", + "russian": "Привет", + "chinese": "你好", + "emoji": "👋", + "combined": "Mixed: Привет 你好 👋" +} diff --git a/tests/json/mixed_scripts.txt b/tests/json/mixed_scripts.txt new file mode 100644 index 00000000..7d52b8f6 --- /dev/null +++ b/tests/json/mixed_scripts.txt @@ -0,0 +1 @@ +你好 Mixed: Привет 你好 👋 👋 Hello Привет \ No newline at end of file diff --git a/tests/test_json.py b/tests/test_json.py index 721ba91c..27744a15 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -3,5 +3,43 @@ from . import base +_ENCODING_TEST_CASES: list[tuple[str, str]] = [ + ("cyrillic", ""), + ("chinese", ""), + ("emoji", ""), + ("mixed_scripts", ""), +] class JsonTestCase(base.BaseParserTestCase, unittest.TestCase): extension = "json" + + + + def test_character_sets_python(self): + """Test JSON parser with various character encodings via Python API.""" + d = self.get_extension_directory() + for charset, skip_reason in _ENCODING_TEST_CASES: + with self.subTest(charset=charset): + if skip_reason: + self.skipTest(skip_reason) + filename = f"{d}/{charset}.json" + expected_filename = f"{d}/{charset}.txt" + self.compare_python_output(filename, expected_filename) + + def test_character_sets_cli(self): + """Test JSON parser with various character encodings via CLI.""" + d = self.get_extension_directory() + for charset, skip_reason in _ENCODING_TEST_CASES: + with self.subTest(charset=charset): + if skip_reason: + self.skipTest(skip_reason) + filename = f"{d}/{charset}.json" + expected_filename = f"{d}/{charset}.txt" + self.compare_cli_output(filename, expected_filename) + + + def test_explicit_encoding_parameter(self): + """Test that encoding parameter is respected.""" + filename = self.get_extension_directory() + "/cyrillic.json" + expected_filename = self.get_extension_directory() + "/cyrillic.txt" + # Test with explicit UTF-8 encoding + self.compare_python_output(filename, expected_filename, encoding='utf-8') diff --git a/textract/parsers/json_parser.py b/textract/parsers/json_parser.py index 44a49481..e513d775 100644 --- a/textract/parsers/json_parser.py +++ b/textract/parsers/json_parser.py @@ -10,8 +10,8 @@ class Parser(BaseParser): from mongodb dumps, for example. """ - def extract(self, filename, **kwargs): - with Path(filename).open(encoding="utf-8") as raw: + def extract(self, filename, encoding='utf-8', **kwargs): + with Path(filename).open(encoding=encoding) as raw: deserialized_json = json.load(raw) return self.get_text(deserialized_json)