Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions tests/json/chinese.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"title": "测试文件",
"content": "这是一个中文测试",
"greeting": "你好世界",
"items": ["一", "二", "三"]
}
1 change: 1 addition & 0 deletions tests/json/chinese.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
这是一个中文测试 你好世界 一 二 三 测试文件
9 changes: 9 additions & 0 deletions tests/json/cyrillic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "Тестовый файл",
"description": "Это тест на кириллице",
"nested": {
"value": "Значение",
"greeting": "Привет мир"
},
"items": ["один", "два", "три"]
}
1 change: 1 addition & 0 deletions tests/json/cyrillic.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Это тест на кириллице один два три Тестовый файл Привет мир Значение
5 changes: 5 additions & 0 deletions tests/json/emoji.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"message": "Hello 😀 World",
"status": "✅ success",
"items": ["🚀", "🎉", "❤️"]
}
1 change: 1 addition & 0 deletions tests/json/emoji.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
🚀 🎉 ❤️ Hello 😀 World ✅ success
7 changes: 7 additions & 0 deletions tests/json/mixed_scripts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"english": "Hello",
"russian": "Привет",
"chinese": "你好",
"emoji": "👋",
"combined": "Mixed: Привет 你好 👋"
}
1 change: 1 addition & 0 deletions tests/json/mixed_scripts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
你好 Mixed: Привет 你好 👋 👋 Hello Привет
38 changes: 38 additions & 0 deletions tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,43 @@
from . import base


_ENCODING_TEST_CASES: list[tuple[str, str]] = [
("cyrillic", ""),
("chinese", ""),
("emoji", ""),
("mixed_scripts", ""),
]
class JsonTestCase(base.BaseParserTestCase, unittest.TestCase):
    """JSON parser tests, including fixtures with non-ASCII content."""

    extension = "json"

    def _check_character_sets(self, compare):
        """Run ``compare`` on every fixture pair in ``_ENCODING_TEST_CASES``.

        ``compare`` is called as ``compare(json_path, expected_txt_path)``
        inside its own ``subTest`` so each character set is reported
        separately. A case with a non-empty skip reason is skipped.
        """
        fixture_dir = self.get_extension_directory()
        for charset, skip_reason in _ENCODING_TEST_CASES:
            with self.subTest(charset=charset):
                if skip_reason:
                    self.skipTest(skip_reason)
                compare(
                    f"{fixture_dir}/{charset}.json",
                    f"{fixture_dir}/{charset}.txt",
                )

    def test_character_sets_python(self):
        """Test JSON parser with various character encodings via Python API."""
        self._check_character_sets(self.compare_python_output)

    def test_character_sets_cli(self):
        """Test JSON parser with various character encodings via CLI."""
        self._check_character_sets(self.compare_cli_output)

    def test_explicit_encoding_parameter(self):
        """Test that an explicit ``encoding`` keyword is accepted."""
        fixture_dir = self.get_extension_directory()
        # NOTE(review): "utf-8" is also the parser's default encoding, so this
        # checks that the keyword is accepted, not that a non-default
        # encoding is honoured; a non-UTF-8 fixture would be needed for that.
        self.compare_python_output(
            f"{fixture_dir}/cyrillic.json",
            f"{fixture_dir}/cyrillic.txt",
            encoding="utf-8",
        )
4 changes: 2 additions & 2 deletions textract/parsers/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class Parser(BaseParser):
from mongodb dumps, for example.
"""

def extract(self, filename, **kwargs):
with Path(filename).open(encoding="utf-8") as raw:
def extract(self, filename, encoding="utf-8", **kwargs):
    """Read the JSON file at ``filename`` and return its extracted text.

    The file is decoded with ``encoding`` (UTF-8 unless the caller passes
    another codec), parsed as JSON, and the parsed object is handed to
    ``self.get_text``, whose result is returned. Additional keyword
    arguments are accepted but not used by this method.
    """
    raw_text = Path(filename).read_text(encoding=encoding)
    return self.get_text(json.loads(raw_text))

Expand Down
Loading