Skip to content

Commit 176132c

Browse files
Merge pull request #465 from afuetterer/tests
test: restructure tests
2 parents 2995f11 + c106709 commit 176132c

8 files changed

Lines changed: 128 additions & 135 deletions

File tree

tests/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from pathlib import Path
2+
3+
import pytest
4+
5+
6+
@pytest.fixture
7+
def test_file_path():
8+
return Path(__file__).parent / "files" / "rwservlet.pdf"
9+

tests/test_detector.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
from pathlib import Path
2-
31
from tika import detector
42

5-
TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf"
6-
73

8-
def test_local_binary():
9-
with open(TEST_FILE_PATH, "rb") as file_obj:
4+
def test_local_binary(test_file_path):
5+
with open(test_file_path, "rb") as file_obj:
106
assert detector.from_file(file_obj) == "application/pdf"
117

128

13-
def test_local_path():
14-
assert detector.from_file(str(TEST_FILE_PATH)) == "application/pdf"
9+
def test_local_path(test_file_path):
10+
assert detector.from_file(str(test_file_path)) == "application/pdf"
1511

1612

1713
def test_local_buffer():

tests/test_from_file_service.py

Lines changed: 53 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -14,56 +14,59 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616
#
17-
# python -m unittest tika.tests.test_from_file_service
1817

19-
import unittest
2018
from unittest import mock
2119

22-
import tika.parser
23-
24-
25-
class CreateTest(unittest.TestCase):
26-
'test different services in from_file parsing: Content, Metadata or both in recursive mode'
27-
28-
def test_default_service(self):
29-
'parse file using default service'
30-
result = tika.parser.from_file(
31-
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf')
32-
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
33-
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
34-
@mock.patch('tika.parser._parse')
35-
@mock.patch('tika.parser.parse1')
36-
def test_remote_endpoint(self, tika_call_mock, _):
37-
result = tika.parser.from_file(
38-
'filename', 'http://tika:9998/tika')
39-
40-
tika_call_mock.assert_called_with(
41-
'all', 'filename', 'http://tika:9998/tika', headers=None, config_path=None,
42-
requestOptions={})
43-
def test_default_service_explicit(self):
44-
'parse file using default service explicitly'
45-
result = tika.parser.from_file(
46-
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='all')
47-
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
48-
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
49-
def test_text_service(self):
50-
'parse file using the content only service'
51-
result = tika.parser.from_file(
52-
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='text')
53-
self.assertIsNone(result['metadata'])
54-
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
55-
def test_meta_service(self):
56-
'parse file using the content only service'
57-
result = tika.parser.from_file(
58-
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='meta')
59-
self.assertIsNone(result['content'])
60-
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
61-
def test_invalid_service(self):
62-
'parse file using an invalid service should perform the default parsing'
63-
result = tika.parser.from_file(
64-
'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='bad')
65-
self.assertEqual(result['metadata']['Content-Type'],'application/pdf')
66-
self.assertIn('AUTORIDADES Y PERSONAL',result['content'])
67-
68-
if __name__ == '__main__':
69-
unittest.main()
20+
from tika import parser
21+
22+
TEST_PDF_URL = "https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf"
23+
24+
25+
def test_default_service():
26+
"parse file using default service"
27+
result = parser.from_file(TEST_PDF_URL)
28+
assert result["metadata"]["Content-Type"] == "application/pdf"
29+
assert "AUTORIDADES Y PERSONAL" in result["content"]
30+
31+
32+
@mock.patch("tika.parser._parse")
33+
@mock.patch("tika.parser.parse1")
34+
def test_remote_endpoint(tika_call_mock, _):
35+
result = parser.from_file("filename", "http://tika:9998/tika")
36+
37+
tika_call_mock.assert_called_with(
38+
"all",
39+
"filename",
40+
"http://tika:9998/tika",
41+
headers=None,
42+
config_path=None,
43+
requestOptions={},
44+
)
45+
46+
47+
def test_default_service_explicit():
48+
"parse file using default service explicitly"
49+
result = parser.from_file(TEST_PDF_URL, service="all")
50+
assert result["metadata"]["Content-Type"] == "application/pdf"
51+
assert "AUTORIDADES Y PERSONAL" in result["content"]
52+
53+
54+
def test_text_service():
55+
"parse file using the content only service"
56+
result = parser.from_file(TEST_PDF_URL, service="text")
57+
assert result["metadata"] is None
58+
assert "AUTORIDADES Y PERSONAL" in result["content"]
59+
60+
61+
def test_meta_service():
62+
"parse file using the metadata only service"
63+
result = parser.from_file(TEST_PDF_URL, service="meta")
64+
assert result["content"] is None
65+
assert result["metadata"]["Content-Type"] == "application/pdf"
66+
67+
68+
def test_invalid_service():
69+
"parse file using an invalid service should perform the default parsing"
70+
result = parser.from_file(TEST_PDF_URL, service="bad")
71+
assert result["metadata"]["Content-Type"] == "application/pdf"
72+
assert "AUTORIDADES Y PERSONAL" in result["content"]

tests/test_language.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
from pathlib import Path
2-
31
from tika import language
42

5-
TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf"
6-
73

8-
def test_local_binary():
9-
with open(TEST_FILE_PATH, "rb") as file_obj:
4+
def test_local_binary(test_file_path):
5+
with open(test_file_path, "rb") as file_obj:
106
assert language.from_file(file_obj) == "en"
117

128

13-
def test_local_path():
14-
assert language.from_file(str(TEST_FILE_PATH)) == "en"
9+
def test_local_path(test_file_path):
10+
assert language.from_file(str(test_file_path)) == "en"
1511

1612

1713
def test_local_buffer():

tests/test_parser.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from http import HTTPStatus
2+
3+
from tika import parser
4+
5+
6+
def test_remote_pdf():
7+
"""parse remote PDF"""
8+
assert parser.from_file(
9+
"https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf")
10+
11+
12+
def test_remote_html():
13+
"""parse remote HTML"""
14+
assert parser.from_file("http://nossl.sh")
15+
16+
17+
def test_remote_mp3():
18+
"""parse remote mp3"""
19+
assert parser.from_file(
20+
"https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3")
21+
22+
23+
def test_remote_jpg():
24+
"""parse remote jpg"""
25+
assert parser.from_file(
26+
"https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg")
27+
28+
29+
def test_local_binary(test_file_path):
30+
"""parse file binary"""
31+
with open(test_file_path, "rb") as file_obj:
32+
assert parser.from_file(file_obj)
33+
34+
35+
def test_local_buffer():
36+
response = parser.from_buffer("Good evening, Dave")
37+
assert response["status"] == HTTPStatus.OK
38+
39+
40+
def test_local_path(test_file_path):
41+
"""parse file path"""
42+
assert parser.from_file(str(test_file_path))
43+

tests/test_pdf.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
1-
from pathlib import Path
2-
31
from tika import pdf
42

5-
TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf"
6-
73

8-
def test_local_path():
9-
text_pages = pdf.text_from_pdf_pages(str(TEST_FILE_PATH))
4+
def test_local_path(test_file_path):
5+
text_pages = pdf.text_from_pdf_pages(str(test_file_path))
106
assert isinstance(text_pages, list)

tests/test_tika.py

Lines changed: 3 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -14,56 +14,12 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
from http import HTTPStatus
18-
from pathlib import Path
19-
20-
import tika.parser
2117
import tika.tika
22-
23-
TEST_FILE_PATH = Path(__file__).parent / "files" / "rwservlet.pdf"
24-
25-
26-
def test_remote_pdf():
27-
"""parse remote PDF"""
28-
assert tika.parser.from_file(
29-
"https://upload.wikimedia.org/wikipedia/commons/4/42/Article_feedback_flow_B_-_Thank_editors.pdf")
30-
31-
32-
def test_remote_html():
33-
"""parse remote HTML"""
34-
assert tika.parser.from_file("http://nossl.sh")
35-
36-
37-
def test_remote_mp3():
38-
"""parse remote mp3"""
39-
assert tika.parser.from_file(
40-
"https://archive.org/download/Ainst-Spaceshipdemo.mp3/Ainst-Spaceshipdemo.mp3")
41-
42-
43-
def test_remote_jpg():
44-
"""parse remote jpg"""
45-
assert tika.parser.from_file(
46-
"https://upload.wikimedia.org/wikipedia/commons/b/b7/X_logo.jpg")
47-
48-
49-
def test_local_binary():
50-
"""parse file binary"""
51-
with open(TEST_FILE_PATH, "rb") as file_obj:
52-
assert tika.parser.from_file(file_obj)
53-
54-
55-
def test_local_buffer():
56-
response = tika.parser.from_buffer("Good evening, Dave")
57-
assert response["status"] == HTTPStatus.OK
58-
59-
60-
def test_local_path():
61-
"""parse file path"""
62-
assert tika.parser.from_file(str(TEST_FILE_PATH))
18+
from tika import parser
6319

6420

65-
def test_kill_server():
21+
def test_kill_server(test_file_path):
6622
"""parse some file, then kill the server"""
67-
with open(TEST_FILE_PATH, "rb") as file_obj:
23+
with open(test_file_path, "rb") as file_obj:
6824
tika.parser.from_file(file_obj)
6925
assert tika.tika.killServer() is None

tests/test_unpack.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,24 @@
1-
from tempfile import NamedTemporaryFile
2-
31
from tika import unpack
42

53
# Test data
64
TEXT_UTF8 = "Hello, world!! 😎 👽"
75
TEXT_ASCII = "Hello, world!!"
86

97

10-
def test_utf8():
8+
def test_utf8(tmp_path):
119
"""Test UTF-8 encoding"""
12-
with NamedTemporaryFile("w+b", prefix="tika-python", suffix=".txt", dir="/tmp") as f:
13-
f.write(TEXT_UTF8.encode("utf8"))
14-
f.flush()
15-
f.seek(0)
16-
parsed = unpack.from_file(f.name)
17-
assert parsed["content"].strip() == TEXT_UTF8
10+
test_file = tmp_path / "test_utf8.txt"
11+
test_file.write_bytes(TEXT_UTF8.encode("utf8"))
12+
parsed = unpack.from_file(str(test_file))
13+
assert parsed["content"].strip() == TEXT_UTF8
1814

1915

20-
def test_ascii():
16+
def test_ascii(tmp_path):
2117
"""Test ASCII encoding"""
22-
with NamedTemporaryFile("w+t", prefix="tika-python", suffix=".txt", dir="/tmp") as f:
23-
f.write(TEXT_ASCII)
24-
f.flush()
25-
f.seek(0)
26-
parsed = unpack.from_file(f.name)
27-
assert parsed["content"].strip() == TEXT_ASCII
18+
test_file = tmp_path / "test_ascii.txt"
19+
test_file.write_text(TEXT_ASCII)
20+
parsed = unpack.from_file(str(test_file))
21+
assert parsed["content"].strip() == TEXT_ASCII
2822

2923

3024
def test_from_buffer():

0 commit comments

Comments
 (0)