|
14 | 14 | # See the License for the specific language governing permissions and |
15 | 15 | # limitations under the License. |
16 | 16 | # |
17 | | -# python -m unittest tika.tests.test_from_file_service |
18 | 17 |
|
19 | | -import unittest |
20 | 18 | from unittest import mock |
21 | 19 |
|
22 | | -import tika.parser |
23 | | - |
24 | | - |
25 | | -class CreateTest(unittest.TestCase): |
26 | | - 'test different services in from_file parsing: Content, Metadata or both in recursive mode' |
27 | | - |
28 | | - def test_default_service(self): |
29 | | - 'parse file using default service' |
30 | | - result = tika.parser.from_file( |
31 | | - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf') |
32 | | - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') |
33 | | - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) |
34 | | - @mock.patch('tika.parser._parse') |
35 | | - @mock.patch('tika.parser.parse1') |
36 | | - def test_remote_endpoint(self, tika_call_mock, _): |
37 | | - result = tika.parser.from_file( |
38 | | - 'filename', 'http://tika:9998/tika') |
39 | | - |
40 | | - tika_call_mock.assert_called_with( |
41 | | - 'all', 'filename', 'http://tika:9998/tika', headers=None, config_path=None, |
42 | | - requestOptions={}) |
43 | | - def test_default_service_explicit(self): |
44 | | - 'parse file using default service explicitly' |
45 | | - result = tika.parser.from_file( |
46 | | - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='all') |
47 | | - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') |
48 | | - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) |
49 | | - def test_text_service(self): |
50 | | - 'parse file using the content only service' |
51 | | - result = tika.parser.from_file( |
52 | | - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='text') |
53 | | - self.assertIsNone(result['metadata']) |
54 | | - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) |
55 | | - def test_meta_service(self): |
56 | | - 'parse file using the content only service' |
57 | | - result = tika.parser.from_file( |
58 | | - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='meta') |
59 | | - self.assertIsNone(result['content']) |
60 | | - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') |
61 | | - def test_invalid_service(self): |
62 | | - 'parse file using an invalid service should perform the default parsing' |
63 | | - result = tika.parser.from_file( |
64 | | - 'https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf', service='bad') |
65 | | - self.assertEqual(result['metadata']['Content-Type'],'application/pdf') |
66 | | - self.assertIn('AUTORIDADES Y PERSONAL',result['content']) |
67 | | - |
68 | | -if __name__ == '__main__': |
69 | | - unittest.main() |
| 20 | +from tika import parser |
| 21 | + |
| 22 | +TEST_PDF_URL = "https://boe.es/boe/dias/2019/12/02/pdfs/BOE-A-2019-17288.pdf" |
| 23 | + |
| 24 | + |
| 25 | +def test_default_service(): |
| 26 | + "parse file using default service" |
| 27 | + result = parser.from_file(TEST_PDF_URL) |
| 28 | + assert result["metadata"]["Content-Type"] == "application/pdf" |
| 29 | + assert "AUTORIDADES Y PERSONAL" in result["content"] |
| 30 | + |
| 31 | + |
| 32 | +@mock.patch("tika.parser._parse") |
| 33 | +@mock.patch("tika.parser.parse1") |
| 34 | +def test_remote_endpoint(tika_call_mock, _): |
| 35 | + result = parser.from_file("filename", "http://tika:9998/tika") |
| 36 | + |
| 37 | + tika_call_mock.assert_called_with( |
| 38 | + "all", |
| 39 | + "filename", |
| 40 | + "http://tika:9998/tika", |
| 41 | + headers=None, |
| 42 | + config_path=None, |
| 43 | + requestOptions={}, |
| 44 | + ) |
| 45 | + |
| 46 | + |
| 47 | +def test_default_service_explicit(): |
| 48 | + "parse file using default service explicitly" |
| 49 | + result = parser.from_file(TEST_PDF_URL, service="all") |
| 50 | + assert result["metadata"]["Content-Type"] == "application/pdf" |
| 51 | + assert "AUTORIDADES Y PERSONAL" in result["content"] |
| 52 | + |
| 53 | + |
| 54 | +def test_text_service(): |
| 55 | + "parse file using the content only service" |
| 56 | + result = parser.from_file(TEST_PDF_URL, service="text") |
| 57 | + assert result["metadata"] is None |
| 58 | + assert "AUTORIDADES Y PERSONAL" in result["content"] |
| 59 | + |
| 60 | + |
| 61 | +def test_meta_service(): |
| 62 | + "parse file using the metadata only service" |
| 63 | + result = parser.from_file(TEST_PDF_URL, service="meta") |
| 64 | + assert result["content"] is None |
| 65 | + assert result["metadata"]["Content-Type"] == "application/pdf" |
| 66 | + |
| 67 | + |
| 68 | +def test_invalid_service(): |
| 69 | + "parse file using an invalid service should perform the default parsing" |
| 70 | + result = parser.from_file(TEST_PDF_URL, service="bad") |
| 71 | + assert result["metadata"]["Content-Type"] == "application/pdf" |
| 72 | + assert "AUTORIDADES Y PERSONAL" in result["content"] |
0 commit comments