@@ -58,7 +58,6 @@ Testing it out
5858Parser Interface (backwards compat prior to REST)
5959-------------------------------------------------
6060``` python
61- # !/usr/bin/env python
6261import tika
6362tika.initVM()
6463from tika import parser
@@ -79,8 +78,6 @@ The parser interface needs the following environment variable set on the console
7978``` export PYTHONIOENCODING=utf8 ```
8079
8180``` python
82- # !/usr/bin/env python
83- import tika
8481from tika import parser
8582parsed = parser.from_file(' /path/to/file' )
8683print (parsed[" metadata" ])
@@ -96,34 +93,35 @@ string_parsed = parser.from_buffer('Good evening, Dave', 'http://tika:9998/tika'
9693```
9794
9895You can also pass a binary stream:
99- ```
96+
97+ ``` python
10098with open (file , ' rb' ) as file_obj:
10199 response = tika.parser.from_file(file_obj)
102100```
103101
104102Gzip compression
105103---------------------
106- Since Tika 1.24.1 gzip compression of input and output streams is allowed.
104+ Since Tika 1.24.1 gzip compression of input and output streams is allowed.
107105
108106Input compression can be achieved with gzip or zlib:
109- ```
110- import zlib
107+ ``` python
108+ import zlib
111109
112- with open(file, 'rb') as file_obj:
113- return tika.parser.from_buffer(zlib.compress(file_obj.read()))
110+ with open (file , ' rb' ) as file_obj:
111+ return tika.parser.from_buffer(zlib.compress(file_obj.read()))
114112
115113...
116114
117- import gzip
118-
119- with open(file, 'rb') as file_obj:
120- return tika.parser.from_buffer(gzip.compress(file_obj.read()))
115+ import gzip
116+
117+ with open (file , ' rb' ) as file_obj:
118+ return tika.parser.from_buffer(gzip.compress(file_obj.read()))
121119```
122120
123121Output compression is requested with the `Accept-Encoding` header:
124- ```
125- with open(file, 'rb') as file_obj:
126- return tika.parser.from_file(file_obj, headers={'Accept-Encoding': 'gzip, deflate'})
122+ ``` python
123+ with open (file , ' rb' ) as file_obj:
124+ return tika.parser.from_file(file_obj, headers = {' Accept-Encoding' : ' gzip, deflate' })
127125```
128126
129127Specify Output Format To XHTML
@@ -136,8 +134,6 @@ The parser interface needs the following environment variable set on the console
136134``` export PYTHONIOENCODING=utf8 ```
137135
138136``` python
139- # !/usr/bin/env python
140- import tika
141137from tika import parser
142138parsed = parser.from_file(' /path/to/file' , xmlContent = True )
143139print (parsed[" metadata" ])
@@ -153,8 +149,6 @@ call and internally returns back a tarball of metadata and text entries that
153149is internally unpacked, reducing the wire load for extraction.
154150
155151``` python
156- # !/usr/bin/env python
157- import tika
158152from tika import unpack
159153parsed = unpack.from_file(' /path/to/file' )
160154```
@@ -165,8 +159,6 @@ The detect interface provides an IANA MIME type classification for the
165159provided file.
166160
167161``` python
168- # !/usr/bin/env python
169- import tika
170162from tika import detector
171163print (detector.from_file(' /path/to/file' ))
172164```
@@ -178,8 +170,6 @@ configuration including what parsers, mime types, and detectors the
178170server has been configured with.
179171
180172``` python
181- # !/usr/bin/env python
182- import tika
183173from tika import config
184174print (config.getParsers())
185175print (config.getMimeTypes())
@@ -192,7 +182,6 @@ The language detection interface provides a 2 character language
192182code detected based on the text in the provided file.
193183
194184``` python
195- # !/usr/bin/env python
196185from tika import language
197186print (language.from_file(' /path/to/file' ))
198187```
@@ -203,7 +192,6 @@ The translate interface translates the text automatically extracted
203192by Tika from the source language to the destination language.
204193
205194``` python
206- # !/usr/bin/env python
207195from tika import translate
208196print (translate.from_file(' /path/to/spanish' , ' es' , ' en' ))
209197```
@@ -215,6 +203,7 @@ Note you can also use a Parser and Detector
215203a string or bytes buffer in Python and/or detect its MIME
216204type. This is useful if you've already loaded
217205the content into memory.
206+
218207``` python
219208string_parsed = parser.from_buffer(' Good evening, Dave' )
220209byte_data: bytes = b ' B\xc3\xa4 ume'
@@ -224,9 +213,10 @@ parsed = parser.from_buffer(io.BytesIO(byte_data))
224213Using Client Only Mode
225214----------------------
226215You can set Tika to use client-only mode by setting:
216+
227217``` python
228- import tika from tika
229- tika.TikaClientOnly = True
218+ import tika. tika
219+ tika.tika. TikaClientOnly = True
230220```
231221
232222Then you can run any of the methods and it will fully
@@ -251,8 +241,8 @@ kill -9 PID
251241```
252242
253243``` python
254- import tika.tika
255244import os
245+ import tika.tika
256246from tika import parser
257247home = os.getenv(' HOME' )
258248tika.tika.TikaServerClasspath = home + ' /git/geotopicparser-utils/mime:' + home+ ' /git/geotopicparser-utils/models/polar'
0 commit comments