2323
2424from .tika import ServerEndpoint , callServer , parse1
2525
26- # Python 3 introduced .readable() to tarfile extracted file objects - this
27- # is required to wrap a TextIOWrapper around the object. However, wrapping
28- # with TextIOWrapper is only required for csv.reader() in Python 3, so the
29- # tarfile returned object can be used as is in earlier versions.
30- _text_wrapper = TextIOWrapper if version_info .major >= 3 else lambda x : x
26+ _text_wrapper = TextIOWrapper
3127
3228
3329def from_file (filename , serverEndpoint = ServerEndpoint , requestOptions = {}):
@@ -81,32 +77,18 @@ def _parse(tarOutput):
8177
8278 metadataMember = tarFile .getmember ("__METADATA__" )
8379 if not metadataMember .issym () and metadataMember .isfile ():
84- if version_info .major >= 3 :
85- with closing (_text_wrapper (tarFile .extractfile (metadataMember ), encoding = tarFile .encoding )) as metadataFile :
86- metadataReader = csv .reader (_truncate_nulls (metadataFile ))
87- for metadataLine in metadataReader :
88- # each metadata line comes as a key-value pair, with list values
89- # returned as extra values in the line - convert single values
90- # to non-list values to be consistent with parser metadata
91- assert len (metadataLine ) >= 2
92-
93- if len (metadataLine ) > 2 :
94- metadata [metadataLine [0 ]] = metadataLine [1 :]
95- else :
96- metadata [metadataLine [0 ]] = metadataLine [1 ]
97- else :
98- with closing (_text_wrapper (tarFile .extractfile (metadataMember ))) as metadataFile :
99- metadataReader = csv .reader (_truncate_nulls (metadataFile ))
100- for metadataLine in metadataReader :
101- # each metadata line comes as a key-value pair, with list values
102- # returned as extra values in the line - convert single values
103- # to non-list values to be consistent with parser metadata
104- assert len (metadataLine ) >= 2
105-
106- if len (metadataLine ) > 2 :
107- metadata [metadataLine [0 ]] = metadataLine [1 :]
108- else :
109- metadata [metadataLine [0 ]] = metadataLine [1 ]
80+ with closing (_text_wrapper (tarFile .extractfile (metadataMember ), encoding = tarFile .encoding )) as metadataFile :
81+ metadataReader = csv .reader (_truncate_nulls (metadataFile ))
82+ for metadataLine in metadataReader :
83+ # each metadata line comes as a key-value pair, with list values
84+ # returned as extra values in the line - convert single values
85+ # to non-list values to be consistent with parser metadata
86+ assert len (metadataLine ) >= 2
87+
88+ if len (metadataLine ) > 2 :
89+ metadata [metadataLine [0 ]] = metadataLine [1 :]
90+ else :
91+ metadata [metadataLine [0 ]] = metadataLine [1 ]
11092
11193
11294 # get the content
@@ -116,12 +98,8 @@ def _parse(tarOutput):
11698
11799 contentMember = tarFile .getmember ("__TEXT__" )
118100 if not contentMember .issym () and contentMember .isfile ():
119- if version_info .major >= 3 :
120- with closing (_text_wrapper (tarFile .extractfile (contentMember ), encoding = 'utf8' )) as content_file :
121- content = content_file .read ()
122- else :
123- with closing (tarFile .extractfile (contentMember )) as content_file :
124- content = content_file .read ().decode ('utf8' )
101+ with closing (_text_wrapper (tarFile .extractfile (contentMember ), encoding = 'utf8' )) as content_file :
102+ content = content_file .read ()
125103
126104 # get the remaining files as attachments
127105 attachments = {}
0 commit comments