2222
2323from .tika import ServerEndpoint , callServer , parse1
2424
25- # Python 3 introduced .readable() to tarfile extracted files objects - this
26- # is required to wrap a TextIOWrapper around the object. However, wrapping
27- # with TextIOWrapper is only required for csv.reader() in Python 3, so the
28- # tarfile returned object can be used as is in earlier versions.
29- _text_wrapper = TextIOWrapper if version_info .major >= 3 else lambda x : x
25+ _text_wrapper = TextIOWrapper
3026
3127
3228def from_file (filename , serverEndpoint = ServerEndpoint , requestOptions = {}):
@@ -80,32 +76,18 @@ def _parse(tarOutput):
8076
8177 metadataMember = tarFile .getmember ("__METADATA__" )
8278 if not metadataMember .issym () and metadataMember .isfile ():
83- if version_info .major >= 3 :
84- with closing (_text_wrapper (tarFile .extractfile (metadataMember ), encoding = tarFile .encoding )) as metadataFile :
85- metadataReader = csv .reader (_truncate_nulls (metadataFile ))
86- for metadataLine in metadataReader :
87- # each metadata line comes as a key-value pair, with list values
88- # returned as extra values in the line - convert single values
89- # to non-list values to be consistent with parser metadata
90- assert len (metadataLine ) >= 2
91-
92- if len (metadataLine ) > 2 :
93- metadata [metadataLine [0 ]] = metadataLine [1 :]
94- else :
95- metadata [metadataLine [0 ]] = metadataLine [1 ]
96- else :
97- with closing (_text_wrapper (tarFile .extractfile (metadataMember ))) as metadataFile :
98- metadataReader = csv .reader (_truncate_nulls (metadataFile ))
99- for metadataLine in metadataReader :
100- # each metadata line comes as a key-value pair, with list values
101- # returned as extra values in the line - convert single values
102- # to non-list values to be consistent with parser metadata
103- assert len (metadataLine ) >= 2
104-
105- if len (metadataLine ) > 2 :
106- metadata [metadataLine [0 ]] = metadataLine [1 :]
107- else :
108- metadata [metadataLine [0 ]] = metadataLine [1 ]
79+ with closing (_text_wrapper (tarFile .extractfile (metadataMember ), encoding = tarFile .encoding )) as metadataFile :
80+ metadataReader = csv .reader (_truncate_nulls (metadataFile ))
81+ for metadataLine in metadataReader :
82+ # each metadata line comes as a key-value pair, with list values
83+ # returned as extra values in the line - convert single values
84+ # to non-list values to be consistent with parser metadata
85+ assert len (metadataLine ) >= 2
86+
87+ if len (metadataLine ) > 2 :
88+ metadata [metadataLine [0 ]] = metadataLine [1 :]
89+ else :
90+ metadata [metadataLine [0 ]] = metadataLine [1 ]
10991
11092
11193 # get the content
@@ -115,12 +97,8 @@ def _parse(tarOutput):
11597
11698 contentMember = tarFile .getmember ("__TEXT__" )
11799 if not contentMember .issym () and contentMember .isfile ():
118- if version_info .major >= 3 :
119- with closing (_text_wrapper (tarFile .extractfile (contentMember ), encoding = 'utf8' )) as content_file :
120- content = content_file .read ()
121- else :
122- with closing (tarFile .extractfile (contentMember )) as content_file :
123- content = content_file .read ().decode ('utf8' )
100+ with closing (_text_wrapper (tarFile .extractfile (contentMember ), encoding = 'utf8' )) as content_file :
101+ content = content_file .read ()
124102
125103 # get the remaining files as attachments
126104 attachments = {}
0 commit comments