66#include <R_ext/Print.h>
77#include <R_ext/Riconv.h>
88#include <errno.h>
9+ #include "rjstring.h"
910
1011/* R 4.0.1 broke EXTPTR_PTR ABI so re-map it to safety at
1112 the small expense of speed */
@@ -156,9 +157,9 @@ SEXP j2SEXP(JNIEnv *env, jobject o, int releaseLocal) {
156157 }
157158}
158159
159- #if R_VERSION >= R_Version (2 ,7 ,0 )
160160/* returns string from a CHARSXP making sure that the result is in UTF-8
161- NOTE: this should NOT be used to create Java strings as they require UTF-16 natively */
161+ NOTE: this should NOT be used to create Java strings as they require UTF-16 natively
162+ For Java strings use rj_*_utf16 function from rjstring.h */
162163const char * rj_char_utf8 (SEXP s ) {
163164#ifdef DEBUG_ENCODING
164165 fprintf (stderr , "rJava.rj_char_utf8, CE=%d: \"%s\"\n" , (int )Rf_getCharCE (s ), CHAR (s ));
@@ -168,176 +169,10 @@ const char *rj_char_utf8(SEXP s) {
168169 return (Rf_getCharCE (s ) == CE_UTF8 ) ? CHAR (s ) : Rf_reEnc (CHAR (s ), getCharCE (s ), CE_UTF8 , 0 ); /* subst. invalid chars: 1=hex, 2=., 3=?, other=skip */
169170}
170171
171- #ifdef WIN32
172- extern unsigned int localeCP ;
173- static char cpbuf [16 ];
174- #endif
175- static jchar js_zero [2 ] = { 0 , 0 };
176- static jchar js_buf [128 ];
177- /* returns string from a CHARSXP making sure that the result is in UTF-16.
178- the buffer is owned by the function and may be static, so copy after use */
179- int rj_char_utf16 (SEXP s , jchar * * buf ) {
180- void * ih ;
181- cetype_t ce_in = getCharCE (s );
182- const char * ifrom = "" , * c = CHAR (s ), * ce = strchr (c , 0 );
183- if (ce == c ) {
184- buf [0 ] = js_zero ;
185- return 0 ;
186- }
187- size_t osize = sizeof (jchar ) * (ce - c + 1 ), isize = ce - c ;
188- jchar * js = buf [0 ] = (osize < sizeof (js_buf )) ? js_buf : (jchar * ) R_alloc (sizeof (jchar ), ce - c + 1 );
189- char * dst = (char * ) js ;
190- int end_test = 1 ;
191-
192- #ifdef DEBUG_ENCODING
193- fprintf (stderr , "rJava.rj_char_utf16, CE=%d:" , (int )ce_in );
194- { const char * c0 = c ; while (* c0 ) fprintf (stderr , " %02x" , (int )((unsigned char )* (c0 ++ ))); }
195- fprintf (stderr , "\n" );
196- #endif
197-
198- switch (ce_in ) {
199- #ifdef WIN32
200- case CE_NATIVE :
201- /* reEnc uses this, but translateCharUtf8 uses "" so let's go with ""
202- sprintf(cpbuf, "CP%d", localeCP);
203- ifrom = cpbuf;
204- */
205- break ;
206- case CE_LATIN1 : ifrom = "CP1252" ; break ;
207- #else
208- case CE_NATIVE : break ; /* is already "" */
209- case CE_LATIN1 : ifrom = "latin1" ; break ;
210- #endif
211- default :
212- ifrom = "UTF-8" ; break ;
213- }
214-
215- #ifdef DEBUG_ENCODING
216- fprintf (stderr , " '%s' -> UTF-16: " , ifrom );
217- #endif
218- ih = Riconv_open (((char * )& end_test )[0 ] == 1 ? "UTF-16LE" : "UTF-16BE" , ifrom );
219- if (ih == (void * )(-1 ))
220- Rf_error ("Unable to start conversion to UTF-16" );
221- while (c < ce ) {
222- size_t res = Riconv (ih , & c , & isize , & dst , & osize );
223- /* this should never happen since we allocated far more than needed */
224- if (res == -1 && errno == E2BIG )
225- Rf_error ("Conversion to UTF-16 failed due to unexpectedly large buffer requirements." );
226- else if (res == -1 && (errno == EILSEQ || errno == EINVAL )) { /* invalid char */
227- * (dst ++ ) = '?' ;
228- * (dst ++ ) = 0 ;
229- osize -= 2 ;
230- c ++ ;
231- isize -- ;
232- }
233- }
234- Riconv_close (ih );
235- #ifdef DEBUG_ENCODING
236- { const jchar * j = js ; while (j < (const jchar * )dst ) fprintf (stderr , " %04x" , (unsigned int )* (j ++ )); }
237- fprintf (stderr , "\n" );
238- #endif
239- return dst - (char * ) js ;
240- }
241-
242- /* Java returns *modified* UTF-8 which is incompatible with UTF-8,
243- so we have to detect the illegal surrgoate pairs and convert them */
244- SEXP mkCharUTF8 (const char * src ) {
245- const unsigned char * s = (const unsigned char * ) src ;
246- const unsigned char * c = (const unsigned char * ) s ;
247- /* check if the string contains any surrogate pairs, i.e.
248- Unicode in the range 0xD800-0xDFFF
249- We want this to be fast since in 99.99% of cases it will
250- be false */
251- while (* c ) {
252- if (c [0 ] == 0xED &&
253- (c [1 ] & 0xE0 ) == 0xA0 )
254- break ;
255- c ++ ;
256- }
257- if (* c ) { /* yes, we have to convert them */
258- SEXP res ;
259- const unsigned char * e = (const unsigned char * ) strchr ((const char * )s , 0 ); /* find the end for size */
260- unsigned char * dst = 0 , * d , sbuf [64 ];
261- if (!e ) /* should never occur */
262- return mkChar ("" );
263- /* we use static buffer for small strings and dynamic alloc for large */
264- if (e - s >= sizeof (sbuf )) {
265- /* allocate temp buffer since our input is const */
266- d = dst = (unsigned char * ) malloc (e - s + 1 );
267- if (!dst )
268- Rf_error ("Cannot allocate memory for surrogate pair conversion" );
269- } else
270- d = (unsigned char * )sbuf ;
271- if (c - s > 0 ) {
272- memcpy (d , s , c - s );
273- d += c - s ;
274- }
275- while (* c ) {
276- unsigned int u1 , u ;
277- * (d ++ ) = * (c ++ );
278- /* start of a sequence ? */
279- if ((c [-1 ] & 0xC0 ) != 0xC0 )
280- continue ;
281- if ((c [-1 ] & 0xE0 ) == 0xC0 ) { /* 2-byte, not a surrogate pair */
282- if ((c [0 ] & 0xC0 ) != 0x80 ) {
283- if (dst ) free (dst );
284- Rf_error ("illegal 2-byte sequence in Java string" );
285- }
286- * (d ++ ) = * (c ++ );
287- continue ;
288- }
289- if ((c [-1 ] & 0xF0 ) != 0xE0 ) { /* must be 3-byte */
290- if (dst ) free (dst );
291- Rf_error ("illegal multi-byte seqeunce in Java string (>3-byte)" );
292- }
293- if (((c [0 ] & 0xC0 ) != 0x80 ||
294- (c [1 ] & 0xC0 ) != 0x80 )) {
295- if (dst ) free (dst );
296- Rf_error ("illegal 3-byte sequence in Java string" );
297- }
298- u1 = ((((unsigned int )c [-1 ]) & 0x0F ) << 12 ) |
299- ((((unsigned int )c [0 ]) & 0x3F ) << 6 ) |
300- (((unsigned int )c [1 ]) & 0x3F );
301- if (u1 < 0xD800 || u1 > 0xDBFF ) { /* not a surrogate pair -> regular copy */
302- * (d ++ ) = * (c ++ );
303- * (d ++ ) = * (c ++ );
304- continue ;
305- }
306- if (u1 >= 0xDC00 && u1 <= 0xDFFF ) { /* low surrogate pair ? */
307- if (dst ) free (dst );
308- Rf_error ("illegal sequence in Java string: low surrogate pair without a high one" );
309- }
310- c += 2 ; /* move to the low pair */
311- if (c [0 ] != 0xED ||
312- (c [1 ] & 0xF0 ) != 0xB0 ||
313- (c [2 ] & 0xC0 ) != 0x80 ) {
314- if (dst ) free (dst );
315- Rf_error ("illegal sequence in Java string: high surrogate pair not followed by low one" );
316- }
317- /* the actually encoded unicode character */
318- u = ((((unsigned int )c [1 ]) & 0x0F ) << 6 ) |
319- (((unsigned int )c [2 ]) & 0x3F );
320- u |= (u1 & 0x03FF ) << 10 ;
321- u += 0x10000 ;
322- c += 3 ;
323- /* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */
324- d [-1 ] = (unsigned char ) (((u >> 18 ) & 0x0F ) | 0xF0 );
325- * (d ++ ) = (unsigned char ) (((u >> 12 ) & 0x3F ) | 0x80 );
326- * (d ++ ) = (unsigned char ) (((u >> 6 ) & 0x3F ) | 0x80 );
327- * (d ++ ) = (unsigned char ) ((u & 0x3F ) | 0x80 );
328- }
329- res = mkCharLenCE ((const char * ) (dst ? dst : sbuf ), dst ? (d - dst ) : (d - sbuf ), CE_UTF8 );
330- if (dst ) free (dst );
331- return res ;
332- }
333- return mkCharLenCE (src , c - s , CE_UTF8 );
334- }
335-
336- #endif
337172
338173static jstring newJavaString (JNIEnv * env , SEXP sChar ) {
339174 jchar * s ;
340- size_t len = rj_char_utf16 (sChar , & s );
175+ size_t len = rj_rchar_utf16 (sChar , & s );
341176 return newString16 (env , s , (len + 1 ) >> 1 );
342177}
343178
0 commit comments