Skip to content

Commit fa19e1c

Browse files
committed
re-factor string conversions to rjstring.c
1 parent 7f57cd9 commit fa19e1c

4 files changed

Lines changed: 266 additions & 172 deletions

File tree

src/Rglue.c

Lines changed: 4 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <R_ext/Print.h>
77
#include <R_ext/Riconv.h>
88
#include <errno.h>
9+
#include "rjstring.h"
910

1011
/* R 4.0.1 broke EXTPTR_PTR ABI so re-map it to safety at
1112
the small expense of speed */
@@ -156,9 +157,9 @@ SEXP j2SEXP(JNIEnv *env, jobject o, int releaseLocal) {
156157
}
157158
}
158159

159-
#if R_VERSION >= R_Version(2,7,0)
160160
/* returns string from a CHARSXP making sure that the result is in UTF-8
161-
NOTE: this should NOT be used to create Java strings as they require UTF-16 natively */
161+
NOTE: this should NOT be used to create Java strings as they require UTF-16 natively
162+
For Java strings use the rj_*_utf16 functions from rjstring.h */
162163
const char *rj_char_utf8(SEXP s) {
163164
#ifdef DEBUG_ENCODING
164165
fprintf(stderr, "rJava.rj_char_utf8, CE=%d: \"%s\"\n", (int)Rf_getCharCE(s), CHAR(s));
@@ -168,176 +169,10 @@ const char *rj_char_utf8(SEXP s) {
168169
return (Rf_getCharCE(s) == CE_UTF8) ? CHAR(s) : Rf_reEnc(CHAR(s), getCharCE(s), CE_UTF8, 0); /* subst. invalid chars: 1=hex, 2=., 3=?, other=skip */
169170
}
170171

171-
#ifdef WIN32
172-
extern unsigned int localeCP;
173-
static char cpbuf[16];
174-
#endif
175-
static jchar js_zero[2] = { 0, 0 };
176-
static jchar js_buf[128];
177-
/* returns string from a CHARSXP making sure that the result is in UTF-16.
178-
the buffer is owned by the function and may be static, so copy after use */
179-
int rj_char_utf16(SEXP s, jchar **buf) {
180-
void *ih;
181-
cetype_t ce_in = getCharCE(s);
182-
const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0);
183-
if (ce == c) {
184-
buf[0] = js_zero;
185-
return 0;
186-
}
187-
size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c;
188-
jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1);
189-
char *dst = (char*) js;
190-
int end_test = 1;
191-
192-
#ifdef DEBUG_ENCODING
193-
fprintf(stderr, "rJava.rj_char_utf16, CE=%d:", (int)ce_in);
194-
{ const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); }
195-
fprintf(stderr, "\n");
196-
#endif
197-
198-
switch (ce_in) {
199-
#ifdef WIN32
200-
case CE_NATIVE:
201-
/* reEnc uses this, but translateCharUtf8 uses "" so let's go with ""
202-
sprintf(cpbuf, "CP%d", localeCP);
203-
ifrom = cpbuf;
204-
*/
205-
break;
206-
case CE_LATIN1: ifrom = "CP1252"; break;
207-
#else
208-
case CE_NATIVE: break; /* is already "" */
209-
case CE_LATIN1: ifrom = "latin1"; break;
210-
#endif
211-
default:
212-
ifrom = "UTF-8"; break;
213-
}
214-
215-
#ifdef DEBUG_ENCODING
216-
fprintf(stderr, " '%s' -> UTF-16: ", ifrom);
217-
#endif
218-
ih = Riconv_open(((char*)&end_test)[0] == 1 ? "UTF-16LE" : "UTF-16BE", ifrom);
219-
if(ih == (void *)(-1))
220-
Rf_error("Unable to start conversion to UTF-16");
221-
while (c < ce) {
222-
size_t res = Riconv(ih, &c, &isize, &dst, &osize);
223-
/* this should never happen since we allocated far more than needed */
224-
if (res == -1 && errno == E2BIG)
225-
Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements.");
226-
else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */
227-
*(dst++) = '?';
228-
*(dst++) = 0;
229-
osize -= 2;
230-
c++;
231-
isize--;
232-
}
233-
}
234-
Riconv_close(ih);
235-
#ifdef DEBUG_ENCODING
236-
{ const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); }
237-
fprintf(stderr, "\n");
238-
#endif
239-
return dst - (char*) js;
240-
}
241-
242-
/* Java returns *modified* UTF-8 which is incompatible with UTF-8,
243-
so we have to detect the illegal surrogate pairs and convert them */
244-
SEXP mkCharUTF8(const char *src) {
245-
const unsigned char *s = (const unsigned char*) src;
246-
const unsigned char *c = (const unsigned char*) s;
247-
/* check if the string contains any surrogate pairs, i.e.
248-
Unicode in the range 0xD800-0xDFFF
249-
We want this to be fast since in 99.99% of cases it will
250-
be false */
251-
while (*c) {
252-
if (c[0] == 0xED &&
253-
(c[1] & 0xE0) == 0xA0)
254-
break;
255-
c++;
256-
}
257-
if (*c) { /* yes, we have to convert them */
258-
SEXP res;
259-
const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */
260-
unsigned char *dst = 0, *d, sbuf[64];
261-
if (!e) /* should never occur */
262-
return mkChar("");
263-
/* we use static buffer for small strings and dynamic alloc for large */
264-
if (e - s >= sizeof(sbuf)) {
265-
/* allocate temp buffer since our input is const */
266-
d = dst = (unsigned char *) malloc(e - s + 1);
267-
if (!dst)
268-
Rf_error("Cannot allocate memory for surrogate pair conversion");
269-
} else
270-
d = (unsigned char *)sbuf;
271-
if (c - s > 0) {
272-
memcpy(d, s, c - s);
273-
d += c - s;
274-
}
275-
while (*c) {
276-
unsigned int u1, u;
277-
*(d++) = *(c++);
278-
/* start of a sequence ? */
279-
if ((c[-1] & 0xC0) != 0xC0)
280-
continue;
281-
if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */
282-
if ((c[0] & 0xC0) != 0x80) {
283-
if (dst) free(dst);
284-
Rf_error("illegal 2-byte sequence in Java string");
285-
}
286-
*(d++) = *(c++);
287-
continue;
288-
}
289-
if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */
290-
if (dst) free(dst);
291-
Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)");
292-
}
293-
if (((c[0] & 0xC0) != 0x80 ||
294-
(c[1] & 0xC0) != 0x80)) {
295-
if (dst) free(dst);
296-
Rf_error("illegal 3-byte sequence in Java string");
297-
}
298-
u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) |
299-
((((unsigned int)c[0]) & 0x3F) << 6) |
300-
(((unsigned int)c[1]) & 0x3F);
301-
if (u1 < 0xD800 || u1 > 0xDBFF) { /* not a surrogate pair -> regular copy */
302-
*(d++) = *(c++);
303-
*(d++) = *(c++);
304-
continue;
305-
}
306-
if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? */
307-
if (dst) free(dst);
308-
Rf_error("illegal sequence in Java string: low surrogate pair without a high one");
309-
}
310-
c += 2; /* move to the low pair */
311-
if (c[0] != 0xED ||
312-
(c[1] & 0xF0) != 0xB0 ||
313-
(c[2] & 0xC0) != 0x80) {
314-
if (dst) free(dst);
315-
Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one");
316-
}
317-
/* the actually encoded unicode character */
318-
u = ((((unsigned int)c[1]) & 0x0F) << 6) |
319-
(((unsigned int)c[2]) & 0x3F);
320-
u |= (u1 & 0x03FF) << 10;
321-
u += 0x10000;
322-
c += 3;
323-
/* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */
324-
d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0);
325-
*(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80);
326-
*(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80);
327-
*(d++) = (unsigned char) ((u & 0x3F) | 0x80);
328-
}
329-
res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8);
330-
if (dst) free(dst);
331-
return res;
332-
}
333-
return mkCharLenCE(src, c - s, CE_UTF8);
334-
}
335-
336-
#endif
337172

338173
static jstring newJavaString(JNIEnv *env, SEXP sChar) {
339174
jchar *s;
340-
size_t len = rj_char_utf16(sChar, &s);
175+
size_t len = rj_rchar_utf16(sChar, &s);
341176
return newString16(env, s, (len + 1) >> 1);
342177
}
343178

src/rJava.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,17 @@ void profReport(char *fmt, ...);
9797
#define END_RJAVA_CALL };
9898
#endif
9999

100-
/* define mkCharUTF8 in a compatible fashion */
100+
/* define mkCharUTF8 in a compatible fashion
101+
NOTE: those should NOT be used anymore since native
102+
Java strings use UTF-16, so use them only in cases where UTF-8 is required */
101103
#if R_VERSION < R_Version(2,7,0)
102104
#define mkCharUTF8(X) mkChar(X)
103105
#define CHAR_UTF8(X) CHAR(X)
104106
#else
107+
#define mkCharUTF8(X) rj_mkCharUTF8(X)
105108
#define CHAR_UTF8(X) rj_char_utf8(X)
106-
extern SEXP mkCharUTF8(const char *);
107-
extern const char *rj_char_utf8(SEXP);
109+
extern SEXP rj_mkCharUTF8(const char *); /* rjstring.c */
110+
extern const char *rj_char_utf8(SEXP); /* Rglue.c */
108111
#endif
109112

110113
/* signatures are stored in a local buffer if they fit. Only if they don't fit a heap buffer is allocated and used. */

0 commit comments

Comments
 (0)