Skip to content

Commit 9c77b6e

Browse files
committed
copy instead of symlink because of Windows
1 parent 73495bb commit 9c77b6e

2 files changed

Lines changed: 256 additions & 2 deletions

File tree

jri/src/rjstring.c

Lines changed: 0 additions & 1 deletion
This file was deleted.

jri/src/rjstring.c

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
#include "rjstring.h"
2+
3+
#include <string.h>
4+
#include <stdlib.h>
5+
#include <R_ext/Riconv.h>
6+
#include <errno.h>
7+
8+
#ifdef WIN32
9+
/* -- currently unused - was used to mimic reEnc()
10+
extern unsigned int localeCP;
11+
static char cpbuf[16]; */
12+
#endif
13+
/* empty UTF-16 string; handed out through buf[0] whenever the input is empty */
static jchar js_zero[2] = { 0, 0 };
14+
/* shared static output buffer used by rj_char_utf16 when the worst-case
   result is smaller than this buffer; its content is overwritten by the
   next such conversion */
static jchar js_buf[128];
15+
16+
/* if len = -1 then c is assumed to be NUL terminated */
17+
int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error) {
18+
void *ih;
19+
const char *ce = (len < 0) ? strchr(c, 0) : (c + len);
20+
if (ce == c) {
21+
buf[0] = js_zero;
22+
return 0;
23+
}
24+
size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c;
25+
jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1);
26+
char *dst = (char*) js;
27+
int end_test = 1, is_le = (((char*)&end_test)[0] == 1) ? 1 : 0;
28+
if (!ifrom) ifrom = "";
29+
30+
#ifdef DEBUG_ENCODING
31+
fprintf(stderr, "rJava.rj_char_utf16_native:");
32+
{ const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); }
33+
fprintf(stderr, "\n");
34+
#endif
35+
36+
ih = Riconv_open(is_le ? "UTF-16LE" : "UTF-16BE", ifrom);
37+
if (ih == (void *)(-1)) {
38+
if (can_error)
39+
Rf_error("Unable to start conversion to UTF-16");
40+
return -1;
41+
}
42+
while (c < ce) {
43+
size_t res = Riconv(ih, &c, &isize, &dst, &osize);
44+
/* this should never happen since we allocated far more than needed */
45+
if (res == -1 && errno == E2BIG) {
46+
if (can_error)
47+
Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements.");
48+
return -1;
49+
} else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */
50+
if (is_le) {
51+
*(dst++) = '?';
52+
*(dst++) = 0;
53+
} else {
54+
*(dst++) = 0;
55+
*(dst++) = '?';
56+
}
57+
osize -= 2;
58+
c++;
59+
isize--;
60+
}
61+
}
62+
Riconv_close(ih);
63+
#ifdef DEBUG_ENCODING
64+
{ const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); }
65+
fprintf(stderr, "\n");
66+
#endif
67+
return dst - (char*) js;
68+
}
69+
70+
/* returns string from a CHARSXP making sure that the result is in UTF-16.
71+
the buffer is owned by the function and may be static, so copy after use.
72+
73+
Returns the length of the resulting string or -1 on error (if
74+
can_error is 0).
75+
*/
76+
static int rj_CHARSXP_utf16_(SEXP s, jchar **buf, int can_error) {
77+
cetype_t ce_in = getCharCE(s);
78+
const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0);
79+
if (ce == c) {
80+
buf[0] = js_zero;
81+
return 0;
82+
}
83+
84+
switch (ce_in) {
85+
#ifdef WIN32
86+
case CE_NATIVE:
87+
/* reEnc uses this, but translateCharUtf8 uses "" so let's go with ""
88+
sprintf(cpbuf, "CP%d", localeCP);
89+
ifrom = cpbuf;
90+
*/
91+
break;
92+
case CE_LATIN1: ifrom = "CP1252"; break;
93+
#else
94+
case CE_NATIVE: break; /* is already "" */
95+
case CE_LATIN1: ifrom = "latin1"; break;
96+
#endif
97+
default:
98+
ifrom = "UTF-8"; break;
99+
}
100+
101+
return rj_char_utf16(c, ce - c, buf, ifrom, can_error);
102+
}
103+
104+
/* CHARSXP -> UTF-16 with can_error = 1: a failed conversion raises an R error */
int rj_rchar_utf16(SEXP s, jchar **buf) {
    return rj_CHARSXP_utf16_(s, buf, 1);
}
105+
/* CHARSXP -> UTF-16 with can_error = 0: a failed conversion is reported
   through a -1 return value instead of an R error */
int rj_rchar_utf16_noerr(SEXP s, jchar **buf) {
    return rj_CHARSXP_utf16_(s, buf, 0);
}
106+
107+
/* FIXME: we should probably deprecate this as well and use UTF-16 instead.
108+
The only reason not to is that we would have to fully implement
109+
a full UTF-16 -> UTF-8 conversion including surrogate pairs ... */
110+
111+
/* Java returns *modified* UTF-8 which is incompatible with UTF-8,
112+
so we have to detect the illegal surrgoate pairs and convert them */
113+
SEXP rj_mkCharUTF8_(const char *src, int can_error) {
114+
const unsigned char *s = (const unsigned char*) src;
115+
const unsigned char *c = (const unsigned char*) s;
116+
/* check if the string contains any surrogate pairs, i.e.
117+
Unicode in the range 0xD800-0xDFFF
118+
We want this to be fast since in 99.99% of cases it will
119+
be false */
120+
while (*c) {
121+
if (c[0] == 0xED &&
122+
(c[1] & 0xE0) == 0xA0)
123+
break;
124+
c++;
125+
}
126+
if (*c) { /* yes, we have to convert them */
127+
SEXP res;
128+
const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */
129+
unsigned char *dst = 0, *d, sbuf[64];
130+
if (!e) /* should never occur */
131+
return mkChar("");
132+
/* we use static buffer for small strings and dynamic alloc for large */
133+
if (e - s >= sizeof(sbuf)) {
134+
/* allocate temp buffer since our input is const */
135+
d = dst = (unsigned char *) malloc(e - s + 1);
136+
if (!dst) {
137+
if (can_error)
138+
Rf_error("Cannot allocate memory for surrogate pair conversion");
139+
return 0;
140+
}
141+
} else
142+
d = (unsigned char *)sbuf;
143+
if (c - s > 0) {
144+
memcpy(d, s, c - s);
145+
d += c - s;
146+
}
147+
while (*c) {
148+
unsigned int u1, u;
149+
*(d++) = *(c++);
150+
/* start of a sequence ? */
151+
if ((c[-1] & 0xC0) != 0xC0)
152+
continue;
153+
if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */
154+
if ((c[0] & 0xC0) != 0x80) {
155+
if (dst) free(dst);
156+
if (can_error)
157+
Rf_error("illegal 2-byte sequence in Java string");
158+
return 0;
159+
}
160+
*(d++) = *(c++);
161+
continue;
162+
}
163+
if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */
164+
if (dst) free(dst);
165+
if (can_error)
166+
Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)");
167+
return 0;
168+
}
169+
if (((c[0] & 0xC0) != 0x80 ||
170+
(c[1] & 0xC0) != 0x80)) {
171+
if (dst) free(dst);
172+
if (can_error)
173+
Rf_error("illegal 3-byte sequence in Java string");
174+
return 0;
175+
}
176+
u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) |
177+
((((unsigned int)c[0]) & 0x3F) << 6) |
178+
(((unsigned int)c[1]) & 0x3F);
179+
if (u1 < 0xD800 || u1 > 0xDBFF) { /* not a surrogate pair -> regular copy */
180+
*(d++) = *(c++);
181+
*(d++) = *(c++);
182+
continue;
183+
}
184+
if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? */
185+
if (dst) free(dst);
186+
if (can_error)
187+
Rf_error("illegal sequence in Java string: low surrogate pair without a high one");
188+
return 0;
189+
}
190+
c += 2; /* move to the low pair */
191+
if (c[0] != 0xED ||
192+
(c[1] & 0xF0) != 0xB0 ||
193+
(c[2] & 0xC0) != 0x80) {
194+
if (dst) free(dst);
195+
if (can_error)
196+
Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one");
197+
return 0;
198+
}
199+
/* the actually encoded unicode character */
200+
u = ((((unsigned int)c[1]) & 0x0F) << 6) |
201+
(((unsigned int)c[2]) & 0x3F);
202+
u |= (u1 & 0x03FF) << 10;
203+
u += 0x10000;
204+
c += 3;
205+
/* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */
206+
d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0);
207+
*(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80);
208+
*(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80);
209+
*(d++) = (unsigned char) ((u & 0x3F) | 0x80);
210+
}
211+
res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8);
212+
if (dst) free(dst);
213+
return res;
214+
}
215+
return mkCharLenCE(src, c - s, CE_UTF8);
216+
}
217+
218+
/* Modified UTF-8 -> CHARSXP; malformed input raises an R error
   (can_error = 1), matching the rj_rchar_utf16 convention. */
SEXP rj_mkCharUTF8(const char *src) { return rj_mkCharUTF8_(src, 1); }
/* Same conversion with can_error = 0: no R error is raised, the result
   of rj_mkCharUTF8_ for the failure case is returned instead. */
SEXP rj_mkCharUTF8_noerr(const char *src) { return rj_mkCharUTF8_(src, 0); }
220+
221+
/* Creates a Java string from a CHARSXP by converting its content to
   UTF-16 with rj_rchar_utf16 and passing the result to JNI NewString.
   Java exceptions are not checked here. */
jstring rj_newJavaString(JNIEnv *env, SEXP sChar) {
    jchar *utf16;
    int nbytes = rj_rchar_utf16(sChar, &utf16);
    /* the conversion reports bytes, NewString wants UTF-16 code units */
    int nunits = (nbytes + 1) >> 1;
    return (*env)->NewString(env, utf16, nunits);
}
226+
227+
/* Creates a Java string from len bytes of str (NUL terminated if len is
   negative), converted from the "" encoding without raising R errors.
   Returns 0 if the conversion fails. Java exceptions are not checked. */
jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len) {
    jchar *utf16;
    int nbytes = rj_char_utf16(str, len, &utf16, "", 0);
    if (nbytes < 0)
        return 0;
    /* bytes -> UTF-16 code units */
    return (*env)->NewString(env, utf16, (nbytes + 1) >> 1);
}

jri/src/rjstring.h

Lines changed: 0 additions & 1 deletion
This file was deleted.

jri/src/rjstring.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef RJ_STRING_H__
2+
#define RJ_STRING_H__
3+
4+
#include <jni.h> /* for jchar */
5+
#include <Rinternals.h> /* for SEXP */
6+
7+
/* --- API --- */
8+
9+
/* Converts len bytes of c (NUL terminated if len is negative) from the
   encoding ifrom to UTF-16 and stores a pointer to the result in buf[0].
   Returns static content for short strings so don't re-use - copy the
   result before the next call. For dynamic strings uses R_alloc.
   The return value is the size of the result in bytes, or -1 on failure
   if can_error is 0 (if can_error is non-zero an R error is raised). */
11+
int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error);
12+
13+
/* wrappers for above to use with CHARSXP to detect proper ifrom;
   rj_rchar_utf16 raises an R error on failure while the _noerr variant
   returns -1 instead */
14+
int rj_rchar_utf16(SEXP s, jchar **buf);
15+
int rj_rchar_utf16_noerr(SEXP s, jchar **buf);
16+
17+
/* return jstring, but do NOT check exceptions;
   rj_newJavaString converts a CHARSXP, rj_newNativeJavaString converts
   len bytes of str (NUL terminated if len is negative) and returns 0 if
   the conversion fails */
18+
jstring rj_newJavaString(JNIEnv *env, SEXP sChar);
19+
jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len);
20+
21+
/* takes modified UTF-8 from Java, creates CHARSXP with valid UTF8;
   the two variants differ only in whether malformed input raises an R
   error or yields a 0 result */
22+
SEXP rj_mkCharUTF8(const char *src);
23+
SEXP rj_mkCharUTF8_noerr(const char *src);
24+
25+
#endif

0 commit comments

Comments
 (0)