|
| 1 | +#include "rjstring.h" |
| 2 | + |
| 3 | +#include <string.h> |
| 4 | +#include <stdlib.h> |
| 5 | +#include <R_ext/Riconv.h> |
| 6 | +#include <errno.h> |
| 7 | + |
| 8 | +#ifdef WIN32 |
| 9 | +/* -- currently unused - was used to mimick reEnc() |
| 10 | + extern unsigned int localeCP; |
| 11 | + static char cpbuf[16]; */ |
| 12 | +#endif |
| 13 | +static jchar js_zero[2] = { 0, 0 }; /* all-zero jchars; handed back via buf[0] when the input is empty */ |
| 14 | +static jchar js_buf[128]; /* static scratch buffer for short conversions; larger results use R_alloc */ |
| 15 | + |
| 16 | +/* if len = -1 then c is assumed to be NUL terminated */ |
| 17 | +int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error) { |
| 18 | + void *ih; |
| 19 | + const char *ce = (len < 0) ? strchr(c, 0) : (c + len); |
| 20 | + if (ce == c) { |
| 21 | + buf[0] = js_zero; |
| 22 | + return 0; |
| 23 | + } |
| 24 | + size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c; |
| 25 | + jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1); |
| 26 | + char *dst = (char*) js; |
| 27 | + int end_test = 1, is_le = (((char*)&end_test)[0] == 1) ? 1 : 0; |
| 28 | + if (!ifrom) ifrom = ""; |
| 29 | + |
| 30 | +#ifdef DEBUG_ENCODING |
| 31 | + fprintf(stderr, "rJava.rj_char_utf16_native:"); |
| 32 | + { const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); } |
| 33 | + fprintf(stderr, "\n"); |
| 34 | +#endif |
| 35 | + |
| 36 | + ih = Riconv_open(is_le ? "UTF-16LE" : "UTF-16BE", ifrom); |
| 37 | + if (ih == (void *)(-1)) { |
| 38 | + if (can_error) |
| 39 | + Rf_error("Unable to start conversion to UTF-16"); |
| 40 | + return -1; |
| 41 | + } |
| 42 | + while (c < ce) { |
| 43 | + size_t res = Riconv(ih, &c, &isize, &dst, &osize); |
| 44 | + /* this should never happen since we allocated far more than needed */ |
| 45 | + if (res == -1 && errno == E2BIG) { |
| 46 | + if (can_error) |
| 47 | + Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements."); |
| 48 | + return -1; |
| 49 | + } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */ |
| 50 | + if (is_le) { |
| 51 | + *(dst++) = '?'; |
| 52 | + *(dst++) = 0; |
| 53 | + } else { |
| 54 | + *(dst++) = 0; |
| 55 | + *(dst++) = '?'; |
| 56 | + } |
| 57 | + osize -= 2; |
| 58 | + c++; |
| 59 | + isize--; |
| 60 | + } |
| 61 | + } |
| 62 | + Riconv_close(ih); |
| 63 | +#ifdef DEBUG_ENCODING |
| 64 | + { const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); } |
| 65 | + fprintf(stderr, "\n"); |
| 66 | +#endif |
| 67 | + return dst - (char*) js; |
| 68 | +} |
| 69 | + |
| 70 | +/* returns string from a CHARSXP making sure that the result is in UTF-16. |
| 71 | + the buffer is owned by the function and may be static, so copy after use. |
| 72 | +
|
| 73 | + Returns the length of the resulting string or -1 on error (if |
| 74 | + can_error is 0). |
| 75 | + */ |
| 76 | +static int rj_CHARSXP_utf16_(SEXP s, jchar **buf, int can_error) { |
| 77 | + cetype_t ce_in = getCharCE(s); |
| 78 | + const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0); |
| 79 | + if (ce == c) { |
| 80 | + buf[0] = js_zero; |
| 81 | + return 0; |
| 82 | + } |
| 83 | + |
| 84 | + switch (ce_in) { |
| 85 | +#ifdef WIN32 |
| 86 | + case CE_NATIVE: |
| 87 | +/* reEnc uses this, but translateCharUtf8 uses "" so let's go with "" |
| 88 | + sprintf(cpbuf, "CP%d", localeCP); |
| 89 | + ifrom = cpbuf; |
| 90 | +*/ |
| 91 | + break; |
| 92 | + case CE_LATIN1: ifrom = "CP1252"; break; |
| 93 | +#else |
| 94 | + case CE_NATIVE: break; /* is already "" */ |
| 95 | + case CE_LATIN1: ifrom = "latin1"; break; |
| 96 | +#endif |
| 97 | + default: |
| 98 | + ifrom = "UTF-8"; break; |
| 99 | + } |
| 100 | + |
| 101 | + return rj_char_utf16(c, ce - c, buf, ifrom, can_error); |
| 102 | +} |
| 103 | + |
| 104 | +int rj_rchar_utf16(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 1); } |
| 105 | +int rj_rchar_utf16_noerr(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 0); } |
| 106 | + |
| 107 | +/* FIXME: we should probably deprecate this as well and use UTF-16 instead. |
| 108 | + The only reason not to is that we would have to fully implement |
| 109 | + a full UTF-16 -> UTF-8 conversion including surrogate pairs ... */ |
| 110 | + |
| 111 | +/* Java returns *modified* UTF-8 which is incompatible with UTF-8, |
| 112 | + so we have to detect the illegal surrgoate pairs and convert them */ |
| 113 | +SEXP rj_mkCharUTF8_(const char *src, int can_error) { |
| 114 | + const unsigned char *s = (const unsigned char*) src; |
| 115 | + const unsigned char *c = (const unsigned char*) s; |
| 116 | + /* check if the string contains any surrogate pairs, i.e. |
| 117 | + Unicode in the range 0xD800-0xDFFF |
| 118 | + We want this to be fast since in 99.99% of cases it will |
| 119 | + be false */ |
| 120 | + while (*c) { |
| 121 | + if (c[0] == 0xED && |
| 122 | + (c[1] & 0xE0) == 0xA0) |
| 123 | + break; |
| 124 | + c++; |
| 125 | + } |
| 126 | + if (*c) { /* yes, we have to convert them */ |
| 127 | + SEXP res; |
| 128 | + const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */ |
| 129 | + unsigned char *dst = 0, *d, sbuf[64]; |
| 130 | + if (!e) /* should never occur */ |
| 131 | + return mkChar(""); |
| 132 | + /* we use static buffer for small strings and dynamic alloc for large */ |
| 133 | + if (e - s >= sizeof(sbuf)) { |
| 134 | + /* allocate temp buffer since our input is const */ |
| 135 | + d = dst = (unsigned char *) malloc(e - s + 1); |
| 136 | + if (!dst) { |
| 137 | + if (can_error) |
| 138 | + Rf_error("Cannot allocate memory for surrogate pair conversion"); |
| 139 | + return 0; |
| 140 | + } |
| 141 | + } else |
| 142 | + d = (unsigned char *)sbuf; |
| 143 | + if (c - s > 0) { |
| 144 | + memcpy(d, s, c - s); |
| 145 | + d += c - s; |
| 146 | + } |
| 147 | + while (*c) { |
| 148 | + unsigned int u1, u; |
| 149 | + *(d++) = *(c++); |
| 150 | + /* start of a sequence ? */ |
| 151 | + if ((c[-1] & 0xC0) != 0xC0) |
| 152 | + continue; |
| 153 | + if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */ |
| 154 | + if ((c[0] & 0xC0) != 0x80) { |
| 155 | + if (dst) free(dst); |
| 156 | + if (can_error) |
| 157 | + Rf_error("illegal 2-byte sequence in Java string"); |
| 158 | + return 0; |
| 159 | + } |
| 160 | + *(d++) = *(c++); |
| 161 | + continue; |
| 162 | + } |
| 163 | + if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */ |
| 164 | + if (dst) free(dst); |
| 165 | + if (can_error) |
| 166 | + Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)"); |
| 167 | + return 0; |
| 168 | + } |
| 169 | + if (((c[0] & 0xC0) != 0x80 || |
| 170 | + (c[1] & 0xC0) != 0x80)) { |
| 171 | + if (dst) free(dst); |
| 172 | + if (can_error) |
| 173 | + Rf_error("illegal 3-byte sequence in Java string"); |
| 174 | + return 0; |
| 175 | + } |
| 176 | + u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) | |
| 177 | + ((((unsigned int)c[0]) & 0x3F) << 6) | |
| 178 | + (((unsigned int)c[1]) & 0x3F); |
| 179 | + if (u1 < 0xD800 || u1 > 0xDBFF) { /* not a surrogate pair -> regular copy */ |
| 180 | + *(d++) = *(c++); |
| 181 | + *(d++) = *(c++); |
| 182 | + continue; |
| 183 | + } |
| 184 | + if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? */ |
| 185 | + if (dst) free(dst); |
| 186 | + if (can_error) |
| 187 | + Rf_error("illegal sequence in Java string: low surrogate pair without a high one"); |
| 188 | + return 0; |
| 189 | + } |
| 190 | + c += 2; /* move to the low pair */ |
| 191 | + if (c[0] != 0xED || |
| 192 | + (c[1] & 0xF0) != 0xB0 || |
| 193 | + (c[2] & 0xC0) != 0x80) { |
| 194 | + if (dst) free(dst); |
| 195 | + if (can_error) |
| 196 | + Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one"); |
| 197 | + return 0; |
| 198 | + } |
| 199 | + /* the actually encoded unicode character */ |
| 200 | + u = ((((unsigned int)c[1]) & 0x0F) << 6) | |
| 201 | + (((unsigned int)c[2]) & 0x3F); |
| 202 | + u |= (u1 & 0x03FF) << 10; |
| 203 | + u += 0x10000; |
| 204 | + c += 3; |
| 205 | + /* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */ |
| 206 | + d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0); |
| 207 | + *(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80); |
| 208 | + *(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80); |
| 209 | + *(d++) = (unsigned char) ((u & 0x3F) | 0x80); |
| 210 | + } |
| 211 | + res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8); |
| 212 | + if (dst) free(dst); |
| 213 | + return res; |
| 214 | + } |
| 215 | + return mkCharLenCE(src, c - s, CE_UTF8); |
| 216 | +} |
| 217 | + |
| 218 | +SEXP rj_mkCharUTF8(const char *src) { return rj_mkCharUTF8_(src, 0); } |
| 219 | +SEXP rj_mkCharUTF8_noerr(const char *src) { return rj_mkCharUTF8_(src, 1); } |
| 220 | + |
| 221 | +jstring rj_newJavaString(JNIEnv *env, SEXP sChar) { |
| 222 | + jchar *s; |
| 223 | + int len = rj_rchar_utf16(sChar, &s); |
| 224 | + return (*env)->NewString(env, s, (len + 1) >> 1); |
| 225 | +} |
| 226 | + |
| 227 | +jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len) { |
| 228 | + jchar *s; |
| 229 | + int rlen = rj_char_utf16(str, len, &s, "", 0); |
| 230 | + return (rlen < 0) ? 0 : (*env)->NewString(env, s, (rlen + 1) >> 1); |
| 231 | +} |
0 commit comments