From e44dd98dbf2effeff3305430835e889d76ec73b4 Mon Sep 17 00:00:00 2001 From: roytam1 Date: Sat, 19 May 2018 21:59:33 +0800 Subject: [PATCH] sqlite: UTF-8 fix for older Windows --- db/sqlite3/src/os_win.c | 182 +++++++++++++++++++++++++++++++++++----- 1 file changed, 161 insertions(+), 21 deletions(-) diff --git a/db/sqlite3/src/os_win.c b/db/sqlite3/src/os_win.c index 147ded35..7a61ec7f 100644 --- a/db/sqlite3/src/os_win.c +++ b/db/sqlite3/src/os_win.c @@ -123,6 +123,163 @@ int sqlite3_os_type = 0; } #endif /* OS_WINCE */ +/*** UTF16<-->UTF8 functions minicking MultiByteToWideChar/WideCharToMultiByte ***/ +int utf8GetMaskIndex(unsigned char n) { + if((unsigned char)(n + 2) < 0xc2) return 1; // 00~10111111, fe, ff + if(n < 0xe0) return 2; // 110xxxxx + if(n < 0xf0) return 3; // 1110xxxx + if(n < 0xf8) return 4; // 11110xxx + if(n < 0xfc) return 5; // 111110xx + return 6; // 1111110x +} + +int wc2Utf8Len(wchar_t ** n, int *len) { + wchar_t *ch = *n, ch2; + int qch; + if((0xD800 <= *ch && *ch <= 0xDBFF) && *len) { + ch2 = *(ch + 1); + if(0xDC00 <= ch2 && ch2 <= 0xDFFF) { + qch = 0x10000 + (((*ch - 0xD800) & 0x3ff) << 10) + ((ch2 - 0xDC00) & 0x3ff); + (*n)++; + (*len)--; + } + } + else + qch = (int) *ch; + + if (qch <= 0x7f) return 1; + else if (qch <= 0x7ff) return 2; + else if (qch <= 0xffff) return 3; + else if (qch <= 0x1fffff) return 4; + else if (qch <= 0x3ffffff) return 5; + else return 6; +} + +int Utf8ToWideChar(unsigned int unused1, unsigned long unused2, char *sb, int ss, wchar_t * wb, int ws) { + static const unsigned char utf8mask[] = { 0, 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + char *p = (char *)(sb); + char *e = (char *)(sb + ss); + wchar_t *w = wb; + int cnt = 0, t, qch; + + if (ss < 1) { + ss = lstrlenA(sb); + e = (char *)(sb + ss); + } + + if (wb && ws) { + for (; p < e; ++w) { + t = utf8GetMaskIndex(*p); + qch = (*p++ & utf8mask[t]); + while(p < e && --t) + qch <<= 6, qch |= (*p++) & 0x3f; + if(qch < 0x10000) { + if(cnt <= ws) + *w = (wchar_t) qch; + cnt++; + } else { + if (cnt + 2 <= ws) { + *w++ = (wchar_t) (0xD800 + (((qch - 0x10000) >> 10) & 0x3ff)), + *w = (wchar_t) (0xDC00 + (((qch - 0x10000)) & 0x3ff)); + } + cnt += 2; + } + } + if(cnt < ws) { + *(wb+cnt) = 0; + return cnt; + } else { + *(wb+ws) = 0; + return ws; + } + } else { + for (t; p < e;) { + t = utf8GetMaskIndex(*p); + qch = (*p++ & utf8mask[t]); + while (p < e && --t) + qch <<= 6, qch |= (*p++) & 0x3f; + if (qch < 0x10000) + cnt++; + else + cnt += 2; + } + return cnt+1; + } +} + +int WideCharToUtf8(unsigned int unused1, unsigned long unused2, wchar_t * wb, int ws, char *sb, int ss) { + wchar_t *p = (wchar_t *)(wb); + wchar_t *e = (wchar_t *)(wb + ws); + wchar_t *oldp; + char *s = sb; + int cnt = 0, qch, t; + + if (ws < 1) { + ws = lstrlenW(wb); + e = (wchar_t *)(wb + ws); + } + + if (sb && ss) { + for (t; p < e; ++p) { + oldp = p; + t = wc2Utf8Len(&p, &ws); + + if (p != oldp) { /* unicode surrogates encountered */ + qch = 0x10000 + (((*oldp - 0xD800) & 0x3ff) << 10) + ((*p - 0xDC00) & 0x3ff); + } else + qch = *p; + + if (qch <= 0x7f) + *s++ = (char) (qch), + cnt++; + else if (qch <= 0x7ff) + *s++ = 0xc0 | (char) (qch >> 6), + *s++ = 0x80 | (char) (qch & 0x3f), + cnt += 2; + else if (qch <= 0xffff) + *s++ = 0xe0 | (char) (qch >> 12), + *s++ = 0x80 | (char) ((qch >> 6) & 0x3f), + *s++ = 0x80 | (char) (qch & 0x3f), + cnt += 3; + else if (qch <= 0x1fffff) + *s++ = 0xf0 | (char) (qch >> 18), + *s++ = 0x80 | (char) ((qch >> 12) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 6) & 0x3f), + *s++ = 0x80 | (char) (qch & 0x3f), + cnt += 4; + else if (qch <= 0x3ffffff) + *s++ = 0xf8 | (char) (qch >> 24), + *s++ = 0x80 | (char) ((qch >> 18) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 12) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 6) & 0x3f), + *s++ = 0x80 | (char) (qch & 0x3f), + cnt += 5; + else + *s++ = 0xfc | (char) (qch >> 30), + *s++ = 0x80 | (char) ((qch >> 24) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 18) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 12) & 0x3f), + *s++ = 0x80 | (char) ((qch >> 6) & 0x3f), + *s++ = 0x80 | (char) (qch & 0x3f), + cnt += 6; + } + if(cnt < ss) { + *(sb+cnt) = 0; + return cnt; + } else { + *(sb+ss) = 0; + return ss; + } + } else { + for (t; p < e; ++p) { + t = wc2Utf8Len(&p, &ws); + cnt += t; + } + return cnt+1; + } +} +/*** Ends ***/ + /* ** Convert a UTF-8 string to UTF-32. Space to hold the returned string ** is obtained from sqliteMalloc. @@ -132,20 +289,12 @@ static WCHAR *utf8ToUnicode(const char *zFilename){ int nCharcpy; WCHAR *zWideFilename; - nCharcpy = MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, NULL, 0); - if( zFilename!=0 && nCharcpy == 0) { - /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */ - nCharcpy = MultiByteToWideChar(CP_ACP, 0, zFilename, -1, NULL, 0); - } + nCharcpy = Utf8ToWideChar(CP_UTF8, 0, zFilename, -1, NULL, 0); zWideFilename = sqliteMalloc( nCharcpy*sizeof(zWideFilename[0]) ); if( zWideFilename==0 ){ return 0; } - nChar = MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, zWideFilename, nCharcpy); - if( zFilename!=0 && nChar == 0) { - /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */ - nChar = MultiByteToWideChar(CP_ACP, 0, zFilename, -1, zWideFilename, nCharcpy); - } + nChar = Utf8ToWideChar(CP_UTF8, 0, zFilename, -1, zWideFilename, nCharcpy); if( nChar==0 ){ sqliteFree(zWideFilename); @@ -163,22 +312,13 @@ static char *unicodeToUtf8(const WCHAR *zWideFilename){ int nBytecpy; char *zFilename; - nBytecpy = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1, 0, 0, 0, 0); - if( zWideFilename!=0 && nBytecpy == 0) { - /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */ - nBytecpy = WideCharToMultiByte(CP_ACP, 0, zWideFilename, -1, 0, 0, 0, 0); - } + nBytecpy = WideCharToUtf8(CP_UTF8, 0, zWideFilename, -1, 0, 0, 0, 0); zFilename = sqliteMalloc( nBytecpy ); if( zFilename==0 ){ return 0; } - nByte = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1, zFilename, nBytecpy, + nByte = WideCharToUtf8(CP_UTF8, 0, zWideFilename, -1, zFilename, nBytecpy, 0, 0); - if( zWideFilename!=0 && nByte == 0) { - /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */ - nByte = WideCharToMultiByte(CP_ACP, 0, zWideFilename, -1, zFilename, nBytecpy, - 0, 0); - } if( nByte == 0 ){ sqliteFree(zFilename); zFilename = 0;