sqlite: UTF-8 fix for older Windows

2024-11-13 03:10:10 +01:00 · 2018-05-19 21:59:33 +08:00 · 2018-05-19 21:59:33 +08:00 · e44dd98dbf
commit e44dd98dbf
parent 67325a144e
1 changed files with 161 additions and 21 deletions
--- a/db/sqlite3/src/os_win.c
+++ b/db/sqlite3/src/os_win.c
@ -123,6 +123,163 @@ int sqlite3_os_type = 0;
  }
 #endif /* OS_WINCE */

+/*** UTF16<-->UTF8 functions minicking MultiByteToWideChar/WideCharToMultiByte ***/
+int utf8GetMaskIndex(unsigned char n) {
+  if((unsigned char)(n + 2) < 0xc2) return 1; // 00~10111111, fe, ff
+  if(n < 0xe0)                      return 2; // 110xxxxx
+  if(n < 0xf0)                      return 3; // 1110xxxx
+  if(n < 0xf8)                      return 4; // 11110xxx
+  if(n < 0xfc)                      return 5; // 111110xx
+                                    return 6; // 1111110x
+}
+
+int wc2Utf8Len(wchar_t ** n, int *len) {
+  wchar_t *ch = *n, ch2;
+  int qch;
+  if((0xD800 <= *ch && *ch <= 0xDBFF) && *len) {
+    ch2 = *(ch + 1);
+    if(0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+	  qch = 0x10000 + (((*ch - 0xD800) & 0x3ff) << 10) + ((ch2 - 0xDC00) & 0x3ff);
+	  (*n)++;
+	  (*len)--;
+	}
+  }
+  else
+    qch = (int) *ch;
+
+  if (qch <= 0x7f)           return 1;
+  else if (qch <= 0x7ff)     return 2;
+  else if (qch <= 0xffff)    return 3;
+  else if (qch <= 0x1fffff)  return 4;
+  else if (qch <= 0x3ffffff) return 5;
+  else                       return 6;
+}
+
+int Utf8ToWideChar(unsigned int unused1, unsigned long unused2, char *sb, int ss, wchar_t * wb, int ws) {
+  static const unsigned char utf8mask[] = { 0, 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+  char *p = (char *)(sb);
+  char *e = (char *)(sb + ss);
+  wchar_t *w = wb;
+  int cnt = 0, t, qch;
+
+  if (ss < 1) {
+    ss = lstrlenA(sb);
+    e = (char *)(sb + ss);
+  }
+
+  if (wb && ws) {
+    for (; p < e; ++w) {
+	  t = utf8GetMaskIndex(*p);
+	  qch = (*p++ & utf8mask[t]);
+	  while(p < e && --t)
+	    qch <<= 6, qch |= (*p++) & 0x3f;
+	  if(qch < 0x10000) {
+	    if(cnt <= ws)
+		  *w = (wchar_t) qch;
+	    cnt++;
+	  } else {
+	    if (cnt + 2 <= ws) {
+		  *w++ = (wchar_t) (0xD800 + (((qch - 0x10000) >> 10) & 0x3ff)),
+		    *w = (wchar_t) (0xDC00 + (((qch - 0x10000)) & 0x3ff));
+		}
+	    cnt += 2;
+	  }
+	}
+    if(cnt < ws) {
+      *(wb+cnt) = 0;
+      return cnt;
+    } else {
+      *(wb+ws) = 0;
+      return ws;
+    }
+  } else {
+    for (t; p < e;) {
+	  t = utf8GetMaskIndex(*p);
+	  qch = (*p++ & utf8mask[t]);
+	  while (p < e && --t)
+	    qch <<= 6, qch |= (*p++) & 0x3f;
+	  if (qch < 0x10000)
+	    cnt++;
+	  else
+	    cnt += 2;
+	}
+    return cnt+1;
+  }
+}
+
+int WideCharToUtf8(unsigned int unused1, unsigned long unused2, wchar_t * wb, int ws, char *sb, int ss) {
+  wchar_t *p = (wchar_t *)(wb);
+  wchar_t *e = (wchar_t *)(wb + ws);
+  wchar_t *oldp;
+  char *s = sb;
+  int cnt = 0, qch, t;
+
+  if (ws < 1) {
+    ws = lstrlenW(wb);
+    e = (wchar_t *)(wb + ws);
+  }
+
+  if (sb && ss) {
+    for (t; p < e; ++p) {
+      oldp = p;
+      t = wc2Utf8Len(&p, &ws);
+
+      if (p != oldp) { /* unicode surrogates encountered */
+        qch = 0x10000 + (((*oldp - 0xD800) & 0x3ff) << 10) + ((*p - 0xDC00) & 0x3ff);
+      } else
+        qch = *p;
+
+      if (qch <= 0x7f)
+        *s++ = (char) (qch),
+        cnt++;
+      else if (qch <= 0x7ff)
+        *s++ = 0xc0 | (char) (qch >> 6),
+        *s++ = 0x80 | (char) (qch & 0x3f),
+        cnt += 2;
+      else if (qch <= 0xffff)
+        *s++ = 0xe0 | (char) (qch >> 12),
+        *s++ = 0x80 | (char) ((qch >> 6) & 0x3f),
+        *s++ = 0x80 | (char) (qch & 0x3f),
+        cnt += 3;
+      else if (qch <= 0x1fffff)
+        *s++ = 0xf0 | (char) (qch >> 18),
+        *s++ = 0x80 | (char) ((qch >> 12) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 6) & 0x3f),
+        *s++ = 0x80 | (char) (qch & 0x3f),
+        cnt += 4;
+      else if (qch <= 0x3ffffff)
+        *s++ = 0xf8 | (char) (qch >> 24),
+        *s++ = 0x80 | (char) ((qch >> 18) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 12) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 6) & 0x3f),
+        *s++ = 0x80 | (char) (qch & 0x3f),
+        cnt += 5;
+      else
+        *s++ = 0xfc | (char) (qch >> 30),
+        *s++ = 0x80 | (char) ((qch >> 24) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 18) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 12) & 0x3f),
+        *s++ = 0x80 | (char) ((qch >> 6) & 0x3f),
+        *s++ = 0x80 | (char) (qch & 0x3f),
+        cnt += 6;
+    }
+    if(cnt < ss) {
+      *(sb+cnt) = 0;
+      return cnt;
+    } else {
+      *(sb+ss) = 0;
+      return ss;
+    }
+  } else {
+    for (t; p < e; ++p) {
+      t = wc2Utf8Len(&p, &ws);
+      cnt += t;
+    }
+    return cnt+1;
+  }
+}
+/*** Ends ***/
+
 /*
 ** Convert a UTF-8 string to UTF-32.  Space to hold the returned string
 ** is obtained from sqliteMalloc.
@ -132,20 +289,12 @@ static WCHAR *utf8ToUnicode(const char *zFilename){
  int nCharcpy;
  WCHAR *zWideFilename;

-  nCharcpy = MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, NULL, 0);
-  if( zFilename!=0 && nCharcpy == 0) {
-    /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */
-    nCharcpy = MultiByteToWideChar(CP_ACP, 0, zFilename, -1, NULL, 0);
-  }
+  nCharcpy = Utf8ToWideChar(CP_UTF8, 0, zFilename, -1, NULL, 0);
  zWideFilename = sqliteMalloc( nCharcpy*sizeof(zWideFilename[0]) );
  if( zWideFilename==0 ){
    return 0;
  }
-  nChar = MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, zWideFilename, nCharcpy);
-  if( zFilename!=0 && nChar == 0) {
-    /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */
-    nChar = MultiByteToWideChar(CP_ACP, 0, zFilename, -1, zWideFilename, nCharcpy);
-  }
+  nChar = Utf8ToWideChar(CP_UTF8, 0, zFilename, -1, zWideFilename, nCharcpy);

  if( nChar==0 ){
    sqliteFree(zWideFilename);
@ -163,22 +312,13 @@ static char *unicodeToUtf8(const WCHAR *zWideFilename){
  int nBytecpy;
  char *zFilename;

-  nBytecpy = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1, 0, 0, 0, 0);
-  if( zWideFilename!=0 && nBytecpy == 0) {
-    /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */
-    nBytecpy = WideCharToMultiByte(CP_ACP, 0, zWideFilename, -1, 0, 0, 0, 0);
-  }
+  nBytecpy = WideCharToUtf8(CP_UTF8, 0, zWideFilename, -1, 0, 0, 0, 0);
  zFilename = sqliteMalloc( nBytecpy );
  if( zFilename==0 ){
    return 0;
  }
-  nByte = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1, zFilename, nBytecpy,
+  nByte = WideCharToUtf8(CP_UTF8, 0, zWideFilename, -1, zFilename, nBytecpy,
                              0, 0);
-  if( zWideFilename!=0 && nByte == 0) {
-    /* UTF8 conversion failed. This happens on NT 3.51 and 95. Make do with ACP conversion instead. */
-    nByte = WideCharToMultiByte(CP_ACP, 0, zWideFilename, -1, zFilename, nBytecpy,
-                                0, 0);
-  }
  if( nByte == 0 ){
    sqliteFree(zFilename);
    zFilename = 0;