/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim:expandtab:shiftwidth=2:tabstop=2: */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is GNU C Library code (http://www.gnu.org) * * The Initial Developer of the Original Code is * Bruno Haible . * Portions created by the Initial Developer are Copyright (C) 2002 * the Free Software Foundation. All Rights Reserved. * * Contributor(s): * Jungshik Shin * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ #include "nsUnicodeToTSCII.h" #include "nsMemory.h" #include "tamil.h" /* * TSCII is an 8-bit encoding consisting of: * 0x00..0x7F: ASCII * 0x80..0x90, 0x95..0x9F, 0xAB..0xFE: * Tamil letters and glyphs * 0xA1..0xA5, 0xAA: Tamil combining letters (after the base character) * 0xA6..0xA8: Tamil combining letters (before the base character) * 0x91..0x94: Punctuation * 0xA9: Symbols */ //---------------------------------------------------------------------- // Class nsUnicodeToTSCII [implementation] NS_IMPL_ISUPPORTS2(nsUnicodeToTSCII, nsIUnicodeEncoder, nsICharRepresentable) /* * During UCS-4 to TSCII conversion, mState contains * the last byte (or sometimes the last two bytes) to be output. * This can be: * 0x00 Nothing pending. * 0xB8..0xC9, 0x83..0x86 A consonant. * 0xEC, 0x8A A consonant with VIRAMA sign (final or joining). * 0x87, 0xC38A Two consonants combined through a VIRAMA sign. */ static const PRUint8 UnicharToTSCII[] = { 0, 0, 0, 0xb7, 0, 0xab, 0xac, 0xfe, // 0x0B80..0x0B87 0xae, 0xaf, 0xb0, 0, 0, 0, 0xb1, 0xb2, // 0x0B88..0x0B8F 0xb3, 0, 0xb4, 0xb5, 0xb6, 0xb8, 0, 0, // 0x0B90..0x0B97 0, 0xb9, 0xba, 0, 0x83, 0, 0xbb, 0xbc, // 0x0B98..0x0B9F 0, 0, 0, 0xbd, 0xbe, 0, 0, 0, // 0x0BA0..0x0BA7 0xbf, 0xc9, 0xc0, 0, 0, 0, 0xc1, 0xc2, // 0x0BA8..0x0BAF 0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5, 0, 0x84, // 0x0BB0..0x0BB7 0x85, 0x86, 0, 0, 0, 0, 0xa1, 0xa2, // 0x0BB8..0x0BBF 0xa3, 0xa4, 0xa5, 0, 0, 0, 0xa6, 0xa7, // 0x0BC0..0x0BC7 0xa8, 0, 0, 0, 0, 0, 0, 0, // 0x0BC8..0x0BCF 0, 0, 0, 0, 0, 0, 0, 0xaa, // 0x0BD0..0x0BD7 0, 0, 0, 0, 0, 0, 0, 0, // 0x0BD8..0x0BDF 0, 0, 0, 0, 0, 0, 0x80, 0x81, // 0x0BE0..0x0BE7 0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98, // 0x0BE8..0x0BEF 0x9d, 0x9e, 0x9f, 0, 0, 0, 0, 0, // 0x0BF0..0x0BF7 0, 0, 0, 0, 0, 0, 0, 0 // 0x0BF8..0x0BFF }; static const PRUint8 consonant_with_u[] = { 0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb }; static const PRUint8 consonant_with_uu[] = { 0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb }; static const PRUint8 consonant_with_virama[18] = { 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd }; // Modified implementation of Unicode to TSCII converter in glibc by // Bruno Haible. My modifications are based on Unicode 3.0 chap. 9 and // the code chart for Tamil. NS_IMETHODIMP nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength, char * aDest, PRInt32 * aDestLength) { const PRUnichar * src = aSrc; const PRUnichar * srcEnd = aSrc + *aSrcLength; char * dest = aDest; char * destEnd = dest + *aDestLength; nsresult rv = NS_OK; while (src < srcEnd && dest < destEnd) { PRUnichar ch = *src; if (mBuffer) { // Attempt to combine the last character with this one. PRUint32 last = mBuffer; // last : consonant if (IS_TSC_CONSONANT(last)) { if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) { *dest++ = consonant_with_u[last - TSC_KA]; mBuffer = 0; ++src; continue; } if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) { *dest++ = consonant_with_uu[last - TSC_KA]; mBuffer = 0; ++src; continue; } // reorder. vowel sign goes to the left of consonant if (IS_UNI_LEFT_VOWELSIGN(ch)) { if (dest + 2 > destEnd) goto error_more_output; *dest++ = TSC_LEFT_VOWELSIGN(ch); *dest++ = last; mBuffer = 0; ++src; continue; } // split and reorder. consonant goes bet. two parts if (IS_UNI_2PARTS_VOWELSIGN(ch)) { if (dest + 3 > destEnd) goto error_more_output; *dest++ = TSC_LEFT_VOWEL_PART(ch); *dest++ = last; *dest++ = TSC_RIGHT_VOWEL_PART(ch); mBuffer = 0; ++src; continue; } // Virama if (ch == UNI_VIRAMA) { // consonant KA can form a conjunct with consonant SSA(SHA). // buffer dead consonant 'K' for the now. if (last == TSC_KA) { mBuffer = TSC_KA_DEAD; } // SA can form a conjunct when followed by 'RA'. // buffer dead consonant 'S' for the now. else if (last == TSC_SA) { mBuffer = TSC_SA_DEAD; } else { *dest++ = IS_TSC_CONSONANT1(last) ? consonant_with_virama[last - TSC_KA] : last + 5; mBuffer = 0; } ++src; continue; } // consonant TA forms a ligature with vowel 'I' or 'II'. if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) { *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA); mBuffer = 0; ++src; continue; } } else if (last == TSC_KA_DEAD) { // Kd + SSA = K.SSA if (ch == UNI_SSA) { mBuffer = TSC_KSSA; ++src; continue; } } else if (last == TSC_SA_DEAD) { // Sd + RA = S.RA. Buffer RA + Sd. if (ch == UNI_RA) { mBuffer = 0xc38a; ++src; continue; } } else if (last == TSC_KSSA) { if (ch == UNI_VIRAMA) { *dest++ = (char) TSC_KSSA_DEAD; mBuffer = 0; ++src; continue; } // vowel splitting/reordering should be done around conjuncts as well. // reorder. vowel sign goes to the left of consonant if (IS_UNI_LEFT_VOWELSIGN(ch)) { if (dest + 2 > destEnd) goto error_more_output; *dest++ = TSC_LEFT_VOWELSIGN(ch); *dest++ = last; mBuffer = 0; ++src; continue; } // split and reorder. consonant goes bet. two parts if (IS_UNI_2PARTS_VOWELSIGN(ch)) { if (dest + 3 > destEnd) goto error_more_output; *dest++ = TSC_LEFT_VOWEL_PART(ch); *dest++ = last; *dest++ = TSC_RIGHT_VOWEL_PART(ch); mBuffer = 0; ++src; continue; } } else { NS_ASSERTION(last == 0xc38a, "No other value can be buffered"); if (ch == UNI_VOWELSIGN_II) { *dest++ = (char) TSC_SRII_LIGA; mBuffer = 0; ++src; continue; } else { // put back TSC_SA_DEAD and TSC_RA *dest++ = (char) TSC_SA_DEAD; mBuffer = TSC_RA; ++src; continue; } } /* Output the buffered character. */ if (last >> 8) { if (dest + 2 > destEnd) goto error_more_output; *dest++ = last & 0xff; *dest++ = (last >> 8) & 0xff; } else *dest++ = last & 0xff; mBuffer = 0; continue; } if (ch < 0x80) // Plain ASCII character. *dest++ = (char)ch; else if (IS_UNI_TAMIL(ch)) { PRUint8 t = UnicharToTSCII[ch - UNI_TAMIL_START]; if (t != 0) { if (IS_TSC_CONSONANT(t)) mBuffer = (PRUint32) t; else *dest++ = t; } else if (IS_UNI_2PARTS_VOWELSIGN(ch)) { // actually this is an illegal sequence. if (dest + 2 > destEnd) goto error_more_output; *dest++ = TSC_LEFT_VOWEL_PART(ch); *dest++ = TSC_RIGHT_VOWEL_PART(ch); } else { *aDestLength = dest - aDest; return NS_ERROR_UENC_NOMAPPING; } } else if (ch == 0x00A9) *dest++ = (char)ch; else if (IS_UNI_SINGLE_QUOTE(ch)) *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE; else if (IS_UNI_DOUBLE_QUOTE(ch)) *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE; else { *aDestLength = dest - aDest; return NS_ERROR_UENC_NOMAPPING; } /* Now that we wrote the output increment the input pointer. */ ++src; } // flush the buffer if (mBuffer >> 8) { // Write out the last character, two bytes. if (dest + 2 > destEnd) goto error_more_output; *dest++ = (mBuffer >> 8) & 0xff; *dest++ = mBuffer & 0xff; mBuffer = 0; } else if (mBuffer) { // Write out the last character, a single byte. if (dest >= destEnd) goto error_more_output; *dest++ = mBuffer & 0xff; mBuffer = 0; } *aSrcLength = src - aSrc; *aDestLength = dest - aDest; return rv; error_more_output: *aSrcLength = src - aSrc; *aDestLength = dest - aDest; return NS_OK_UENC_MOREOUTPUT; } NS_IMETHODIMP nsUnicodeToTSCII::Finish(char* aDest, PRInt32* aDestLength) { if (!mBuffer) { *aDestLength = 0; return NS_OK; } if (mBuffer >> 8) { // Write out the last character, two bytes. if (*aDestLength < 2) { *aDestLength = 0; return NS_OK_UENC_MOREOUTPUT; } *aDest++ = (mBuffer >> 8) & 0xff; *aDest++ = mBuffer & 0xff; mBuffer = 0; *aDestLength = 2; } else { // Write out the last character, a single byte. if (*aDestLength < 1) { *aDestLength = 0; return NS_OK_UENC_MOREOUTPUT; } *aDest++ = mBuffer & 0xff; mBuffer = 0; *aDestLength = 1; } return NS_OK; } //================================================================ NS_IMETHODIMP nsUnicodeToTSCII::Reset() { mBuffer = 0; return NS_OK; } NS_IMETHODIMP nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength, PRInt32 * aDestLength) { // Some Tamil letters can be decomposed into 2 glyphs in TSCII. *aDestLength = aSrcLength * 2; return NS_OK; } NS_IMETHODIMP nsUnicodeToTSCII::FillInfo(PRUint32* aInfo) { // Tamil block is so sparse. static const PRUint8 coverage[] = { 0xe8, // 11101000 U+0B87 - U+0B80 0xc7, // 11000111 U+0B8F - U+0B88 0x3d, // 00111101 U+0B97 - U+0B90 0xd6, // 11010110 U+0B9F - U+0B98 0x18, // 00011000 U+0BA7 - U+0BA0 0xc7, // 11000111 U+0BAF - U+0BA8 0xbf, // 10111111 U+0BB7 - U+0BB0 0xc7, // 11000111 U+0BBF - U+0BB8 0xc7, // 11000111 U+0BC7 - U+0BC0 0x3d, // 00111101 U+0BCF - U+0BC8 0x80, // 10000000 U+0BD7 - U+0BD0 0x00, // 00000000 U+0BDF - U+0BD8 0x80, // 10000000 U+0BE7 - U+0BE0 0xff, // 11111111 U+0BEF - U+0BE8 0x07, // 00000111 U+0BF7 - U+0BF0 }; PRUnichar i; for(i = 0; i < 0x78; i++) if (coverage[i / 8] & (1 << (i % 8))) SET_REPRESENTABLE(aInfo, i + UNI_TAMIL_START); // TSCII is a superset of US-ASCII. for(i = 0x20; i < 0x7f; i++) SET_REPRESENTABLE(aInfo, i); // additional characters in TSCII SET_REPRESENTABLE(aInfo, 0xA9); // copyright sign SET_REPRESENTABLE(aInfo, UNI_LEFT_SINGLE_QUOTE); SET_REPRESENTABLE(aInfo, UNI_RIGHT_SINGLE_QUOTE); SET_REPRESENTABLE(aInfo, UNI_LEFT_DOUBLE_QUOTE); SET_REPRESENTABLE(aInfo, UNI_RIGHT_DOUBLE_QUOTE); return NS_OK; } NS_IMETHODIMP nsUnicodeToTSCII::SetOutputErrorBehavior(PRInt32 aBehavior, nsIUnicharEncoder *aEncoder, PRUnichar aChar) { return NS_OK; } // same as the mapping of the C1(0x80-0x9f) part of Windows-1252 to Unicode const static PRUnichar gTSCIIToTTF[] = { 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 }; //---------------------------------------------------------------------- // Class nsUnicodeToTamilTTF [implementation] // NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII) NS_IMETHODIMP nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength, char * aDest, PRInt32 * aDestLength) { PRInt32 medLen, destLen; char *med; GetMaxLength(aSrc, *aSrcLength, &destLen); NS_ASSERTION(destLen <= *aDestLength, "insufficient dest. buffer size"); // TSCII converter is a single byte encoder and takes half the space // taken by TamilTTF encoder. medLen = destLen / 2; if (medLen > CHAR_BUFFER_SIZE) { med = (char *) nsMemory::Alloc(medLen); if (!med) return NS_ERROR_OUT_OF_MEMORY; } else med = mStaticBuffer; nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen); if (NS_FAILED(rv)) { if (med != mStaticBuffer) nsMemory::Free(med); return rv; } PRInt32 i, j; // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font for (i = 0, j = 0; i < medLen; i++) { // Only C1 part(0x80-0x9f) needs to be mapped as if they're CP1251. PRUnichar ucs2 = (med[i] & 0xe0) == 0x80 ? gTSCIIToTTF[med[i] & 0x7f] : PRUint8(med[i]); // A lot of TSCII fonts are still based on TSCII 1.6 so that // they have Tamil vowel 'I' at 0xad instead of 0xfe. if (ucs2 == 0xfe) ucs2 = 0xad; aDest[j++] = PRUint8((ucs2 & 0xff00) >> 8); aDest[j++] = PRUint8(ucs2 & 0x00ff); } *aDestLength = j; if (med != mStaticBuffer) nsMemory::Free(med); return NS_OK; } NS_IMETHODIMP nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength, PRInt32 * aDestLength) { // Each Tamil character can generate at most two presentation forms, // but we're 'extending' them to 16bit shorts, which accounts for // additional factor of 2. *aDestLength = (aSrcLength + 1) * 4; return NS_OK; } NS_IMETHODIMP nsUnicodeToTamilTTF::SetOutputErrorBehavior(PRInt32 aBehavior, nsIUnicharEncoder *aEncoder, PRUnichar aChar) { if (aBehavior == kOnError_CallBack && aEncoder == nsnull) return NS_ERROR_NULL_POINTER; mErrEncoder = aEncoder; mErrBehavior = aBehavior; mErrChar = aChar; return NS_OK; }