RetroZilla/mailnews/base/util/nsMsgI18N.cpp
2015-10-20 23:03:22 -04:00

581 lines
19 KiB
C++

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Pierre Phaneuf <pp@ludusdesign.com>
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
// as does this
#include "nsICharsetConverterManager.h"
#include "nsICharsetAlias.h"
#include "nsIPlatformCharset.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
#include "nsISupports.h"
#include "nsIPrefBranch.h"
#include "nsIPrefService.h"
#include "nsIPrefLocalizedString.h"
#include "nsIMimeConverter.h"
#include "msgCore.h"
#include "nsMsgI18N.h"
#include "nsFileSpec.h"
#include "nsFileStream.h"
#include "nsMsgMimeCID.h"
#include "nsMimeTypes.h"
#include "nsIEntityConverter.h"
#include "nsISaveAsCharset.h"
#include "nsHankakuToZenkakuCID.h"
#include "nsXPIDLString.h"
#include "nsString.h"
#include "nsReadableUtils.h"
#include "prmem.h"
#include "nsFileSpec.h"
#include "plstr.h"
static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
static NS_DEFINE_CID(kEntityConverterCID, NS_ENTITYCONVERTER_CID);
//
// International functions necessary for composition
//
/**
 * Convert a UTF-16 string into the given charset.
 *
 * @param aCharset  Name of the target charset. NULL, "", "us-ascii" and
 *                  "ISO-8859-1" all take the lossy narrowing fast path;
 *                  "UTF-8" is converted directly.
 * @param inString  UTF-16 text to convert.
 * @param outString Receives the converted bytes; previous content is replaced.
 * @param aIsCharsetCanonical
 *                  When true the encoder is looked up with
 *                  GetUnicodeEncoderRaw() (optimization for the modified
 *                  UTF-7 used by IMAP), otherwise with GetUnicodeEncoder().
 * @return NS_OK on the fast paths; otherwise the failure code from obtaining
 *         or configuring the encoder, or the result of the encoder's
 *         Finish() call.
 *
 * Characters the encoder cannot map are replaced with '?'.
 */
nsresult nsMsgI18NConvertFromUnicode(const char* aCharset,
                                     const nsAFlatString& inString,
                                     nsACString& outString,
                                     PRBool aIsCharsetCanonical)
{
  if (inString.IsEmpty()) {
    outString.Truncate();
    return NS_OK;
  }
  // Note: this will hide a possible error when the unicode text may contain more than one charset.
  // (e.g. Latin1 + Japanese). Use nsMsgI18NSaveAsCharset instead to avoid that problem.
  // A NULL charset is handled like an empty one rather than being dereferenced.
  else if (!aCharset || !*aCharset || !PL_strcasecmp(aCharset, "us-ascii") ||
           !PL_strcasecmp(aCharset, "ISO-8859-1")) {
    LossyCopyUTF16toASCII(inString, outString);
    return NS_OK;
  }
  else if (!PL_strcasecmp(aCharset, "UTF-8")) {
    CopyUTF16toUTF8(inString, outString);
    return NS_OK;
  }

  nsresult rv;
  nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr <nsIUnicodeEncoder> encoder;
  // get an unicode converter
  if (aIsCharsetCanonical)  // optimize for modified UTF-7 used by IMAP
    rv = ccm->GetUnicodeEncoderRaw(aCharset, getter_AddRefs(encoder));
  else
    rv = ccm->GetUnicodeEncoder(aCharset, getter_AddRefs(encoder));
  NS_ENSURE_SUCCESS(rv, rv);

  rv = encoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nsnull, '?');
  NS_ENSURE_SUCCESS(rv, rv);

  const PRUnichar *originalSrcPtr = inString.get();
  const PRUnichar *currentSrcPtr = originalSrcPtr;
  PRInt32 originalUnicharLength = inString.Length();
  PRInt32 srcLength;
  PRInt32 dstLength;
  char localbuf[512];
  const PRInt32 kLocalBufSize = sizeof(localbuf);
  PRInt32 consumedLen = 0;

  outString.Truncate();
  // Convert chunk by chunk; each pass consumes as much input as fits into
  // localbuf and appends the produced bytes to outString.
  while (consumedLen < originalUnicharLength) {
    srcLength = originalUnicharLength - consumedLen;
    dstLength = kLocalBufSize;
    rv = encoder->Convert(currentSrcPtr, &srcLength, localbuf, &dstLength);
    if (NS_FAILED(rv) || dstLength == 0)
      break;
    outString.Append(localbuf, dstLength);

    currentSrcPtr += srcLength;
    consumedLen = currentSrcPtr - originalSrcPtr; // src length used so far
  }

  // dstLength is an in/out parameter: on input it is the capacity of the
  // buffer. It still holds the byte count of the last chunk here, so reset
  // it, otherwise Finish() may not have room for the trailing bytes.
  dstLength = kLocalBufSize;
  rv = encoder->Finish(localbuf, &dstLength);
  if (NS_SUCCEEDED(rv))
    outString.Append(localbuf, dstLength);
  return rv;
}
/**
 * Convert a byte string in the given charset to UTF-16.
 *
 * @param aCharset  Name of the source charset. NULL, "", "us-ascii" and
 *                  "ISO-8859-1" take the widening fast path; "UTF-8" is
 *                  validated and converted directly.
 * @param inString  Bytes to convert.
 * @param outString Receives the UTF-16 text; previous content is replaced.
 * @param aIsCharsetCanonical
 *                  When true the decoder is looked up with
 *                  GetUnicodeDecoderRaw() (optimization for the modified
 *                  UTF-7 used by IMAP), otherwise with GetUnicodeDecoder().
 * @return NS_OK on success, NS_ERROR_UNEXPECTED when the charset is UTF-8 but
 *         the input is not valid UTF-8, otherwise the failure code from
 *         obtaining the decoder or the result of the last Convert() call.
 */
nsresult nsMsgI18NConvertToUnicode(const char* aCharset,
                                   const nsAFlatCString& inString,
                                   nsAString& outString,
                                   PRBool aIsCharsetCanonical)
{
  if (inString.IsEmpty()) {
    outString.Truncate();
    return NS_OK;
  }
  // A NULL charset is handled like an empty one rather than being dereferenced.
  else if (!aCharset || !*aCharset || !PL_strcasecmp(aCharset, "us-ascii") ||
           !PL_strcasecmp(aCharset, "ISO-8859-1")) {
    // Despite its name, it also works for Latin-1.
    CopyASCIItoUTF16(inString, outString);
    return NS_OK;
  }
  else if (!PL_strcasecmp(aCharset, "UTF-8")) {
    if (IsUTF8(inString)) {
      nsAutoString tmp;
      CopyUTF8toUTF16(inString, tmp);
      // Drop a leading U+FEFF (byte order mark) from the result.
      if (!tmp.IsEmpty() && tmp.get()[0] == PRUnichar(0xFEFF))
        tmp.Cut(0, 1);
      outString.Assign(tmp);
      return NS_OK;
    }
    NS_WARNING("Invalid UTF-8 string");
    return NS_ERROR_UNEXPECTED;
  }

  nsresult rv;
  nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr <nsIUnicodeDecoder> decoder;
  // get an unicode converter
  if (aIsCharsetCanonical)  // optimize for modified UTF-7 used by IMAP
    rv = ccm->GetUnicodeDecoderRaw(aCharset, getter_AddRefs(decoder));
  else
    rv = ccm->GetUnicodeDecoder(aCharset, getter_AddRefs(decoder));
  NS_ENSURE_SUCCESS(rv, rv);

  const char *originalSrcPtr = inString.get();
  const char *currentSrcPtr = originalSrcPtr;
  PRInt32 originalLength = inString.Length();
  PRInt32 srcLength;
  PRInt32 dstLength;
  PRUnichar localbuf[512];
  PRInt32 consumedLen = 0;

  outString.Truncate();
  // Convert chunk by chunk; dstLength is the capacity of localbuf in
  // PRUnichar units on input and the number of units written on output.
  while (consumedLen < originalLength) {
    srcLength = originalLength - consumedLen;
    dstLength = 512;
    rv = decoder->Convert(currentSrcPtr, &srcLength, localbuf, &dstLength);
    if (NS_FAILED(rv) || dstLength == 0)
      break;
    outString.Append(localbuf, dstLength);

    currentSrcPtr += srcLength;
    consumedLen = currentSrcPtr - originalSrcPtr; // src length used so far
  }
  return rv;
}
// Charset used by the file system.
const char * nsMsgI18NFileSystemCharset()
{
/* Get a charset used for the file. */
static nsCAutoString fileSystemCharset;
if (fileSystemCharset.IsEmpty())
{
nsresult rv;
nsCOMPtr <nsIPlatformCharset> platformCharset = do_GetService(NS_PLATFORMCHARSET_CONTRACTID, &rv);
if (NS_SUCCEEDED(rv)) {
rv = platformCharset->GetCharset(kPlatformCharsetSel_FileName,
fileSystemCharset);
}
if (NS_FAILED(rv))
fileSystemCharset.Assign("ISO-8859-1");
}
return fileSystemCharset.get();
}
/**
 * Store the charset used for plain text files in aCharset.
 *
 * The platform charset service is asked for the plain-text-in-file charset;
 * when the service or the lookup fails, aCharset is set to "ISO-8859-1".
 */
void nsMsgI18NTextFileCharset(nsACString& aCharset)
{
  nsresult status;
  nsCOMPtr<nsIPlatformCharset> platform =
    do_GetService(NS_PLATFORMCHARSET_CONTRACTID, &status);

  if (NS_FAILED(status) ||
      NS_FAILED(platform->GetCharset(kPlatformCharsetSel_PlainTextInFile, aCharset))) {
    aCharset.Assign("ISO-8859-1");
  }
}
// MIME encoder, output string should be freed by PR_FREE
// XXX : fix callers later to avoid allocation and copy
char * nsMsgI18NEncodeMimePartIIStr(const char *header, PRBool structured, const char *charset, PRInt32 fieldnamelen, PRBool usemime)
{
// No MIME, convert to the outgoing mail charset.
if (PR_FALSE == usemime) {
nsCAutoString convertedStr;
if (NS_SUCCEEDED(ConvertFromUnicode(charset, NS_ConvertUTF8toUTF16(header),
convertedStr)))
return PL_strdup(convertedStr.get());
else
return PL_strdup(header);
}
char *encodedString = nsnull;
nsresult res;
nsCOMPtr<nsIMimeConverter> converter = do_GetService(NS_MIME_CONVERTER_CONTRACTID, &res);
if (NS_SUCCEEDED(res) && nsnull != converter)
res = converter->EncodeMimePartIIStr_UTF8(header, structured, charset, fieldnamelen, kMIME_ENCODED_WORD_SIZE, &encodedString);
return NS_SUCCEEDED(res) ? encodedString : nsnull;
}
/**
 * Return true if a charset is stateful (e.g. JIS).
 * Only "ISO-2022-JP" (compared case-insensitively) is recognized.
 */
PRBool nsMsgI18Nstateful_charset(const char *charset)
{
  //TODO: use charset manager's service
  const PRBool isIso2022Jp = !nsCRT::strcasecmp(charset, "ISO-2022-JP");
  return isIso2022Jp;
}
/**
 * Return true if the charset converter manager reports the charset's
 * ".isMultibyte" data as "true" (compared case-insensitively).
 * Returns false when the manager or the charset data is unavailable.
 */
PRBool nsMsgI18Nmultibyte_charset(const char *charset)
{
  nsresult rv;
  nsCOMPtr<nsICharsetConverterManager> converterManager =
    do_GetService(kCharsetConverterManagerCID, &rv);
  if (NS_FAILED(rv))
    return PR_FALSE;

  nsAutoString multibyteFlag;
  rv = converterManager->GetCharsetData(charset, NS_LITERAL_STRING(".isMultibyte").get(), multibyteFlag);
  if (NS_FAILED(rv))
    return PR_FALSE;

  return multibyteFlag.LowerCaseEqualsLiteral("true");
}
/**
 * Check whether every character of inString can be encoded in charset.
 *
 * @param charset          Charset to test against; NULL or "" yields PR_TRUE.
 * @param inString         Null-terminated UTF-16 text; NULL or "" yields PR_TRUE.
 * @param fallbackCharset  Optional. When the text does not fit and this is
 *                         non-null, nsMsgI18NSaveAsCharset() is tried with it
 *                         and its outcome decides the result.
 * @return PR_FALSE when the encoder reports NS_ERROR_UENC_NOMAPPING and no
 *         fallback conversion succeeds; PR_TRUE otherwise (including when the
 *         converter manager or the encoder cannot be obtained).
 */
PRBool nsMsgI18Ncheck_data_in_charset_range(const char *charset, const PRUnichar* inString, char **fallbackCharset)
{
  if (!charset || !*charset || !inString || !*inString)
    return PR_TRUE;

  nsresult rv;
  PRBool inRange = PR_TRUE;

  nsCOMPtr<nsICharsetConverterManager> converterManager =
    do_GetService(kCharsetConverterManagerCID, &rv);

  nsCOMPtr<nsIUnicodeEncoder> encoder;
  // get an unicode converter
  if (NS_SUCCEEDED(rv))
    rv = converterManager->GetUnicodeEncoderRaw(charset, getter_AddRefs(encoder));

  if (NS_SUCCEEDED(rv)) {
    const PRUnichar *cursor = inString;
    const PRInt32 totalLen = nsCRT::strlen(inString);
    char scratch[512];
    PRInt32 consumed = 0;

    // convert from unicode, one scratch buffer at a time; the output itself
    // is discarded, only the status of each pass matters
    while (consumed < totalLen) {
      PRInt32 chunkSrcLen = totalLen - consumed;
      PRInt32 chunkDstLen = 512;
      rv = encoder->Convert(cursor, &chunkSrcLen, scratch, &chunkDstLen);
      if (NS_ERROR_UENC_NOMAPPING == rv) {
        inRange = PR_FALSE;
        break;
      }
      if (NS_FAILED(rv) || 0 == chunkDstLen)
        break;

      cursor += chunkSrcLen;
      consumed = cursor - inString; // src length used so far
    }
  }

  // if the conversion was not successful then try fallback to other charsets
  if (!inRange && fallbackCharset) {
    nsXPIDLCString fallbackOutput;
    rv = nsMsgI18NSaveAsCharset("text/plain", charset, inString,
                                getter_Copies(fallbackOutput), fallbackCharset);
    inRange = (NS_SUCCEEDED(rv) && NS_ERROR_UENC_NOMAPPING != rv);
  }

  return inRange;
}
/**
 * Simple parser to parse META charset.
 * It only supports the case when the description is within one line.
 *
 * Lines are read until one contains "/HEAD" or a META HTTP-EQUIV
 * CONTENT-TYPE line with a CHARSET value is found. Each line is upper-cased
 * before it is examined, so the returned name is upper case.
 *
 * @param fileSpec  File to scan; must not be a directory.
 * @return Pointer to a function-local static buffer holding the charset name,
 *         or an empty string when none was found or the label is UTF-16 or
 *         UTF-32. The buffer is overwritten by the next call.
 */
const char *
nsMsgI18NParseMetaCharset(nsFileSpec* fileSpec)
{
  static char charset[kMAX_CSNAME+1];
  char buffer[512];

  *charset = '\0';

  if (fileSpec->IsDirectory()) {
    NS_ASSERTION(0,"file is a directory");
    return charset;
  }

  nsInputFileStream fileStream(*fileSpec);

  while (!fileStream.eof() && !fileStream.failed() &&
         fileStream.is_open()) {
    fileStream.readline(buffer, 512);
    if (*buffer == nsCRT::CR || *buffer == nsCRT::LF || *buffer == 0)
      continue;

    // toupper() is only defined for values representable as unsigned char
    // (or EOF); go through unsigned char so bytes >= 0x80 are not passed as
    // negative values where plain char is signed.
    for (char *p = buffer; *p; ++p)
      *p = static_cast<char>(toupper(static_cast<unsigned char>(*p)));

    if (PL_strstr(buffer, "/HEAD"))
      break;

    char *charsetAttr = PL_strstr(buffer, "CHARSET");
    if (charsetAttr &&
        PL_strstr(buffer, "META") &&
        PL_strstr(buffer, "HTTP-EQUIV") &&
        PL_strstr(buffer, "CONTENT-TYPE")) {
      char *cp = PL_strstr(charsetAttr, "=");
      char *newStr;
      char *token = cp ? nsCRT::strtok(cp + 1, " \"\'", &newStr) : nsnull;
      if (token) {
        PL_strncpy(charset, token, sizeof(charset));
        charset[sizeof(charset)-1] = '\0';

        // this function cannot parse a file if it is really
        // encoded by one of the following charsets
        // so we can say that the charset label must be incorrect for
        // the .html if we actually see those charsets parsed
        // and we should ignore them
        if (!nsCRT::strncasecmp("UTF-16", charset, sizeof("UTF-16")-1) ||
            !nsCRT::strncasecmp("UTF-32", charset, sizeof("UTF-32")-1))
          charset[0] = '\0';

        break;
      }
    }
  }

  return charset;
}
/**
 * Convert inString to its entity form using the html40Latin1 entity set.
 *
 * @param inString   Text to convert.
 * @param outString  Truncated first; adopts the converted buffer on success.
 * @return Result of creating the entity converter or of the conversion.
 */
nsresult nsMsgI18NConvertToEntity(const nsString& inString, nsString* outString)
{
  outString->Truncate();

  nsresult rv;
  nsCOMPtr<nsIEntityConverter> converter = do_CreateInstance(kEntityConverterCID, &rv);
  if (NS_FAILED(rv))
    return rv;

  PRUnichar *converted = nsnull;
  rv = converter->ConvertToEntities(inString.get(), nsIEntityConverter::html40Latin1, &converted);
  // outString takes ownership of the returned buffer.
  if (NS_SUCCEEDED(rv) && converted)
    outString->Adopt(converted);

  return rv;
}
/**
 * Convert UTF-16 text to the given charset, applying the fallbacks configured
 * below for the content type.
 *
 * @param contentType      TEXT_HTML or TEXT_PLAIN (case-insensitive); any other
 *                         value returns NS_ERROR_ILLEGAL_VALUE.
 * @param charset          Requested charset; resolved to its preferred name via
 *                         the charset alias service before use.
 * @param inString         Null-terminated UTF-16 input.
 * @param outString        Set to nsnull first, then receives the newly
 *                         allocated converted string.
 * @param fallbackCharset  Optional. For text/plain, when the converter reports
 *                         NS_ERROR_UENC_NOMAPPING, the conversion is retried
 *                         with the list from the pref
 *                         "intl.fallbackCharsetList.<charset>" and the charset
 *                         actually used is stored here (nsnull if it cannot be
 *                         queried).
 * @param isAsciiOnly      Optional. Set to PR_TRUE when the input is pure
 *                         ASCII; otherwise PR_FALSE, and for HTML output in a
 *                         non-stateful charset recomputed from the result.
 * @return NS_OK or the converter's result code; NS_ERROR_OUT_OF_MEMORY when the
 *         ASCII copy fails; NS_ERROR_UENC_NOMAPPING when a fallback was wanted
 *         but no fallback pref exists.
 */
nsresult nsMsgI18NSaveAsCharset(const char* contentType, const char *charset,
const PRUnichar* inString, char** outString,
char **fallbackCharset, PRBool *isAsciiOnly)
{
NS_ENSURE_ARG_POINTER(contentType);
NS_ENSURE_ARG_POINTER(charset);
NS_ENSURE_ARG_POINTER(inString);
NS_ENSURE_ARG_POINTER(outString);
*outString = nsnull;
// Fast path: pure ASCII input needs no charset conversion at all.
if (nsCRT::IsAscii(inString)) {
if (isAsciiOnly)
*isAsciiOnly = PR_TRUE;
*outString = nsCRT::strdup(NS_LossyConvertUTF16toASCII(inString).get());
return (nsnull != *outString) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}
if (isAsciiOnly)
*isAsciiOnly = PR_FALSE;
PRBool bTEXT_HTML = PR_FALSE;
nsresult res;
// Only text/html and text/plain are handled.
if (!nsCRT::strcasecmp(contentType, TEXT_HTML)) {
bTEXT_HTML = PR_TRUE;
}
else if (nsCRT::strcasecmp(contentType, TEXT_PLAIN)) {
return NS_ERROR_ILLEGAL_VALUE; // not supported type
}
// Resolve the requested charset to its preferred name.
nsCOMPtr <nsICharsetAlias> calias =
do_GetService(NS_CHARSETALIAS_CONTRACTID, &res);
NS_ENSURE_SUCCESS(res, res);
nsCAutoString charsetName;
res = calias->GetPreferred(nsDependentCString(charset),
charsetName);
NS_ENSURE_SUCCESS(res, res);
// charset converter plus entity, NCR generation
nsCOMPtr <nsISaveAsCharset> conv = do_CreateInstance(NS_SAVEASCHARSET_CONTRACTID, &res);
NS_ENSURE_SUCCESS(res, res);
// attribute:
// html text - charset conv then fallback to entity or NCR
// plain text - charset conv then fallback to '?'
if (bTEXT_HTML)
// For ISO-8859-1 only, convert to entity first (always generate entities like &nbsp;).
res = conv->Init(charsetName.get(),
charsetName.EqualsLiteral("ISO-8859-1") ?
nsISaveAsCharset::attr_htmlTextDefault :
nsISaveAsCharset::attr_EntityAfterCharsetConv + nsISaveAsCharset::attr_FallbackDecimalNCR,
nsIEntityConverter::html32);
else
// fallback for text/plain: first try transliterate then '?'
res = conv->Init(charsetName.get(),
nsISaveAsCharset::attr_FallbackQuestionMark + nsISaveAsCharset::attr_EntityAfterCharsetConv,
nsIEntityConverter::transliterate);
NS_ENSURE_SUCCESS(res, res);
const PRUnichar *input = inString;
// Mapping characters in a certain range (required for Japanese only)
nsAutoString mapped;
if (charsetName.EqualsLiteral("ISO-2022-JP")) {
// Cached pref value: -1 = not read yet, 0 = apply the mapping, 1 = send as is.
static PRInt32 sSendHankakuKana = -1;
if (sSendHankakuKana < 0) {
nsCOMPtr<nsIPrefBranch> prefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID, &res));
NS_ENSURE_SUCCESS(res, res);
PRBool sendHankaku;
// Get a hidden 4.x pref with no UI, get it only once.
if (NS_FAILED(prefBranch->GetBoolPref("mailnews.send_hankaku_kana", &sendHankaku)))
sSendHankakuKana = 0; // no pref means need the mapping
else
sSendHankakuKana = sendHankaku ? 1 : 0;
}
if (!sSendHankakuKana) {
// Run the hankaku-to-zenkaku text transform; if it is unavailable or
// fails, the unmapped input is converted instead.
nsCOMPtr <nsITextTransform> textTransform = do_CreateInstance(NS_HANKAKUTOZENKAKU_CONTRACTID, &res);
if (NS_SUCCEEDED(res)) {
res = textTransform->Change(inString, nsCRT::strlen(inString), mapped);
if (NS_SUCCEEDED(res))
input = mapped.get();
}
}
}
// Convert to charset
res = conv->Convert(input, outString);
// If the converter cannot encode to the charset,
// then fallback to pref specified charsets.
if (NS_ERROR_UENC_NOMAPPING == res && !bTEXT_HTML && fallbackCharset) {
nsCOMPtr<nsIPrefBranch> prefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID, &res));
NS_ENSURE_SUCCESS(res, res);
// The pref name is built from the charset as passed in by the caller,
// not from the preferred name in charsetName.
nsCAutoString prefString("intl.fallbackCharsetList.");
prefString.Append(charset);
nsXPIDLCString fallbackList;
res = prefBranch->GetCharPref(prefString.get(), getter_Copies(fallbackList));
// do the fallback only if there is a pref for the charset
if (NS_FAILED(res) || fallbackList.IsEmpty())
return NS_ERROR_UENC_NOMAPPING;
// Re-initialize the same converter with the fallback list.
res = conv->Init(fallbackList.get(),
nsISaveAsCharset::attr_FallbackQuestionMark +
nsISaveAsCharset::attr_EntityAfterCharsetConv +
nsISaveAsCharset::attr_CharsetFallback,
nsIEntityConverter::transliterate);
NS_ENSURE_SUCCESS(res, res);
// free whatever we have now
PR_FREEIF(*outString);
res = conv->Convert(input, outString);
NS_ENSURE_SUCCESS(res, res);
// get the actual charset used for the conversion
if (NS_FAILED(conv->GetCharset(fallbackCharset)))
*fallbackCharset = nsnull;
}
// In case of HTML, non ASCII may be encoded as CER, NCR.
// Exclude stateful charset which is 7 bit but not ASCII only.
else if (isAsciiOnly && bTEXT_HTML && *outString &&
!nsMsgI18Nstateful_charset(charsetName.get()))
*isAsciiOnly = nsCRT::IsAscii(*outString);
return res;
}
/**
 * Produce the NNTP XPAT search string in non-RFC1522 form.
 *
 * The UTF-16 input is narrowed to 8-bit characters with a lossy copy;
 * aCharset is not consulted.
 *
 * @return Always NS_OK.
 */
nsresult nsMsgI18NFormatNNTPXPATInNonRFC1522Format(const nsCString& aCharset,
                                                   const nsString& inString,
                                                   nsCString& outString)
{
  LossyCopyUTF16toASCII(inString, outString);
  return NS_OK;
}
const char *
nsMsgI18NGetAcceptLanguage(void)
{
nsCOMPtr<nsIPrefBranch> prefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID));
if (prefBranch)
{
nsCOMPtr<nsIPrefLocalizedString> prefString;
prefBranch->GetComplexValue("intl.accept_languages",
NS_GET_IID(nsIPrefLocalizedString),
getter_AddRefs(prefString));
if (prefString)
{
nsXPIDLString ucsval;
prefString->ToString(getter_Copies(ucsval));
if (!ucsval.IsEmpty())
{
static nsCAutoString acceptLang;
LossyCopyUTF16toASCII(ucsval, acceptLang);
return acceptLang.get();
}
}
}
// Default Accept-Language
return "en";
}