RetroZilla/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp
2015-10-20 23:03:22 -04:00

1408 lines
42 KiB
C++

/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Mozilla Text to HTML converter code.
*
* The Initial Developer of the Original Code is
* Ben Bucksch <http://www.bucksch.org>.
* Portions created by the Initial Developer are Copyright (C) 1999, 2000
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "mozTXTToHTMLConv.h"
#include "nsIServiceManager.h"
#include "nsNetCID.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"
#include "nsCRT.h"
#include "nsIExternalProtocolHandler.h"
static NS_DEFINE_CID(kIOServiceCID, NS_IOSERVICE_CID);
#ifdef DEBUG_BenB_Perf
#include "prtime.h"
#include "prinrval.h"
#endif
const PRFloat64 growthRate = 1.2;
// Whitespace predicate used by the scanners in this file. In addition to
// ASCII whitespace it accepts two non-ASCII code points:
//   0xA0   - non-breaking space. Per bug 183111 the editor replaces runs of
//            spaces with leading 0xA0's and a single trailing space.
//   0x3000 - the Japanese ideographic space.
static inline PRBool IsSpace(const PRUnichar aChar)
{
  if (aChar == 0xA0 || aChar == 0x3000)
    return PR_TRUE;
  return nsCRT::IsAsciiSpace(aChar) ? PR_TRUE : PR_FALSE;
}
// Appends |ch| to |aStringToAppendTo|. The three HTML-significant
// characters '<', '>' and '&' are appended as "&lt;", "&gt;" and "&amp;";
// every other character is appended unchanged.
void
mozTXTToHTMLConv::EscapeChar(const PRUnichar ch, nsString& aStringToAppendTo)
{
  if (ch == '<')
    aStringToAppendTo.AppendLiteral("&lt;");
  else if (ch == '>')
    aStringToAppendTo.AppendLiteral("&gt;");
  else if (ch == '&')
    aStringToAppendTo.AppendLiteral("&amp;");
  else
    aStringToAppendTo += ch;
}
// Escapes '<', '>' and '&' inside |aInString| IN PLACE, replacing each with
// its entity reference.
//
// This is done by hand because, according to the original author, the
// ReplaceSubstring routines misbehave when the replacement text contains
// the character being replaced (as "&amp;" does for "&").
void
mozTXTToHTMLConv::EscapeStr(nsString& aInString)
{
  PRUint32 idx = 0;
  while (idx < aInString.Length())
  {
    const PRUnichar current = aInString[idx];
    if (current == '<')
    {
      aInString.Cut(idx, 1);
      aInString.Insert(NS_LITERAL_STRING("&lt;"), idx);
      idx += 4; // step over the four characters just inserted
    }
    else if (current == '>')
    {
      aInString.Cut(idx, 1);
      aInString.Insert(NS_LITERAL_STRING("&gt;"), idx);
      idx += 4; // step over the four characters just inserted
    }
    else if (current == '&')
    {
      aInString.Cut(idx, 1);
      aInString.Insert(NS_LITERAL_STRING("&amp;"), idx);
      idx += 5; // step over the five characters just inserted
    }
    else
    {
      idx++;
    }
  }
}
void
mozTXTToHTMLConv::UnescapeStr(const PRUnichar * aInString, PRInt32 aStartPos, PRInt32 aLength, nsString& aOutString)
{
const PRUnichar * subString = nsnull;
for (PRUint32 i = aStartPos; PRInt32(i) - aStartPos < aLength;)
{
PRInt32 remainingChars = i - aStartPos;
if (aInString[i] == '&')
{
subString = &aInString[i];
if (!nsCRT::strncmp(subString, NS_LITERAL_STRING("&lt;").get(), MinInt(4, aLength - remainingChars)))
{
aOutString.Append(PRUnichar('<'));
i += 4;
}
else if (!nsCRT::strncmp(subString, NS_LITERAL_STRING("&gt;").get(), MinInt(4, aLength - remainingChars)))
{
aOutString.Append(PRUnichar('>'));
i += 4;
}
else if (!nsCRT::strncmp(subString, NS_LITERAL_STRING("&amp;").get(), MinInt(5, aLength - remainingChars)))
{
aOutString.Append(PRUnichar('&'));
i += 5;
}
else
{
aOutString += aInString[i];
i++;
}
}
else
{
aOutString += aInString[i];
i++;
}
}
}
void
mozTXTToHTMLConv::CompleteAbbreviatedURL(const PRUnichar * aInString, PRInt32 aInLength,
const PRUint32 pos, nsString& aOutString)
{
NS_ASSERTION(pos < aInLength, "bad args to CompleteAbbreviatedURL, see bug #190851");
if (pos >= aInLength)
return;
if (aInString[pos] == '@')
{
// only pre-pend a mailto url if the string contains a .domain in it..
//i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
nsDependentString inString(aInString, aInLength);
if (inString.FindChar('.', pos) != kNotFound) // if we have a '.' after the @ sign....
{
aOutString.AssignLiteral("mailto:");
aOutString += aInString;
}
}
else if (aInString[pos] == '.')
{
if (ItMatchesDelimited(aInString, aInLength,
NS_LITERAL_STRING("www.").get(), 4, LT_IGNORE, LT_IGNORE))
{
aOutString.AssignLiteral("http://");
aOutString += aInString;
}
else if (ItMatchesDelimited(aInString,aInLength, NS_LITERAL_STRING("ftp.").get(), 4, LT_IGNORE, LT_IGNORE))
{
aOutString.AssignLiteral("ftp://");
aOutString += aInString;
}
}
}
// Searches backwards from |pos| (the index of the ':', '.' or '@' that
// triggered the scan) for the first character of a URL, using the rules of
// the recognition mode |check|. On success |start| receives that index and
// PR_TRUE is returned.
//   RFC1738     : text of the form "<URL:...", |pos| being the ':'
//   RFC2396E    : URL enclosed in <...> or "..."
//   freetext    : scheme made of letters, digits, '+', '-', '.', starting
//                 with a letter
//   abbreviated : everything back to the nearest delimiter; must start with
//                 a letter or digit
PRBool
mozTXTToHTMLConv::FindURLStart(const PRUnichar * aInString, PRInt32 aInLength,
                               const PRUint32 pos, const modetype check,
                               PRUint32& start)
{
  if (check == RFC1738)
  {
    const PRUnichar * candidate = &aInString[MaxInt(pos - 4, 0)];
    if (nsCRT::strncmp(candidate, NS_LITERAL_STRING("<URL:").get(), 5))
      return PR_FALSE;
    start = pos + 1;
    return PR_TRUE;
  }

  if (check == RFC2396E)
  {
    nsString temp(aInString, aInLength);
    PRInt32 i = kNotFound;
    if (pos > 0)
      i = temp.RFindCharInSet(NS_LITERAL_STRING("<>\"").get(), pos - 1);
    if (i == kNotFound)
      return PR_FALSE;
    // Only an opening '<' or '"' starts a URL; a '>' does not.
    const PRUnichar opener = temp[PRUint32(i)];
    if (opener != '<' && opener != '"')
      return PR_FALSE;
    start = PRUint32(i + 1);
    return start < pos;
  }

  if (check == freetext)
  {
    // Walk back over the characters allowed in a scheme name.
    PRInt32 i = pos - 1;
    while (i >= 0)
    {
      const PRUnichar c = aInString[PRUint32(i)];
      const PRBool isSchemeChar =
        nsCRT::IsAsciiAlpha(c) || nsCRT::IsAsciiDigit(c) ||
        c == '+' || c == '-' || c == '.';
      if (!isSchemeChar)
        break;
      i--;
    }
    ++i; // first character that passed the test
    if (i >= 0 && PRUint32(i) < pos &&
        nsCRT::IsAsciiAlpha(aInString[PRUint32(i)]))
    {
      start = PRUint32(i);
      return PR_TRUE;
    }
    return PR_FALSE;
  }

  if (check == abbreviated)
  {
    PRInt32 i = pos - 1;
    // This disallows non-ASCII characters for email.
    // Currently correct, but revisit later after standards changed.
    const PRBool isEmail = aInString[pos] == (PRUnichar)'@';
    while (i >= 0)
    {
      const PRUnichar c = aInString[PRUint32(i)];
      // These characters mark the start of the URL.
      const PRBool isBoundary =
        c == '>' || c == '<' || c == '"' || c == '\'' ||
        c == '`' || c == ',' || c == '{' || c == '[' ||
        c == '(' || c == '|' || c == '\\' ||
        IsSpace(c) ||
        (isEmail && !nsCRT::IsAscii(c));
      if (isBoundary)
        break;
      i--;
    }
    ++i; // first character after the boundary
    if (i >= 0 && PRUint32(i) < pos &&
        (nsCRT::IsAsciiAlpha(aInString[PRUint32(i)]) ||
         nsCRT::IsAsciiDigit(aInString[PRUint32(i)])))
    {
      start = PRUint32(i);
      return PR_TRUE;
    }
    return PR_FALSE;
  }

  return PR_FALSE;
}
// Searches forward from |pos| for the last character of a URL whose first
// character is at |start|, using the rules of mode |check|. On success
// |end| receives the index of the last URL character and PR_TRUE is
// returned.
PRBool
mozTXTToHTMLConv::FindURLEnd(const PRUnichar * aInString, PRInt32 aInStringLength, const PRUint32 pos,
                             const modetype check, const PRUint32 start, PRUint32& end)
{
  if (check == RFC1738 || check == RFC2396E)
  {
    nsString temp(aInString, aInStringLength);
    const PRInt32 delimPos =
      temp.FindCharInSet(NS_LITERAL_STRING("<>\"").get(), pos + 1);
    if (delimPos == kNotFound)
      return PR_FALSE;

    // The closing delimiter has to pair with the opening one: '>' for
    // "<URL:" and "<...>", '"' for a quoted URL.
    PRUnichar closer = '"';
    if (check == RFC1738 || temp[start - 1] == '<')
      closer = '>';
    if (temp[PRUint32(delimPos)] != closer)
      return PR_FALSE;

    end = PRUint32(delimPos - 1);
    return end > pos;
  }

  if (check == freetext || check == abbreviated)
  {
    PRUint32 i = pos + 1;
    const PRBool isEmail = aInString[pos] == (PRUnichar)'@';
    PRBool haveOpeningBracket = PR_FALSE;
    for (; PRInt32(i) < aInStringLength; i++)
    {
      const PRUnichar c = aInString[i];
      // These characters mark the end of the URL.
      if (c == '>' || c == '<' || c == '"' || c == '`' ||
          c == '}' || c == ']' || c == '{' || c == '[' ||
          c == '|' ||
          (c == ')' && !haveOpeningBracket) ||
          IsSpace(c))
        break;
      // Disallow non-ASCII characters for email.
      // Currently correct, but revisit later after standards changed.
      if (isEmail && (c == '(' || c == '\'' || !nsCRT::IsAscii(c)))
        break;
      if (c == '(')
        haveOpeningBracket = PR_TRUE;
    }

    // These characters are allowed in the middle of a URL but not at its
    // end. Technically they are, but they are commonly used in normal text
    // right after a URL, so strip them.
    for (--i; i > pos; --i)
    {
      const PRUnichar c = aInString[i];
      const PRBool isTrailingPunctuation =
        c == '.' || c == ',' || c == ';' || c == '!' ||
        c == '?' || c == '-' || c == '\'';
      if (!isTrailingPunctuation)
        break;
    }

    if (i > pos)
    {
      end = i;
      return PR_TRUE;
    }
    return PR_FALSE;
  }

  return PR_FALSE;
}
void
mozTXTToHTMLConv::CalculateURLBoundaries(const PRUnichar * aInString, PRInt32 aInStringLength,
const PRUint32 pos, const PRUint32 whathasbeendone,
const modetype check, const PRUint32 start, const PRUint32 end,
nsString& txtURL, nsString& desc,
PRInt32& replaceBefore, PRInt32& replaceAfter)
{
PRUint32 descstart = start;
switch(check)
{
case RFC1738:
{
descstart = start - 5;
desc.Append(&aInString[descstart], end - descstart + 2); // include "<URL:" and ">"
replaceAfter = end - pos + 1;
} break;
case RFC2396E:
{
descstart = start - 1;
desc.Append(&aInString[descstart], end - descstart + 2); // include brackets
replaceAfter = end - pos + 1;
} break;
case freetext:
case abbreviated:
{
descstart = start;
desc.Append(&aInString[descstart], end - start + 1); // don't include brackets
replaceAfter = end - pos;
} break;
default: break;
} //switch
EscapeStr(desc);
txtURL.Append(&aInString[start], end - start + 1);
txtURL.StripWhitespace();
// FIX ME
nsAutoString temp2;
ScanTXT(&aInString[descstart], pos - descstart, ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
replaceBefore = temp2.Length();
return;
}
// Decides whether |aURL| should be turned into a link.
// Returns PR_FALSE when there is no IO service, the scheme cannot be
// extracted, or no protocol handler can be obtained for it. A handler that
// is not an nsIExternalProtocolHandler always qualifies; an external
// handler qualifies only if an external application exists for the scheme.
PRBool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL)
{
  if (!mIOService)
    return PR_FALSE;

  nsCAutoString scheme;
  nsresult rv = mIOService->ExtractScheme(aURL, scheme);
  if (NS_FAILED(rv))
    return PR_FALSE;

  // Look up the handler responsible for this scheme.
  nsCOMPtr<nsIProtocolHandler> handler;
  rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
  if (NS_FAILED(rv))
    return PR_FALSE;

  nsCOMPtr<nsIExternalProtocolHandler> externalHandler = do_QueryInterface(handler, &rv);
  if (!externalHandler)
    return PR_TRUE; // not an external handler: linkify it

  // External handler: linkify only when an application is registered.
  PRBool exists;
  rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
  if (NS_FAILED(rv))
    return PR_FALSE;
  return exists ? PR_TRUE : PR_FALSE;
}
// Checks that |txtURL| is a URL worth linking and, if so, writes the anchor
// markup for it into |outputHTML|.
//   txtURL     : the raw (unescaped) URL text
//   desc       : the link text; callers pass it already HTML-escaped
//   mode       : recognition mode, selects the "moz-txt-link-*" CSS class
//   outputHTML : receives "<a class=... href=...>desc</a>" on success
// Returns PR_TRUE when the markup was produced, PR_FALSE when the IO
// service is unavailable, ShouldLinkify() rejects the URL, or no nsIURI can
// be created from it. The IO service is fetched lazily on first use.
//
// The URL is HTML-escaped before it is placed in the href attribute, so a
// '&', '<' or '>' inside the URL cannot be misread as markup or as the
// start of an entity reference.
PRBool
mozTXTToHTMLConv::CheckURLAndCreateHTML(
     const nsString& txtURL, const nsString& desc, const modetype mode,
     nsString& outputHTML)
{
  // Create *uri from txtURL
  nsCOMPtr<nsIURI> uri;
  nsresult rv = NS_OK;
  if (!mIOService)
    mIOService = do_GetService(kIOServiceCID, &rv);
  if (NS_FAILED(rv) || !mIOService)
    return PR_FALSE;

  // See if the url should be linkified.
  NS_ConvertUCS2toUTF8 utf8URL(txtURL);
  if (!ShouldLinkify(utf8URL))
    return PR_FALSE;

  // it would be faster if we could just check to see if there is a protocol
  // handler for the url and return instead of actually trying to create a url...
  rv = mIOService->NewURI(utf8URL, nsnull, nsnull, getter_AddRefs(uri));
  if (NS_FAILED(rv) || !uri)
    return PR_FALSE;

  // Real work
  outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
  switch (mode)
  {
    case RFC1738:
      outputHTML.AppendLiteral("rfc1738");
      break;
    case RFC2396E:
      outputHTML.AppendLiteral("rfc2396E");
      break;
    case freetext:
      outputHTML.AppendLiteral("freetext");
      break;
    case abbreviated:
      outputHTML.AppendLiteral("abbreviated");
      break;
    default: break;
  }

  // Escape the URL for use inside the attribute value.
  nsAutoString escapedURL(txtURL);
  EscapeStr(escapedURL);

  outputHTML.AppendLiteral("\" href=\"");
  outputHTML += escapedURL;
  outputHTML.AppendLiteral("\">");
  outputHTML += desc;
  outputHTML.AppendLiteral("</a>");
  return PR_TRUE;
}
// Runs URL detection on |aInString| for the trigger character at |aPos|
// and reports the result through |aStartPos| / |aEndPos|. Both are set to
// -1 first and keep that value when no URL is found; otherwise they receive
// the replaceBefore / replaceAfter values computed by FindURL(). The
// generated HTML is discarded.
//
// Returns NS_ERROR_NULL_POINTER when a pointer argument is null (matching
// the CiteLevelTXT entry point). An |aPos| outside [0, aInLength) is
// reported as "no URL found" rather than being passed to FindURL(), which
// would index the buffer with it.
NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const PRUnichar * aInString, PRInt32 aInLength, PRInt32 aPos, PRInt32 * aStartPos, PRInt32 * aEndPos)
{
  if (!aInString || !aStartPos || !aEndPos)
    return NS_ERROR_NULL_POINTER;

  *aStartPos = -1;
  *aEndPos = -1;

  if (aPos < 0 || aPos >= aInLength)
    return NS_OK;

  // call FindURL on the passed in string
  nsAutoString outputHTML; // we'll ignore the generated output HTML
  FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
  return NS_OK;
}
// Tries to recognise a URL around the trigger character at |pos| (':', '.'
// or '@'). The recognition modes are tried in the order given by |ranking|
// and the first one that yields a linkable URL wins. On success
// |outputHTML| holds the anchor markup, |replaceBefore| / |replaceAfter|
// hold the values computed by CalculateURLBoundaries(), and PR_TRUE is
// returned. On failure the two out-parameters are left untouched.
PRBool
mozTXTToHTMLConv::FindURL(const PRUnichar * aInString, PRInt32 aInLength, const PRUint32 pos,
     const PRUint32 whathasbeendone,
     nsString& outputHTML, PRInt32& replaceBefore, PRInt32& replaceAfter)
{
  enum statetype {unchecked, invalid, startok, endok, success};
  static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};

  // Indexed directly by modetype: 0 (= unknown) .. lastMode.
  statetype state[mozTXTToHTMLConv_lastMode + 1];

  // Decide which modes are worth checking for this trigger character:
  //   ':' -> every mode except abbreviated
  //   '.' -> abbreviated only
  //   '@' -> RFC2396E and abbreviated
  const PRUnichar trigger = aInString[pos];
  const statetype initialState = (trigger == ':') ? unchecked : invalid;
  for (PRInt32 mode = unknown; mode <= mozTXTToHTMLConv_lastMode; mode++)
    state[mode] = initialState;

  if (trigger == '@')
  {
    state[RFC2396E] = unchecked;
    state[abbreviated] = unchecked;
  }
  else if (trigger == '.')
  {
    state[abbreviated] = unchecked;
  }
  else if (trigger == ':')
  {
    state[abbreviated] = invalid;
  }

  // Test each mode in ranking order; the first successful one wins.
  for (PRInt32 iCheck = 0; iCheck < mozTXTToHTMLConv_numberOfModes; iCheck++)
  {
    const modetype check = ranking[iCheck];
    PRUint32 start, end;

    if (state[check] == unchecked &&
        FindURLStart(aInString, aInLength, pos, check, start))
      state[check] = startok;

    if (state[check] == startok &&
        FindURLEnd(aInString, aInLength, pos, check, start, end))
      state[check] = endok;

    if (state[check] != endok)
      continue;

    nsAutoString txtURL, desc;
    PRInt32 resultReplaceBefore, resultReplaceAfter;
    CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check, start, end,
                           txtURL, desc,
                           resultReplaceBefore, resultReplaceAfter);

    if (trigger != ':')
    {
      // No scheme in the text: try to complete it (mailto:, http://, ...).
      nsAutoString abbreviatedText = txtURL;
      txtURL.SetLength(0);
      CompleteAbbreviatedURL(abbreviatedText.get(), abbreviatedText.Length(),
                             pos - start, txtURL);
    }

    if (!txtURL.IsEmpty() &&
        CheckURLAndCreateHTML(txtURL, desc, check, outputHTML))
    {
      replaceBefore = resultReplaceBefore;
      replaceAfter = resultReplaceAfter;
      return PR_TRUE;
    }
  }

  return PR_FALSE;
}
// Tests whether the text at the START of |aInString| is |rep| (compared
// case-insensitively), optionally surrounded by characters of a required
// kind.
//   aInString / aInLength : text to test and its length
//   rep / aRepLen         : the string to look for and its length
//   before                : requirement on the character in front of |rep|.
//                           With LT_IGNORE there is no such character and
//                           |rep| is expected at index 0; with any other
//                           value the character is aInString[0] and |rep|
//                           is expected at index 1.
//   after                 : requirement on the character following |rep|
// LT_ALPHA / LT_DIGIT require an ASCII letter / digit. LT_DELIMITER
// requires a character that is neither an ASCII letter, nor an ASCII digit,
// nor equal to the first character of |rep|.
// Returns PR_TRUE on a match.
PRBool
mozTXTToHTMLConv::ItMatchesDelimited(const PRUnichar * aInString,
PRInt32 aInLength, const PRUnichar* rep, PRInt32 aRepLen,
LIMTYPE before, LIMTYPE after)
{
// this little method gets called a LOT. I found we were spending a
// lot of time just calculating the length of the variable "rep"
// over and over again every time we called it. So we're now passing
// an integer in here.
PRInt32 textLen = aInLength;
// Length pre-check. The text must hold |rep| plus one character for a
// non-ignored |before| and one for an |after| of LT_ALPHA / LT_DIGIT.
// An |after| of LT_DELIMITER does not add to the required length.
// NOTE(review): with after == LT_DELIMITER the read of textAfterPos below
// can therefore land on aInString[textLen]; this looks like it relies on
// the buffer being readable (e.g. null-terminated) there - confirm.
if
(
(before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER))
&& textLen < aRepLen ||
(before != LT_IGNORE || after != LT_IGNORE && after != LT_DELIMITER)
&& textLen < aRepLen + 1 ||
before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER
&& textLen < aRepLen + 2
)
return PR_FALSE;
// text0 is only meaningful when before != LT_IGNORE; textAfterPos is the
// character right behind |rep| (shifted by one if a |before| character is
// present).
PRUnichar text0 = aInString[0];
PRUnichar textAfterPos = aInString[aRepLen + (before == LT_IGNORE ? 0 : 1)];
// Reject if either neighbour violates its requirement, or if the text at
// the expected offset is not |rep| (case-insensitive).
if
(
before == LT_ALPHA
&& !nsCRT::IsAsciiAlpha(text0) ||
before == LT_DIGIT
&& !nsCRT::IsAsciiDigit(text0) ||
before == LT_DELIMITER
&&
(
nsCRT::IsAsciiAlpha(text0) ||
nsCRT::IsAsciiDigit(text0) ||
text0 == *rep
) ||
after == LT_ALPHA
&& !nsCRT::IsAsciiAlpha(textAfterPos) ||
after == LT_DIGIT
&& !nsCRT::IsAsciiDigit(textAfterPos) ||
after == LT_DELIMITER
&&
(
nsCRT::IsAsciiAlpha(textAfterPos) ||
nsCRT::IsAsciiDigit(textAfterPos) ||
textAfterPos == *rep
) ||
!Substring(Substring(aInString, aInString+aInLength),
(before == LT_IGNORE ? 0 : 1),
aRepLen).Equals(Substring(rep, rep+aRepLen),
nsCaseInsensitiveStringComparator())
)
return PR_FALSE;
return PR_TRUE;
}
// Counts the positions in |aInString| at which ItMatchesDelimited() reports
// a match for |rep| with the given |before| / |after| requirements. Every
// position from 0 to aInStringLength - 1 is tried, each with the remaining
// length of the string.
PRUint32
mozTXTToHTMLConv::NumberOfMatches(const PRUnichar * aInString, PRInt32 aInStringLength,
     const PRUnichar* rep, PRInt32 aRepLen, LIMTYPE before, LIMTYPE after)
{
  PRUint32 hits = 0;
  PRInt32 offset = 0;
  while (offset < aInStringLength)
  {
    if (ItMatchesDelimited(aInString + offset, aInStringLength - offset,
                           rep, aRepLen, before, after))
      ++hits;
    ++offset;
  }
  return hits;
}
// Recognises the opening or closing marker of a structured phrase such as
// *bold* and appends the matching HTML to |aOutString|.
//   aInString  : text beginning one character BEFORE the marker, or at the
//                marker itself when |col0| is true
//   tagTXT     : the marker (e.g. "*") and aTagTXTLen its length
//   tagHTML / attributeHTML : plain ASCII element name and attribute text
//   openTags   : number of currently open tags of this kind; incremented
//                for an opening marker, decremented for a closing one
// Returns PR_TRUE when markup was emitted.
//
// The pattern searched for is:
//   LT_DELIMITER - "*" - ALPHA -
//   [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER
// An opening tag is only emitted when the existence of a matching closing
// marker could be verified. The first possible opening/closing marker is
// used when there is a choice.
PRBool
mozTXTToHTMLConv::StructPhraseHit(const PRUnichar * aInString, PRInt32 aInStringLength, PRBool col0,
     const PRUnichar* tagTXT, PRInt32 aTagTXTLen,
     const char* tagHTML, const char* attributeHTML,
     nsString& aOutString, PRUint32& openTags)
{
  // Text in which closing markers are counted: skip the leading element
  // unless we are at column 0.
  const PRUnichar * rest = col0 ? aInString : &aInString[1];
  const PRInt32 restLength = col0 ? aInStringLength : aInStringLength - 1;

  // Opening tag?
  const PRBool looksLikeOpening =
    ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
                       (col0 ? LT_IGNORE : LT_DELIMITER), LT_ALPHA);
  if (looksLikeOpening &&
      NumberOfMatches(rest, restLength, tagTXT, aTagTXTLen,
                      LT_ALPHA, LT_DELIMITER) // remaining closing tags
        > openTags)
  {
    openTags++;
    aOutString.AppendLiteral("<");
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(PRUnichar(' '));
    aOutString.AppendASCII(attributeHTML);
    aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span>");
    return PR_TRUE;
  }

  // Closing tag?
  if (openTags > 0 &&
      ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
                         LT_ALPHA, LT_DELIMITER))
  {
    openTags--;
    aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span></");
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(PRUnichar('>'));
    return PR_TRUE;
  }

  return PR_FALSE;
}
// Tests whether the smiley |tagTXT| stands at the beginning of |aInString|
// and, if so, produces the replacement markup.
//   col0         : true when aInString[0] is the first character of the
//                  smiley; otherwise aInString[0] must be whitespace and
//                  the smiley starts at index 1
//   imageName    : CSS class placed on the outer <span>
//   outputHTML   : receives the markup (appended; when !col0 it is first
//                  reset to a single space)
//   glyphTextLen : receives the number of input characters consumed
// The smiley must be followed by the end of the text, by whitespace, or by
// one of . , ; 8 > ! ? that is itself followed by whitespace.
PRBool
mozTXTToHTMLConv::SmilyHit(const PRUnichar * aInString, PRInt32 aLength, PRBool col0,
         const char* tagTXT, const char* imageName,
         nsString& outputHTML, PRInt32& glyphTextLen)
{
  if ( !aInString || !tagTXT || !imageName )
    return PR_FALSE;

  PRInt32 tagLen = nsCRT::strlen(tagTXT);
  // Index of the first character behind the smiley.
  PRUint32 delim = (col0 ? 0 : 1) + tagLen;

  // Leading context: column 0, or a whitespace character in front.
  if (!col0 && !IsSpace(aInString[0]))
    return PR_FALSE;

  // Trailing context.
  PRBool trailingOk = aLength <= PRInt32(delim) || IsSpace(aInString[delim]);
  if (!trailingOk && aLength > PRInt32(delim + 1))
  {
    const PRUnichar follower = aInString[delim];
    const PRBool isPunctuation =
      follower == '.' || follower == ',' || follower == ';' ||
      follower == '8' || follower == '>' || follower == '!' ||
      follower == '?';
    trailingOk = isPunctuation && IsSpace(aInString[delim + 1]);
  }
  if (!trailingOk)
    return PR_FALSE;

  // The smiley text itself.
  // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
  NS_ConvertASCIItoUCS2 wideTag(tagTXT);
  if (!ItMatchesDelimited(aInString, aLength, wideTag.get(), tagLen,
                          col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE))
    return PR_FALSE;

  if (!col0)
  {
    outputHTML.Truncate();
    outputHTML.Append(PRUnichar(' '));
  }
  outputHTML.AppendLiteral("<span class=\""); // <span class="
  AppendASCIItoUTF16(imageName, outputHTML);  // e.g. smiley-frown
  outputHTML.AppendLiteral("\"><span> ");     // "> <span>
  AppendASCIItoUTF16(tagTXT, outputHTML);     // alt text
  outputHTML.AppendLiteral(" </span></span>"); // </span></span>
  glyphTextLen = (col0 ? 0 : 1) + tagLen;
  return PR_TRUE;
}
// Tests whether a glyph (smiley, form feed, "+/-", or a power such as x^2)
// starts at the beginning of |aInString| and, if so, appends its HTML
// replacement to |aOutputString| (not the original text).
//   aInString / aInLength : remaining input
//   col0                  : true when aInString[0] is at the very start of
//                           the text being scanned
//   glyphTextLen          : receives the number of input characters that
//                           the replacement covers
// Returns PR_TRUE on a hit.
//
// Changes relative to the previous version:
//   * aInString[1] is only read when aInLength > 1.
//   * The ";)" smiley is tested with the same column flag as all the other
//     smileys. It used to be tested with |col0| unconditionally, so a
//     " ;)" at the start of the text was not recognised although " ;-)"
//     was.
//   * The smileys live in a table instead of 21 copy-pasted calls.
PRBool
mozTXTToHTMLConv::GlyphHit(const PRUnichar * aInString, PRInt32 aInLength, PRBool col0,
         nsString& aOutputString, PRInt32& glyphTextLen)
{
  // Smiley text and the CSS class it maps to. Tried in this order; the
  // first hit wins.
  static const struct
  {
    const char* text;
    const char* cssClass;
  } kSmileys[] =
  {
    { ":-)",  "moz-smiley-s1"   },  // smile
    { ":)",   "moz-smiley-s1"   },  // smile
    { ":-D",  "moz-smiley-s5"   },  // laughing
    { ":-(",  "moz-smiley-s2"   },  // frown
    { ":(",   "moz-smiley-s2"   },  // frown
    { ":-[",  "moz-smiley-s6"   },  // embarrassed
    { ";-)",  "moz-smiley-s3"   },  // wink
    { ";)",   "moz-smiley-s3"   },  // wink
    { ":-\\", "moz-smiley-s7"   },  // undecided
    { ":-P",  "moz-smiley-s4"   },  // tongue
    { ";-P",  "moz-smiley-s4"   },  // tongue
    { "=-O",  "moz-smiley-s8"   },  // surprise
    { ":-*",  "moz-smiley-s9"   },  // kiss
    { ">:o",  "moz-smiley-s10"  },  // yell
    // NOTE(review): ">:o" maps to "moz-smiley-s10" but ">:-o" maps to
    // "moz-smiley-yell". Kept as is; confirm against the stylesheet whether
    // both should use the same class.
    { ">:-o", "moz-smiley-yell" },  // yell
    { "8-)",  "moz-smiley-s11"  },  // cool
    { ":-$",  "moz-smiley-s12"  },  // money
    { ":-!",  "moz-smiley-s13"  },  // foot
    { "O:-)", "moz-smiley-s14"  },  // innocent
    { ":'(",  "moz-smiley-s15"  },  // cry
    { ":-X",  "moz-smiley-s16"  }   // sealed
  };
  static const PRUint32 kSmileyCount = sizeof(kSmileys) / sizeof(kSmileys[0]);

  MOZ_TIMER_START(mGlyphHitTimer);

  PRUnichar text0 = aInString[0];
  PRUnichar text1 = aInLength > 1 ? aInString[1] : PRUnichar(0);
  PRUnichar firstChar = (col0 ? text0 : text1);

  // temporary variable used to store the glyph html text
  nsAutoString outputHTML;

  // Two cases are tried one after another:
  //   pass 0: the smiley starts at the expected position (index 0 at
  //           column 0, otherwise index 1 behind a whitespace character);
  //   pass 1: at column 0 only, a smiley preceded by one whitespace
  //           character.
  for (int pass = 0; pass < 2; pass++)
  {
    PRUnichar lead;
    PRBool smileyCol0;
    if (pass == 0)
    {
      lead = firstChar;
      smileyCol0 = col0;
    }
    else
    {
      if (!col0)
        break;
      lead = text1;
      smileyCol0 = PR_FALSE;
    }

    // Cheap pre-filter: every smiley starts with one of these characters.
    if (lead != ':' && lead != ';' && lead != '=' &&
        lead != '>' && lead != '8' && lead != 'O')
      continue;

    for (PRUint32 k = 0; k < kSmileyCount; k++)
    {
      if (SmilyHit(aInString, aInLength, smileyCol0,
                   kSmileys[k].text, kSmileys[k].cssClass,
                   outputHTML, glyphTextLen))
      {
        aOutputString.Append(outputHTML);
        MOZ_TIMER_STOP(mGlyphHitTimer);
        return PR_TRUE;
      }
    }
  }

  if (text0 == '\f')
  {
    aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
    glyphTextLen = 1;
    MOZ_TIMER_STOP(mGlyphHitTimer);
    return PR_TRUE;
  }

  if (text0 == '+' || text1 == '+')
  {
    if (ItMatchesDelimited(aInString, aInLength,
                           NS_LITERAL_STRING(" +/-").get(), 4,
                           LT_IGNORE, LT_IGNORE))
    {
      aOutputString.AppendLiteral(" &plusmn;");
      glyphTextLen = 4;
      MOZ_TIMER_STOP(mGlyphHitTimer);
      return PR_TRUE;
    }
    if (col0 && ItMatchesDelimited(aInString, aInLength,
                                   NS_LITERAL_STRING("+/-").get(), 3,
                                   LT_IGNORE, LT_IGNORE))
    {
      aOutputString.AppendLiteral("&plusmn;");
      glyphTextLen = 3;
      MOZ_TIMER_STOP(mGlyphHitTimer);
      return PR_TRUE;
    }
  }

  // x^2  =>  x<sup>2</sup>, also handle powers x^-2,  x^0.5
  // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
  if
    (
      text1 == '^'
      &&
      (
        nsCRT::IsAsciiDigit(text0) || nsCRT::IsAsciiAlpha(text0) ||
        text0 == ')' || text0 == ']' || text0 == '}'
      )
      &&
      (
        (2 < aInLength && nsCRT::IsAsciiDigit(aInString[2])) ||
        (3 < aInLength && aInString[2] == '-' && nsCRT::IsAsciiDigit(aInString[3]))
      )
    )
  {
    // Find first non-digit
    PRInt32 delimPos = 3;  // skip "^" and first digit (or '-')
    for (; delimPos < aInLength
           &&
           (
             nsCRT::IsAsciiDigit(aInString[delimPos]) ||
             (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
              nsCRT::IsAsciiDigit(aInString[delimPos + 1]))
           );
         delimPos++)
      ;

    // A letter directly behind the number means this is not a power.
    if (delimPos < aInLength && nsCRT::IsAsciiAlpha(aInString[delimPos]))
    {
      MOZ_TIMER_STOP(mGlyphHitTimer);
      return PR_FALSE;
    }

    outputHTML.Truncate();
    outputHTML += text0;
    outputHTML.AppendLiteral("<sup class=\"moz-txt-sup\">");

    aOutputString.Append(outputHTML);
    aOutputString.Append(&aInString[2], delimPos - 2);
    aOutputString.AppendLiteral("</sup>");

    glyphTextLen = delimPos /* - 1 + 1 */ ;
    MOZ_TIMER_STOP(mGlyphHitTimer);
    return PR_TRUE;
  }

  /*
   The following strings are not substituted:
   |TXT   |HTML     |Reason
   +------+---------+----------
    ->     &larr;    Bug #454
    =>     &lArr;    ditto
    <-     &rarr;    ditto
    <=     &rArr;    ditto
    (tm)   &trade;   ditto
    1/4    &frac14;  is triggered by 1/4 Part 1, 2/4 Part 2, ...
    3/4    &frac34;  ditto
    1/2    &frac12;  similar
  */
  MOZ_TIMER_STOP(mGlyphHitTimer);
  return PR_FALSE;
}
/***************************************************************************
Library-internal Interface
****************************************************************************/
// Constructor. Only touches the profiling timers: resets the ScanTXT,
// GlyphHit and total-MIME timers and starts the total-MIME timer, which is
// stopped again in the destructor.
// NOTE(review): the MOZ_TIMER_* macros are defined outside this file;
// presumably they expand to nothing unless timing is compiled in - confirm.
mozTXTToHTMLConv::mozTXTToHTMLConv()
{
MOZ_TIMER_RESET(mScanTXTTimer);
MOZ_TIMER_RESET(mGlyphHitTimer);
MOZ_TIMER_RESET(mTotalMimeTime);
MOZ_TIMER_START(mTotalMimeTime);
}
// Destructor. Stops the total-MIME timer started by the constructor and
// passes the three timers (total, ScanTXT, GlyphHit) to the timer
// log/print macros, each preceded by a label.
mozTXTToHTMLConv::~mozTXTToHTMLConv()
{
MOZ_TIMER_STOP(mTotalMimeTime);
MOZ_TIMER_DEBUGLOG(("MIME Total Processing Time: "));
MOZ_TIMER_PRINT(mTotalMimeTime);
MOZ_TIMER_DEBUGLOG(("mozTXTToHTMLConv::ScanTXT(): "));
MOZ_TIMER_PRINT(mScanTXTTimer);
MOZ_TIMER_DEBUGLOG(("mozTXTToHTMLConv::GlyphHit(): "));
MOZ_TIMER_PRINT(mGlyphHitTimer);
}
// XPCOM nsISupports boilerplate for this class, generated by macro. The
// one interface named here is mozTXTToHTMLConv itself.
NS_IMPL_ISUPPORTS1(mozTXTToHTMLConv, mozTXTToHTMLConv)
// Determines the citation (quote) level of a plain-text line.
//   line         : null-terminated line of text
//   logLineStart : in: index at which to start looking; out: index of the
//                  first character behind the recognised quote markers
// Returns the number of quote markers found.
//
// E.g. the following lines count as quote:
//     > text
//   (only with QUOTE_RECOGNITION_AGGRESSIVE:)
//     >text
//         > text
//     ] text
//     USER> text
//     USER] text
// logLineStart ends up at the position of "t" in these examples.
// Text that starts with ">From " (sendmail/mbox escaping), or with a prefix
// of it when fewer than six characters remain, is not counted and ends the
// search.
PRInt32
mozTXTToHTMLConv::CiteLevelTXT(const PRUnichar *line,
                               PRUint32& logLineStart)
{
  PRInt32 citeLevel = 0;
  const PRInt32 lineLength = nsCRT::strlen(line);

  for (;;)
  {
    PRUint32 cursor = logLineStart;

#ifdef QUOTE_RECOGNITION_AGGRESSIVE
    // Skip leading whitespace and an upper-case name in front of the mark.
    while (PRInt32(cursor) < lineLength && IsSpace(line[cursor]))
      cursor++;
    while (PRInt32(cursor) < lineLength && nsCRT::IsAsciiAlpha(line[cursor])
           && nsCRT::IsUpper(line[cursor]))
      cursor++;
    const PRBool atQuoteMark =
      PRInt32(cursor) < lineLength &&
      (line[cursor] == '>' || line[cursor] == ']');
#else
    const PRBool atQuoteMark =
      PRInt32(cursor) < lineLength && line[cursor] == '>';
#endif

    if (!atQuoteMark)
      break;

    // Step over the mark and one optional following space.
    cursor++;
    if (PRInt32(cursor) < lineLength && line[cursor] == ' ')
      cursor++;

    // sendmail/mbox
    // Placed here for performance increase
    const PRUnichar * rest = &line[logLineStart];
    // here, |logLineStart < lineLength| is always true
    PRUint32 compareLength = MinInt(6, nsCRT::strlen(rest));
    if (Substring(rest, rest + compareLength)
          .Equals(Substring(NS_LITERAL_STRING(">From "), 0, compareLength),
                  nsCaseInsensitiveStringComparator()))
      break; //XXX RFC2646

    citeLevel++;
    logLineStart = cursor;
  }

  return citeLevel;
}
// Converts plain text to HTML and appends the result to |aOutString|.
//   aInString / aInStringLength : the text to convert
//   whattodo : bit set selecting the conversions to perform
//                kURLs              - turn URLs into links
//                kGlyphSubstitution - smileys, +/-, powers, form feed
//                kStructPhrase      - *bold*, /italic/, _underline_, |code|
// Characters not consumed by one of these conversions are copied, with
// '<', '>' and '&' escaped.
//
// Changes relative to the previous version:
//   * The character behind a URL trigger (aInString[i + 1]) is only read
//     when it lies inside the input; the last character of the input is
//     treated as "not followed by a space".
//   * FindURL() is no longer called while a structured phrase is open,
//     because its result was discarded in that case anyway.
void
mozTXTToHTMLConv::ScanTXT(const PRUnichar * aInString, PRInt32 aInStringLength, PRUint32 whattodo, nsString& aOutString)
{
  PRBool doURLs = whattodo & kURLs;
  PRBool doGlyphSubstitution = whattodo & kGlyphSubstitution;
  PRBool doStructPhrase = whattodo & kStructPhrase;

  MOZ_TIMER_START(mScanTXTTimer);

  PRUint32 structPhrase_strong = 0;  // Number of currently open tags
  PRUint32 structPhrase_underline = 0;
  PRUint32 structPhrase_italic = 0;
  PRUint32 structPhrase_code = 0;

  nsAutoString outputHTML;  // moved here for performance increase

  for (PRUint32 i = 0; PRInt32(i) < aInStringLength;)
  {
    if (doGlyphSubstitution)
    {
      PRInt32 glyphTextLen;
      if (GlyphHit(&aInString[i], aInStringLength - i, i == 0, aOutString, glyphTextLen))
      {
        i += glyphTextLen;
        continue;
      }
    }

    if (doStructPhrase)
    {
      // StructPhraseHit() wants the character in front of the marker too,
      // except at the very start of the text.
      const PRUnichar * newOffset = aInString;
      PRInt32 newLength = aInStringLength;
      if (i > 0) // skip the first element?
      {
        newOffset = &aInString[i - 1];
        newLength = aInStringLength - i + 1;
      }

      switch (aInString[i]) // Performance increase
      {
        case '*':
          if (StructPhraseHit(newOffset, newLength, i == 0,
                              NS_LITERAL_STRING("*").get(), 1,
                              "b", "class=\"moz-txt-star\"",
                              aOutString, structPhrase_strong))
          {
            i++;
            continue;
          }
          break;
        case '/':
          if (StructPhraseHit(newOffset, newLength, i == 0,
                              NS_LITERAL_STRING("/").get(), 1,
                              "i", "class=\"moz-txt-slash\"",
                              aOutString, structPhrase_italic))
          {
            i++;
            continue;
          }
          break;
        case '_':
          if (StructPhraseHit(newOffset, newLength, i == 0,
                              NS_LITERAL_STRING("_").get(), 1,
                              "span" /* <u> is deprecated */,
                              "class=\"moz-txt-underscore\"",
                              aOutString, structPhrase_underline))
          {
            i++;
            continue;
          }
          break;
        case '|':
          if (StructPhraseHit(newOffset, newLength, i == 0,
                              NS_LITERAL_STRING("|").get(), 1,
                              "code", "class=\"moz-txt-verticalline\"",
                              aOutString, structPhrase_code))
          {
            i++;
            continue;
          }
          break;
      }
    }

    if (doURLs)
    {
      switch (aInString[i])
      {
        case ':':
        case '@':
        case '.':
        {
          // Performance increase: a trigger next to a space cannot be part
          // of a URL.
          const PRBool prevIsSpace = i > 0 && aInString[i - 1] == ' ';
          const PRBool nextIsSpace =
            PRInt32(i + 1) < aInStringLength && aInString[i + 1] == ' ';
          // No links inside an open structured phrase
          // (workaround for bug #19445).
          const PRBool noOpenPhrase =
            structPhrase_strong + structPhrase_italic +
            structPhrase_underline + structPhrase_code == 0;

          if (!prevIsSpace && !nextIsSpace && noOpenPhrase)
          {
            PRInt32 replaceBefore;
            PRInt32 replaceAfter;
            if (FindURL(aInString, aInStringLength, i, whattodo,
                        outputHTML, replaceBefore, replaceAfter))
            {
              // Remove the part of the URL that was already emitted and
              // replace it with the link markup.
              aOutString.Cut(aOutString.Length() - replaceBefore, replaceBefore);
              aOutString += outputHTML;
              i += replaceAfter + 1;
              continue;
            }
          }
          break;
        }
      } //switch
    }

    switch (aInString[i])
    {
      // Special symbols
      case '<':
      case '>':
      case '&':
        EscapeChar(aInString[i], aOutString);
        i++;
        break;
      // Normal characters
      default:
        aOutString += aInString[i];
        i++;
        break;
    }
  }

  MOZ_TIMER_STOP(mScanTXTTimer);
}
// Applies the text conversions selected by |whattodo| to the text parts of
// an HTML fragment and appends the result to |aOutString|.
//
// Tags ("<...>"), the complete content of anchors ("<a ...>...</a>") and
// comments ("<!--...-->") are copied through unchanged. The text between
// them is unescaped and passed to ScanTXT().
//
// Changes relative to the previous version:
//   * The characters behind a '<' are only inspected when they lie inside
//     the input.
//   * The tag name is compared as a full PRUnichar. The old (char) cast
//     dropped the high byte, so e.g. U+0141 was taken for 'A'.
//   * A tag is treated as an anchor only when the 'a' is followed by
//     whitespace or '>'. Previously every tag beginning with 'a' (<abbr>,
//     <address>, ...) was treated as an anchor, and everything up to the
//     next "</a>" - or the end of the input - was skipped.
void
mozTXTToHTMLConv::ScanHTML(nsString& aInString, PRUint32 whattodo, nsString &aOutString)
{
  // some common variables we were recalculating
  // every time inside the for loop...
  PRInt32 lengthOfInString = aInString.Length();
  const PRUnichar * uniBuffer = aInString.get();

#ifdef DEBUG_BenB_Perf
  PRTime parsing_start = PR_IntervalNow();
#endif

  for (PRInt32 i = 0; i < lengthOfInString;)
  {
    if (uniBuffer[i] == '<')  // html tag
    {
      PRUint32 start = PRUint32(i);

      const PRBool isAnchor =
        i + 2 < lengthOfInString &&
        (uniBuffer[i + 1] == 'a' || uniBuffer[i + 1] == 'A') &&
        (IsSpace(uniBuffer[i + 2]) || uniBuffer[i + 2] == '>');
      const PRBool isComment =
        i + 3 < lengthOfInString &&
        uniBuffer[i + 1] == '!' &&
        uniBuffer[i + 2] == '-' &&
        uniBuffer[i + 3] == '-';

      if (isAnchor)
      {
        // if a tag, skip until </a>
        i = aInString.Find("</a>", PR_TRUE, i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i += 4;
      }
      else if (isComment)
      {
        // if commented-out code, skip until -->
        i = aInString.Find("-->", PR_FALSE, i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i += 3;
      }
      else
      {
        // just skip tag (attributes etc.)
        i = aInString.FindChar('>', i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i++;
      }
      aOutString.Append(&uniBuffer[start], PRUint32(i) - start);
    }
    else
    {
      // Text up to the next tag (or the end of the input).
      PRUint32 start = PRUint32(i);
      i = aInString.FindChar('<', i);
      if (i == kNotFound)
        i = lengthOfInString;

      nsString tempString;
      tempString.SetCapacity(PRUint32((PRUint32(i) - start) * growthRate));
      UnescapeStr(uniBuffer, start, PRUint32(i) - start, tempString);
      ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString);
    }
  }

#ifdef DEBUG_BenB_Perf
  printf("ScanHTML time:    %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
#endif
}
/****************************************************************************
XPCOM Interface
*****************************************************************************/
// Stream conversion entry point. Not implemented: always returns
// NS_ERROR_NOT_IMPLEMENTED and touches none of its arguments.
NS_IMETHODIMP
mozTXTToHTMLConv::Convert(nsIInputStream *aFromStream,
const char *aFromType,
const char *aToType,
nsISupports *aCtxt, nsIInputStream **_retval)
{
return NS_ERROR_NOT_IMPLEMENTED;
}
// Asynchronous stream conversion entry point. Not implemented: always
// returns NS_ERROR_NOT_IMPLEMENTED and touches none of its arguments.
NS_IMETHODIMP
mozTXTToHTMLConv::AsyncConvertData(const char *aFromType,
const char *aToType,
nsIStreamListener *aListener, nsISupports *aCtxt) {
return NS_ERROR_NOT_IMPLEMENTED;
}
// Stream listener callback. Not implemented: always returns
// NS_ERROR_NOT_IMPLEMENTED and reads nothing from |inStr|.
NS_IMETHODIMP
mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsISupports *ctxt,
nsIInputStream *inStr, PRUint32 sourceOffset,
PRUint32 count)
{
return NS_ERROR_NOT_IMPLEMENTED;
}
// Request observer callback. Not implemented: always returns
// NS_ERROR_NOT_IMPLEMENTED.
NS_IMETHODIMP
mozTXTToHTMLConv::OnStartRequest(nsIRequest* request, nsISupports *ctxt)
{
return NS_ERROR_NOT_IMPLEMENTED;
}
// Request observer callback. Not implemented: always returns
// NS_ERROR_NOT_IMPLEMENTED and ignores |aStatus|.
NS_IMETHODIMP
mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsISupports *ctxt,
nsresult aStatus)
{
return NS_ERROR_NOT_IMPLEMENTED;
}
// XPCOM wrapper around the internal CiteLevelTXT().
//   line         : null-terminated line of text
//   logLineStart : in/out index, updated by the internal implementation
//   _retval      : receives the citation level
// Returns NS_ERROR_NULL_POINTER if any argument is null, otherwise NS_OK.
NS_IMETHODIMP
mozTXTToHTMLConv::CiteLevelTXT(const PRUnichar *line, PRUint32 *logLineStart,
                               PRUint32 *_retval)
{
  if (!line || !logLineStart || !_retval)
    return NS_ERROR_NULL_POINTER;

  const PRInt32 level = CiteLevelTXT(line, *logLineStart);
  *_retval = level;
  return NS_OK;
}
// XPCOM wrapper around the internal ScanTXT().
//   text     : null-terminated plain text to convert
//   whattodo : conversion flags, passed through unchanged
//   _retval  : receives a newly allocated, null-terminated HTML string
// Returns NS_ERROR_NULL_POINTER when |_retval| is null and
// NS_ERROR_OUT_OF_MEMORY when the result cannot be allocated. An empty
// input yields a copy of the (empty) input.
//
// Changes relative to the previous version: |_retval| is checked for null,
// and the empty-input path now reports an allocation failure instead of
// returning NS_OK with a null result.
NS_IMETHODIMP
mozTXTToHTMLConv::ScanTXT(const PRUnichar *text, PRUint32 whattodo,
                          PRUnichar **_retval)
{
  NS_ENSURE_ARG(text);
  if (!_retval)
    return NS_ERROR_NULL_POINTER;

  PRInt32 inLength = nsCRT::strlen(text);
  NS_ASSERTION(inLength, "ScanTXT passed 0 length string");

  if (inLength == 0)
  {
    *_retval = nsCRT::strdup(text);
  }
  else
  {
    // FIX ME!!!
    nsString outString;
    // by setting a large capacity up front, we save time
    // when appending characters to the output string because we don't
    // need to reallocate and re-copy the characters already in the out String.
    outString.SetCapacity(PRUint32(inLength * growthRate));
    ScanTXT(text, inLength, whattodo, outString);
    *_retval = ToNewUnicode(outString);
  }

  return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}
// XPCOM wrapper around the internal ScanHTML().
//   text     : null-terminated HTML fragment to process
//   whattodo : conversion flags, passed through unchanged
//   _retval  : receives a newly allocated, null-terminated result string
// Returns NS_ERROR_OUT_OF_MEMORY when the result cannot be allocated.
NS_IMETHODIMP
mozTXTToHTMLConv::ScanHTML(const PRUnichar *text, PRUint32 whattodo,
                           PRUnichar **_retval)
{
  NS_ENSURE_ARG(text);

  // FIX ME!!!
  // Note that this makes an extra copy of the entire input buffer.
  nsString source (text);

  nsString converted;
  converted.SetCapacity(PRUint32(source.Length() * growthRate));

  ScanHTML(source, whattodo, converted);

  *_retval = ToNewUnicode(converted);
  if (!*_retval)
    return NS_ERROR_OUT_OF_MEMORY;
  return NS_OK;
}
// Factory function: creates a new mozTXTToHTMLConv, stores it in |*aConv|
// and adds a reference on behalf of the caller.
// Returns NS_ERROR_NULL_POINTER when |aConv| is null and
// NS_ERROR_OUT_OF_MEMORY when the allocation yields null.
nsresult
MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv)
{
  NS_PRECONDITION(aConv != nsnull, "null ptr");
  if (!aConv)
    return NS_ERROR_NULL_POINTER;

  mozTXTToHTMLConv* converter = new mozTXTToHTMLConv();
  *aConv = converter;
  if (!converter)
    return NS_ERROR_OUT_OF_MEMORY;

  NS_ADDREF(converter);
  // return (*aConv)->Init();
  return NS_OK;
}