RetroZilla/parser/htmlparser/public/nsHTMLTokens.h

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

/**
 * MODULE NOTES:
 * @update  gess 4/1/98
 *
 * This file contains the declarations for all the HTML specific token types that
 * our DTDs understand. In fact, the same set of token types is used for XML.
 * Currently we have tokens for text, comments, start and end tags, entities,
 * attributes, style, script and skipped content. Whitespace and newlines also
 * have their own token types, but don't count on them to stay forever.
 *
 * If you're looking for the html tags, they're in a file called nsHTMLTag.h/cpp.
 *
 * Most of the token types have a similar API. They have methods to get the type
 * of token (GetTokenType); those that represent HTML tags also have a method to
 * get the tag type (GetTypeID). In addition, most have a method (Consume) that
 * causes the token to help in the parsing process. We've also thrown in a
 * few standard debugging methods.
 */

#ifndef HTMLTOKENS_H
#define HTMLTOKENS_H

#include "nsToken.h"
#include "nsHTMLTags.h"
#include "nsString.h"
#include "nsScannerString.h"

class nsScanner;

  /*******************************************************************
   * This enum defines the set of token types that we currently support.
   *******************************************************************/

enum eHTMLTokenTypes {
  // Numbering starts at 0 for "unknown"; every later enumerator is one
  // greater than the one before it.
  eToken_unknown = 0,
  eToken_start = 1,
  eToken_end,
  eToken_comment,
  eToken_entity,
  eToken_whitespace,
  eToken_newline,
  eToken_text,
  eToken_attribute,
  eToken_instruction,
  eToken_cdatasection,
  eToken_doctypeDecl,
  eToken_markupDecl,
  eToken_last // make sure this stays the last token...
};

/**
 * Category identifiers. eHTMLCategory_unknown is 0 and the remaining
 * enumerators are numbered sequentially in declaration order.
 * NOTE(review): presumably used to classify HTML elements -- confirm
 * against the users of this enum.
 */
enum eHTMLCategory {
  eHTMLCategory_unknown = 0,

  eHTMLCategory_inline,
  eHTMLCategory_block,
  eHTMLCategory_blockAndInline,
  eHTMLCategory_list,

  eHTMLCategory_table,
  eHTMLCategory_tablepart,
  eHTMLCategory_tablerow,
  eHTMLCategory_tabledata,

  eHTMLCategory_head,
  eHTMLCategory_html,
  eHTMLCategory_body,
  eHTMLCategory_form,
  eHTMLCategory_options,
  eHTMLCategory_frameset,
  eHTMLCategory_text
};


// Free-function declarations; the definitions are not part of this header.
// NOTE(review): judging by the names, the two Consume* helpers presumably
// read a quoted string / attribute text from aScanner into aString, and
// GetTagName presumably maps a tag id to its name -- confirm in the
// implementation file.
nsresult ConsumeQuotedString(PRUnichar aChar, nsString& aString,
                             nsScanner& aScanner);
nsresult ConsumeAttributeText(PRUnichar aChar, nsString& aString,
                              nsScanner& aScanner);
const PRUnichar* GetTagName(PRInt32 aTag);
//PRInt32     FindEntityIndex(nsString& aString,PRInt32 aCount=-1);


/**
 *  This declares the basic token type used in the HTML DTDs.
 *  @update  gess 3/25/98
 */
class CHTMLToken : public CToken {
public:
  virtual ~CHTMLToken();
  CHTMLToken(eHTMLTags aTag);

  // Default container-info behaviour: the getter always reports
  // eFormUnknown and the setter discards its argument.
  virtual eContainerInfo GetContainerInfo() const { return eFormUnknown; }
  virtual void SetContainerInfo(eContainerInfo aInfo) {}
};

/**
 *  This declares start tokens, which always take the form <xxxx>.
 *  This class also knows how to consume related attributes.
 *
 *  @update  gess 3/25/98
 */
class CStartToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  // Construction: from a tag id (defaulting to eHTMLTag_unknown), from a
  // string, or from a name together with a tag id.
  CStartToken(eHTMLTags aTag = eHTMLTag_unknown);
  CStartToken(const nsAString& aString);
  CStartToken(const nsAString& aName, eHTMLTags aTag);

  // Consume takes part in parsing; GetTypeID reports the tag type and
  // GetTokenType the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTypeID();
  virtual PRInt32 GetTokenType();

  // Accessors for the "empty" flag.
  virtual PRBool IsEmpty();
  virtual void SetEmpty(PRBool aValue);

  // String value and source-text accessors.
  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

  // Well-formedness state for start tags.
  virtual eContainerInfo GetContainerInfo() const { return mContainerInfo; }

  // The container info is only stored while the current value is still
  // eFormUnknown; once it holds anything else, later calls are ignored.
  virtual void SetContainerInfo(eContainerInfo aContainerInfo) {
    if (mContainerInfo != eFormUnknown) {
      return;
    }
    mContainerInfo = aContainerInfo;
  }

  // True exactly when the stored container info equals eWellFormed.
  virtual PRBool IsWellFormed() const {
    return mContainerInfo == eWellFormed;
  }

  nsString mTextValue;

protected:
  eContainerInfo mContainerInfo;
  PRPackedBool mEmpty;
#ifdef DEBUG
  // Only present in DEBUG builds.
  PRPackedBool mAttributed;
#endif
};


/**
 *  This declares end tokens, which always take the
 *  form </xxxx>. This class also knows how to consume
 *  related attributes.
 *
 *  @update  gess 3/25/98
 */
class CEndToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  // Construction: from a tag id, from a string, or from a name together
  // with a tag id.
  CEndToken(eHTMLTags aTag);
  CEndToken(const nsAString& aString);
  CEndToken(const nsAString& aName, eHTMLTags aTag);

  // Consume takes part in parsing; GetTypeID reports the tag type and
  // GetTokenType the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTypeID();
  virtual PRInt32 GetTokenType();

  // String value and source-text accessors.
  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

protected:
  nsString mTextValue;
};


/**
 *  This declares comment tokens. Comments are not usually
 *  thought of as tokens, but we treat them that way
 *  here so that the parser can have a consistent view
 *  of all tokens.
 *
 *  @update  gess 3/25/98
 */
class CCommentToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CCommentToken();
  CCommentToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();
  virtual void AppendSourceTo(nsAString& anOutputString);

  // Two comment-consumption routines.
  // NOTE(review): presumably selected by the parser mode (strict vs.
  // quirks) -- confirm in the implementation.
  nsresult ConsumeStrictComment(nsScanner& aScanner);
  nsresult ConsumeQuirksComment(nsScanner& aScanner);

protected:
  // Comment text without the MDO & MDC delimiters.
  nsScannerSubstring mComment;
  // Comment text including the MDO & MDC delimiters.
  nsScannerSubstring mCommentDecl;
};


/**
 *  This class declares entity tokens, which always take
 *  the form &xxxx;. This class also offers a few utility
 *  methods that allow you to easily reduce entities.
 *
 *  @update  gess 3/25/98
 */
class CEntityToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CEntityToken();
  CEntityToken(const nsAString& aString);

  virtual PRInt32 GetTokenType();

  // Instance form of the entity translation; a static overload taking an
  // explicit integer value is declared further down.
  PRInt32 TranslateToUnicodeStr(nsString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);

  // Static helpers, callable without a CEntityToken instance.
  static nsresult ConsumeEntity(PRUnichar aChar,
                                nsString& aString,
                                nsScanner& aScanner);
  static PRInt32 TranslateToUnicodeStr(PRInt32 aValue, nsString& aString);

  // String value and source-text accessors.
  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

protected:
  nsString mTextValue;
};


/**
 *  Whitespace tokens are used where whitespace can be
 *  detected as distinct from text. This allows us to
 *  easily skip leading/trailing whitespace when desired.
 *
 *  @update  gess 3/25/98
 */
class CWhitespaceToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CWhitespaceToken();
  CWhitespaceToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  nsScannerSharedSubstring mTextValue;
};

/**
 *  Text tokens contain the normalized form of html text.
 *  These tokens are guaranteed not to contain entities,
 *  start or end tags, or newlines.
 *
 *  @update  gess 3/25/98
 */
class CTextToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CTextToken();
  CTextToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();

  // Text accessors.
  virtual PRInt32 GetTextLength();
  virtual void CopyTo(nsAString& aStr);
  virtual const nsSubstring& GetStringValue();

  // Two ways to bind the token's text: a scanner iterator range, or a
  // string.
  virtual void Bind(nsScanner* aScanner,
                    nsScannerIterator& aStart,
                    nsScannerIterator& aEnd);
  virtual void Bind(const nsAString& aStr);

  // Character-data consumption. Both routines take the end tag name and
  // report extra state through their final PRBool& out-parameter.
  nsresult ConsumeCharacterData(PRBool aConservativeConsume,
                                PRBool aIgnoreComments,
                                nsScanner& aScanner,
                                const nsAString& aEndTagName,
                                PRInt32 aFlag,
                                PRBool& aFlushTokens);

  nsresult ConsumeParsedCharacterData(PRBool aDiscardFirstNewline,
                                      PRBool aConservativeConsume,
                                      nsScanner& aScanner,
                                      const nsAString& aEndTagName,
                                      PRInt32 aFlag,
                                      PRBool& aFound);

protected:
  nsScannerSubstring mTextValue;
};


/**
 *  CDATASection tokens contain raw unescaped text content delimited by
 *  a ![CDATA[ and ]].
 *  XXX Not really a HTML construct - maybe we need a separation
 *
 *  @update  vidur 11/12/98
 */
class CCDATASectionToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  // The tag id defaults to eHTMLTag_unknown when omitted.
  CCDATASectionToken(eHTMLTags aTag = eHTMLTag_unknown);
  CCDATASectionToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  nsString mTextValue;
};


/**
 *  Declaration tokens contain raw unescaped text content (not really, but
 *  right now we use this only for view source).
 *  XXX Not really a HTML construct - maybe we need a separation
 *
 */
class CMarkupDeclToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CMarkupDeclToken();
  CMarkupDeclToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  nsScannerSubstring mTextValue;
};


/**
 *  Attribute tokens are used to contain attribute key/value
 *  pairs wherever they may occur. Typically, they should
 *  occur only in start tokens. However, we may expand that
 *  ability when XML tokens become commonplace.
 *
 *  @update  gess 3/25/98
 */
class CAttributeToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  // Construction: empty, from a single string, or from a key plus a string.
  CAttributeToken();
  CAttributeToken(const nsAString& aString);
  CAttributeToken(const nsAString& aKey, const nsAString& aString);
  ~CAttributeToken() {}

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();

  // Key access: GetKey returns mTextKey viewed through AsString().
  const nsSubstring& GetKey() { return mTextKey.AsString(); }
  virtual void SetKey(const nsAString& aKey);
  virtual void BindKey(nsScanner* aScanner,
                       nsScannerIterator& aStart,
                       nsScannerIterator& aEnd);

  // Value access: GetValue returns mTextValue viewed through str().
  const nsSubstring& GetValue() { return mTextValue.str(); }
  virtual const nsSubstring& GetStringValue();

  // Source-text accessors.
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

  PRPackedBool mHasEqualWithoutValue;

protected:
#ifdef DEBUG
  // Only present in DEBUG builds.
  PRPackedBool mLastAttribute;
#endif
  nsScannerSharedSubstring mTextValue;
  nsScannerSubstring mTextKey;
};


/**
 *  Newline tokens contain, you guessed it, newlines.
 *  They consume newline (CR/LF) either alone or in pairs.
 *
 *  @update  gess 3/25/98
 */
class CNewlineToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CNewlineToken();

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();

  // Class-level (static) allocation and release hooks.
  // NOTE(review): presumably manage a shared newline string returned by
  // GetStringValue -- confirm in the implementation.
  static void AllocNewline();
  static void FreeNewline();
};


/**
 *  Instruction tokens represent processing instructions,
 *  i.e. markup that starts with "<?".
 *
 *  @update  gess 3/25/98
 */
class CInstructionToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  CInstructionToken();
  CInstructionToken(const nsAString& aString);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  nsString mTextValue;
};


/**
 * This token is generated by the HTML and Expat tokenizers
 * when they see the doctype declaration ("<!DOCTYPE ... >")
 *
 */

class CDoctypeDeclToken : public CHTMLToken {
  CTOKEN_IMPL_SIZEOF

public:
  // In both constructors the tag id defaults to eHTMLTag_unknown.
  CDoctypeDeclToken(eHTMLTags aTag = eHTMLTag_unknown);
  CDoctypeDeclToken(const nsAString& aString,
                    eHTMLTags aTag = eHTMLTag_unknown);

  // Consume takes part in parsing; GetTokenType reports the token type.
  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aMode);
  virtual PRInt32 GetTokenType();

  // Unlike most token classes in this header, this one also declares a
  // setter for its string value.
  virtual const nsSubstring& GetStringValue();
  virtual void SetStringValue(const nsAString& aStr);

protected:
  nsString mTextValue;
};

#endif