﻿#line 1 "MiniXMLParser.upp"
// mini XML parser
// $Id$

#include <stdio.h>
#include <ccc/xml/MiniXMLParser.h>
#include <ccc/xml/XMLParser.h>
#include <ccc/base/cstring.h>

CCC_NAMESPACE_START(CCC);

// Version 1 of the CODEX DOM has additional DocumentType methods.
// These methods are not supported by the Version 2 CODEX DOM.
//#define CODEX_V1_DOM

#ifndef CODEX_SKIPSP
#define CODEX_KEEPSP
#endif

#ifndef TEST
#define print(a)
#define prints(a, b)
#else /* TEST */
// Debug trace helper (TEST builds only; otherwise print() is an empty macro).
// Writes msg followed by a newline to stdout.
//
// msg is taken as a pointer to const because every call site passes a
// string literal, and converting a literal to a plain char* is not
// allowed in modern C++.
void
print(const char* msg)
{
  printf("%s\n", msg);
}

// Debug trace helper (TEST builds only; otherwise prints() is an empty macro).
// Writes msg, then the characters of s one by one via putchar(), then a
// newline.
//
// msg  label printed in front of the string (normally a string literal,
//      hence pointer to const)
// s    string to dump; may be 0, in which case only the label and the
//      newline are printed (readS2 passes 0 when a DOCTYPE literal is
//      missing)
void
prints(const char* msg, DOMString* s)
{
  printf("%s", msg);
  if (s)
  {
    Size len = s->getLength();
    DOMChar* c = s->getStartPtr();
    while (len--)
    {
      putchar(*c++);
    }
  }
  printf("\n");
}
#endif /* TEST */

// ------------------------------------------------------------
// class MiniXMLParser
// Construct an idle parser: the input flow, the pushback reader and the
// document pointer all start out null and are assigned by read().
MiniXMLParser::MiniXMLParser()
{
  this->document = 0;
  this->reader = 0;
  this->in = 0;
}

// Destroy the parser, releasing the pushback reader allocated by read().
// The document and the input flow are not deleted here.
MiniXMLParser::~MiniXMLParser()
{
  delete this->reader;
}

// Parse one XML document from an input flow.
//
// dom_implementation  used to create the empty Document that is filled in
// in                  flow the XML text is read from (wrapped in a
//                     PushbackIFilter so characters can be pushed back)
//
// Returns the populated Document.  If an IOException escapes from one of
// the parsing states, the partially built document is deleted,
// "stream error." is written to stderr and 0 is returned.
Document*
MiniXMLParser::read(DOMImplementation* dom_implementation, IFlow* in)
{
  MiniXMLParser::in = in;
  // Release the reader of a previous read() call; without this every
  // additional call would leak one PushbackIFilter.
  delete reader;
  reader = 0;
  reader = new PushbackIFilter<DOMChar>(in);
  document = dom_implementation->createDocument();
  try
  {
    readS1();	// read XML declaration
    readS2();	// read DTD
    readS3(document);	// read XML body
  }
  catch (IOException&)	// by reference: no copy, no slicing
  {
    delete document;
    document = 0;	// do not keep a dangling pointer in the member
    fprintf(stderr, "stream error.\n");
    return 0;
  }
  return document;
}

// Predicate: is c one of the four white space characters
// space (0x20), tab (0x09), carriage return (0x0d) or line feed (0x0a)?
bool
MiniXMLParser::whiteSpP(DOMChar c)
{
  return (c == 0x20 /*   */)
      || (c == 0x09 /* \t */)
      || (c == 0x0d /* \r */)
      || (c == 0x0a /* \n */);
}

// Skip white space: consume characters while whiteSpP() accepts them and
// push the first other character back onto the reader.
//
// Reaching the end of the input while skipping is not an error and is
// silently absorbed; any other IOException is passed on to the caller.
void
MiniXMLParser::readSp() CCC_RAISES(IOException)
{
  try
  {
    for (;;)
    {
      DOMChar c = getChar();
      if (!whiteSpP(c))
      {
        ungetChar(c);
        break;
      }
    }
  }
  catch (IOException& se)
  {
    if (se.errorNum() != IOException::READ_BEYOND_THE_EOF)
    {
      // Rethrow the original exception object; "throw se" would throw a
      // copy of static type IOException and lose any derived type.
      throw;
    }
  }
}
// Predicate: is c a line break character, i.e. carriage return (0x0d)
// or line feed (0x0a)?
bool
MiniXMLParser::newLineP(DOMChar c)
{
  return (c == 0x0d /* \r */) || (c == 0x0a /* \n */);
}

// Skip line breaks: consume characters while newLineP() accepts them and
// push the first other character back onto the reader.
//
// Reaching the end of the input while skipping is not an error and is
// silently absorbed; any other IOException is passed on to the caller.
void
MiniXMLParser::readNl() CCC_RAISES(IOException)
{
  try
  {
    for (;;)
    {
      DOMChar c = getChar();
      if (!newLineP(c))
      {
        ungetChar(c);
        break;
      }
    }
  }
  catch (IOException& se)
  {
    if (se.errorNum() != IOException::READ_BEYOND_THE_EOF)
    {
      // Rethrow the original exception object; "throw se" would throw a
      // copy of static type IOException and lose any derived type.
      throw;
    }
  }
}

// State S1: the optional XML declaration.
// Leading white space is skipped; if the input then starts with "<?xml"
// the rest of the declaration is handed to readS11().  When the marker is
// absent nothing is consumed beyond the white space and no error is
// reported.
void
MiniXMLParser::readS1() CCC_RAISES(IOException)
{
  print("S1");
  readSp();
  static DOMChar xml_decl_open[] = { 0x3c /* < */, 0x3f /* ? */, 0x78 /* x */, 0x6d /* m */, 0x6c /* l */, 0x00, };
  if (readStr(xml_decl_open))
  {
#if 0
    // Disabled alternative: swallow the declaration as one opaque string.
    static DOMChar xml_decl_close[] = { 0x3f /* ? */, 0x3e /* > */, 0x00, };
    DOMString* xml_decl = getString3(xml_decl_close);
    //printf("XML:%s\n", xml_decl);
    delete xml_decl;
#else
    readS11();
#endif
  }
  // else: ERROR: NO XML decl (currently tolerated)
}

// State S11: the body of the XML declaration, i.e. everything after
// "<?xml" up to and including "?>".
//
// Each name found is turned into an Attr and stored in the document's
// XML-declaration node map; a following ="value" or ='value' sets the
// value of the most recently created Attr.  White space between tokens is
// skipped.  Malformed input makes the function return early without
// reporting an error.
//
// Compared with the straightforward loop this version releases the token
// string on every path (the early error returns used to leak it) and
// rejects an '=' that is not preceded by a name instead of dereferencing a
// null Attr.
void
MiniXMLParser::readS11() CCC_RAISES(IOException)
{
  NamedNodeMap* xml_decl_node_map = (NamedNodeMap*)document->getXMLDeclNodeMap();

  Attr* attr = 0;	// attribute whose value is expected next
  for (;;)
  {
    static DOMChar delm[] = { 0x20 /*   */, 0x09 /* \t */, 0x0d /* \r */, 0x0a /* \n */, 0x3d /* = */, 0x3f /* ? */, 0x00, };
    DOMString* r = getString2(delm);
    if (r->getLength() > 0)
    {
      prints("ATTRIBUTE_NAME: ", r);
      // attribute="..."
      attr = document->createAttribute(r);
      /* const Node* old_attr = */ xml_decl_node_map->setNamedItem(attr);
      delete r;
      continue;
    }
    // Empty token: the next character is one of the delimiters.
    delete r;

    DOMChar c = getChar();
    if (c == 0x3f /* ? */)
    {
      c = getChar();
      if (c == 0x3e /* > */)
      {
        readSp();	// end of the declaration
      }
      // anything else after '?' is an ERROR
      return;
    }
    else if (c == 0x3d /* = */)
    {
      if (!attr)
      {
        return;	// ERROR: value without a preceding name
      }
      DOMChar quote = getChar();
      if ((quote != 0x22 /* \" */) && (quote != 0x27 /* \' */))
      {
        return;	// ERROR
      }
      // The value runs up to the matching quote character.
      DOMChar value_delm[] = { quote, 0x00, };
      DOMString* r2 = getString2(value_delm);
      prints("ATTRIBUTE_VALUE: ", r2);
      attr->setValue(r2);
      delete r2;
      c = getChar();	// closing quote
      if (c != quote)
      {
        return;	// ERROR
      }
      readSp();
    }
    // any other delimiter (white space) is simply skipped
  }
}

// State S2: the optional document type declaration.
//
// Accepted forms:
//   '<!DOCTYPE' S Name S? ([ ... ]) >
//   '<!DOCTYPE' S Name S 'SYSTEM' S SystemLiteral S? ([ ... ]) >
//   '<!DOCTYPE' S Name S 'PUBLIC' S PubidLiteral S SystemLiteral S? ([ ... ]) >
// plus the non-standard '<!DOCTYPE' S? >.
//
// Without CODEX_V1_DOM the parsed name and literals are only traced and
// then discarded; with CODEX_V1_DOM they are stored in the document's
// DocumentType.  Malformed input ends the state without reporting an
// error.
//
// The name string is now released on the empty-name path as well (it used
// to leak there), and missing literals are no longer passed to prints().
void
MiniXMLParser::readS2() CCC_RAISES(IOException)
{
#ifdef CODEX_V1_DOM
  DocumentType* dtd = document->getDoctype();
  assert(dtd);
#endif /* CODEX_V1_DOM */
  print("S2");
  readSp();
  static DOMChar doctype[] = { 0x3c /* < */, 0x21 /* ! */, 0x44 /* D */, 0x4f /* O */, 0x43 /* C */, 0x54 /* T */, 0x59 /* Y */, 0x50 /* P */, 0x45 /* E */, 0x00, };
  if (!readStr(doctype))
  {
    return;	// It's OK, no DOCTYPE
  }

  static DOMChar delimits[] = { 0x20 /*   */, 0x09 /* \t */, 0x0d /* \r */, 0x0a /* \n */, 0x5b /* [ */, 0x3e /* > */, 0x00, };

  DOMString* name = getString2(delimits);
  if (name->getLength() == 0)
  {
    // '<!DOCTYPE' S? >
    // This type of DOCTYPE declaration is not standard XML,
    // but this parser can handle it.
    delete name;	// nothing was parsed; do not leak the empty string
    readSp();
    DOMChar c = getChar();
    if (c == 0x3e /* > */)
    {
      return;	// ACCEPT
    }
    return;	// ERROR
  }
#ifdef CODEX_V1_DOM
  dtd->setName(name);
#endif /* CODEX_V1_DOM */
  prints("DOCTYPE: name: ", name);
  delete name;

  readSp();
  while (true)
  {
    DOMChar c = getChar();
    if (c == 0x3e /* > */)
    {
      break;	// end of the declaration
    }
    else if (c == 0x5b /* [ */)
    {
      // internal subset
#ifdef CODEX_V1_DOM
      dtd->setCategory(DocumentType::CATEGORY_LOCAL);
#endif /* CODEX_V1_DOM */
      readS21();
      readSp();
      c = getChar();
      if (c != 0x5d /* ] */)
      {
        break;	// ERROR
      }
    }
    else if (c == 0x53 /* S */)	// may be SYSTEM
    {
      static DOMChar ystem[] = { 0x59 /* Y */, 0x53 /* S */, 0x54 /* T */, 0x45 /* E */, 0x4d /* M */, 0x00, };
      if (!readStr(ystem))
      {
        break; // ERROR
      }
#ifdef CODEX_V1_DOM
      dtd->setCategory(DocumentType::CATEGORY_SYSTEM);
#endif /* CODEX_V1_DOM */
      readSp();
      c = getChar();
      DOMString* system_literal = 0;
      if (c == 0x22 /* " */)
      {
        static DOMChar delm[] = { 0x22 /* \" */, 0x00, };
        system_literal = getString2(delm);
      }
      else if (c == 0x27 /* \' */)
      {
        static DOMChar delm[] = { 0x27 /* ' */, 0x00, };
        system_literal = getString2(delm);
      }
#ifdef CODEX_V1_DOM
      if (system_literal)
      {
        dtd->setSystemLiteral(system_literal);
      }
#endif /* CODEX_V1_DOM */
      // NOTE(review): one character is consumed here even when no opening
      // quote was found above - confirm whether that is intended.
      c = getChar();	// read ", '
      if (system_literal)
      {
        prints("DOCTYPE-SYSTEM: ", system_literal);
      }
      delete system_literal;
      readSp();
    }
    else if (c == 0x50 /* P */)	// may be PUBLIC
    {
      static DOMChar ublic[] = { 0x55 /* U */, 0x42 /* B */, 0x4c /* L */, 0x49 /* I */, 0x43 /* C */, 0x00, };
      if (!readStr(ublic))
      {
        break; // ERROR
      }
#ifdef CODEX_V1_DOM
      dtd->setCategory(DocumentType::CATEGORY_PUBLIC);
#endif /* CODEX_V1_DOM */
      readSp();
      DOMString* pubid_literal = 0;
      c = getChar();
      if (c == 0x22 /* " */)
      {
        static DOMChar delm[] = { 0x22 /* \" */, 0x00, };
        pubid_literal = getString2(delm);
      }
      else if (c == 0x27 /* \' */)
      {
        static DOMChar delm[] = { 0x27 /* ' */, 0x00, };
        pubid_literal = getString2(delm);
      }
#ifdef CODEX_V1_DOM
      if (pubid_literal)
      {
        dtd->setPubidLiteral(pubid_literal);
      }
#endif /* CODEX_V1_DOM */
      c = getChar();	// read ", '
      readSp();
      DOMString* system_literal = 0;
      c = getChar();
      if (c == 0x22 /* " */)
      {
        static DOMChar delm[] = { 0x22 /* \" */, 0x00, };
        system_literal = getString2(delm);
      }
      else if (c == 0x27 /* \' */)
      {
        static DOMChar delm[] = { 0x27 /* ' */, 0x00, };
        system_literal = getString2(delm);
      }
#ifdef CODEX_V1_DOM
      if (system_literal)
      {
        dtd->setSystemLiteral(system_literal);
      }
#endif /* CODEX_V1_DOM */
      c = getChar();	// read ", '
      print("DOCTYPE-PUBLIC: ");
      if (pubid_literal)
      {
        prints("  PUBID LITERAL: ", pubid_literal);
      }
      if (system_literal)
      {
        prints("  SYSTEM LITERAL: ", system_literal);
      }
      delete pubid_literal;
      delete system_literal;
      readSp();
    }
    else
    {
      break;	// ERROR
    }
  }
}

// State S21: the internal DTD subset between '[' and ']'.
//
// Repeatedly reads a token delimited by white space or ']'.  A token
// starting with '<' is a markup declaration whose body runs up to the
// next '>'; a token starting with '%' is a parameter entity reference.
// Both are only traced, not stored.  The loop ends on an empty token
// (normally because ']' is next), on an empty markup body, or on a token
// that starts with any other character.
//
// The markup token is released on every exit path; the empty-body exit
// used to leak it.
void
MiniXMLParser::readS21() CCC_RAISES(IOException)
{
  print("S21");
  static DOMChar delimits[] = { 0x20 /*   */, 0x09 /* \t */, 0x0d /* \r */, 0x0a /* \n */, 0x5d /* ] */, 0x00, };
  for (;;)
  {
    readSp();
    DOMString* markup = getString2(delimits);
    if (markup->getLength() == 0)
    {
      delete markup;
      break;
    }
    bool continue_p = true;
    DOMChar c = (*markup)[0];
    if (c == 0x3c /* < */)
    {
      // markupdecl
      readSp();
      static DOMChar markupdelm[] = { 0x3e /* > */, 0x00, };
      DOMString* body = getString2(markupdelm);
      if (body->getLength() == 0)
      {
        continue_p = false;
      }
      else
      {
        /* DOMChar c = */ getChar();	// read '>'
        print("DOCTYPE-MARKUPDECL:");
        prints("  MARKUP: ", markup);
        prints("  BODY: ", body);
      }
      delete body;
    }
    else if (c == 0x25 /* % */)
    {
      // PEReference
      prints("DOCTYPE-PEREFERCE: ", markup);
    }
    else
    {
      continue_p = false;	// ERROR
    }
    delete markup;
    if (!continue_p)
    {
      break;
    }
  }
}

// State S3: one element start tag (or empty-element tag).
//
// parent  node the new element is appended to
//
// Expects '<' after optional white space.  The first name read becomes the
// element (created through the document and appended to parent); every
// further name becomes an attribute, and a following ="value" or ='value'
// sets that attribute's value.  '>' hands the element content to readS4();
// "/>" ends an empty element.  Malformed input makes the function return
// early without reporting an error.
//
// Differences from the original loop:
//  - the token string and a pending attribute name are released on every
//    path (error returns and value-less attributes used to leak them);
//  - '=' or '>' before an element name, and '=' without a pending
//    attribute name, are treated as errors instead of dereferencing null;
//  - attribute values may be quoted with ' as well as ", as the XML
//    grammar allows and as readS11() already accepts.
void
MiniXMLParser::readS3(Node* parent) CCC_RAISES(IOException)
{
  print("S3");
  readSp();
  DOMChar c = getChar();
  if (c != 0x3c /* < */)
  {
    assert(false);
    return;	// WARN
  }
  Element* element = 0;	// 0 until the element name has been read
  DOMString* attr_name = 0;	// name still waiting for its value (owned)
  for (;;)
  {
    static DOMChar delm[] = { 0x20 /*   */, 0x09 /* \t */, 0x0d /* \r */, 0x0a /* \n */, 0x3d /* = */, 0x2f /* / */, 0x3e /* > */, 0x00, };
    DOMString* r = getString2(delm);
    if (r->getLength() > 0)
    {
      if (!element)
      {
        prints("ELEMENT: ", r);
        element = document->createElement(r);
        parent->appendChild(element);
        delete r;
      }
      else
      {
        prints("ATTRIBUTE_NAME: ", r);
        // attribute="..."
        Attr* attr = document->createAttribute(r);
        Attr* old_attr = element->setAttributeNode(attr);
        delete old_attr;
        delete attr_name;	// previous name never received a value
        attr_name = r;	// keep the name until its value arrives
      }
      continue;
    }
    // Empty token: the next character is one of the delimiters.
    delete r;

    DOMChar d = getChar();
    if (d == 0x3e /* > */)
    {
      delete attr_name;
      if (!element)
      {
        return;	// ERROR: tag without a name
      }
      // read sub elements
      readS4(element);
      return;
    }
    else if (d == 0x2f /* / */)
    {
      delete attr_name;
      d = getChar();
      if (d == 0x3e /* > */)
      {
        // empty tag
        print("EMPTY_ELEMENT");
        readSp();
      }
      // anything else after '/' is an ERROR
      return;
    }
    else if (d == 0x3d /* = */)
    {
      if (!element || !attr_name)
      {
        return;	// ERROR: value without element or attribute name
      }
      DOMChar quote = getChar();
      if ((quote != 0x22 /* \" */) && (quote != 0x27 /* \' */))
      {
        delete attr_name;
        return;	// ERROR
      }
      // The value runs up to the matching quote character.
      DOMChar value_delm[] = { quote, 0x00, };
      DOMString* r2 = getString2(value_delm);
      prints("ATTRIBUTE_VALUE: ", r2);
      element->setAttribute(attr_name, r2);
      delete attr_name;
      attr_name = 0;
      delete r2;

      d = getChar();	// closing quote
      if (d != quote)
      {
        return;	// ERROR
      }
    }
    // any other delimiter (white space) is simply skipped
  }
}

// State S4: the content of an element, up to and including its end tag.
//
// parent  element the content nodes are appended to
//
// After skipping leading line breaks (CODEX_KEEPSP) or white space, the
// loop dispatches on what follows:
//   "</"        end tag: handled by readS5(), then the state ends
//   "<!--"      comment, appended as a Comment node
//   "<![" ...   CDATA section, appended as a CDATASection node
//   "<?"        processing instruction, appended as ProcessingInstruction
//   "<" other   child element, handled by readS3()
//   other       character data, appended as a Text node
// Unrecognised "<!" constructs end the state without reporting an error.
void
MiniXMLParser::readS4(Node* parent) CCC_RAISES(IOException)
{
  print("S4");
#ifdef CODEX_KEEPSP
  readNl();
#else /* CODEX_KEEPSP */
  readSp();
#endif /* CODEX_KEEPSP */
  for (;;)
  {
    DOMChar head = getChar();
    if (head != 0x3c /* < */)
    {
      // character data
      ungetChar(head);
      DOMString* chars = getText();
      prints("TEXT: ", chars);

      Text* text_node = document->createTextNode(chars);
      parent->appendChild(text_node);
      delete chars;
      continue;
    }

    DOMChar kind = getChar();
    if (kind == 0x2f /* / */)
    {
      // end tag: restore "</" so that readS5() sees the complete tag
      ungetChar(0x2f /* / */);
      ungetChar(0x3c /* < */);
      readS5();
      readSp();
      return;
    }

    if (kind == 0x3f /* ? */)
    {
      // PI: Processing Instruction
      DOMString* pi_target = getString();
      static DOMChar pi_close[] = { 0x3f /* ? */, 0x3e /* > */, 0x00, };
      DOMString* pi_desc = getString3(pi_close);
      print("PI: ");
      prints("  TARGET: ", pi_target);
      prints("  DESC: ", pi_desc);

      ProcessingInstruction* pi_node = document->createProcessingInstruction(pi_target, pi_desc);
      parent->appendChild(pi_node);

      delete pi_target;
      delete pi_desc;
      readSp();
      continue;
    }

    if (kind != 0x21 /* ! */)
    {
      // child element: restore what was read and parse the start tag
      ungetChar(kind);
      ungetChar(0x3c /* < */);
      readS3(parent);
      continue;
    }

    // "<!" : comment or CDATA
    DOMChar mark = getChar();
    if (mark == 0x2d /* - */)
    {
      if (getChar() != 0x2d /* - */)
      {
        // ERROR
        return;
      }
      static DOMChar comment_close[] = { 0x2d /* - */, 0x2d /* - */, 0x3e /* > */, 0x00, };
      DOMString* comment_text = getString3(comment_close);
      prints("COMMENT: ", comment_text);
      Comment* comment_node = document->createComment(comment_text);
      parent->appendChild(comment_node);

      delete comment_text;
      readSp();
      continue;
    }

    if (mark != 0x5b /* [ */)
    {
      // ERROR
      return;
    }

    // "<![" : everything up to and including "CDATA[" is discarded,
    // then the section body runs up to "]]>"
    static DOMChar cdata_open[] = { 0x43 /* C */, 0x44 /* D */, 0x41 /* A */, 0x54 /* T */, 0x41 /* A */, 0x5b /* [ */, 0x00, };
    DOMString* cdata_prefix = getString3(cdata_open);
    if (cdata_prefix)
    {
      static DOMChar cdata_close[] = { 0x5d /* ] */, 0x5d /* ] */, 0x3e /* > */, 0x00, };
      DOMString* cdata_text = getString3(cdata_close);
      prints("CDATA: ", cdata_text);
      CDATASection* cdata_node = document->createCDATASection(cdata_text);
      parent->appendChild(cdata_node);

      delete cdata_prefix;
      delete cdata_text;
      readSp();
    }
  }
}

// State S5: an element end tag.
// Expects "</" and then discards every character up to and including the
// next '>'.  The tag name is not compared with the open element.  If the
// input does not start with "</" the function returns after consuming the
// offending character.
void
MiniXMLParser::readS5() CCC_RAISES(IOException)
{
  print("S5");
  print("ELEMENT_END:");
  if (getChar() != 0x3c /* < */)
  {
    return;	// ERROR:
  }
  if (getChar() != 0x2f /* / */)
  {
    return;	// ERROR:
  }
  // skip the tag name and anything else before the closing '>'
  while (getChar() != 0x3e /* > */)
  {
  }
}

// Push every character of str back onto the reader.
// The string is walked with DOMSubStringIterator::prev() after unwind(),
// presumably from the last character to the first, so that the characters
// are read again in their original order.
void
MiniXMLParser::ungetString(DOMString* str)
{
  DOMSubStringIterator walker(*str);
  walker.unwind();
  for (DOMChar* cur = walker.prev(); cur != 0; cur = walker.prev())
  {
    ungetChar(*cur);
  }
}

// Try to consume the fixed, 0-terminated string str from the input.
//
// Returns true when the next characters are exactly str; in that case
// they are consumed together with any white space that follows.
// Returns false otherwise, with every character that was read pushed back
// so the input is unchanged.
//
// Reaching the end of the input in the middle of the comparison counts as
// "no match" rather than as an error: the keywords checked with this
// function are optional, and a document shorter than the keyword must
// still be parseable.  Any other IOException is passed on.
//
// The characters read are kept in a local DOMString, so nothing is leaked
// when an exception passes through.
bool
MiniXMLParser::readStr(DOMChar* str) CCC_RAISES(IOException)
{
  const Size len = strLen(str);
  DOMString consumed;
  bool matched = true;
  try
  {
    for (Size l = 0; l < len; l++)
    {
      DOMChar c = getChar();
      consumed.add(c);
      if (c != str[l])
      {
        // first difference decides; no need to read any further
        matched = false;
        break;
      }
    }
  }
  catch (IOException& e)
  {
    if (e.errorNum() != IOException::READ_BEYOND_THE_EOF)
    {
      throw;
    }
    matched = false;
  }

  if (matched)
  {
    readSp();
  }
  else if (consumed.getLength() > 0)
  {
    ungetString(&consumed);
  }
  return matched;
}

// Read one white-space-terminated token.
//
// Characters are collected until a white space character is read; that
// character and any white space following it are consumed.
// Returns a newly allocated DOMString that the caller must delete.
// If reading fails with an exception, the partially built string is
// released before the exception is passed on.
DOMString*
MiniXMLParser::getString() CCC_RAISES(IOException)
{
  DOMString* ret = new DOMString();
  try
  {
    for (;;)
    {
      DOMChar c = getChar();
      if (whiteSpP(c))
      {
        readSp();
        break;
      }
      ret->add(c);
    }
  }
  catch (...)
  {
    delete ret;	// do not leak the partial token
    throw;
  }
  return ret;
}

// Read characters up to, but not including, the first character that
// occurs in delimiters.
//
// delimiters  0-terminated list of delimiter characters
//
// The delimiter that stopped the scan is pushed back, so it is the next
// character read.  Returns a newly allocated (possibly empty) DOMString
// that the caller must delete.  If reading fails with an exception, the
// partially built string is released before the exception is passed on.
DOMString*
MiniXMLParser::getString2(DOMChar* delimiters) CCC_RAISES(IOException)
{
  DOMString* ret = new DOMString();
  try
  {
    for (;;)
    {
      DOMChar c = getChar();
      bool delimiter_p = false;
      for (DOMChar* p = delimiters; *p; p++)
      {
        if (*p == c)
        {
          delimiter_p = true;
          break;
        }
      }
      if (delimiter_p)
      {
        ungetChar(c);
        break;
      }
      ret->add(c);
    }
  }
  catch (...)
  {
    delete ret;	// do not leak the partial token
    throw;
  }
  return ret;
}

// Read characters until the multi-character sequence delimiter has been
// seen, and return everything in front of it.
//
// delimiter  0-terminated terminator sequence, e.g. "-->" or "]]>"
//
// The terminator itself is consumed but not part of the result.
// Returns a newly allocated DOMString that the caller must delete.
DOMString*
MiniXMLParser::getString3(DOMChar* delimiter) CCC_RAISES(IOException)
{
  DOMString collected;
  const Size delimiter_len = strLen(delimiter);
  bool terminated_p = false;
  while (!terminated_p)
  {
    DOMChar c = getChar();
    collected.add(c);
    const Size filled = collected.getLength();
    if (filled < delimiter_len)
    {
      continue;	// too short to end with the terminator yet
    }
    // compare the last delimiter_len characters with the terminator
    DOMChar* tail = collected.getStartPtr() + (filled - delimiter_len);
    Size same = 0;
    while ((same < delimiter_len) && (tail[same] == delimiter[same]))
    {
      same++;
    }
    terminated_p = (same == delimiter_len);
  }
  DOMSubString front = collected.leftSubStr(collected.getLength() - delimiter_len);
  return new DOMString(front);
}

// Fetch the next character from the pushback reader set up by read().
DOMChar
MiniXMLParser::getChar()
{
  return this->reader->getChar();
}

// Hand character c back to the pushback reader via its push() method.
void
MiniXMLParser::ungetChar(DOMChar c)
{
  this->reader->push(c);
}

// Compare the entity name s1 with the 0-terminated string s2.
//
// Returns true when every character of s1 equals the character at the
// same position in s2.  An empty s1 never matches, so "&;" is no longer
// resolved to the first table entry, and the scan stops at the terminator
// of s2 instead of reading past it.
//
// NOTE(review): s1 still matches when it is only a proper prefix of s2
// (the characters of s2 after the length of s1 are not examined).  Whether
// that is intended depends on how the entity table stores its names -
// confirm before tightening this to an exact match.
static bool
compare(DOMString* s1, DOMChar* s2)
{
  if (s1->getLength() == 0)
  {
    return false;
  }
  DOMChar* s1c = s1->getStartPtr();
  DOMChar* s1end = s1c + s1->getLength();
  while (s1c < s1end)
  {
    if (*s2 == 0x00)
    {
      return false;	// s1 is longer than s2
    }
    if (*s1c++ != *s2++)
    {
      return false;
    }
  }
  return true;
}

// Read one character, resolving an entity reference if there is one.
//
// escaped_p  set to true when the returned character is the replacement
//            of an entity found in XMLToken::basic_entity, false otherwise
//
// For "&name;" with a name found in the table, the table's replacement
// character is returned.  For an unknown name the text after '&' is
// pushed back unchanged and the '&' itself is returned.
DOMChar
MiniXMLParser::getUnEscapedChar(bool& escaped_p)
{
  DOMChar first = getChar();
  if (first != 0x26 /* & */)
  {
    escaped_p = false;
    return first;
  }

  // collect the entity name up to the terminating ';'
  DOMString name;
  for (DOMChar c = getChar(); c != 0x3b /* ; */; c = getChar())
  {
    name.add(c);
  }

  // search basic_entity; the table ends with an entry whose esc is 0
  EntityReplacement* entry = XMLToken::basic_entity;
  while (entry->esc && !compare(&name, entry->esc))
  {
    entry++;
  }
  if (entry->esc)
  {
    escaped_p = true;
    return entry->src;
  }

  // unknown entity: unget ';' and then the name, last character first,
  // so that they are read again in their original order
  ungetChar(0x3b /* ; */);
  DOMChar* base = name.getStartPtr();
  for (Size i = name.getLength(); i > 0; i--)
  {
    ungetChar(base[i - 1]);
  }
  escaped_p = false;
  return 0x26 /* & */;
}

// Read character data up to, but not including, the next literal '<'.
//
// Entity references are resolved through getUnEscapedChar(); a '<' that
// results from an entity is kept as text, while an unescaped '<' ends the
// text and is pushed back.  Returns a newly allocated DOMString that the
// caller must delete.  If reading fails with an exception, the partially
// built string is released before the exception is passed on.
DOMString*
MiniXMLParser::getText() CCC_RAISES(IOException)
{
  DOMString* ret = new DOMString();
  try
  {
    for (;;)
    {
      bool escaped_p;
      DOMChar c = getUnEscapedChar(escaped_p);
      if ((escaped_p == false) && (c == 0x3c /* < */))
      {
        ungetChar(c);
        break;
      }
      ret->add(c);
    }
  }
  catch (...)
  {
    delete ret;	// do not leak the partial text
    throw;
  }
  return ret;
}

CCC_NAMESPACE_END(CCC);

