﻿// @@DISTHDR@@
// $Id$
// Detector for XML input

#include <assert.h>
#include <string.h>
#include <ccc/iceman/XmlDetector.h>

CCC_NAMESPACE_START(CCC);

//! Constructs a detector and puts it into the state produced by clear().
XmlDetector::XmlDetector()
{
  // enc_name has to be null before clear() runs, because clear() deletes it.
  enc_name = 0;
  clear();
}

//! Releases the encoding name buffer, if one was allocated.
XmlDetector::~XmlDetector()
{
  // enc_name is allocated with new char[] in parseEncName(), so it must be
  // released with the array form of delete.
  delete[] enc_name;
}

//! Resets the detector: input type back to ONEBYTE, empty unget buffer,
//! no input flow, and no stored encoding name.
void
XmlDetector::clear()
{
  in_type = ONEBYTE;	// default
  unget_size = 0;
  in = 0;
  // enc_name is allocated with new char[] in parseEncName(), so it must be
  // released with the array form of delete.
  delete[] enc_name;
  enc_name = 0;
}

//! Resets the detector state by calling clear().
//! @return always true
bool
XmlDetector::rewind()
{
  clear();
  return true;
}

//! Detects the character encoding of an XML input flow.
//!
//! The first four bytes are compared against byte order marks and against
//! the byte patterns that '<' / '<?' / '<?xm' take in various encodings.
//! Patterns that identify the encoding on their own return immediately.
//! Otherwise the pattern only selects how characters are read (in_type),
//! and the XML declaration is parsed for an explicit encoding name.
//!
//! @param in input flow to examine; it is remembered in the member 'in'
//!           and rewound by checkEncoding().
//! @return the detected encoding id, or CEID_NULL when reading the first
//!         four bytes raises IOException.
CeId
XmlDetector::detect(IFlow* in)
{
  XmlDetector::in = in;
  UInt8 h[4];
  int i;
  const UInt8 le = (UInt8)'<';
  const UInt8 qu = (UInt8)'?';
  const UInt8 x = (UInt8)'x';
  const UInt8 m = (UInt8)'m';
  //const UInt8 l = (UInt8)'l';
  try
  {
    for (i = 0; i < 4; i++)
    {
      h[i] = (UInt8)in->getInt8();
    }
  }
  // Caught by const reference (not by value) to avoid copying/slicing the
  // exception object; this matches the handlers in checkEncoding().
  catch (const IOException& /* ioe */)
  {
    return CEID_NULL;
  }
  
  // --- byte order marks ---
  if (h[0] == 0 && h[1] == 0 && h[2] == 0xfe && h[3] == 0xff)
  {
    return CEID_UCS4_BE;
  }
  else if (h[0] == 0xff && h[1] == 0xfe && h[2] == 0 && h[3] == 0)
  {
    return CEID_UCS4_LE;
  }
#if 0
  else if (h[0] == 0 && h[1] == 0 && h[2] == 0xff && h[3] == 0xfe)
  {
    // TODO: 2143
  }
  else if (h[0] == 0xfe && h[1] == 0xff && h[2] == 0 && h[3] == 0)
  {
    // TODO: 3412
  }
#endif  
  else if (h[0] == 0xfe && h[1] == 0xff && h[2] == 0 && h[3] == le)
  {
    // 16-bit BOM followed by '<' in big-endian order
    return CEID_UTF16_BE;
  }
  else if (h[0] == 0xff && h[1] == 0xfe && h[2] == le && h[3] == 0)
  {
    // 16-bit BOM followed by '<' in little-endian order
    return CEID_UTF16_LE;
  }
  else if (h[0] == 0xef && h[1] == 0xbb && h[2] == 0xbf)
  {
    return CEID_UTF8;
  }
  // --- no BOM: '<' stored as a 4-byte unit ---
  else if (h[0] == 0 && h[1] == 0 && h[2] == 0 && h[3] == le)
  {
    return CEID_UCS4_BE;
  }
  else if (h[0] == le && h[1] == 0 && h[2] == 0 && h[3] == 0)
  {
    return CEID_UCS4_LE;
  }
#if 0
  if (h[0] == 0 && h[1] == 0 && h[2] == le && h[3] == 0)
  {
    // TODO: 2143
  }
  if (h[0] == 0 && h[1] == le && h[2] == 0 && h[3] == 0)
  {
    // TODO: 3412
  }
#endif
  // --- no BOM: "<?" / "<?xm" only tells how wide a character is ---
  else if (h[0] == 0 && h[1] == le && h[2] == 0 && h[3] == qu)
  {
    // UTF-16BE or UCS-2BE
    in_type = BE2BYTE;
  }
  else if (h[0] == le && h[1] == 0 && h[2] == qu && h[3] == 0)
  {
    // UTF-16LE or UCS-2LE
    in_type = LE2BYTE;
  }
  else if (h[0] == le && h[1] == qu && h[2] == x && h[3] == m)
  {
    // <?xm
    in_type = ONEBYTE;
  }
#if 0
  if (h[0] == 0x4c && h[1] == 0x6f && h[2] == 0xa7 && h[3] == 0x94)
  {
    // EBCDIC
  }
#endif
  // Prefer the encoding named in the XML declaration when it is present
  // and Iceman::stringToCeId() maps it to something other than CEID_NULL.
  if (checkEncoding())
  {
    assert(!!enc_name);
    CeId ret = Iceman::stringToCeId(enc_name);
    if (ret != CEID_NULL)
    {
      return ret;
    }
  }
  // No usable declaration: fall back on the character width found above.
  switch (in_type)
  {
   case BE2BYTE:
    return CEID_UTF16_BE;
   case LE2BYTE:
    return CEID_UTF16_LE;
   case ONEBYTE:
    return CEID_UTF8;
   default:
    assert(false);
    break;
  }
  //assert(false);
  return CEID_NULL;
}

//! Creates a new XmlDetector allocated with new.
//! @return the new detector, as a Detector pointer
Detector*
XmlDetector::createDetector()
{
  Detector* const fresh_detector = new XmlDetector();
  return fresh_detector;
}

//! Rewinds the input flow and tries to parse an XML declaration from it.
//! @return true only when the rewind succeeds and parseXMLDecl() returns
//!         true; a rewind failure, XmlParseError or IOException yields false.
bool
XmlDetector::checkEncoding()
{
  bool decl_parsed = false;
  try
  {
    // parseXMLDecl() is only attempted after a successful rewind.
    decl_parsed = in->rewind() && parseXMLDecl();
  }
  catch (XmlParseError& /*xpe*/) 
  {
    decl_parsed = false;
  }
  catch (IOException& /*ioe*/)
  {
    decl_parsed = false;
  }
  return decl_parsed;
}

//! Pushes one character onto the unget stack; getChar() hands it back
//! before reading from the input flow again.
//! @param c character to push back
void
XmlDetector::ungetChar(char c)
{
  // Pushing more than max_unget_size characters is a programming error.
  assert(max_unget_size > unget_size);
  unget_buf[unget_size] = c;
  ++unget_size;
}

//! Returns the next character of the XML declaration.
//!
//! Characters pushed back with ungetChar() are returned first (most recently
//! pushed first). Otherwise one character is read from the input flow
//! according to in_type; for the two-byte types only one byte of each pair
//! is kept.
//! @return the character read; '\0' if in_type is UNKNOWN and assertions
//!         are compiled out.
char
XmlDetector::getChar()
{
  if (unget_size)
  {
    return unget_buf[--unget_size];
  }
  // Initialized so that no indeterminate value is returned when the
  // assert below is compiled out or in_type holds an unexpected value.
  char c = '\0';
  switch (in_type)
  {
   case UNKNOWN:
    assert(false);
    break;
   case ONEBYTE:
    c = (char)in->getInt8();
    break;
   case BE2BYTE:
    // 00 xx
    // NOTE(review): the first byte is discarded without checking that it
    // is zero -- confirm that is acceptable for non-ASCII input.
    (void)in->getInt8();
    c = (char)in->getInt8();
    break;
   case LE2BYTE:
    // xx 00
    c = (char)in->getInt8();
    (void)in->getInt8();
    break;
  }
  return c;
}

// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
// VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
// Eq ::= S? '=' S?
// VersionNum ::= ([a-zA-Z0-9_.:]|'-')+
// EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'")
// EncName ::= [A-Za-z]([A-Za-z0-9._]|'-')*
// SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
//
// EncName
// UTF-8, UTF-16, ISO-10646-UCS-2, ISO-10646-UCS-4
// ISO-8859-1, ISO-8859-2, ...
// ISO-2022-JP, Shift_JIS, EUC-JP
// etc...

#ifdef CHECK
#undef CHECK
#endif
//! CHECK is a macro that shortens the arguments of bool check(char* str, Size len).
//! The length is computed as sizeof(s) - 1, so s has to be a char array
//! (not a pointer) holding a NUL-terminated string.
//! @param s char array
#define CHECK(s) check(s, (sizeof(s) - 1))

//! Tests whether the next len characters of the input equal str.
//!
//! On a match the characters stay consumed. On a mismatch all of them are
//! pushed back with ungetChar() so that the next getChar() returns the
//! first of them again.
//! @param str expected characters
//! @param len number of characters to compare
//! @return true when the input matches str
bool
XmlDetector::check(char* str, Size len)
{
  char* buf = new char[len];
  bool ret = false;
  try
  {
    Size i;
    for (i = 0; i < len; i++)
    {
      buf[i] = getChar();
    }
    ret = !strncmp(str, buf, len);
    if (!ret)
    {
      // Push back last-read first: the unget buffer is a stack, so the
      // characters come out again in their original order.
      for (i = len; i > 0; i--)
      {
	ungetChar(buf[i - 1]);
      }
    }
  }
  catch (...)
  {
    // getChar() can propagate an exception from the input flow; release
    // the scratch buffer before passing the exception on unchanged.
    delete[] buf;
    throw;
  }
  delete[] buf;
  return ret;
}

// Literal tokens of the XML declaration, used with the CHECK macro.
// They are arrays (so that sizeof yields the string length) and non-const
// because check() takes a char*.
static char str_xmldecl_start[] = "<?xml";
//static char str_xmldecl_end[] = "?>";
static char str_version[] = "version";
static char str_quote[] = "'";
static char str_dquote[] = "\"";
static char str_encoding[] = "encoding";
static char str_eq[] = "=";

//! Parses the start of an XML declaration up to and including the
//! encoding declaration.
//! @return true when an encoding declaration was found and parsed;
//!         false when the input does not start with '<?xml' or no
//!         encoding declaration follows the version information.
//! Throws XmlParseError when 'version' is missing or malformed.
bool
XmlDetector::parseXMLDecl()
{
  // <?xml
  const bool starts_with_decl = CHECK(str_xmldecl_start);
  if (!starts_with_decl)
  {
    return false;
  }
  // VersionInfo
  if (parseVersionInfo(true))
  {
    parseVersionInfo(false);
  }
  else
  {
    throw XmlParseError(10, "can't find 'version' between '<?xml' and '?>'.");
  }
  // S
  if (!parseS(true))
  {
    return false;
  }
  parseS(false);
  // EncodingDecl?
  if (!parseEncodingDecl(true))
  {
    return false;
  }
  return parseEncodingDecl(false);
}

bool
XmlDetector::parseVersionInfo(bool l1_p)
{
  if (l1_p)
  {
    // S
    parseS();
    // 'version'
    return CHECK(str_version);
  }
  // Eq
  parseEq();
  // ( "'" VersionNum "'" | '"' VersionNum '"' )
  if (CHECK(str_quote))
  {
    if (!parseVersionNum())
    {
      throw XmlParseError(20, "version number pase error");
    }
    if (!CHECK(str_quote))
    {
      throw XmlParseError(30, "can't find \"'\".");
    }
  }
  else if (CHECK(str_dquote))
  {
    if (!parseVersionNum())
    {
      throw XmlParseError(40, "version number pase error");
    }
    if (!CHECK(str_dquote))
    {
      throw XmlParseError(50, "can't find '\"'.");
    }
  }
  else
  {
    throw XmlParseError(60, "version number specification syntax error.");
  }
  return true;
}

void
XmlDetector::parseS()
{
  if (!parseS(true))
  {
    throw XmlParseError(70, "can't find space.");
  }
  parseS(false);
}

//! Parses EncodingDecl (without its leading S) in two steps selected by l1_p.
//!
//! With l1_p true: reports whether 'encoding' comes next.
//! With l1_p false: parses Eq and the quoted EncName after 'encoding'.
//! @param l1_p true for the look-ahead step, false for the remainder
//! @return for l1_p true, whether 'encoding' was found; for l1_p false,
//!         always true (errors are reported by throwing XmlParseError)
bool
XmlDetector::parseEncodingDecl(bool l1_p)
{
  if (l1_p)
  {
    return CHECK(str_encoding);
  }
  parseEq();
  // Work out which quote character opens the name; the double quote is
  // only tried when the single quote did not match.
  const bool single_quoted = CHECK(str_quote);
  if (!single_quoted && !CHECK(str_dquote))
  {
    throw XmlParseError(120, "encoding specification syntax error.");
  }
  if (single_quoted)
  {
    // "'" EncName "'"
    if (!parseEncName())
    {
      throw XmlParseError(80, "encoding name parse error");
    }
    if (!CHECK(str_quote))
    {
      throw XmlParseError(90, "can't find \"'\".");
    }
    return true;
  }
  // '"' EncName '"'
  if (!parseEncName())
  {
    throw XmlParseError(100, "encoding name parse error");
  }
  if (!CHECK(str_dquote))
  {
    throw XmlParseError(110, "can't find '\"'.");
  }
  return true;
}

//! Parses EncName ::= [A-Za-z]([A-Za-z0-9._]|'-')* and stores it,
//! NUL-terminated, in enc_name.
//!
//! The character that ends the name is pushed back with ungetChar().
//! @return true when a name was read; false (with the offending character
//!         pushed back and enc_name untouched) when the first character is
//!         not a letter.
bool
XmlDetector::parseEncName()
{
  static char cls_encname1[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
  static char cls_encname2[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-";
  char c = getChar();
  // '\0' is tested separately: strchr() would match the terminator of the
  // class string, but a NUL character is never part of a name.
  if (c == '\0' || !strchr(cls_encname1, c))
  {
    ungetChar(c);
    return false;
  }
  Size enc_name_buf_size = 10;
  Size enc_name_len = 0;
  // Drop a name left over from an earlier parse so that it is not leaked.
  delete[] enc_name;
  enc_name = 0;
  enc_name = new char[enc_name_buf_size];
  enc_name[enc_name_len++] = c;
  // The buffer is kept NUL-terminated after every append, so it stays a
  // valid string even if getChar() throws in the middle of the name.
  enc_name[enc_name_len] = '\0';
  for (;;)
  {
    c = getChar();
    if (c == '\0' || !strchr(cls_encname2, c))
    {
      ungetChar(c);
      break;
    }
    // Need room for the new character plus the terminator.
    if (enc_name_len + 1 >= enc_name_buf_size)
    {
      enc_name_buf_size *= 2;
      char* grown = new char[enc_name_buf_size];
      memcpy(grown, enc_name, enc_name_len);
      // Array form of delete: the buffer came from new char[].
      delete[] enc_name;
      enc_name = grown;
    }
    enc_name[enc_name_len++] = c;
    enc_name[enc_name_len] = '\0';
  }
  return true;
}

//! Consumes a run of one or more characters that all occur in class_str.
//!
//! The first character that is not in the class is pushed back with
//! ungetChar().
//! @param class_str NUL-terminated list of the accepted characters
//! @return true when at least one character was consumed
bool
XmlDetector::parseClassString(char* class_str)
{
  bool consumed_any = false;
  for (;;)
  {
    const char ch = getChar();
    // '\0' never counts as a member; strchr() alone would match the
    // terminator of class_str.
    const bool in_class = (ch != '\0') && (strchr(class_str, ch) != 0);
    if (!in_class)
    {
      ungetChar(ch);
      return consumed_any;
    }
    consumed_any = true;
  }
}
  
//! Two-step parser for white space (S).
//! @param l1_p true: consume a run of white space and report whether any
//!             was present; false: nothing is done
//! @return the result of the white space scan for l1_p true, otherwise true
bool
XmlDetector::parseS(bool l1_p)
{
  // space, tab, carriage return, line feed
  static char white_space[] = "\x20\x09\x0d\x0a";
  return l1_p ? parseClassString(white_space) : true;
}

//! Consumes VersionNum ::= ([a-zA-Z0-9_.:]|'-')+
//! @return true when at least one version character was consumed
bool
XmlDetector::parseVersionNum()
{
  static char version_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.:-";
  const bool has_version_num = parseClassString(version_chars);
  return has_version_num;
}

//! Parses Eq ::= S? '=' S?
//! Throws XmlParseError(130) when the '=' is missing.
void
XmlDetector::parseEq()
{
  // S?
  const bool leading_space = parseS(true);
  if (leading_space)
  {
    parseS(false);
  }
  // '='
  const bool has_equals = CHECK(str_eq);
  if (!has_equals)
  {
    throw XmlParseError(130, "can't find '='.");
  }
  // S?
  const bool trailing_space = parseS(true);
  if (trailing_space)
  {
    parseS(false);
  }
}

CCC_NAMESPACE_END(CCC);
