﻿// @@DISTHDR@@
// $Id$

#include <ccc/iceman/Ja2Detector.h>
#include <ccc/iceman/i18ncode.h>
#include <ccc/iceman/jpcode.h>

CCC_NAMESPACE_START(CCC);

/*!
 * constructor.
 * The body is empty; no work is done here.
 */
Ja2Detector::Ja2Detector()
{
}

/*!
 * destructor.
 * The body is empty; nothing is released here.
 */
Ja2Detector::~Ja2Detector()
{
}

/*!
 * factory function, creates a Ja2Detector object with new.
 * NOTE(review): presumably the caller becomes responsible for deleting
 * the returned object - confirm against the Detector users.
 * \return the newly created detector as a Detector pointer
 */
Detector*
Ja2Detector::createDetector()
{
  Detector* detector = new Ja2Detector();
  return detector;
}

/*!
 * check the string consists of 7 bit (US-ASCII) bytes only.
 * \param str NUL terminated string for checking
 * \retval true every byte before the terminator is in 00-7f
 *              (an empty string is also true)
 * \retval false a byte in 80-ff was found
 */
static bool
checkUsAsciiCode(const char* str)
{
  // Look at the bytes as unsigned values so that 80-ff do not appear as
  // negative numbers, and keep the const qualifier of the input.
  for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
       *p != '\0';
       ++p)
  {
    if (*p >= 0x80)
    {
      return false;
    }
  }
  return true;
}

static bool
checkShiftJisCode(const char* str)
{
  while (*str != '\0')
  {
    CCC::UInt8 c1 = (CCC::UInt8)*str;
    CCC::UInt8 c2 = (CCC::UInt8)*(str + 1);
    // test
    // <ccc/iceman/i18ncode.h>
    // bool asciiP(UInt8 c)
    if (CCC::asciiP(c1))
    {
      str++;
      //printf("ASCII:%02x\n", c1);
      continue;
    }
    // <ccc/iceman/jpcode.h>
    // bool sjisHankakuKanaP(UInt8 c);
    // bool cp932WithGaijiP(UInt8 c1, UInt8 c2);
    if (CCC::sjisHankakuKanaP(c1))
    {
      str++;
      //printf("HANKAKU KANA:%02x\n", c1);
      continue;
    }
    if (CCC::cp932P(c1, c2))
    {
      str += 2;
      continue;
    }
    if (CCC::cp932WithGaijiP(c1, c2))
    {
      str += 2;
      //printf("SJIS:%02x %02x\n", c1, c2);
      continue;
    }
    str++;
    //printf("UNKNOWN:%02x %02x\n", c1, c2);
    return false;
  }
  return true;
}

/*!
 * get the length of the utf-8 byte sequence from its first byte.
 * The original (up to 6 bytes) utf-8 layout is used:
 *   1: 00-7f                                    7 bits
 *   2: c2-df 80-bf                             11 bits
 *   3: e0-ef 80-bf 80-bf                       16 bits
 *   4: f0-f7 80-bf 80-bf 80-bf                 21 bits
 *   5: f8-fb 80-bf 80-bf 80-bf 80-bf           26 bits
 *   6: fc-fd 80-bf 80-bf 80-bf 80-bf 80-bf     31 bits
 *   -: 80-c1                                   not utf-8
 *   -: fe-ff                                   not utf-8 (UTF-16 BOM)
 * \param lead first byte of the sequence
 * \return the number of bytes of the sequence (1-6), or 0 when the byte
 *         can not start a sequence
 */
static unsigned int
utf8SequenceLength(unsigned char lead)
{
  if (lead <= 0x7f)
  {
    return 1;
  }
  // 80-bf are trailing bytes, c0 and c1 could only start an overlong
  // encoding of 00-7f, so a 2 bytes sequence starts at c2.
  if (lead < 0xc2)
  {
    return 0;
  }
  if (lead <= 0xdf)
  {
    return 2;
  }
  if (lead <= 0xef)
  {
    return 3;
  }
  if (lead <= 0xf7)
  {
    return 4;
  }
  if (lead <= 0xfb)
  {
    return 5;
  }
  if (lead <= 0xfd)
  {
    return 6;
  }
  // fe-ff
  return 0;
}

/*!
 * check the string is utf-8 or not.
 * \param str NUL terminated string for checking
 * \param strong_possibility_p reset to false on entry, and set to true
 *        when a well formed 3 bytes sequence has been read, that is, the
 *        string has a stronger possibility of utf-8 than shift_jis.
 *        It may be true even if the return value is false, because
 *        a 3 bytes sequence can be followed by an invalid byte.
 * \retval true utf-8
 * \retval false not utf-8
 */
static bool
checkUtf8Code(const char* str, bool* strong_possibility_p)
{
  *strong_possibility_p = false;
  const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
  while (*s != '\0')
  {
    const unsigned int length = utf8SequenceLength(*s);
    if (length == 0)
    {
      // this byte can not start a sequence
      return false;
    }
    // All the remaining bytes have to be 80-bf. The terminator is out of
    // this range, so the loop stops at the end of the string and never
    // reads beyond it.
    for (unsigned int i = 1; i < length; i++)
    {
      if ((s[i] < 0x80) || (s[i] > 0xbf))
      {
        return false;
      }
    }
    if (length == 3)
    {
      *strong_possibility_p = true;
    }
    s += length;
  }
  return true;
}

/*!
 * this method detects
 * CEID_UTF8, CEID_CP932 and CEID_USASCII.
 *
 * All the bytes are read from the flow until getInt8() throws IOException,
 * then the bytes are examined in this order:
 *  -# starts with the utf-8 BOM (ef bb bf): CEID_UTF8
 *  -# 7 bit bytes only: CEID_USASCII
 *  -# valid as utf-8 and as shift_jis: CEID_UTF8 when a 3 bytes utf-8
 *     sequence was found, otherwise CEID_CP932
 *  -# valid as utf-8 only: CEID_UTF8
 *  -# valid as shift_jis only: CEID_CP932
 *  -# others: CEID_USASCII
 *
 * The check functions work on a NUL terminated string, so they stop at
 * the first NUL byte of the input.
 * \param in flow to read the bytes from
 * \return detected encoding id
 */
CeId
Ja2Detector::detect(IFlow* in)
{
  CCC::BString str;
  try
  {
    for (;;)
    {
      Int8 c = in->getInt8();
      str.add(c);
    }
  }
  catch (const IOException&)
  {
    /* catch EOF */
  }

  const Size len = str.getLength();
  const UInt8* s = (const UInt8*)str.getCString();
  // Check the BOM
  if ((len >= 3) && (s[0] == 0xef) && (s[1] == 0xbb) && (s[2] == 0xbf))
  {
    return CEID_UTF8;
  }

  if (checkUsAsciiCode((const char*)s))
  {
    return CEID_USASCII;
  }

  const bool sjis_p = checkShiftJisCode((const char*)s);
  bool utf8_strong_possibility_p = false;
  const bool utf8_p = checkUtf8Code((const char*)s, &utf8_strong_possibility_p);
  if (utf8_p)
  {
    // The bytes are valid in both encodings and nothing suggests utf-8
    // strongly, take shift_jis.
    if (sjis_p && !utf8_strong_possibility_p)
    {
      return CEID_CP932;
    }
    return CEID_UTF8;
  }
  if (sjis_p)
  {
    return CEID_CP932;
  }
  // unknown patterns
  return CEID_USASCII;
}

CCC_NAMESPACE_END(CCC);
