﻿// @@DISTHDR@@
// $Id$

#include <ccc/iceman/JaDetector.h>
#include <ccc/iceman/jpcode.h>

CCC_NAMESPACE_START(CCC);

// Default constructor. The body is empty: no state is initialized here.
JaDetector::JaDetector()
{
}

// Destructor. The body is empty: nothing is released here.
JaDetector::~JaDetector()
{
}

// Factory function: allocates a new JaDetector with `new` and returns it
// as a Detector*.
// NOTE(review): the caller presumably owns the returned object and is
// responsible for deleting it -- confirm against the Detector contract.
Detector*
JaDetector::createDetector()
{
  Detector* const detector = new JaDetector();
  return detector;
}

// Bit flags stored in jadetect_table. Each flag marks a role that a byte
// value can play in EUC-JP or Shift_JIS. The byte ranges quoted below are
// the ones used by the table generator kept in the "#if 0" section of
// this file.

// EUC-JP code set 1 (JIS X 0208), 1st byte: 0xa1-0xfc
#define EUCC11ST 0x01
// EUC-JP code set 1 (JIS X 0208), 2nd byte: 0xa1-0xfe
#define EUCC12ND 0x02
// EUC-JP code set 3 (JIS X 0212), 2nd byte (after 0x8f): 0xa1-0xed
#define EUCC32ND 0x04
// EUC-JP code set 3 (JIS X 0212), 3rd byte: 0xa1-0xfe
#define EUCC33RD 0x08
// Shift_JIS double-byte character, 1st byte: 0x81-0x9f, 0xe0-0xef
#define SJIS1ST  0x10
// Shift_JIS double-byte character, 2nd byte: 0x40-0x7e, 0x80-0xfc
#define SJIS2ND  0x20
// Half-width (hankaku) katakana byte: 0xa1-0xdf
#define HKANA    0x40

// Byte classification table used by JaDetector::detect().
// It is indexed by a byte value (0x00-0xff); each entry is the bitwise OR
// of the EUCC11ST / EUCC12ND / EUCC32ND / EUCC33RD / SJIS1ST / SJIS2ND /
// HKANA flags that apply to that byte. The leading comment on each row
// gives the index of the row's first entry.
// The values are the output of the generator program kept in the "#if 0"
// section of this file; regenerate rather than editing by hand.
UInt8 jadetect_table[256] =
{
/* 00: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 10: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 20: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 30: */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 40: */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
/* 50: */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
/* 60: */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
/* 70: */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 
/* 80: */ 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 
/* 90: */ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 
/* a0: */ 0x20, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 
/* b0: */ 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 
/* c0: */ 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 
/* d0: */ 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 
/* e0: */ 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3b, 0x3b, 
/* f0: */ 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x0a, 0x0a, 0x00, 
};

// Guesses the character encoding of the byte stream `in`.
//
// Possible results: CEID_UTF16_LE, CEID_UTF16_BE, CEID_UTF8, CEID_ISO2022JP,
// CEID_SJIS and CEID_EUCJP.  CEID_NULL is returned when reading throws an
// IOException (treated as end of input) before any decisive byte sequence
// has been seen.
//
// Strategy:
//  1. A byte order mark at the very start decides UTF-16LE/BE or UTF-8.
//  2. "ESC $ B" or "ESC $ @" anywhere decides ISO-2022-JP.
//  3. Otherwise bytes are classified with jadetect_table, and the first
//     byte (or byte pair/triple) that is legal in only one of Shift_JIS and
//     EUC-JP decides the result.
//
// Bytes are consumed from `in` as they are examined; nothing is pushed back.
CeId
JaDetector::detect(IFlow* in)
{
  try
  {
    UInt8 c = static_cast<UInt8>(in->getInt8());

    // Byte order mark check.  When a BOM candidate does not complete, c is
    // left holding the last byte read and scanning continues from there.
    if (c == 0xffu)
    {
      // UTF-16LE BOM: ff fe
      c = static_cast<UInt8>(in->getInt8());
      if (c == 0xfeu)
      {
        return CEID_UTF16_LE;
      }
    }
    else if (c == 0xfeu)
    {
      // UTF-16BE BOM: fe ff
      c = static_cast<UInt8>(in->getInt8());
      if (c == 0xffu)
      {
        return CEID_UTF16_BE;
      }
    }
    else if (c == 0xef)
    {
      // UTF-8 BOM: ef bb bf
      c = static_cast<UInt8>(in->getInt8());
      if (c == 0xbb)
      {
        c = static_cast<UInt8>(in->getInt8());
        if (c == 0xbf)
        {
          return CEID_UTF8;
        }
      }
    }

    for (;;)
    {
      if (c == ASCII_ESC)	// ESC
      {
        c = static_cast<UInt8>(in->getInt8());
        if (c == '$')
        {
          c = static_cast<UInt8>(in->getInt8());
          // ESC $ B: JIS X 0208-1983
          // ESC $ @: old JIS (JIS C 6226-1978)
          if (c == 'B' || c == '@')
          {
            return CEID_ISO2022JP;
          }
        }
#if 0
        else if (c == 'K')
        {
          // NEC Japanese detected.
        }
#endif
        // Not a recognized escape sequence: c (the last byte read) is
        // classified by the checks below.
      }

      if (c == EUC_SS2)	// 0x8e
      {
        // EUC-JP: single shift 2, must be followed by a half-width kana
        // byte.  Shift_JIS: an ordinary lead byte.
        c = static_cast<UInt8>(in->getInt8());
        const UInt8 second = jadetect_table[c];
        if ((second & SJIS2ND) && !(second & HKANA))
        {
          // Legal Shift_JIS trail byte that is not half-width kana.
          return CEID_SJIS;
        }
        // Otherwise undecided: keep scanning.
      }
      else if (c == EUC_SS3)	// 0x8f
      {
        // EUC-JP: single shift 3, followed by a two-byte JIS X 0212 code.
        // Shift_JIS: an ordinary lead byte.
        c = static_cast<UInt8>(in->getInt8());
        const UInt8 second = jadetect_table[c];
        if ((second & SJIS2ND) && !(second & EUCC32ND))
        {
          return CEID_SJIS;
        }

        c = static_cast<UInt8>(in->getInt8());
        const UInt8 third = jadetect_table[c];
        if (!(third & EUCC33RD))
        {
          // Not a possible EUC-JP code set 3 third byte.
          return CEID_SJIS;
        }
        if (!(third & SJIS1ST))
        {
          // Possible EUC third byte that cannot start a Shift_JIS character.
          return CEID_EUCJP;
        }

        // 0xe0-0xef: either an EUC code set 3 third byte or a Shift_JIS
        // lead byte.  Character boundaries are now uncertain, so skip
        // ahead to the next byte <= 0x40 (ASCII or control character) to
        // resynchronize.
        // NOTE(review): the byte that ends this skip is itself discarded
        // by the read at the bottom of the loop.
        do
        {
          c = static_cast<UInt8>(in->getInt8());
        } while (c > 0x40);
      }
      else
      {
        const UInt8 first = jadetect_table[c];
        if (first & SJIS1ST)
        {
          if (!(first & EUCC11ST))
          {
            // 0x81-0x9f: a lead byte in Shift_JIS only.
            return CEID_SJIS;
          }

          // 0xe0-0xef: a lead byte in both encodings, so let the
          // following byte decide where it can.
          c = static_cast<UInt8>(in->getInt8());
          const UInt8 second = jadetect_table[c];
          if (second & SJIS2ND)
          {
            if (!(second & EUCC12ND))
            {
              return CEID_SJIS;
            }
            // Legal second byte in both encodings: still undecided.
          }
          else if (second & EUCC12ND)
          {
            return CEID_EUCJP;
          }
        }
        else if (first & EUCC11ST)
        {
          // 0xa1-0xdf and 0xf0-0xfc: an EUC-JP code set 1 lead byte that is
          // not a Shift_JIS lead byte.
          // NOTE(review): 0xa1-0xdf are also Shift_JIS half-width kana
          // (HKANA), so Shift_JIS text whose first decisive byte is
          // half-width kana is reported as EUC-JP.  An unreachable
          // look-ahead branch that used to follow this test has been
          // removed; behaviour is unchanged.
          return CEID_EUCJP;
        }
      }

      c = static_cast<UInt8>(in->getInt8());
    }
  }
  catch (const IOException&)
  {
    /* catch EOF */
  }
  return CEID_NULL;
}

// jadetect_table is generated by the following code.
#if 0

// Stand-alone generator program (disabled by the surrounding "#if 0").
#include <stdio.h>
typedef unsigned char UInt8;

/*
Byte ranges used to build the table.

## SJIS
o 2byte char
  1st byte: 0x81-0x9f, 0xe0-0xef 
  2nd byte: 0x40-0x7e, 0x80-0xfc
o hankaku kana
            0xa1-0xdf
o ascii/jis roman
            0x21-0x7e

## EUC
o c0 ASCII
            0x21-0x7e
o c1 JIS X 0208-1990
  1st byte: 0xa1-0xf4(fc) : KU 1 - 84, (NEC IBM extensions KU 89-92)
  2nd byte: 0xa1-0xfe	: TEN 1 - 94
o c2 JIS X 0201-1997 hankaku kana
  1st byte: 0x8e
  2nd byte: 0xa1-0xdf
o c3 JIS X 0212-1990
  1st byte: 0x8f
  2nd byte: 0xa1-0xed	: KU 1 - 77
  3rd byte: 0xa1-0xfe	: TEN 1 - 94
*/

// Flag bits OR-ed into each table entry (same values as the flags defined
// near the top of this file).
#define EUCC11ST 0x01
#define EUCC12ND 0x02
#define EUCC32ND 0x04
#define EUCC33RD 0x08
#define SJIS1ST  0x10
#define SJIS2ND  0x20
#define HKANA    0x40

// Table being generated: one flag byte per possible byte value.
UInt8 jadetect_table[256];

int
main()
{
  int i;
  
  for (i = 0; i < 256; i++)
  {
    jadetect_table[i] = 0;
  }

  // Hankaku Kana
  for (i = 0xa1; i <= 0xdf; i++)
  {
    jadetect_table[i] |= HKANA;
  }

  // EUC-JP
  for (i = 0xa1; i <= 0xfc; i++)
  {
    jadetect_table[i] |= EUCC11ST;
  }
  for (i = 0xa1; i <= 0xfe; i++)
  {
    jadetect_table[i] |= EUCC12ND;
  }
  for (i = 0xa1; i <= 0xed; i++)
  {
    jadetect_table[i] |= EUCC32ND;
  }
  for (i = 0xa1; i <= 0xfe; i++)
  {
    jadetect_table[i] |= EUCC33RD;
  }

  // Shift JIS
  for (i = 0x81; i <= 0x9f; i++)
  {
    jadetect_table[i] |= SJIS1ST;
  }
  for (i = 0xe0; i <= 0xef; i++)
  {
    jadetect_table[i] |= SJIS1ST;
  }
  for (i = 0x40; i <= 0x7e; i++)
  {
    jadetect_table[i] |= SJIS2ND;
  }
  for (i = 0x80; i <= 0xfc; i++)
  {
    jadetect_table[i] |= SJIS2ND;
  }
  for (i = 0; i < 16; i++)
  {
    printf("/* %1x0: */ ", i);
    for (int j = 0; j < 16; j++)
    {
      printf("0x%02x, ", jadetect_table[i * 16 + j]);
    }
    printf("\n");
  }
  return 0;
}

#endif /* 0 */

CCC_NAMESPACE_END(CCC);
