/////////////////////////////////////////////////////////////////////////////
//
//	File: QzTxtReader.cpp
//
//	$Header: /TS/TsFile/QzTxtReader.cpp  5  2009/9/7 3:18:22p  Lee $
//
//
//	Utility class for reading a text file.  This will convert the file into
//	UTF-8 format and compose the text to merge diacritics with their base
//	symbols.  This can recognize ASCII, UTF-8, UTF-16 BE, UTF-16 LE, and
//	UTF-32.  It will read the whole file into memory and compose it in one
//	pass, so all read operations are directly from memory.
//
/////////////////////////////////////////////////////////////////////////////


#include "QzCommon.h"
#include "QzTxtReader.h"


#ifdef USE_MALLOC_MACRO
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif


/////////////////////////////////////////////////////////////////////////////
//
//	constructor
//
//	Creates an empty reader: no buffer is allocated, and both the read
//	position and the stored byte count start at zero.
//
QzTxtReader::QzTxtReader()
	:	m_ByteOffset(0)
	,	m_ByteCount(0)
	,	m_pBuffer(NULL)
{
	// Nothing else to do -- text is only allocated by LoadFile().
}


/////////////////////////////////////////////////////////////////////////////
//
//	destructor
//
//	Releases the text buffer if the reader still holds one.  The member is
//	NULL when nothing was loaded or after ExtractBuffer() has handed the
//	buffer to the caller.
//
QzTxtReader::~QzTxtReader()
{
	SafeDeleteArray(m_pBuffer);
}


/////////////////////////////////////////////////////////////////////////////
//
//	LoadFile()
//
//	Read in a file and convert it into UTF-8 format.  This will compose the
//	contents of the file, making certain that any normalized diacritics are
//	combined into single code symbols, and that any unsupported symbols will
//	be discarded to simplify processing.
//
//	Any previously loaded text is released first, and the read position is
//	reset to the start of the new text.
//
//	filename - path of the file to load, as a UTF-8 string.
//
//	Returns true if the file was read and converted.  Returns false if the
//	file could not be read or if the detected encoding is not one of the
//	cases handled below; in either case the reader is left empty, so a
//	subsequent ReadLine() reports end-of-data instead of touching a stale
//	buffer.
//
bool QzTxtReader::LoadFile(const Utf08_t filename[])
{
	// Discard the old contents and reset all read state before doing any
	// work, so that every failure path leaves the reader consistently empty.
	SafeDeleteArray(m_pBuffer);
	m_pBuffer    = NULL;
	m_ByteOffset = 0;
	m_ByteCount  = 0;

	U32  byteCount = 0;
	U08* pBuffer   = QzReadFileToBuffer(filename, byteCount);

	if (NULL == pBuffer) {
		return false;
	}

	// NOTE(review): the compose helpers below receive no length, so this
	// presumably relies on QzReadFileToBuffer() appending a terminator to the
	// raw data -- confirm against that function.

	// Scan the first few bytes to see if there is a byte order mark.
	// NOTE(review): the BOM skips below assume a BOM-based encoding is only
	// reported when the BOM bytes are actually present -- confirm.
	U32 utfEncoding = UtfDetectEncoding(pBuffer, byteCount);

	bool recognized = true;

	switch (utfEncoding) {
		case UtfEncode_ASCII:
			m_ByteCount = UtfPreComposeASCII(pBuffer);
			m_pBuffer   = new Utf08_t[m_ByteCount+1];

			UtfComposeASCII(m_pBuffer, m_ByteCount+1, reinterpret_cast<char*>(pBuffer));
			break;

		case UtfEncode_UTF_8:
			// Need to skip the 3-byte UTF-8 byte-order-mark.
			m_ByteCount = UtfPreCompose08to08(reinterpret_cast<Utf08_t*>(pBuffer + 3));
			m_pBuffer   = new Utf08_t[m_ByteCount+1];

			UtfCompose08to08(m_pBuffer, m_ByteCount+1, reinterpret_cast<Utf08_t*>(pBuffer + 3));
			break;

		case UtfEncode_UTF_16_BE:
			// Byte-swap each complete 16-bit unit into little-endian order.
			// The bound stops before a trailing odd byte so that the swap
			// never indexes past the data that was read.
			for (U32 i = 0; (i + 1) < byteCount; i += 2) {
				Swap(pBuffer[i], pBuffer[i+1]);
			}

			// Intentionally fall through.

		case UtfEncode_UTF_16_LE:
			// Need to skip the 2-byte UTF-16 byte-order-mark.
			m_ByteCount = UtfPreCompose16to08(reinterpret_cast<Utf16_t*>(pBuffer + 2));
			m_pBuffer   = new Utf08_t[m_ByteCount+1];

			UtfCompose16to08(m_pBuffer, m_ByteCount+1, reinterpret_cast<Utf16_t*>(pBuffer + 2));
			break;

		case UtfEncode_UTF_32_BE:
			// Reverse the bytes of each complete 32-bit unit.  As above, an
			// incomplete trailing unit is left untouched rather than indexed
			// past the end of the data.
			for (U32 i = 0; (i + 3) < byteCount; i += 4) {
				Swap(pBuffer[i  ], pBuffer[i+3]);
				Swap(pBuffer[i+1], pBuffer[i+2]);
			}

			// Intentionally fall through.

		case UtfEncode_UTF_32_LE:
			// Need to skip the 4-byte UTF-32 byte-order-mark.
			m_ByteCount = UtfPreCompose32to08(reinterpret_cast<Utf32_t*>(pBuffer + 4));
			m_pBuffer   = new Utf08_t[m_ByteCount+1];

			UtfCompose32to08(m_pBuffer, m_ByteCount+1, reinterpret_cast<Utf32_t*>(pBuffer + 4));
			break;

		default:
			// Unknown encoding: nothing was converted, so report failure
			// instead of claiming success with no buffer.
			recognized = false;
			break;
	}

	// The raw file data is no longer needed on any path, including the
	// unrecognized-encoding path.
	SafeDeleteArray(pBuffer);

	return recognized;
}


/////////////////////////////////////////////////////////////////////////////
//
//	ReadLine()
//
//	This will read an entire line, up to the next '\n' marker.  If the line
//	is too long to fit within the destination buffer, this will discard the
//	rest of the line.  Any '\r' bytes are dropped, and the '\n' itself is
//	consumed but not stored.
//
//	pDst     - destination buffer; always '\0'-terminated when a line is
//	           returned.
//	dstLimit - total size of pDst in bytes, including room for the '\0'.
//	           At most (dstLimit - 1) bytes of text are stored.
//
//	Returns the number of bytes stored in pDst (not counting the '\0'), or
//	-1 if there is no more data to read.  Also returns -1, without consuming
//	any input, if pDst is NULL or dstLimit is zero, since not even the
//	terminator could be stored.
//
//	Note that truncation is done per byte, so an over-long line may be cut
//	in the middle of a multi-byte UTF-8 sequence.
//
S32 QzTxtReader::ReadLine(Utf08_t pDst[], U32 dstLimit)
{
	// No buffer loaded, or everything has already been consumed.
	if ((NULL == m_pBuffer) || (m_ByteOffset >= m_ByteCount)) {
		return -1;
	}

	// Reject a destination that cannot hold even the terminator.  Without
	// this check, subtracting one from a zero limit would wrap around to a
	// huge unsigned value and allow unbounded writes.
	if ((NULL == pDst) || (0 == dstLimit)) {
		return -1;
	}

	// Reserve a space at the end of the buffer for the '\0' sentinel.
	const U32 maxChars  = dstLimit - 1;
	U32       dstOffset = 0;

	while (m_ByteOffset < m_ByteCount) {
		const Utf08_t c = m_pBuffer[m_ByteOffset++];

		if ('\n' == c) {
			break;
		}

		if ('\r' == c) {
			// Ignore carriage returns so CR/LF files read like LF files.
			continue;
		}

		// Once the buffer is full, keep consuming input so the remainder of
		// the line is discarded rather than returned as a separate line.
		if (dstOffset < maxChars) {
			pDst[dstOffset++] = c;
		}
	}

	pDst[dstOffset] = '\0';

	return static_cast<S32>(dstOffset);
}


/////////////////////////////////////////////////////////////////////////////
//
//	ExtractBuffer()
//
//	Detaches the text buffer from the reader and returns it.  The reader
//	forgets the buffer and resets its byte count and read position to zero,
//	so it will no longer free the memory.  The buffer is allocated with
//	new[] in LoadFile(), so the caller becomes responsible for releasing it
//	with delete[].
//
//	byteCount - receives the byte count the reader had stored for the
//	            buffer.
//
//	Returns the detached buffer, or NULL if the reader held none.
//
Utf08_t* QzTxtReader::ExtractBuffer(U32 &byteCount)
{
	// Report the size before the members are cleared.
	byteCount = m_ByteCount;

	Utf08_t *pDetached = m_pBuffer;

	// Return the reader to its empty state.
	m_pBuffer    = NULL;
	m_ByteCount  = 0;
	m_ByteOffset = 0;

	return pDetached;
}




