1 files changed, 572 insertions, 0 deletions
diff --git a/vmime-master/src/vmime/charsetConverter_icu.cpp b/vmime-master/src/vmime/charsetConverter_icu.cpp
new file mode 100644
index 0000000..55195b7
--- /dev/null
+++ b/vmime-master/src/vmime/charsetConverter_icu.cpp
@@ -0,0 +1,572 @@
+//
+// VMime library (http://www.vmime.org)
+// Copyright (C) 2002 Vincent Richard <vincent@vmime.org>
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 3 of
+// the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Linking this library statically or dynamically with other modules is making
+// a combined work based on this library.  Thus, the terms and conditions of
+// the GNU General Public License cover the whole combination.
+//
+
+#include "vmime/config.hpp"
+
+
+#if VMIME_CHARSETCONV_LIB_IS_ICU
+
+
+#include "vmime/charsetConverter_icu.hpp"
+
+#include "vmime/exception.hpp"
+#include "vmime/utility/inputStreamStringAdapter.hpp"
+#include "vmime/utility/outputStreamStringAdapter.hpp"
+
+
+#ifndef VMIME_BUILDING_DOC
+
+	#include <unicode/ucnv.h>
+	#include <unicode/ucnv_err.h>
+
+#endif // VMIME_BUILDING_DOC
+
+
+#include <unicode/unistr.h>
+
+
+namespace vmime {
+
+
+// static -- creates the ICU-backed converter for (source, dest) and returns it through a base-class pointer.
+shared_ptr <charsetConverter> charsetConverter::createGenericConverter(
+	const charset& source,
+	const charset& dest,
+	const charsetConverterOptions& opts
+) {
+	shared_ptr <charsetConverter> converter = make_shared <charsetConverter_icu>(source, dest, opts);
+	return converter;
+}
+
+
+// Opens the two ICU converters: m_from (source charset -> Unicode) and m_to (Unicode -> dest charset).
+// Throws exceptions::charset_conv_error when ICU cannot open a converter for either charset.
+charsetConverter_icu::charsetConverter_icu(
+	const charset& source,
+	const charset& dest,
+	const charsetConverterOptions& opts
+)
+	: m_from(NULL),
+	  m_to(NULL),
+	  m_source(source),
+	  m_dest(dest),
+	  m_options(opts) {
+	UErrorCode err = U_ZERO_ERROR;
+	m_from = ucnv_open(source.getName().c_str(), &err);
+	if (!U_SUCCESS(err)) {
+		throw exceptions::charset_conv_error(
+			"Cannot initialize ICU converter for source charset '" + source.getName()
+			+ "' (error code: " + u_errorName(err) + ")."
+		);
+	}
+
+	m_to = ucnv_open(dest.getName().c_str(), &err);
+	if (!U_SUCCESS(err)) {
+		// The destructor is not run when a constructor throws, so release m_from here.
+		ucnv_close(m_from);
+		m_from = NULL;
+		throw exceptions::charset_conv_error(
+			"Cannot initialize ICU converter for destination charset '" + dest.getName()
+			+ "' (error code: " + u_errorName(err) + ")."
+		);
+	}
+}
+
+
+charsetConverter_icu::~charsetConverter_icu() {
+	// Close each ICU converter that was opened; a member still at NULL is skipped.
+	if (m_from != NULL) { ucnv_close(m_from); }
+	if (m_to != NULL) { ucnv_close(m_to); }
+}
+
+// Stream conversion: decodes 'in' with m_from into UChars, re-encodes those with m_to and writes the bytes to 'out'.
+void charsetConverter_icu::convert(
+	utility::inputStream& in,
+	utility::outputStream& out,
+	status* st
+) {
+	// 'st' may be NULL; otherwise it is reset here, then updated with byte counts. Throws illegal_byte_sequence_for_charset or charset_conv_error.
+	UErrorCode err = U_ZERO_ERROR;
+
+	ucnv_reset(m_from);
+	ucnv_reset(m_to);
+
+	if (st) {
+		new (st) status(); // placement new: re-initializes the caller's status object in place
+	}
+
+	// From buffers
+	byte_t cpInBuffer[16]; // stream data put here
+	const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
+	std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here
+
+	// To buffers
+	// converted (char) data end up here
+	const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
+	std::vector <char> cpOutBuffer(cpOutBufferSz);
+
+	// Tell ICU what to do when encountering an illegal byte sequence
+	if (m_options.silentlyReplaceInvalidSequences) {
+
+		// Set replacement chars for when converting from Unicode to codepage
+		icu::UnicodeString substString(m_options.invalidSequence.c_str());
+		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
+
+		if (U_FAILURE(err)) {
+			throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
+		}
+
+	} else {
+
+		// Tell ICU to stop (and return an error) on illegal byte sequences
+		ucnv_setToUCallBack(
+			m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err
+		);
+
+		if (U_FAILURE(err)) {
+			throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");
+		}
+
+		ucnv_setFromUCallBack(
+			m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err
+		);
+
+		if (U_FAILURE(err)) {
+			throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
+		}
+	}
+
+	// Process the input in chunks of at most sizeof(cpInBuffer) bytes until the stream reports EOF
+	while (!in.eof()) {
+
+		// Read input data into buffer
+		size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer));
+
+		// Beginning of read data
+		const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]);
+		const char* sourceLimit = source + inLength; // end + 1
+
+		UBool flush = in.eof();  // is this last run?
+
+		UErrorCode toErr;
+
+		// Loop until all source has been processed
+		do {
+
+			// Set up target pointers
+			UChar* target = &uOutBuffer[0];
+			UChar* targetLimit = &target[0] + outSize;
+
+			toErr = U_ZERO_ERROR;
+
+			ucnv_toUnicode(
+				m_from, &target, targetLimit,
+				&source, sourceLimit, NULL, flush, &toErr
+			);
+			// NOTE(review): if this loop repeats on U_BUFFER_OVERFLOW_ERROR, the offset added below also covers bytes counted by the previous pass -- confirm
+			if (st) {
+				st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));
+			}
+
+			if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) {
+
+				if (toErr == U_INVALID_CHAR_FOUND ||
+				    toErr == U_TRUNCATED_CHAR_FOUND ||
+				    toErr == U_ILLEGAL_CHAR_FOUND) {
+
+					// Error will be thrown later (*)
+
+				} else {
+
+					throw exceptions::charset_conv_error(
+						"[ICU] Error converting to Unicode from " + m_source.getName()
+					);
+				}
+			}
+
+			// The Unicode source is the buffer just written and the limit
+			// is where the previous conversion stopped (target is moved in the conversion)
+			const UChar* uSource = &uOutBuffer[0];
+			UChar* uSourceLimit = &target[0];
+			UErrorCode fromErr;
+
+			// Loop until converted chars are fully written
+			do {
+
+				char* cpTarget = &cpOutBuffer[0];
+				const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz;
+
+				fromErr = U_ZERO_ERROR;
+
+				// Write converted bytes (Unicode) to destination codepage
+				ucnv_fromUnicode(
+					m_to, &cpTarget, cpTargetLimit,
+					&uSource, uSourceLimit, NULL, flush, &fromErr
+				);
+
+				if (st) {
+
+					// Decrement input bytes count by the number of input bytes in error
+					char errBytes[16];
+					int8_t errBytesLen = sizeof(errBytes);
+					UErrorCode errBytesErr = U_ZERO_ERROR;
+
+	 				ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);
+					// NOTE(review): subtracted on every pass of this loop, not only after a toUnicode error -- presumably errBytesLen is 0 then; confirm
+					st->inputBytesRead -= errBytesLen;
+					st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
+				}
+				// When one of the throws below fires, out.write() is skipped: bytes just added to outputBytesWritten never reach 'out'
+				// (*) If an error occurred while converting from input charset, throw it now
+				if (toErr == U_INVALID_CHAR_FOUND ||
+				    toErr == U_TRUNCATED_CHAR_FOUND ||
+				    toErr == U_ILLEGAL_CHAR_FOUND) {
+
+					throw exceptions::illegal_byte_sequence_for_charset();
+				}
+
+				if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) {
+
+					if (fromErr == U_INVALID_CHAR_FOUND ||
+					    fromErr == U_TRUNCATED_CHAR_FOUND ||
+					    fromErr == U_ILLEGAL_CHAR_FOUND) {
+
+						throw exceptions::illegal_byte_sequence_for_charset();
+
+					} else {
+
+						throw exceptions::charset_conv_error(
+							"[ICU] Error converting from Unicode to " + m_dest.getName()
+						);
+					}
+				}
+
+				// Write to destination stream
+				out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));
+
+			} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
+
+		} while (toErr == U_BUFFER_OVERFLOW_ERROR);
+	}
+}
+
+
+void charsetConverter_icu::convert(const string& in, string& out, status* st) {
+	// String overload: resets 'st' and 'out', then runs the stream-based convert() over string adapters.
+	if (st != NULL) {
+		new (st) status();
+	}
+	out.clear();
+
+	utility::inputStreamStringAdapter reader(in);
+	utility::outputStreamStringAdapter writer(out);
+
+	convert(reader, writer, st);
+
+	// Flush the output adapter once the conversion has returned
+	writer.flush();
+}
+
+// Returns a new ICU filtering stream (m_source -> m_dest) placed in front of 'os', configured with 'opts' rather than m_options.
+shared_ptr <utility::charsetFilteredOutputStream>
+	charsetConverter_icu::getFilteredOutputStream(
+		utility::outputStream& os,
+		const charsetConverterOptions& opts
+	) {
+	utility::outputStream* const sink = &os;
+	return make_shared <utility::charsetFilteredOutputStream_icu>(m_source, m_dest, sink, opts);
+}
+
+
+
+// charsetFilteredOutputStream_icu
+
+namespace utility {
+
+
+// Opens both ICU converters, then installs the invalid-sequence policy selected by 'opts'.
+// Throws exceptions::charset_conv_error on failure, after closing any converter already opened.
+charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu(
+	const charset& source,
+	const charset& dest,
+	outputStream* os,
+	const charsetConverterOptions& opts
+)
+	: m_from(NULL),
+	  m_to(NULL),
+	  m_sourceCharset(source),
+	  m_destCharset(dest),
+	  m_stream(*os),
+	  m_options(opts) {
+	UErrorCode err = U_ZERO_ERROR;
+	m_from = ucnv_open(source.getName().c_str(), &err);
+	if (!U_SUCCESS(err)) {
+		throw exceptions::charset_conv_error(
+			"Cannot initialize ICU converter for source charset '" + source.getName()
+			+ "' (error code: " + u_errorName(err) + ")."
+		);
+	}
+
+	m_to = ucnv_open(dest.getName().c_str(), &err);
+	if (!U_SUCCESS(err)) {
+		// The destructor is not run when a constructor throws, so release m_from here.
+		ucnv_close(m_from);
+		m_from = NULL;
+		throw exceptions::charset_conv_error(
+			"Cannot initialize ICU converter for destination charset '" + dest.getName()
+			+ "' (error code: " + u_errorName(err) + ")."
+		);
+	}
+
+	// Tell ICU what to do when encountering an illegal byte sequence.
+	// setupError keeps the message of the step that failed, so cleanup happens in one place.
+	const char* setupError = NULL;
+	if (m_options.silentlyReplaceInvalidSequences) {
+		// Set replacement chars for when converting from Unicode to codepage
+		icu::UnicodeString substString(m_options.invalidSequence.c_str());
+		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);
+		if (U_FAILURE(err)) {
+			setupError = "[ICU] Error when setting substitution string.";
+		}
+	} else {
+		// Tell ICU to stop (and return an error) on illegal byte sequences. The ToU callback
+		// must go on m_from, the converter that ucnv_toUnicode() is called with.
+		ucnv_setToUCallBack(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
+		if (U_FAILURE(err)) {
+			setupError = "[ICU] Error when setting ToU callback.";
+		} else {
+			ucnv_setFromUCallBack(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);
+			if (U_FAILURE(err)) {
+				setupError = "[ICU] Error when setting FromU callback.";
+			}
+		}
+	}
+
+	if (setupError != NULL) {
+		ucnv_close(m_from);
+		ucnv_close(m_to);
+		m_from = m_to = NULL;
+		throw exceptions::charset_conv_error(setupError);
+	}
+}
+
+
+charsetFilteredOutputStream_icu::~charsetFilteredOutputStream_icu() {
+	// Close each ICU converter that was opened; a member still at NULL is skipped.
+	if (m_from != NULL) { ucnv_close(m_from); }
+	if (m_to != NULL) { ucnv_close(m_to); }
+}
+
+// Accessor for m_stream, the stream that receives this filter's converted output.
+outputStream& charsetFilteredOutputStream_icu::getNextOutputStream() {
+	outputStream& next = m_stream;
+	return next;
+}
+
+
+// Converts 'count' bytes at 'data' from the source charset to the destination charset and
+// writes the result to m_stream. Both ICU calls use flush=FALSE: 'data' is treated as one chunk
+// of a longer stream. Throws illegal_byte_sequence_for_charset or charset_conv_error.
+void charsetFilteredOutputStream_icu::writeImpl(
+	const byte_t* const data,
+	const size_t count
+) {
+
+	if (!m_from || !m_to) {
+		throw exceptions::charset_conv_error("Cannot initialize converters.");
+	}
+
+	// Nothing to convert; this also guarantees that uniBuffer below is never empty
+	if (count == 0) {
+		return;
+	}
+
+	// Allocate buffer for Unicode chars
+	const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar);
+	std::vector <UChar> uniBuffer(uniSize);
+
+	// Conversion loop
+	UErrorCode toErr = U_ZERO_ERROR;
+
+	const char* uniSource = reinterpret_cast <const char*>(data);
+	const char* uniSourceLimit = uniSource + count;
+
+	do {
+
+		// Convert from source charset to Unicode
+		UChar* uniTarget = &uniBuffer[0];
+		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
+
+		toErr = U_ZERO_ERROR;
+
+		ucnv_toUnicode(
+			m_from, &uniTarget, uniTargetLimit,
+			&uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr
+		);
+
+		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) {
+			if (toErr == U_INVALID_CHAR_FOUND ||
+			    toErr == U_TRUNCATED_CHAR_FOUND ||
+			    toErr == U_ILLEGAL_CHAR_FOUND) {
+				throw exceptions::illegal_byte_sequence_for_charset();
+			}
+
+			throw exceptions::charset_conv_error(
+				"[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."
+			);
+		}
+
+		const size_t uniLength = uniTarget - &uniBuffer[0];
+
+		// Allocate buffer for destination charset, sized from the maximum number of bytes
+		// per UChar. The "+ 1" keeps it non-empty when no UChar was produced (uniLength == 0),
+		// so that &cpBuffer[0] below is always a valid pointer.
+		const size_t cpSize = (uniLength + 1) * static_cast <size_t>(ucnv_getMaxCharSize(m_to));
+		std::vector <char> cpBuffer(cpSize);
+
+		// Convert from Unicode to destination charset
+		UErrorCode fromErr = U_ZERO_ERROR;
+
+		const UChar* cpSource = &uniBuffer[0];
+		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
+
+		do {
+
+			char* cpTarget = &cpBuffer[0];
+			char* cpTargetLimit = &cpBuffer[0] + cpSize;
+
+			fromErr = U_ZERO_ERROR;
+
+			ucnv_fromUnicode(
+				m_to, &cpTarget, cpTargetLimit,
+				&cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr
+			);
+
+			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) {
+				if (fromErr == U_INVALID_CHAR_FOUND ||
+				    fromErr == U_TRUNCATED_CHAR_FOUND ||
+				    fromErr == U_ILLEGAL_CHAR_FOUND) {
+					throw exceptions::illegal_byte_sequence_for_charset();
+				}
+
+				throw exceptions::charset_conv_error(
+					"[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."
+				);
+			}
+
+			const size_t cpLength = cpTarget - &cpBuffer[0];
+			// Write successfully converted bytes (after an overflow, the loop handles the rest)
+			m_stream.write(&cpBuffer[0], cpLength);
+		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
+
+	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
+}
+
+
+// Runs both ICU conversions with flush=TRUE and no new input, writes whatever bytes come out
+// to m_stream, then flushes m_stream. Throws exceptions::charset_conv_error on any ICU failure.
+void charsetFilteredOutputStream_icu::flush() {
+
+	if (!m_from || !m_to) {
+		throw exceptions::charset_conv_error("Cannot initialize converters.");
+	}
+
+	// Allocate buffer for Unicode chars
+	const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar);
+	std::vector <UChar> uniBuffer(uniSize);
+
+	// Conversion loop (with flushing)
+	UErrorCode toErr = U_ZERO_ERROR;
+
+	const char* uniSource = 0;
+	const char* uniSourceLimit = 0;
+
+	do {
+
+		// Convert from source charset to Unicode
+		UChar* uniTarget = &uniBuffer[0];
+		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;
+
+		toErr = U_ZERO_ERROR;
+
+		ucnv_toUnicode(
+			m_from, &uniTarget, uniTargetLimit,
+			&uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr
+		);
+
+		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) {
+			throw exceptions::charset_conv_error(
+				"[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."
+			);
+		}
+
+		const size_t uniLength = uniTarget - &uniBuffer[0];
+
+		// Allocate buffer for destination charset, sized from the maximum number of bytes
+		// per UChar. uniLength is 0 when the converter produced no UChar, hence the "+ 1":
+		// the buffer must not be empty for &cpBuffer[0] below to be a valid pointer.
+		const size_t cpSize = (uniLength + 1) * static_cast <size_t>(ucnv_getMaxCharSize(m_to));
+		std::vector <char> cpBuffer(cpSize);
+
+		// Convert from Unicode to destination charset
+		UErrorCode fromErr = U_ZERO_ERROR;
+
+		const UChar* cpSource = &uniBuffer[0];
+		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;
+
+		do {
+
+			char* cpTarget = &cpBuffer[0];
+			char* cpTargetLimit = &cpBuffer[0] + cpSize;
+
+			fromErr = U_ZERO_ERROR;
+
+			ucnv_fromUnicode(
+				m_to, &cpTarget, cpTargetLimit,
+				&cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr
+			);
+
+			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) {
+				throw exceptions::charset_conv_error(
+					"[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."
+				);
+			}
+
+			const size_t cpLength = cpTarget - &cpBuffer[0];
+			// Write successfully converted bytes
+			m_stream.write(&cpBuffer[0], cpLength);
+		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);
+
+	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
+
+	m_stream.flush();
+}
+
+
+} // utility
+
+
+} // vmime
+
+
+#endif // VMIME_CHARSETCONV_LIB_IS_ICU