From 7a97d981de20c2e9e9fa27d06f5b2c69ea09de16 Mon Sep 17 00:00:00 2001 From: Gert van Valkenhoef Date: Sun, 13 Nov 2011 20:03:38 +0000 Subject: [PATCH] Try to use character conversion --- odcread.cc | 69 ++++++++++++++++++++++++++++++++++++++++++++++++---- textmodel.cc | 16 ++++++------ textmodel.h | 24 ++++++++++++------ 3 files changed, 88 insertions(+), 21 deletions(-) diff --git a/odcread.cc b/odcread.cc index 858028e..caa87e7 100644 --- a/odcread.cc +++ b/odcread.cc @@ -9,6 +9,12 @@ #include #include +// Character encoding conversions +#include +#include +#include +#include + namespace odc { class Context { public: @@ -80,14 +86,63 @@ namespace odc { virtual void foldRight() { terminateContext(); } + char *getCharSet() { + return "UTF-8"; // FIXME hard-coded for now; presumably should be derived from setlocale(LC_CTYPE, 0) + processing + } virtual void textShortPiece(const ShortPiece *piece) { - std::string text = piece->getText(); - d_context.top()->addPiece(text); + iconv_t conv = iconv_open("UTF-8", "ISO-8859-1"); + if (conv == (iconv_t)-1) { + std::string str("iconv initialization error: "); + str += strerror(errno); + throw str.c_str(); + } + size_t bytesIn = piece->size() + 1; + SHORTCHAR *in = piece->getBuffer(); + size_t bytesOut = bytesIn; // FIXME not safe: a Latin-1 byte >= 0x80 becomes 2 bytes in UTF-8, so the output may need up to twice the input size. NOTE(review): the buffer allocated next is never delete[]d. 
+ char *out = new char[bytesIn]; + char *outPtr = out; + size_t rval = iconv(conv, &in, &bytesIn, &outPtr, &bytesOut); + if (rval == (size_t)-1) { + std::string str("iconv error: "); + str += strerror(errno); + throw str.c_str(); + } + iconv_close(conv); + std::string str(out); + for (std::string::iterator it = str.begin(); it < str.end(); ++it) { + if (*it == '\r') *it = '\n'; + } + d_context.top()->addPiece(str); } virtual void textLongPiece(const LongPiece *piece) { - throw "Long Piece not handled"; - //std::string text = piece->getText(); - //d_context.top()->addPiece(text); + char *out = (char*)piece->getBuffer(); + std::string str(out); + d_context.top()->addPiece(str); + //d_convLong = iconv_open(setlocale(LC_CTYPE, 0), "UCS-2"); + /* + iconv_t conv = iconv_open("UTF-8", "UTF-8"); + if (conv == (iconv_t)-1) { + std::string str("iconv initialization error: "); + str += strerror(errno); + throw str.c_str(); + } + size_t bytesIn = piece->size() + 1; + char *in = (char*)piece->getBuffer(); + size_t bytesOut = bytesIn; // FIXME probably not safe. 
+ char *out = new char[bytesIn]; + char *outPtr = out; + size_t rval = iconv(conv, &in, &bytesIn, &outPtr, &bytesOut); + if (rval == (size_t)-1) { + std::string str("iconv error: "); + str += strerror(errno); + throw str.c_str(); + } + iconv_close(conv); + std::string str(out); + for (std::string::iterator it = str.begin(); it < str.end(); ++it) { + if (*it == '\r') *it = '\n'; + } + d_context.top()->addPiece(str);*/ } }; @@ -112,6 +167,10 @@ int main(int argc, char *argv[]) { if (argc < 2) { return 1; } + + // Set the locale according to the terminal's environment + setlocale(LC_ALL, ""); + std::ifstream in(argv[1], std::ios::in | std::ios::binary); odc::Store* s; diff --git a/textmodel.cc b/textmodel.cc index dd982dc..7afc34c 100644 --- a/textmodel.cc +++ b/textmodel.cc @@ -99,6 +99,10 @@ void StdTextModel::accept(Visitor &visitor) const { TextPiece::TextPiece(size_t len): d_len(len) {} +unsigned TextPiece::size() const { + return d_len; +} + LongPiece::LongPiece(size_t len): TextPiece(len * 2) {} LongPiece::~LongPiece() { @@ -115,8 +119,8 @@ std::string LongPiece::toString() const { return std::string("LongPiece(FIXME)"); } -std::wstring LongPiece::getText() const { - return std::wstring((wchar_t*)d_buf); +CHAR* LongPiece::getBuffer() const { + return d_buf; } void LongPiece::accept(Visitor &visitor) const { @@ -139,12 +143,8 @@ std::string ShortPiece::toString() const { return std::string("ShortPiece(") + std::string(d_buf) + std::string(")"); } -std::string ShortPiece::getText() const { - std::string str(d_buf); - for (std::string::iterator it = str.begin(); it < str.end(); ++it) { - if (*it == '\r') *it = '\n'; - } - return str; +SHORTCHAR* ShortPiece::getBuffer() const { + return d_buf; } void ShortPiece::accept(Visitor &visitor) const { diff --git a/textmodel.h b/textmodel.h index 8eb26eb..e6faa6e 100644 --- a/textmodel.h +++ b/textmodel.h @@ -38,11 +38,15 @@ namespace odc { virtual void read(Reader &reader) = 0; virtual std::string toString() const = 0; 
virtual void accept(Visitor &visitor) const = 0; + /** + * Size of the piece in bytes, excluding the terminating null character (i.e. the size as read from the file). + */ + unsigned size() const; }; /** - * TextPiece consisting of 16-bit characters. - * Not sure of the encoding. + * TextPiece consisting of 16-bit Unicode characters. + * The encoding is either UCS-2 or UTF-16; which of the two is not yet known. */ class LongPiece : public TextPiece { private: @@ -52,16 +56,16 @@ namespace odc { ~LongPiece(); virtual void read(Reader &reader); virtual std::string toString() const; + virtual void accept(Visitor &visitor) const; + /** - * Return the text contained in this piece. - * Currently just casting the buffer to wchar_t* and hoping for the best. + * Get the buffer contents as 16-bit Unicode characters (whether UCS-2 or UTF-16 is not yet known). */ - virtual std::wstring getText() const; - virtual void accept(Visitor &visitor) const; + CHAR *getBuffer() const; }; /** - * TextPiece consisting of 8-bit characters. + * TextPiece consisting of 8-bit characters in the Latin-1 extension of ASCII. */ class ShortPiece : public TextPiece { private: @@ -71,8 +75,12 @@ namespace odc { ~ShortPiece(); virtual void read(Reader &reader); virtual std::string toString() const; - virtual std::string getText() const; virtual void accept(Visitor &visitor) const; + + /** + * Get the buffer contents as 8-bit (Latin-1) characters. + */ + SHORTCHAR *getBuffer() const; }; /** -- 2.29.2