DEADSOFTWARE

Try to use character conversion
author Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl>
Sun, 13 Nov 2011 20:03:38 +0000 (20:03 +0000)
committer Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl>
Sun, 13 Nov 2011 20:03:38 +0000 (20:03 +0000)
odcread.cc
textmodel.cc
textmodel.h

index 858028e6babf5e252ede8ade0c0f71d2946ceb4e..caa87e7713f0dc937cd24c717c54dbbc2add4160 100644 (file)
@@ -9,6 +9,12 @@
 #include <textmodel.h>
 #include <visitor.h>
 
+// Character encoding conversions
+#include <locale.h>
+#include <iconv.h>
+#include <errno.h>
+#include <string.h>
+
 namespace odc {
        class Context {
                public:
@@ -80,14 +86,63 @@ namespace odc {
                virtual void foldRight() {
                        terminateContext();
                }
+               char *getCharSet() {
+                       return "UTF-8"; // FIXME setlocale(LC_CTYPE, 0) + processing
+               }
                virtual void textShortPiece(const ShortPiece *piece) {
-                       std::string text = piece->getText();
-                       d_context.top()->addPiece(text);
+                       iconv_t conv = iconv_open("UTF-8", "ISO-8859-1");
+                       if (conv == (iconv_t)-1) {
+                               std::string str("iconv initialization error: ");
+                               str += strerror(errno);
+                               throw str.c_str();
+                       }
+                       size_t bytesIn = piece->size() + 1;
+                       SHORTCHAR *in = piece->getBuffer();
+                       size_t bytesOut = bytesIn; // FIXME probably not safe.
+                       char *out = new char[bytesIn];
+                       char *outPtr = out;
+                       size_t rval = iconv(conv, &in, &bytesIn, &outPtr, &bytesOut);
+                       if (rval == (size_t)-1) {
+                               std::string str("iconv error: ");
+                               str += strerror(errno);
+                               throw str.c_str();
+                       }
+                       iconv_close(conv);
+                       std::string str(out);
+                       for (std::string::iterator it = str.begin(); it < str.end(); ++it) {
+                               if (*it == '\r') *it = '\n';
+                       }
+                       d_context.top()->addPiece(str);
                }
                virtual void textLongPiece(const LongPiece *piece) {
-                       throw "Long Piece not handled";
-                       //std::string text = piece->getText();
-                       //d_context.top()->addPiece(text);
+                       char *out = (char*)piece->getBuffer();
+                       std::string str(out);
+                       d_context.top()->addPiece(str);
+                       //d_convLong = iconv_open(setlocale(LC_CTYPE, 0), "UCS-2");
+                       /*
+                       iconv_t conv = iconv_open("UTF-8", "UTF-8");
+                       if (conv == (iconv_t)-1) {
+                               std::string str("iconv initialization error: ");
+                               str += strerror(errno);
+                               throw str.c_str();
+                       }
+                       size_t bytesIn = piece->size() + 1;
+                       char *in = (char*)piece->getBuffer();
+                       size_t bytesOut = bytesIn; // FIXME probably not safe.
+                       char *out = new char[bytesIn];
+                       char *outPtr = out;
+                       size_t rval = iconv(conv, &in, &bytesIn, &outPtr, &bytesOut);
+                       if (rval == (size_t)-1) {
+                               std::string str("iconv error: ");
+                               str += strerror(errno);
+                               throw str.c_str();
+                       }
+                       iconv_close(conv);
+                       std::string str(out);
+                       for (std::string::iterator it = str.begin(); it < str.end(); ++it) {
+                               if (*it == '\r') *it = '\n';
+                       }
+                       d_context.top()->addPiece(str);*/
                }
        };
 
@@ -112,6 +167,10 @@ int main(int argc, char *argv[]) {
        if (argc < 2) {
                return 1;
        }
+
+       // Set the locale according to the terminal's environment
+       setlocale(LC_ALL, "");
+
        std::ifstream in(argv[1], std::ios::in | std::ios::binary);
 
        odc::Store* s;
index dd982dc490873b612a2377269d35fa3b30e0edac..7afc34c7983ccaf56ee3cb37ca05a92a1cb0de4a 100644 (file)
@@ -99,6 +99,10 @@ void StdTextModel::accept(Visitor &visitor) const {
 
 TextPiece::TextPiece(size_t len): d_len(len) {}
 
+unsigned TextPiece::size() const {
+       return d_len;
+}
+
 LongPiece::LongPiece(size_t len): TextPiece(len * 2) {}
 
 LongPiece::~LongPiece() {
@@ -115,8 +119,8 @@ std::string LongPiece::toString() const {
        return std::string("LongPiece(FIXME)");
 }
 
-std::wstring LongPiece::getText() const {
-       return std::wstring((wchar_t*)d_buf);
+CHAR* LongPiece::getBuffer() const {
+       return d_buf;
 }
 
 void LongPiece::accept(Visitor &visitor) const {
@@ -139,12 +143,8 @@ std::string ShortPiece::toString() const {
        return std::string("ShortPiece(") + std::string(d_buf) + std::string(")");
 }
 
-std::string ShortPiece::getText() const {
-       std::string str(d_buf);
-       for (std::string::iterator it = str.begin(); it < str.end(); ++it) {
-               if (*it == '\r') *it = '\n';
-       }
-       return str;
+SHORTCHAR* ShortPiece::getBuffer() const {
+       return d_buf;
 }
 
 void ShortPiece::accept(Visitor &visitor) const {
index 8eb26eb47d419c2a2d2b7d5e30765161e4898d91..e6faa6ec8ee2a25d792f6fd451ca2f29d61fd43e 100644 (file)
@@ -38,11 +38,15 @@ namespace odc {
                virtual void read(Reader &reader) = 0;
                virtual std::string toString() const = 0;
                virtual void accept(Visitor &visitor) const = 0;
+               /**
+                * Size in bytes, excluding the null-character that terminates the string (i.e. the size that is read from file).
+                */
+               unsigned size() const;
        };
 
        /**
-        * TextPiece consisting of 16-bit characters.
-        * Not sure of the encoding.
+        * TextPiece consisting of 16-bit unicode characters.
+        * Not sure if the encoding is UCS-2 or UTF-16.
         */
        class LongPiece : public TextPiece {
                private:
@@ -52,16 +56,16 @@ namespace odc {
                ~LongPiece();
                virtual void read(Reader &reader);
                virtual std::string toString() const;
+               virtual void accept(Visitor &visitor) const;
+
                /**
-                * Return the text contained in this piece.
-                * Currently just casting the buffer to wchar_t* and hoping for the best.
+                * Get the buffer contents as 16-bit (UCS-2 or UTF-16 I don't know) unicode.
                 */
-               virtual std::wstring getText() const;
-               virtual void accept(Visitor &visitor) const;
+               CHAR *getBuffer() const;
        };
 
        /**
-        * TextPiece consisting of 8-bit characters.
+        * TextPiece consisting of 8-bit characters in the Latin-1 extension of ASCII.
         */
        class ShortPiece : public TextPiece {
                private:
@@ -71,8 +75,12 @@ namespace odc {
                ~ShortPiece();
                virtual void read(Reader &reader);
                virtual std::string toString() const;
-               virtual std::string getText() const;
                virtual void accept(Visitor &visitor) const;
+
+               /**
+                * Get the buffer contents as 8-bit (Latin-1) characters.
+                */
+               SHORTCHAR *getBuffer() const;
        };
 
        /**