Added basic unicode helper functions

701f49c3 · Terraneo Federico · 6dea608a · 701f49c3 · 701f49c3 · 701f49c3
Commit 701f49c3 authored 11 years ago by Terraneo Federico
--- a/miosix/Makefile
+++ b/miosix/Makefile
@@ -32,6 +32,7 @@ stdlib_integration/libc_integration.cpp                                    \
 stdlib_integration/libstdcpp_integration.cpp                               \
 e20/e20.cpp                                                                \
 util/util.cpp                                                              \
+util/unicode.cpp                                                           \
 util/version.cpp                                                           \
 util/crc16.cpp                                                             \
 util/lcd44780.cpp

--- a/miosix/_doc/textdoc/Changelog.txt
+++ b/miosix/_doc/textdoc/Changelog.txt
 Changelog for Miosix np embedded OS
+- Added basic unicode helper functions
 - Implemented inode support in FAT32
 - Started implementing FAT32 directory listing, still needs work.
 - Implemented getcwd() and chdir() syscalls

--- a/miosix/util/unicode.cpp
+++ b/miosix/util/unicode.cpp
+/***************************************************************************
+ *   Copyright (C) 2013 by Terraneo Federico                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   As a special exception, if other files instantiate templates or use   *
+ *   macros or inline functions from this file, or you compile this file   *
+ *   and link it with other works to produce a work based on this file,    *
+ *   this file does not by itself cause the resulting work to be covered   *
+ *   by the GNU General Public License. However the source code for this   *
+ *   file must still be made available in accordance with the GNU General  *
+ *   Public License. This exception does not invalidate any other reasons  *
+ *   why a work based on this file might be covered by the GNU General     *
+ *   Public License.                                                       *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, see <http://www.gnu.org/licenses/>   *
+ ***************************************************************************/
+#include "unicode.h"
+using namespace std;
+namespace miosix {
+/*
+ * Convert a nul-terminated utf8 string to utf16, one code point at a time.
+ * Code points above 0xffff are written as a lead/trail surrogate pair.
+ * On success the output is nul-terminated and the returned length does not
+ * count the terminator; on failure the returned length is the number of
+ * char16_t units stored before the problem was detected.
+ */
+pair<Unicode::error,int> Unicode::utf8toutf16(char16_t *dst, int dstSize, char *src)
+{
+    //Number of char16_t units stored in dst so far
+    int written=0;
+    //Store one utf16 unit, leaving the whole function if dst is already full
+    #define EMIT(unit) do \
+    { \
+        if(written>=dstSize) return make_pair(INSUFFICIENT_SPACE,written); \
+        *dst++=unit; written++; \
+    } while(0)
+    char32_t codePoint;
+    while((codePoint=nextUtf8(src))!=0)
+    {
+        if(codePoint==invalid) return make_pair(INVALID_STRING,written);
+        if(codePoint<=0xffff)
+        {
+            //Fits in a single utf16 unit
+            EMIT(codePoint);
+            continue;
+        }
+        //Above 0xffff: split into lead and trail surrogate
+        const char32_t leadBase=0xd800-(0x10000>>10);
+        EMIT(leadBase+(codePoint>>10));
+        EMIT(0xdc00+(codePoint & 0x3ff));
+    }
+    EMIT(0); //nul terminator, excluded from the returned length
+    return make_pair(OK,written-1);
+    #undef EMIT
+}
+/*
+ * Convert a nul-terminated utf16 string to utf8.
+ * Surrogate pairs are first recombined into a single code point, which is
+ * then written as a 1 to 4 byte utf8 sequence. An unpaired surrogate makes
+ * the conversion stop with INVALID_STRING.
+ * On success the output is nul-terminated and the returned length does not
+ * count the terminator; on failure the returned length is the number of
+ * bytes stored before the problem was detected.
+ */
+pair<Unicode::error,int> Unicode::utf16toutf8(char *dst, int dstSize, char16_t *src)
+{
+    //Read the source through an unsigned type, so no sign extension can occur
+    unsigned short *in=reinterpret_cast<unsigned short*>(src);
+    //Number of bytes stored in dst so far
+    int written=0;
+    //Store one byte, leaving the whole function if dst is already full
+    #define EMIT(octet) do \
+    { \
+        if(written>=dstSize) return make_pair(INSUFFICIENT_SPACE,written); \
+        *dst++=octet; written++; \
+    } while(0)
+    for(;;)
+    {
+        char32_t cp=*in++;
+        if(cp==0) break;
+        //ASCII is the common case and maps to itself
+        if(cp<0x80)
+        {
+            EMIT(cp);
+            continue;
+        }
+        //A trail surrogate that was not preceded by a lead one
+        if(cp>=0xdc00 && cp<=0xdfff) return make_pair(INVALID_STRING,written);
+        if(cp>=0xd800 && cp<=0xdbff)
+        {
+            //Lead surrogate: the following unit must be a trail surrogate.
+            //A nul terminator here is rejected by the same test.
+            char32_t trail=*in++;
+            if(!(trail>=0xdc00 && trail<=0xdfff)) return make_pair(INVALID_STRING,written);
+            const char32_t surrogateBias=0x10000-(0xd800<<10)-0xdc00;
+            cp=(cp<<10)+trail+surrogateBias;
+        }
+        //Leading byte(s), selected by the magnitude of the code point
+        if(cp>=0x10000)
+        {
+            EMIT(cp>>18 | 0xf0);
+            EMIT(((cp>>12) & 0x3f) | 0x80);
+            EMIT(((cp>>6) & 0x3f) | 0x80);
+        } else if(cp>=0x800) {
+            EMIT(cp>>12 | 0xe0);
+            EMIT(((cp>>6) & 0x3f) | 0x80);
+        } else {
+            EMIT(cp>>6 | 0xc0);
+        }
+        //Every multi-byte sequence ends with the low six bits
+        EMIT((cp & 0x3f) | 0x80);
+    }
+    EMIT(0); //nul terminator, excluded from the returned length
+    return make_pair(OK,written-1);
+    #undef EMIT
+}
+/*
+ * Check whether a nul-terminated string is well-formed utf8.
+ * \param str a nul-terminated utf8 encoded string
+ * \return a pair whose bool is true if every code point up to the nul
+ * terminator decoded correctly. When true, the int is the length of the
+ * string in bytes (not code points), terminator excluded. When false, the
+ * int is the offset just past the last byte examined when the error was
+ * detected.
+ */
+std::pair<bool,int> Unicode::validateUtf8(char* str)
+{
+    char *iter=str;
+    for(;;)
+    {
+        char32_t codePoint=nextUtf8(iter);
+        //nextUtf8 post-increments the iterator, so when it returns 0 iter
+        //already points one past the nul terminator: subtract one to report
+        //the length of the string itself rather than length+1
+        if(codePoint==0) return make_pair(true,iter-str-1);
+        if(codePoint==invalid) return make_pair(false,iter-str);
+    }
+}
+} //namespace miosix
+/*
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include "unicode.h"
+using namespace std;
+int main(int argc, char *argv[])
+{
+    ifstream in(argv[1]);
+    in.seekg(0,ios::end);
+    const int size=in.tellg();
+    in.seekg(0,ios::beg);
+    ofstream out(argv[2]);
+    if(argv[3][0]=='u')
+    {
+        char *c=new char[size+1];
+        in.read(c,size);
+        c[size]='\0';
+        char16_t *cc=new char16_t[512];
+        pair<Unicode::error,int> result=Unicode::utf8toutf16(cc,512,c);
+        assert(result.first==Unicode::OK);
+        cout<<"Target string len "<<result.second<<endl;
+        out.write((char*)cc,result.second*2);
+    } else {
+        char16_t *c=new char16_t[size/2+1];
+        in.read((char*)c,size);
+        c[size/2]=0;
+        char *cc=new char[1024];
+        pair<Unicode::error,int> result=Unicode::utf16toutf8(cc,1024,c);
+        assert(result.first==Unicode::OK);
+        cout<<"Target string len "<<result.second<<endl;
+        out.write(cc,result.second);
+    }
+} 
+*/
--- a/miosix/util/unicode.h
+++ b/miosix/util/unicode.h
+/***************************************************************************
+ *   Copyright (C) 2013 by Terraneo Federico                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   As a special exception, if other files instantiate templates or use   *
+ *   macros or inline functions from this file, or you compile this file   *
+ *   and link it with other works to produce a work based on this file,    *
+ *   this file does not by itself cause the resulting work to be covered   *
+ *   by the GNU General Public License. However the source code for this   *
+ *   file must still be made available in accordance with the GNU General  *
+ *   Public License. This exception does not invalidate any other reasons  *
+ *   why a work based on this file might be covered by the GNU General     *
+ *   Public License.                                                       *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, see <http://www.gnu.org/licenses/>   *
+ ***************************************************************************/
+#include <stdint.h>
+#include <utility>
+#ifndef UNICODE_H
+#define UNICODE_H
+//TODO: these should be provided by the compiler, but they're not.
+//They are defined here as plain fixed-width unsigned integers.
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+namespace miosix {
+/**
+ * Collection of static helper functions to decode utf8 strings, convert
+ * between utf8 and utf16, and validate utf8 strings, together with the
+ * result codes they return
+ */
+class Unicode
+{
+public:
+    /**
+     * Possible errors for unicode string conversion
+     */
+    enum error
+    {
+        OK,                 ///< The string conversion completed successfully
+        INSUFFICIENT_SPACE, ///< The source string is too long to fit
+        INVALID_STRING      ///< The source string is an illegal unicode string
+    };
+    /// Represents an invalid code point
+    static const char32_t invalid=0xffffffff;
+    /**
+     * Read a unicode code point out of an iterator into an utf8 string.
+     * The iterator is passed by reference and is advanced past the bytes
+     * that have been consumed.
+     * \param it an iterator into an utf8 encoded string
+     * \param end iterator one past the last character of the string
+     * \return an unicode code point, or Unicode::invalid if the string
+     * contains an invalid code point. Returns 0 if the end of string is found,
+     * and it is not in the middle of a character
+     */
+    template<typename Iter>
+    static char32_t nextUtf8(Iter& it, Iter end)
+    {
+        return nextUtf8(it,end,true);
+    }
+    /**
+     * Read a unicode code point out of an iterator into an utf8 string.
+     * The iterator is passed by reference and is advanced past the bytes
+     * that have been consumed, including the nul terminator when that is
+     * what was read.
+     * \param it an iterator into an utf8 encoded string, the string is assumed
+     * to be nul-terminated
+     * \return an unicode code point, or Unicode::invalid if the string
+     * contains an invalid code point. Returns 0 if the end of string is found,
+     * and it is not in the middle of a character
+     */
+    template<typename Iter>
+    static char32_t nextUtf8(Iter& it)
+    {
+        return nextUtf8(it,it,false);
+    }
+    /**
+     * Convert an utf8 string into an utf16 one
+     * \param dst an utf16 string in system-dependent endianness (i.e: little
+     * endian in a little endian machine and big endian in a big endian one)
+     * \param dstSize size in units of char16_t of dst, to prevent overflow
+     * \param src a nul-terminated utf8 string
+     * \return an error code and the length (in units of char16_t) of the
+     * string written to dst. On success dst is nul-terminated and the
+     * terminator is not counted in the length
+     */
+    static std::pair<error,int> utf8toutf16(char16_t *dst, int dstSize, char *src);
+    /**
+     * Convert an utf16 string into an utf8 one
+     * \param dst an utf8 string
+     * \param dstSize size in bytes of dst, to prevent overflow
+     * \param src a nul-terminated utf16 string in system-dependent endianness
+     * (i.e: little endian in a little endian machine and big endian in a big
+     * endian one)
+     * \return an error code and the length of the string written to dst.
+     * On success dst is nul-terminated and the terminator is not counted in
+     * the length
+     */
+    static std::pair<error,int> utf16toutf8(char *dst, int dstSize, char16_t *src);
+    /**
+     * Check whether a nul-terminated string is valid utf8
+     * \param str an utf8 encoded string
+     * \return a pair with a bool that is true if the string is valid, and the
+     * string length in bytes, not code points
+     */
+    static std::pair<bool,int> validateUtf8(char *str);
+private:
+    /**
+     * Common implementation of nextUtf8
+     * \param it an iterator into an utf8 encoded string, advanced past the
+     * bytes that have been consumed
+     * \param end iterator one past the last character of the string, only
+     * compared against when checkEnd is true
+     * \param checkEnd true if there is the need to check for end of string
+     * considering end. If false, a nul in the char stream is the only end
+     * condition.
+     * \return an unicode code point, or Unicode::invalid if the string
+     * contains an invalid code point. Returns 0 if the end of string is found,
+     * and it is not in the middle of a character
+     */
+    template<typename Iter>
+    static char32_t nextUtf8(Iter& it, Iter end, bool checkEnd);
+};
+/*
+ * Decode one code point from an utf8 byte stream, advancing the iterator
+ * past every byte that is read. Overlong encodings, code points in the
+ * utf16 surrogate range and values above 0x10ffff are reported as invalid.
+ */
+template<typename Iter>
+char32_t Unicode::nextUtf8(Iter& it, Iter end, bool checkEnd)
+{
+    //Nothing left to read: report end of string
+    if(checkEnd && it==end) return 0;
+    //Go through unsigned char so that bytes above 0x7f are not sign extended
+    char32_t codePoint=static_cast<unsigned char>(*it++);
+    //ASCII, which includes the nul terminator, needs no further decoding
+    if(codePoint<0x80) return codePoint;
+    //The lead byte tells how many continuation bytes follow. Also record
+    //the smallest code point that legitimately needs a sequence this long,
+    //anything below it is an overlong encoding.
+    int trailing;
+    char32_t minimum;
+    if((codePoint & 0xe0)==0xc0)
+    {
+        //110xxxxx
+        codePoint &= 0x1f;
+        trailing=1;
+        minimum=0x80;
+    } else if((codePoint & 0xf0)==0xe0) {
+        //1110xxxx
+        codePoint &= 0x0f;
+        trailing=2;
+        minimum=0x800;
+    } else if((codePoint & 0xf8)==0xf0) {
+        //11110xxx
+        codePoint &= 0x07;
+        trailing=3;
+        minimum=0x10000;
+    } else return invalid;
+    for(int left=trailing;left>0;left--)
+    {
+        //The string ends in the middle of a character
+        if(checkEnd && it==end) return invalid;
+        const char32_t continuation=static_cast<unsigned char>(*it++);
+        //Must look like 10xxxxxx, a nul terminator fails this test too
+        if((continuation & 0xc0)!=0x80) return invalid;
+        codePoint=(codePoint<<6) | (continuation & 0x3f);
+    }
+    //Overlong encodings are rejected to prevent vulnerabilities
+    if(codePoint<minimum) return invalid;
+    //The range reserved for utf16 surrogates does not hold valid code points
+    const bool surrogate=(codePoint>=0xd800 && codePoint<=0xdfff);
+    //Unicode ends at 0x10ffff
+    if(surrogate || codePoint>0x10ffff) return invalid;
+    return codePoint;
+}
+} //namespace miosix
+#endif //UNICODE_H
--- a/miosix/util/utf8test
+++ b/miosix/util/utf8test
+This is a test è ώ 𝄞
\ No newline at end of file
--- a/miosix_np_2/nbproject/private/configurations.xml
+++ b/miosix_np_2/nbproject/private/configurations.xml
@@ -408,6 +408,8 @@
          <in>lcd44780.h</in>
          <in>software_i2c.h</in>
          <in>software_spi.h</in>
+          <in>unicode.cpp</in>
+          <in>unicode.h</in>
          <in>util.cpp</in>
          <in>util.h</in>
          <in>version.cpp</in>