From 701f49c38c7bf1afd8d861c238710544cc737a9e Mon Sep 17 00:00:00 2001 From: Terraneo Federico <fede.tft@hotmail.it> Date: Wed, 11 Dec 2013 23:28:38 +0000 Subject: [PATCH] Added basic unicode helper functions --- miosix/Makefile | 1 + miosix/_doc/textdoc/Changelog.txt | 1 + miosix/util/unicode.cpp | 166 ++++++++++++++++ miosix/util/unicode.h | 184 ++++++++++++++++++ miosix/util/utf8test | 1 + .../nbproject/private/configurations.xml | 2 + 6 files changed, 355 insertions(+) create mode 100644 miosix/util/unicode.cpp create mode 100644 miosix/util/unicode.h create mode 100644 miosix/util/utf8test diff --git a/miosix/Makefile b/miosix/Makefile index a86880e1..3a549e7b 100644 --- a/miosix/Makefile +++ b/miosix/Makefile @@ -32,6 +32,7 @@ stdlib_integration/libc_integration.cpp \ stdlib_integration/libstdcpp_integration.cpp \ e20/e20.cpp \ util/util.cpp \ +util/unicode.cpp \ util/version.cpp \ util/crc16.cpp \ util/lcd44780.cpp diff --git a/miosix/_doc/textdoc/Changelog.txt b/miosix/_doc/textdoc/Changelog.txt index 598c9fce..0ae295b1 100644 --- a/miosix/_doc/textdoc/Changelog.txt +++ b/miosix/_doc/textdoc/Changelog.txt @@ -1,5 +1,6 @@ Changelog for Miosix np embedded OS +- Added basic unicode helper functions - Implemented inode support in FAT32 - Started implementing FAT32 directory listing, still needs work. - Implemented getcwd() and chdir() syscalls diff --git a/miosix/util/unicode.cpp b/miosix/util/unicode.cpp new file mode 100644 index 00000000..97e0a433 --- /dev/null +++ b/miosix/util/unicode.cpp @@ -0,0 +1,166 @@ +/*************************************************************************** + * Copyright (C) 2013 by Terraneo Federico * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * As a special exception, if other files instantiate templates or use * + * macros or inline functions from this file, or you compile this file * + * and link it with other works to produce a work based on this file, * + * this file does not by itself cause the resulting work to be covered * + * by the GNU General Public License. However the source code for this * + * file must still be made available in accordance with the GNU General * + * Public License. This exception does not invalidate any other reasons * + * why a work based on this file might be covered by the GNU General * + * Public License. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, see <http://www.gnu.org/licenses/> * + ***************************************************************************/ + +#include "unicode.h" + +using namespace std; + +namespace miosix { + +pair<Unicode::error,int> Unicode::utf8toutf16(char16_t *dst, int dstSize, char *src) +{ + int length=0; + + #define PUT(x) do \ + { \ + if(length>=dstSize) return make_pair(INSUFFICIENT_SPACE,length); \ + *dst++=x; length++; \ + } while(0) + + for(;;) + { + char32_t c=nextUtf8(src); + if(c==0) break; + if(c==invalid) return make_pair(INVALID_STRING,length); + + if(c>0xffff) //Does not fit in one utf16 unit, emit a lead and a trail surrogate + { + const char32_t leadOffset=0xd800-(0x10000>>10); + PUT(leadOffset+(c>>10)); + PUT(0xdc00+(c & 0x3ff)); + } else PUT(c); + } + + PUT(0); //Terminate string, returns INSUFFICIENT_SPACE if dst is already full + return make_pair(OK,length-1); //The reported length does not count the terminator + #undef PUT +} + +pair<Unicode::error,int> Unicode::utf16toutf8(char *dst, int dstSize, char16_t *src) +{ + //Note: explicit cast to unsigned short to make sure that no sign extension happens + unsigned short *srcu=reinterpret_cast<unsigned 
short*>(src); + int length=0; + + #define PUT(x) do \ + { \ + if(length>=dstSize) return make_pair(INSUFFICIENT_SPACE,length); \ + *dst++=x; length++; \ + } while(0) + + while(char32_t c=*srcu++) + { + //Common case first: ASCII + if(c<0x80) + { + PUT(c); + continue; + } + + //If not ASCII, pass through utf32 + if(c>=0xd800 && c<=0xdbff) //Lead surrogate, must be followed by a trail surrogate + { + char32_t next=*srcu++; + //Unpaired lead surrogate (this includes the case next==0) + if(next<0xdc00 || next>0xdfff) return make_pair(INVALID_STRING,length); + + const char32_t surrogateOffset=0x10000-(0xd800<<10)-0xdc00; + c=(c<<10)+next+surrogateOffset; + } else if(c>=0xdc00 && c<=0xdfff) { + //Unpaired trail surrogate + return make_pair(INVALID_STRING,length); + } + + if(c<0x800) + { + PUT(c>>6 | 0xc0); + } else if(c<0x10000) { + PUT(c>>12 | 0xe0); + PUT(((c>>6) & 0x3f) | 0x80); + } else { + PUT(c>>18 | 0xf0); + PUT(((c>>12) & 0x3f) | 0x80); + PUT(((c>>6) & 0x3f) | 0x80); + } + PUT((c & 0x3f) | 0x80); //Last continuation byte, shared by the 2, 3 and 4 byte forms + } + + PUT(0); //Terminate string, returns INSUFFICIENT_SPACE if dst is already full + return make_pair(OK,length-1); //The reported length does not count the terminator + #undef PUT +} + +std::pair<bool,int> Unicode::validateUtf8(char* str) +{ + char *iter=str; + for(;;) + { + char32_t codePoint=nextUtf8(iter); + if(codePoint==0) return make_pair(true,iter-str); //NOTE(review): nextUtf8 has post-incremented iter past the nul, so this count includes the terminator; confirm that is intended + if(codePoint==invalid) return make_pair(false,iter-str); + } +} + +} //namespace miosix + +/* +#include <iostream> +#include <fstream> +#include <cassert> +#include "unicode.h" + +using namespace std; + +int main(int argc, char *argv[]) +{ + ifstream in(argv[1]); + in.seekg(0,ios::end); + const int size=in.tellg(); + in.seekg(0,ios::beg); + ofstream out(argv[2]); + if(argv[3][0]=='u') + { + char *c=new char[size+1]; + in.read(c,size); + c[size]='\0'; + char16_t *cc=new char16_t[512]; + pair<Unicode::error,int> result=Unicode::utf8toutf16(cc,512,c); + assert(result.first==Unicode::OK); + cout<<"Target string len "<<result.second<<endl; + out.write((char*)cc,result.second*2); + } else { + char16_t *c=new char16_t[size/2+1]; + in.read((char*)c,size); + c[size/2]=0; + 
char *cc=new char[1024]; + pair<Unicode::error,int> result=Unicode::utf16toutf8(cc,1024,c); + assert(result.first==Unicode::OK); + cout<<"Target string len "<<result.second<<endl; + out.write(cc,result.second); + } +} +*/ diff --git a/miosix/util/unicode.h b/miosix/util/unicode.h new file mode 100644 index 00000000..2c211b0f --- /dev/null +++ b/miosix/util/unicode.h @@ -0,0 +1,184 @@ +/*************************************************************************** + * Copyright (C) 2013 by Terraneo Federico * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * As a special exception, if other files instantiate templates or use * + * macros or inline functions from this file, or you compile this file * + * and link it with other works to produce a work based on this file, * + * this file does not by itself cause the resulting work to be covered * + * by the GNU General Public License. However the source code for this * + * file must still be made available in accordance with the GNU General * + * Public License. This exception does not invalidate any other reasons * + * why a work based on this file might be covered by the GNU General * + * Public License. 
* + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, see <http://www.gnu.org/licenses/> * + ***************************************************************************/ + +#include <stdint.h> +#include <utility> + +#ifndef UNICODE_H +#define UNICODE_H + +//TODO: these should be provided by the compiler, but they're not +typedef uint16_t char16_t; +typedef uint32_t char32_t; + +namespace miosix { + +/** + * Helper functions for utf8 decoding, utf8/utf16 conversion and utf8 validation + */ +class Unicode +{ +public: + /** + * Possible errors for unicode string conversion + */ + enum error + { + OK, ///< The string conversion completed successfully + INSUFFICIENT_SPACE, ///< The converted string does not fit in the destination buffer + INVALID_STRING ///< The source string is an illegal unicode string + }; + + /// Represents an invalid code point + static const char32_t invalid=0xffffffff; + + /** + * Decode the next unicode code point of a utf8 string, advancing the iterator past the bytes read + * \param it an iterator into a utf8 encoded string, advanced by this call + * \param end iterator one past the last character of the string + * \return an unicode code point, or Unicode::invalid if the string + * contains an invalid code point. Returns 0 if the end of string is found, + * and it is not in the middle of a character + */ + template<typename Iter> + static char32_t nextUtf8(Iter& it, Iter end) + { + return nextUtf8(it,end,true); + } + + /** + * Decode the next unicode code point of a utf8 string, advancing the iterator past the bytes read + * \param it an iterator into an utf8 encoded string, the string is assumed + * to be nul-terminated + * \return an unicode code point, or Unicode::invalid if the string + * contains an invalid code point. 
Returns 0 if the end of string is found, + * and it is not in the middle of a character + */ + template<typename Iter> + static char32_t nextUtf8(Iter& it) + { + return nextUtf8(it,it,false); + } + + /** + * Convert a utf8 string into a utf16 one + * \param dst buffer receiving the utf16 string in system-dependent endianness (i.e: little + * endian in a little endian machine and big endian in a big endian one) + * \param dstSize size in units of char16_t of dst, to prevent overflow + * \param src a nul-terminated utf8 string + * \return an error code and the length (in units of char16_t) of the + * string written to dst, not counting the terminator when the result is OK + */ + static std::pair<error,int> utf8toutf16(char16_t *dst, int dstSize, char *src); + + /** + * Convert a utf16 string into a utf8 one + * \param dst buffer receiving the utf8 string + * \param dstSize size in bytes of dst, to prevent overflow + * \param src a nul-terminated utf16 string in system-dependent endianness + * (i.e: little endian in a little endian machine and big endian in a big + * endian one) + * \return an error code and the length in bytes of the string written to dst, not counting the terminator when the result is OK + */ + static std::pair<error,int> utf16toutf8(char *dst, int dstSize, char16_t *src); + + /** + * \param str a nul-terminated utf8 encoded string + * \return a pair with a bool that is true if the string is valid, and the + * number of bytes read from str, not code points (NOTE(review): for a valid string the terminating nul has been read too, so the count is one more than strlen; confirm that is intended) + */ + static std::pair<bool,int> validateUtf8(char *str); + +private: + /** + * Common implementation of nextUtf8 + * \param it an iterator into an utf8 encoded string + * \param end iterator one past the last character of the string + * \param checkEnd true if there is the need to check for end of string + * considering end. If false, a nul in the char stream is the only end + * condition. + * \return an unicode code point, or Unicode::invalid if the string + * contains an invalid code point. 
Returns 0 if the end of string is found, + * and it is not in the middle of a character + */ + template<typename Iter> + static char32_t nextUtf8(Iter& it, Iter end, bool checkEnd); +}; + +template<typename Iter> +char32_t Unicode::nextUtf8(Iter& it, Iter end, bool checkEnd) +{ + //End of string at the beginning, return 0 + if(checkEnd && it==end) return 0; + + //Note: cast to unsigned char to prevent sign extension if *it > 0x7f + char32_t c=static_cast<unsigned char>(*it++); + + //Common case first: ASCII (this is also where a nul byte is returned as 0) + if(c<0x80) return c; + + //If not ASCII, decode to utf32: the lead byte gives the number of continuation bytes + int additionalBytes; + if((c & 0xe0)==0xc0) { c &= 0x1f; additionalBytes=1; } //110xxxxx + else if((c & 0xf0)==0xe0) { c &= 0x0f; additionalBytes=2; } //1110xxxx + else if((c & 0xf8)==0xf0) { c &= 0x07; additionalBytes=3; } //11110xxx + else return invalid; + for(int i=0;i<additionalBytes;i++) + { + //End of string in the middle of a char, return invalid + if(checkEnd && it==end) return invalid; + char32_t next=static_cast<unsigned char>(*it++); + //Continuation bytes must look like 10xxxxxx, this includes the case next==0 + if((next & 0xc0)!=0x80) return invalid; + c<<=6; + c |= next & 0x3f; + } + //Detect overlong encodings as errors to prevent vulnerabilities + switch(additionalBytes) + { + case 1: + if(c<0x80) return invalid; + break; + case 2: + if(c<0x800) return invalid; + break; + case 3: + if(c<0x10000) return invalid; + break; + } + + //The range reserved for utf16 surrogates does not contain valid code points + if(c>=0xd800 && c<= 0xdfff) return invalid; + //Unicode is limited to the range 0-0x10ffff + if(c>0x10ffff) return invalid; + return c; +} + +} //namespace miosix + +#endif //UNICODE_H + diff --git a/miosix/util/utf8test b/miosix/util/utf8test new file mode 100644 index 00000000..4b8c31ee --- /dev/null +++ b/miosix/util/utf8test @@ -0,0 +1 @@ +This is a test è ώ 𝄞 \ No newline at end of file diff --git a/miosix_np_2/nbproject/private/configurations.xml b/miosix_np_2/nbproject/private/configurations.xml index 2bd85b8b..fc77598f 100644 
--- a/miosix_np_2/nbproject/private/configurations.xml +++ b/miosix_np_2/nbproject/private/configurations.xml @@ -408,6 +408,8 @@ <in>lcd44780.h</in> <in>software_i2c.h</in> <in>software_spi.h</in> + <in>unicode.cpp</in> + <in>unicode.h</in> <in>util.cpp</in> <in>util.h</in> <in>version.cpp</in> -- GitLab