Skip to content
Snippets Groups Projects
Commit 701f49c3 authored by Terraneo Federico's avatar Terraneo Federico
Browse files

Added basic unicode helper functions

parent 6dea608a
No related branches found
No related tags found
No related merge requests found
...@@ -32,6 +32,7 @@ stdlib_integration/libc_integration.cpp \ ...@@ -32,6 +32,7 @@ stdlib_integration/libc_integration.cpp \
stdlib_integration/libstdcpp_integration.cpp \ stdlib_integration/libstdcpp_integration.cpp \
e20/e20.cpp \ e20/e20.cpp \
util/util.cpp \ util/util.cpp \
util/unicode.cpp \
util/version.cpp \ util/version.cpp \
util/crc16.cpp \ util/crc16.cpp \
util/lcd44780.cpp util/lcd44780.cpp
......
Changelog for Miosix np embedded OS Changelog for Miosix np embedded OS
- Added basic unicode helper functions
- Implemented inode support in FAT32 - Implemented inode support in FAT32
- Started implementing FAT32 directory listing, still needs work. - Started implementing FAT32 directory listing, still needs work.
- Implemented getcwd() and chdir() syscalls - Implemented getcwd() and chdir() syscalls
......
/***************************************************************************
* Copyright (C) 2013 by Terraneo Federico *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* As a special exception, if other files instantiate templates or use *
* macros or inline functions from this file, or you compile this file *
* and link it with other works to produce a work based on this file, *
* this file does not by itself cause the resulting work to be covered *
* by the GNU General Public License. However the source code for this *
* file must still be made available in accordance with the GNU General *
* Public License. This exception does not invalidate any other reasons *
* why a work based on this file might be covered by the GNU General *
* Public License. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, see <http://www.gnu.org/licenses/> *
***************************************************************************/
#include "unicode.h"
using namespace std;
namespace miosix {
/**
 * Convert a nul-terminated utf8 string into a nul-terminated utf16 string.
 *
 * Code points are written atomically: a code point above 0xffff needs a
 * surrogate pair, and the pair is only written if both halves fit, so that on
 * INSUFFICIENT_SPACE the first 'length' units of dst never end with an
 * unpaired lead surrogate.
 * \param dst destination buffer, written in units of char16_t
 * \param dstSize size of dst in units of char16_t, terminator included
 * \param src nul-terminated utf8 string
 * \return OK and the number of char16_t written excluding the terminator, or
 * INVALID_STRING / INSUFFICIENT_SPACE and the number of char16_t written
 * before the error. On error dst is not nul-terminated.
 */
pair<Unicode::error,int> Unicode::utf8toutf16(char16_t *dst, int dstSize, char *src)
{
    int length=0;
    for(;;)
    {
        char32_t c=nextUtf8(src);
        if(c==0) break;
        if(c==invalid) return make_pair(INVALID_STRING,length);
        if(c>0xffff)
        {
            //Both halves of the surrogate pair must fit, or none is written
            if(length>=dstSize || dstSize-length<2)
                return make_pair(INSUFFICIENT_SPACE,length);
            const char32_t leadOffset=0xd800-(0x10000>>10);
            dst[length++]=static_cast<char16_t>(leadOffset+(c>>10));
            dst[length++]=static_cast<char16_t>(0xdc00+(c & 0x3ff));
        } else {
            if(length>=dstSize) return make_pair(INSUFFICIENT_SPACE,length);
            dst[length++]=static_cast<char16_t>(c);
        }
    }
    //Terminate string, the terminator is not counted in the returned length
    if(length>=dstSize) return make_pair(INSUFFICIENT_SPACE,length);
    dst[length]=0;
    return make_pair(OK,length);
}
/**
 * Convert a nul-terminated utf16 string into a nul-terminated utf8 string.
 *
 * Code points are written atomically: the bytes of a multi-byte sequence are
 * only written if all of them fit, so that on INSUFFICIENT_SPACE the first
 * 'length' bytes of dst never end in the middle of a character.
 * \param dst destination buffer
 * \param dstSize size of dst in bytes, terminator included
 * \param src nul-terminated utf16 string, in the endianness of the machine
 * \return OK and the number of bytes written excluding the terminator, or
 * INVALID_STRING (unpaired surrogate) / INSUFFICIENT_SPACE and the number of
 * bytes written before the error. On error dst is not nul-terminated.
 */
pair<Unicode::error,int> Unicode::utf16toutf8(char *dst, int dstSize, char16_t *src)
{
    int length=0;
    for(;;)
    {
        //Note: widening through unsigned short makes sure that no sign
        //extension happens, without aliasing src through another pointer type
        char32_t c=static_cast<unsigned short>(*src++);
        if(c==0) break;

        //If not in the basic plane, pass through utf32
        if(c>=0xd800 && c<=0xdbff)
        {
            char32_t next=static_cast<unsigned short>(*src++);
            //Unpaired lead surrogate (this includes the case next==0)
            if(next<0xdc00 || next>0xdfff) return make_pair(INVALID_STRING,length);
            const char32_t surrogateOffset=0x10000-(0xd800<<10)-0xdc00;
            c=(c<<10)+next+surrogateOffset;
        } else if(c>=0xdc00 && c<=0xdfff) {
            //Unpaired trail surrogate
            return make_pair(INVALID_STRING,length);
        }

        //Number of utf8 bytes needed to encode this code point
        int bytes;
        if(c<0x80) bytes=1;
        else if(c<0x800) bytes=2;
        else if(c<0x10000) bytes=3;
        else bytes=4;

        //Either the whole character fits, or nothing of it is written
        if(length>=dstSize || dstSize-length<bytes)
            return make_pair(INSUFFICIENT_SPACE,length);

        switch(bytes)
        {
            case 1:
                dst[length++]=static_cast<char>(c);
                break;
            case 2:
                dst[length++]=static_cast<char>(c>>6 | 0xc0);
                dst[length++]=static_cast<char>((c & 0x3f) | 0x80);
                break;
            case 3:
                dst[length++]=static_cast<char>(c>>12 | 0xe0);
                dst[length++]=static_cast<char>(((c>>6) & 0x3f) | 0x80);
                dst[length++]=static_cast<char>((c & 0x3f) | 0x80);
                break;
            default:
                dst[length++]=static_cast<char>(c>>18 | 0xf0);
                dst[length++]=static_cast<char>(((c>>12) & 0x3f) | 0x80);
                dst[length++]=static_cast<char>(((c>>6) & 0x3f) | 0x80);
                dst[length++]=static_cast<char>((c & 0x3f) | 0x80);
                break;
        }
    }
    //Terminate string, the terminator is not counted in the returned length
    if(length>=dstSize) return make_pair(INSUFFICIENT_SPACE,length);
    dst[length]='\0';
    return make_pair(OK,length);
}
/**
 * Check that a nul-terminated string is valid utf8.
 * \param str nul-terminated utf8 string
 * \return a pair whose first member is true if every code point up to the
 * terminator decoded successfully. If true, the second member is the length
 * of the string in bytes, terminator excluded. If false, it is the number of
 * bytes the decoder had consumed when the error was detected.
 */
std::pair<bool,int> Unicode::validateUtf8(char* str)
{
    char *iter=str;
    for(;;)
    {
        //Remember where this code point starts: nextUtf8() steps past the
        //terminating nul when it returns 0, so iter would count one too many
        char *start=iter;
        char32_t codePoint=nextUtf8(iter);
        if(codePoint==0) return make_pair(true,static_cast<int>(start-str));
        if(codePoint==invalid) return make_pair(false,static_cast<int>(iter-str));
    }
}
} //namespace miosix
/*
#include <iostream>
#include <fstream>
#include <cassert>
#include "unicode.h"
using namespace std;
int main(int argc, char *argv[])
{
ifstream in(argv[1]);
in.seekg(0,ios::end);
const int size=in.tellg();
in.seekg(0,ios::beg);
ofstream out(argv[2]);
if(argv[3][0]=='u')
{
char *c=new char[size+1];
in.read(c,size);
c[size]='\0';
char16_t *cc=new char16_t[512];
pair<Unicode::error,int> result=Unicode::utf8toutf16(cc,512,c);
assert(result.first==Unicode::OK);
cout<<"Target string len "<<result.second<<endl;
out.write((char*)cc,result.second*2);
} else {
char16_t *c=new char16_t[size/2+1];
in.read((char*)c,size);
c[size/2]=0;
char *cc=new char[1024];
pair<Unicode::error,int> result=Unicode::utf16toutf8(cc,1024,c);
assert(result.first==Unicode::OK);
cout<<"Target string len "<<result.second<<endl;
out.write(cc,result.second);
}
}
*/
/***************************************************************************
* Copyright (C) 2013 by Terraneo Federico *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* As a special exception, if other files instantiate templates or use *
* macros or inline functions from this file, or you compile this file *
* and link it with other works to produce a work based on this file, *
* this file does not by itself cause the resulting work to be covered *
* by the GNU General Public License. However the source code for this *
* file must still be made available in accordance with the GNU General *
* Public License. This exception does not invalidate any other reasons *
* why a work based on this file might be covered by the GNU General *
* Public License. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, see <http://www.gnu.org/licenses/> *
***************************************************************************/
#include <stdint.h>
#include <utility>
#ifndef UNICODE_H
#define UNICODE_H
//char16_t and char32_t are built-in types (keywords) starting from C++11, and
//redefining them is a compile error there. Provide the typedefs only when
//compiling for an older language standard, where the compiler lacks them.
//Note: old GCC versions in -std=c++0x mode already have the keywords but do
//not update __cplusplus, they define __GXX_EXPERIMENTAL_CXX0X__ instead.
#if __cplusplus < 201103L && !defined(__GXX_EXPERIMENTAL_CXX0X__)
typedef uint16_t char16_t;
typedef uint32_t char32_t;
#endif
namespace miosix {
/**
 * Collection of static helper functions to decode, validate and convert
 * utf8 and utf16 encoded strings
 */
class Unicode
{
public:
    /**
     * Possible errors for unicode string conversion
     */
    enum error
    {
        OK,                 ///< The string conversion completed successfully
        INSUFFICIENT_SPACE, ///< The source string is too long to fit
        INVALID_STRING      ///< The source string is an illegal unicode string
    };

    /// Represents an invalid code point
    static const char32_t invalid=0xffffffff;

    /**
     * Decode one unicode code point from an utf8 string, advancing the
     * iterator past the bytes that were consumed
     * \param it an iterator into an utf8 encoded string, advanced by the call
     * \param end iterator one past the last character of the string
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 if it==end on entry (it is
     * not advanced in this case), or if a nul byte is decoded.
     * If Unicode::invalid is returned because a continuation byte is missing
     * or wrong, it is left pointing at the offending byte (or at end)
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it, Iter end)
    {
        return nextUtf8(it,end,true);
    }

    /**
     * Decode one unicode code point from an utf8 string, advancing the
     * iterator past the bytes that were consumed
     * \param it an iterator into an utf8 encoded string, advanced by the
     * call. The string is assumed to be nul-terminated
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 when the terminating nul is
     * decoded: the iterator is then one past the nul and must not be passed
     * to this function again.
     * If Unicode::invalid is returned because a continuation byte is missing
     * or wrong, it is left pointing at the offending byte, so the terminating
     * nul is never skipped in the middle of a character
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it)
    {
        //The second argument is ignored since checkEnd is false
        return nextUtf8(it,it,false);
    }

    /**
     * Convert an utf8 string in an utf16 one
     * \param dst an utf16 string in system-dependent endianness (i.e: little
     * endian in a little endian machine and big endian in a big endian one)
     * \param dstSize size in units of char16_t of dst, to prevent overflow
     * \param src a nul-terminated utf8 string
     * \return an error code and the length (in units of char16_t) of the
     * string written to dst
     */
    static std::pair<error,int> utf8toutf16(char16_t *dst, int dstSize, char *src);

    /**
     * Convert an utf16 string in an utf8 one
     * \param dst an utf8 string
     * \param dstSize size in bytes of dst, to prevent overflow
     * \param src a nul-terminated utf16 string in system-dependent endianness
     * (i.e: little endian in a little endian machine and big endian in a big
     * endian one)
     * \return an error code and the length of the string written to dst
     */
    static std::pair<error,int> utf16toutf8(char *dst, int dstSize, char16_t *src);

    /**
     * \param str an utf8 encoded string
     * \return a pair with a bool that is true if the string is valid, and the
     * string length in bytes, not code points
     */
    static std::pair<bool,int> validateUtf8(char *str);

private:
    /**
     * Common implementation of nextUtf8
     * \param it an iterator into an utf8 encoded string, advanced by the call
     * \param end iterator one past the last character of the string, only
     * used if checkEnd is true
     * \param checkEnd true if there is the need to check for end of string
     * considering end. If false, a nul in the char stream is the only end
     * condition.
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 if the end of string is found,
     * and it is not in the middle of a character
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it, Iter end, bool checkEnd);
};

template<typename Iter>
char32_t Unicode::nextUtf8(Iter& it, Iter end, bool checkEnd)
{
    //End of string at the beginning, return 0
    if(checkEnd && it==end) return 0;

    //Note: cast to unsigned char to prevent sign extension if *it > 0x7f
    char32_t c=static_cast<unsigned char>(*it++);

    //Common case first: ASCII (this includes the terminating nul)
    if(c<0x80) return c;

    //If not ASCII, decode to utf32
    int additionalBytes;
    if((c & 0xe0)==0xc0)      { c &= 0x1f; additionalBytes=1; } //110xxxxx
    else if((c & 0xf0)==0xe0) { c &= 0x0f; additionalBytes=2; } //1110xxxx
    else if((c & 0xf8)==0xf0) { c &= 0x07; additionalBytes=3; } //11110xxx
    else return invalid;

    for(int i=0;i<additionalBytes;i++)
    {
        //End of string in the middle of a char, return invalid
        if(checkEnd && it==end) return invalid;

        //Look at the byte before consuming it: if it is not a continuation
        //byte it does not belong to this character. It may be the terminating
        //nul, and stepping past it would let the caller read out of bounds,
        //or the start of the next, possibly valid, character.
        char32_t next=static_cast<unsigned char>(*it);
        //This includes the case next==0
        if((next & 0xc0)!=0x80) return invalid;
        ++it;

        c<<=6;
        c |= next & 0x3f;
    }

    //Detect overlong encodings as errors to prevent vulnerabilities
    switch(additionalBytes)
    {
        case 1:
            if(c<0x80) return invalid;
            break;
        case 2:
            if(c<0x800) return invalid;
            break;
        case 3:
            if(c<0x10000) return invalid;
            break;
    }

    //Reserved space for surrogate pairs in utf16 are invalid code points
    if(c>=0xd800 && c<=0xdfff) return invalid;

    //Unicode is limited in the range 0-0x10ffff
    if(c>0x10ffff) return invalid;

    return c;
}
} //namespace miosix
#endif //UNICODE_H
This is a test è ώ 𝄞
\ No newline at end of file
...@@ -408,6 +408,8 @@ ...@@ -408,6 +408,8 @@
<in>lcd44780.h</in> <in>lcd44780.h</in>
<in>software_i2c.h</in> <in>software_i2c.h</in>
<in>software_spi.h</in> <in>software_spi.h</in>
<in>unicode.cpp</in>
<in>unicode.h</in>
<in>util.cpp</in> <in>util.cpp</in>
<in>util.h</in> <in>util.h</in>
<in>version.cpp</in> <in>version.cpp</in>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment