Skip to content

Commit 455172b

Browse files
committed
added optional normalization parameters to UnicodeFromUTF8() and UnicodeToUTF8()
The default is to always produce NFC.
1 parent 2b706b2 commit 455172b

3 files changed

Lines changed: 20 additions & 8 deletions

File tree

include/ticcutils/Unicode.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,14 @@
4545
namespace TiCC {
4646
using namespace icu;
4747

48-
std::string UnicodeToUTF8( const UnicodeString& );
48+
std::string UnicodeToUTF8( const UnicodeString&,
49+
const std::string& = "" );
4950

5051
UnicodeString UnicodeFromEnc( const std::string& ,
5152
const std::string& = "UTF8" );
5253

53-
inline UnicodeString UnicodeFromUTF8( const std::string& s ){
54-
return UnicodeString::fromUTF8( s );
55-
}
54+
UnicodeString UnicodeFromUTF8( const std::string&,
55+
const std::string& = "" );
5656

5757
/// \brief a class that can normalize UnicodeStrings to NFC/NFD/NFKC/NFKD
5858
class UnicodeNormalizer {

src/Unicode.cxx

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,28 @@ namespace TiCC {
5050
return UnicodeString( s.c_str(), s.length(), enc.c_str() );
5151
}
5252

53-
string UnicodeToUTF8( const UnicodeString& s ){
53+
string UnicodeToUTF8( const UnicodeString& s,
54+
const string& normalization ){
5455
/// normalize a UnicodeString and convert it to a UTF-8 string
5556
/*!
5657
\param s the UnicodeString to convert
58+
\param normalization the normalization form to apply before conversion (NFC, NFD, NFKC or NFKD). The default, an empty string, selects NFC
5759
\return a UTF-8 encoded string
5860
*/
61+
UnicodeNormalizer UN( normalization);
62+
UnicodeString normalized = UN.normalize( s );
5963
string result;
60-
s.toUTF8String(result);
64+
normalized.toUTF8String(result);
6165
return result;
6266
}
6367

68+
UnicodeString UnicodeFromUTF8( const string& s,
69+
const string& normalization ){
70+
UnicodeNormalizer UN( normalization);
71+
UnicodeString result = UnicodeString::fromUTF8( s );
72+
return UN.normalize( result );
73+
}
74+
6475
UnicodeNormalizer::UnicodeNormalizer( const string& enc ): _normalizer(0) {
6576
/// create a UnicodeNormalizer object
6677
/*!

src/runtest.cxx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -889,13 +889,14 @@ void test_unicode( const string& path ){
889889
UnicodeString ng1 = N.normalize( greek1 );
890890
UnicodeString ng2 = N.normalize( greek2 );
891891
assertEqual( UnicodeToUTF8(ng1), UnicodeToUTF8(ng2) );
892+
assertEqual( UnicodeToUTF8(ng1,"NFD"), UnicodeToUTF8(ng2,"NFD") );
892893
N.setMode("NFD");
893894
UnicodeString ng11 = N.normalize( greek1 );
894895
UnicodeString ng12 = N.normalize( greek2 );
895896
assertEqual( UnicodeToUTF8(ng11), UnicodeToUTF8(ng12) );
896897
string utf8_1 = "ἀντιϰειμένου";
897-
string utf8_2 = "ἀντικειμένου";
898-
assertEqual( TiCC::utf8_uppercase( utf8_1 ), "ἈΝΤΙΚΕΙΜΈΝΟΥ" );
898+
string utf8_2 = "ἀντικειμένου";
899+
assertEqual( TiCC::utf8_uppercase( utf8_1 ), "ἈΝΤΙΚΕΙΜΈΝΟΥ" );
899900
assertEqual( TiCC::utf8_lowercase( "ἈΝΤΙΚΕΙΜΈΝΟΥ" ), utf8_2 );
900901
assertEqual( TiCC::utf8_uppercase( "æ en ß en œ" ), "Æ EN SS EN Œ" );
901902
}

0 commit comments

Comments
 (0)