Skip to content

Commit d37739c

Browse files
committed
also added normalization to NFC in UnicodeFromEnc()
1 parent 455172b commit d37739c

2 files changed

Lines changed: 17 additions & 7 deletions

File tree

include/ticcutils/Unicode.h

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,9 @@ namespace TiCC {
4848
std::string UnicodeToUTF8( const UnicodeString&,
4949
const std::string& = "" );
5050

51-
UnicodeString UnicodeFromEnc( const std::string& ,
52-
const std::string& = "UTF8" );
51+
UnicodeString UnicodeFromEnc( const std::string&,
52+
const std::string& = "UTF8",
53+
const std::string& = "" );
5354

5455
UnicodeString UnicodeFromUTF8( const std::string&,
5556
const std::string& = "" );

src/Unicode.cxx

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,14 +40,21 @@ using namespace std;
4040
namespace TiCC {
4141
using namespace icu;
4242

43-
UnicodeString UnicodeFromEnc( const string& s, const string& enc ){
43+
UnicodeString UnicodeFromEnc( const string& s,
44+
const string& encoding,
45+
const string& normalization ){
4446
/// convert a character buffer in some encoding to a UnicodeString
4547
/*!
4648
\param s the string to interpret as a character buffer
47-
\param enc the encoding to use
48-
\return an UnicodeString object
49+
\param encoding the encoding assumed for s. Default UTF8
50+
\param normalization the normalization to use. Default NFC
51+
\return a normalized UnicodeString object
4952
*/
50-
return UnicodeString( s.c_str(), s.length(), enc.c_str() );
53+
UnicodeString result = UnicodeString( s.c_str(),
54+
s.length(),
55+
encoding.c_str() );
56+
UnicodeNormalizer UN( normalization);
57+
return UN.normalize( result );
5158
}
5259

5360
string UnicodeToUTF8( const UnicodeString& s,
@@ -868,6 +875,7 @@ namespace TiCC {
868875
/*!
869876
\param is The stream to read from
870877
\param us the UnicodeString to read. (will be cleared before reading)
878+
the string is normalized in NFC.
871879
\param delim The delimiter. Default '\n'
872880
\return the stream
873881
*/
@@ -882,14 +890,15 @@ namespace TiCC {
882890
/*!
883891
\param is The stream to read from
884892
\param us the UnicodeString to read. (will be cleared before reading)
893+
the string is normalized in NFC.
885894
\param encoding The Unicode encoding of the input stream. It is up to the
886895
caller to ensure this encoding is valid.
887896
\param delim The delimiter. Default '\n'
888897
\return the stream
889898
*/
890899
string line;
891900
std::getline( is, line, delim );
892-
us = TiCC::UnicodeFromEnc( line, encoding );
901+
us = TiCC::UnicodeFromEnc( line, encoding, "NFC" );
893902
return is;
894903
}
895904

0 commit comments

Comments
 (0)