@@ -40,14 +40,21 @@ using namespace std;
4040namespace TiCC {
4141 using namespace icu ;
4242
43- UnicodeString UnicodeFromEnc ( const string& s, const string& enc ){
43+ UnicodeString UnicodeFromEnc ( const string& s,
44+ const string& encoding,
45+ const string& normalization ){
4446 /// convert a character buffer in some encoding to a normalized UnicodeString
4547 /*!
4648 \param s the string to interpret as a character buffer
47- \param enc the encoding to use
48- \return an UnicodeString object
49+ \param encoding the name of the encoding assumed for the bytes in s. Default UTF8
50+ \param normalization the normalization mode handed to UnicodeNormalizer. Default NFC
51+ \return the decoded string after UnicodeNormalizer::normalize() was applied
4952 */
50- return UnicodeString ( s.c_str (), s.length (), enc.c_str () );
53+ UnicodeString result = UnicodeString ( s.c_str (), // decode the bytes of s ...
54+ s.length (),
55+ encoding.c_str () ); // ... using the named encoding
56+ UnicodeNormalizer UN ( normalization); // normalizer set up for the requested mode
57+ return UN.normalize ( result ); // return the normalized form, not the raw decode
5158 }
5259
5360 string UnicodeToUTF8 ( const UnicodeString& s,
@@ -868,6 +875,7 @@ namespace TiCC {
868875 /* !
869876 \param is The stream to read from
870877 \param us the UnicodeString to read. (will be cleared before reading)
878+ the string is normalized in NFC.
871879 \param delim The delimiter. Default '\n'
872880 \return the stream
873881 */
@@ -882,14 +890,15 @@ namespace TiCC {
882890 /* !
883891 \param is The stream to read from
884892 \param us the UnicodeString to read. (will be cleared before reading)
893+ the string is normalized in NFC.
885894 \param encoding The Unicode encoding of the input stream. It is up to the
886895 caller to ensure this encoding is valid.
887896 \param delim The delimiter. Default '\n'
888897 \return the stream
889898 */
890899 string line;
891900 std::getline ( is, line, delim );
892- us = TiCC::UnicodeFromEnc ( line, encoding );
901+ us = TiCC::UnicodeFromEnc ( line, encoding, " NFC " );
893902 return is;
894903 }
895904
0 commit comments