@@ -123,6 +123,13 @@ namespace Tokenizer {
123123
124124 UnicodeString convert ( const string& line,
125125 const string& inputEncoding ){
126+ // / convert a string with \e inputEncoding into a UnicodeString
127+ /* !
128+ \param line the input string
129+ \param inputEncoding the assumed encoding
130+ \return a valid UnicodeString
131+ will throw when the encoding is wrong, or bogus characters are found
132+ */
126133 UnicodeString result;
127134 if ( !line.empty () ){
128135 try {
@@ -155,11 +162,21 @@ namespace Tokenizer {
155162 const UnicodeString type_unknown = " UNKNOWN" ;
156163 const UnicodeString type_unanalyzed = " UNANALYZED" ;
157164
158- UnicodeString filter_ZCARON ( const UnicodeString& in ){
165+ UChar32 SPACE_PLACEHOLDER = U' Ž' ;
166+
167+ UnicodeString filter_placeholder ( const UnicodeString& in ){
168+ // / filter out the SPACE_PLACEHOLDER characters
169+ // / assuming they were added using the hidden --keep-spaces-inside-quotes
170+ // / option
171+ /* !
172+ \param in UnicodeString input
173+ \return filtered output, where the SPACE_PLACEHOLDER
174+ is replaced by a space
175+ */
159176 UnicodeString result;
160177 for ( int i=0 ; i < in.length (); ++i ){
161178 UChar32 c = in[i];
162- if ( c == U ' Ž ' ){
179+ if ( c == SPACE_PLACEHOLDER ){
163180 c = ' ' ;
164181 }
165182 result += c;
@@ -173,7 +190,7 @@ namespace Tokenizer {
173190 const string& _lang_code ):
174191 type (_type), role (_role), lang_code (_lang_code) {
175192 if ( keep_quoted_spaces ){
176- us = filter_ZCARON ( _s );
193+ us = filter_placeholder ( _s );
177194 }
178195 else {
179196 us = _s;
@@ -185,7 +202,7 @@ namespace Tokenizer {
185202 const string& _lang_code ):
186203 type (_type), role (NOROLE), lang_code (_lang_code) {
187204 if ( keep_quoted_spaces ){
188- us = filter_ZCARON ( _s );
205+ us = filter_placeholder ( _s );
189206 }
190207 else {
191208 us = _s;
@@ -513,9 +530,18 @@ namespace Tokenizer {
513530 }
514531 }
515532
516- string fixup_UTF16 ( const string& input_line, const string& encoding ){
533+ string fixup_UTF16 ( const string& input_line,
534+ const string& encoding ){
535+ // / cleanup a string from unwanted 0 bytes (UTF16) and CR
536+ /* !
537+ \param input_line the line to fixup
538+ \param encoding the encoding
539+ \return the cleaned UTF16 result
540+
541+ this is some hackery to handle exotic input. UTF16 but also CR at end.
542+
543+ */
517544 string line = input_line;
518- // some hackery to handle exotic input. UTF-16 but also CR at end.
519545 string::size_type pos = line.rfind ( ' \r ' );
520546 if ( pos != string::npos ){
521547 line.erase ( pos );
@@ -2740,6 +2766,12 @@ namespace Tokenizer {
27402766 }
27412767
27422768 string TokenizerClass::checkBOM ( istream& in ){
2769+ // / check a stream for a Byte Order Marker
2770+ // / when present, use it to detect the encoding
2771+ /* !
2772+ \param in the inputstream
2773+ \return the detected encoding, or the default inputEncoding if not found
2774+ */
27432775 string result = inputEncoding;
27442776 if ( &in == &cin ){
27452777 return result;
@@ -2754,15 +2786,16 @@ namespace Tokenizer {
27542786 &bomLength,
27552787 &err);
27562788 if ( bomLength ){
2789+ // so a BOM is found, and an encoding detected
27572790 if ( tokDebug ){
27582791 DBG << " Autodetected encoding: " << encoding << endl;
27592792 }
27602793 result = encoding;
2761- if ( result == " UTF16BE"
2762- || result == " UTF-16BE" ){
2794+ if ( result == " UTF-16BE" ){
27632795 result = " UTF16BE" ;
27642796 }
27652797 }
2798+ // make sure to position the stream after the BOM
27662799 in.seekg ( pos + (streampos)bomLength );
27672800 return result;
27682801 }
@@ -2928,6 +2961,12 @@ namespace Tokenizer {
29282961 }
29292962
29302963 UnicodeString replace_quoted_spaces ( const UnicodeString& in ){
2964+ // / replace spaces inside quotes by a placeholder
2965+ /* !
2966+ \param in the inputstring
2967+ \return a string where every space inside quotes is replaced by a
2968+ placeholder
2969+ */
29312970 UnicodeString result;
29322971 UChar32 quote = ' \x0 ' ;
29332972 for ( int i=0 ; i < in.length (); ++i ){
@@ -2946,7 +2985,7 @@ namespace Tokenizer {
29462985 }
29472986 }
29482987 else if ( c == ' ' && quote != ' \x0 ' ){
2949- c = U ' Ž ' ; // mark as Ž
2988+ c = SPACE_PLACEHOLDER;
29502989 }
29512990 result += c;
29522991 }
@@ -3204,6 +3243,11 @@ namespace Tokenizer {
32043243 || special ) {
32053244 // single character, no need to process all rules, do some simpler (faster) detection
32063245 UChar32 c = input.char32At (0 );
3246+ if ( c == SPACE_PLACEHOLDER ){
3247+ // will be translated back to a single space, ergo an empty token
3248+ // We don't want that
3249+ return ;
3250+ }
32073251 UnicodeString type = detect_type ( c );
32083252 if ( tokDebug >= 8 ){
32093253 DBG << " a single character: " << UnicodeString (c) << " type= "
0 commit comments