Skip to content

Commit 9a9a5f1

Browse files
committed
added a test for the (undocumented) --keep-spaces-inside-quotes option.
And improved the working of said option
1 parent 48377b4 commit 9a9a5f1

4 files changed

Lines changed: 95 additions & 10 deletions

File tree

src/tokenize.cxx

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,13 @@ namespace Tokenizer {
123123

124124
UnicodeString convert( const string& line,
125125
const string& inputEncoding ){
126+
/// convert a string with \e inputEncoding into a UnicodeString
127+
/*!
128+
\param line the input string
129+
\param inputEncoding the assumed encoding
130+
\return a valid UnicodeString
131+
will throw when the encoding is wrong, or bogus characters are found
132+
*/
126133
UnicodeString result;
127134
if ( !line.empty() ){
128135
try {
@@ -155,11 +162,21 @@ namespace Tokenizer {
155162
const UnicodeString type_unknown = "UNKNOWN";
156163
const UnicodeString type_unanalyzed = "UNANALYZED";
157164

158-
UnicodeString filter_ZCARON( const UnicodeString& in ){
165+
UChar32 SPACE_PLACEHOLDER = U'Ž';
166+
167+
UnicodeString filter_placeholder( const UnicodeString& in ){
168+
/// filter out the SPACE_PLACEHOLDER characters
169+
/// assuming they were added using the hidden --keep-spaces-inside-quotes
170+
/// option
171+
/*!
172+
\param in UnicodeString input
173+
\return filtered output, where the SPACE_PLACEHOLDER
174+
is replaced by a space
175+
*/
159176
UnicodeString result;
160177
for ( int i=0; i < in.length(); ++i ){
161178
UChar32 c = in[i];
162-
if ( c == U'Ž' ){
179+
if ( c == SPACE_PLACEHOLDER ){
163180
c = ' ';
164181
}
165182
result += c;
@@ -173,7 +190,7 @@ namespace Tokenizer {
173190
const string& _lang_code ):
174191
type(_type), role(_role), lang_code(_lang_code) {
175192
if ( keep_quoted_spaces ){
176-
us = filter_ZCARON( _s );
193+
us = filter_placeholder( _s );
177194
}
178195
else {
179196
us = _s;
@@ -185,7 +202,7 @@ namespace Tokenizer {
185202
const string& _lang_code ):
186203
type(_type), role(NOROLE), lang_code(_lang_code) {
187204
if ( keep_quoted_spaces ){
188-
us = filter_ZCARON( _s );
205+
us = filter_placeholder( _s );
189206
}
190207
else {
191208
us = _s;
@@ -513,9 +530,18 @@ namespace Tokenizer {
513530
}
514531
}
515532

516-
string fixup_UTF16( const string& input_line, const string& encoding ){
533+
string fixup_UTF16( const string& input_line,
534+
const string& encoding ){
535+
/// cleanup a string from unwanted 0 bytes (UTF16) and CR
536+
/*!
537+
\param input_line the line to fixup
538+
\param encoding the encoding
539+
\return the cleaned UTF16 result
540+
541+
this is some hackery to handle exotic input. UTF16 but also CR at end.
542+
543+
*/
517544
string line = input_line;
518-
// some hackery to handle exotic input. UTF-16 but also CR at end.
519545
string::size_type pos = line.rfind( '\r' );
520546
if ( pos != string::npos ){
521547
line.erase( pos );
@@ -2740,6 +2766,12 @@ namespace Tokenizer {
27402766
}
27412767

27422768
string TokenizerClass::checkBOM( istream& in ){
2769+
/// check a stream for a Byte Order Marker
2770+
/// when present, use it to detect the encoding
2771+
/*!
2772+
\param in the inputstream
2773+
\return the detected encoding, or the default inputEncoding if not found
2774+
*/
27432775
string result = inputEncoding;
27442776
if ( &in == &cin ){
27452777
return result;
@@ -2754,15 +2786,16 @@ namespace Tokenizer {
27542786
&bomLength,
27552787
&err);
27562788
if ( bomLength ){
2789+
// so a BOM is found, and an encoding detected
27572790
if ( tokDebug ){
27582791
DBG << "Autodetected encoding: " << encoding << endl;
27592792
}
27602793
result = encoding;
2761-
if ( result == "UTF16BE"
2762-
|| result == "UTF-16BE" ){
2794+
if ( result == "UTF-16BE" ){
27632795
result = "UTF16BE";
27642796
}
27652797
}
2798+
// make sure to position the stream after the BOM
27662799
in.seekg( pos + (streampos)bomLength );
27672800
return result;
27682801
}
@@ -2928,6 +2961,12 @@ namespace Tokenizer {
29282961
}
29292962

29302963
UnicodeString replace_quoted_spaces( const UnicodeString& in ){
2964+
/// replace spaces inside quotes by a placeholder
2965+
/*!
2966+
\param in the input string
2967+
\return a string where every space inside quotes is replaced by a
2968+
placeholder
2969+
*/
29312970
UnicodeString result;
29322971
UChar32 quote = '\x0';
29332972
for ( int i=0; i < in.length(); ++i ){
@@ -2946,7 +2985,7 @@ namespace Tokenizer {
29462985
}
29472986
}
29482987
else if ( c == ' ' && quote != '\x0' ){
2949-
c = U'Ž'; // mark as Ž
2988+
c = SPACE_PLACEHOLDER;
29502989
}
29512990
result += c;
29522991
}
@@ -3204,6 +3243,11 @@ namespace Tokenizer {
32043243
|| special ) {
32053244
//single character, no need to process all rules, do some simpler (faster) detection
32063245
UChar32 c = input.char32At(0);
3246+
if ( c == SPACE_PLACEHOLDER ){
3247+
// will be translated back to a single space, ergo an empty token
3248+
// We don't want that
3249+
return;
3250+
}
32073251
UnicodeString type = detect_type( c );
32083252
if ( tokDebug >= 8 ){
32093253
DBG << " a single character: " << UnicodeString(c) << " type= "

tests/testall.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ for file in testusage testlanguage testconf1 testconf2 testinclude \
1818
testtokens testoption-P testoption-split testissue64 testissue66 \
1919
testissue71 testissue72 testissue70 testnbsp testcorrect \
2020
testtag testissue81 testissue83 testissue84 testissue87 \
21-
testissue68 testissue93 testoption-m testbatch
21+
testissue68 testissue93 testoption-m testbatch testkeepquotes
2222
do
2323
./testone.sh $file
2424
if [ $? -ne 0 ]; then

tests/testkeepquotes.ok

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
dit WORD BEGINOFSENTENCE NEWPARAGRAPH
2+
" PUNCTUATION
3+
is WORD
4+
een WORD
5+
test WORD NOSPACE
6+
" PUNCTUATION NOSPACE
7+
, PUNCTUATION
8+
en WORD
9+
" PUNCTUATION
10+
' PUNCTUATION
11+
dit WORD
12+
ook WORD
13+
' PUNCTUATION
14+
" PUNCTUATION NOSPACE
15+
. PUNCTUATION ENDOFSENTENCE
16+
17+
Bijzonder WORD NOSPACE BEGINOFSENTENCE
18+
! PUNCTUATION ENDOFSENTENCE
19+
20+
21+
dit WORD BEGINOFSENTENCE NEWPARAGRAPH
22+
" PUNCTUATION NOSPACE
23+
is een test WORD NOSPACE
24+
" PUNCTUATION NOSPACE
25+
, PUNCTUATION
26+
en WORD
27+
" PUNCTUATION NOSPACE
28+
' PUNCTUATION NOSPACE
29+
dit ook WORD NOSPACE
30+
' PUNCTUATION
31+
" PUNCTUATION NOSPACE
32+
. PUNCTUATION NOSPACE ENDOFSENTENCE
33+
Bijzonder WORD NOSPACE BEGINOFSENTENCE
34+
! PUNCTUATION ENDOFSENTENCE
35+
36+

tests/testkeepquotes.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/sh
2+
3+
$exe -L nld -v spaced_quotes.nl
4+
5+
$exe -L nld -v --keep-spaces-inside-quotes spaced_quotes.nl

0 commit comments

Comments
 (0)