@@ -100,12 +100,29 @@ public class CharsetDetector
100100 /// <summary>
101101 /// "list" of probers
102102 /// </summary>
103- private readonly CharsetProber [ ] _charsetProbers = new CharsetProber [ ProbersNum ] ;
103+ private IList < CharsetProber > _charsetProbers ;
104104
105105 /// <summary>
106106 /// TODO unknown
107107 /// </summary>
108- private CharsetProber _escCharsetProber ;
108+ private IList < CharsetProber > _escCharsetProber ;
109+
/// <summary>
/// Probers that apply to the current <c>InputState</c>.
/// Never returns null: if the list for the current state has not been created
/// (or has been reset to null), an empty list is returned so that callers can
/// enumerate the result without a null check.
/// </summary>
private IList<CharsetProber> CharsetProbers
{
    get
    {
        switch (InputState)
        {
            case InputState.EscASCII:
                // The backing field is created lazily and is set back to null
                // when the escape prober is killed, so guard against null here.
                return _escCharsetProber ?? new List<CharsetProber>();
            case InputState.Highbyte:
                // Same lazy creation applies to the high-byte probers.
                return _charsetProbers ?? new List<CharsetProber>();
            default:
                // pure ascii: there is nothing to probe
                return new List<CharsetProber>();
        }
    }
}
109126
110127 /// <summary>
111128 /// Detected charset. Most of the time <see cref="_done"/> is true
@@ -114,11 +131,6 @@ public class CharsetDetector
114131
115132 private const float MinimumThreshold = 0.20f ;
116133
117- /// <summary>
118- /// tries
119- /// </summary>
120- private const int ProbersNum = 3 ;
121-
122134 private CharsetDetector ( )
123135 {
124136 _start = true ;
@@ -182,7 +194,7 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
182194 }
183195
184196 var detector = new CharsetDetector ( ) ;
185-
197+
186198 ReadStream ( stream , maxBytesToRead , detector ) ;
187199 return detector . DataEnd ( ) ;
188200 }
@@ -264,7 +276,7 @@ public static DetectionResult DetectFromFile(FileInfo file)
264276 }
265277 }
266278
267- #endif
279+ #endif // !NETSTANDARD1_0
268280
269281 protected virtual void Feed ( byte [ ] buf , int offset , int len )
270282 {
@@ -279,50 +291,37 @@ protected virtual void Feed(byte[] buf, int offset, int len)
279291 // If the data starts with BOM, we know it is UTF
280292 if ( _start )
281293 {
282- var bomSet = FindCharSetByBom ( buf , len ) ;
283294 _start = false ;
284- if ( bomSet != null )
285- {
286- _detectionDetail = new DetectionDetail ( bomSet , 1.0f ) ;
287- _done = true ;
295+ _done = IsStartsWithBom ( buf , len ) ;
296+ if ( _done )
288297 return ;
289- }
290298 }
291299
292300 FindInputState ( buf , len ) ;
293-
294- switch ( InputState )
301+ foreach ( var prober in CharsetProbers )
295302 {
296- case InputState . EscASCII :
297-
298- _escCharsetProber = _escCharsetProber ?? new EscCharsetProber ( ) ;
299-
300- RunProber ( buf , offset , len , _escCharsetProber ) ;
301-
302- break ;
303- case InputState . Highbyte :
304- for ( int i = 0 ; i < ProbersNum ; i ++ )
305- {
306- var charsetProber = _charsetProbers [ i ] ;
303+ _done = RunProber ( buf , offset , len , prober ) ;
304+ if ( _done )
305+ return ;
306+ } ;
307+ }
307308
308- if ( charsetProber != null )
309- {
310- var found = RunProber ( buf , offset , len , charsetProber ) ;
311- if ( found ) return ;
312- }
313- }
314- break ;
315- // else pure ascii
/// <summary>
/// Asks <c>FindCharSetByBom</c> for a charset matching the start of the buffer and,
/// when one is reported, stores it in <see cref="_detectionDetail"/> with a
/// confidence of 1.0.
/// </summary>
/// <param name="buf">Buffer passed through to <c>FindCharSetByBom</c>.</param>
/// <param name="len">Length value passed through to <c>FindCharSetByBom</c>.</param>
/// <returns>
/// <c>true</c> if <c>FindCharSetByBom</c> returned a non-null charset;
/// otherwise <c>false</c> (and <see cref="_detectionDetail"/> is left untouched).
/// </returns>
private bool IsStartsWithBom(byte[] buf, int len)
{
    var charsetFromBom = FindCharSetByBom(buf, len);
    if (charsetFromBom == null)
    {
        // No BOM match reported: nothing to record.
        return false;
    }

    _detectionDetail = new DetectionDetail(charsetFromBom, 1.0f);
    return true;
}
318319
319320 private bool RunProber ( byte [ ] buf , int offset , int len , CharsetProber charsetProber )
320321 {
321322 var probingState = charsetProber . HandleData ( buf , offset , len ) ;
322-
323323 if ( probingState == ProbingState . FoundIt )
324324 {
325- _done = true ;
326325 _detectionDetail = new DetectionDetail ( charsetProber ) ;
327326 return true ;
328327 }
@@ -343,14 +342,7 @@ private void FindInputState(byte[] buf, int len)
343342
344343 // kill EscCharsetProber if it is active
345344 _escCharsetProber = null ;
346-
347- // start multibyte and singlebyte charset prober
348- if ( _charsetProbers [ 0 ] == null )
349- _charsetProbers [ 0 ] = new MBCSGroupProber ( ) ;
350- if ( _charsetProbers [ 1 ] == null )
351- _charsetProbers [ 1 ] = new SBCSGroupProber ( ) ;
352- if ( _charsetProbers [ 2 ] == null )
353- _charsetProbers [ 2 ] = new Latin1Prober ( ) ;
345+ _charsetProbers = _charsetProbers ?? GetNewProbers ( ) ;
354346 }
355347 }
356348 else
@@ -360,6 +352,7 @@ private void FindInputState(byte[] buf, int len)
360352 {
361353 // found escape character or HZ "~{"
362354 InputState = InputState . EscASCII ;
355+ _escCharsetProber = _escCharsetProber ?? GetNewProbers ( ) ;
363356 }
364357 _lastChar = buf [ i ] ;
365358 }
@@ -431,18 +424,11 @@ private DetectionResult DataEnd()
431424
432425 if ( InputState == InputState . Highbyte )
433426 {
434- var list = new List < DetectionDetail > ( ProbersNum ) ;
435- for ( int i = 0 ; i < ProbersNum ; i ++ )
436- {
437- var charsetProber = _charsetProbers [ i ] ;
438-
439- if ( charsetProber != null )
440- {
441- list . Add ( new DetectionDetail ( charsetProber ) ) ;
442- }
443- }
444-
445- var detectionResults = list . Where ( p => p . Confidence > MinimumThreshold ) . OrderByDescending ( p => p . Confidence ) . ToList ( ) ;
427+ var detectionResults = _charsetProbers
428+ . Select ( prober => new DetectionDetail ( prober ) )
429+ . Where ( result => result . Confidence > MinimumThreshold )
430+ . OrderByDescending ( result => result . Confidence )
431+ . ToList ( ) ;
446432
447433 return new DetectionResult ( detectionResults ) ;
448434
@@ -453,8 +439,30 @@ private DetectionResult DataEnd()
453439 //TODO why done isn't true?
454440 return new DetectionResult ( new DetectionDetail ( "ASCII" , 1.0f ) ) ;
455441 }
442+
456443 return new DetectionResult ( ) ;
457444 }
445+
/// <summary>
/// Builds a fresh list of probers for the current <c>InputState</c>.
/// </summary>
/// <returns>
/// A new list holding one <c>EscCharsetProber</c> for <c>EscASCII</c>;
/// a new list holding an <c>MBCSGroupProber</c>, an <c>SBCSGroupProber</c> and a
/// <c>Latin1Prober</c> (in that order) for <c>Highbyte</c>;
/// an empty list for any other state.
/// </returns>
internal IList<CharsetProber> GetNewProbers()
{
    var currentState = InputState;
    var freshProbers = new List<CharsetProber>();

    if (currentState == InputState.EscASCII)
    {
        freshProbers.Add(new EscCharsetProber());
        return freshProbers;
    }

    if (currentState == InputState.Highbyte)
    {
        // Order matters to callers that iterate the list: multi-byte group first,
        // then single-byte group, then Latin1.
        freshProbers.Add(new MBCSGroupProber());
        freshProbers.Add(new SBCSGroupProber());
        freshProbers.Add(new Latin1Prober());
        return freshProbers;
    }

    // pure ascii: no probers are needed
    return freshProbers;
}
458466 }
459467}
460468
0 commit comments