Skip to content

Commit d5d1e28

Browse files
rstm-sf authored and 304NotModified committed
Refactor probers part (#63)
* refactoring to understand how to submit probers
* resolve discussion: return public permissions
1 parent ebd0bf0 commit d5d1e28

4 files changed

Lines changed: 79 additions & 62 deletions

File tree

src/CharsetDetector.cs

Lines changed: 66 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -100,12 +100,29 @@ public class CharsetDetector
100100
/// <summary>
101101
/// "list" of probers
102102
/// </summary>
103-
private readonly CharsetProber[] _charsetProbers = new CharsetProber[ProbersNum];
103+
private IList<CharsetProber> _charsetProbers;
104104

105105
/// <summary>
106106
/// TODO unknown
107107
/// </summary>
108-
private CharsetProber _escCharsetProber;
108+
private IList<CharsetProber> _escCharsetProber;
109+
110+
private IList<CharsetProber> CharsetProbers
111+
{
112+
get
113+
{
114+
switch (InputState)
115+
{
116+
case InputState.EscASCII:
117+
return _escCharsetProber;
118+
case InputState.Highbyte:
119+
return _charsetProbers;
120+
default:
121+
// pure ascii
122+
return new List<CharsetProber>();
123+
}
124+
}
125+
}
109126

110127
/// <summary>
111128
/// Detected charset. Most of the time <see cref="_done"/> is true
@@ -114,11 +131,6 @@ public class CharsetDetector
114131

115132
private const float MinimumThreshold = 0.20f;
116133

117-
/// <summary>
118-
/// tries
119-
/// </summary>
120-
private const int ProbersNum = 3;
121-
122134
private CharsetDetector()
123135
{
124136
_start = true;
@@ -182,7 +194,7 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
182194
}
183195

184196
var detector = new CharsetDetector();
185-
197+
186198
ReadStream(stream, maxBytesToRead, detector);
187199
return detector.DataEnd();
188200
}
@@ -264,7 +276,7 @@ public static DetectionResult DetectFromFile(FileInfo file)
264276
}
265277
}
266278

267-
#endif
279+
#endif // !NETSTANDARD1_0
268280

269281
protected virtual void Feed(byte[] buf, int offset, int len)
270282
{
@@ -279,50 +291,37 @@ protected virtual void Feed(byte[] buf, int offset, int len)
279291
// If the data starts with BOM, we know it is UTF
280292
if (_start)
281293
{
282-
var bomSet = FindCharSetByBom(buf, len);
283294
_start = false;
284-
if (bomSet != null)
285-
{
286-
_detectionDetail = new DetectionDetail(bomSet, 1.0f);
287-
_done = true;
295+
_done = IsStartsWithBom(buf, len);
296+
if (_done)
288297
return;
289-
}
290298
}
291299

292300
FindInputState(buf, len);
293-
294-
switch (InputState)
301+
foreach (var prober in CharsetProbers)
295302
{
296-
case InputState.EscASCII:
297-
298-
_escCharsetProber = _escCharsetProber ?? new EscCharsetProber();
299-
300-
RunProber(buf, offset, len, _escCharsetProber);
301-
302-
break;
303-
case InputState.Highbyte:
304-
for (int i = 0; i < ProbersNum; i++)
305-
{
306-
var charsetProber = _charsetProbers[i];
303+
_done = RunProber(buf, offset, len, prober);
304+
if (_done)
305+
return;
306+
};
307+
}
307308

308-
if (charsetProber != null)
309-
{
310-
var found = RunProber(buf, offset, len, charsetProber);
311-
if (found) return;
312-
}
313-
}
314-
break;
315-
// else pure ascii
309+
private bool IsStartsWithBom(byte[] buf, int len)
310+
{
311+
var bomSet = FindCharSetByBom(buf, len);
312+
if (bomSet != null)
313+
{
314+
_detectionDetail = new DetectionDetail(bomSet, 1.0f);
315+
return true;
316316
}
317+
return false;
317318
}
318319

319320
private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetProber)
320321
{
321322
var probingState = charsetProber.HandleData(buf, offset, len);
322-
323323
if (probingState == ProbingState.FoundIt)
324324
{
325-
_done = true;
326325
_detectionDetail = new DetectionDetail(charsetProber);
327326
return true;
328327
}
@@ -343,14 +342,7 @@ private void FindInputState(byte[] buf, int len)
343342

344343
// kill EscCharsetProber if it is active
345344
_escCharsetProber = null;
346-
347-
// start multibyte and singlebyte charset prober
348-
if (_charsetProbers[0] == null)
349-
_charsetProbers[0] = new MBCSGroupProber();
350-
if (_charsetProbers[1] == null)
351-
_charsetProbers[1] = new SBCSGroupProber();
352-
if (_charsetProbers[2] == null)
353-
_charsetProbers[2] = new Latin1Prober();
345+
_charsetProbers = _charsetProbers ?? GetNewProbers();
354346
}
355347
}
356348
else
@@ -360,6 +352,7 @@ private void FindInputState(byte[] buf, int len)
360352
{
361353
// found escape character or HZ "~{"
362354
InputState = InputState.EscASCII;
355+
_escCharsetProber = _escCharsetProber ?? GetNewProbers();
363356
}
364357
_lastChar = buf[i];
365358
}
@@ -431,18 +424,11 @@ private DetectionResult DataEnd()
431424

432425
if (InputState == InputState.Highbyte)
433426
{
434-
var list = new List<DetectionDetail>(ProbersNum);
435-
for (int i = 0; i < ProbersNum; i++)
436-
{
437-
var charsetProber = _charsetProbers[i];
438-
439-
if (charsetProber != null)
440-
{
441-
list.Add(new DetectionDetail(charsetProber));
442-
}
443-
}
444-
445-
var detectionResults = list.Where(p => p.Confidence > MinimumThreshold).OrderByDescending(p => p.Confidence).ToList();
427+
var detectionResults = _charsetProbers
428+
.Select(prober => new DetectionDetail(prober))
429+
.Where(result => result.Confidence > MinimumThreshold)
430+
.OrderByDescending(result => result.Confidence)
431+
.ToList();
446432

447433
return new DetectionResult(detectionResults);
448434

@@ -453,8 +439,30 @@ private DetectionResult DataEnd()
453439
//TODO why done isn't true?
454440
return new DetectionResult(new DetectionDetail("ASCII", 1.0f));
455441
}
442+
456443
return new DetectionResult();
457444
}
445+
446+
internal IList<CharsetProber> GetNewProbers()
447+
{
448+
switch (InputState)
449+
{
450+
case InputState.EscASCII:
451+
return new List<CharsetProber>() { new EscCharsetProber() };
452+
453+
case InputState.Highbyte:
454+
return new List<CharsetProber>()
455+
{
456+
new MBCSGroupProber(),
457+
new SBCSGroupProber(),
458+
new Latin1Prober(),
459+
};
460+
461+
default:
462+
// pure ascii
463+
return new List<CharsetProber>();
464+
}
465+
}
458466
}
459467
}
460468

src/Core/InputState.cs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ enum InputState
88
/// Found escape character or HZ "~{"
99
/// </summary>
1010
EscASCII = 1,
11+
1112
/// <summary>
1213
/// non-ascii byte (high-byte)
1314
/// </summary>

src/DetectionDetail.cs

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,18 +15,26 @@ public class DetectionDetail
1515
/// <summary>
1616
/// New result
1717
/// </summary>
18-
public DetectionDetail(string encodingShortName, float confidence, CharsetProber prober = null, TimeSpan? time = null, string statusLog = null)
18+
public DetectionDetail(
19+
string encodingShortName,
20+
float confidence,
21+
CharsetProber prober = null,
22+
TimeSpan? time = null,
23+
string statusLog = null)
1924
{
2025
EncodingName = encodingShortName.Split('(').First().Trim();
2126
Confidence = confidence;
2227

2328
try
2429
{
25-
Encoding = Encoding.GetEncoding(encodingShortName.Split('(').Last().Split(')').First().Trim());
30+
var encodingName = encodingShortName
31+
.Split('(').Last()
32+
.Split(')').First()
33+
.Trim();
34+
Encoding = Encoding.GetEncoding(encodingName);
2635
}
2736
catch (Exception)
2837
{
29-
3038
//wrong name
3139
}
3240

src/DetectionResult.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ public DetectionResult(DetectionDetail detectionDetail)
4141
/// <summary>
4242
/// All results
4343
/// </summary>
44-
public IList<DetectionDetail> Details { set; get; }
44+
public IList<DetectionDetail> Details { get; set; }
4545

4646
public override string ToString()
4747
{

0 commit comments

Comments
 (0)