From 4006fdafcc16209f4cd240af932e8f066c884f6e Mon Sep 17 00:00:00 2001
From: adamd1008
Date: Thu, 25 Jan 2024 17:47:51 +0000
Subject: [PATCH] Add -p switch to include Chi-square p-value in terse (CSV) output
---
src/ent.c | 35 +++++++++++++++++++++++++----------
src/ent.html | 23 +++++++++++++++++++++--
2 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/src/ent.c b/src/ent.c
index bf981bc..47e6971 100644
--- a/src/ent.c
+++ b/src/ent.c
@@ -59,6 +59,7 @@ static void help(void)
printf("\n -c Print occurrence counts");
printf("\n -f Fold upper to lower case letters");
printf("\n -t Terse output in CSV format");
+ printf("\n -p Include Chi-square p-value in terse output (as decimal)");
printf("\n -u Print this message\n");
printf("\nVersion " VERSION);
printf("\nBy John Walker");
@@ -107,9 +108,10 @@ int main(int argc, char *argv[])
int counts = FALSE, /* Print character counts */
fold = FALSE, /* Fold upper to lower */
binary = FALSE, /* Treat input as a bitstream */
- terse = FALSE; /* Terse (CSV format) output */
+ terse = FALSE, /* Terse (CSV format) output */
+ csp = FALSE; /* Terse includes Chi^2 p-value */
- while ((opt = getopt(argc, argv, "bcftuv?BCFTUV")) != -1) {
+ while ((opt = getopt(argc, argv, "bcfptuv?BCFPTUV")) != -1) {
switch (toISOlower(opt)) {
case 'b':
binary = TRUE;
@@ -123,6 +125,10 @@ int main(int argc, char *argv[])
fold = TRUE;
break;
+ case 'p':
+ csp = TRUE;
+ break;
+
case 't':
terse = TRUE;
break;
@@ -200,22 +206,31 @@ int main(int argc, char *argv[])
}
fclose(fp);
- /* Complete calculation and return sequence metrics */
+ /* Complete calculation */
rt_end(&ent, &chisq, &mean, &montepi, &scc);
- if (terse) {
- printf("0,File-%ss,Entropy,Chi-square,Mean,Monte-Carlo-Pi,Serial-Correlation\n",
- binary ? "bit" : "byte");
- printf("1,%ld,%f,%f,%f,%f,%f\n",
- totalc, ent, chisq, mean, montepi, scc);
- }
-
/* Calculate probability of observed distribution occurring from
the results of the Chi-Square test */
chip = pochisq(chisq, (binary ? 1 : 255));
+ /* Print sequence metrics in terse (CSV) format if requested */
+
+ if (terse) {
+ if (csp) {
+ printf("0,File-%ss,Entropy,Chi-square,Chi-square-p-val,Mean,Monte-Carlo-Pi,Serial-Correlation\n",
+ binary ? "bit" : "byte");
+ printf("1,%ld,%f,%f,%f,%f,%f,%f\n",
+ totalc, ent, chisq, chip, mean, montepi, scc);
+ } else {
+ printf("0,File-%ss,Entropy,Chi-square,Mean,Monte-Carlo-Pi,Serial-Correlation\n",
+ binary ? "bit" : "byte");
+ printf("1,%ld,%f,%f,%f,%f,%f\n",
+ totalc, ent, chisq, mean, montepi, scc);
+ }
+ }
+
/* Print bin counts if requested */
if (counts) {
diff --git a/src/ent.html b/src/ent.html
index 77e2e59..f153758 100644
--- a/src/ent.html
+++ b/src/ent.html
@@ -127,7 +127,7 @@ NAME
SYNOPSIS
- ent [ -b -c -f -t -u ] [ infile ]
+ ent [ -b -c -f -p -t -u ] [ infile ]
DESCRIPTION
@@ -304,6 +304,12 @@ OPTIONS
Terse Mode Output Format
below for additional details.
+-p Used in conjunction with -t to
+ include the Chi-square p-value in the terse
+ output as a decimal fraction, not a percentage; it has no effect without -t. See
+ Terse Mode Output Format
+ below for additional details.
+
-u Print how-to-call information.
@@ -340,7 +346,20 @@
column title record. If the -b option is specified, the second
field of the type 0 record will be “File-bits”, and
the file_length field in type 1 record will be given
-in bits instead of bytes. If the -c option is specified,
+in bits instead of bytes.
+
+
+
+Specifying -p in conjunction with -t adds the Chi-square p-value to the CSV output as an extra field immediately after the Chi-square value. The p-value is given as a decimal fraction, not as a percentage. With -p, the title and data records become:
+
+
+
+0,File-bytes,Entropy,Chi-square,Chi-square-p-val,Mean,Monte-Carlo-Pi,Serial-Correlation
+1,file_length,entropy,chi_square,chi_square_p_val,mean,Pi_value,correlation
+
+
+
+If the -c option is specified,
additional records are appended to the terse mode output which
contain the character counts: