Skip to content

Commit de9b544

Browse files
committed
CTPH
1 parent 3d2c704 commit de9b544

4 files changed

Lines changed: 333 additions & 1 deletion

File tree

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,8 @@ $lcs = new \webd\language\LCS($str1, $str2);
4141
echo $lcs->value();
4242
echo $lcs->length();
4343
echo $lcs->distance();
44+
45+
// SpamSum, aka ssdeep, aka Context-Triggered Piecewise Hashing (CTPH):
46+
$s = new \webd\language\SpamSum;
47+
echo $s->HashString(file_get_contents($f));
4448
```

src/webd/language/LCS.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
<?php
2-
32
namespace webd\language;
43

54
/**

src/webd/language/SpamSum.php

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
<?php
2+
namespace webd\language;
3+
4+
/**
5+
* PHP implementation of the SpamSum algorithm, also called ssdeep or
6+
* context-triggered piecewise hashing
7+
*/
8+
class SpamSum
{
    /** Multiplier of the FNV-style piece hash. */
    const HASH_PRIME = 0x01000193;

    /** Initial value of the piece hash; it is restored after each emitted character. */
    const HASH_INIT = 0x28021967;

    /** Alphabet from which signature characters are picked. */
    const B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    /** Number of trailing bytes tracked by the rolling hash. */
    const ROLLING_WINDOW = 7;

    /** Mask used to keep hash values within unsigned 32-bit range. */
    const MASK32 = 0xFFFFFFFF;

    /** Maximum number of characters in the left signature. */
    protected $SPAMSUM_LENGTH = 64;

    /** Smallest block size the hasher may use. */
    protected $MIN_BLOCKSIZE = 3;

    /** Number of characters of B64 actually used for the signatures. */
    protected $LETTERS = 64;

    /** Block size used by the most recent HashString() call (0 before any call). */
    protected $BLOCKSIZE = 0;

    /** Characters of the signature computed with BLOCKSIZE. */
    protected $left = array();

    /** Characters of the signature computed with BLOCKSIZE * 2. */
    protected $right = array();

    /** Circular buffer holding the last ROLLING_WINDOW bytes. */
    protected $rolling_window = array();

    /** Sum of the bytes currently in the window. */
    protected $rolling_h1;

    /** Position-weighted sum of the bytes in the window. */
    protected $rolling_h2;

    /** Shift/xor hash of the most recent bytes, kept on 32 bits. */
    protected $rolling_h3;

    /** Number of bytes fed to the rolling hash since the last reset. */
    protected $rolling_n;

    /**
     * Compute the SpamSum of a string using the default parameters:
     * length = 64 characters
     * 64 possible letters (Base64)
     * min blocksize = 3
     * block size computed automatically
     *
     * @param string $string
     * @return \webd\language\SpamSum the hasher holding the result
     */
    public static function Hash($string) {
        $hasher = new self();
        return $hasher->HashString($string);
    }

    /**
     * Set the maximum length of the left signature (the right signature is
     * limited to half of it).
     *
     * @param int $l
     * @throws \InvalidArgumentException if $l is smaller than 1, because the
     *         block size guess would never terminate
     */
    public function SetHashLength($l) {
        if ($l < 1) {
            throw new \InvalidArgumentException("Hash length must be at least 1");
        }
        $this->SPAMSUM_LENGTH = $l;
    }

    /**
     * Set how many characters of the Base64 alphabet are used.
     *
     * @param int $l
     * @throws \InvalidArgumentException if $l is not between 1 and the size
     *         of the alphabet
     */
    public function SetLetters($l) {
        if ($l < 1 || $l > strlen(self::B64)) {
            throw new \InvalidArgumentException(
                "Number of letters must be between 1 and " . strlen(self::B64));
        }
        $this->LETTERS = $l;
    }

    /**
     * Set the smallest block size that may be used.
     *
     * @param int $s
     * @throws \InvalidArgumentException if $s is smaller than 1, because a
     *         zero block size can never grow and cannot be used as a modulus
     */
    public function SetMinBlocksize($s) {
        if ($s < 1) {
            throw new \InvalidArgumentException("Minimum block size must be at least 1");
        }
        $this->MIN_BLOCKSIZE = $s;
    }

    /**
     * Compute the signature of $string and store it in this object.
     *
     * The block size is guessed from the input length on every call, so the
     * same instance can safely hash several strings in a row.
     *
     * @param string $string
     * @return \webd\language\SpamSum $this, to allow chaining / string cast
     */
    public function HashString($string) {
        $b64 = self::B64;
        $length = strlen($string);

        // unpack() yields a 1-based array; array_values() makes it 0-based.
        $in = $length > 0 ? array_values(unpack('C*', $string)) : array();

        // Guess a reasonable block size: the smallest MIN_BLOCKSIZE * 2^n for
        // which a full-length signature could cover the whole input.
        $this->BLOCKSIZE = $this->MIN_BLOCKSIZE;
        while ($this->BLOCKSIZE * $this->SPAMSUM_LENGTH < $length) {
            $this->BLOCKSIZE = $this->BLOCKSIZE * 2;
        }

        while (true) {
            $this->left = array();
            $this->right = array();

            $j = $k = 0;
            $h2 = $h3 = self::HASH_INIT;
            $h = $this->rolling_hash_reset();

            for ($i = 0; $i < $length; $i++) {
                /* At each byte we update the rolling hash and both piece
                 * hashes. When the rolling hash hits a reset value we emit
                 * the piece hash as one character of the signature.
                 */
                $h = $this->rolling_hash($in[$i]);
                $h2 = self::sum_hash($in[$i], $h2);
                $h3 = self::sum_hash($in[$i], $h3);

                if ($h % $this->BLOCKSIZE == ($this->BLOCKSIZE - 1)) {
                    /* Reset point: emit a character based on all bytes seen
                     * since the previous reset point.
                     */
                    $this->left[$j] = $b64[$h2 % $this->LETTERS];

                    /* Only start a new piece while there is room left in the
                     * signature; otherwise the last pieces of the input are
                     * merged into the final character.
                     */
                    if ($j < $this->SPAMSUM_LENGTH - 1) {
                        $h2 = self::HASH_INIT;
                        $j++;
                    }
                }

                /* Second signature with a block size of BLOCKSIZE * 2, which
                 * reduces the effect of small changes near a block boundary.
                 */
                if ($h % ($this->BLOCKSIZE * 2) == (($this->BLOCKSIZE * 2) - 1)) {
                    $this->right[$k] = $b64[$h3 % $this->LETTERS];
                    if ($k < $this->SPAMSUM_LENGTH / 2 - 1) {
                        $h3 = self::HASH_INIT;
                        $k++;
                    }
                }
            }

            /* If anything is left, add it to the end so that the last part
             * of the string is always considered.
             */
            if ($h != 0) {
                $this->left[$j] = $b64[$h2 % $this->LETTERS];
                $this->right[$k] = $b64[$h3 % $this->LETTERS];
            }

            /* The block size guess may have been too large: retry with half
             * of it while the signature is shorter than half its maximum.
             */
            if ($this->BLOCKSIZE > $this->MIN_BLOCKSIZE
                    && $j < $this->SPAMSUM_LENGTH / 2) {
                $this->BLOCKSIZE = $this->BLOCKSIZE / 2;
                continue;
            }

            break;
        }

        return $this;
    }

    /**
     * @return string the signature formatted as "blocksize:left:right"
     */
    public function __toString() {
        return $this->BLOCKSIZE . ":" . $this->Left() . ":" . $this->Right();
    }

    /**
     * @return int block size used by the last HashString() call
     */
    public function BlockSize() {
        return $this->BLOCKSIZE;
    }

    /**
     * @return string signature computed with the block size
     */
    public function Left() {
        return implode("", $this->left);
    }

    /**
     * @return string signature computed with twice the block size
     */
    public function Right() {
        return implode("", $this->right);
    }

    /**
     * A simple non-rolling hash, based on the FNV hash.
     *
     * The product can need up to 57 bits before masking, so this assumes
     * PHP integers are 64 bits wide.
     *
     * @param int $c byte value to mix in
     * @param int $h current hash value
     * @return int new hash value, on 32 bits
     */
    protected static function sum_hash($c, $h) {
        return (($h * self::HASH_PRIME) & self::MASK32) ^ $c;
    }

    /**
     * A rolling hash, based on the Adler checksum. By using a rolling hash
     * we can perform auto resynchronisation after inserts/deletes.
     * h1 is the sum of the bytes in the window, h2 is the sum of the bytes
     * times their index, and h3 is a shift/xor based rolling hash.
     *
     * @param int $c next byte value
     * @return int rolling hash value, truncated to 32 bits
     */
    protected function rolling_hash($c) {
        $slot = $this->rolling_n % self::ROLLING_WINDOW;

        // h2 must be updated with the value of h1 from before this byte.
        $this->rolling_h2 += self::ROLLING_WINDOW * $c - $this->rolling_h1;

        // Replace the oldest byte of the window by the new one.
        $this->rolling_h1 += $c - $this->rolling_window[$slot];
        $this->rolling_window[$slot] = $c;
        $this->rolling_n++;

        $this->rolling_h3 = (($this->rolling_h3 << 5) & self::MASK32) ^ $c;

        // The sum can exceed 32 bits; truncate it so the reset-point test
        // behaves like unsigned 32-bit arithmetic.
        return ($this->rolling_h1 + $this->rolling_h2 + $this->rolling_h3)
                & self::MASK32;
    }

    /**
     * Clear the rolling window and its three partial hashes.
     *
     * @return int 0, the value of the rolling hash after a reset
     */
    protected function rolling_hash_reset() {
        $this->rolling_window = array_fill(0, self::ROLLING_WINDOW, 0);

        $this->rolling_h1 = 0;
        $this->rolling_h2 = 0;
        $this->rolling_h3 = 0;
        $this->rolling_n = 0;

        return 0;
    }
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
<?php
2+
3+
namespace webd\language;
4+
5+
/**
6+
* Generated by PHPUnit_SkeletonGenerator 1.2.1 on 2014-12-16 at 13:38:17.
7+
*/
8+
class SpamSumTest extends \PHPUnit_Framework_TestCase {

    /**
     * @var SpamSum
     */
    protected $object;

    // The typos in this fixture are intentional: the expected hashes below
    // were computed from this exact byte sequence.
    protected $str1 = "A string, for example the title of a spam, or the text of an email, or even the content of a webpage... Who knows? With some additional characters to make it long enough... and some more characters, I hope I will evnetually reach a sufficient lenght....\n";

    /**
     * Creates a fresh hasher with default parameters before each test.
     */
    protected function setUp() {
        $this->object = new SpamSum;
    }

    /**
     * Nothing to release after a test.
     */
    protected function tearDown() {

    }

    /**
     * @covers webd\language\SpamSum::HashString
     */
    public function testHashString() {
        // Expected value first; cast explicitly instead of relying on the
        // comparator converting the returned object to a string.
        $this->assertEquals(
                "6:MZEYWZDrpCGgFLLELGrX+TPdLgN98M6S8HROQ9Svb:M+hpTGgiNiM58LSj",
                (string) $this->object->HashString($this->str1));
    }

    /**
     * @covers webd\language\SpamSum::SetHashLength
     */
    public function testSetHashLength() {
        $s = new SpamSum;
        $s->SetHashLength(10);
        $s->HashString($this->str1);
        $this->assertEquals(
                "M0Gj58Lo",
                $s->Left());
    }

    /**
     * @covers webd\language\SpamSum::SetLetters
     */
    public function testSetLetters() {
        $s = new SpamSum;
        $s->SetLetters(8);
        $s->HashString($this->str1);
        $this->assertEquals(
                "EBEAGBDDBCGAFDDEDGDHGDHFDAFFEECCEHBGAFCHD",
                $s->Left());
    }

    /**
     * @covers webd\language\SpamSum::SetMinBlocksize
     */
    public function testSetMinBlocksize() {
        $s = new SpamSum;
        $s->SetMinBlocksize(1);
        $s->HashString($this->str1);
        $this->assertEquals(
                "4:M1yuN7qZF30RqjKgBDlWdH0eKyXCBMqGUAiDmNA1XEGAnFNuoILPaFAAhNj:MLN7qZvjKgJU0VmC7GmSFL8PaFAAhh",
                (string) $s);
    }

    /**
     * @covers webd\language\SpamSum::__toString
     */
    public function test__toString() {
        $this->object->HashString($this->str1);
        $this->assertEquals(
                "6:MZEYWZDrpCGgFLLELGrX+TPdLgN98M6S8HROQ9Svb:M+hpTGgiNiM58LSj",
                $this->object->__toString());
    }

    /**
     * @covers webd\language\SpamSum::BlockSize
     */
    public function testBlockSize() {
        $this->object->HashString($this->str1);
        $this->assertEquals(6, $this->object->BlockSize());
    }

    /**
     * @covers webd\language\SpamSum::Left
     */
    public function testLeft() {
        $this->object->HashString($this->str1);
        $this->assertEquals(
                "MZEYWZDrpCGgFLLELGrX+TPdLgN98M6S8HROQ9Svb",
                $this->object->Left());
    }

    /**
     * @covers webd\language\SpamSum::Right
     */
    public function testRight() {
        $this->object->HashString($this->str1);
        $this->assertEquals(
                "M+hpTGgiNiM58LSj",
                $this->object->Right());
    }

}

0 commit comments

Comments
 (0)