<?php
namespace webd\language;

/**
 * PHP implementation of the SpamSum algorithm, also called ssdeep or
 * context-triggered piecewise hashing.
 *
 * A hash is made of three parts, rendered by __toString() as
 * "blocksize:left:right": the block size that was used, the signature
 * computed with that block size, and a second signature computed with twice
 * that block size.
 *
 * NOTE(review): the 32-bit arithmetic below relies on PHP integers being
 * 64 bits wide (as the original "% pow(2, 32)" code already did).
 */
class SpamSum
{
    const HASH_PRIME = 0x01000193;
    const HASH_INIT = 0x28021967;
    const B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    /* Number of bytes kept in the rolling hash window */
    const ROLLING_WINDOW = 7;

    /**
     * Compute the SpamSum of string using default parameters:
     * length = 64 characters
     * 64 possible letters (Base64)
     * min blocksize = 3
     * block size computed automatically
     *
     * @param string $string Bytes to hash
     * @return \webd\language\SpamSum The instance holding the computed hash
     */
    public static function Hash($string) {
        $ss = new self();
        $ss->HashString($string);
        return $ss;
    }

    /* Maximum number of characters of the left signature */
    protected $SPAMSUM_LENGTH = 64;
    /* Smallest block size that will ever be used */
    protected $MIN_BLOCKSIZE = 3;
    /* Number of characters of B64 used to encode the signatures */
    protected $LETTERS = 64;
    /* Block size of the last hash; 0 means "guess it from the input length" */
    protected $BLOCKSIZE = 0;

    /* Signature characters for BLOCKSIZE (left) and BLOCKSIZE * 2 (right).
     * Initialised to empty arrays so that Left(), Right() and __toString()
     * are usable before anything has been hashed.
     */
    protected $left = array();
    protected $right = array();

    /* Block size that the previous HashString() call chose on its own, or
     * null if it did not choose one. Lets the next call tell a guessed value
     * (to be recomputed for the new input) from one set explicitly.
     */
    private $guessedBlocksize = null;

    /* State of the rolling hash, see rolling_hash() */
    protected $rolling_window = array();
    protected $rolling_h1;
    protected $rolling_h2;
    protected $rolling_h3;
    protected $rolling_n;

    /**
     * Set the maximum length of the left signature (the right one is limited
     * to half of it).
     *
     * @param int $l Length, at least 1
     * @throws \InvalidArgumentException if $l is smaller than 1 (a length of
     *         0 would make the block size guess loop forever)
     */
    public function SetHashLength($l) {
        if ((int) $l < 1) {
            throw new \InvalidArgumentException('Hash length must be at least 1');
        }
        $this->SPAMSUM_LENGTH = (int) $l;
    }

    /**
     * Set how many characters of the alphabet are used in the signatures.
     *
     * @param int $l Number of letters, between 1 and strlen(self::B64)
     * @throws \InvalidArgumentException if $l is outside that range (larger
     *         values would index past the end of the alphabet)
     */
    public function SetLetters($l) {
        if ((int) $l < 1 || (int) $l > strlen(self::B64)) {
            throw new \InvalidArgumentException(
                'Number of letters must be between 1 and ' . strlen(self::B64)
            );
        }
        $this->LETTERS = (int) $l;
    }

    /**
     * Set the smallest block size that may be used.
     *
     * @param int $s Block size, at least 1
     * @throws \InvalidArgumentException if $s is smaller than 1 (a block
     *         size of 0 can never grow and cannot be used as a modulus)
     */
    public function SetMinBlocksize($s) {
        if ((int) $s < 1) {
            throw new \InvalidArgumentException('Minimum block size must be at least 1');
        }
        $this->MIN_BLOCKSIZE = (int) $s;
    }

    /**
     * Compute the hash of a string and store it in this instance.
     *
     * The block size is guessed from the input length, unless BLOCKSIZE was
     * set to a non-zero value by other means (e.g. a subclass). A block size
     * guessed by a previous call is not reused: it is guessed again for the
     * new input, so an instance can safely hash several strings in a row.
     *
     * @param string $string Bytes to hash
     * @return \webd\language\SpamSum $this
     */
    public function HashString($string) {
        $string = (string) $string;
        $length = strlen($string);

        // unpack() yields the byte values in order; only the values are
        // iterated, so its 1-based keys do not matter
        $bytes = $length > 0 ? unpack('C*', $string) : array();

        // Guess a reasonable block size
        $guess = ($this->BLOCKSIZE == 0 || $this->BLOCKSIZE === $this->guessedBlocksize);
        if ($guess) {
            $this->BLOCKSIZE = $this->MIN_BLOCKSIZE;

            while ($this->BLOCKSIZE * $this->SPAMSUM_LENGTH < $length) {
                $this->BLOCKSIZE = $this->BLOCKSIZE * 2;
            }
        }

        while (true) {
            $emitted = $this->buildSignatures($bytes);

            /* Our blocksize guess may have been way off - repeat with half
             * the block size while the left signature is less than half full
             */
            if ($this->BLOCKSIZE > $this->MIN_BLOCKSIZE
                    && $emitted < $this->SPAMSUM_LENGTH / 2) {
                // Cast keeps the block size an integer even for odd values
                $this->BLOCKSIZE = (int) ($this->BLOCKSIZE / 2);
                continue;
            }
            break;
        }

        $this->guessedBlocksize = $guess ? $this->BLOCKSIZE : null;

        return $this;
    }

    /**
     * Run one pass over the input with the current BLOCKSIZE, filling
     * $this->left and $this->right.
     *
     * @param array $bytes Byte values (0-255) of the input, in order
     * @return int Number of reset points that advanced the left signature
     */
    private function buildSignatures(array $bytes) {
        $alphabet = self::B64;
        $blocksize = $this->BLOCKSIZE;
        $doubleBlocksize = $blocksize * 2;

        $this->left = array();
        $this->right = array();

        $j = $k = 0;
        $leftHash = $rightHash = self::HASH_INIT;
        $rolling = $this->rolling_hash_reset();

        foreach ($bytes as $byte) {
            /* at each character we update the rolling hash and the normal
             * hashes. When the rolling hash hits the reset value then we
             * emit the normal hash as an element of the signature and reset
             * it
             */
            $rolling = $this->rolling_hash($byte);
            $leftHash = self::sum_hash($byte, $leftHash);
            $rightHash = self::sum_hash($byte, $rightHash);

            if ($rolling % $blocksize == $blocksize - 1) {
                /* we have hit a reset point. We now emit a hash which is
                 * based on all characters in the piece of the string between
                 * the last reset point and this one
                 */
                $this->left[$j] = $alphabet[$leftHash % $this->LETTERS];

                /* we can have a problem with the tail overflowing. The
                 * easiest way to cope with this is to only reset the hash if
                 * we have room for more characters in our signature. This
                 * has the effect of combining the last few pieces of the
                 * message into a single piece
                 */
                if ($j < $this->SPAMSUM_LENGTH - 1) {
                    $leftHash = self::HASH_INIT;
                    $j++;
                }
            }

            /* this produces a second signature with a block size of
             * block_size*2. By producing dual signatures in this way the
             * effect of small changes in the string near a block size
             * boundary is greatly reduced.
             */
            if ($rolling % $doubleBlocksize == $doubleBlocksize - 1) {
                $this->right[$k] = $alphabet[$rightHash % $this->LETTERS];
                if ($k < $this->SPAMSUM_LENGTH / 2 - 1) {
                    $rightHash = self::HASH_INIT;
                    $k++;
                }
            }
        }

        /* If we have anything left then add it to the end. This ensures that
         * the last part of the string is always considered
         */
        if ($rolling != 0) {
            $this->left[$j] = $alphabet[$leftHash % $this->LETTERS];
            $this->right[$k] = $alphabet[$rightHash % $this->LETTERS];
        }

        return $j;
    }

    /**
     * @return string The hash formatted as "blocksize:left:right"
     */
    public function __toString() {
        return $this->BLOCKSIZE . ":" . $this->Left() . ":" . $this->Right();
    }

    /**
     * @return int Block size used by the last hash (0 if nothing was hashed)
     */
    public function BlockSize() {
        return $this->BLOCKSIZE;
    }

    /**
     * @return string Signature computed with the block size
     */
    public function Left() {
        return implode("", $this->left);
    }

    /**
     * @return string Signature computed with twice the block size
     */
    public function Right() {
        return implode("", $this->right);
    }

    /* A simple non-rolling hash, based on the FNV hash: multiply by the
     * prime, keep the low 32 bits, then xor with the byte
     */
    protected static function sum_hash($c, $h) {
        return (($h * self::HASH_PRIME) & 0xFFFFFFFF) ^ $c;
    }

    /* A rolling hash, based on the Adler checksum. By using a rolling hash
     * we can perform auto resynchronisation after inserts/deletes internally,
     * h1 is the sum of the bytes in the window and h2 is the sum of the bytes
     * times the index h3 is a shift/xor based rolling hash, and is mostly
     * needed to ensure that we can cope with large blocksize values
     */
    protected function rolling_hash($c) {
        $slot = $this->rolling_n % self::ROLLING_WINDOW;

        $this->rolling_h2 -= $this->rolling_h1;
        $this->rolling_h2 += self::ROLLING_WINDOW * $c;

        // Add the new byte and drop the one leaving the window
        $this->rolling_h1 += $c;
        $this->rolling_h1 -= $this->rolling_window[$slot];

        $this->rolling_window[$slot] = $c;
        $this->rolling_n++;

        $this->rolling_h3 = ($this->rolling_h3 << 5) & 0xFFFFFFFF;
        $this->rolling_h3 ^= $c;

        /* h3 alone can use all 32 bits, so the sum is truncated as well to
         * behave like an unsigned 32-bit addition.
         * NOTE(review): intended to match the wrap-around of the C reference
         * implementation - confirm against it
         */
        return ($this->rolling_h1 + $this->rolling_h2 + $this->rolling_h3) & 0xFFFFFFFF;
    }

    /* Clear the rolling hash state; returns the initial hash value (0) */
    protected function rolling_hash_reset() {
        $this->rolling_window = array_fill(0, self::ROLLING_WINDOW, 0);

        $this->rolling_h1 = 0;
        $this->rolling_h2 = 0;
        $this->rolling_h3 = 0;
        $this->rolling_n = 0;

        return 0;
    }

}
0 commit comments