Skip to content

Commit 134577d

Browse files
committed
Added support for large vectors. Fixes issue #9
1 parent 2fea8b4 commit 134577d

3 files changed

Lines changed: 101 additions & 20 deletions

File tree

src/main/java/info/debatty/java/lsh/LSHMinHash.java

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,31 +33,31 @@ public class LSHMinHash extends LSH {
3333

3434
/**
3535
* Instantiates a LSH instance that internally uses MinHash,
36-
* with s stages (or bands) and b buckets (per stage), for sets out of a
36+
* with s stages (or bands) and b buckets (per stage), for sets out of a
3737
* dictionary of n elements.
38-
*
38+
*
3939
* Attention: the number of buckets should be chosen such that we have at
4040
* least 100 items per bucket.
41-
*
41+
*
4242
* @param s stages
4343
* @param b buckets (per stage)
4444
* @param n dictionary size
4545
*/
4646
public LSHMinHash(int s, int b, int n) {
4747
super(s, b, n);
48-
48+
4949
/**
5050
* "Mining of Massive Datasets", p.88.
51-
* It can be shown that, using MinHash, the probability that the
52-
* signatures of 2 sets with Jaccard similarity s agree in all the
53-
* rows of at least one stage (band), and therefore become a candidate
51+
* It can be shown that, using MinHash, the probability that the
52+
* signatures of 2 sets with Jaccard similarity s agree in all the
53+
* rows of at least one stage (band), and therefore become a candidate
5454
* pair, is 1−(1−s^R)^b
5555
* where R = signature_size / b (number of rows in a stage/band)
56-
* Thus, the curve that shows the probability that 2 items fall in the
57-
* same bucket for at least one of the stages, as a function of their
56+
* Thus, the curve that shows the probability that 2 items fall in the
57+
* same bucket for at least one of the stages, as a function of their
5858
* Jaccard index similarity, has an S shape.
59-
* The threshold (the value of similarity at which the probability of
60-
* becoming a candidate is 1/2) is a function of the number of stages
59+
* The threshold (the value of similarity at which the probability of
60+
* becoming a candidate is 1/2) is a function of the number of stages
6161
* (s, or bands b in the book) and the signature size:
6262
* threshold ≃ (1/s)^(1/R)
6363
* Hence the signature size can be computed as:
@@ -69,12 +69,12 @@ public LSHMinHash(int s, int b, int n) {
6969
int signature_size = R * s;
7070
this.mh = new MinHash(signature_size, n);
7171
}
72-
72+
7373
public int[] hash(boolean[] vector) {
7474
return hashSignature(this.mh.signature(vector));
7575
}
76-
77-
public int[][] getCoefficients() {
76+
77+
public long[][] getCoefficients() {
7878
return mh.getCoefficients();
7979
}
8080
}

src/main/java/info/debatty/java/lsh/MinHash.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public static int size(double error) {
8181
/**
8282
* Random a and b coefficients for the random hash functions
8383
*/
84-
private int[][] hash_coefs;
84+
private long[][] hash_coefs;
8585

8686
/**
8787
* Dictionary size
@@ -154,8 +154,8 @@ public int[] signature(Set<Integer> set) {
154154

155155
for (final int r : list) {
156156

157-
// However, if c has 1 in row r, then for each i = 1, 2, . . . ,n
158-
// set SIG(i, c) to the smaller of the current value of
157+
// However, if c has 1 in row r, then for each i = 1, 2, . . . ,n
158+
// set SIG(i, c) to the smaller of the current value of
159159
// SIG(i, c) and hi(r)
160160
for (int i = 0; i < n; i++) {
161161
sig[i] = Math.min(
@@ -207,7 +207,7 @@ private void init(int size, int dict_size) {
207207
// h = (a * x) + b
208208
// a and b should be randomly generated
209209
Random r = new Random();
210-
hash_coefs = new int[n][2];
210+
hash_coefs = new long[n][2];
211211
for (int i = 0; i < n; i++) {
212212
hash_coefs[i][0] = r.nextInt(dict_size); // a
213213
hash_coefs[i][1] = r.nextInt(dict_size); // b
@@ -222,10 +222,11 @@ private void init(int size, int dict_size) {
222222
* @return the hashed value of x, using ith hash function
223223
*/
224224
private int h(int i, int x) {
225-
return (hash_coefs[i][0] * x + hash_coefs[i][1]) % dict_size;
225+
return (int)
226+
((hash_coefs[i][0] * (long) x + hash_coefs[i][1]) % dict_size);
226227
}
227228

228-
public int[][] getCoefficients() {
229+
public long[][] getCoefficients() {
229230
return hash_coefs;
230231
}
231232
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
package info.debatty.java.lsh;
25+
26+
import java.util.Random;
27+
import org.junit.Test;
28+
29+
/**
30+
*
31+
* @author Thibault Debatty
32+
*/
33+
public class LSHMinHashTest {
34+
35+
/**
36+
* Test of hash method, of class LSHMinHash.
37+
*/
38+
@Test
39+
public void testHash() {
40+
System.out.println("hash");
41+
42+
// proportion of 0's in the vectors
43+
// if the vectors are dense (lots of 1's), the average jaccard similarity
44+
// will be very high (especially for large vectors), and LSH
45+
// won't be able to distinguish them
46+
// as a result, all vectors will be binned in the same bucket...
47+
double sparsity = 0.75;
48+
49+
// Number and size of vectors
50+
int count = 10000;
51+
int n = 100000;
52+
53+
int stages = 2;
54+
int buckets = 10;
55+
56+
// Let's generate some random sets
57+
boolean[][] vectors = new boolean[count][n];
58+
Random rand = new Random();
59+
60+
for (int i = 0; i < count; i++) {
61+
for (int j = 0; j < n; j++) {
62+
vectors[i][j] = rand.nextDouble() > sparsity;
63+
}
64+
}
65+
66+
LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
67+
int[][] counts = new int[stages][buckets];
68+
69+
// Perform hashing
70+
for (boolean[] vector : vectors) {
71+
int[] hash = lsh.hash(vector);
72+
73+
for (int i = 0; i < hash.length; i++) {
74+
// this will raise an ArrayIndexOutOfBoundsException
75+
// if the bin values are negative or too large
76+
counts[i][hash[i]]++;
77+
}
78+
}
79+
}
80+
}

0 commit comments

Comments
 (0)