|
| 1 | +/* |
| 2 | + * The MIT License |
| 3 | + * |
| 4 | + * Copyright 2016 Thibault Debatty. |
| 5 | + * |
| 6 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | + * of this software and associated documentation files (the "Software"), to deal |
| 8 | + * in the Software without restriction, including without limitation the rights |
| 9 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 | + * copies of the Software, and to permit persons to whom the Software is |
| 11 | + * furnished to do so, subject to the following conditions: |
| 12 | + * |
| 13 | + * The above copyright notice and this permission notice shall be included in |
| 14 | + * all copies or substantial portions of the Software. |
| 15 | + * |
| 16 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 22 | + * THE SOFTWARE. |
| 23 | + */ |
| 24 | +package info.debatty.java.lsh; |
| 25 | + |
| 26 | +import java.util.Random; |
| 27 | +import org.junit.Test; |
| 28 | + |
| 29 | +/** |
| 30 | + * |
| 31 | + * @author Thibault Debatty |
| 32 | + */ |
| 33 | +public class LSHMinHashTest { |
| 34 | + |
| 35 | + /** |
| 36 | + * Test of hash method, of class LSHMinHash. |
| 37 | + */ |
| 38 | + @Test |
| 39 | + public void testHash() { |
| 40 | + System.out.println("hash"); |
| 41 | + |
| 42 | + // proportion of 0's in the vectors |
| 43 | + // if the vectors are dense (lots of 1's), the average jaccard similarity |
| 44 | + // will be very high (especially for large vectors), and LSH |
| 45 | + // won't be able to distinguish them |
| 46 | + // as a result, all vectors will be binned in the same bucket... |
| 47 | + double sparsity = 0.75; |
| 48 | + |
| 49 | + // Number and size of vectors |
| 50 | + int count = 10000; |
| 51 | + int n = 100000; |
| 52 | + |
| 53 | + int stages = 2; |
| 54 | + int buckets = 10; |
| 55 | + |
| 56 | + // Let's generate some random sets |
| 57 | + boolean[][] vectors = new boolean[count][n]; |
| 58 | + Random rand = new Random(); |
| 59 | + |
| 60 | + for (int i = 0; i < count; i++) { |
| 61 | + for (int j = 0; j < n; j++) { |
| 62 | + vectors[i][j] = rand.nextDouble() > sparsity; |
| 63 | + } |
| 64 | + } |
| 65 | + |
| 66 | + LSHMinHash lsh = new LSHMinHash(stages, buckets, n); |
| 67 | + int[][] counts = new int[stages][buckets]; |
| 68 | + |
| 69 | + // Perform hashing |
| 70 | + for (boolean[] vector : vectors) { |
| 71 | + int[] hash = lsh.hash(vector); |
| 72 | + |
| 73 | + for (int i = 0; i < hash.length; i++) { |
| 74 | + // this will raise an ArrayIndexOutOfBoundsException |
| 75 | + // if the bin values are negatives or too large |
| 76 | + counts[i][hash[i]]++; |
| 77 | + } |
| 78 | + } |
| 79 | + } |
| 80 | +} |
0 commit comments