|
1 | | -import random |
2 | 1 | from collections import defaultdict |
| 2 | +import random |
3 | 3 |
|
4 | 4 | from graphblas.binary import plus |
5 | 5 | from graphblas.core.dtypes import BOOL, INT32 |
6 | 6 | from graphblas.core.matrix import Matrix |
7 | 7 | from graphblas.core.vector import Vector |
8 | 8 |
|
9 | 9 | from cfpq_decomposer.abstract_decomposer import AbstractDecomposer |
| 10 | +from cfpq_decomposer.constants import HASH_PRIME_MODULUS, HASH_FUNCTIONS_COUNT, PROTOTYPE_MIN_LSH_BUCKET_SIZE, \ |
| 11 | + PROTOTYPE_OUTLIER_THRESHOLD, PROTOTYPE_MIN_VALUES_PER_ROW |
10 | 12 |
|
11 | 13 |
|
12 | 14 | class PrototypeDecomposer(AbstractDecomposer): |
13 | | - def row_based_decompose(self, M: Matrix): |
14 | | - n_rows, n_cols = M.shape |
15 | | - |
16 | | - I, J, V = M.to_coo() |
17 | | - |
18 | | - rows = defaultdict(set) |
19 | | - for i, j in zip(I, J): |
20 | | - rows[i].add(j) |
21 | | - |
22 | | - p = 2147483647 |
23 | | - num_hashes = 3 |
24 | | - hash_funcs = [] |
25 | | - for _ in range(num_hashes): |
26 | | - a = random.randint(1, p - 1) |
27 | | - b = random.randint(0, p - 1) |
28 | | - hash_funcs.append((a, b)) |
29 | | - |
30 | | - minhashes = dict() |
31 | | - |
32 | | - for i, S_i in rows.items(): |
33 | | - minhash_values = [] |
34 | | - if len(S_i) < 5: |
| 15 | + def row_based_decompose(self, input_matrix: Matrix): |
| 16 | + number_of_rows, number_of_columns = input_matrix.shape |
| 17 | + row_indices, column_indices, _ = input_matrix.to_coo() |
| 18 | + |
| 19 | + row_to_column_sets = defaultdict(set) |
| 20 | + for row_index, column_index in zip(row_indices, column_indices): |
| 21 | + row_to_column_sets[row_index].add(column_index) |
| 22 | + |
| 23 | + hash_coefficients_and_offsets = [] |
| 24 | + for _ in range(HASH_FUNCTIONS_COUNT): |
| 25 | + coefficient = random.randint(1, HASH_PRIME_MODULUS - 1) |
| 26 | + offset = random.randint(0, HASH_PRIME_MODULUS - 1) |
| 27 | + hash_coefficients_and_offsets.append((coefficient, offset)) |
| 28 | + |
| 29 | + row_to_minhash_signature = {} |
| 30 | + for row_index, column_set in row_to_column_sets.items(): |
| 31 | + if len(column_set) < PROTOTYPE_MIN_VALUES_PER_ROW: |
35 | 32 | continue |
36 | | - for a, b in hash_funcs: |
37 | | - min_hash = min(((a * x + b) % p) for x in S_i) |
38 | | - minhash_values.append(min_hash) |
39 | | - minhashes[i] = tuple(minhash_values) |
40 | | - |
41 | | - master_hashes = dict() |
42 | | - for i, minhash_values in minhashes.items(): |
43 | | - master_hash = hash(minhash_values) |
44 | | - master_hashes[i] = master_hash |
45 | | - |
46 | | - buckets = defaultdict(list) |
47 | | - for i, master_hash in master_hashes.items(): |
48 | | - buckets[master_hash].append(i) |
49 | | - |
50 | | - buckets = {h: idxs for h, idxs in buckets.items() if len(idxs) >= 5} |
51 | | - |
52 | | - LEFT_columns = [] |
53 | | - RIGHT_rows = [] |
54 | | - |
55 | | - for h, B in buckets.items(): |
56 | | - N = len(B) |
57 | | - M_B: Matrix = M[B, :].new() |
58 | | - A1 = M_B.dup(dtype=INT32).reduce_columnwise(plus).new() |
59 | | - |
60 | | - threshold = int(0.95 * N) |
61 | | - A2: Vector = A1.select('>=', threshold).new() |
62 | | - |
63 | | - if A2.nvals == 0: |
| 33 | + signature = [] |
| 34 | + for coefficient, offset in hash_coefficients_and_offsets: |
| 35 | + min_hash = min((coefficient * col + offset) % HASH_PRIME_MODULUS for col in column_set) |
| 36 | + signature.append(min_hash) |
| 37 | + row_to_minhash_signature[row_index] = tuple(signature) |
| 38 | + |
| 39 | + row_to_master_hash = { |
| 40 | + row_index: hash(signature) |
| 41 | + for row_index, signature in row_to_minhash_signature.items() |
| 42 | + } |
| 43 | + |
| 44 | + master_hash_to_rows = defaultdict(list) |
| 45 | + for row_index, master_hash in row_to_master_hash.items(): |
| 46 | + master_hash_to_rows[master_hash].append(row_index) |
| 47 | + |
| 48 | + buckets_with_enough_rows = { |
| 49 | + master_hash: rows |
| 50 | + for master_hash, rows in master_hash_to_rows.items() |
| 51 | + if len(rows) >= PROTOTYPE_MIN_LSH_BUCKET_SIZE |
| 52 | + } |
| 53 | + |
| 54 | + left_factor_column_vectors = [] |
| 55 | + right_factor_row_signatures = [] |
| 56 | + |
| 57 | + for master_hash, bucket_row_indices in buckets_with_enough_rows.items(): |
| 58 | + bucket_size = len(bucket_row_indices) |
| 59 | + bucket_submatrix = input_matrix[bucket_row_indices, :].new() |
| 60 | + column_sums = bucket_submatrix.dup(dtype=INT32).reduce_columnwise(plus).new() |
| 61 | + |
| 62 | + first_threshold = int((1 - PROTOTYPE_OUTLIER_THRESHOLD) * bucket_size) |
| 63 | + frequent_columns_after_first_filter = column_sums.select('>=', first_threshold).new() |
| 64 | + if frequent_columns_after_first_filter.nvals == 0: |
64 | 65 | continue |
65 | 66 |
|
66 | | - S_A2 = set(A2.to_coo()[0]) |
67 | | - |
68 | | - B_prime = [i for i in B if S_A2 <= rows[i]] |
69 | | - |
70 | | - K = len(B_prime) |
71 | | - if K == 0: |
| 67 | + frequent_column_indices = set(frequent_columns_after_first_filter.to_coo()[0]) |
| 68 | + first_filtered_rows = [ |
| 69 | + row_index |
| 70 | + for row_index in bucket_row_indices |
| 71 | + if frequent_column_indices <= row_to_column_sets[row_index] |
| 72 | + ] |
| 73 | + if not first_filtered_rows: |
72 | 74 | continue |
73 | 75 |
|
74 | | - M_B_prime = M[B_prime, :].new() |
75 | | - A3 = M_B_prime.dup(dtype=INT32).reduce_columnwise(plus) |
76 | | - |
77 | | - threshold = int(0.95 * K) |
78 | | - A4 = A3.select('>=', threshold).new() |
| 76 | + filtered_submatrix = input_matrix[first_filtered_rows, :].new() |
| 77 | + filtered_column_sums = filtered_submatrix.dup(dtype=INT32).reduce_columnwise(plus) |
79 | 78 |
|
80 | | - if A4.nvals == 0: |
| 79 | + second_threshold = int((1 - PROTOTYPE_OUTLIER_THRESHOLD) * len(first_filtered_rows)) |
| 80 | + frequent_columns_after_second_filter = filtered_column_sums.select('>=', second_threshold).new() |
| 81 | + if frequent_columns_after_second_filter.nvals == 0: |
81 | 82 | continue |
82 | 83 |
|
83 | | - S_A4 = set(A4.to_coo()[0]) |
84 | | - |
85 | | - B_double_prime = [i for i in B_prime if S_A4 <= rows[i]] |
86 | | - |
87 | | - if len(B_double_prime) < 5: |
| 84 | + frequent_filtered_column_indices = set(frequent_columns_after_second_filter.to_coo()[0]) |
| 85 | + second_filtered_rows = [ |
| 86 | + row_index |
| 87 | + for row_index in first_filtered_rows |
| 88 | + if frequent_filtered_column_indices <= row_to_column_sets[row_index] |
| 89 | + ] |
| 90 | + if len(second_filtered_rows) < PROTOTYPE_MIN_LSH_BUCKET_SIZE: |
88 | 91 | continue |
89 | 92 |
|
90 | | - RIGHT_rows.append(A4) |
| 93 | + right_factor_row_signatures.append(frequent_columns_after_second_filter) |
91 | 94 |
|
92 | | - CORE = Vector(BOOL, size=n_rows) |
93 | | - for i in B_double_prime: |
94 | | - CORE[i] = True |
95 | | - LEFT_columns.append(CORE) |
| 95 | + core_membership_vector = Vector(BOOL, size=number_of_rows) |
| 96 | + for core_row in second_filtered_rows: |
| 97 | + core_membership_vector[core_row] = True |
| 98 | + left_factor_column_vectors.append(core_membership_vector) |
96 | 99 |
|
97 | | - num_buckets_remaining = len(LEFT_columns) |
98 | | - if num_buckets_remaining == 0: |
99 | | - return Matrix(M.dtype, M.nrows, 0), Matrix(M.dtype, 0, M.ncols) |
| 100 | + bucket_count = len(left_factor_column_vectors) |
| 101 | + if bucket_count == 0: |
| 102 | + return Matrix(input_matrix.dtype, number_of_rows, 0), \ |
| 103 | + Matrix(input_matrix.dtype, 0, number_of_columns) |
100 | 104 |
|
101 | | - LEFT = Matrix(bool, n_rows, num_buckets_remaining) |
102 | | - for idx, CORE in enumerate(LEFT_columns): |
103 | | - LEFT[:, idx] = CORE |
| 105 | + left_factor = Matrix(bool, number_of_rows, bucket_count) |
| 106 | + for idx, column_vector in enumerate(left_factor_column_vectors): |
| 107 | + left_factor[:, idx] = column_vector |
104 | 108 |
|
105 | | - RIGHT = Matrix(bool, num_buckets_remaining, n_cols) |
106 | | - for idx, A4 in enumerate(RIGHT_rows): |
107 | | - RIGHT[idx, :] = A4 |
| 109 | + right_factor = Matrix(bool, bucket_count, number_of_columns) |
| 110 | + for idx, row_signature in enumerate(right_factor_row_signatures): |
| 111 | + right_factor[idx, :] = row_signature |
108 | 112 |
|
109 | | - return LEFT, RIGHT |
| 113 | + return left_factor, right_factor |
0 commit comments