Commit 31a446a

Improve naming in PrototypeDecomposer
1 parent af83931 commit 31a446a

2 files changed: 91 additions & 83 deletions

cfpq_decomposer/constants.py

Lines changed: 4 additions & 0 deletions

@@ -6,3 +6,7 @@
 HASH_FUNCTIONS_COUNT = 3
 MIN_REDUCTION_RATIO = 0.05
 MAX_SIZE_RATIO = 0.3
+
+PROTOTYPE_MIN_VALUES_PER_ROW = 5
+PROTOTYPE_OUTLIER_THRESHOLD = 0.05
+PROTOTYPE_MIN_LSH_BUCKET_SIZE = 5
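
For context (not part of the commit): the three new constants parameterize the MinHash/LSH row-grouping step in the decomposer diff below. A minimal standalone sketch of the signature computation follows; it assumes HASH_PRIME_MODULUS, defined earlier in constants.py, equals 2147483647, the value the pre-refactor code hard-coded as p.

import random

# Assumed values; HASH_PRIME_MODULUS is not visible in this diff hunk.
HASH_PRIME_MODULUS = 2147483647
HASH_FUNCTIONS_COUNT = 3
PROTOTYPE_MIN_VALUES_PER_ROW = 5

# One (coefficient, offset) pair per hash function, as in the diff below.
hash_params = [
    (random.randint(1, HASH_PRIME_MODULUS - 1),
     random.randint(0, HASH_PRIME_MODULUS - 1))
    for _ in range(HASH_FUNCTIONS_COUNT)
]

def minhash_signature(column_set):
    # Rows with too few stored values are skipped before hashing.
    if len(column_set) < PROTOTYPE_MIN_VALUES_PER_ROW:
        return None
    return tuple(
        min((a * col + b) % HASH_PRIME_MODULUS for col in column_set)
        for a, b in hash_params
    )

# Identical column sets always produce identical signatures, so such rows
# share an LSH bucket; buckets with fewer than PROTOTYPE_MIN_LSH_BUCKET_SIZE
# rows are discarded by the decomposer.
assert minhash_signature({1, 2, 3, 4, 5}) == minhash_signature({1, 2, 3, 4, 5})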
Lines changed: 87 additions & 83 deletions

@@ -1,109 +1,113 @@
-import random
 from collections import defaultdict
+import random
 
 from graphblas.binary import plus
 from graphblas.core.dtypes import BOOL, INT32
 from graphblas.core.matrix import Matrix
 from graphblas.core.vector import Vector
 
 from cfpq_decomposer.abstract_decomposer import AbstractDecomposer
+from cfpq_decomposer.constants import HASH_PRIME_MODULUS, HASH_FUNCTIONS_COUNT, PROTOTYPE_MIN_LSH_BUCKET_SIZE, \
+    PROTOTYPE_OUTLIER_THRESHOLD, PROTOTYPE_MIN_VALUES_PER_ROW
 
 
 class PrototypeDecomposer(AbstractDecomposer):
-    def row_based_decompose(self, M: Matrix):
-        n_rows, n_cols = M.shape
-
-        I, J, V = M.to_coo()
-
-        rows = defaultdict(set)
-        for i, j in zip(I, J):
-            rows[i].add(j)
-
-        p = 2147483647
-        num_hashes = 3
-        hash_funcs = []
-        for _ in range(num_hashes):
-            a = random.randint(1, p - 1)
-            b = random.randint(0, p - 1)
-            hash_funcs.append((a, b))
-
-        minhashes = dict()
-
-        for i, S_i in rows.items():
-            minhash_values = []
-            if len(S_i) < 5:
+    def row_based_decompose(self, input_matrix: Matrix):
+        number_of_rows, number_of_columns = input_matrix.shape
+        row_indices, column_indices, _ = input_matrix.to_coo()
+
+        row_to_column_sets = defaultdict(set)
+        for row_index, column_index in zip(row_indices, column_indices):
+            row_to_column_sets[row_index].add(column_index)
+
+        hash_coefficients_and_offsets = []
+        for _ in range(HASH_FUNCTIONS_COUNT):
+            coefficient = random.randint(1, HASH_PRIME_MODULUS - 1)
+            offset = random.randint(0, HASH_PRIME_MODULUS - 1)
+            hash_coefficients_and_offsets.append((coefficient, offset))
+
+        row_to_minhash_signature = {}
+        for row_index, column_set in row_to_column_sets.items():
+            if len(column_set) < PROTOTYPE_MIN_VALUES_PER_ROW:
                 continue
-            for a, b in hash_funcs:
-                min_hash = min(((a * x + b) % p) for x in S_i)
-                minhash_values.append(min_hash)
-            minhashes[i] = tuple(minhash_values)
-
-        master_hashes = dict()
-        for i, minhash_values in minhashes.items():
-            master_hash = hash(minhash_values)
-            master_hashes[i] = master_hash
-
-        buckets = defaultdict(list)
-        for i, master_hash in master_hashes.items():
-            buckets[master_hash].append(i)
-
-        buckets = {h: idxs for h, idxs in buckets.items() if len(idxs) >= 5}
-
-        LEFT_columns = []
-        RIGHT_rows = []
-
-        for h, B in buckets.items():
-            N = len(B)
-            M_B: Matrix = M[B, :].new()
-            A1 = M_B.dup(dtype=INT32).reduce_columnwise(plus).new()
-
-            threshold = int(0.95 * N)
-            A2: Vector = A1.select('>=', threshold).new()
-
-            if A2.nvals == 0:
+            signature = []
+            for coefficient, offset in hash_coefficients_and_offsets:
+                min_hash = min((coefficient * col + offset) % HASH_PRIME_MODULUS for col in column_set)
+                signature.append(min_hash)
+            row_to_minhash_signature[row_index] = tuple(signature)
+
+        row_to_master_hash = {
+            row_index: hash(signature)
+            for row_index, signature in row_to_minhash_signature.items()
+        }
+
+        master_hash_to_rows = defaultdict(list)
+        for row_index, master_hash in row_to_master_hash.items():
+            master_hash_to_rows[master_hash].append(row_index)
+
+        buckets_with_enough_rows = {
+            master_hash: rows
+            for master_hash, rows in master_hash_to_rows.items()
+            if len(rows) >= PROTOTYPE_MIN_LSH_BUCKET_SIZE
+        }
+
+        left_factor_column_vectors = []
+        right_factor_row_signatures = []
+
+        for master_hash, bucket_row_indices in buckets_with_enough_rows.items():
+            bucket_size = len(bucket_row_indices)
+            bucket_submatrix = input_matrix[bucket_row_indices, :].new()
+            column_sums = bucket_submatrix.dup(dtype=INT32).reduce_columnwise(plus).new()
+
+            first_threshold = int((1 - PROTOTYPE_OUTLIER_THRESHOLD) * bucket_size)
+            frequent_columns_after_first_filter = column_sums.select('>=', first_threshold).new()
+            if frequent_columns_after_first_filter.nvals == 0:
                 continue
 
-            S_A2 = set(A2.to_coo()[0])
-
-            B_prime = [i for i in B if S_A2 <= rows[i]]
-
-            K = len(B_prime)
-            if K == 0:
+            frequent_column_indices = set(frequent_columns_after_first_filter.to_coo()[0])
+            first_filtered_rows = [
+                row_index
+                for row_index in bucket_row_indices
+                if frequent_column_indices <= row_to_column_sets[row_index]
+            ]
+            if not first_filtered_rows:
                 continue
 
-            M_B_prime = M[B_prime, :].new()
-            A3 = M_B_prime.dup(dtype=INT32).reduce_columnwise(plus)
-
-            threshold = int(0.95 * K)
-            A4 = A3.select('>=', threshold).new()
+            filtered_submatrix = input_matrix[first_filtered_rows, :].new()
+            filtered_column_sums = filtered_submatrix.dup(dtype=INT32).reduce_columnwise(plus)
 
-            if A4.nvals == 0:
+            second_threshold = int((1 - PROTOTYPE_OUTLIER_THRESHOLD) * len(first_filtered_rows))
+            frequent_columns_after_second_filter = filtered_column_sums.select('>=', second_threshold).new()
+            if frequent_columns_after_second_filter.nvals == 0:
                 continue
 
-            S_A4 = set(A4.to_coo()[0])
-
-            B_double_prime = [i for i in B_prime if S_A4 <= rows[i]]
-
-            if len(B_double_prime) < 5:
+            frequent_filtered_column_indices = set(frequent_columns_after_second_filter.to_coo()[0])
+            second_filtered_rows = [
+                row_index
+                for row_index in first_filtered_rows
+                if frequent_filtered_column_indices <= row_to_column_sets[row_index]
+            ]
+            if len(second_filtered_rows) < PROTOTYPE_MIN_LSH_BUCKET_SIZE:
                 continue
 
-            RIGHT_rows.append(A4)
+            right_factor_row_signatures.append(frequent_columns_after_second_filter)
 
-            CORE = Vector(BOOL, size=n_rows)
-            for i in B_double_prime:
-                CORE[i] = True
-            LEFT_columns.append(CORE)
+            core_membership_vector = Vector(BOOL, size=number_of_rows)
+            for core_row in second_filtered_rows:
+                core_membership_vector[core_row] = True
+            left_factor_column_vectors.append(core_membership_vector)
 
-        num_buckets_remaining = len(LEFT_columns)
-        if num_buckets_remaining == 0:
-            return Matrix(M.dtype, M.nrows, 0), Matrix(M.dtype, 0, M.ncols)
+        bucket_count = len(left_factor_column_vectors)
+        if bucket_count == 0:
+            return Matrix(input_matrix.dtype, number_of_rows, 0), \
+                Matrix(input_matrix.dtype, 0, number_of_columns)
 
-        LEFT = Matrix(bool, n_rows, num_buckets_remaining)
-        for idx, CORE in enumerate(LEFT_columns):
-            LEFT[:, idx] = CORE
+        left_factor = Matrix(bool, number_of_rows, bucket_count)
+        for idx, column_vector in enumerate(left_factor_column_vectors):
+            left_factor[:, idx] = column_vector
 
-        RIGHT = Matrix(bool, num_buckets_remaining, n_cols)
-        for idx, A4 in enumerate(RIGHT_rows):
-            RIGHT[idx, :] = A4
+        right_factor = Matrix(bool, bucket_count, number_of_columns)
+        for idx, row_signature in enumerate(right_factor_row_signatures):
+            right_factor[idx, :] = row_signature
 
-        return LEFT, RIGHT
+        return left_factor, right_factor
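
A minimal usage sketch (not from the repository): it assumes python-graphblas is installed, a no-argument constructor for PrototypeDecomposer, and that the class lives in a module importable as cfpq_decomposer.prototype_decomposer; that path is a guess, since this page does not show the second changed file's name.

import graphblas as gb

# Hypothetical import path; the changed file's name is not visible on this page.
from cfpq_decomposer.prototype_decomposer import PrototypeDecomposer

# Ten rows sharing the dense column pattern {0..5} (enough values per row, and
# enough rows for one LSH bucket), plus two sparse rows that get skipped.
rows, cols = [], []
for i in range(10):
    for j in range(6):
        rows.append(i)
        cols.append(j)
rows += [10, 11]
cols += [6, 7]
matrix = gb.Matrix.from_coo(rows, cols, [True] * len(rows), nrows=12, ncols=8)

left_factor, right_factor = PrototypeDecomposer().row_based_decompose(matrix)
# Identical column sets yield identical minhash signatures regardless of the
# random hash coefficients, so the ten dense rows should form one bucket:
print(left_factor.shape, right_factor.shape)  # expected: (12, 1) (1, 8)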
