-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path07_performance_optimization.sql
More file actions
128 lines (106 loc) · 3.55 KB
/
07_performance_optimization.sql
File metadata and controls
128 lines (106 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
-- PostgreSQL Data Anonymization in AWS RDS
-- Performance optimization techniques
-- Create materialized views for better performance
CREATE MATERIALIZED VIEW IF NOT EXISTS masked_users_mat AS
SELECT * FROM masked_users;
CREATE MATERIALIZED VIEW IF NOT EXISTS pseudonymized_users_mat AS
SELECT * FROM pseudonymized_users;
CREATE MATERIALIZED VIEW IF NOT EXISTS research_patients_mat AS
SELECT * FROM research_patients;
-- Create a function to refresh all materialized views
CREATE OR REPLACE FUNCTION anon.refresh_all_mat_views()
RETURNS void AS
$$
DECLARE
mat_view record;
BEGIN
FOR mat_view IN SELECT matviewname FROM pg_matviews WHERE schemaname = 'public' LOOP
EXECUTE 'REFRESH MATERIALIZED VIEW ' || mat_view.matviewname;
END LOOP;
END;
$$
LANGUAGE plpgsql;
-- Create indexes on commonly queried columns in materialized views
CREATE INDEX IF NOT EXISTS idx_masked_users_mat_email ON masked_users_mat(email);
CREATE INDEX IF NOT EXISTS idx_masked_users_mat_last_name ON masked_users_mat(last_name);
CREATE INDEX IF NOT EXISTS idx_pseudonymized_users_mat_email ON pseudonymized_users_mat(email);
CREATE INDEX IF NOT EXISTS idx_pseudonymized_users_mat_full_name ON pseudonymized_users_mat(full_name);
CREATE INDEX IF NOT EXISTS idx_research_patients_mat_diagnosis ON research_patients_mat(diagnosis_code);
CREATE INDEX IF NOT EXISTS idx_research_patients_mat_treatment ON research_patients_mat(treatment_code);
-- Batch processing function for static anonymization of large tables
CREATE OR REPLACE FUNCTION anon.batch_anonymize_emails(
table_name text,
column_name text,
batch_size int DEFAULT 10000
)
RETURNS void AS
$$
DECLARE
max_id int;
current_id int := 0;
update_query text;
BEGIN
-- Get the maximum ID
EXECUTE format('SELECT MAX(id) FROM %I', table_name) INTO max_id;
-- Process in batches
WHILE current_id < max_id LOOP
update_query := format(
'UPDATE %I SET %I = anon.partial_email(%I)
WHERE id > %s AND id <= %s',
table_name, column_name, column_name, current_id, current_id + batch_size
);
EXECUTE update_query;
current_id := current_id + batch_size;
-- Commit each batch to avoid long-running transactions
COMMIT;
END LOOP;
END;
$$
LANGUAGE plpgsql;
-- Example usage:
-- SELECT anon.batch_anonymize_emails('users', 'email', 5000);
-- Create a function to estimate the time required for anonymization
CREATE OR REPLACE FUNCTION anon.estimate_anonymization_time(
table_name text,
sample_size int DEFAULT 1000
)
RETURNS interval AS
$$
DECLARE
start_time timestamp;
end_time timestamp;
row_count bigint;
estimated_time interval;
BEGIN
-- Get the total row count
EXECUTE format('SELECT COUNT(*) FROM %I', table_name) INTO row_count;
-- Create a temporary table with a sample
EXECUTE format('
CREATE TEMP TABLE temp_sample AS
SELECT * FROM %I
ORDER BY random()
LIMIT %s',
table_name, sample_size
);
-- Time the anonymization of the sample
start_time := clock_timestamp();
EXECUTE format('
SELECT
anon.partial_email(email),
anon.fake_first_name(),
anon.fake_last_name(),
anon.hash_string(COALESCE(password, '''')),
anon.fake_address()
FROM temp_sample'
);
end_time := clock_timestamp();
-- Calculate the estimated time for the full table
estimated_time := (end_time - start_time) * (row_count::float / sample_size);
-- Drop the temporary table
DROP TABLE temp_sample;
RETURN estimated_time;
END;
$$
LANGUAGE plpgsql;
-- Example usage:
-- SELECT anon.estimate_anonymization_time('users', 1000);