Skip to content

Commit d9e3c72

Browse files
authored
Reusable rules (GoogleCloudPlatform#17232)
1 parent b5c32cf commit d9e3c72

6 files changed

Lines changed: 1547 additions & 26 deletions

File tree

mmv1/products/dataplex/Datascan.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,20 @@ examples:
175175
project_name: 'PROJECT_NAME'
176176
location: 'REGION'
177177
external_providers: ["time"]
178+
- name: 'dataplex_datascan_quality_reusable_rules_catalog_based'
179+
primary_resource_id: 'reusable_rules_catalog_based'
180+
vars:
181+
datascan_name: 'dataquality-catalog'
182+
test_env_vars:
183+
project_name: 'PROJECT_NAME'
184+
external_providers: ["time"]
185+
- name: 'dataplex_datascan_data_quality_template_reference'
186+
primary_resource_id: 'data_quality_template_reference'
187+
vars:
188+
datascan_name: 'dataquality-template'
189+
test_env_vars:
190+
project_name: 'PROJECT_NAME'
191+
external_providers: ["time"]
178192
parameters:
179193
- name: 'location'
180194
type: String
@@ -506,6 +520,10 @@ properties:
506520
description: |
507521
Description of the rule.
508522
The maximum length is 1,024 characters.
523+
- name: 'attributes'
524+
type: KeyValuePairs
525+
description: |
526+
Map of attribute names to values associated with the rule.
509527
- name: 'rangeExpectation'
510528
type: NestedObject
511529
description: |
@@ -636,11 +654,42 @@ properties:
636654
description: |
637655
The SQL statement.
638656
required: true
657+
- name: 'templateReference'
658+
type: NestedObject
659+
description: |
660+
Aggregate rule which references a rule template and provides the parameters to be substituted in the template.
661+
properties:
662+
- name: 'name'
663+
type: String
664+
description: |
665+
The resource name of the template entry.
666+
required: true
667+
- name: 'values'
668+
type: Map
669+
key_name: 'name'
670+
description: |
671+
Map of parameter names to the values substituted into the template.
672+
value_type:
673+
type: NestedObject
674+
properties:
675+
- name: 'value'
676+
type: String
677+
required: true
678+
description: |
679+
The string representation of the parameter value.
639680
min_size: 1
640681
- name: 'catalogPublishingEnabled'
641682
type: Boolean
642683
description: |
643684
If set, the latest DataScan job result will be published to Dataplex Catalog.
685+
- name: 'enableCatalogBasedRules'
686+
type: Boolean
687+
description: |
688+
If set to true, the scan will retrieve rules defined in Dataplex Catalog for the resource.
689+
- name: 'filter'
690+
type: String
691+
description: |
692+
A filter to selectively run a subset of rules.
644693
- name: 'dataProfileSpec'
645694
type: NestedObject
646695
description: |
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
data "google_project" "project" {
2+
project_id = "{{index $.TestEnvVars "project_name"}}"
3+
}
4+
5+
resource "google_service_account" "sa" {
6+
account_id = "tf-test-sa-%{random_suffix}"
7+
display_name = "DataScan Service Account"
8+
project = data.google_project.project.project_id
9+
}
10+
11+
resource "google_service_account_iam_member" "dataplex_sa_impersonate" {
12+
service_account_id = google_service_account.sa.name
13+
role = "roles/iam.serviceAccountTokenCreator"
14+
member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-dataplex.iam.gserviceaccount.com"
15+
}
16+
resource "time_sleep" "wait_120_seconds" {
17+
depends_on = [google_service_account_iam_member.dataplex_sa_impersonate]
18+
create_duration = "120s"
19+
}
20+
21+
22+
resource "google_project_iam_member" "sa_bq_data_viewer" {
23+
project = data.google_project.project.project_id
24+
role = "roles/bigquery.dataViewer"
25+
member = "serviceAccount:${google_service_account.sa.email}"
26+
}
27+
28+
resource "google_project_iam_member" "sa_bq_job_user" {
29+
project = data.google_project.project.project_id
30+
role = "roles/bigquery.jobUser"
31+
member = "serviceAccount:${google_service_account.sa.email}"
32+
}
33+
34+
resource "google_dataplex_entry_group" "test_group" {
35+
location = "us-central1"
36+
entry_group_id = "test-group-%{random_suffix}"
37+
project = data.google_project.project.project_id
38+
}
39+
40+
resource "google_dataplex_entry" "test_entry" {
41+
location = "us-central1"
42+
entry_group_id = google_dataplex_entry_group.test_group.entry_group_id
43+
entry_id = "test-entry-%{random_suffix}"
44+
entry_type = "projects/655216118709/locations/global/entryTypes/data-quality-rule-template"
45+
project = data.google_project.project.number
46+
aspects {
47+
aspect_key = "655216118709.global.data-quality-rule-template"
48+
aspect {
49+
data = jsonencode({
50+
dimension = "VALIDITY"
51+
sqlCollection = [
52+
{
53+
query = "SELECT * FROM $${data()} WHERE $${column()} IS NOT NULL"
54+
}
55+
]
56+
})
57+
}
58+
}
59+
}
60+
61+
resource "google_bigquery_dataset" "tf_test_dataset" {
62+
dataset_id = "tf_test_dataset_id_%{random_suffix}"
63+
default_table_expiration_ms = 3600000
64+
location = "us-central1"
65+
project = data.google_project.project.project_id
66+
67+
depends_on = [
68+
google_service_account_iam_member.dataplex_sa_impersonate,
69+
google_project_iam_member.sa_bq_data_viewer,
70+
google_project_iam_member.sa_bq_job_user
71+
]
72+
}
73+
74+
resource "google_bigquery_table" "tf_test_table" {
75+
dataset_id = google_bigquery_dataset.tf_test_dataset.dataset_id
76+
table_id = "tf_test_table_id_%{random_suffix}"
77+
deletion_protection = false
78+
project = data.google_project.project.project_id
79+
schema = <<EOF
80+
[
81+
{
82+
"name": "name",
83+
"type": "STRING",
84+
"mode": "NULLABLE"
85+
}
86+
]
87+
EOF
88+
}
89+
90+
91+
resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
92+
location = "us-central1"
93+
display_name = "Data Quality Template Reference"
94+
data_scan_id = "{{index $.Vars "datascan_name"}}"
95+
96+
data {
97+
resource = "//bigquery.googleapis.com/projects/${data.google_project.project.project_id}/datasets/${google_bigquery_dataset.tf_test_dataset.dataset_id}/tables/${google_bigquery_table.tf_test_table.table_id}"
98+
}
99+
100+
execution_spec {
101+
trigger {
102+
on_demand {}
103+
}
104+
}
105+
106+
execution_identity {
107+
service_account {
108+
email = google_service_account.sa.email
109+
}
110+
}
111+
112+
data_quality_spec {
113+
rules {
114+
column = "name"
115+
dimension = "VALIDITY"
116+
template_reference {
117+
name = google_dataplex_entry.test_entry.name
118+
values {
119+
name = "min_length"
120+
value = "10"
121+
}
122+
}
123+
}
124+
}
125+
126+
127+
project = data.google_project.project.project_id
128+
129+
depends_on = [
130+
google_bigquery_table.tf_test_table,
131+
time_sleep.wait_120_seconds
132+
]
133+
}

mmv1/templates/terraform/examples/dataplex_datascan_full_quality.tf.tmpl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
2424
sampling_percent = 5
2525
row_filter = "station_id > 1000"
2626
catalog_publishing_enabled = true
27+
filter = "attributes.priority = 'high'"
2728
post_scan_actions {
2829
notification_report {
2930
recipients {
@@ -39,6 +40,9 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
3940
column = "address"
4041
dimension = "VALIDITY"
4142
threshold = 0.99
43+
attributes = {
44+
priority = "high"
45+
}
4246
non_null_expectation {}
4347
}
4448

mmv1/templates/terraform/examples/dataplex_datascan_full_quality_test.tf.tmpl

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
resource "google_bigquery_dataset" "tf_test_dataset" {
22
dataset_id = "tf_test_dataset_id_%{random_suffix}"
33
default_table_expiration_ms = 3600000
4+
location = "us-central1"
45
}
56

67
resource "google_bigquery_table" "tf_test_table" {
@@ -71,7 +72,7 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
7172
}
7273

7374
data {
74-
resource = "//bigquery.googleapis.com/projects/{{index $.TestEnvVars "project_name"}}/datasets/${google_bigquery_dataset.tf_test_dataset.dataset_id}/tables/${google_bigquery_table.tf_test_table.table_id}"
75+
resource = "//bigquery.googleapis.com/projects/${"{{index $.TestEnvVars "project_name"}}"}/datasets/${google_bigquery_dataset.tf_test_dataset.dataset_id}/tables/${google_bigquery_table.tf_test_table.table_id}"
7576
}
7677

7778
execution_spec {
@@ -86,6 +87,7 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
8687
sampling_percent = 5
8788
row_filter = "station_id > 1000"
8889
catalog_publishing_enabled = true
90+
filter = "attributes.priority = \"high\""
8991
post_scan_actions {
9092
notification_report {
9193
recipients {
@@ -101,6 +103,9 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
101103
column = "address"
102104
dimension = "VALIDITY"
103105
threshold = 0.99
106+
attributes = {
107+
priority = "high"
108+
}
104109
non_null_expectation {}
105110
}
106111

@@ -109,6 +114,9 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
109114
dimension = "VALIDITY"
110115
ignore_null = true
111116
threshold = 0.9
117+
attributes = {
118+
priority = "low"
119+
}
112120
range_expectation {
113121
min_value = 1
114122
max_value = 10
@@ -121,6 +129,9 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
121129
column = "power_type"
122130
dimension = "VALIDITY"
123131
ignore_null = false
132+
attributes = {
133+
priority = "high"
134+
}
124135
regex_expectation {
125136
regex = ".*solar.*"
126137
}
@@ -130,6 +141,9 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
130141
column = "property_type"
131142
dimension = "VALIDITY"
132143
ignore_null = false
144+
attributes = {
145+
priority = "low"
146+
}
133147
set_expectation {
134148
values = ["sidewalk", "parkland"]
135149
}
@@ -139,12 +153,18 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
139153
rules {
140154
column = "address"
141155
dimension = "UNIQUENESS"
156+
attributes = {
157+
priority = "high"
158+
}
142159
uniqueness_expectation {}
143160
}
144161

145162
rules {
146163
column = "number_of_docks"
147164
dimension = "VALIDITY"
165+
attributes = {
166+
priority = "low"
167+
}
148168
statistic_range_expectation {
149169
statistic = "MEAN"
150170
min_value = 5
@@ -157,20 +177,29 @@ resource "google_dataplex_datascan" "{{$.PrimaryResourceId}}" {
157177
rules {
158178
column = "footprint_length"
159179
dimension = "VALIDITY"
180+
attributes = {
181+
priority = "high"
182+
}
160183
row_condition_expectation {
161184
sql_expression = "footprint_length > 0 AND footprint_length <= 10"
162185
}
163186
}
164187

165188
rules {
166189
dimension = "VALIDITY"
190+
attributes = {
191+
priority = "low"
192+
}
167193
table_condition_expectation {
168194
sql_expression = "COUNT(*) > 0"
169195
}
170196
}
171197

172198
rules {
173199
dimension = "VALIDITY"
200+
attributes = {
201+
priority = "high"
202+
}
174203
sql_assertion {
175204
sql_statement = "select * from $${data()} where address is null"
176205
}

0 commit comments

Comments
 (0)