Skip to content

Commit 1a02dbc

Browse files
dleiva04arunpamulapatiramdaskmdbshreelshah12
authored
Release/0.4.0 (#240)
* GCP first round of updates to use SP and not use service account * GCP use-sp-auth fixes * ran dabs successfully * ran dabs successfully with sp * ran dabs successfully with sp * fixed sp in config * Tested gcp calls * add pagination support * deleted job_runs_client.py * matching the notebooks to the library version and functions * setup.py * removed pip install from initialize.py * removed pip install from initialize.py v2 * Redoing changes that seem to have disappeared since last commit; re-implementing some changes that are not present * updated the regex for azuredatabricks matching * changed setup.py * adding tags to SAT workflows * issue with accounts api * Added ../Includes/install_sat_sdk to these notebooks. * Added comment for gov-3 * GCP token fixes * Added code to handle domain name based on the environment we are in * changes to logic and gov cloud * Log delivery config changes: made changes to log delivery config between Azure (workspace level) and other clouds (account level) * diagnostic log library changes * diagnostic log library changes - setup.py * Added domain names to other clouds and the build number for tests. 
* regex issue for domain check * Clean run on aws after these fixes * ignore schema lists and treat as pattern 1 * Fixed initial run issue * Fixed check for the uc share * throttle calls between pages and fix empty responses * added maxpages and timebetweencalls to initialize json * bug fix for empty records * Updated build number * fixed to carry azure account id * Cleanup and gov cloud diagnostics (#238) Co-authored-by: arunpamulapati <arunpamulapati> * Configured the official package * Fixed tag value --------- Co-authored-by: arunpamulapati <arunpamulapati> Co-authored-by: arunpamulapati <arun.pamulapati@databricks.com> Co-authored-by: ramdas.murali <ramdas.murali@databricks.com> Co-authored-by: ramdaskmdb <ramdaskmdb> Co-authored-by: Shreel Shah <shreelshah12@gmail.com> Co-authored-by: Arun Pamulapati <39059536+arunpamulapati@users.noreply.github.com>
1 parent 55203c8 commit 1a02dbc

100 files changed

Lines changed: 4055 additions & 1522 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

dabs/dabs_template/databricks_template_schema.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,6 @@
99
"type": "string",
1010
"description": "Cloud type"
1111
},
12-
"google_service_account": {
13-
"type": "string",
14-
"description": "Google service account"
15-
},
1612
"latest_lts": {
1713
"type": "string",
1814
"description": "Latest LTS version"

dabs/dabs_template/template/tmp/resources/sat_driver_job.yml.tmpl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ resources:
22
jobs:
33
sat_driver:
44
name: "SAT Driver Notebook"
5+
tags:
6+
Application: SAT
57
schedule:
68
quartz_cron_expression: "0 0 8 ? * Mon,Wed,Fri"
79
timezone_id: "America/New_York"
@@ -25,8 +27,4 @@ resources:
2527
spark_version: {{.latest_lts}}
2628
runtime_engine: "PHOTON"
2729
node_type_id: {{.node_type}}
28-
{{- if eq .cloud "gcp" }}
29-
gcp_attributes:
30-
google_service_account: {{.google_service_account}}
31-
{{- end }}
3230
{{- end }}

dabs/dabs_template/template/tmp/resources/sat_initiliazer_job.yml.tmpl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ resources:
22
jobs:
33
sat_initializer:
44
name: "SAT Initializer Notebook (one-time)"
5+
tags:
6+
Application: SAT
57

68
tasks:
79
- task_key: "sat_initializer"
@@ -25,8 +27,4 @@ resources:
2527
spark_version: {{.latest_lts}}
2628
runtime_engine: "PHOTON"
2729
node_type_id: {{.node_type}}
28-
{{- if eq .cloud "gcp" }}
29-
gcp_attributes:
30-
google_service_account: {{.google_service_account}}
31-
{{- end }}
3230
{{- end }}

dabs/main.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ def install(client: WorkspaceClient, answers: dict, profile: str):
1313
config = {
1414
"catalog": answers.get("catalog", None),
1515
"cloud": cloud,
16-
"google_service_account": answers.get("gcp-impersonate-service-account", None),
1716
"latest_lts": client.clusters.select_spark_version(
1817
long_term_support=True,
1918
latest=True,

dabs/sat/config.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,15 @@ def cloud_specific_questions(client: WorkspaceClient):
108108
]
109109
gcp = [
110110
Text(
111-
name="gcp-gs-path-to-json",
112-
message="Path to JSON key file",
111+
name="gcp-client-id",
112+
message="Client ID",
113113
ignore=cloud_validation(client, "gcp"),
114114
),
115-
Text(
116-
name="gcp-impersonate-service-account",
117-
message="Impersonate Service Account",
115+
Password(
116+
name="gcp-client-secret",
117+
message="Client Secret",
118118
ignore=cloud_validation(client, "gcp"),
119-
default="",
119+
echo="",
120120
),
121121
]
122122
aws = [
@@ -179,7 +179,7 @@ def generate_secrets(client: WorkspaceClient, answers: dict, cloud_type: str):
179179
string_value="{}",
180180
)
181181

182-
if cloud_type == "aws":
182+
if cloud_type == "aws" or cloud_type == "gcp":
183183
client.secrets.put_secret(
184184
scope=scope_name,
185185
key="use-sp-auth",

notebooks/Includes/install_sat_sdk.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
# COMMAND ----------
3131

32-
SDK_VERSION='0.0.102'
32+
SDK_VERSION='0.1.38'
3333

3434
# COMMAND ----------
3535

notebooks/Includes/workspace_analysis.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -981,9 +981,8 @@ def cc_check(df):
981981
# COMMAND ----------
982982

983983
# DBTITLE 1,Get all audit log delivery configurations. Should be enabled.
984-
check_id='8' #Log delivery configurations
984+
check_id='8' #GOV-3 Log delivery configurations
985985
enabled, sbp_rec = getSecurityBestPracticeRecord(check_id, cloud_type)
986-
workspaceId = workspace_id
987986

988987
def log_check(df):
989988
if df is not None and not isEmpty(df) and len(df.collect())>=1:
@@ -997,14 +996,15 @@ def log_check(df):
997996
return (check_id, 1, {})
998997

999998
if enabled:
1000-
tbl_name = 'acctlogdelivery'
999+
tbl_name = 'acctlogdelivery' if cloud_type != 'azure' else 'acctlogdelivery' + '_' + workspace_id
10011000
sql=f'''
10021001
SELECT config_name, config_id
10031002
FROM {tbl_name}
10041003
WHERE log_type="AUDIT_LOGS" and status="ENABLED"
10051004
'''
10061005
sqlctrl(workspace_id, sql, log_check)
10071006

1007+
10081008
# COMMAND ----------
10091009

10101010
# DBTITLE 1,How long since the last cluster restart
@@ -1140,7 +1140,7 @@ def uc_delta_share_ip_accesslist(df):
11401140
else:
11411141
return (check_id, 0, {})
11421142
if enabled:
1143-
tbl_name = 'unitycatalogsharerecipients' + '_' + workspace_id
1143+
tbl_name = 'delta_sharing_recepients_list' + '_' + workspace_id
11441144
sql=f'''
11451145
SELECT name, owner
11461146
FROM {tbl_name}
@@ -1162,7 +1162,7 @@ def uc_delta_share_expiration_time(df):
11621162
else:
11631163
return (check_id, 0, {})
11641164
if enabled:
1165-
tbl_name = 'unitycatalogsharerecipients' + '_' + workspace_id
1165+
tbl_name = 'delta_sharing_recepients_list' + '_' + workspace_id
11661166
sql=f'''
11671167
SELECT tokens.* FROM (select explode(tokens) as tokens, full_name, owner
11681168
FROM {tbl_name}

notebooks/Setup/1. list_account_workspaces_to_conf_file.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,28 @@
3838
)
3939

4040

41+
# COMMAND ----------
42+
43+
from urllib.parse import urlparse
44+
domain = "com"
45+
try:
46+
parsed_url = urlparse(hostname)
47+
domain_parts = parsed_url.netloc.split('.')
48+
if len(domain_parts) < 2:
49+
raise ValueError("Invalid hostname: cannot extract domain part.")
50+
domain = domain_parts[-1]
51+
except Exception as e:
52+
print(f"Error extracting domain: {e}")
53+
print(domain)
54+
4155
# COMMAND ----------
4256

4357
import json
4458

4559
dbutils.notebook.run(
4660
f"{basePath()}/notebooks/Utils/accounts_bootstrap",
4761
300,
48-
{"json_": json.dumps(json_)},
62+
{"json_": json.dumps(json_), "origin": "initializer"},
4963
)
5064

5165
# COMMAND ----------
@@ -55,7 +69,6 @@
5569
# easily modify the new lines for new workspaces.
5670
def generateWorkspaceConfigFile():
5771
from pyspark.sql.functions import col, concat, lit
58-
5972
dfexist = readWorkspaceConfigFile()
6073
excluded_configured_workspace = ""
6174
header_value = True
@@ -66,29 +79,33 @@ def generateWorkspaceConfigFile():
6679
header_value = False
6780
else:
6881
excluded_configured_workspace = "" # running first time
82+
6983
# get current workspaces that are not yet configured for analysis
7084
spsql = f"""select workspace_id, deployment_name as deployment_url, workspace_name, workspace_status from `acctworkspaces`
71-
where workspace_status = "RUNNING" {excluded_configured_workspace}"""
85+
where trim(workspace_status) = "RUNNING" {excluded_configured_workspace}"""
86+
#print(spsql)
87+
7288
df = spark.sql(spsql)
89+
#display(df)
7390
if len(df.take(1)) > 0:
7491
if cloud_type == "azure":
7592
df = df.withColumn(
7693
"deployment_url",
77-
concat(col("deployment_url"), lit(".azuredatabricks.net")),
94+
concat(col("deployment_url"), lit(".azuredatabricks."), lit(domain)),
7895
) # Azure
7996
elif cloud_type == "aws":
8097
df = df.withColumn(
8198
"deployment_url",
82-
concat(col("deployment_url"), lit(".cloud.databricks.com")),
99+
concat(col("deployment_url"), lit(".cloud.databricks."), lit(domain)),
83100
) # AWS
84101
else:
85102
df = df.withColumn(
86103
"deployment_url",
87-
concat(col("deployment_url"), lit(".gcp.databricks.com")),
104+
concat(col("deployment_url"), lit(".gcp.databricks."), lit(domain)),
88105
) # GCP
89106

90-
#both azure and gcp require sso
91-
if cloud_type == "azure" or cloud_type == "gcp" :
107+
# both azure and gcp require sso
108+
if cloud_type == "azure" or cloud_type == "gcp":
92109
df = df.withColumn("sso_enabled", lit(True))
93110
else:
94111
df = df.withColumn("sso_enabled", lit(False))

notebooks/Setup/3. test_connections.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,13 @@
2828
if cloud_type =='azure': #use client secret
2929
client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"])
3030
json_.update({'token':token, 'client_secret': client_secret})
31-
elif (cloud_type =='aws' and json_['use_sp_auth'].lower() == 'true'):
31+
elif ((cloud_type =='aws' or cloud_type =='gcp') and json_['use_sp_auth'].lower() == 'true'):
3232
client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"])
3333
json_.update({'token':'dapijedi', 'client_secret': client_secret})
3434
mastername =' ' # this will not be present when using SPs
3535
masterpwd = ' ' # we still need to send empty user/pwd.
3636
json_.update({'token':'dapijedi', 'mastername':mastername, 'masterpwd':masterpwd})
37-
else: #lets populate master key for accounts api
38-
mastername = dbutils.secrets.get(json_['master_name_scope'], json_['master_name_key'])
39-
masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key'])
40-
json_.update({'token':'dapijedi', 'mastername':mastername, 'masterpwd':masterpwd})
37+
4138
json_.update({'url':hostname, 'workspace_id': 'accounts', 'cloud_type': cloud_type, 'clusterid':clusterid})
4239

4340

@@ -141,18 +138,8 @@ def modifyWorkspaceConfigFile(input_connection_arr):
141138

142139
# COMMAND ----------
143140

144-
def renewWorkspaceTokens():
145-
if cloud_type == "gcp":
146-
# refresh workspace-level tokens if PAT tokens are not used, as the temp tokens expire in 10 hours
147-
gcp_status2 = dbutils.notebook.run("../Setup/gcp/configure_tokens_for_worksaces", 3000)
148-
if gcp_status2 != "OK":
149-
loggr.exception("Error Encountered in GCP Step#2", gcp_status2)
150-
dbutils.notebook.exit()
151-
152-
# COMMAND ----------
153-
154141
input_status_arr=[]
155-
renewWorkspaceTokens()
142+
156143
for ws in workspaces:
157144
import json
158145

@@ -165,16 +152,12 @@ def renewWorkspaceTokens():
165152
if cloud_type =='azure': #use client secret
166153
client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"])
167154
json_.update({'token':token, 'client_secret': client_secret})
168-
elif (cloud_type =='aws' and json_['use_sp_auth'].lower() == 'true'):
155+
elif ((cloud_type =='aws' or cloud_type =='gcp') and json_['use_sp_auth'].lower() == 'true'):
169156
client_secret = dbutils.secrets.get(json_['master_name_scope'], json_["client_secret_key"])
170157
json_.update({'token':token, 'client_secret': client_secret})
171158
mastername =' ' # this will not be present when using SPs
172159
masterpwd = ' ' # we still need to send empty user/pwd.
173160
json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd})
174-
else: #lets populate master key for accounts api
175-
mastername = dbutils.secrets.get(json_['master_name_scope'], json_['master_name_key'])
176-
masterpwd = dbutils.secrets.get(json_['master_pwd_scope'], json_['master_pwd_key'])
177-
json_.update({'token':token, 'mastername':mastername, 'masterpwd':masterpwd})
178161

179162
if (json_['use_mastercreds']) is False:
180163
tokenscope = json_['workspace_pat_scope']

notebooks/Setup/4. enable_workspaces_for_sat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
# COMMAND ----------
77

8+
# MAGIC %run ../Includes/install_sat_sdk
9+
10+
# COMMAND ----------
11+
812
# MAGIC %run ../Utils/initialize
913

1014
# COMMAND ----------

0 commit comments

Comments
 (0)