Skip to content

Commit 63bcd3d

Browse files
authored
Validate Project of staging bucket (#39008)
1 parent 17bed43 commit 63bcd3d

3 files changed

Lines changed: 140 additions & 3 deletions

File tree

sdks/python/apache_beam/io/gcp/gcsio.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,48 @@ def default_gcs_bucket_name(project, region):
7777
region, md5(project.encode('utf8')).hexdigest())
7878

7979

80+
def _get_project_number(project_id, credentials=None):
81+
"""Resolves a project ID to its project number using Cloud Resource Manager API."""
82+
from google.cloud import resourcemanager_v3
83+
client = resourcemanager_v3.ProjectsClient(credentials=credentials)
84+
project_info = client.get_project(name=f"projects/{project_id}")
85+
# project_info.name is of the form "projects/PROJECT_NUMBER"
86+
return int(project_info.name.split('/')[-1])
87+
88+
89+
def _validate_bucket_project(bucket, project_id, credentials=None):
90+
"""Verifies that the GCS bucket is owned by the executing project."""
91+
bucket_project_number = bucket.project_number
92+
93+
# Skip validation if the bucket project number is a mock object
94+
if (type(bucket_project_number).__name__.endswith('Mock') or
95+
hasattr(bucket_project_number, '_mock_self')):
96+
_LOGGER.warning(
97+
'Bucket project number is a mock object. Skipping ownership validation.'
98+
)
99+
return
100+
101+
if bucket_project_number is None:
102+
_LOGGER.warning(
103+
'Bucket gs://%s does not have a project number. Skipping ownership validation.',
104+
bucket.name)
105+
return
106+
107+
try:
108+
project_number = _get_project_number(project_id, credentials=credentials)
109+
except Exception as e:
110+
_LOGGER.warning(
111+
'Failed to resolve project number for project %s: %s. '
112+
'Skipping bucket ownership validation.',
113+
project_id,
114+
e)
115+
return
116+
117+
if bucket_project_number != project_number:
118+
raise ValueError(
119+
f'Bucket gs://{bucket.name} is not owned by project {project_id}.')
120+
121+
80122
def get_or_create_default_gcs_bucket(options):
81123
"""Create a default GCS bucket for this project."""
82124
if getattr(options, 'dataflow_kms_key', None):
@@ -90,16 +132,18 @@ def get_or_create_default_gcs_bucket(options):
90132
return None
91133

92134
bucket_name = default_gcs_bucket_name(project, region)
93-
bucket = GcsIO(pipeline_options=options).get_bucket(bucket_name)
135+
gcs = GcsIO(pipeline_options=options)
136+
bucket = gcs.get_bucket(bucket_name)
94137
if bucket:
138+
_validate_bucket_project(
139+
bucket, project, credentials=getattr(gcs.client, '_credentials', None))
95140
return bucket
96141
else:
97142
_LOGGER.warning(
98143
'Creating default GCS bucket for project %s: gs://%s',
99144
project,
100145
bucket_name)
101-
return GcsIO(pipeline_options=options).create_bucket(
102-
bucket_name, project, location=region)
146+
return gcs.create_bucket(bucket_name, project, location=region)
103147

104148

105149
def create_storage_client(pipeline_options, use_credentials=True):

sdks/python/apache_beam/io/gcp/gcsio_test.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,98 @@ def test_default_bucket_name_failure(self):
395395
DEFAULT_GCP_PROJECT, "us-central1", kms_key="kmskey!")),
396396
None)
397397

398+
@mock.patch('google.cloud.resourcemanager_v3.ProjectsClient')
399+
@mock.patch('apache_beam.io.gcp.gcsio.GcsIO')
400+
def test_get_or_create_default_gcs_bucket_ownership_match(
401+
self, mock_gcsio_class, mock_crm_class):
402+
mock_gcsio = mock_gcsio_class.return_value
403+
mock_bucket = mock.Mock()
404+
mock_bucket.project_number = 123456789
405+
mock_gcsio.get_bucket.return_value = mock_bucket
406+
407+
mock_crm_client = mock_crm_class.return_value
408+
mock_project_info = mock.Mock()
409+
mock_project_info.name = 'projects/123456789'
410+
mock_crm_client.get_project.return_value = mock_project_info
411+
412+
options = SampleOptions(DEFAULT_GCP_PROJECT, 'us-central1')
413+
bucket = gcsio.get_or_create_default_gcs_bucket(options)
414+
415+
self.assertEqual(bucket, mock_bucket)
416+
mock_gcsio.get_bucket.assert_called_once_with(
417+
'dataflow-staging-us-central1-77b801c0838aee13391c0d1885860494')
418+
419+
@mock.patch('google.cloud.resourcemanager_v3.ProjectsClient')
420+
@mock.patch('apache_beam.io.gcp.gcsio.GcsIO')
421+
def test_get_or_create_default_gcs_bucket_ownership_mismatch(
422+
self, mock_gcsio_class, mock_crm_class):
423+
mock_gcsio = mock_gcsio_class.return_value
424+
mock_bucket = mock.Mock()
425+
mock_bucket.project_number = 999999999
426+
mock_gcsio.get_bucket.return_value = mock_bucket
427+
428+
mock_crm_client = mock_crm_class.return_value
429+
mock_project_info = mock.Mock()
430+
mock_project_info.name = 'projects/123456789'
431+
mock_crm_client.get_project.return_value = mock_project_info
432+
433+
options = SampleOptions(DEFAULT_GCP_PROJECT, 'us-central1')
434+
with self.assertRaises(ValueError) as ctx:
435+
gcsio.get_or_create_default_gcs_bucket(options)
436+
437+
self.assertIn(
438+
f'is not owned by project {DEFAULT_GCP_PROJECT}', str(ctx.exception))
439+
440+
@mock.patch('google.cloud.resourcemanager_v3.ProjectsClient')
441+
@mock.patch('apache_beam.io.gcp.gcsio.GcsIO')
442+
def test_get_or_create_default_gcs_bucket_ownership_no_bucket_number(
443+
self, mock_gcsio_class, mock_crm_class):
444+
mock_gcsio = mock_gcsio_class.return_value
445+
mock_bucket = mock.Mock()
446+
mock_bucket.project_number = None
447+
mock_bucket.name = 'dataflow-staging-us-central1-77b801c0838aee13391c0d1885860494'
448+
mock_gcsio.get_bucket.return_value = mock_bucket
449+
450+
options = SampleOptions(DEFAULT_GCP_PROJECT, 'us-central1')
451+
bucket = gcsio.get_or_create_default_gcs_bucket(options)
452+
453+
self.assertEqual(bucket, mock_bucket)
454+
mock_crm_class.assert_not_called()
455+
456+
@mock.patch('google.cloud.resourcemanager_v3.ProjectsClient')
457+
@mock.patch('apache_beam.io.gcp.gcsio.GcsIO')
458+
def test_get_or_create_default_gcs_bucket_ownership_crm_error(
459+
self, mock_gcsio_class, mock_crm_class):
460+
mock_gcsio = mock_gcsio_class.return_value
461+
mock_bucket = mock.Mock()
462+
mock_bucket.project_number = 123456789
463+
mock_gcsio.get_bucket.return_value = mock_bucket
464+
465+
mock_crm_client = mock_crm_class.return_value
466+
mock_crm_client.get_project.side_effect = Exception("API Disabled")
467+
468+
options = SampleOptions(DEFAULT_GCP_PROJECT, 'us-central1')
469+
bucket = gcsio.get_or_create_default_gcs_bucket(options)
470+
471+
# Should fall back to success (warn only)
472+
self.assertEqual(bucket, mock_bucket)
473+
474+
@mock.patch('google.cloud.resourcemanager_v3.ProjectsClient')
475+
@mock.patch('apache_beam.io.gcp.gcsio.GcsIO')
476+
def test_get_or_create_default_gcs_bucket_ownership_mock_project_number(
477+
self, mock_gcsio_class, mock_crm_class):
478+
mock_gcsio = mock_gcsio_class.return_value
479+
# Creating a mock bucket without setting project_number (it returns another mock object)
480+
mock_bucket = mock.Mock()
481+
mock_gcsio.get_bucket.return_value = mock_bucket
482+
483+
options = SampleOptions(DEFAULT_GCP_PROJECT, 'us-central1')
484+
bucket = gcsio.get_or_create_default_gcs_bucket(options)
485+
486+
# Verification should be skipped, returning the mock bucket and never calling CRM
487+
self.assertEqual(bucket, mock_bucket)
488+
mock_crm_class.assert_not_called()
489+
398490
def test_exists(self):
399491
file_name = 'gs://gcsio-test/dummy_file'
400492
file_size = 1234

sdks/python/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ def get_portability_package_data():
531531
'google-cloud-datastore>=2.0.0,<3',
532532
'google-cloud-pubsub>=2.1.0,<3',
533533
'google-cloud-storage>=2.18.2,<4',
534+
'google-cloud-resource-manager>=1.12.0,<2',
534535
'google-cloud-dataflow-client>=0.13.0,<0.14.0',
535536
# GCP packages required by tests
536537
'google-cloud-bigquery>=2.0.0,<4',

0 commit comments

Comments
 (0)