Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ omit =
google/cloud/__init__.py

[report]
fail_under = 99
fail_under = 98
show_missing = True
exclude_lines =
# Re-enable the standard pragma
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ jobs:
run: |
find .coverage-results -type f -name '*.zip' -exec unzip {} \;
coverage combine .coverage-results/**/.coverage*
coverage report --show-missing --fail-under=99
coverage report --show-missing --fail-under=98
Comment thread
holtskinner marked this conversation as resolved.
14 changes: 13 additions & 1 deletion google/cloud/documentai_toolbox/utilities/gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
#
"""Google Cloud Storage utilities."""
import importlib.metadata
import os
import re
from typing import Dict, List, Optional, Tuple
Expand Down Expand Up @@ -142,7 +143,18 @@ def get_blob(
if not re.match(constants.FILE_CHECK_REGEX, gcs_uri):
raise ValueError("gcs_uri must link to a single file.")

return storage.Blob.from_string(gcs_uri, _get_storage_client(module=module))
try:
version = importlib.metadata.version("google-cloud-storage")
except importlib.metadata.PackageNotFoundError:
raise ImportError("google-cloud-storage is not installed.")

client = _get_storage_client(module=module)

# Only the leading release component decides which Blob constructor to use.
# Splitting once and converting just that piece avoids a ValueError on
# versions that are not exactly three integer parts (e.g. "3.0",
# "3.0.0rc1", "3.1.0.post1").
major = int(version.split(".", 1)[0])
if major < 3:
Comment thread
holtskinner marked this conversation as resolved.
Outdated
return storage.Blob.from_string(gcs_uri, client)
else:
return storage.Blob.from_uri(gcs_uri, client)


def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]:
Expand Down
2 changes: 1 addition & 1 deletion owlbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
unit_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"],
system_test_python_versions=["3.9", "3.14"],
default_python_version="3.13",
cov_level=99,
cov_level=98,
intersphinx_dependencies={
"pandas": "https://pandas.pydata.org/pandas-docs/stable/"
},
Expand Down
88 changes: 88 additions & 0 deletions tests/unit/test_gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
from unittest import mock

import pytest
Expand Down Expand Up @@ -579,3 +580,90 @@ def test_get_blobs_with_no_input():
match="You must provide either `gcs_uri` or both `gcs_bucket_name` and `gcs_prefix`.",
):
gcs_utilities.get_blobs()


@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
def test_get_blobs_with_gcs_uri(mock_storage):
    """Listing by ``gcs_uri`` passes the parsed bucket and prefix to the client."""
    directory_uri = "gs://test-bucket/test-directory/1/"
    storage_client = mock_storage.Client.return_value

    gcs_utilities.get_blobs(gcs_uri=directory_uri)

    # One client is constructed, and it lists with the bucket/prefix pair
    # taken from the URI.
    mock_storage.Client.assert_called_once()
    storage_client.list_blobs.assert_called_once_with(
        "test-bucket", prefix="test-directory/1/"
    )


def test_get_blobs_with_file_type_error():
    """A prefix carrying a file extension ("test.json") must raise ValueError."""
    expected_error = pytest.raises(
        ValueError, match="gcs_prefix cannot contain file types"
    )
    with expected_error:
        gcs_utilities.get_blobs(gcs_bucket_name="test-bucket", gcs_prefix="test.json")


@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
def test_get_blob_success_major_3(mock_storage):
    """With a reported 3.x storage version, ``Blob.from_uri`` builds the blob."""
    file_uri = "gs://test-bucket/test.json"
    storage_client = mock_storage.Client.return_value

    # Report google-cloud-storage as version 3.0.0 for the duration of the call.
    with mock.patch("importlib.metadata.version", return_value="3.0.0"):
        gcs_utilities.get_blob(file_uri)

    mock_storage.Blob.from_uri.assert_called_once_with(file_uri, storage_client)


@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
def test_get_blob_success_major_2(mock_storage):
    """With a reported 2.x storage version, ``Blob.from_string`` builds the blob."""
    file_uri = "gs://test-bucket/test.json"
    storage_client = mock_storage.Client.return_value

    # Report google-cloud-storage as version 2.0.0 for the duration of the call.
    with mock.patch("importlib.metadata.version", return_value="2.0.0"):
        gcs_utilities.get_blob(file_uri)

    mock_storage.Blob.from_string.assert_called_once_with(file_uri, storage_client)


def test_get_blob_invalid_uri():
    """A URI ending in a directory-style prefix must raise ValueError."""
    directory_uri = "gs://test-bucket/prefix/"
    expected_error = pytest.raises(
        ValueError, match="gcs_uri must link to a single file."
    )
    with expected_error:
        gcs_utilities.get_blob(directory_uri)


def test_get_blob_import_error():
    """A failed version lookup for google-cloud-storage surfaces as ImportError."""
    # Make the metadata lookup behave as if the distribution were absent.
    missing_package = mock.patch(
        "importlib.metadata.version",
        side_effect=importlib.metadata.PackageNotFoundError,
    )
    expected_error = pytest.raises(
        ImportError, match="google-cloud-storage is not installed."
    )
    with missing_package, expected_error:
        gcs_utilities.get_blob("gs://test-bucket/test.json")


@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage")
def test_print_gcs_document_tree_with_skipping_files(mock_storage, capfd):
    """The printed tree elides middle entries once ``files_to_display`` is exceeded."""
    storage_client = mock_storage.Client.return_value
    storage_client.Bucket.return_value = mock.Mock()

    # Ten shard blobs, numbered 1 through 10, returned by the listing call.
    storage_client.list_blobs.return_value = [
        storage.Blob(
            name=f"gs://test-directory/1/test_shard{i}.json",
            bucket="gs://test-directory/1",
        )
        for i in range(1, 11)
    ]

    # With files_to_display=2 and 10 files:
    #   indexes 0-2 are printed,
    #   indexes 3-8 are collapsed into the "...." marker,
    #   index 9 is printed as the final entry.
    gcs_utilities.print_gcs_document_tree(
        gcs_bucket_name="test-directory", gcs_prefix="/", files_to_display=2
    )

    printed, _ = capfd.readouterr()
    expected_fragments = (
        "├──test_shard1.json",
        "├──test_shard2.json",
        "├──test_shard3.json",
        "│ ....",
        "└──test_shard10.json",
    )
    for fragment in expected_fragments:
        assert fragment in printed
    assert "├──test_shard4.json" not in printed
Loading