Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,29 @@ class Token(_BasePageElement):
Required. The text of the Token.
symbols (List[Symbol]):
Optional. The Symbols contained within the Token.
confidence (float):
Optional. The confidence score of the Token detection.
detected_languages (List[documentai.Document.Page.DetectedLanguage]):
Optional. A list of detected languages for this Token.
Comment thread
Shifat7 marked this conversation as resolved.
Outdated
"""

@cached_property
def symbols(self) -> List[Symbol]:
    """
    Optional. The Symbols contained within the Token.

    Computed lazily from the parent page's symbols and cached on
    first access.
    """
    return self._get_children_of_element(self._page.symbols)

@cached_property
def confidence(self) -> float:
    """Confidence score the processor assigned to this Token's detection."""
    layout = self.documentai_object.layout
    return layout.confidence

@cached_property
def detected_languages(self) -> List[documentai.Document.Page.DetectedLanguage]:
    """
    A list of detected languages for this Token.

    Exposed directly from the underlying `documentai` Token object;
    may be empty when the processor detected no language.
    """
    return self.documentai_object.detected_languages


@dataclasses.dataclass
Expand Down
36 changes: 36 additions & 0 deletions samples/snippets/test_token_confidence_sample.py
Comment thread
Shifat7 marked this conversation as resolved.
Outdated
Comment thread
Shifat7 marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import token_confidence_sample


def test_token_confidence_sample(capsys):
    """Smoke-test the confidence sample against the bundled test document."""
    resource_path = os.path.join(
        os.path.dirname(__file__), "resources", "form_with_tables.json"
    )

    token_confidence_sample.token_confidence_sample(document_path=resource_path)

    stdout, _ = capsys.readouterr()

    # The sample should have printed token text and confidence scores.
    assert "Token" in stdout
    assert "Confidence:" in stdout
36 changes: 36 additions & 0 deletions samples/snippets/test_token_detected_languages_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import token_detected_languages_sample


def test_token_detected_languages_sample(capsys):
    """Smoke-test the detected-languages sample against the bundled document."""
    resource_path = os.path.join(
        os.path.dirname(__file__), "resources", "form_with_tables.json"
    )

    token_detected_languages_sample.token_detected_languages_sample(
        document_path=resource_path
    )

    stdout, _ = capsys.readouterr()

    # The sample should have printed token text and per-token language info.
    assert "Token" in stdout
    assert "Detected Languages:" in stdout
70 changes: 70 additions & 0 deletions samples/snippets/token_confidence_sample.py
Comment thread
Shifat7 marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_toolbox_token_confidence]
from typing import Optional

from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# gcs_uri = "gs://bucket/path/to/folder/document.json"


def token_confidence_sample(
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
) -> None:
    """Demonstrates how to access token-level confidence scores.

    Args:
        gcs_uri (Optional[str]):
            URI to a Document JSON file in GCS.
        document_path (Optional[str]):
            Path to a local Document JSON file.

    Raises:
        ValueError: If neither source argument is provided.
    """
    # Load the wrapped Document from whichever source was supplied.
    if gcs_uri:
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        wrapped_document = document.Document.from_document_path(document_path)
    else:
        raise ValueError("No document source provided.")

    if not wrapped_document.pages:
        return

    first_page = wrapped_document.pages[0]
    print(f"Page {first_page.page_number} Tokens:")

    # Keep the output short: only the first 10 tokens.
    for index, token in enumerate(first_page.tokens[:10]):
        print(f"Token {index}: '{token.text.strip()}'")
        print(f"  Confidence: {token.confidence:.4f}")
        print()
# [END documentai_toolbox_token_confidence]


if __name__ == "__main__":
    import argparse

    # Exactly one document source must be supplied on the command line.
    arg_parser = argparse.ArgumentParser()
    source_group = arg_parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument("--gcs_uri", help="GCS URI to Document JSON.")
    source_group.add_argument(
        "--document_path", help="Path to local Document JSON file."
    )
    parsed_args = arg_parser.parse_args()

    token_confidence_sample(
        gcs_uri=parsed_args.gcs_uri,
        document_path=parsed_args.document_path,
    )
77 changes: 77 additions & 0 deletions samples/snippets/token_detected_languages_sample.py
Comment thread
Shifat7 marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_toolbox_token_detected_languages]
from typing import Optional

from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# gcs_uri = "gs://bucket/path/to/folder/document.json"


def token_detected_languages_sample(
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
) -> None:
    """Demonstrates how to access token-level detected languages.

    Args:
        gcs_uri (Optional[str]):
            URI to a Document JSON file in GCS.
        document_path (Optional[str]):
            Path to a local Document JSON file.

    Raises:
        ValueError: If neither source argument is provided.
    """
    if gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    else:
        raise ValueError("No document source provided.")

    # Display detected languages for tokens in the first page
    if wrapped_document.pages:
        page = wrapped_document.pages[0]
        print(f"Page {page.page_number} Tokens:")

        # Limiting to first 10 tokens for brevity
        for i, token in enumerate(page.tokens[:10]):
            print(f"Token {i}: '{token.text.strip()}'")

            if token.detected_languages:
                print("  Detected Languages:")
                for lang in token.detected_languages:
                    # `DetectedLanguage` is a proto-plus message, so the
                    # `confidence` field always exists; the previous
                    # hasattr() guard could never be False and was a no-op.
                    print(
                        f"  - {lang.language_code}"
                        f", confidence: {lang.confidence:.4f}"
                    )
            else:
                print("  No language detected")
            print()
# [END documentai_toolbox_token_detected_languages]


if __name__ == "__main__":
    import argparse

    # Exactly one document source must be supplied on the command line.
    arg_parser = argparse.ArgumentParser()
    source_group = arg_parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument("--gcs_uri", help="GCS URI to Document JSON.")
    source_group.add_argument(
        "--document_path", help="Path to local Document JSON file."
    )
    parsed_args = arg_parser.parse_args()

    token_detected_languages_sample(
        gcs_uri=parsed_args.gcs_uri,
        document_path=parsed_args.document_path,
    )
11 changes: 11 additions & 0 deletions tests/unit/test_page.py
Comment thread
Shifat7 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,17 @@ def test_Token(docproto):
# checking cached value
assert token.text == "Q.\n"
assert token.hocr_bounding_box == "bbox 585 1781 620 1818"

# Check confidence value
assert isinstance(token.confidence, float)
assert 0.0 <= token.confidence <= 1.0

# Check detected languages
assert isinstance(token.detected_languages, list)
if token.detected_languages:
for language in token.detected_languages:
assert isinstance(language, documentai.Document.Page.DetectedLanguage)
assert hasattr(language, "language_code")

assert token.symbols == []

Expand Down