Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 96abe22

Browse files
holtskinner and galz10 authored
feat: Added form_fields_to_bigquery() method (#104)
* feat: Added `form_fields_to_dict()` and `form_fields_to_bigquery()` methods * Add Tests and fixed invalid characters * Added function to convert bq_column_name * Add Client Info to BQ Client --------- Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
1 parent 78807b1 commit 96abe22

4 files changed

Lines changed: 224 additions & 48 deletions

File tree

google/cloud/documentai_toolbox/utilities/gcs_utilities.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,33 +26,32 @@
2626
from typing import Optional

from google.cloud.documentai_toolbox import constants
2727

2828

29-
def _get_client_info(module: Optional[str] = None) -> client_info.ClientInfo:
    r"""Returns client info carrying a custom user agent header.

    The user agent is built from ``constants.USER_AGENT_PRODUCT`` and the
    toolbox version, optionally suffixed with a module name so individual
    features can be distinguished in request telemetry.

    Args:
        module (Optional[str]):
            Optional. Module name appended to the client library version
            (e.g. ``"get-bytes"``). If ``None``, the bare version is used.
    Returns:
        client_info.ClientInfo:
            Client info with the toolbox version and user agent set.

    """
    client_library_version = documentai_toolbox.__version__

    if module:
        client_library_version = f"{client_library_version}-{module}"

    return client_info.ClientInfo(
        client_library_version=client_library_version,
        user_agent=f"{constants.USER_AGENT_PRODUCT}/{client_library_version}",
    )
def _get_storage_client(module: Optional[str] = None) -> storage.Client:
    r"""Returns a Storage client with custom user agent header.

    Args:
        module (Optional[str]):
            Optional. Module name to include in the user agent, forwarded
            to ``_get_client_info()``.
    Returns:
        storage.Client:
            A Storage client whose requests carry the toolbox user agent.

    """
    return storage.Client(client_info=_get_client_info(module))
5655

5756

5857
def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:

google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 152 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,106 @@ def _get_batch_process_metadata(
234234
return metadata
235235

236236

237+
def _insert_into_dictionary_with_list(dic: Dict, key: str, value: str) -> Dict:
238+
r"""Inserts value into a dictionary that can contain lists.
239+
240+
Args:
241+
dic (Dict):
242+
Required. The dictionary to insert into.
243+
key (str):
244+
Required. The key to be created or inserted into.
245+
value (str):
246+
Required. The value to be inserted.
247+
248+
Returns:
249+
Dict:
250+
The dictionary after adding the key value pair.
251+
"""
252+
existing_value = dic.get(key)
253+
254+
if existing_value:
255+
# For duplicate keys,
256+
# Change Type to a List if not already
257+
if not isinstance(existing_value, list):
258+
existing_value = [existing_value]
259+
260+
existing_value.append(value)
261+
dic[key] = existing_value
262+
else:
263+
dic[key] = value
264+
265+
return dic
266+
267+
268+
def _bigquery_column_name(input_string: str) -> str:
269+
r"""Converts a string into a BigQuery column name.
270+
https://cloud.google.com/bigquery/docs/schemas#column_names
271+
272+
Args:
273+
input_string (str):
274+
Required: The string to convert.
275+
Returns:
276+
str
277+
The converted string.
278+
279+
"""
280+
char_map: Dict[str, str] = {
281+
r":|;|\(|\)|\[|\]|,|\.|\?|\!|\'|\n": "",
282+
r"/| ": "_",
283+
r"#": "num",
284+
r"@": "at",
285+
}
286+
287+
for key, value in char_map.items():
288+
input_string = re.sub(key, value, input_string)
289+
290+
return input_string.lower()
291+
292+
293+
def _dict_to_bigquery(
    dic: Dict,
    dataset_name: str,
    table_name: str,
    project_id: Optional[str],
) -> bigquery.job.LoadJob:
    r"""Loads a dictionary into a BigQuery table as a single JSON row.

    Args:
        dic (Dict):
            Required: The dictionary to insert.
        dataset_name (str):
            Required. Name of the BigQuery dataset.
        table_name (str):
            Required. Name of the BigQuery table.
        project_id (Optional[str]):
            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.
    Returns:
        bigquery.job.LoadJob:
            The BigQuery LoadJob for adding the dictionary.

    """
    client = bigquery.Client(
        project=project_id, client_info=gcs_utilities._get_client_info()
    )

    dataset_ref = bigquery.DatasetReference(
        project=project_id, dataset_id=dataset_name
    )
    destination = dataset_ref.table(table_name)

    # Allow the load to add columns or relax modes so new form fields /
    # entity types do not break an existing table schema.
    load_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        schema_update_options=[
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        ],
    )

    return client.load_table_from_json(
        json_rows=[dic],
        destination=destination,
        job_config=load_config,
    )
335+
336+
237337
@dataclasses.dataclass
238338
class Document:
239339
r"""Represents a wrapped `Document`.
@@ -476,6 +576,49 @@ def get_form_field_by_name(self, target_field: str) -> List[FormField]:
476576

477577
return found_fields
478578

579+
def form_fields_to_dict(self) -> Dict:
    r"""Returns Dictionary of form fields in document.

    Field names are normalized to BigQuery-safe column names; repeated
    field names collect their values into a list.

    Returns:
        Dict:
            The Dict of the form fields indexed by type.

    """
    fields: Dict = {}
    for page in self.pages:
        for field in page.form_fields:
            column = _bigquery_column_name(field.field_name)
            fields = _insert_into_dictionary_with_list(
                fields, column, field.field_value
            )

    return fields
596+
597+
def form_fields_to_bigquery(
    self, dataset_name: str, table_name: str, project_id: Optional[str] = None
) -> bigquery.job.LoadJob:
    r"""Adds extracted form fields to a BigQuery table.

    Args:
        dataset_name (str):
            Required. Name of the BigQuery dataset.
        table_name (str):
            Required. Name of the BigQuery table.
        project_id (Optional[str]):
            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.
    Returns:
        bigquery.job.LoadJob:
            The BigQuery LoadJob for adding the form fields.

    """
    return _dict_to_bigquery(
        dic=self.form_fields_to_dict(),
        dataset_name=dataset_name,
        table_name=table_name,
        project_id=project_id,
    )
621+
479622
def get_entity_by_type(self, target_type: str) -> List[Entity]:
480623
r"""Returns the list of Entities of target_type.
481624
@@ -500,20 +643,10 @@ def entities_to_dict(self) -> Dict:
500643
"""
501644
entities_dict: Dict = {}
502645
for entity in self.entities:
503-
entity_type = entity.type_.replace("/", "_")
504-
505-
existing_entity = entities_dict.get(entity_type)
506-
if not existing_entity:
507-
entities_dict[entity_type] = entity.mention_text
508-
continue
509-
510-
# For entities that can have multiple (e.g. line_item)
511-
# Change Entity Type to a List
512-
if not isinstance(existing_entity, list):
513-
existing_entity = [existing_entity]
514-
515-
existing_entity.append(entity.mention_text)
516-
entities_dict[entity_type] = existing_entity
646+
entity_type = _bigquery_column_name(entity.type_)
647+
entities_dict = _insert_into_dictionary_with_list(
648+
entities_dict, entity_type, entity.mention_text
649+
)
517650

518651
return entities_dict
519652

@@ -534,23 +667,12 @@ def entities_to_bigquery(
534667
The BigQuery LoadJob for adding the entities.
535668
536669
"""
537-
bq_client = bigquery.Client(project=project_id)
538-
table_ref = bigquery.DatasetReference(
539-
project=project_id, dataset_id=dataset_name
540-
).table(table_name)
541-
542-
job_config = bigquery.LoadJobConfig(
543-
schema_update_options=[
544-
bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
545-
bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
546-
],
547-
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
548-
)
549670

550-
return bq_client.load_table_from_json(
551-
json_rows=[self.entities_to_dict()],
552-
destination=table_ref,
553-
job_config=job_config,
671+
return _dict_to_bigquery(
672+
self.entities_to_dict(),
673+
dataset_name,
674+
table_name,
675+
project_id,
554676
)
555677

556678
def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:

samples/snippets/entities_to_bigquery_sample.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ def entities_to_bigquery_sample(
4242
dataset_name=dataset_name, table_name=table_name, project_id=project_id
4343
)
4444

45+
# Also supported:
46+
# job = wrapped_document.form_fields_to_bigquery(
47+
# dataset_name=dataset_name, table_name=table_name, project_id=project_id
48+
# )
49+
4550
print("Document entities loaded into BigQuery")
4651
print(f"Job ID: {job.job_id}")
4752
print(f"Table: {job.destination.path}")

tests/unit/test_document.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def test_get_batch_process_metadata_with_no_metadata(mock_docai):
186186

187187

188188
@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
189-
def test_document_from_batch_process_operation_with_invalid_metadata_type(mock_docai):
189+
def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
190190
with pytest.raises(
191191
ValueError,
192192
match="Operation metadata type is not",
@@ -206,6 +206,19 @@ def test_document_from_batch_process_operation_with_invalid_metadata_type(mock_d
206206
document._get_batch_process_metadata(location, operation_name)
207207

208208

209+
def test_bigquery_column_name():
    # (raw field name, expected BigQuery column name) pairs.
    cases = [
        ("Phone #:", "phone_num"),
        ("Emergency Contact:", "emergency_contact"),
        ("Marital Status:", "marital_status"),
        (
            "Are you currently taking any medication? (If yes, please describe):",
            "are_you_currently_taking_any_medication_if_yes_please_describe",
        ),
        (
            "Describe your medical concerns (symptoms, diagnoses, etc):",
            "describe_your_medical_concerns_symptoms_diagnoses_etc",
        ),
    ]

    for raw_name, expected_column in cases:
        assert document._bigquery_column_name(raw_name) == expected_column
220+
221+
209222
def test_document_from_document_path_with_single_shard():
210223
actual = document.Document.from_document_path(
211224
document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
@@ -401,6 +414,43 @@ def test_get_form_field_by_name(get_bytes_form_parser_mock):
401414
assert actual[0].field_value == "(906) 917-3486"
402415

403416

417+
def test_form_fields_to_dict(get_bytes_form_parser_mock):
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
    )

    form_fields = wrapped_document.form_fields_to_dict()

    get_bytes_form_parser_mock.assert_called_once()

    assert len(form_fields) == 17
    assert form_fields.get("address") == "24 Barney Lane"
    assert form_fields.get("city") == "Towaco"
428+
429+
430+
@mock.patch("google.cloud.documentai_toolbox.wrappers.document.bigquery")
def test_form_fields_to_bigquery(mock_bigquery, get_bytes_form_parser_mock):
    mock_client = mock_bigquery.Client.return_value

    # NOTE(review): the code path builds its table ref via
    # bigquery.DatasetReference, so this client.dataset stub is likely
    # unused — kept for parity with the original test.
    mock_client.dataset.table.return_value = mock.Mock()
    mock_client.load_table_from_json.return_value = mock.Mock()

    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
    )

    job = wrapped_document.form_fields_to_bigquery(
        dataset_name="test_dataset", table_name="test_table", project_id="test_project"
    )

    get_bytes_form_parser_mock.assert_called_once()
    mock_bigquery.Client.assert_called_once()

    assert job
452+
453+
404454
def test_entities_to_dict(get_bytes_single_file_mock):
405455
doc = document.Document.from_gcs(
406456
gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"

0 commit comments

Comments
 (0)