Skip to content

Commit 3b9497e

Browse files
authored
Merge pull request #110 from Azure-Samples/invoice-demo
Add Invoice classification and extraction use case scenario
2 parents 57ffeb1 + 15e304f commit 3b9497e

32 files changed

Lines changed: 4890 additions & 2869 deletions

.devcontainer/devcontainer.json

Lines changed: 70 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,74 @@
11
{
2-
"name": "Azure AI Document Processing Samples",
3-
"image": "mcr.microsoft.com/devcontainers/base:1-bookworm",
4-
"features": {
5-
"ghcr.io/devcontainers/features/git:1": {
6-
"version": "latest",
7-
"ppa": "false"
8-
},
9-
"ghcr.io/devcontainers/features/powershell:1": {},
10-
"ghcr.io/devcontainers/features/azure-cli:1": {},
11-
"ghcr.io/azure/azure-dev/azd:0": {},
12-
"ghcr.io/devcontainers/features/dotnet:2": {
13-
"version": "9.0",
14-
"additionalVersions": "8.0"
15-
},
16-
"ghcr.io/devcontainers/features/python:1": {
17-
"version": "3.12"
18-
},
19-
"ghcr.io/devcontainers/features/git-lfs:1": {},
20-
"ghcr.io/devcontainers/features/github-cli:1": {},
21-
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
22-
"./local-features/dev-tools": "latest"
2+
"name": "Azure AI Document Processing Samples",
3+
"image": "mcr.microsoft.com/devcontainers/base:1-bookworm",
4+
"features": {
5+
"ghcr.io/devcontainers/features/git:1": {
6+
"version": "latest",
7+
"ppa": "false"
238
},
24-
"overrideFeatureInstallOrder": [
25-
"ghcr.io/devcontainers/features/git",
26-
"ghcr.io/devcontainers/features/powershell",
27-
"ghcr.io/devcontainers/features/azure-cli",
28-
"ghcr.io/azure/azure-dev/azd",
29-
"ghcr.io/devcontainers/features/dotnet",
30-
"ghcr.io/devcontainers/features/python",
31-
"ghcr.io/devcontainers/features/git-lfs",
32-
"ghcr.io/devcontainers/features/github-cli",
33-
"ghcr.io/devcontainers/features/docker-in-docker",
34-
"./local-features/dev-tools"
35-
],
36-
"remoteUser": "vscode",
37-
"containerUser": "vscode",
38-
"forwardPorts": [],
39-
"otherPortsAttributes": {
40-
"onAutoForward": "ignore"
9+
"ghcr.io/devcontainers/features/powershell:1": {},
10+
"ghcr.io/devcontainers/features/azure-cli:1": {},
11+
"ghcr.io/azure/azure-dev/azd:0": {},
12+
"ghcr.io/devcontainers/features/dotnet:2": {
13+
"version": "9.0",
14+
"additionalVersions": "8.0"
4115
},
42-
"customizations": {
43-
"vscode": {
44-
"extensions": [
45-
"GitHub.remotehub",
46-
"GitHub.copilot",
47-
"GitHub.copilot-chat",
48-
"github.vscode-pull-request-github",
49-
"GitHub.vscode-github-actions",
50-
"ms-azuretools.azure-dev",
51-
"ms-azuretools.vscode-bicep",
52-
"ms-azuretools.vscode-docker",
53-
"ms-azuretools.vscode-azureresourcegroups",
54-
"ms-azuretools.vscode-azurestorage",
55-
"ms-azuretools.vscode-azure-github-copilot",
56-
"ms-dotnettools.csdevkit",
57-
"ms-dotnettools.dotnet-interactive-vscode",
58-
"ms-python.python",
59-
"ms-python.vscode-pylance",
60-
"ms-python.autopep8",
61-
"ms-python.debugpy",
62-
"ms-toolsai.jupyter",
63-
"ms-vscode.vscode-node-azure-pack",
64-
"ms-vscode.powershell",
65-
"ms-vscode-remote.vscode-remote-extensionpack",
66-
"esbenp.prettier-vscode",
67-
"VisualStudioExptTeam.vscodeintellicode",
68-
"eamodio.gitlens",
69-
"EditorConfig.EditorConfig"
70-
]
71-
}
16+
"ghcr.io/devcontainers/features/python:1": {
17+
"version": "3.12"
7218
},
73-
"postCreateCommand": ".devcontainer/post-create.sh"
74-
}
19+
"ghcr.io/devcontainers/features/git-lfs:1": {},
20+
"ghcr.io/devcontainers/features/github-cli:1": {},
21+
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
22+
"./local-features/dev-tools": "latest"
23+
},
24+
"overrideFeatureInstallOrder": [
25+
"ghcr.io/devcontainers/features/git",
26+
"ghcr.io/devcontainers/features/powershell",
27+
"ghcr.io/devcontainers/features/azure-cli",
28+
"ghcr.io/azure/azure-dev/azd",
29+
"ghcr.io/devcontainers/features/dotnet",
30+
"ghcr.io/devcontainers/features/python",
31+
"ghcr.io/devcontainers/features/git-lfs",
32+
"ghcr.io/devcontainers/features/github-cli",
33+
"ghcr.io/devcontainers/features/docker-in-docker",
34+
"./local-features/dev-tools"
35+
],
36+
"remoteUser": "vscode",
37+
"containerUser": "vscode",
38+
"forwardPorts": [],
39+
"otherPortsAttributes": {
40+
"onAutoForward": "ignore"
41+
},
42+
"customizations": {
43+
"vscode": {
44+
"extensions": [
45+
"GitHub.remotehub",
46+
"GitHub.copilot",
47+
"GitHub.copilot-chat",
48+
"github.vscode-pull-request-github",
49+
"GitHub.vscode-github-actions",
50+
"ms-azuretools.azure-dev",
51+
"ms-azuretools.vscode-bicep",
52+
"ms-azuretools.vscode-docker",
53+
"ms-azuretools.vscode-azureresourcegroups",
54+
"ms-azuretools.vscode-azurestorage",
55+
"ms-azuretools.vscode-azure-github-copilot",
56+
"ms-dotnettools.csdevkit",
57+
"ms-dotnettools.dotnet-interactive-vscode",
58+
"ms-python.python",
59+
"ms-python.vscode-pylance",
60+
"ms-python.autopep8",
61+
"ms-python.debugpy",
62+
"ms-toolsai.jupyter",
63+
"ms-vscode.vscode-node-azure-pack",
64+
"ms-vscode.powershell",
65+
"ms-vscode-remote.vscode-remote-extensionpack",
66+
"esbenp.prettier-vscode",
67+
"VisualStudioExptTeam.vscodeintellicode",
68+
"eamodio.gitlens",
69+
"EditorConfig.EditorConfig"
70+
]
71+
}
72+
},
73+
"postCreateCommand": "bash .devcontainer/post-create.sh"
74+
}

.devcontainer/local-features/dev-tools/install.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,6 @@ check_packages() {
4141
###########################################
4242

4343
# Install dependencies
44-
check_packages poppler-utils tesseract-ocr libtesseract-dev ffmpeg libsm6 libxext6 python3-opencv
44+
check_packages poppler-utils tesseract-ocr libtesseract-dev ffmpeg libsm6 libxext6 python3-opencv libfontconfig1 libice6 libsm6
4545

4646
echo 'dev-tools script has completed!'

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,4 +563,7 @@ cython_debug/
563563
# Repo Specific
564564
*Outputs.json
565565
*.pdf.json
566-
*redacted*.pdf
566+
*redacted*.pdf
567+
samples/**/scenarios/invoices/*
568+
!samples/**/scenarios/invoices/*.py
569+
!samples/**/scenarios/invoices/*.ipynb

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ The techniques demonstrated take advantage of various capabilities from each ser
3535
- [Document Classification](#document-classification)
3636
- [Document Redaction](#document-redaction)
3737
- [Document Extraction](#document-extraction)
38+
- [Use Case Scenarios](#use-case-scenarios)
3839
- [Getting Started](#getting-started)
3940
- [Setup on GitHub Codespaces](#setup-on-github-codespaces)
4041
- [Setup on Local](#setup-on-local)
@@ -70,6 +71,14 @@ The techniques demonstrated take advantage of various capabilities from each ser
7071
| Vision-based Extraction with Azure OpenAI GPT-4o | [Python](./samples/python/extraction/vision/document-extraction-gpt-vision.ipynb) \| [.NET](./samples/dotnet/extraction/vision/document-extraction-gpt-vision.ipynb) | Use Azure OpenAI GPT-4o models to extract structured data from documents using vision capabilities. | Complex documents with a mix of text and images, including diagrams, signatures, selection marks, etc., such as reports and contracts. |
7172
| Multi-Modal (Text and Vision) Extraction with Azure AI Document Intelligence and Azure OpenAI GPT-4o | [Python](./samples/python/extraction/multimodal/document-extraction-gpt-text-and-vision.ipynb) \| [.NET](./samples/dotnet/extraction/multimodal/document-extraction-gpt-text-and-vision.ipynb) | Improve the accuracy and confidence in extracting structured data from documents by combining text and images with LLMs. | Any structured or unstructured document type. |
7273

74+
## Use Case Scenarios
75+
76+
This repo also contains a collection of end-to-end use case scenarios that demonstrate how to combine the various samples to create a real-world scenario for document processing.
77+
78+
| Scenario | Link | Description |
79+
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
80+
| **Invoice** | [Python](./samples/python/scenarios/invoices/invoice-extraction.ipynb) \| [.NET](./samples/dotnet/scenarios/invoices/invoice-extraction.ipynb) | Using a structured Invoice object ([Python](./samples/python/modules/samples/models/invoice.py) \| [.NET](./samples/dotnet/modules/samples/models/Invoice.csx)), invoice documents can be extracted into a standard Invoice schema by first classifying which pages to extract from using boundary detection. |
81+
7382
## Getting Started
7483

7584
The sample repository comes with a [**Dev Container**](./.devcontainer/README.md) that contains all the necessary tools and dependencies to run the samples. Please review the [**container and its dependencies**](./.devcontainer/README.md) to understand all of the necessary components required to run them in a real-world environment, including the use of [Poppler](https://poppler.freedesktop.org/).
Lines changed: 77 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,83 @@
11
{
2-
"fname": "invoice_1.pdf",
3-
"expected": {
4-
"customer_name": "Sharp Consulting",
5-
"customer_address": {
6-
"street": "73 Regal Way",
7-
"city": "Leeds",
8-
"state": null,
9-
"postal_code": "LS1 5AB",
10-
"country": "UK"
2+
"fname": "invoice_1.pdf",
3+
"0_expected": {
4+
"customer_name": "Sharp Consulting",
5+
"customer_tax_id": null,
6+
"customer_address": {
7+
"street": "73 Regal Way",
8+
"city": "Leeds",
9+
"state": null,
10+
"postal_code": "LS1 5AB",
11+
"country": "UK"
12+
},
13+
"shipping_address": null,
14+
"purchase_order": "15931",
15+
"invoice_id": "3847193",
16+
"invoice_date": "2024-05-16",
17+
"due_date": "2024-05-24",
18+
"vendor_name": "NEXGEN",
19+
"vendor_address": null,
20+
"vendor_tax_id": null,
21+
"remittance_address": null,
22+
"subtotal": {
23+
"currency_code": "GBP",
24+
"amount": 293.52
25+
},
26+
"total_discount": null,
27+
"total_tax": null,
28+
"invoice_total": {
29+
"currency_code": "GBP",
30+
"amount": 293.52
31+
},
32+
"payment_term": null,
33+
"items": [
34+
{
35+
"product_code": "MA197",
36+
"description": "STRETCHWRAP ROLL",
37+
"quantity": 5,
38+
"tax": null,
39+
"unit_price": {
40+
"currency_code": "GBP",
41+
"amount": 16.62
1142
},
12-
"customer_tax_id": null,
13-
"shipping_address": null,
14-
"purchase_order": "15931",
15-
"invoice_id": "3847193",
16-
"invoice_date": "2024-05-16",
17-
"payable_by": "2024-05-24",
18-
"vendor_name": "NEXGEN",
19-
"vendor_address": null,
20-
"vendor_tax_id": null,
21-
"remittance_address": null,
22-
"subtotal": 293.52,
23-
"total_discount": null,
24-
"total_tax": null,
25-
"invoice_total": 293.52,
26-
"payment_terms": null,
27-
"items": [
28-
{
29-
"product_code": "MA197",
30-
"description": "STRETCHWRAP ROLL",
31-
"quantity": 5,
32-
"tax": null,
33-
"tax_rate": null,
34-
"unit_price": 16.62,
35-
"total": 83.1,
36-
"reason": null
37-
},
38-
{
39-
"product_code": "ST4086",
40-
"description": "BALLPOINT PEN MED.",
41-
"quantity": 10,
42-
"tax": null,
43-
"tax_rate": null,
44-
"unit_price": 2.49,
45-
"total": 24.9,
46-
"reason": null
47-
},
48-
{
49-
"product_code": "JF9912413BF",
50-
"description": "BUBBLE FILM ROLL CL.",
51-
"quantity": 12,
52-
"tax": null,
53-
"tax_rate": null,
54-
"unit_price": 15.46,
55-
"total": 185.52,
56-
"reason": null
57-
}
58-
],
59-
"total_item_quantity": 27,
60-
"items_customer_signature": {
61-
"signatory": "Sarah H",
62-
"is_signed": true
63-
},
64-
"items_vendor_signature": {
65-
"signatory": "James T",
66-
"is_signed": true
43+
"total": {
44+
"currency_code": "GBP",
45+
"amount": 83.1
46+
}
47+
},
48+
{
49+
"product_code": "ST4086",
50+
"description": "BALLPOINT PEN MED.",
51+
"quantity": 10,
52+
"tax": null,
53+
"unit_price": {
54+
"currency_code": "GBP",
55+
"amount": 2.49
6756
},
68-
"returns": [
69-
{
70-
"product_code": "MA145",
71-
"description": "POSTAL TUBE BROWN",
72-
"quantity": 1,
73-
"tax": null,
74-
"tax_rate": null,
75-
"unit_price": null,
76-
"total": null,
77-
"reason": "This item was provided in previous order as a replacement"
78-
},
79-
{
80-
"product_code": "JF7902",
81-
"description": "MAILBOX 25PK",
82-
"quantity": 1,
83-
"tax": null,
84-
"tax_rate": null,
85-
"unit_price": null,
86-
"total": null,
87-
"reason": "Not required"
88-
}
89-
],
90-
"total_return_quantity": 2,
91-
"returns_customer_signature": {
92-
"signatory": "Sarah H",
93-
"is_signed": true
57+
"total": {
58+
"currency_code": "GBP",
59+
"amount": 24.9
60+
}
61+
},
62+
{
63+
"product_code": "JF9912413BF",
64+
"description": "BUBBLE FILM ROLL CL.",
65+
"quantity": 12,
66+
"tax": null,
67+
"unit_price": {
68+
"currency_code": "GBP",
69+
"amount": 15.46
9470
},
95-
"returns_vendor_signature": {
96-
"signatory": "James T",
97-
"is_signed": true
71+
"total": {
72+
"currency_code": "GBP",
73+
"amount": 185.52
9874
}
75+
}
76+
],
77+
"customer_signature": {
78+
"signatory": "Sarah H",
79+
"date": null,
80+
"has_written_signature": true
9981
}
100-
}
82+
}
83+
}

0 commit comments

Comments
 (0)