diff --git a/.github/actions/jfrog-pip-bootstrap/action.yml b/.github/actions/jfrog-pip-bootstrap/action.yml new file mode 100644 index 0000000..87892a7 --- /dev/null +++ b/.github/actions/jfrog-pip-bootstrap/action.yml @@ -0,0 +1,58 @@ +name: 'Bootstrap pip JFrog routing (pre-setup-python)' +description: | + Acquire a JFrog OIDC access token and pre-configure pip + netrc so that any + pip invocation routes through the JFrog db-pypi mirror instead of pypi.org — + including actions/setup-python's internal "Upgrading pip" step, which runs + before any other action gets a chance to redirect pip. + + Why this exists: the Databricks hardened runner groups (larger-runners, + databrickslabs-protected-runner-group) block egress to pypi.org by design; + the go/hardened-gha policy is that all package fetches go through JFrog. + Without this pre-bootstrap, setup-python's bundled "pip install --upgrade pip" + hits pypi.org and dies with SSL EOF before .github/actions/jfrog-auth + ever gets to configure pip. + + Reuses the OIDC exchange script from .github/actions/jfrog-auth/jfrog-auth + to avoid duplicating curl logic that we keep in sync with UCX. The main + jfrog-auth composite still runs later in the same job (idempotent for pip; + primary for Maven, which needs setup-java to have run first). 
+ + Caller job MUST declare: + permissions: + id-token: write +runs: + using: "composite" + steps: + - id: jfrog-auth + name: Acquire JFrog OIDC access token + shell: bash + run: | + if [[ -z "${ACTIONS_ID_TOKEN_REQUEST_URL}" ]] || [[ -z "${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" ]]; then + printf '::error::%s\n' 'This action uses OIDC: job must have "id-token: write" permission' + exit 1 + fi + "${GITHUB_WORKSPACE}/.github/actions/jfrog-auth/jfrog-auth" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}" \ + "${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" + + - name: Write pip.conf + netrc for JFrog (db-pypi) + shell: bash + env: + JFROG_ACCESS_TOKEN: "${{ steps.jfrog-auth.outputs.jfrog-access-token }}" + run: | + umask 077 + cat > "${RUNNER_TEMP}/.netrc" << EOF + machine databricks.jfrog.io + login gha-service-account + password ${JFROG_ACCESS_TOKEN} + EOF + # Same db-pypi URL the main jfrog-auth composite uses; the later + # jfrog-auth run will overwrite this file with an identical value + # (modulo a fresh token in netrc) — idempotent by design. + cat > "${RUNNER_TEMP}/.pip.conf" << 'EOF' + [global] + index-url = https://databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple + EOF + printf '%s=%s\n' 'NETRC' "${RUNNER_TEMP}/.netrc" >> "${GITHUB_ENV}" + printf '%s=%s\n' 'PIP_CONFIG_FILE' "${RUNNER_TEMP}/.pip.conf" >> "${GITHUB_ENV}" + printf '::debug::%s\n' 'Pre-bootstrap: configured JFrog access for pip.' diff --git a/.github/actions/python_build/action.yml b/.github/actions/python_build/action.yml index c8fe559..7831647 100644 --- a/.github/actions/python_build/action.yml +++ b/.github/actions/python_build/action.yml @@ -11,6 +11,15 @@ inputs: runs: using: "composite" steps: + # Pre-route pip at db-pypi BEFORE actions/setup-python runs: setup-python's + # python-versions installer unconditionally runs `pip install --upgrade pip`, + # which on the hardened runner group hits the egress allowlist and fails + # SSL-EOF against pypi.org. 
This step writes netrc + pip.conf so that + # internal pip call (and every later one) routes through JFrog instead. + # Idempotent if scala_build already ran in the same job — the env vars + # NETRC + PIP_CONFIG_FILE just get re-set to the same paths with a fresh token. + - name: Pre-bootstrap pip for JFrog (pre-setup-python) + uses: ./.github/actions/jfrog-pip-bootstrap - name: Configure python interpreter uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: @@ -18,8 +27,8 @@ runs: cache-dependency-path: '.ci-pip-cache-key' python-version: ${{ matrix.python }} # Route pip through JFrog (OIDC) per go/hardened-gha policy. - # Idempotent if scala_build already ran in the same job (re-auths but env vars stay set). - # Caller job must declare `permissions: id-token: write`. + # Idempotent: jfrog-pip-bootstrap already configured pip; this re-runs the + # same write with a fresh token. Caller job must declare `permissions: id-token: write`. - name: Authenticate for JFrog (pip via OIDC) uses: ./.github/actions/jfrog-auth - name: Add packaged GDAL dependencies diff --git a/.github/actions/scala_build/action.yml b/.github/actions/scala_build/action.yml index 41b5900..e405d29 100644 --- a/.github/actions/scala_build/action.yml +++ b/.github/actions/scala_build/action.yml @@ -27,16 +27,25 @@ runs: - name: Set Maven opts for coverage and parallel builds shell: bash run: echo "MAVEN_OPTS=-Xmx4g -XX:+UseG1GC" >> $GITHUB_ENV + # Pre-route pip at db-pypi BEFORE actions/setup-python runs: setup-python's + # python-versions installer unconditionally runs `pip install --upgrade pip`, + # which on the hardened runner group hits the egress allowlist and fails + # SSL-EOF against pypi.org. This step writes netrc + pip.conf so that + # internal pip call (and every later one) routes through JFrog instead. 
+ - name: Pre-bootstrap pip for JFrog (pre-setup-python) + uses: ./.github/actions/jfrog-pip-bootstrap - name: Configure python interpreter uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: cache: 'pip' # caches dependencies for faster subsequent runs cache-dependency-path: '.ci-pip-cache-key' python-version: ${{ matrix.python }} - # Route pip + Maven through JFrog (OIDC) per go/hardened-gha policy. - # Must run after setup-java + setup-python so mvn + pip3 are on PATH for auto-detect. + # Route Maven through JFrog (OIDC) per go/hardened-gha policy. Pip was + # already configured by jfrog-pip-bootstrap above; this re-runs pip's + # netrc/pip.conf write idempotently with a fresh token, and configures + # Maven now that setup-java has put mvn + ~/.m2/settings.xml in place. # Caller job must declare `permissions: id-token: write`. - - name: Authenticate for JFrog (pip + Maven via OIDC) + - name: Authenticate for JFrog (Maven + pip refresh via OIDC) uses: ./.github/actions/jfrog-auth # Verify the PGP signature of every Maven dependency / plugin against # .maven-keys.list BEFORE any other mvn call resolves or compiles them. diff --git a/.github/workflows/diag-pgpverify-pom-transit.yml b/.github/workflows/diag-pgpverify-pom-transit.yml new file mode 100644 index 0000000..6ab18ce --- /dev/null +++ b/.github/workflows/diag-pgpverify-pom-transit.yml @@ -0,0 +1,59 @@ +name: Diag — POM transit (db-maven vs Maven Central) +# One-shot diagnostic for Category B PGP failures: "PGP Signature INVALID" +# on .pom files only (never .jar). Hypothesis: db-maven JFrog mirror is +# mutating POM bytes between Maven Central and the CI runner, breaking the +# upstream signature. +# +# This workflow fetches each suspect .pom + .pom.asc from db-maven (the +# exact path Maven uses in CI), computes sha256, and compares against a +# known reference from Maven Central (recorded inside the script). 
Mismatch +# = byte mutation confirmed; match = signature failure has a different +# root cause. +# +# Triggers: manual (workflow_dispatch) plus a temporary push trigger on the +# ci-fix-jfrog branch (see `on:` below). Delete this workflow once the investigation concludes. Run blocks contain no untrusted github.event.* +# values — only static script invocations. + +on: + # workflow_dispatch is the long-term trigger, but it only works once the + # workflow file lands on the default branch. While this diagnostic lives + # only on ci-fix-jfrog, also fire on push that touches the diag script + # or the workflow itself so we can actually run it. Remove the push + # trigger once the diagnostic concludes and the workflow is deleted. + workflow_dispatch: {} + push: + branches: + - 'ci-fix-jfrog' + paths: + - 'scripts/security/diag-pgpverify-pom-transit' + - '.github/workflows/diag-pgpverify-pom-transit.yml' + +permissions: + contents: read + +jobs: + diag: + name: pom-transit-diff + runs-on: + group: databrickslabs-protected-runner-group + labels: linux-ubuntu-latest + environment: runtime + permissions: + contents: read + id-token: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }} + - name: Configure JDK + uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + with: + java-version: '17' + distribution: 'zulu' + - name: Authenticate for JFrog (Maven via OIDC) + uses: ./.github/actions/jfrog-auth + - name: Install gpg (for standalone signature verification) + run: sudo apt-get -o DPkg::Lock::Timeout=-1 install -y gpg + - name: Run POM transit diagnostic + run: ./scripts/security/diag-pgpverify-pom-transit diff --git a/.maven-keys.list b/.maven-keys.list index d332a4f..d7c83a9 100644 --- a/.maven-keys.list +++ b/.maven-keys.list @@ -390,11 +390,8 @@ org.tukaani:xz = 0x369 antlr:antlr:jar:2.7.2 = noSig antlr:antlr:pom:2.7.2 = noSig com.fasterxml.jackson.core:jackson-annotations:jar:2.18.3 = 
noSig -com.fasterxml.jackson.core:jackson-annotations:pom:2.18.3 = noSig com.fasterxml.jackson.core:jackson-core:jar:2.18.3 = noSig -com.fasterxml.jackson.core:jackson-core:pom:2.18.3 = noSig com.fasterxml.jackson.core:jackson-databind:jar:2.18.3 = noSig -com.fasterxml.jackson.core:jackson-databind:pom:2.18.3 = noSig com.github.luben:zstd-jni:jar:1.5.7-6 = noSig com.github.luben:zstd-jni:pom:1.5.7-6 = noSig com.google.code.findbugs:jsr305:jar:2.0.1 = noSig @@ -498,7 +495,6 @@ junit:junit:pom:4.13.2 log4j:log4j:jar:1.2.12 = noSig log4j:log4j:pom:1.2.12 = noSig net.alchim31.maven:scala-maven-plugin:jar:4.9.9 = noSig -net.alchim31.maven:scala-maven-plugin:pom:4.9.9 = noSig net.openhft:zero-allocation-hashing:jar:0.16 = noSig net.openhft:zero-allocation-hashing:pom:0.16 = noSig org.apache-extras.beanshell:bsh:jar:2.0b6 = noSig @@ -1053,3 +1049,92 @@ xml-apis:xml-apis:jar:1.0.b2 xml-apis:xml-apis:jar:1.3.04 = noSig xml-apis:xml-apis:pom:1.0.b2 = noSig xml-apis:xml-apis:pom:1.3.04 = noSig + +# --- Version-specific keyed entries surfaced by post-pin closure ---------- +# +# These are lifecycle-plugin versions that pom.xml now pins explicitly +# (maven-compiler-plugin, maven-install-plugin, maven-deploy-plugin) plus +# their transitive plugin-dependencies (plexus-compiler-*, file-management). +# Newer versions are signed by Apache committers whose keys aren't on the +# Apache Maven KEYS file but ARE on keyserver.ubuntu.com with self-signed +# UIDs at apache.org addresses. +# +# Trust verification (2026-05-18): +# +# 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +# uid: Slawomir Jaranowski +# Apache Maven committer; lead developer of pgpverify-maven-plugin +# itself. Key created 2021-12-22, RSA 4096, active. +# https://keyserver.ubuntu.com/pks/lookup?op=vindex&fingerprint=on&search=0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +# +# 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +# uid: Sylwester Lachiewicz +# Apache committer; Mojo Codehaus / Maven plugins maintainer. 
+# Key created 2020-05-09, RSA 4096, active. +# https://keyserver.ubuntu.com/pks/lookup?op=vindex&fingerprint=on&search=0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +# +# Versions are pinned in pom.xml; bump them in lockstep with new entries here. + +org.apache.maven.plugins:maven-install-plugin:jar:3.1.4 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.plugins:maven-install-plugin:pom:3.1.4 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.plugins:maven-deploy-plugin:jar:3.1.4 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.plugins:maven-deploy-plugin:pom:3.1.4 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.shared:file-management:jar:3.2.0 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.shared:file-management:pom:3.2.0 = 0x84789D24DF77A32433CE1F079EB80E92EB2135B1 +org.apache.maven.plugins:maven-compiler-plugin:jar:3.15.0 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.apache.maven.plugins:maven-compiler-plugin:pom:3.15.0 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-api:jar:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-api:pom:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-javac:jar:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-javac:pom:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-manager:jar:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA +org.codehaus.plexus:plexus-compiler-manager:pom:2.16.2 = 0x32118CF76C9EC5D918E54967CA80D1F0EB6CA4BA + +# --- badSig overrides for POMs byte-mutated by db-maven JFrog mirror ----- +# +# These six .pom files fail pgpverify-maven-plugin with "PGP Signature +# INVALID" — cryptographic verification of the .asc against the bytes +# fails. The corresponding .jar files for the same artifacts verify +# OK (they're binary and pass through JFrog untouched). 
+# +# Root cause confirmed 2026-05-18 via the diag-pgpverify-pom-transit +# workflow (run 26056343517 on branch ci-fix-jfrog): +# +# * .pom sha256 from db-maven DIFFERS from Maven Central (size delta +# -348 to +170 bytes per file) +# * .pom.asc sha256 from db-maven MATCHES Maven Central (signatures +# untouched) +# * Hex dump shows LF → CRLF conversion at every line end, plus +# additional content normalization (whitespace / XML formatting) +# * Standalone `gpg --verify` on the db-maven bytes produces "Signature +# made … using RSA key … Can't check signature" — the .asc was +# computed against the original Maven Central bytes, not the +# JFrog-mutated ones. (Caveat: gpg prints "Can't check signature" when the signer's public key is missing from the keyring, so this output alone does not prove a byte mismatch; the sha256 comparison above is the decisive evidence.) +# +# JFrog db-maven applies some form of text-resource transformation to +# .pom files on the mirror side, which breaks the upstream PGP signature +# chain. The .jar signatures remain trustworthy (binary, untouched), so +# code-execution integrity is still gated. POM-declared dependency +# coordinates inherit JFrog-as-trust-boundary status (an attacker who +# compromises db-maven could regress to a previously-trusted-but-now- +# vulnerable signed JAR via POM rewrite — risk is real but bounded +# to artifacts already in the mirror's allowlist). +# +# `badSig` here means "tolerate that the .pom signature does not +# cryptographically verify" — narrower than `noSig` ("tolerate no +# signature at all"). The artifact must still HAVE a .asc on the +# server; this only suppresses the crypto-failure error. +# +# Action item: file a JFrog admin ticket asking db-maven to disable +# text-artifact transformations so POM bytes pass through verbatim, then +# delete this block (POMs will verify again). +# +# Versions are pinned at the exact artifact:packaging:version level so +# future releases inherit strict verification rather than the override. 
+ +com.fasterxml.jackson.core:jackson-annotations:pom:2.18.3 = badSig +com.fasterxml.jackson.core:jackson-core:pom:2.18.3 = badSig +com.fasterxml.jackson.core:jackson-databind:pom:2.18.3 = badSig +javax.servlet:javax.servlet-api:pom:3.1.0 = badSig +net.alchim31.maven:scala-maven-plugin:pom:4.9.9 = badSig +org.iq80.snappy:snappy:pom:0.4 = badSig diff --git a/docs/doc-snippet-inventory.json b/docs/doc-snippet-inventory.json index 5d53c57..05eeb16 100644 --- a/docs/doc-snippet-inventory.json +++ b/docs/doc-snippet-inventory.json @@ -2,14486 +2,38 @@ "generated_at": "2026-01-12", "docs_directory": "docs/docs", "statistics": { - "total_files": 28, - "total_snippets": 637, - "by_category": { - "EXTENDS_SETUP": 324, - "NEEDS_REVIEW": 79, - "EXAMPLE_ONLY": 85, - "SELF_CONTAINED": 135, - "SHELL_COMMAND": 14 - }, - "by_language": { - "python": 358, - "scala": 158, - "bash": 14, - "sql": 107 - } + "total_files": 0, + "total_snippets": 0, + "by_category": {}, + "by_language": {} }, "common_setups_needed": { "rasterx_basic": { "description": "Basic RasterX setup with registration", - "needed_by_count": 144, - "files": [ - "advanced/custom-udfs.md", - "advanced/overview.md", - "api/overview.md", - "api/python.md", - "api/rasterx-functions.md", - "api/scala.md", - "api/sql.md", - "api/tile-structure.md", - "api/vectorx-functions.md", - "installation.md", - "packages/gridx.md" - ] + "needed_by_count": 0, + "files": [] }, "gridx_bng": { "description": "GridX BNG setup with registration", - "needed_by_count": 106, - "files": [ - "advanced/custom-udfs.md", - "advanced/overview.md", - "api/gridx-functions.md", - "api/overview.md", - "api/python.md", - "api/rasterx-functions.md", - "api/scala.md", - "api/vectorx-functions.md", - "examples/overview.md", - "installation.md", - "packages/gridx.md", - "packages/vectorx.md", - "quick-start.md" - ] + "needed_by_count": 0, + "files": [] }, "vectorx_basic": { "description": "VectorX setup with registration", - "needed_by_count": 23, - "files": 
[ - "api/python.md", - "api/scala.md", - "api/vectorx-functions.md", - "examples/overview.md", - "packages/gridx.md", - "packages/vectorx.md", - "quick-start.md" - ] + "needed_by_count": 0, + "files": [] }, "raster_data_loading": { "description": "Load sample raster data for processing", - "needed_by_count": 122, - "files": [ - "advanced/custom-udfs.md", - "advanced/gdal-cli.md", - "advanced/overview.md", - "api/overview.md", - "api/python.md", - "api/rasterx-functions.md", - "api/scala.md", - "api/sql.md", - "api/tile-structure.md", - "packages/rasterx.md", - "quick-start.md", - "readers/gdal.md", - "readers/overview.md" - ] + "needed_by_count": 0, + "files": [] }, "vector_data_loading": { "description": "Load sample vector data for processing", - "needed_by_count": 44, - "files": [ - "api/sql.md", - "quick-start.md", - "readers/geojson.md", - "readers/ogr.md", - "readers/overview.md", - "readers/shapefile.md" - ] + "needed_by_count": 0, + "files": [] } }, - "file_summaries": [ - { - "file": "advanced/custom-udfs.md", - "total_snippets": 18, - "by_category": { - "EXTENDS_SETUP": 5, - "NEEDS_REVIEW": 10, - "EXAMPLE_ONLY": 2, - "SELF_CONTAINED": 1 - }, - "by_language": { - "python": 3, - "scala": 15 - } - }, - { - "file": "advanced/gdal-cli.md", - "total_snippets": 28, - "by_category": { - "SHELL_COMMAND": 13, - "SELF_CONTAINED": 2, - "EXAMPLE_ONLY": 10, - "EXTENDS_SETUP": 2, - "NEEDS_REVIEW": 1 - }, - "by_language": { - "bash": 13, - "python": 14, - "sql": 1 - } - }, - { - "file": "advanced/library-integration.md", - "total_snippets": 12, - "by_category": { - "EXAMPLE_ONLY": 7, - "NEEDS_REVIEW": 5 - }, - "by_language": { - "python": 12 - } - }, - { - "file": "advanced/overview.md", - "total_snippets": 3, - "by_category": { - "EXTENDS_SETUP": 1, - "NEEDS_REVIEW": 1, - "EXAMPLE_ONLY": 1 - }, - "by_language": { - "python": 2, - "scala": 1 - } - }, - { - "file": "api/gridx-functions.md", - "total_snippets": 29, - "by_category": { - "NEEDS_REVIEW": 7, - "EXTENDS_SETUP": 
16, - "EXAMPLE_ONLY": 6 - }, - "by_language": { - "scala": 11, - "python": 16, - "sql": 2 - } - }, - { - "file": "api/overview.md", - "total_snippets": 12, - "by_category": { - "NEEDS_REVIEW": 5, - "EXAMPLE_ONLY": 2, - "EXTENDS_SETUP": 5 - }, - "by_language": { - "python": 10, - "scala": 1, - "sql": 1 - } - }, - { - "file": "api/python.md", - "total_snippets": 24, - "by_category": { - "NEEDS_REVIEW": 2, - "EXTENDS_SETUP": 19, - "SELF_CONTAINED": 3 - }, - "by_language": { - "python": 24 - } - }, - { - "file": "api/rasterx-functions.md", - "total_snippets": 211, - "by_category": { - "EXAMPLE_ONLY": 39, - "SELF_CONTAINED": 40, - "EXTENDS_SETUP": 102, - "NEEDS_REVIEW": 30 - }, - "by_language": { - "scala": 98, - "python": 64, - "sql": 49 - } - }, - { - "file": "api/scala.md", - "total_snippets": 19, - "by_category": { - "EXTENDS_SETUP": 13, - "NEEDS_REVIEW": 1, - "SELF_CONTAINED": 5 - }, - "by_language": { - "scala": 19 - } - }, - { - "file": "api/sql.md", - "total_snippets": 25, - "by_category": { - "NEEDS_REVIEW": 2, - "EXAMPLE_ONLY": 3, - "SELF_CONTAINED": 14, - "EXTENDS_SETUP": 6 - }, - "by_language": { - "python": 1, - "scala": 1, - "sql": 23 - } - }, - { - "file": "api/tile-structure.md", - "total_snippets": 21, - "by_category": { - "EXTENDS_SETUP": 10, - "EXAMPLE_ONLY": 6, - "SELF_CONTAINED": 5 - }, - "by_language": { - "sql": 2, - "python": 18, - "scala": 1 - } - }, - { - "file": "api/vectorx-functions.md", - "total_snippets": 22, - "by_category": { - "EXAMPLE_ONLY": 3, - "EXTENDS_SETUP": 17, - "NEEDS_REVIEW": 2 - }, - "by_language": { - "scala": 2, - "python": 15, - "sql": 5 - } - }, - { - "file": "examples/overview.md", - "total_snippets": 3, - "by_category": { - "SELF_CONTAINED": 1, - "EXTENDS_SETUP": 2 - }, - "by_language": { - "python": 3 - } - }, - { - "file": "installation.md", - "total_snippets": 3, - "by_category": { - "SHELL_COMMAND": 1, - "EXTENDS_SETUP": 1, - "EXAMPLE_ONLY": 1 - }, - "by_language": { - "bash": 1, - "python": 1, - "sql": 1 - } - }, - 
{ - "file": "limitations.md", - "total_snippets": 1, - "by_category": { - "SELF_CONTAINED": 1 - }, - "by_language": { - "python": 1 - } - }, - { - "file": "packages/gridx.md", - "total_snippets": 11, - "by_category": { - "EXTENDS_SETUP": 11 - }, - "by_language": { - "python": 9, - "scala": 1, - "sql": 1 - } - }, - { - "file": "packages/overview.md", - "total_snippets": 2, - "by_category": { - "NEEDS_REVIEW": 2 - }, - "by_language": { - "python": 2 - } - }, - { - "file": "packages/rasterx.md", - "total_snippets": 9, - "by_category": { - "EXAMPLE_ONLY": 1, - "SELF_CONTAINED": 6, - "EXTENDS_SETUP": 1, - "NEEDS_REVIEW": 1 - }, - "by_language": { - "python": 7, - "scala": 1, - "sql": 1 - } - }, - { - "file": "packages/vectorx.md", - "total_snippets": 12, - "by_category": { - "EXTENDS_SETUP": 8, - "SELF_CONTAINED": 1, - "NEEDS_REVIEW": 3 - }, - "by_language": { - "python": 8, - "sql": 3, - "scala": 1 - } - }, - { - "file": "quick-start.md", - "total_snippets": 16, - "by_category": { - "NEEDS_REVIEW": 5, - "EXAMPLE_ONLY": 2, - "SELF_CONTAINED": 6, - "EXTENDS_SETUP": 3 - }, - "by_language": { - "python": 11, - "scala": 1, - "sql": 4 - } - }, - { - "file": "readers/filegdb.md", - "total_snippets": 25, - "by_category": { - "EXTENDS_SETUP": 16, - "SELF_CONTAINED": 8, - "NEEDS_REVIEW": 1 - }, - "by_language": { - "python": 21, - "scala": 1, - "sql": 3 - } - }, - { - "file": "readers/gdal.md", - "total_snippets": 25, - "by_category": { - "EXTENDS_SETUP": 15, - "SELF_CONTAINED": 10 - }, - "by_language": { - "python": 23, - "scala": 1, - "sql": 1 - } - }, - { - "file": "readers/geojson.md", - "total_snippets": 25, - "by_category": { - "EXTENDS_SETUP": 16, - "SELF_CONTAINED": 9 - }, - "by_language": { - "python": 21, - "scala": 1, - "sql": 3 - } - }, - { - "file": "readers/geopackage.md", - "total_snippets": 22, - "by_category": { - "EXTENDS_SETUP": 12, - "SELF_CONTAINED": 9, - "NEEDS_REVIEW": 1 - }, - "by_language": { - "python": 18, - "scala": 1, - "sql": 3 - } - }, - { - 
"file": "readers/ogr.md", - "total_snippets": 17, - "by_category": { - "EXTENDS_SETUP": 15, - "SELF_CONTAINED": 2 - }, - "by_language": { - "python": 17 - } - }, - { - "file": "readers/overview.md", - "total_snippets": 18, - "by_category": { - "EXTENDS_SETUP": 13, - "EXAMPLE_ONLY": 2, - "SELF_CONTAINED": 3 - }, - "by_language": { - "python": 17, - "sql": 1 - } - }, - { - "file": "readers/shapefile.md", - "total_snippets": 23, - "by_category": { - "EXTENDS_SETUP": 14, - "SELF_CONTAINED": 9 - }, - "by_language": { - "python": 19, - "scala": 1, - "sql": 3 - } - }, - { - "file": "support.md", - "total_snippets": 1, - "by_category": { - "EXTENDS_SETUP": 1 - }, - "by_language": { - "python": 1 - } - } - ], - "snippets": [ - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# This uses eval internally\ndf = rasters.select(rx.rst_boundingbox(\"tile\"))\n", - "line_number": 17, - "length_lines": 6, - "source_file": "advanced/custom-udfs.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.expressions.accessors.RST_BoundingBox\nimport org.gdal.gdal.Dataset\n\n// Direct GDAL dataset access\nval bbox = RST_BoundingBox.execute(dataset)\n", - "line_number": 35, - "length_lines": 6, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import udf\nfrom pyspark.sql.types import StringType, MapType\nimport json\n\n# Import GeoBrix execute methods (via Py4J bridge)\nfrom databricks.labs.gbx.rasterx.expressions import accessors\n\n@udf(MapType(StringType(), StringType()))\ndef extract_custom_metadata(tile_binary):\n \"\"\"\n Extract custom metadata from raster tile\n \"\"\"\n try:\n # Load GDAL dataset from binary\n # This is simplified - actual implementation needs proper deserialization\n from databricks.labs.gbx.rasterx.gdal import GDALManager\n \n # Get dataset handle\n dataset = load_dataset_from_tile(tile_binary)\n \n # Use execute methods\n metadata = {}\n metadata[\"format\"] = accessors.RST_Format.execute(dataset)\n metadata[\"width\"] = str(accessors.RST_Width.execute(dataset))\n metadata[\"height\"] = str(accessors.RST_Height.execute(dataset))\n \n # Add custom logic\n metadata[\"aspect_ratio\"] = str(\n float(metadata[\"width\"]) / float(metadata[\"height\"])\n )\n \n # Clean up\n dataset.delete()\n \n return metadata\n except Exception as e:\n return {\"error\": str(e)}\n\n# Use the UDF\nenriched = rasters.withColumn(\"custom_metadata\", extract_custom_metadata(\"tile\"))\n", - "line_number": 77, - "length_lines": 41, - "source_file": "advanced/custom-udfs.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import 
org.apache.spark.sql.expressions.UserDefinedFunction\nimport org.apache.spark.sql.functions.udf\nimport com.databricks.labs.gbx.rasterx.expressions.accessors._\nimport com.databricks.labs.gbx.rasterx.gdal.RasterDriver\nimport org.gdal.gdal.Dataset\n\nobject CustomRasterUDFs {\n\n /**\n * Extract custom statistics from raster\n */\n def customRasterStats: UserDefinedFunction = udf((tileBytes: Array[Byte]) => {\n try {\n // Read Dataset from binary raster data\n val ds: Dataset = RasterDriver.readFromBytes(tileBytes, Map.empty[String, String])\n \n // Use execute methods to get statistics\n val width = RST_Width.execute(ds)\n val height = RST_Height.execute(ds)\n val numBands = RST_NumBands.execute(ds)\n \n // Custom calculation\n val totalPixels = width * height * numBands\n val pixelWidth = RST_PixelWidth.execute(ds)\n val pixelHeight = RST_PixelHeight.execute(ds)\n val coverage = width * pixelWidth * height * pixelHeight\n \n // Clean up\n RasterDriver.releaseDataset(ds)\n \n // Return custom result\n Map(\n \"total_pixels\" -> totalPixels.toString,\n \"coverage_sqm\" -> coverage.toString,\n \"pixel_density\" -> (totalPixels / coverage).toString\n )\n } catch {\n case e: Exception => Map(\"error\" -> e.getMessage)\n }\n })\n\n /**\n * Custom bounding box with buffer\n */\n def boundingBoxWithBuffer(bufferMeters: Double): UserDefinedFunction = \n udf((tileBytes: Array[Byte]) => {\n try {\n // Read Dataset from binary raster data\n val ds: Dataset = RasterDriver.readFromBytes(tileBytes, Map.empty[String, String])\n \n // Get bounding box using execute\n val bbox = RST_BoundingBox.execute(ds)\n \n // Apply buffer (custom logic)\n val buffered = bbox.buffer(bufferMeters)\n \n // Clean up\n RasterDriver.releaseDataset(ds)\n \n // Convert to WKB\n val wkb = buffered.getBinary\n \n wkb\n } catch {\n case e: Exception => Array.empty[Byte]\n }\n })\n\n /**\n * Filter rasters by custom criteria\n */\n def meetsQualityCriteria: UserDefinedFunction = udf((tileBytes: 
Array[Byte]) => {\n try {\n // Read Dataset from binary raster data\n val ds: Dataset = RasterDriver.readFromBytes(tileBytes, Map.empty[String, String])\n \n // Multiple execute calls for criteria\n val width = RST_Width.execute(ds)\n val height = RST_Height.execute(ds)\n val numBands = RST_NumBands.execute(ds)\n val band = ds.GetRasterBand(1)\n val noData = RST_GetNoData.execute(band)\n \n // Custom quality logic\n val validSize = width >= 512 && height >= 512\n val hasMultipleBands = numBands >= 3\n val hasNoDataValue = noData.isDefined\n \n validSize && hasMultipleBands && hasNoDataValue\n } catch {\n case _: Exception => false\n }\n })\n}\n", - "line_number": 122, - "length_lines": 95, - "source_file": "advanced/custom-udfs.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.expressions.accessors._\nimport org.apache.spark.sql.functions.udf\n\n/**\n * Apply custom cloud masking logic based on multiple bands\n */\ndef applyCloudMask: UserDefinedFunction = udf((tileBytes: Array[Byte]) => {\n // Read Dataset from binary raster data\n val ds: Dataset = RasterDriver.readFromBytes(tileBytes, Map.empty[String, String])\n \n try {\n // Get band data using execute methods\n val band1 = ds.GetRasterBand(1)\n val band2 = ds.GetRasterBand(2)\n val band3 = ds.GetRasterBand(3)\n \n val width = RST_Width.execute(ds)\n val height = RST_Height.execute(ds)\n \n // Read pixel data\n val pixels1 = band1.ReadRaster(0, 0, width, height)\n val pixels2 = band2.ReadRaster(0, 0, width, height)\n val pixels3 = band3.ReadRaster(0, 0, width, height)\n \n // 
Apply custom cloud detection algorithm\n // (simplified example - actual algorithm would be more complex)\n val cloudMask = detectClouds(pixels1, pixels2, pixels3)\n \n // Create new raster with cloud mask applied\n val maskedRaster = applyMask(ds, cloudMask)\n \n // Write to bytes and return\n val result = RasterDriver.writeToBytes(maskedRaster, Map.empty[String, String])\n RasterDriver.releaseDataset(maskedRaster)\n result\n } finally {\n RasterDriver.releaseDataset(ds)\n }\n})\n", - "line_number": 223, - "length_lines": 40, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "/**\n * Compare rasters from different time periods\n */\ndef calculateNDVIChange: UserDefinedFunction = \n udf((before: Array[Byte], after: Array[Byte]) => {\n // Read Datasets from binary raster data\n val dsBefore: Dataset = RasterDriver.readFromBytes(before, Map.empty[String, String])\n val dsAfter: Dataset = RasterDriver.readFromBytes(after, Map.empty[String, String])\n \n try {\n // Extract NIR and Red bands using execute methods\n val nirBefore = dsBefore.GetRasterBand(4)\n val redBefore = dsBefore.GetRasterBand(3)\n val nirAfter = dsAfter.GetRasterBand(4)\n val redAfter = dsAfter.GetRasterBand(3)\n \n // Calculate NDVI for both periods\n val ndviBefore = calculateNDVI(nirBefore, redBefore)\n val ndviAfter = calculateNDVI(nirAfter, redAfter)\n \n // Calculate change\n val change = ndviAfter - ndviBefore\n \n // Return statistics\n Map(\n \"mean_change\" -> change.mean.toString,\n \"max_gain\" -> change.max.toString,\n \"max_loss\" -> 
change.min.toString,\n \"percent_improved\" -> (change.filter(_ > 0.1).count.toDouble / change.size * 100).toString\n )\n } finally {\n RasterDriver.releaseDataset(dsBefore)\n RasterDriver.releaseDataset(dsAfter)\n }\n })\n", - "line_number": 267, - "length_lines": 36, - "source_file": "advanced/custom-udfs.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "/**\n * Handle proprietary raster format\n */\ndef processProprietaryFormat: UserDefinedFunction = \n udf((filePath: String) => {\n try {\n // Use GDAL's flexible driver system\n val ds = gdal.Open(filePath)\n \n // Extract metadata using execute methods\n val metadata = RST_MetaData.execute(ds)\n \n // Apply domain-specific interpretation\n val calibrationFactor = metadata.getOrElse(\"CAL_FACTOR\", \"1.0\").toDouble\n val sensorType = metadata.getOrElse(\"SENSOR\", \"unknown\")\n \n // Get band data\n val band = ds.GetRasterBand(1)\n val width = RST_Width.execute(ds)\n val height = RST_Height.execute(ds)\n \n // Read and calibrate\n val pixels = band.ReadRaster(0, 0, width, height)\n val calibrated = applyCalibration(pixels, calibrationFactor, sensorType)\n \n // Create calibrated raster\n val output = createCalibratedDataset(calibrated, width, height, ds)\n \n // Write to bytes\n val result = RasterDriver.writeToBytes(output, Map.empty[String, String])\n RasterDriver.releaseDataset(ds)\n RasterDriver.releaseDataset(output)\n result\n } catch {\n case e: Exception => \n log.error(s\"Failed to process $filePath: ${e.getMessage}\")\n Array.empty[Byte]\n }\n })\n", - "line_number": 307, - "length_lines": 40, - 
"source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.expressions.accessors._\n\nRST_Avg.execute(dataset) // Average pixel value\nRST_BandMetaData.execute(band) // Band metadata\nRST_BoundingBox.execute(dataset) // Bounding box geometry\nRST_Format.execute(dataset) // Raster format\nRST_GeoReference.execute(dataset) // Georeference info\nRST_Height.execute(dataset) // Height in pixels\nRST_Width.execute(dataset) // Width in pixels\nRST_Max.execute(dataset) // Maximum pixel value\nRST_Min.execute(dataset) // Minimum pixel value\nRST_MetaData.execute(dataset) // Full metadata map\nRST_NumBands.execute(dataset) // Number of bands\nRST_PixelWidth.execute(dataset) // Pixel width in CRS units\nRST_PixelHeight.execute(dataset) // Pixel height in CRS units\nRST_SRID.execute(dataset) // Spatial reference ID\n// ... 
and many more\n", - "line_number": 356, - "length_lines": 18, - "source_file": "advanced/custom-udfs.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.expressions.transformations._\n\nRST_Clip.execute(dataset, geometry) // Clip to geometry\nRST_Resample.execute(dataset, width, height) // Resample\nRST_Transform.execute(dataset, srid) // Transform CRS\n", - "line_number": 377, - "length_lines": 6, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.expressions._\n\nBNG_CellArea.execute(gridRef, precision) // Cell area\nBNG_PointToCell.execute(point, precision) // Point to cell\n", - "line_number": 386, - "length_lines": 5, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - 
}, - { - "language": "scala", - "code": "def safeExecute[T](f: Dataset => T): UserDefinedFunction = udf((bytes: Array[Byte]) => {\n // Read Dataset from binary raster data\n val ds: Dataset = RasterDriver.readFromBytes(bytes, Map.empty[String, String])\n try {\n f(ds)\n } finally {\n RasterDriver.releaseDataset(ds) // Always clean up!\n }\n})\n", - "line_number": 399, - "length_lines": 10, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "@udf\ndef robustUDF(bytes: Array[Byte]): Option[String] = {\n try {\n val ds = loadDataset(bytes)\n val result = RST_Format.execute(ds)\n ds.delete()\n Some(result)\n } catch {\n case e: Exception =>\n log.warn(s\"UDF failed: ${e.getMessage}\")\n None\n }\n}\n", - "line_number": 415, - "length_lines": 14, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "def efficientBatchUDF: UserDefinedFunction = udf((bytes: Array[Byte]) => {\n val ds = loadDataset(bytes)\n \n try {\n // Single dataset load, multiple operations\n val results = Map(\n \"format\" -> RST_Format.execute(ds),\n \"width\" -> RST_Width.execute(ds).toString,\n \"height\" -> 
RST_Height.execute(ds).toString,\n \"bands\" -> RST_NumBands.execute(ds).toString,\n \"srid\" -> RST_SRID.execute(ds).toString\n )\n results\n } finally {\n ds.delete()\n }\n})\n", - "line_number": 435, - "length_lines": 18, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import org.apache.spark.sql.types._\n\nval schema = StructType(Seq(\n StructField(\"width\", IntegerType, nullable = false),\n StructField(\"height\", IntegerType, nullable = false),\n StructField(\"aspect_ratio\", DoubleType, nullable = false)\n))\n\nspark.udf.register(\"raster_dims\", \n (bytes: Array[Byte]) => {\n val ds = loadDataset(bytes)\n val w = RST_Width.execute(ds)\n val h = RST_Height.execute(ds)\n ds.delete()\n (w, h, w.toDouble / h.toDouble)\n }, \n schema\n)\n", - "line_number": 459, - "length_lines": 19, - "source_file": "advanced/custom-udfs.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "class CustomUDFTest extends AnyFunSuite with BeforeAndAfterAll {\n \n var testDataset: Dataset = _\n \n override def beforeAll(): Unit = {\n GDALManager.loadSharedObjects(Iterable.empty)\n gdal.AllRegister()\n val path = 
getClass.getResource(\"/test_raster.tif\").getPath\n testDataset = gdal.Open(path)\n }\n \n override def afterAll(): Unit = {\n testDataset.delete()\n }\n \n test(\"custom UDF produces expected output\") {\n // Test execute method directly first\n val result = RST_Width.execute(testDataset)\n assert(result > 0)\n \n // Then test in UDF context\n val udf = customRasterStats\n // ... test UDF behavior\n }\n}\n", - "line_number": 484, - "length_lines": 26, - "source_file": "advanced/custom-udfs.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql import SparkSession\n\ndef test_custom_udf_integration():\n spark = SparkSession.builder.getOrCreate()\n \n # Register UDF\n spark.udf.register(\"custom_stats\", custom_raster_stats)\n \n # Load test data\n rasters = spark.read.format(\"gdal\").load(\"/test/data\")\n \n # Apply UDF\n result = rasters.selectExpr(\"custom_stats(tile) as stats\")\n \n # Verify\n assert result.count() > 0\n assert result.first()[\"stats\"] is not None\n", - "line_number": 514, - "length_lines": 18, - "source_file": "advanced/custom-udfs.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": true, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "def conditionalProcess: UserDefinedFunction = udf((bytes: 
Array[Byte]) => {\n val ds = loadDataset(bytes)\n \n try {\n // Check condition using execute method\n val numBands = RST_NumBands.execute(ds)\n \n if (numBands >= 4) {\n // Process multispectral\n processMultispectral(ds)\n } else {\n // Process RGB\n processRGB(ds)\n }\n } finally {\n ds.delete()\n }\n})\n", - "line_number": 538, - "length_lines": 19, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "def chainedProcessing: UserDefinedFunction = udf((bytes: Array[Byte]) => {\n val ds = loadDataset(bytes)\n \n try {\n // Step 1: Extract info\n val metadata = RST_MetaData.execute(ds)\n \n // Step 2: Validate\n val isValid = validateMetadata(metadata)\n \n if (isValid) {\n // Step 3: Process\n val bbox = RST_BoundingBox.execute(ds)\n val clipped = clipToAOI(ds, bbox)\n \n // Step 4: Serialize result\n serializeRaster(clipped)\n } else {\n Array.empty[Byte]\n }\n } finally {\n ds.delete()\n }\n})\n", - "line_number": 561, - "length_lines": 25, - "source_file": "advanced/custom-udfs.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# 1. Reproject to common CRS before loading\ngdalwarp -t_srs EPSG:4326 input/*.tif output/\n\n# 2. 
Create optimized Cloud-Optimized GeoTIFFs\ngdal_translate -co TILED=YES -co COMPRESS=LZW -co COPY_SRC_OVERVIEWS=YES \\\n input.tif output_cog.tif\n\n# 3. Then load efficiently into GeoBrix\nspark.read.format(\"gdal\").load(\"output_cog.tif\")\n", - "line_number": 24, - "length_lines": 10, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Basic information\ngdalinfo /data/raster.tif\n\n# JSON output for parsing\ngdalinfo -json /data/raster.tif\n\n# Statistics\ngdalinfo -stats /data/raster.tif\n\n# Checksum\ngdalinfo -checksum /data/raster.tif\n", - "line_number": 57, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "import subprocess\nimport json\n\ndef get_raster_info(path):\n \"\"\"Get GDAL info before loading into Spark\"\"\"\n result = subprocess.run(\n [\"gdalinfo\", \"-json\", path],\n capture_output=True,\n text=True\n )\n return json.loads(result.stdout)\n\n# Use to filter files before loading\nraster_info = get_raster_info(\"/data/sample.tif\")\nif raster_info[\"size\"][0] > 1000: # Width > 1000\n # Load large rasters\n df = 
spark.read.format(\"gdal\").load(\"/data/sample.tif\")\n", - "line_number": 72, - "length_lines": 18, - "source_file": "advanced/gdal-cli.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Reproject to WGS84\ngdalwarp -t_srs EPSG:4326 input.tif output.tif\n\n# Reproject with resampling method\ngdalwarp -t_srs EPSG:3857 -r bilinear input.tif output.tif\n\n# Reproject and clip to bounds\ngdalwarp -t_srs EPSG:4326 \\\n -te -180 -90 180 90 \\\n -te_srs EPSG:4326 \\\n input.tif output.tif\n\n# Batch reproject directory\nfor f in /data/input/*.tif; do\n gdalwarp -t_srs EPSG:4326 \"$f\" \"/data/output/$(basename $f)\"\ndone\n", - "line_number": 98, - "length_lines": 17, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# %sh magic for preprocessing\n%sh\n#!/bin/bash\nmkdir -p /Volumes//tmp/reprojected\nfor f in /Volumes//raw/*.tif; do\n gdalwarp -t_srs EPSG:4326 -co COMPRESS=LZW \"$f\" \"/Volumes//tmp/reprojected/$(basename $f)\"\ndone\n\n# Then load into Spark\nspark.read.format(\"gdal\").load(\"/Volumes//tmp/reprojected/*.tif\")\n", - "line_number": 118, - "length_lines": 11, - "source_file": "advanced/gdal-cli.md", - "category": 
"EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Create Cloud-Optimized GeoTIFF (COG)\ngdal_translate -co TILED=YES \\\n -co COMPRESS=LZW \\\n -co COPY_SRC_OVERVIEWS=YES \\\n -co BLOCKXSIZE=512 \\\n -co BLOCKYSIZE=512 \\\n input.tif output_cog.tif\n\n# Convert format\ngdal_translate -of GTiff input.img output.tif\n\n# Extract specific bands\ngdal_translate -b 1 -b 2 -b 3 multispectral.tif rgb.tif\n\n# Resize\ngdal_translate -outsize 50% 50% large.tif small.tif\n\n# Convert to pyramided GeoTIFF\ngdal_translate -co TILED=YES -co COMPRESS=DEFLATE input.tif output.tif\ngdaladdo -r average output.tif 2 4 8 16\n", - "line_number": 141, - "length_lines": 21, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "import subprocess\n\ndef optimize_for_spark(input_path, output_path):\n \"\"\"Create COG optimized for Spark processing\"\"\"\n cmd = [\n \"gdal_translate\",\n \"-co\", \"TILED=YES\",\n \"-co\", \"COMPRESS=LZW\",\n \"-co\", \"BLOCKXSIZE=256\",\n \"-co\", \"BLOCKYSIZE=256\",\n \"-co\", \"NUM_THREADS=ALL_CPUS\",\n input_path,\n output_path\n ]\n subprocess.run(cmd, check=True)\n \n # Add overviews\n subprocess.run([\"gdaladdo\", \"-r\", \"average\", output_path, 
\"2\", \"4\", \"8\"], check=True)\n\n# Optimize before loading\noptimize_for_spark(\"/data/large.tif\", \"/data/optimized.tif\")\nrasters = spark.read.format(\"gdal\").load(\"/data/optimized.tif\")\n", - "line_number": 165, - "length_lines": 23, - "source_file": "advanced/gdal-cli.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Simple merge\ngdal_merge.py -o merged.tif input1.tif input2.tif input3.tif\n\n# Merge with nodata\ngdal_merge.py -n -9999 -a_nodata -9999 -o merged.tif *.tif\n\n# Merge with specific options\ngdal_merge.py -co COMPRESS=LZW -co TILED=YES -o merged.tif *.tif\n", - "line_number": 196, - "length_lines": 9, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Merge tiles before distributed processing\ngdal_merge.py -o /Volumes//tmp/merged_scene.tif \\\n /Volumes//tiles/*.tif\n\n# Then process merged file with GeoBrix\ndf = spark.read.format(\"gdal\").load(\"/Volumes//tmp/merged_scene.tif\")\n", - "line_number": 208, - "length_lines": 7, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - 
"has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Create VRT from directory\ngdalbuildvrt mosaic.vrt /data/tiles/*.tif\n\n# VRT with resolution\ngdalbuildvrt -resolution highest mosaic.vrt *.tif\n\n# VRT with specific bands\ngdalbuildvrt -separate bands.vrt band1.tif band2.tif band3.tif\n", - "line_number": 223, - "length_lines": 9, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Create VRT for large collection\nimport subprocess\n\nsubprocess.run([\n \"gdalbuildvrt\",\n \"/Volumes//tmp/collection.vrt\",\n *dbutils.fs.ls(\"/Volumes//satellite/tiles/\")\n])\n\n# Read VRT with GeoBrix (treats as single raster)\nvrt_df = spark.read.format(\"gdal\").load(\"/Volumes//tmp/collection.vrt\")\n", - "line_number": 235, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Hillshade\ngdaldem hillshade -z 2 elevation.tif 
hillshade.tif\n\n# Slope\ngdaldem slope elevation.tif slope.tif\n\n# Aspect\ngdaldem aspect elevation.tif aspect.tif\n\n# Color relief\ngdaldem color-relief elevation.tif color_ramp.txt colored.tif\n", - "line_number": 255, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Generate terrain products before analysis\n%sh\ngdaldem hillshade /Volumes//dems/elevation.tif /Volumes//tmp/hillshade.tif -z 2\ngdaldem slope /Volumes//dems/elevation.tif /Volumes//tmp/slope.tif\ngdaldem aspect /Volumes//dems/elevation.tif /Volumes//tmp/aspect.tif\n\n# Load all products\nhillshade = spark.read.format(\"gdal\").load(\"/Volumes//tmp/hillshade.tif\")\nslope = spark.read.format(\"gdal\").load(\"/Volumes//tmp/slope.tif\")\naspect = spark.read.format(\"gdal\").load(\"/Volumes//tmp/aspect.tif\")\n\n# Combine in Spark\ncombined = hillshade.join(slope, \"tile_id\").join(aspect, \"tile_id\")\n", - "line_number": 270, - "length_lines": 14, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "bash", - "code": "# NDVI calculation\ngdal_calc.py -A nir.tif -B red.tif \\\n --outfile=ndvi.tif \\\n --calc=\"(A-B)/(A+B)\" \\\n 
--NoDataValue=-9999\n\n# EVI calculation\ngdal_calc.py -A nir.tif -B red.tif -C blue.tif \\\n --outfile=evi.tif \\\n --calc=\"2.5*((A-B)/(A+6*B-7.5*C+1))\"\n\n# Threshold\ngdal_calc.py -A input.tif \\\n --outfile=threshold.tif \\\n --calc=\"A>100\"\n", - "line_number": 292, - "length_lines": 16, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Reproject vector\nogr2ogr -t_srs EPSG:4326 output.shp input.shp\n\n# Convert format\nogr2ogr -f GeoJSON output.geojson input.shp\n\n# Filter and subset\nogr2ogr -where \"POP > 1000000\" cities.shp all_cities.shp\n\n# Clip vector to bounds\nogr2ogr -clipsrc -180 -90 180 90 clipped.shp input.shp\n", - "line_number": 316, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Preprocess shapefile\n%sh\nogr2ogr -t_srs EPSG:4326 -f GeoJSON \\\n /Volumes//tmp/boundaries.geojson \\\n /Volumes//raw/boundaries.shp\n\n# Read with GeoBrix\nboundaries = spark.read.format(\"geojson\").load(\"/Volumes//tmp/boundaries.geojson\")\n", - "line_number": 331, - "length_lines": 9, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - 
"confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "bash", - "code": "#!/bin/bash\n# preprocessing.sh\n\nINPUT_DIR=\"/Volumes//raw_satellite\"\nOUTPUT_DIR=\"/Volumes//tmp/processed\"\n\nmkdir -p \"$OUTPUT_DIR\"\n\necho \"Step 1: Reproject to WGS84\"\nfor f in \"$INPUT_DIR\"/*.tif; do\n base=$(basename \"$f\" .tif)\n gdalwarp -t_srs EPSG:4326 -r bilinear \\\n -co COMPRESS=LZW -co TILED=YES \\\n \"$f\" \"$OUTPUT_DIR/${base}_wgs84.tif\"\ndone\n\necho \"Step 2: Create RGB composites\"\nfor scene in $(ls \"$OUTPUT_DIR\"/*_B01_wgs84.tif | sed 's/_B01_wgs84.tif//'); do\n gdal_merge.py -separate -co COMPRESS=LZW \\\n -o \"${scene}_RGB.tif\" \\\n \"${scene}_B01_wgs84.tif\" \\\n \"${scene}_B02_wgs84.tif\" \\\n \"${scene}_B03_wgs84.tif\"\ndone\n\necho \"Step 3: Create overviews\"\nfor f in \"$OUTPUT_DIR\"/*_RGB.tif; do\n gdaladdo -r average \"$f\" 2 4 8 16\ndone\n\necho \"Step 4: Create VRT catalog\"\ngdalbuildvrt \"$OUTPUT_DIR/catalog.vrt\" \"$OUTPUT_DIR\"/*_RGB.tif\n\necho \"Preprocessing complete!\"\n", - "line_number": 349, - "length_lines": 35, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Run preprocessing\n%sh /Volumes//scripts/preprocessing.sh\n\n# Load processed data\nprocessed = 
spark.read.format(\"gdal\").load(\"/Volumes//tmp/processed/*_RGB.tif\")\n\n# Apply GeoBrix operations\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\nresult = processed.select(\n \"*\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_clip(\"tile\", aoi_geometry).alias(\"clipped\")\n)\n\nresult.write.mode(\"overwrite\").saveAsTable(\"processed_imagery\")\n", - "line_number": 387, - "length_lines": 18, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Process in Spark\nresults = spark.sql(\"\"\"\n SELECT path, \n gbx_rst_clip(tile, aoi) as clipped_tile\n FROM rasters\n\"\"\")\n\n# Save to temporary location, e.g. 
parquet or delta\nresults.write.mode(\"overwrite\").format(\"parquet\").save(\"/Volumes//tmp/results\")\n\n# Postprocess with GDAL\n%sh\n# Extract rasters from Parquet and optimize\nfor path in $(ls /Volumes//tmp/results/*.parquet); do\n # Extract and convert\n # (This is simplified - actual extraction would need custom code)\n gdal_translate -co COMPRESS=DEFLATE -co TILED=YES \\\n \"$path\" \"/Volumes//output/$(basename $path .parquet).tif\"\ndone\n\n# Create final mosaic\ngdal_merge.py -co COMPRESS=LZW -o /Volumes//output/final_mosaic.tif \\\n /Volumes//output/*.tif\n", - "line_number": 411, - "length_lines": 24, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import udf\nfrom pyspark.sql.types import BinaryType\nimport subprocess\nimport tempfile\nimport os\n\n@udf(BinaryType())\ndef apply_gdal_operation(tile_binary, operation):\n \"\"\"Apply GDAL CLI operation via UDF\"\"\"\n with tempfile.TemporaryDirectory() as tmpdir:\n # Write tile to temp file\n input_path = os.path.join(tmpdir, \"input.tif\")\n output_path = os.path.join(tmpdir, \"output.tif\")\n \n with open(input_path, 'wb') as f:\n f.write(tile_binary)\n \n # Apply GDAL operation\n if operation == \"hillshade\":\n subprocess.run([\n \"gdaldem\", \"hillshade\",\n input_path, output_path\n ], check=True)\n elif operation == \"slope\":\n subprocess.run([\n \"gdaldem\", \"slope\",\n input_path, output_path\n ], check=True)\n \n # Read result\n with open(output_path, 'rb') as f:\n return f.read()\n\n# Use in Spark\nprocessed = rasters.withColumn(\n 
\"hillshade\",\n apply_gdal_operation(\"tile\", lit(\"hillshade\"))\n)\n", - "line_number": 441, - "length_lines": 39, - "source_file": "advanced/gdal-cli.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# \u2705 Good: Use Unity Catalog Volumes for persistence\n%sh\ngdalwarp -t_srs EPSG:4326 \\\n /Volumes//input/raster.tif \\\n /Volumes//tmp/reprojected.tif\n\n# \u274c Bad: Using local temp (not visible to all nodes, ephemeral)\n%sh\ngdalwarp -t_srs EPSG:4326 \\\n /tmp/input.tif \\\n /tmp/output.tif\n", - "line_number": 488, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create volume for GeoBrix data (catalog.schema.volume)\nCREATE VOLUME IF NOT EXISTS main.default.geobrix;\n\n-- Organize with subdirectories\n-- /Volumes/main/default/geobrix/raw/ - raw input data\n-- /Volumes/main/default/geobrix/tmp/ - intermediate processing\n-- /Volumes/main/default/geobrix/output/ - final results\n", - "line_number": 503, - "length_lines": 8, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - 
"has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Create tiled, compressed GeoTIFFs\ngdal_translate -co TILED=YES \\\n -co COMPRESS=LZW \\\n -co BLOCKXSIZE=256 \\\n -co BLOCKYSIZE=256 \\\n input.tif output.tif\n", - "line_number": 524, - "length_lines": 7, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "# Instead of merging, create VRT\ngdalbuildvrt collection.vrt /data/tiles/*.tif\n\n# More efficient for Spark to read tiles individually\n", - "line_number": 535, - "length_lines": 5, - "source_file": "advanced/gdal-cli.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use parallel processing for batch operations\nfrom concurrent.futures import ThreadPoolExecutor\nimport subprocess\n\ndef reproject_file(input_path):\n output_path = input_path.replace(\"raw\", \"processed\")\n subprocess.run([\n \"gdalwarp\", \"-t_srs\", \"EPSG:4326\",\n input_path, output_path\n 
])\n\nfiles = dbutils.fs.ls(\"/Volumes//raw\")\nwith ThreadPoolExecutor(max_workers=8) as executor:\n executor.map(reproject_file, [f.path for f in files])\n", - "line_number": 544, - "length_lines": 15, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# 1. Preprocess with GDAL\n%sh\ngdalwarp -t_srs EPSG:4326 /Volumes//raw/*.tif /Volumes//tmp/reprojected/\n\n# 2. Process with GeoBrix\ndf = spark.read.format(\"gdal\").load(\"/Volumes//tmp/reprojected\")\nresult = df.select(rx.rst_clip(\"tile\", aoi))\nresult.write.mode(\"overwrite\").saveAsTable(\"results\")\n\n# 3. 
Postprocess with GDAL\n# Export and optimize final outputs\n", - "line_number": 565, - "length_lines": 12, - "source_file": "advanced/gdal-cli.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "@udf(BinaryType())\ndef preprocess_and_load(path):\n \"\"\"Preprocess on-the-fly\"\"\"\n with tempfile.NamedTemporaryFile(suffix='.tif') as tmp:\n subprocess.run([\n \"gdalwarp\", \"-t_srs\", \"EPSG:4326\",\n path, tmp.name\n ])\n with open(tmp.name, 'rb') as f:\n return f.read()\n", - "line_number": 581, - "length_lines": 11, - "source_file": "advanced/gdal-cli.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use GDAL CLI for quick metadata extraction\ninfo = json.loads(subprocess.run(\n [\"gdalinfo\", \"-json\", path],\n capture_output=True, text=True\n).stdout)\n\n# Filter based on metadata\nif info[\"size\"][0] > 1000:\n # Process large files with GeoBrix\n df = spark.read.format(\"gdal\").load(path)\n", - "line_number": 596, - "length_lines": 11, - "source_file": "advanced/gdal-cli.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - 
"has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "%pip install rasterio\n", - "line_number": 37, - "length_lines": 2, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.types import BinaryType, StructType, StructField, DoubleType\nfrom databricks.labs.gbx.rasterx import functions as rx\n\n@f.udf(StructType([\n StructField(\"mean\", DoubleType()),\n StructField(\"std\", DoubleType()),\n StructField(\"min\", DoubleType()),\n StructField(\"max\", DoubleType())\n]))\ndef compute_statistics_rasterio(raster_binary):\n \"\"\"Compute statistics using rasterio\"\"\"\n if raster_binary is None:\n return None\n \n # Convert to bytes (Spark may pass bytearray)\n tile_data = bytes(raster_binary)\n \n # Open binary raster as rasterio dataset\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as dataset:\n # Read first band as NumPy array\n data = dataset.read(1)\n \n # Use NumPy for statistics\n import numpy as np\n return {\n \"mean\": float(np.mean(data)),\n \"std\": float(np.std(data)),\n \"min\": float(np.min(data)),\n \"max\": float(np.max(data))\n }\n\n# Use with binary tiles created from binaryFile content\n# Note: tile.raster contains the binary data when using rst_fromcontent\nbinary_df = 
spark.read.format(\"binaryFile\").load(\"/Volumes//rasters/*.tif\")\ntiles_df = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n# Pass tile.raster (the binary field) to rasterio UDF\nstats = tiles_df.select(\"path\", compute_statistics_rasterio(f.col(\"tile.raster\")).alias(\"stats\"))\nstats.show()\n", - "line_number": 51, - "length_lines": 46, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.types import StringType, IntegerType, ArrayType, DoubleType\nfrom databricks.labs.gbx.rasterx import functions as rx\nimport json\n\n@f.udf(StringType())\ndef extract_metadata_rasterio(tile_bytes):\n \"\"\"Extract comprehensive metadata using rasterio\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n metadata = {\n \"driver\": src.driver,\n \"width\": src.width,\n \"height\": src.height,\n \"count\": src.count,\n \"dtype\": str(src.dtypes[0]),\n \"crs\": str(src.crs) if src.crs else None,\n \"bounds\": src.bounds._asdict(),\n \"transform\": list(src.transform)[:6],\n \"nodata\": src.nodata,\n \"colorinterp\": [ci.name for ci in src.colorinterp]\n }\n return json.dumps(metadata)\n\n@f.udf(ArrayType(IntegerType()))\ndef get_valid_pixel_count(tile_bytes):\n \"\"\"Count valid (non-nodata) pixels\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n 
with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n data = src.read(1)\n nodata = src.nodata\n \n import numpy as np\n if nodata is not None:\n valid_count = int(np.sum(data != nodata))\n else:\n valid_count = int(data.size)\n \n return [valid_count, int(data.size)]\n\n# Usage with binaryFile + rst_fromcontent\nbinary_df = spark.read.format(\"binaryFile\").load(\"/Volumes//rasters/*.tif\")\ndf = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\nresult = df.select(\n \"path\",\n extract_metadata_rasterio(f.col(\"tile.raster\")).alias(\"metadata_json\"),\n get_valid_pixel_count(f.col(\"tile.raster\")).alias(\"pixel_counts\")\n)\n", - "line_number": 103, - "length_lines": 60, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.types import BinaryType\nfrom databricks.labs.gbx.rasterx import functions as rx\nimport numpy as np\n\n@f.udf(BinaryType())\ndef normalize_raster(tile_bytes):\n \"\"\"Normalize raster values to 0-255 range\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n # Read data\n data = src.read()\n profile = src.profile.copy()\n \n # Normalize each band\n normalized = np.zeros_like(data, dtype=np.uint8)\n for i in range(data.shape[0]):\n band = data[i]\n band_min, band_max = band.min(), band.max()\n if band_max > band_min:\n 
normalized[i] = ((band - band_min) / (band_max - band_min) * 255).astype(np.uint8)\n \n # Update profile for uint8\n profile.update(dtype=rasterio.uint8, nodata=None)\n \n # Write to new tile\n output = MemoryFile()\n with output.open(**profile) as dst:\n dst.write(normalized)\n \n return bytes(output.getbuffer())\n\n@f.udf(BinaryType())\ndef compute_ndvi(tile_bytes):\n \"\"\"Compute NDVI from multispectral tile (assuming bands 4=NIR, 3=Red)\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n if src.count < 4:\n return None\n \n # Read NIR (band 4) and Red (band 3)\n nir = src.read(4).astype(float)\n red = src.read(3).astype(float)\n \n # Calculate NDVI: (NIR - Red) / (NIR + Red)\n ndvi = np.where(\n (nir + red) != 0,\n (nir - red) / (nir + red),\n 0\n )\n \n # Create output profile (single band, float32)\n profile = src.profile.copy()\n profile.update(\n count=1,\n dtype=rasterio.float32,\n nodata=-9999\n )\n \n # Write NDVI to new tile\n output = MemoryFile()\n with output.open(**profile) as dst:\n dst.write(ndvi.astype(np.float32), 1)\n \n return bytes(output.getbuffer())\n\n# Usage with binaryFile + rst_fromcontent\nbinary_df = spark.read.format(\"binaryFile\").load(\"/Volumes//multispectral/*.tif\")\ndf = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\nndvi_df = df.withColumn(\"ndvi_tile\", compute_ndvi(f.col(\"tile.raster\")))\nndvi_df.select(\"path\", \"ndvi_tile\").write.format(\"parquet\").save(\"/Volumes//ndvi_results\")\n", - "line_number": 171, - "length_lines": 83, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - 
"uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom rasterio.windows import Window\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.functions import explode, lit\nfrom pyspark.sql.types import ArrayType, StructType, StructField, BinaryType, IntegerType, DoubleType\nfrom databricks.labs.gbx.rasterx import functions as rx\nimport numpy as np\n\n@f.udf(ArrayType(StructType([\n StructField(\"window_id\", IntegerType()),\n StructField(\"col_off\", IntegerType()),\n StructField(\"row_off\", IntegerType()),\n StructField(\"width\", IntegerType()),\n StructField(\"height\", IntegerType()),\n StructField(\"mean\", DoubleType())\n])))\ndef process_windows(tile_bytes, window_size=256):\n \"\"\"Process raster in windows and compute statistics per window\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes) if isinstance(tile_bytes, bytearray) else tile_bytes\n results = []\n \n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n # Iterate over windows\n window_id = 0\n for col_off in range(0, src.width, window_size):\n for row_off in range(0, src.height, window_size):\n # Define window\n width = min(window_size, src.width - col_off)\n height = min(window_size, src.height - row_off)\n window = Window(col_off, row_off, width, height)\n \n # Read window\n data = src.read(1, window=window)\n \n # Compute statistics\n results.append({\n \"window_id\": window_id,\n \"col_off\": col_off,\n \"row_off\": row_off,\n \"width\": width,\n \"height\": height,\n \"mean\": float(np.mean(data))\n })\n window_id += 1\n \n return results\n\n# Usage: Create one row per window\nbinary_df = spark.read.format(\"binaryFile\").load(\"/Volumes//large_rasters/*.tif\")\ndf = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), 
f.lit(\"GTiff\")).alias(\"tile\")\n)\nwindowed = df.select(\n \"path\",\n explode(process_windows(f.col(\"tile.raster\"), lit(512))).alias(\"window\")\n)\nwindowed.select(\"path\", \"window.*\").show()\n", - "line_number": 262, - "length_lines": 63, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom rasterio.transform import from_bounds\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.types import BinaryType\nfrom databricks.labs.gbx.rasterx import functions as rx\nimport numpy as np\n\n@f.udf(BinaryType())\ndef create_raster_from_array(data_array, width, height, bounds, crs=\"EPSG:4326\"):\n \"\"\"Create a GeoTIFF from NumPy array with spatial reference\"\"\"\n # Define transform from bounds\n transform = from_bounds(\n bounds[0], bounds[1], bounds[2], bounds[3],\n width, height\n )\n \n # Create profile\n profile = {\n 'driver': 'GTiff',\n 'height': height,\n 'width': width,\n 'count': 1,\n 'dtype': data_array.dtype,\n 'crs': crs,\n 'transform': transform,\n 'compress': 'lzw'\n }\n \n # Write to memory\n output = MemoryFile()\n with output.open(**profile) as dst:\n dst.write(np.array(data_array).reshape(1, height, width))\n \n return bytes(output.getbuffer())\n\n@f.udf(BinaryType())\ndef apply_color_ramp(tile_bytes):\n \"\"\"Apply a color ramp to single-band raster\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n data = src.read(1)\n \n # Normalize to 0-255\n 
data_min, data_max = data.min(), data.max()\n normalized = ((data - data_min) / (data_max - data_min) * 255).astype(np.uint8)\n \n # Apply simple color ramp (e.g., blue to red)\n import matplotlib.pyplot as plt\n cmap = plt.get_cmap('RdYlBu_r')\n colored = cmap(normalized / 255.0)[:, :, :3] # RGB only\n colored = (colored * 255).astype(np.uint8)\n \n # Create RGB output\n profile = src.profile.copy()\n profile.update(\n count=3,\n dtype=rasterio.uint8,\n photometric='RGB'\n )\n \n output = MemoryFile()\n with output.open(**profile) as dst:\n for i in range(3):\n dst.write(colored[:, :, i], i + 1)\n \n return bytes(output.getbuffer())\n", - "line_number": 333, - "length_lines": 71, - "source_file": "advanced/library-integration.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "import rasterio\nfrom rasterio.io import MemoryFile\nfrom rasterio.warp import calculate_default_transform, reproject, Resampling\nfrom pyspark.sql import functions as f\nfrom pyspark.sql.types import BinaryType, ArrayType, DoubleType\nfrom databricks.labs.gbx.rasterx import functions as rx\n\n@f.udf(BinaryType())\ndef reproject_raster(tile_bytes, dst_crs=\"EPSG:3857\"):\n \"\"\"Reproject raster to new CRS using rasterio\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n # Calculate transform for new CRS\n transform, width, height = calculate_default_transform(\n src.crs, dst_crs, src.width, src.height, *src.bounds\n )\n \n # Update profile\n profile = src.profile.copy()\n profile.update({\n 'crs': 
dst_crs,\n 'transform': transform,\n 'width': width,\n 'height': height\n })\n \n # Reproject\n output = MemoryFile()\n with output.open(**profile) as dst:\n for i in range(1, src.count + 1):\n reproject(\n source=rasterio.band(src, i),\n destination=rasterio.band(dst, i),\n src_transform=src.transform,\n src_crs=src.crs,\n dst_transform=transform,\n dst_crs=dst_crs,\n resampling=Resampling.bilinear\n )\n \n return bytes(output.getbuffer())\n\n@f.udf(ArrayType(DoubleType()))\ndef pixel_to_coords(tile_bytes, col, row):\n \"\"\"Convert pixel coordinates to geographic coordinates\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n # Use affine transform\n x, y = src.xy(row, col)\n return [float(x), float(y)]\n", - "line_number": 412, - "length_lines": 57, - "source_file": "advanced/library-integration.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql import functions as f\nfrom pyspark.sql.functions import col, lit\nfrom pyspark.sql.types import BinaryType, StructType, StructField, StringType, DoubleType, BooleanType\nfrom databricks.labs.gbx.rasterx import functions as rx\nimport rasterio\nfrom rasterio.io import MemoryFile\nfrom rasterio.warp import reproject, Resampling, calculate_default_transform\nimport numpy as np\n\n# UDF 1: Quality check\n@f.udf(StructType([\n StructField(\"valid\", BooleanType()),\n StructField(\"reason\", StringType())\n]))\ndef quality_check(tile_bytes):\n \"\"\"Check if raster meets quality criteria\"\"\"\n if tile_bytes is None:\n 
return {\"valid\": False, \"reason\": \"null_data\"}\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n data = src.read(1)\n \n # Check for sufficient valid data\n if src.nodata is not None:\n valid_ratio = np.sum(data != src.nodata) / data.size\n else:\n valid_ratio = 1.0\n \n if valid_ratio < 0.5:\n return {\"valid\": False, \"reason\": \"insufficient_data\"}\n \n # Check data range\n if data.min() == data.max():\n return {\"valid\": False, \"reason\": \"no_variation\"}\n \n return {\"valid\": True, \"reason\": \"ok\"}\n\n# UDF 2: Standardize CRS and resolution\n@f.udf(BinaryType())\ndef standardize_raster(tile_bytes, target_crs=\"EPSG:4326\", target_resolution=0.0001):\n \"\"\"Reproject and resample to standard CRS and resolution\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n # Calculate new transform\n bounds = src.bounds\n new_width = int((bounds.right - bounds.left) / target_resolution)\n new_height = int((bounds.top - bounds.bottom) / target_resolution)\n \n transform, width, height = calculate_default_transform(\n src.crs, target_crs,\n new_width, new_height,\n *bounds\n )\n \n profile = src.profile.copy()\n profile.update({\n 'crs': target_crs,\n 'transform': transform,\n 'width': width,\n 'height': height\n })\n \n output = MemoryFile()\n with output.open(**profile) as dst:\n reproject(\n source=rasterio.band(src, 1),\n destination=rasterio.band(dst, 1),\n src_transform=src.transform,\n src_crs=src.crs,\n dst_transform=transform,\n dst_crs=target_crs,\n resampling=Resampling.bilinear\n )\n \n return bytes(output.getbuffer())\n\n# UDF 3: Apply processing\n@f.udf(BinaryType())\ndef enhance_contrast(tile_bytes, percentile_low=2, percentile_high=98):\n \"\"\"Enhance contrast using percentile stretch\"\"\"\n if tile_bytes is None:\n return None\n tile_data = bytes(tile_bytes)\n with MemoryFile(tile_data) 
as memfile:\n with memfile.open() as src:\n data = src.read(1).astype(float)\n \n # Compute percentiles\n p_low = np.percentile(data, percentile_low)\n p_high = np.percentile(data, percentile_high)\n \n # Stretch\n stretched = np.clip((data - p_low) / (p_high - p_low) * 255, 0, 255).astype(np.uint8)\n \n profile = src.profile.copy()\n profile.update(dtype=rasterio.uint8)\n \n output = MemoryFile()\n with output.open(**profile) as dst:\n dst.write(stretched, 1)\n \n return bytes(output.getbuffer())\n\n# Pipeline with binaryFile + rst_fromcontent\nbinary_df = spark.read.format(\"binaryFile\").load(\"/Volumes//raw_rasters/*.tif\")\ndf = binary_df.select(\n \"path\",\n rx.rst_fromcontent(col(\"content\"), lit(\"GTiff\")).alias(\"tile\")\n)\n\n# Step 1: Quality check - pass tile.raster (the binary field)\nchecked = df.withColumn(\"qc\", quality_check(col(\"tile.raster\")))\nvalid_tiles = checked.filter(col(\"qc.valid\") == True)\n\n# Step 2: Standardize\nstandardized = valid_tiles.withColumn(\"tile_std\", standardize_raster(col(\"tile.raster\")))\n\n# Step 3: Enhance\nenhanced = standardized.withColumn(\"tile_enhanced\", enhance_contrast(col(\"tile_std\")))\n\n# Save results\nenhanced.select(\"path\", \"tile_enhanced\").write.format(\"parquet\").save(\"/Volumes//processed_rasters\")\n", - "line_number": 477, - "length_lines": 127, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# \u2705 Good: Use MemoryFile for in-memory operations\nwith MemoryFile(tile_bytes) as memfile:\n with memfile.open() as src:\n data = src.read()\n\n# 
\u274c Bad: Writing to disk unnecessarily\nwith open('/tmp/temp.tif', 'wb') as f:\n f.write(tile_bytes)\nwith rasterio.open('/tmp/temp.tif') as src:\n data = src.read()\n", - "line_number": 612, - "length_lines": 11, - "source_file": "advanced/library-integration.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Always check for nodata values\nwith MemoryFile(tile_bytes) as memfile:\n with memfile.open() as src:\n data = src.read(1)\n nodata = src.nodata\n \n if nodata is not None:\n valid_data = data[data != nodata]\n else:\n valid_data = data\n", - "line_number": 627, - "length_lines": 11, - "source_file": "advanced/library-integration.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Copy and update profile to preserve metadata\nwith MemoryFile(tile_bytes) as memfile:\n with memfile.open() as src:\n profile = src.profile.copy() # Preserve original metadata\n profile.update(dtype=new_dtype) # Update only what changes\n", - "line_number": 642, - "length_lines": 6, - "source_file": "advanced/library-integration.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - 
"has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Context managers handle cleanup automatically\nwith MemoryFile(tile_bytes) as memfile:\n with memfile.open() as src:\n # Process...\n pass\n # Resources automatically cleaned up\n", - "line_number": 652, - "length_lines": 7, - "source_file": "advanced/library-integration.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Uses Spark's columnar expression engine\ndf = rasters.select(rx.rst_boundingbox(\"tile\"))\n", - "line_number": 36, - "length_lines": 5, - "source_file": "advanced/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.expressions.accessors.RST_BoundingBox\nimport org.gdal.gdal.Dataset\n\n// Direct GDAL dataset manipulation\nval bbox = RST_BoundingBox.execute(dataset)\n", - "line_number": 50, - "length_lines": 6, - "source_file": "advanced/overview.md", - "category": 
"NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# 1. Preprocess with GDAL CLI (via subprocess or notebook magic)\n# !gdalwarp -t_srs EPSG:4326 input.tif reprojected.tif\n\n# 2. Read with GeoBrix\nrasters = spark.read.format(\"gdal\").load(\"/data/reprojected.tif\")\n\n# 3. Apply custom UDF for specialized logic\nfrom pyspark.sql.functions import udf\nfrom databricks.labs.gbx.rasterx.expressions.accessors import RST_Metadata\n\n@udf(\"map\")\ndef extract_custom_metadata(tile_binary):\n # Custom logic using execute methods\n # (This is simplified - see Custom UDFs guide for details)\n dataset = load_dataset_from_binary(tile_binary)\n metadata = RST_Metadata.execute(dataset)\n # Add custom processing\n metadata[\"processed_date\"] = datetime.now().isoformat()\n return metadata\n\nenriched = rasters.withColumn(\"custom_metadata\", extract_custom_metadata(\"tile\"))\n\n# 4. Use standard GeoBrix for distributed operations\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\nresult = enriched.select(\n \"*\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_clip(\"tile\", aoi_geometry).alias(\"clipped\")\n)\n\n# 5. Integrate with xarray for analysis (see Library Integration guide)\n# Convert to xarray for advanced array operations\n# ...\n\n# 6. 
Save results\nresult.write.mode(\"overwrite\").saveAsTable(\"processed_rasters\")\n", - "line_number": 119, - "length_lines": 39, - "source_file": "advanced/overview.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_cellarea(gridLetter: Column, precision: Column): Column\n", - "line_number": 20, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Calculate area of 1km cell in TQ grid\narea = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000) as area_sqm\")\narea.show()\n# Output: area_sqm = 1000000.0\n\n# Multiple precisions\nareas = spark.sql(\"\"\"\n SELECT\n gbx_bng_cellarea('TQ', 10000) as area_10km,\n gbx_bng_cellarea('TQ', 1000) as area_1km,\n gbx_bng_cellarea('TQ', 100) as area_100m\n\"\"\")\n", - "line_number": 33, - "length_lines": 16, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.{functions => bx}\nbx.register(spark)\n\nval area = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000) as area\")\n", - "line_number": 51, - "length_lines": 5, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_celldistance(cell1: Column, cell2: Column): Column\n", - "line_number": 65, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "distance = spark.sql(\"\"\"\n SELECT gbx_bng_celldistance('TQ3080', 'TQ3180') as distance_m\n\"\"\")\n", - "line_number": 78, - "length_lines": 4, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_cellkring(centerCell: Column, k: Column): Column\n", - "line_number": 91, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Get cells within 2-ring distance\nnearby = spark.sql(\"\"\"\n SELECT\n 'TQ3080' as center,\n gbx_bng_cellkring('TQ3080', 2) as nearby_cells\n\"\"\")\n\n# Explode to individual cells\nfrom pyspark.sql.functions import explode\nexpanded = nearby.select(\n \"center\",\n explode(\"nearby_cells\").alias(\"nearby_cell\")\n)\n", - "line_number": 104, - "length_lines": 14, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_cellfromprecision(easting: Column, northing: Column, precision: Column): Column\n", - "line_number": 127, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, 
- "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_pointtocell(point: Column, precision: Column): Column\n", - "line_number": 148, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Convert points to BNG cells\nlocations = spark.table(\"uk_locations\")\n\nbng_cells = locations.select(\n \"location_id\",\n \"latitude\",\n \"longitude\",\n expr(\"gbx_bng_pointtocell(st_point(longitude, latitude), 1000)\").alias(\"bng_1km\"),\n expr(\"gbx_bng_pointtocell(st_point(longitude, latitude), 100)\").alias(\"bng_100m\")\n)\n\nbng_cells.show()\n", - "line_number": 161, - "length_lines": 15, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Convert points to BNG cells at multiple resolutions\nSELECT\n location_id,\n latitude,\n longitude,\n gbx_bng_pointtocell(st_point(longitude, latitude), 10000) as bng_10km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_1km,\n 
gbx_bng_pointtocell(st_point(longitude, latitude), 100) as bng_100m\nFROM uk_locations;\n", - "line_number": 178, - "length_lines": 10, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_celltoboundary(cell: Column): Column\n", - "line_number": 197, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "boundaries = spark.sql(\"\"\"\n SELECT\n bng_cell,\n gbx_bng_celltoboundary(bng_cell) as cell_boundary\n FROM bng_cells\n\"\"\")\n", - "line_number": 209, - "length_lines": 7, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_celltopoint(cell: Column): Column\n", - "line_number": 225, - "length_lines": 2, - "source_file": 
"api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "centers = spark.sql(\"\"\"\n SELECT\n bng_cell,\n gbx_bng_celltopoint(bng_cell) as center_point\n FROM bng_cells\n\"\"\")\n", - "line_number": 237, - "length_lines": 7, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_getgridletter(cell: Column): Column\n", - "line_number": 255, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "grid_letters = spark.sql(\"\"\"\n SELECT\n bng_cell,\n gbx_bng_getgridletter(bng_cell) as grid_square\n FROM locations\n\"\"\")\n", - "line_number": 267, - "length_lines": 7, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": 
{ - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_getprecision(cell: Column): Column\n", - "line_number": 283, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "precisions = spark.sql(\"\"\"\n SELECT\n bng_cell,\n gbx_bng_getprecision(bng_cell) as precision_m\n FROM locations\n\"\"\")\n", - "line_number": 295, - "length_lines": 7, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "bng_isvalid(cell: Column): Column\n", - "line_number": 311, - "length_lines": 2, - "source_file": "api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "validated = spark.sql(\"\"\"\n SELECT\n bng_ref,\n gbx_bng_isvalid(bng_ref) as is_valid\n FROM input_data\n\"\"\")\n\n# Filter to valid references only\nvalid_only = validated.filter(\"is_valid = true\")\n", - "line_number": 323, - "length_lines": 10, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr, count, avg\n\nbx.register(spark)\n\n# Aggregate measurements by BNG cell\naggregated = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as measurement_count,\n AVG(temperature) as avg_temp,\n MAX(temperature) as max_temp,\n MIN(temperature) as min_temp,\n STDDEV(temperature) as stddev_temp\n FROM weather_measurements\n WHERE country = 'GB'\n GROUP BY bng_cell\n ORDER BY measurement_count DESC\n\"\"\")\n\n# Save results\naggregated.write.mode(\"overwrite\").saveAsTable(\"weather_by_bng\")\n", - "line_number": 341, - "length_lines": 23, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": 
false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Create multi-resolution grid\nmulti_res = spark.sql(\"\"\"\n SELECT\n location_id,\n name,\n st_point(longitude, latitude) as location,\n gbx_bng_pointtocell(st_point(longitude, latitude), 10000) as bng_10km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_1km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 100) as bng_100m\n FROM poi_uk\n\"\"\")\n\n# Count by resolution\nspark.sql(\"\"\"\n SELECT '10km' as resolution, COUNT(DISTINCT bng_10km) as unique_cells\n FROM multi_res\n UNION ALL\n SELECT '1km', COUNT(DISTINCT bng_1km) FROM multi_res\n UNION ALL\n SELECT '100m', COUNT(DISTINCT bng_100m) FROM multi_res\n ORDER BY resolution\n\"\"\").show()\n", - "line_number": 370, - "length_lines": 26, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Index both datasets with BNG\nincidents = spark.sql(\"\"\"\n SELECT\n incident_id,\n incident_type,\n gbx_bng_pointtocell(location, 1000) as bng_cell\n FROM incidents\n\"\"\")\n\nresources = spark.sql(\"\"\"\n SELECT\n resource_id,\n resource_type,\n gbx_bng_pointtocell(location, 1000) as bng_cell\n FROM emergency_resources\n\"\"\")\n\n# Join on BNG cell for fast spatial matching\nmatched = incidents.join(resources, on=\"bng_cell\", how=\"inner\")\n\n# Find incidents 
with nearby resources\nmatched.select(\n \"incident_id\",\n \"incident_type\",\n \"resource_id\",\n \"resource_type\",\n \"bng_cell\"\n).show()\n", - "line_number": 402, - "length_lines": 32, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr, explode\nbx.register(spark)\n\n# Find all data within k-ring of important locations\nimportant_sites = spark.sql(\"\"\"\n SELECT\n site_id,\n site_name,\n gbx_bng_pointtocell(location, 1000) as center_cell\n FROM critical_sites\n\"\"\")\n\n# Generate 3-ring around each site\nnearby_cells = important_sites.select(\n \"site_id\",\n \"site_name\",\n \"center_cell\",\n expr(\"gbx_bng_cellkring(center_cell, 3)\").alias(\"nearby_cells\")\n)\n\n# Explode to individual cells\nexpanded = nearby_cells.select(\n \"site_id\",\n \"site_name\",\n explode(\"nearby_cells\").alias(\"bng_cell\")\n)\n\n# Join with sensor data\nsensor_data = spark.table(\"sensor_readings_by_bng\")\n\nresults = expanded.join(\n sensor_data,\n on=\"bng_cell\",\n how=\"inner\"\n)\n\nresults.write.mode(\"overwrite\").saveAsTable(\"site_nearby_sensors\")\n", - "line_number": 440, - "length_lines": 39, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - 
"uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Create a comprehensive BNG grid catalog\ncatalog = spark.sql(\"\"\"\n SELECT\n bng_cell,\n gbx_bng_getgridletter(bng_cell) as grid_square,\n gbx_bng_getprecision(bng_cell) as precision_m,\n gbx_bng_cellarea(\n gbx_bng_getgridletter(bng_cell),\n gbx_bng_getprecision(bng_cell)\n ) as area_sqm,\n gbx_bng_celltopoint(bng_cell) as center_point,\n gbx_bng_celltoboundary(bng_cell) as boundary\n FROM (\n SELECT DISTINCT bng_cell\n FROM location_index\n )\n\"\"\")\n\ncatalog.write.mode(\"overwrite\").saveAsTable(\"bng_grid_catalog\")\n", - "line_number": 485, - "length_lines": 23, - "source_file": "api/gridx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Add BNG cell column to table\nenriched = locations.withColumn(\n \"bng_1km\",\n expr(\"gbx_bng_pointtocell(st_point(lon, lat), 1000)\")\n)\n\n# Partition by BNG for efficient queries\nenriched.write.partitionBy(\"bng_1km\").saveAsTable(\"locations_indexed\")\n", - "line_number": 543, - "length_lines": 9, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Optimize table with Z-ordering on BNG cell\nOPTIMIZE locations_indexed\nZORDER BY (bng_1km);\n", - "line_number": 556, - "length_lines": 4, - "source_file": "api/gridx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use coarser resolution for broad analysis\ncoarse = expr(\"gbx_bng_pointtocell(location, 10000)\") # 10km\n\n# Use finer resolution for detailed analysis\nfine = expr(\"gbx_bng_pointtocell(location, 100)\") # 100m\n", - "line_number": 564, - "length_lines": 6, - "source_file": "api/gridx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom databricks.labs.gbx.gridx.bng import functions as bx\nfrom databricks.labs.gbx.vectorx import functions as vx\n\n# Register each package\nrx.register(spark)\nbx.register(spark)\nvx.register(spark)\n", - "line_number": 55, - "length_lines": 9, - "source_file": "api/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - 
"has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport com.databricks.labs.gbx.gridx.bng.{functions => bx}\nimport com.databricks.labs.gbx.vectorx.{functions => vx}\n\n// Register each package\nrx.register(spark)\nbx.register(spark)\nvx.register(spark)\n", - "line_number": 68, - "length_lines": 9, - "source_file": "api/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- No registration needed in SQL\n-- Functions are available after Python/Scala registration\n\nSHOW FUNCTIONS LIKE 'gbx_*';\n", - "line_number": 83, - "length_lines": 5, - "source_file": "api/overview.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Import functions with alias\nfrom databricks.labs.gbx.rasterx import functions as rx\n\n# Register with Spark\nrx.register(spark)\n\n# Use in DataFrame operations\ndf = rasters.select(rx.rst_boundingbox(\"tile\"))\n\n# Or use in 
SQL after registration\nspark.sql(\"SELECT gbx_rst_boundingbox(tile) FROM rasters\")\n", - "line_number": 129, - "length_lines": 12, - "source_file": "api/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Register in Python\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Use in Python\npython_result = rasters.select(rx.rst_boundingbox(\"tile\"))\n\n# Use in SQL\nsql_result = spark.sql(\"\"\"\n SELECT gbx_rst_boundingbox(tile) as bbox\n FROM rasters\n\"\"\")\n\n# Both return the same results\n", - "line_number": 145, - "length_lines": 15, - "source_file": "api/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql.functions import expr\n\nrx.register(spark)\n\n# Chain multiple operations\nresult = (\n rasters\n .select(\n \"path\",\n rx.rst_clip(\"tile\", expr(\"st_geomfromtext('POLYGON(...)')\")).alias(\"clipped\")\n )\n .select(\n \"path\",\n \"clipped\",\n rx.rst_boundingbox(\"clipped\").alias(\"new_bounds\")\n )\n)\n", - "line_number": 164, - "length_lines": 19, - "source_file": "api/overview.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - 
"analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# List registered functions\nspark.sql(\"SHOW FUNCTIONS LIKE 'gbx_*'\").show()\n\n# Describe a specific function\nspark.sql(\"DESCRIBE FUNCTION EXTENDED gbx_rst_boundingbox\").show()\n", - "line_number": 189, - "length_lines": 6, - "source_file": "api/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Error: Function 'gbx_rst_boundingbox' not found\n\n# Solution: Register functions first\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n", - "line_number": 201, - "length_lines": 6, - "source_file": "api/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Error: No module named 'databricks.labs.gbx'\n\n# Solution: Ensure the wheel is installed on the cluster\n# Check cluster libraries\n", - "line_number": 211, - "length_lines": 5, - "source_file": 
"api/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Register functions once at the start of your notebook\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Then use throughout the notebook\n# Don't re-register in every cell\n", - "line_number": 222, - "length_lines": 7, - "source_file": "api/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Prefer DataFrame API for complex operations\nresult = df.select(rx.rst_boundingbox(\"tile\"))\n\n# Over repeated SQL calls\n# result = spark.sql(\"SELECT gbx_rst_boundingbox(tile) FROM df\")\n", - "line_number": 233, - "length_lines": 6, - "source_file": "api/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Process multiple columns at once\nresult = df.select(\n 
rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n", - "line_number": 243, - "length_lines": 8, - "source_file": "api/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Verify installation\nimport databricks.labs.gbx\nprint(\"GeoBrix installed successfully\")\n", - "line_number": 13, - "length_lines": 4, - "source_file": "api/python.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Register functions\nrx.register(spark)\n\n# Use functions\ndf = rasters.select(rx.rst_boundingbox(\"tile\"))\n", - "line_number": 23, - "length_lines": 8, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - 
{ - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\n\n# Register functions\nbx.register(spark)\n\n# Use functions\ndf = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000)\")\n", - "line_number": 35, - "length_lines": 8, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\n\n# Register functions\nvx.register(spark)\n\n# Use functions\ndf = legacy_data.select(vx.st_legacyaswkb(\"mosaic_geom\"))\n", - "line_number": 47, - "length_lines": 8, - "source_file": "api/python.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nbbox_df = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\")\n)\n", - "line_number": 67, - "length_lines": 9, - "source_file": "api/python.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": 
null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "width_df = rasters.select(rx.rst_width(\"tile\").alias(\"width\"))\n", - "line_number": 82, - "length_lines": 2, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "height_df = rasters.select(rx.rst_height(\"tile\").alias(\"height\"))\n", - "line_number": 90, - "length_lines": 2, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "bands_df = rasters.select(rx.rst_numbands(\"tile\").alias(\"num_bands\"))\n", - "line_number": 98, - "length_lines": 2, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - 
"code": "metadata_df = rasters.select(rx.rst_metadata(\"tile\").alias(\"metadata\"))\n", - "line_number": 106, - "length_lines": 2, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "srid_df = rasters.select(rx.rst_srid(\"tile\").alias(\"srid\"))\n", - "line_number": 114, - "length_lines": 2, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\nclipped = rasters.select(\n rx.rst_clip(\n \"tile\",\n expr(\"st_geomfromtext('POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))')\")\n ).alias(\"clipped_tile\")\n)\n", - "line_number": 126, - "length_lines": 9, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "resampled = rasters.select(\n 
rx.rst_resample(\"tile\", 1024, 1024).alias(\"resampled_tile\")\n)\n", - "line_number": 141, - "length_lines": 4, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql.functions import expr\n\n# Register functions\nrx.register(spark)\n\n# Read rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite\")\n\n# Extract metadata and process\nresult = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"bands\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n).filter(\n \"width > 1000 AND height > 1000\"\n)\n\nresult.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 149, - "length_lines": 23, - "source_file": "api/python.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Calculate cell area\narea = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000) as area_sqm\")\narea.show()\n", - "line_number": 184, - 
"length_lines": 7, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\npoints = spark.table(\"uk_locations\")\nbng_cells = points.select(\n \"location_id\",\n expr(\"gbx_bng_pointtocell(st_point(longitude, latitude), 1000)\").alias(\"bng_cell\")\n)\n", - "line_number": 197, - "length_lines": 8, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr, count\n\n# Register functions\nbx.register(spark)\n\n# Aggregate points by BNG cell\nresult = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as point_count,\n AVG(value) as avg_value\n FROM measurements\n WHERE country = 'GB'\n GROUP BY bng_cell\n\"\"\")\n\nresult.write.mode(\"overwrite\").saveAsTable(\"bng_aggregated\")\n", - "line_number": 209, - "length_lines": 19, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, 
- "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\n# Register functions\nvx.register(spark)\n\n# Convert legacy geometries\nlegacy = spark.table(\"legacy_mosaic_table\")\nconverted = legacy.select(\n \"feature_id\",\n vx.st_legacyaswkb(\"mosaic_geom\").alias(\"wkb_geom\")\n)\n\n# Convert to Databricks GEOMETRY type\ngeometry_df = converted.select(\n \"feature_id\",\n \"wkb_geom\",\n expr(\"st_geomfromwkb(wkb_geom)\").alias(\"geometry\")\n)\n\ngeometry_df.write.mode(\"overwrite\").saveAsTable(\"converted_features\")\n", - "line_number": 238, - "length_lines": 22, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\n# Register functions\nvx.register(spark)\n\n# Full migration workflow\nlegacy_table = spark.table(\"legacy_mosaic_geometries\")\n\n# Convert and validate\nmigrated = legacy_table.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n).select(\n \"feature_id\",\n \"properties\",\n \"geometry\",\n expr(\"st_isvalid(geometry)\").alias(\"is_valid\"),\n expr(\"st_area(geometry)\").alias(\"area\")\n).filter(\"is_valid = true\")\n\n# Save to 
Delta\nmigrated.write.mode(\"overwrite\").saveAsTable(\"migrated_features\")\n", - "line_number": 264, - "length_lines": 24, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Single function\nresult = df.select(rx.rst_boundingbox(\"tile\"))\n\n# Multiple functions\nresult = df.select(\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\")\n)\n\n# With column renaming\nresult = df.select(\n \"path\",\n rx.rst_metadata(\"tile\").alias(\"raster_metadata\")\n)\n", - "line_number": 294, - "length_lines": 19, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Filter based on GeoBrix function results\nresult = df.filter(\n rx.rst_width(\"tile\") > 1000\n)\n\n# Complex filters\nresult = df.filter(\n (rx.rst_width(\"tile\") > 1000) &\n (rx.rst_height(\"tile\") > 1000) &\n (rx.rst_numbands(\"tile\") >= 3)\n)\n", - "line_number": 317, - "length_lines": 12, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Add new columns\nresult = df.withColumn(\"bbox\", rx.rst_boundingbox(\"tile\"))\nresult = df.withColumn(\"width\", rx.rst_width(\"tile\"))\nresult = df.withColumn(\"height\", rx.rst_height(\"tile\"))\n\n# Chain operations\nresult = (\n df\n .withColumn(\"bbox\", rx.rst_boundingbox(\"tile\"))\n .withColumn(\"width\", rx.rst_width(\"tile\"))\n .withColumn(\"height\", rx.rst_height(\"tile\"))\n)\n", - "line_number": 333, - "length_lines": 13, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Create temp view\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nrasters.createOrReplaceTempView(\"rasters\")\n\n# Use in SQL\nresult = spark.sql(\"\"\"\n SELECT\n path,\n gbx_rst_boundingbox(tile) as bbox,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height\n FROM rasters\n WHERE gbx_rst_width(tile) > 1000\n\"\"\")\n", - "line_number": 352, - "length_lines": 18, - "source_file": "api/python.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, 
- "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql import DataFrame\nfrom databricks.labs.gbx.rasterx import functions as rx\n\ndef process_rasters(df: DataFrame) -> DataFrame:\n \"\"\"\n Process rasters and extract metadata.\n \n Args:\n df: DataFrame with 'tile' column\n \n Returns:\n DataFrame with extracted metadata\n \"\"\"\n rx.register(df.sparkSession)\n \n return df.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\")\n )\n", - "line_number": 376, - "length_lines": 22, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\ntry:\n rx.register(spark)\n result = df.select(rx.rst_boundingbox(\"tile\"))\n result.show()\nexcept Exception as e:\n print(f\"Error processing rasters: {e}\")\n", - "line_number": 402, - "length_lines": 9, - "source_file": "api/python.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": true, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - 
} - }, - { - "language": "scala", - "code": "rst_boundingbox(tile: Column): Column\n", - "line_number": 18, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nbbox_df = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\")\n)\n", - "line_number": 30, - "length_lines": 9, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nrx.register(spark)\n\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nval bboxDf = rasters.select(\n col(\"path\"),\n rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\")\n)\n", - "line_number": 41, - "length_lines": 9, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": 
false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_boundingbox(tile) as bbox\nFROM gdal.`/data/rasters`;\n", - "line_number": 52, - "length_lines": 5, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_width(tile: Column): Column\n", - "line_number": 66, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get dimensions from a file\ndf = spark.range(1).select(\n rx.rst_width(\n rx.rst_fromfile(f.lit(\"/data/raster.tif\"), f.lit(\"GTiff\"))\n ).alias(\"width\")\n)\n\n# Get dimensions for rasters in a table\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\ndims_df = rasters.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_pixelwidth(\"tile\").alias(\"pixel_width\"),\n rx.rst_pixelheight(\"tile\").alias(\"pixel_height\")\n)\n", - "line_number": 78, - 
"length_lines": 20, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Get width from raster table\nSELECT gbx_rst_width(tile) as width FROM rasters;\n\n-- Get multiple dimensions\nSELECT \n path,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height,\n gbx_rst_pixelwidth(tile) as pixel_width_m\nFROM rasters;\n", - "line_number": 100, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_height(tile: Column): Column\n", - "line_number": 120, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# Calculate total pixel count\ndf = rasters.select(\n \"path\",\n rx.rst_height(\"tile\").alias(\"height\"),\n 
rx.rst_width(\"tile\").alias(\"width\"),\n (rx.rst_width(\"tile\") * rx.rst_height(\"tile\")).alias(\"total_pixels\")\n)\n", - "line_number": 132, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT \n gbx_rst_height(tile) as height,\n gbx_rst_width(tile) as width,\n gbx_rst_width(tile) * gbx_rst_height(tile) as total_pixels\nFROM rasters;\n", - "line_number": 142, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_numbands(tile: Column): Column\n", - "line_number": 157, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "bands_df = 
rasters.select(rx.rst_numbands(\"tile\").alias(\"num_bands\"))\n", - "line_number": 169, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT gbx_rst_numbands(tile) as bands FROM rasters;\n", - "line_number": 173, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_metadata(tile: Column): Column\n", - "line_number": 184, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "metadata_df = rasters.select(rx.rst_metadata(\"tile\").alias(\"metadata\"))\n", - "line_number": 196, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - 
"analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT gbx_rst_metadata(tile) as metadata FROM rasters;\n", - "line_number": 200, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_srid(tile: Column): Column\n", - "line_number": 211, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "srid_df = rasters.select(rx.rst_srid(\"tile\").alias(\"srid\"))\n", - "line_number": 223, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - 
"uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT gbx_rst_srid(tile) as srid FROM rasters;\n", - "line_number": 227, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "scala", - "code": "rst_bandmetadata(tile: Column, bandIndex: Column): Column\n", - "line_number": 238, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "band_meta = rasters.select(\n rx.rst_bandmetadata(\"tile\", lit(1)).alias(\"band1_metadata\")\n)\n", - "line_number": 251, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": 
"rst_pixelcount(tile: Column): Column\n", - "line_number": 264, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "pixel_count = rasters.select(rx.rst_pixelcount(\"tile\").alias(\"pixel_count\"))\n", - "line_number": 276, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_avg(tile: Column): Column\n", - "line_number": 287, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get average values for all bands\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\navg_values = 
rasters.select(\n \"path\",\n rx.rst_avg(f.col(\"tile\")).alias(\"band_averages\")\n)\n\n# Extract individual band averages for multi-band rasters\navg_values.select(\n \"path\",\n f.col(\"band_averages\").getItem(0).alias(\"band1_avg\"),\n f.col(\"band_averages\").getItem(1).alias(\"band2_avg\"),\n f.col(\"band_averages\").getItem(2).alias(\"band3_avg\")\n).show()\n\n# Filter rasters by average value threshold\nbright_rasters = rasters.select(\n \"path\",\n rx.rst_avg(f.col(\"tile\")).alias(\"avgs\")\n).filter(\n f.col(\"avgs\").getItem(0) > 100.0\n)\n\n# Compute statistics across raster collection\ncollection_stats = rasters.select(\n f.avg(rx.rst_avg(f.col(\"tile\")).getItem(0)).alias(\"collection_mean\"),\n f.stddev(rx.rst_avg(f.col(\"tile\")).getItem(0)).alias(\"collection_stddev\")\n)\n", - "line_number": 299, - "length_lines": 33, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\n// Get average per band\nval avgValues = rasters.select(\n col(\"path\"),\n rx.rst_avg(col(\"tile\")).alias(\"band_avgs\")\n)\n", - "line_number": 334, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Get average values\nSELECT\n path,\n gbx_rst_avg(tile) as band_averages,\n gbx_rst_avg(tile)[0] as band1_avg\nFROM rasters;\n\n-- Filter by average threshold\nSELECT * FROM rasters\nWHERE gbx_rst_avg(tile)[0] > 50.0;\n", - "line_number": 347, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_min(tile: Column): Column\n", - "line_number": 374, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Find minimum values\nrasters = spark.read.format(\"gdal\").load(\"/data/elevation/*.tif\")\n\nmin_elevations = rasters.select(\n \"path\",\n rx.rst_min(f.col(\"tile\")).getItem(0).alias(\"min_elevation_m\")\n)\n\n# Quality check: detect negative values in non-negative data\nquality_check = rasters.select(\n \"path\",\n 
rx.rst_min(f.col(\"tile\")).alias(\"mins\")\n).filter(\n f.col(\"mins\").getItem(0) < 0\n)\n\n# Range analysis\nranges = rasters.select(\n \"path\",\n rx.rst_min(f.col(\"tile\")).getItem(0).alias(\"min_value\"),\n rx.rst_max(f.col(\"tile\")).getItem(0).alias(\"max_value\"),\n (rx.rst_max(f.col(\"tile\")).getItem(0) - \n rx.rst_min(f.col(\"tile\")).getItem(0)).alias(\"range\")\n)\n", - "line_number": 386, - "length_lines": 28, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval minVals = rasters.select(\n rx.rst_min(col(\"tile\")).alias(\"min_values\")\n)\n", - "line_number": 416, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_min(tile)[0] as min_value,\n gbx_rst_max(tile)[0] as max_value,\n gbx_rst_max(tile)[0] - gbx_rst_min(tile)[0] as value_range\nFROM elevation_rasters;\n", - "line_number": 425, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_max(tile: Column): Column\n", - "line_number": 447, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Find maximum values\ntemps = spark.read.format(\"gdal\").load(\"/data/temperature/*.tif\")\n\nmax_temps = temps.select(\n \"path\",\n \"date\",\n rx.rst_max(f.col(\"tile\")).getItem(0).alias(\"max_temp_c\")\n)\n\n# Find extreme heat events\nheat_waves = max_temps.filter(f.col(\"max_temp_c\") > 40.0)\n\n# Peak detection across time series\ndaily_peaks = temps.groupBy(f.to_date(\"date\")).agg(\n f.max(rx.rst_max(f.col(\"tile\")).getItem(0)).alias(\"daily_peak_temp\")\n)\n\n# Saturation detection (sensor maximum)\nsaturated = rasters.select(\n \"path\",\n rx.rst_max(f.col(\"tile\")).alias(\"maxs\")\n).filter(\n f.col(\"maxs\").getItem(0) >= 255.0 # 8-bit saturation\n)\n", - "line_number": 459, - "length_lines": 28, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, 
- "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval maxVals = rasters.select(\n rx.rst_max(col(\"tile\")).alias(\"max_values\")\n)\n", - "line_number": 489, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n date,\n MAX(gbx_rst_max(tile)[0]) as peak_temperature\nFROM daily_temps\nGROUP BY date\nORDER BY date;\n", - "line_number": 498, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_median(tile: Column): Column\n", - "line_number": 520, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - 
"uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Robust central tendency measurement\nrasters = spark.read.format(\"gdal\").load(\"/data/ndvi/*.tif\")\n\nstats = rasters.select(\n \"path\",\n rx.rst_avg(f.col(\"tile\")).getItem(0).alias(\"mean_ndvi\"),\n rx.rst_median(f.col(\"tile\")).getItem(0).alias(\"median_ndvi\")\n)\n\n# Detect skewness (mean != median suggests outliers)\nskewed = stats.filter(\n f.abs(f.col(\"mean_ndvi\") - f.col(\"median_ndvi\")) > 0.1\n)\n\n# Cloud-free composite using median (reduces cloud outliers)\ntime_series = spark.table(\"daily_satellite_images\")\n\ncomposite = time_series.groupBy(\"pixel_location\").agg(\n rx.rst_median(f.collect_list(\"tile\")).alias(\"cloud_free_median\")\n)\n\n# Robust quality metric\nquality_score = rasters.select(\n \"path\",\n (rx.rst_median(f.col(\"tile\")).getItem(0) - \n rx.rst_avg(f.col(\"tile\")).getItem(0)).alias(\"outlier_influence\")\n)\n", - "line_number": 532, - "length_lines": 31, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval medianVals = rasters.select(\n rx.rst_median(col(\"tile\")).alias(\"median_values\")\n)\n", - "line_number": 565, - "length_lines": 7, - "source_file": 
"api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_avg(tile)[0] as mean_value,\n gbx_rst_median(tile)[0] as median_value,\n ABS(gbx_rst_avg(tile)[0] - gbx_rst_median(tile)[0]) as skewness\nFROM rasters;\n", - "line_number": 574, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_format(tile: Column): Column\n", - "line_number": 598, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Identify raster formats\nrasters = spark.read.format(\"gdal\").load(\"/data/mixed_formats/*\")\n\nformats = rasters.select(\n 
\"path\",\n rx.rst_format(f.col(\"tile\")).alias(\"format\")\n)\n\n# Group by format\nformat_counts = formats.groupBy(\"format\").count()\n\n# Filter by format\ngeotiffs = rasters.filter(\n rx.rst_format(f.col(\"tile\")) == \"GTiff\"\n)\n\n# Format validation\ninvalid_formats = rasters.filter(\n ~rx.rst_format(f.col(\"tile\")).isin(\"GTiff\", \"PNG\", \"JPEG\")\n)\n\n# Format inventory for data catalog\ncatalog = rasters.select(\n \"path\",\n rx.rst_format(f.col(\"tile\")).alias(\"format\"),\n rx.rst_width(f.col(\"tile\")).alias(\"width\"),\n rx.rst_height(f.col(\"tile\")).alias(\"height\"),\n rx.rst_numbands(f.col(\"tile\")).alias(\"bands\")\n)\n", - "line_number": 610, - "length_lines": 33, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Get formats\nval formats = rasters.select(\n col(\"path\"),\n rx.rst_format(col(\"tile\")).alias(\"format\")\n)\n\n// Filter GeoTIFFs\nval geotiffs = rasters.filter(rx.rst_format(col(\"tile\")) === \"GTiff\")\n", - "line_number": 645, - "length_lines": 12, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": 
false - } - }, - { - "language": "sql", - "code": "-- Identify formats\nSELECT\n gbx_rst_format(tile) as format,\n COUNT(*) as count\nFROM rasters\nGROUP BY gbx_rst_format(tile);\n\n-- Find non-GeoTIFF files\nSELECT path, gbx_rst_format(tile) as format\nFROM rasters\nWHERE gbx_rst_format(tile) != 'GTiff';\n", - "line_number": 659, - "length_lines": 12, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_type(tile: Column): Column\n", - "line_number": 692, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get data types\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\ntypes = rasters.select(\n \"path\",\n rx.rst_type(f.col(\"tile\")).alias(\"band_types\")\n)\n\n# Extract individual band types for multi-band rasters\ntypes.select(\n \"path\",\n f.col(\"band_types\").getItem(0).alias(\"band1_type\"),\n f.col(\"band_types\").getItem(1).alias(\"band2_type\"),\n f.col(\"band_types\").getItem(2).alias(\"band3_type\")\n).show()\n\n# Filter by data 
type\nfloat_rasters = rasters.filter(\n rx.rst_type(f.col(\"tile\")).getItem(0) == \"Float32\"\n)\n\n# Identify mixed-type multi-band rasters\nfrom pyspark.sql.functions import size, array_distinct\n\nmixed_types = rasters.select(\n \"path\",\n rx.rst_type(f.col(\"tile\")).alias(\"types\")\n).filter(\n f.size(f.array_distinct(f.col(\"types\"))) > 1\n)\n\n# Data type inventory\ntype_stats = rasters.select(\n \"path\",\n rx.rst_type(f.col(\"tile\")).alias(\"types\"),\n rx.rst_numbands(f.col(\"tile\")).alias(\"num_bands\")\n).select(\n f.col(\"types\").getItem(0).alias(\"type\")\n).groupBy(\"type\").count()\n\n# Precision check (detect unexpected high-precision data)\nhigh_precision = rasters.filter(\n rx.rst_type(f.col(\"tile\")).getItem(0).isin(\"Float64\", \"Int32\", \"UInt32\")\n)\n", - "line_number": 704, - "length_lines": 48, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Get data types\nval types = rasters.select(\n col(\"path\"),\n rx.rst_type(col(\"tile\")).alias(\"band_types\")\n)\n\n// Filter Float32 rasters\nval floatRasters = rasters.filter(\n rx.rst_type(col(\"tile\")).getItem(0) === \"Float32\"\n)\n", - "line_number": 754, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - 
"uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Get data types\nSELECT\n path,\n gbx_rst_type(tile) as band_types,\n gbx_rst_type(tile)[0] as band1_type\nFROM rasters;\n\n-- Group by data type\nSELECT\n gbx_rst_type(tile)[0] as data_type,\n COUNT(*) as count\nFROM rasters\nGROUP BY gbx_rst_type(tile)[0];\n\n-- Find 8-bit rasters\nSELECT path\nFROM rasters\nWHERE gbx_rst_type(tile)[0] = 'Byte';\n", - "line_number": 770, - "length_lines": 19, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_pixelwidth(tile: Column): Column\n", - "line_number": 815, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get pixel resolution\nrasters = spark.read.format(\"gdal\").load(\"/data/*.tif\")\n\nresolution = rasters.select(\n \"path\",\n rx.rst_pixelwidth(f.col(\"tile\")).alias(\"pixel_width_m\"),\n 
rx.rst_pixelheight(f.col(\"tile\")).alias(\"pixel_height_m\")\n)\n\n# Calculate total ground coverage\ncoverage = rasters.select(\n \"path\",\n (rx.rst_width(f.col(\"tile\")) * rx.rst_pixelwidth(f.col(\"tile\"))).alias(\"width_m\"),\n (rx.rst_height(f.col(\"tile\")) * rx.rst_pixelheight(f.col(\"tile\"))).alias(\"height_m\")\n)\n", - "line_number": 827, - "length_lines": 19, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_pixelwidth(tile) as pixel_width,\n gbx_rst_pixelheight(tile) as pixel_height,\n gbx_rst_width(tile) * gbx_rst_pixelwidth(tile) as total_width_m\nFROM rasters;\n", - "line_number": 848, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_pixelheight(tile: Column): Column\n", - "line_number": 869, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": 
null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# Compare resolutions\nresolution_comparison = rasters.select(\n \"path\",\n rx.rst_pixelwidth(f.col(\"tile\")).alias(\"width_m\"),\n rx.rst_pixelheight(f.col(\"tile\")).alias(\"height_m\"),\n (rx.rst_pixelwidth(f.col(\"tile\")) / rx.rst_pixelheight(f.col(\"tile\"))).alias(\"aspect_ratio\")\n)\n", - "line_number": 881, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_scalex(tile: Column): Column\n", - "line_number": 902, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# Get geotransform components\ngeotransform = rasters.select(\n \"path\",\n rx.rst_scalex(f.col(\"tile\")).alias(\"scale_x\"),\n rx.rst_scaley(f.col(\"tile\")).alias(\"scale_y\"),\n rx.rst_skewx(f.col(\"tile\")).alias(\"skew_x\"),\n rx.rst_skewy(f.col(\"tile\")).alias(\"skew_y\")\n)\n", - "line_number": 914, - "length_lines": 9, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - 
"confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_scaley(tile: Column): Column\n", - "line_number": 936, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "scala", - "code": "rst_upperleftx(tile: Column): Column\n", - "line_number": 957, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# Get raster extent\nextent = rasters.select(\n \"path\",\n rx.rst_upperleftx(f.col(\"tile\")).alias(\"min_x\"),\n rx.rst_upperlefty(f.col(\"tile\")).alias(\"max_y\"),\n (rx.rst_upperleftx(f.col(\"tile\")) + \n rx.rst_width(f.col(\"tile\")) * rx.rst_scalex(f.col(\"tile\"))).alias(\"max_x\"),\n (rx.rst_upperlefty(f.col(\"tile\")) + \n rx.rst_height(f.col(\"tile\")) * rx.rst_scaley(f.col(\"tile\"))).alias(\"min_y\")\n)\n", - 
"line_number": 969, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_upperlefty(tile: Column): Column\n", - "line_number": 993, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "scala", - "code": "rst_getnodata(tile: Column): Column\n", - "line_number": 1014, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get NoData values\nnodata_values = rasters.select(\n \"path\",\n rx.rst_getnodata(f.col(\"tile\")).alias(\"nodata_per_band\")\n)\n\n# Check for common NoData values\ncommon_nodata = rasters.select(\n 
\"path\",\n rx.rst_getnodata(f.col(\"tile\")).getItem(0).alias(\"nodata\")\n).filter(\n f.col(\"nodata\").isin(-9999.0, -32768.0, 0.0, 255.0)\n)\n\n# Identify rasters without NoData\nno_nodata_set = rasters.filter(\n f.col(\"nodata\").isNull()\n)\n", - "line_number": 1026, - "length_lines": 22, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_getnodata(tile) as nodata_values,\n gbx_rst_getnodata(tile)[0] as band1_nodata\nFROM rasters;\n", - "line_number": 1050, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_rotation(tile: Column): Column\n", - "line_number": 1070, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - 
"language": "python", - "code": "rotation = rasters.select(rx.rst_rotation(f.col(\"tile\")).alias(\"rotation_deg\"))\n", - "line_number": 1082, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_skewx(tile: Column): Column\n", - "line_number": 1097, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "skew = rasters.select(\n rx.rst_skewx(f.col(\"tile\")).alias(\"skew_x\"),\n rx.rst_skewy(f.col(\"tile\")).alias(\"skew_y\")\n)\n", - "line_number": 1109, - "length_lines": 5, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_skewy(tile: Column): Column\n", - "line_number": 1127, - "length_lines": 2, - "source_file": 
"api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "scala", - "code": "rst_georeference(tile: Column): Column\n", - "line_number": 1148, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "georef = rasters.select(rx.rst_georeference(f.col(\"tile\")).alias(\"georef\"))\n", - "line_number": 1160, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_getsubdataset(tile: Column, subsetName: Column): Column\n", - "line_number": 1175, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - 
"has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Load NetCDF file\nnetcdf = spark.read.format(\"gdal\") \\\n .option(\"driverName\", \"NetCDF\") \\\n .load(\"/data/climate.nc\")\n\n# List available subdatasets first\nsubdatasets_list = netcdf.select(\n rx.rst_subdatasets(f.col(\"tile\")).alias(\"subdatasets\")\n)\n\n# Extract specific subdataset\ntemperature = netcdf.select(\n rx.rst_getsubdataset(f.col(\"tile\"), f.lit(\"temperature\")).alias(\"temp_tile\")\n)\n\n# Extract multiple subdatasets\nclimate_vars = netcdf.select(\n \"path\",\n rx.rst_getsubdataset(f.col(\"tile\"), f.lit(\"temperature\")).alias(\"temp\"),\n rx.rst_getsubdataset(f.col(\"tile\"), f.lit(\"precipitation\")).alias(\"precip\"),\n rx.rst_getsubdataset(f.col(\"tile\"), f.lit(\"humidity\")).alias(\"humid\")\n)\n", - "line_number": 1188, - "length_lines": 26, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_getsubdataset(tile, 'temperature') as temp_layer\nFROM netcdf_files;\n", - "line_number": 1216, - "length_lines": 5, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - 
"has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_subdatasets(tile: Column): Column\n", - "line_number": 1235, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# List subdatasets\nsubdatasets = netcdf.select(\n \"path\",\n rx.rst_subdatasets(f.col(\"tile\")).alias(\"available_layers\")\n)\n\n# Extract all subdatasets dynamically\nfrom pyspark.sql.functions import explode\n\nall_layers = netcdf.select(\n \"path\",\n explode(rx.rst_subdatasets(f.col(\"tile\"))).alias(\"layer_name\")\n).select(\n \"path\",\n \"layer_name\",\n rx.rst_getsubdataset(f.col(\"tile\"), f.col(\"layer_name\")).alias(\"layer_data\")\n)\n", - "line_number": 1247, - "length_lines": 18, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_memsize(tile: Column): Column\n", - "line_number": 1278, - "length_lines": 2, - "source_file": 
"api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "# Check raster sizes\nsizes = rasters.select(\n \"path\",\n rx.rst_memsize(f.col(\"tile\")).alias(\"size_bytes\"),\n (rx.rst_memsize(f.col(\"tile\")) / 1024 / 1024).alias(\"size_mb\")\n)\n\n# Filter large rasters\nlarge = rasters.filter(rx.rst_memsize(f.col(\"tile\")) > 100 * 1024 * 1024) # > 100 MB\n", - "line_number": 1290, - "length_lines": 10, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_summary(tile: Column): Column\n", - "line_number": 1313, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "summary = rasters.select(rx.rst_summary(f.col(\"tile\")).alias(\"summary\"))\n", - "line_number": 1325, - 
"length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_fromfile(path: Column, driver: Column): Column\n", - "line_number": 1344, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Load single raster\ndf = spark.range(1).select(\n rx.rst_fromfile(f.lit(\"/data/raster.tif\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\n# Load multiple rasters from paths table\npaths_df = spark.createDataFrame([\n (1, \"/data/tile_01.tif\"),\n (2, \"/data/tile_02.tif\"),\n (3, \"/data/tile_03.tif\")\n], [\"id\", \"path\"])\n\nrasters = paths_df.select(\n \"id\",\n \"path\",\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\n# Access properties of loaded raster\nrasters.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"bands\"),\n rx.rst_srid(\"tile\").alias(\"epsg_code\")\n).show()\n\n# Load NetCDF subdataset\nnetcdf_df = spark.range(1).select(\n rx.rst_fromfile(\n 
f.lit(\"/data/climate.nc\"),\n f.lit(\"NetCDF\")\n ).alias(\"tile\")\n)\n\n# Extract specific subdataset\nnetcdf_df = netcdf_df.withColumn(\n \"temperature\",\n rx.rst_getsubdataset(f.col(\"tile\"), f.lit(\"temperature\"))\n)\n", - "line_number": 1357, - "length_lines": 44, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Load from file\nval df = Seq(\"/data/raster.tif\")\n .toDF(\"path\")\n .withColumn(\"tile\", rx.rst_fromfile(col(\"path\"), lit(\"GTiff\")))\n\n// Load and process\nval rasters = spark.read\n .format(\"csv\")\n .option(\"header\", \"true\")\n .load(\"/metadata/raster_catalog.csv\")\n .withColumn(\"tile\", rx.rst_fromfile(col(\"path\"), lit(\"GTiff\")))\n .withColumn(\"num_bands\", rx.rst_numbands(col(\"tile\")))\n", - "line_number": 1403, - "length_lines": 16, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Load from path\nSELECT \n gbx_rst_fromfile('/data/raster.tif', 'GTiff') as tile;\n\n-- Load multiple and get properties\nSELECT \n path,\n 
gbx_rst_width(gbx_rst_fromfile(path, 'GTiff')) as width,\n gbx_rst_height(gbx_rst_fromfile(path, 'GTiff')) as height\nFROM raster_paths;\n", - "line_number": 1421, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_fromcontent(content: Column, driver: Column): Column\n", - "line_number": 1447, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Load rasters using binaryFile reader\nbinary_df = spark.read.format(\"binaryFile\").load(\"/data/rasters/*.tif\")\n\n# Convert binary content to raster tiles\nrasters = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\n# Process the loaded rasters\nresult = rasters.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_boundingbox(\"tile\").alias(\"bbox\")\n)\n\n# Use with streaming for real-time raster processing\nstream_df = spark.readStream.format(\"binaryFile\") \\\n 
.option(\"pathGlobFilter\", \"*.tif\") \\\n .load(\"/incoming/rasters/\")\n\nprocessed_stream = stream_df.select(\n \"path\",\n \"modificationTime\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n).select(\n \"*\",\n rx.rst_avg(\"tile\").alias(\"avg_value\"),\n rx.rst_min(\"tile\").alias(\"min_value\"),\n rx.rst_max(\"tile\").alias(\"max_value\")\n)\n", - "line_number": 1460, - "length_lines": 36, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Read binary files\nval binaryDf = spark.read\n .format(\"binaryFile\")\n .load(\"/data/rasters/*.tif\")\n\n// Create tiles from content\nval rasters = binaryDf.select(\n col(\"path\"),\n rx.rst_fromcontent(col(\"content\"), lit(\"GTiff\")).alias(\"tile\")\n)\n", - "line_number": 1498, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Load from binary table\nSELECT \n path,\n gbx_rst_fromcontent(content, 'GTiff') as tile\nFROM binary_raster_table;\n", - "line_number": 1514, - "length_lines": 6, - 
"source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_frombands(bands: Column): Column\n", - "line_number": 1535, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Separate bands from RGB raster\nrgb = spark.read.format(\"gdal\").load(\"/data/rgb.tif\")\n\nseparated = rgb.select(\n rx.rst_separatebands(f.col(\"tile\")).alias(\"bands\")\n)\n\n# Recombine bands (e.g., after processing each separately)\nrecombined = separated.select(\n rx.rst_frombands(f.col(\"bands\")).alias(\"rgb_tile\")\n)\n\n# Create custom band combination\n# Process each band independently then recombine\nred_band = rgb.select(rx.rst_separatebands(f.col(\"tile\")).getItem(0).alias(\"red\"))\ngreen_band = rgb.select(rx.rst_separatebands(f.col(\"tile\")).getItem(1).alias(\"green\"))\nblue_band = rgb.select(rx.rst_separatebands(f.col(\"tile\")).getItem(2).alias(\"blue\"))\n\n# Apply different processing to each band\nprocessed_red = red_band.select(rx.rst_filter(f.col(\"red\"), f.lit(3), 
f.lit(\"median\")).alias(\"red_p\"))\nprocessed_green = green_band.select(rx.rst_filter(f.col(\"green\"), f.lit(3), f.lit(\"median\")).alias(\"green_p\"))\nprocessed_blue = blue_band.select(rx.rst_filter(f.col(\"blue\"), f.lit(3), f.lit(\"median\")).alias(\"blue_p\"))\n\n# Combine back into RGB\ncustom_rgb = spark.range(1).select(\n rx.rst_frombands(f.array(\n f.col(\"red_p\"),\n f.col(\"green_p\"),\n f.col(\"blue_p\")\n )).alias(\"processed_rgb\")\n)\n\n# Create false color composite\n# Swap bands: NIR, Red, Green \u2192 RGB for visualization\nfalse_color = separated.select(\n rx.rst_frombands(f.array(\n f.col(\"bands\").getItem(3), # NIR \u2192 Red\n f.col(\"bands\").getItem(2), # Red \u2192 Green\n f.col(\"bands\").getItem(1) # Green \u2192 Blue\n )).alias(\"false_color\")\n)\n", - "line_number": 1547, - "length_lines": 45, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Combine bands\nval multiband = df.select(\n rx.rst_frombands(array(col(\"band1\"), col(\"band2\"), col(\"band3\"))).alias(\"rgb\")\n)\n", - "line_number": 1594, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_frombands(array(band1, band2, band3)) as multi_band\nFROM separated_bands;\n", - "line_number": 1604, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_clip(tile: Column, geometry: Column, cutlineAllTouched: Column): Column\n", - "line_number": 1632, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Clip raster using WKT polygon\nwkt_geom = \"POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))\"\n\nclipped = spark.range(1).select(\n rx.rst_clip(\n rx.rst_fromfile(f.lit(\"/data/raster.tif\"), f.lit(\"GTiff\")),\n f.lit(wkt_geom),\n f.lit(True) # Include all touched pixels\n ).alias(\"clipped_tile\")\n)\n\n# Clip multiple rasters with different clip geometries\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\naoi_polygons = spark.table(\"areas_of_interest\")\n\nclipped_by_aoi = 
rasters.join(aoi_polygons, \"region_id\").select(\n \"path\",\n \"region_name\",\n rx.rst_clip(\n f.col(\"tile\"),\n f.col(\"geometry\"), # WKB geometry from table\n f.lit(True)\n ).alias(\"clipped\"),\n # Verify clipped dimensions\n rx.rst_width(\n rx.rst_clip(f.col(\"tile\"), f.col(\"geometry\"), f.lit(True))\n ).alias(\"clipped_width\")\n)\n", - "line_number": 1646, - "length_lines": 32, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval geom = \"POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))\"\n\nval clipped = rasters.select(\n col(\"path\"),\n rx.rst_clip(col(\"tile\"), lit(geom), lit(true)).alias(\"clipped\")\n)\n", - "line_number": 1680, - "length_lines": 10, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Clip with WKT geometry\nSELECT\n path,\n gbx_rst_clip(\n tile,\n 'POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))',\n true\n ) as clipped\nFROM rasters;\n\n-- Clip using geometry from another table\nSELECT\n r.path,\n a.region_name,\n gbx_rst_clip(r.tile, a.geometry, 
true) as clipped,\n gbx_rst_width(gbx_rst_clip(r.tile, a.geometry, true)) as clipped_width\nFROM rasters r\nJOIN areas_of_interest a ON r.region_id = a.region_id;\n", - "line_number": 1692, - "length_lines": 19, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_transform(tile: Column, targetSRID: Column): Column\n", - "line_number": 1725, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Reproject raster to WGS84 (EPSG:4326)\nrasters = spark.read.format(\"gdal\").load(\"/data/utm_rasters\")\n\nwgs84_rasters = rasters.select(\n \"path\",\n rx.rst_transform(f.col(\"tile\"), f.lit(4326)).alias(\"wgs84_tile\")\n)\n\n# Check the results\nwgs84_rasters.select(\n \"path\",\n rx.rst_srid(\"wgs84_tile\").alias(\"epsg_code\"),\n rx.rst_boundingbox(\"wgs84_tile\").alias(\"bbox\")\n).show()\n\n# Reproject to Web Mercator for web mapping\nweb_mercator = rasters.select(\n rx.rst_transform(f.col(\"tile\"), f.lit(3857)).alias(\"tile_3857\")\n)\n\n# Batch reproject multiple 
rasters\npaths_df = spark.createDataFrame([\n (1, \"/data/sentinel_utm_32n.tif\", 32632), # UTM 32N\n (2, \"/data/landsat_utm_10n.tif\", 32610), # UTM 10N\n (3, \"/data/modis_sinusoidal.tif\", 54008) # Sinusoidal\n], [\"id\", \"path\", \"source_epsg\"])\n\n# Load and reproject all to WGS84\nunified = paths_df.select(\n \"id\",\n \"path\",\n rx.rst_transform(\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\")),\n f.lit(4326)\n ).alias(\"tile_wgs84\")\n)\n", - "line_number": 1738, - "length_lines": 40, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Load and reproject to WGS84\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\nval wgs84 = rasters.select(\n col(\"path\"),\n rx.rst_transform(col(\"tile\"), lit(4326)).alias(\"wgs84_tile\")\n)\n\n// Reproject to UTM Zone 18N\nval utm = rasters.select(\n rx.rst_transform(col(\"tile\"), lit(32618)).alias(\"utm_tile\")\n)\n", - "line_number": 1780, - "length_lines": 16, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- 
Reproject to WGS84\nSELECT\n path,\n gbx_rst_transform(tile, 4326) as wgs84_tile,\n gbx_rst_srid(gbx_rst_transform(tile, 4326)) as new_srid\nFROM rasters;\n\n-- Reproject and clip\nSELECT\n path,\n gbx_rst_clip(\n gbx_rst_transform(tile, 4326),\n 'POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))',\n true\n ) as clipped_wgs84\nFROM rasters;\n", - "line_number": 1798, - "length_lines": 17, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_asformat(tile: Column, newFormat: Column): Column\n", - "line_number": 1836, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Convert NetCDF to GeoTIFF\nrasters = spark.read.format(\"gdal\") \\\n .option(\"driverName\", \"NetCDF\") \\\n .load(\"/data/climate/*.nc\")\n\ngeotiffs = rasters.select(\n \"path\",\n rx.rst_asformat(f.col(\"tile\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\n# Save as GeoTIFF files\ngeotiffs.write.format(\"gdal\") \\\n .mode(\"overwrite\") \\\n .option(\"ext\", \"tif\") \\\n .save(\"/output/geotiffs/\")\n\n# 
Convert to Cloud Optimized GeoTIFF (COG)\ncog_rasters = rasters.select(\n rx.rst_asformat(f.col(\"tile\"), f.lit(\"COG\")).alias(\"cog_tile\")\n)\n\n# Convert to PNG for visualization\npngs = rasters.select(\n \"path\",\n rx.rst_asformat(f.col(\"tile\"), f.lit(\"PNG\")).alias(\"png_tile\")\n)\n\n# Batch format conversion\nformat_conversions = spark.createDataFrame([\n (\"input1.tif\", \"PNG\"),\n (\"input2.tif\", \"JPEG\"),\n (\"input3.tif\", \"GTiff\")\n], [\"path\", \"target_format\"])\n\nconverted = format_conversions.select(\n \"path\",\n \"target_format\",\n rx.rst_asformat(\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\")),\n f.col(\"target_format\")\n ).alias(\"converted_tile\")\n)\n\n# No-op if already in target format\nalready_gtiff = geotiffs.select(\n rx.rst_asformat(f.col(\"tile\"), f.lit(\"GTiff\")).alias(\"tile\") # No conversion\n)\n", - "line_number": 1849, - "length_lines": 51, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Convert to GeoTIFF\nval geotiffs = rasters.select(\n rx.rst_asformat(col(\"tile\"), lit(\"GTiff\")).alias(\"gtiff_tile\")\n)\n\n// Convert to COG\nval cogs = rasters.select(\n rx.rst_asformat(col(\"tile\"), lit(\"COG\")).alias(\"cog_tile\")\n)\n", - "line_number": 1902, - "length_lines": 13, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": 
false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Convert NetCDF to GeoTIFF\nSELECT\n path,\n gbx_rst_asformat(tile, 'GTiff') as geotiff_tile\nFROM netcdf_rasters;\n\n-- Convert to PNG\nSELECT\n path,\n gbx_rst_asformat(tile, 'PNG') as png_tile\nFROM rasters;\n", - "line_number": 1917, - "length_lines": 12, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_ndvi(tile: Column, redBand: Column, nirBand: Column): Column\n", - "line_number": 1953, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Sentinel-2 satellite imagery (Band 4 = Red, Band 8 = NIR)\nsentinel = spark.read.format(\"gdal\").load(\"/data/sentinel2/*.tif\")\n\nndvi = sentinel.select(\n \"path\",\n \"date\",\n rx.rst_ndvi(\n f.col(\"tile\"),\n f.lit(4), # Red band\n 
f.lit(8) # NIR band\n ).alias(\"ndvi_tile\")\n)\n\n# Compute NDVI statistics for vegetation health assessment\nndvi_stats = ndvi.select(\n \"date\",\n rx.rst_avg(\"ndvi_tile\").getItem(0).alias(\"mean_ndvi\"),\n rx.rst_min(\"ndvi_tile\").getItem(0).alias(\"min_ndvi\"),\n rx.rst_max(\"ndvi_tile\").getItem(0).alias(\"max_ndvi\")\n)\n\n# Time series analysis: track vegetation changes\nmonthly_ndvi = ndvi_stats.groupBy(\n f.date_trunc(\"month\", \"date\").alias(\"month\")\n).agg(\n f.avg(\"mean_ndvi\").alias(\"monthly_avg_ndvi\")\n).orderBy(\"month\")\n\n# Classify vegetation density\n# NDVI > 0.6: Dense vegetation\n# NDVI 0.2-0.6: Moderate vegetation\n# NDVI < 0.2: Sparse/No vegetation\nvegetation_classes = ndvi.select(\n \"path\",\n \"ndvi_tile\",\n f.when(rx.rst_avg(\"ndvi_tile\").getItem(0) > 0.6, \"Dense\")\n .when(rx.rst_avg(\"ndvi_tile\").getItem(0) > 0.2, \"Moderate\")\n .otherwise(\"Sparse\").alias(\"vegetation_density\")\n)\n\n# Landsat imagery (Band 3 = Red, Band 4 = NIR)\nlandsat = spark.read.format(\"gdal\").load(\"/data/landsat/*.tif\")\n\nlandsat_ndvi = landsat.select(\n rx.rst_ndvi(f.col(\"tile\"), f.lit(3), f.lit(4)).alias(\"ndvi\")\n)\n\n# MODIS imagery (Band 1 = Red, Band 2 = NIR)\nmodis = spark.read.format(\"gdal\").load(\"/data/modis/*.tif\")\n\nmodis_ndvi = modis.select(\n rx.rst_ndvi(f.col(\"tile\"), f.lit(1), f.lit(2)).alias(\"ndvi\")\n)\n\n# Agricultural monitoring: detect crop stress\ncrop_stress = ndvi.filter(\n rx.rst_avg(\"ndvi_tile\").getItem(0) < 0.3 # Low NDVI indicates stress\n)\n", - "line_number": 1967, - "length_lines": 62, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Calculate NDVI from Sentinel-2\nval sentinel = spark.read.format(\"gdal\").load(\"/data/sentinel2\")\n\nval ndvi = sentinel.select(\n col(\"path\"),\n rx.rst_ndvi(col(\"tile\"), lit(4), lit(8)).alias(\"ndvi_tile\")\n)\n\n// Vegetation statistics\nval stats = ndvi.select(\n rx.rst_avg(col(\"ndvi_tile\")).getItem(0).alias(\"avg_ndvi\")\n)\n", - "line_number": 2031, - "length_lines": 16, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Calculate NDVI for Sentinel-2 imagery\nSELECT\n path,\n date,\n gbx_rst_ndvi(tile, 4, 8) as ndvi_tile,\n gbx_rst_avg(gbx_rst_ndvi(tile, 4, 8))[0] as mean_ndvi\nFROM sentinel2_images;\n\n-- Monthly vegetation trends\nSELECT\n DATE_TRUNC('month', date) as month,\n AVG(gbx_rst_avg(gbx_rst_ndvi(tile, 4, 8))[0]) as avg_monthly_ndvi\nFROM sentinel2_images\nGROUP BY DATE_TRUNC('month', date)\nORDER BY month;\n\n-- Identify areas with vegetation loss\nWITH current_ndvi AS (\n SELECT path, gbx_rst_avg(gbx_rst_ndvi(tile, 4, 8))[0] as ndvi\n FROM images_2024\n),\nprevious_ndvi AS (\n SELECT path, gbx_rst_avg(gbx_rst_ndvi(tile, 4, 8))[0] as ndvi\n FROM images_2023\n)\nSELECT \n c.path,\n c.ndvi - p.ndvi as ndvi_change\nFROM current_ndvi c\nJOIN previous_ndvi p ON c.path = p.path\nWHERE c.ndvi - p.ndvi < -0.2; -- Significant vegetation loss\n", - "line_number": 2049, - "length_lines": 32, - "source_file": 
"api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_convolve(tile: Column, kernel: Column): Column\n", - "line_number": 2111, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\n# Sobel edge detection (horizontal edges)\nsobel_x = [\n [-1.0, 0.0, 1.0],\n [-2.0, 0.0, 2.0],\n [-1.0, 0.0, 1.0]\n]\n\nedges_x = rasters.select(\n \"path\",\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in sobel_x\n ])).alias(\"edges_horizontal\")\n)\n\n# Sobel edge detection (vertical edges)\nsobel_y = [\n [-1.0, -2.0, -1.0],\n [ 0.0, 0.0, 0.0],\n [ 1.0, 2.0, 1.0]\n]\n\nedges_y = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in sobel_y\n ])).alias(\"edges_vertical\")\n)\n\n# Gaussian blur (3x3)\ngaussian_blur = [\n [1.0, 2.0, 1.0],\n [2.0, 4.0, 2.0],\n [1.0, 2.0, 1.0]\n]\n# Normalize kernel (sum = 1)\nkernel_sum = sum(sum(row) for row in 
gaussian_blur)\ngaussian_normalized = [[v/kernel_sum for v in row] for row in gaussian_blur]\n\nblurred = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in gaussian_normalized\n ])).alias(\"blurred_tile\")\n)\n\n# Sharpen filter\nsharpen = [\n [ 0.0, -1.0, 0.0],\n [-1.0, 5.0, -1.0],\n [ 0.0, -1.0, 0.0]\n]\n\nsharpened = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in sharpen\n ])).alias(\"sharp_tile\")\n)\n\n# Laplacian edge detection\nlaplacian = [\n [0.0, 1.0, 0.0],\n [1.0, -4.0, 1.0],\n [0.0, 1.0, 0.0]\n]\n\nedges = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in laplacian\n ])).alias(\"edges\")\n)\n\n# Emboss filter\nemboss = [\n [-2.0, -1.0, 0.0],\n [-1.0, 1.0, 1.0],\n [ 0.0, 1.0, 2.0]\n]\n\nembossed = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in emboss\n ])).alias(\"embossed\")\n)\n\n# Custom 5x5 kernel for stronger blur\nblur_5x5 = [\n [1.0, 1.0, 1.0, 1.0, 1.0],\n [1.0, 2.0, 2.0, 2.0, 1.0],\n [1.0, 2.0, 4.0, 2.0, 1.0],\n [1.0, 2.0, 2.0, 2.0, 1.0],\n [1.0, 1.0, 1.0, 1.0, 1.0]\n]\nkernel_sum = sum(sum(row) for row in blur_5x5)\nblur_5x5_norm = [[v/kernel_sum for v in row] for row in blur_5x5]\n\nstrong_blur = rasters.select(\n rx.rst_convolve(f.col(\"tile\"), f.array(*[\n f.array(*[f.lit(v) for v in row]) for row in blur_5x5_norm\n ])).alias(\"strong_blur\")\n)\n", - "line_number": 2124, - "length_lines": 104, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Sobel edge detection\nval sobelX = Seq(\n Seq(-1.0, 0.0, 1.0),\n Seq(-2.0, 0.0, 2.0),\n Seq(-1.0, 0.0, 1.0)\n)\n\nval edges = rasters.select(\n rx.rst_convolve(col(\"tile\"), lit(sobelX)).alias(\"edges\")\n)\n", - "line_number": 2230, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Note: SQL has limited support for 2D arrays\n-- Better to use Python/Scala for convolution operations\n", - "line_number": 2246, - "length_lines": 3, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_filter(tile: Column, kernelSize: Column, operation: Column): Column\n", - "line_number": 2277, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": 
null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/noisy/*.tif\")\n\n# Median filter for salt-and-pepper noise removal\n# Median preserves edges better than average\ndenoised = rasters.select(\n \"path\",\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"median\")).alias(\"clean_tile\")\n)\n\n# Average filter for general smoothing\nsmoothed = rasters.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(5), f.lit(\"avg\")).alias(\"smooth_tile\")\n)\n\n# Maximum filter (morphological dilation)\n# Expands bright regions\ndilated = rasters.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"max\")).alias(\"dilated\")\n)\n\n# Minimum filter (morphological erosion)\n# Expands dark regions\neroded = rasters.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"min\")).alias(\"eroded\")\n)\n\n# Mode filter for categorical rasters (land cover classification)\nland_cover = spark.read.format(\"gdal\").load(\"/data/landcover/*.tif\")\n\nsmoothed_classes = land_cover.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(5), f.lit(\"mode\")).alias(\"smooth_classes\")\n)\n\n# Comparison of filter sizes\nfilters_comparison = rasters.select(\n \"path\",\n f.col(\"tile\").alias(\"original\"),\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"median\")).alias(\"med_3x3\"),\n rx.rst_filter(f.col(\"tile\"), f.lit(5), f.lit(\"median\")).alias(\"med_5x5\"),\n rx.rst_filter(f.col(\"tile\"), f.lit(7), f.lit(\"median\")).alias(\"med_7x7\")\n)\n\n# Quality improvement pipeline\nquality_pipeline = rasters.select(\n \"path\",\n # Step 1: Remove noise\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"median\")).alias(\"step1\")\n).select(\n \"path\",\n # Step 2: Slight smoothing\n 
rx.rst_filter(f.col(\"step1\"), f.lit(3), f.lit(\"avg\")).alias(\"cleaned\")\n)\n\n# Opening operation (erosion followed by dilation)\n# Removes small bright objects\nopened = rasters.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"min\")).alias(\"eroded\")\n).select(\n rx.rst_filter(f.col(\"eroded\"), f.lit(3), f.lit(\"max\")).alias(\"opened\")\n)\n\n# Closing operation (dilation followed by erosion)\n# Removes small dark holes\nclosed = rasters.select(\n rx.rst_filter(f.col(\"tile\"), f.lit(3), f.lit(\"max\")).alias(\"dilated\")\n).select(\n rx.rst_filter(f.col(\"dilated\"), f.lit(3), f.lit(\"min\")).alias(\"closed\")\n)\n", - "line_number": 2296, - "length_lines": 72, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Median filter for noise removal\nval denoised = rasters.select(\n rx.rst_filter(col(\"tile\"), lit(3), lit(\"median\")).alias(\"clean\")\n)\n\n// Average smoothing\nval smoothed = rasters.select(\n rx.rst_filter(col(\"tile\"), lit(5), lit(\"avg\")).alias(\"smooth\")\n)\n", - "line_number": 2370, - "length_lines": 13, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": 
true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Median filter (3x3 window)\nSELECT\n path,\n gbx_rst_filter(tile, 3, 'median') as denoised\nFROM noisy_rasters;\n\n-- Average smoothing (5x5 window)\nSELECT\n path,\n gbx_rst_filter(tile, 5, 'avg') as smoothed\nFROM rasters;\n\n-- Maximum filter for dilation\nSELECT gbx_rst_filter(tile, 3, 'max') as dilated\nFROM rasters;\n", - "line_number": 2385, - "length_lines": 16, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_rastertoworldcoord(tile: Column, pixelX: Column, pixelY: Column): Column\n", - "line_number": 2431, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/elevation/*.tif\")\n\n# Convert pixel coordinates to world coordinates\nworld_coords = rasters.select(\n \"path\",\n rx.rst_rastertoworldcoord(\n f.col(\"tile\"),\n f.lit(100), # pixel X\n f.lit(200) # pixel Y\n ).alias(\"coords\")\n).select(\n \"path\",\n 
f.col(\"coords.x\").alias(\"longitude\"),\n f.col(\"coords.y\").alias(\"latitude\")\n)\n\n# Convert center pixel to world coordinates\ncenter_coords = rasters.select(\n \"path\",\n rx.rst_width(f.col(\"tile\")).alias(\"width\"),\n rx.rst_height(f.col(\"tile\")).alias(\"height\")\n).select(\n \"path\",\n rx.rst_rastertoworldcoord(\n f.col(\"tile\"),\n (f.col(\"width\") / 2).cast(\"int\"),\n (f.col(\"height\") / 2).cast(\"int\")\n ).alias(\"center\")\n)\n\n# Sample points across raster\nsample_points = rasters.select(\n \"path\",\n f.explode(f.sequence(f.lit(0), f.lit(100), f.lit(10))).alias(\"x\")\n).select(\n \"path\",\n f.col(\"x\"),\n f.explode(f.sequence(f.lit(0), f.lit(100), f.lit(10))).alias(\"y\")\n).select(\n \"path\",\n f.col(\"x\"),\n f.col(\"y\"),\n rx.rst_rastertoworldcoord(f.col(\"tile\"), f.col(\"x\"), f.col(\"y\")).alias(\"world_coord\")\n)\n", - "line_number": 2447, - "length_lines": 48, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Convert pixel to world coordinates\nval worldCoords = rasters.select(\n col(\"path\"),\n rx.rst_rastertoworldcoord(col(\"tile\"), lit(100), lit(200)).alias(\"coords\")\n).select(\n col(\"path\"),\n col(\"coords.x\").alias(\"lon\"),\n col(\"coords.y\").alias(\"lat\")\n)\n", - "line_number": 2497, - "length_lines": 13, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, 
- "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n gbx_rst_rastertoworldcoord(tile, 100, 200) as coords,\n gbx_rst_rastertoworldcoord(tile, 100, 200).x as longitude,\n gbx_rst_rastertoworldcoord(tile, 100, 200).y as latitude\nFROM rasters;\n", - "line_number": 2512, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_rastertoworldcoordx(tile: Column, pixelX: Column, pixelY: Column): Column\n", - "line_number": 2538, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get only X coordinate (longitude/easting)\nworld_x = rasters.select(\n rx.rst_rastertoworldcoordx(f.col(\"tile\"), f.lit(100), f.lit(200)).alias(\"easting\")\n)\n", - "line_number": 2552, - "length_lines": 8, - 
"source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_rastertoworldcoordx(tile, 100, 200) as easting\nFROM rasters;\n", - "line_number": 2562, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_rastertoworldcoordy(tile: Column, pixelX: Column, pixelY: Column): Column\n", - "line_number": 2575, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get only Y coordinate (latitude/northing)\nworld_y = rasters.select(\n rx.rst_rastertoworldcoordy(f.col(\"tile\"), f.lit(100), f.lit(200)).alias(\"northing\")\n)\n", - 
"line_number": 2589, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_rastertoworldcoordy(tile, 100, 200) as northing\nFROM rasters;\n", - "line_number": 2599, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_worldtorastercoord(tile: Column, worldX: Column, worldY: Column): Column\n", - "line_number": 2612, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/elevation/*.tif\")\n\n# Convert specific location to pixel 
coordinates\npixel_coords = rasters.select(\n \"path\",\n rx.rst_worldtorastercoord(\n f.col(\"tile\"),\n f.lit(-122.4194), # San Francisco longitude\n f.lit(37.7749) # San Francisco latitude\n ).alias(\"pixel\")\n).select(\n \"path\",\n f.col(\"pixel.x\").alias(\"col\"),\n f.col(\"pixel.y\").alias(\"row\")\n)\n\n# Sample raster at specific geographic points\nlocations = spark.createDataFrame([\n (\"San Francisco\", -122.4194, 37.7749),\n (\"Los Angeles\", -118.2437, 34.0522),\n (\"Seattle\", -122.3321, 47.6062)\n], [\"city\", \"lon\", \"lat\"])\n\n# Join with raster and get pixel coordinates\nsampled = locations.crossJoin(rasters).select(\n \"city\",\n \"lon\",\n \"lat\",\n \"path\",\n rx.rst_worldtorastercoord(\n f.col(\"tile\"),\n f.col(\"lon\"),\n f.col(\"lat\")\n ).alias(\"pixel\")\n)\n\n# Extract pixel values at specific world coordinates\npoint_values = sampled.select(\n \"city\",\n f.col(\"pixel.x\").alias(\"px\"),\n f.col(\"pixel.y\").alias(\"py\")\n)\n# Then use pixel coordinates to extract actual raster values\n", - "line_number": 2628, - "length_lines": 47, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Convert world to pixel coordinates\nval pixelCoords = rasters.select(\n rx.rst_worldtorastercoord(col(\"tile\"), lit(-122.4194), lit(37.7749)).alias(\"pixel\")\n).select(\n col(\"pixel.x\").alias(\"col\"),\n col(\"pixel.y\").alias(\"row\")\n)\n", - "line_number": 2677, - "length_lines": 11, - "source_file": 
"api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Find pixel coordinates for a specific location\nSELECT\n path,\n gbx_rst_worldtorastercoord(tile, -122.4194, 37.7749) as pixel,\n gbx_rst_worldtorastercoord(tile, -122.4194, 37.7749).x as col,\n gbx_rst_worldtorastercoord(tile, -122.4194, 37.7749).y as row\nFROM rasters;\n\n-- Sample raster at multiple points\nWITH locations AS (\n SELECT 'San Francisco' as city, -122.4194 as lon, 37.7749 as lat\n UNION ALL\n SELECT 'Los Angeles', -118.2437, 34.0522\n)\nSELECT\n l.city,\n gbx_rst_worldtorastercoord(r.tile, l.lon, l.lat) as pixel\nFROM locations l\nCROSS JOIN rasters r;\n", - "line_number": 2690, - "length_lines": 20, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_worldtorastercoordx(tile: Column, worldX: Column, worldY: Column): Column\n", - "line_number": 2728, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - 
"uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get only pixel column\npixel_col = rasters.select(\n rx.rst_worldtorastercoordx(f.col(\"tile\"), f.lit(-122.4194), f.lit(37.7749)).alias(\"col\")\n)\n", - "line_number": 2742, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_worldtorastercoordx(tile, -122.4194, 37.7749) as pixel_col\nFROM rasters;\n", - "line_number": 2752, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_worldtorastercoordy(tile: Column, worldX: Column, worldY: Column): Column\n", - "line_number": 2765, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": 
false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Get only pixel row\npixel_row = rasters.select(\n rx.rst_worldtorastercoordy(f.col(\"tile\"), f.lit(-122.4194), f.lit(37.7749)).alias(\"row\")\n)\n", - "line_number": 2779, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_worldtorastercoordy(tile, -122.4194, 37.7749) as pixel_row\nFROM rasters;\n", - "line_number": 2789, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_isempty(tile: Column): Column\n", - "line_number": 2820, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": 
false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\n# Filter out empty rasters\nvalid_rasters = rasters.filter(\n ~rx.rst_isempty(f.col(\"tile\"))\n)\n\n# Quality control: identify empty tiles\nempty_tiles = rasters.filter(\n rx.rst_isempty(f.col(\"tile\"))\n).select(\"path\")\n\n# Data completeness report\ncompleteness = rasters.select(\n f.count(\"*\").alias(\"total_rasters\"),\n f.sum(f.when(rx.rst_isempty(f.col(\"tile\")), 1).otherwise(0)).alias(\"empty_count\"),\n f.sum(f.when(~rx.rst_isempty(f.col(\"tile\")), 1).otherwise(0)).alias(\"valid_count\")\n)\n\n# Combine multiple checks\nquality_check = rasters.select(\n \"path\",\n rx.rst_isempty(f.col(\"tile\")).alias(\"is_empty\"),\n rx.rst_width(f.col(\"tile\")).alias(\"width\"),\n rx.rst_height(f.col(\"tile\")).alias(\"height\"),\n rx.rst_pixelcount(f.col(\"tile\")).alias(\"pixel_count\")\n).filter(\n (f.col(\"is_empty\") == False) & \n (f.col(\"width\") > 0) & \n (f.col(\"height\") > 0)\n)\n", - "line_number": 2832, - "length_lines": 35, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// 
Filter empty rasters\nval validRasters = rasters.filter(!rx.rst_isempty(col(\"tile\")))\n\n// Count empty tiles\nval emptyCount = rasters.filter(rx.rst_isempty(col(\"tile\"))).count()\n", - "line_number": 2869, - "length_lines": 9, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Filter out empty rasters\nSELECT * FROM rasters\nWHERE NOT gbx_rst_isempty(tile);\n\n-- Count empty vs valid\nSELECT\n COUNT(*) as total,\n SUM(CASE WHEN gbx_rst_isempty(tile) THEN 1 ELSE 0 END) as empty_count,\n SUM(CASE WHEN NOT gbx_rst_isempty(tile) THEN 1 ELSE 0 END) as valid_count\nFROM rasters;\n", - "line_number": 2880, - "length_lines": 11, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_tryopen(tile: Column): Column\n", - "line_number": 2910, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - 
"uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# List of file paths\nfile_paths = spark.read.text(\"/data/raster_catalog.txt\") \\\n .withColumnRenamed(\"value\", \"path\")\n\n# Load and validate rasters\nrasters = file_paths.select(\n \"path\",\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\")).alias(\"tile\")\n).select(\n \"path\",\n \"tile\",\n rx.rst_tryopen(f.col(\"tile\")).alias(\"can_open\")\n)\n\n# Filter only valid rasters\nvalid = rasters.filter(f.col(\"can_open\") == True)\n\n# Identify corrupt files\ncorrupt_files = rasters.filter(f.col(\"can_open\") == False) \\\n .select(\"path\")\n\n# Validation report\nvalidation_report = rasters.groupBy(\"can_open\").agg(\n f.count(\"*\").alias(\"count\"),\n f.collect_list(\"path\").alias(\"files\")\n)\n\n# Pre-flight check before expensive processing\nvalidated = rasters.filter(\n rx.rst_tryopen(f.col(\"tile\"))\n).select(\n \"path\",\n # Proceed with expensive operations only on valid rasters\n rx.rst_h3_tessellate(f.col(\"tile\"), f.lit(7)).alias(\"tessellated\")\n)\n\n# Binary content validation\nbinary_df = spark.read.format(\"binaryFile\").load(\"/data/*.tif\")\n\nvalidated_binary = binary_df.select(\n \"path\",\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n).filter(\n rx.rst_tryopen(f.col(\"tile\"))\n)\n", - "line_number": 2922, - "length_lines": 49, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Validate rasters\nval validRasters = rasters.filter(rx.rst_tryopen(col(\"tile\")))\n\n// Find corrupt files\nval corruptFiles = rasters.filter(!rx.rst_tryopen(col(\"tile\")))\n .select(\"path\")\n", - "line_number": 2973, - "length_lines": 10, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Filter valid rasters\nSELECT * FROM rasters\nWHERE gbx_rst_tryopen(tile) = true;\n\n-- Identify corrupt rasters\nSELECT path\nFROM rasters\nWHERE gbx_rst_tryopen(tile) = false;\n\n-- Validation summary\nSELECT\n gbx_rst_tryopen(tile) as is_valid,\n COUNT(*) as count\nFROM rasters\nGROUP BY gbx_rst_tryopen(tile);\n", - "line_number": 2985, - "length_lines": 16, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_mapalgebra(tiles: Column, expression: Column): Column\n", - "line_number": 3027, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - 
"analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\nimport json\n\n# Simple addition of two rasters\ndf = spark.createDataFrame([\n (1, \"/data/raster1.tif\"),\n (2, \"/data/raster2.tif\")\n], [\"id\", \"path\"]).select(\n f.collect_list(\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\"))\n ).alias(\"tiles\")\n)\n\n# Add rasters A + B\naddition_spec = json.dumps({\n \"calc\": \"A+B\",\n \"A_index\": 0, # First raster in array\n \"B_index\": 1 # Second raster in array\n})\n\nsum_raster = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(addition_spec)).alias(\"sum\")\n)\n\n# Calculate NDVI: (NIR - Red) / (NIR + Red)\n# Using a multi-band raster\nndvi_spec = json.dumps({\n \"calc\": \"(A-B)/(A+B)\",\n \"A_index\": 0,\n \"A_band\": 4, # NIR band\n \"B_index\": 0,\n \"B_band\": 3 # Red band\n})\n\nndvi = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(ndvi_spec)).alias(\"ndvi\")\n)\n\n# Difference of rasters\ndifference_spec = json.dumps({\n \"calc\": \"A-B\",\n \"A_index\": 0,\n \"B_index\": 1\n})\n\ndiff = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(difference_spec)).alias(\"difference\")\n)\n\n# Conditional expression: Set values < 0 to 0\nconditional_spec = json.dumps({\n \"calc\": \"numpy.where(A<0, 0, A)\",\n \"A_index\": 0\n})\n\nclipped = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(conditional_spec)).alias(\"clipped\")\n)\n\n# Complex calculation: Enhanced Vegetation Index (EVI)\n# EVI = 2.5 * (NIR - Red) / (NIR + 6*Red - 7.5*Blue + 1)\nevi_spec = json.dumps({\n \"calc\": \"2.5 * (A - B) / (A + 
6*B - 7.5*C + 1)\",\n \"A_index\": 0,\n \"A_band\": 5, # NIR\n \"B_index\": 0,\n \"B_band\": 4, # Red\n \"C_index\": 0,\n \"C_band\": 2 # Blue\n})\n\nevi = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(evi_spec)).alias(\"evi\")\n)\n\n# Logarithmic transformation\nlog_spec = json.dumps({\n \"calc\": \"numpy.log10(A+1)\", # +1 to avoid log(0)\n \"A_index\": 0\n})\n\nlog_transformed = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(log_spec)).alias(\"log\")\n)\n\n# Threshold and classify\nthreshold_spec = json.dumps({\n \"calc\": \"numpy.where(A>100, 1, numpy.where(A>50, 2, 3))\",\n \"A_index\": 0\n})\n\nclassified = df.select(\n rx.rst_mapalgebra(f.col(\"tiles\"), f.lit(threshold_spec)).alias(\"classes\")\n)\n", - "line_number": 3043, - "length_lines": 96, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Add two rasters\nval spec = \"\"\"{\"calc\": \"A+B\", \"A_index\": 0, \"B_index\": 1}\"\"\"\nval result = df.select(\n rx.rst_mapalgebra(col(\"tiles\"), lit(spec)).alias(\"sum\")\n)\n", - "line_number": 3141, - "length_lines": 9, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": 
true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Calculate difference between two rasters\nSELECT\n gbx_rst_mapalgebra(\n tiles,\n '{\"calc\": \"A-B\", \"A_index\": 0, \"B_index\": 1}'\n ) as difference\nFROM raster_arrays;\n", - "line_number": 3152, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_derivedband(tile: Column, pythonFunc: String, funcName: String): Column\n", - "line_number": 3183, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/*.tif\")\n\n# Simple band averaging function\navg_func = \"\"\"\nimport numpy as np\n\ndef band_average(in_ar, out_ar, xoff, yoff, xsize, ysize, \n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n # in_ar shape: (num_bands, ysize, xsize)\n # out_ar shape: (ysize, xsize)\n out_ar[:] = np.mean(in_ar, axis=0)\n\"\"\"\n\naveraged = rasters.select(\n \"path\",\n rx.rst_derivedband(f.col(\"tile\"), 
f.lit(avg_func), f.lit(\"band_average\")).alias(\"avg_band\")\n)\n\n# Normalized band (0-1 range)\nnormalize_func = \"\"\"\nimport numpy as np\n\ndef normalize(in_ar, out_ar, xoff, yoff, xsize, ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n band = in_ar[0] # First band\n min_val = np.min(band[band > 0]) # Ignore zeros\n max_val = np.max(band)\n out_ar[:] = (band - min_val) / (max_val - min_val)\n\"\"\"\n\nnormalized = rasters.select(\n rx.rst_derivedband(f.col(\"tile\"), f.lit(normalize_func), f.lit(\"normalize\")).alias(\"norm\")\n)\n\n# Custom vegetation index\ncustom_vi_func = \"\"\"\nimport numpy as np\n\ndef custom_vi(in_ar, out_ar, xoff, yoff, xsize, ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n # Assume band 0=Red, band 1=NIR, band 2=Blue\n red = in_ar[0].astype(np.float32)\n nir = in_ar[1].astype(np.float32)\n blue = in_ar[2].astype(np.float32)\n \n # Custom Enhanced Vegetation Index\n numerator = nir - red\n denominator = nir + 6*red - 7.5*blue + 1\n \n # Avoid division by zero\n denominator = np.where(denominator == 0, 0.0001, denominator)\n out_ar[:] = 2.5 * numerator / denominator\n\"\"\"\n\ncustom_vi = rasters.select(\n rx.rst_derivedband(f.col(\"tile\"), f.lit(custom_vi_func), f.lit(\"custom_vi\")).alias(\"vi\")\n)\n\n# Apply nonlinear transformation\ngamma_func = \"\"\"\nimport numpy as np\n\ndef gamma_correction(in_ar, out_ar, xoff, yoff, xsize, ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n gamma = 2.2\n band = in_ar[0].astype(np.float32)\n # Normalize to 0-1\n normalized = band / 255.0\n # Apply gamma\n corrected = np.power(normalized, 1.0/gamma)\n # Scale back\n out_ar[:] = corrected * 255.0\n\"\"\"\n\ngamma_corrected = rasters.select(\n rx.rst_derivedband(f.col(\"tile\"), f.lit(gamma_func), f.lit(\"gamma_correction\")).alias(\"gamma\")\n)\n\n# Moving window analysis\nwindow_func = \"\"\"\nimport numpy as np\nfrom scipy import ndimage\n\ndef local_std(in_ar, out_ar, xoff, yoff, xsize, 
ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n band = in_ar[0].astype(np.float32)\n # Calculate local standard deviation (3x3 window)\n mean = ndimage.uniform_filter(band, size=3)\n mean_sq = ndimage.uniform_filter(band**2, size=3)\n std = np.sqrt(mean_sq - mean**2)\n out_ar[:] = std\n\"\"\"\n\nlocal_std = rasters.select(\n rx.rst_derivedband(f.col(\"tile\"), f.lit(window_func), f.lit(\"local_std\")).alias(\"std\")\n)\n", - "line_number": 3197, - "length_lines": 100, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval avgFunc = \"\"\"\nimport numpy as np\ndef band_average(in_ar, out_ar, xoff, yoff, xsize, ysize, \n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n out_ar[:] = np.mean(in_ar, axis=0)\n\"\"\"\n\nval result = rasters.select(\n rx.rst_derivedband(col(\"tile\"), lit(avgFunc), lit(\"band_average\")).alias(\"avg\")\n)\n", - "line_number": 3299, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": " def func(in_ar, out_ar, xoff, yoff, 
xsize, ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n # Process in_ar and write to out_ar\n ", - "line_number": 3317, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_initnodata(tile: Column): Column\n", - "line_number": 3346, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Initialize NoData for rasters\ninitialized = rasters.select(\n \"path\",\n rx.rst_initnodata(f.col(\"tile\")).alias(\"tile_with_nodata\")\n)\n\n# Verify NoData was set\nverification = initialized.select(\n \"path\",\n rx.rst_getnodata(f.col(\"tile_with_nodata\")).alias(\"nodata_values\")\n)\n", - "line_number": 3358, - "length_lines": 15, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - 
"uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT gbx_rst_initnodata(tile) as tile FROM rasters;\n", - "line_number": 3375, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_updatetype(tile: Column, newType: Column): Column\n", - "line_number": 3390, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Convert to Float32\nfloat_tiles = rasters.select(\n rx.rst_updatetype(f.col(\"tile\"), f.lit(\"Float32\")).alias(\"float\")\n)\n\n# Convert to Byte for storage\nbyte_tiles = rasters.select(\n rx.rst_updatetype(f.col(\"tile\"), f.lit(\"Byte\")).alias(\"byte\")\n)\n", - "line_number": 3403, - "length_lines": 13, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, 
- "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT gbx_rst_updatetype(tile, 'Float32') as float_tile FROM rasters;\n", - "line_number": 3418, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_merge(tiles: Column): Column\n", - "line_number": 3433, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Merge multiple raster files into one mosaic\ndf = spark.createDataFrame([\n (1, \"/data/tile_01.tif\"),\n (2, \"/data/tile_02.tif\"),\n (3, \"/data/tile_03.tif\")\n], [\"id\", \"path\"])\n\n# Load rasters and collect into array\ndf = df.withColumn(\n \"tile\",\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\"))\n)\n\n# Merge all tiles into one\nmerged = df.select(\n rx.rst_merge(f.collect_list(\"tile\")).alias(\"merged_tile\")\n)\n\n# Verify the 
merged result\nmerged.select(\n rx.rst_width(\"merged_tile\").alias(\"width\"),\n rx.rst_height(\"merged_tile\").alias(\"height\"),\n rx.rst_numbands(\"merged_tile\").alias(\"bands\")\n).show()\n", - "line_number": 3445, - "length_lines": 28, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval tiles = Seq(\n (1, \"/data/tile_01.tif\"),\n (2, \"/data/tile_02.tif\"),\n (3, \"/data/tile_03.tif\")\n).toDF(\"id\", \"path\")\n .withColumn(\"tile\", rx.rst_fromfile(col(\"path\"), lit(\"GTiff\")))\n\nval merged = tiles.select(\n rx.rst_merge(collect_list(\"tile\")).alias(\"merged_tile\")\n)\n", - "line_number": 3475, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Merge rasters from a table\nWITH loaded_tiles AS (\n SELECT \n id,\n gbx_rst_fromfile(path, 'GTiff') as tile\n FROM raster_paths\n)\nSELECT gbx_rst_merge(collect_list(tile)) as merged_mosaic\nFROM loaded_tiles;\n", - "line_number": 3491, - "length_lines": 10, - "source_file": "api/rasterx-functions.md", - "category": 
"EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_tessellate(tile: Column, resolution: Column): Column\n", - "line_number": 3519, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Tessellate raster into H3 resolution 7 cells\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\ntessellated = rasters.select(\n \"path\",\n f.explode(rx.rst_h3_tessellate(f.col(\"tile\"), f.lit(7))).alias(\"h3_tile\")\n).select(\n \"path\",\n f.col(\"h3_tile.cellid\").alias(\"h3_cell\"),\n f.col(\"h3_tile\").alias(\"tile\")\n)\n\n# Count H3 cells per raster\ncell_counts = tessellated.groupBy(\"path\").agg(\n f.count(\"h3_cell\").alias(\"num_cells\")\n)\n\n# Process each H3 cell independently\ncell_stats = tessellated.select(\n \"h3_cell\",\n rx.rst_avg(\"tile\").alias(\"avg_value\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\")\n)\n\n# Join with H3 grid for spatial analysis\nfrom databricks.labs.gbx.gridx import functions as gx\n\nenriched = tessellated.select(\n \"h3_cell\",\n 
\"tile\",\n gx.h3_center(\"h3_cell\").alias(\"center_point\"),\n gx.h3_boundary(\"h3_cell\").alias(\"boundary\")\n)\n", - "line_number": 3539, - "length_lines": 38, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Tessellate at resolution 5\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\nval tessellated = rasters.select(\n col(\"path\"),\n explode(rx.rst_h3_tessellate(col(\"tile\"), lit(5))).alias(\"h3_tile\")\n).select(\n col(\"path\"),\n col(\"h3_tile.cellid\").alias(\"h3_cell\"),\n col(\"h3_tile\").alias(\"tile\")\n)\n\n// Aggregate by cell\nval cellAgg = tessellated.groupBy(\"h3_cell\").agg(\n count(\"*\").alias(\"tile_count\"),\n first(\"tile\").alias(\"tile\")\n)\n", - "line_number": 3579, - "length_lines": 21, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Tessellate and explode H3 cells\nSELECT\n path,\n h3_tile.cellid as h3_cell,\n h3_tile as tile,\n gbx_rst_avg(h3_tile) as avg_value\nFROM rasters\nLATERAL VIEW explode(gbx_rst_h3_tessellate(tile, 7)) AS 
h3_tile;\n\n-- Count cells per raster\nWITH tessellated AS (\n SELECT path, gbx_rst_h3_tessellate(tile, 5) as cells\n FROM rasters\n)\nSELECT path, size(cells) as num_h3_cells\nFROM tessellated;\n", - "line_number": 3602, - "length_lines": 17, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_rastertogridavg(tile: Column, resolution: Column): Column\n", - "line_number": 3636, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Aggregate temperature raster to H3 grid\ntemps = spark.read.format(\"gdal\").load(\"/data/temperature.tif\")\n\nh3_avg = temps.select(\n \"path\",\n rx.rst_h3_rastertogridavg(f.col(\"tile\"), f.lit(6)).alias(\"h3_averages\")\n)\n\n# Explode to get individual cells\ncells = h3_avg.select(\n f.explode(f.col(\"h3_averages\")).alias(\"band_cells\")\n).select(\n f.explode(f.col(\"band_cells\")).alias(\"cell\")\n).select(\n f.col(\"cell.cellID\").alias(\"h3_cell\"),\n f.col(\"cell.measure\").alias(\"avg_temp\")\n)\n\n# Filter to cells above 
threshold\nhot_spots = cells.filter(f.col(\"avg_temp\") > 30.0)\n\n# Multi-band example: Average NDVI over time\nndvi_series = spark.table(\"ndvi_timeseries\")\n\nmonthly_h3 = ndvi_series.select(\n f.col(\"month\"),\n rx.rst_h3_rastertogridavg(f.col(\"ndvi_tile\"), f.lit(8)).alias(\"grid\")\n)\n", - "line_number": 3651, - "length_lines": 32, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Aggregate to H3 grid\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\nval h3Grid = rasters.select(\n col(\"path\"),\n rx.rst_h3_rastertogridavg(col(\"tile\"), lit(7)).alias(\"h3_avg\")\n)\n\n// Extract cells for first band\nval band1Cells = h3Grid.select(\n explode(col(\"h3_avg\").getItem(0)).alias(\"cell\")\n).select(\n col(\"cell.cellID\").alias(\"h3_id\"),\n col(\"cell.measure\").alias(\"avg_value\")\n)\n", - "line_number": 3685, - "length_lines": 19, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Aggregate raster to H3 grid\nSELECT \n path,\n gbx_rst_h3_rastertogridavg(tile, 
6) as h3_grid\nFROM rasters;\n\n-- Get cells from first band\nSELECT \n path,\n cell.cellID as h3_cell,\n cell.measure as avg_value\nFROM rasters\nLATERAL VIEW explode(gbx_rst_h3_rastertogridavg(tile, 6)[0]) AS cell;\n", - "line_number": 3706, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_rastertogridcount(tile: Column, resolution: Column): Column\n", - "line_number": 3736, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Count valid pixels per H3 cell\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/*.tif\")\n\nh3_counts = rasters.select(\n \"path\",\n rx.rst_h3_rastertogridcount(f.col(\"tile\"), f.lit(7)).alias(\"h3_counts\")\n)\n\n# Get cells with explode\ncell_counts = h3_counts.select(\n \"path\",\n f.explode(f.col(\"h3_counts\").getItem(0)).alias(\"cell\")\n).select(\n \"path\",\n f.col(\"cell.cellID\").alias(\"h3_cell\"),\n f.col(\"cell.measure\").alias(\"pixel_count\")\n)\n\n# Find cells with low data density\nsparse_cells 
= cell_counts.filter(f.col(\"pixel_count\") < 10)\n\n# Data quality analysis: combine count with average\nquality = rasters.select(\n \"path\",\n rx.rst_h3_rastertogridcount(f.col(\"tile\"), f.lit(6)).alias(\"counts\"),\n rx.rst_h3_rastertogridavg(f.col(\"tile\"), f.lit(6)).alias(\"avgs\")\n)\n", - "line_number": 3751, - "length_lines": 31, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval counts = rasters.select(\n rx.rst_h3_rastertogridcount(col(\"tile\"), lit(6)).alias(\"h3_counts\")\n)\n", - "line_number": 3784, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n gbx_rst_h3_rastertogridcount(tile, 5) as pixel_counts\nFROM rasters;\n", - "line_number": 3793, - "length_lines": 4, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, 
- "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_rastertogridmax(tile: Column, resolution: Column): Column\n", - "line_number": 3812, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Find maximum elevation per H3 cell\ndem = spark.read.format(\"gdal\").load(\"/data/elevation.tif\")\n\nmax_elevation = dem.select(\n rx.rst_h3_rastertogridmax(f.col(\"tile\"), f.lit(8)).alias(\"h3_max_elev\")\n).select(\n f.explode(f.col(\"h3_max_elev\").getItem(0)).alias(\"cell\")\n).select(\n f.col(\"cell.cellID\").alias(\"h3_cell\"),\n f.col(\"cell.measure\").alias(\"max_elevation_m\")\n)\n\n# Peak detection\npeaks = max_elevation.filter(f.col(\"max_elevation_m\") > 2000)\n", - "line_number": 3827, - "length_lines": 18, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n cell.cellID as h3_cell,\n cell.measure as max_value\nFROM 
rasters\nLATERAL VIEW explode(gbx_rst_h3_rastertogridmax(tile, 7)[0]) AS cell;\n", - "line_number": 3847, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_rastertogridmin(tile: Column, resolution: Column): Column\n", - "line_number": 3867, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Find minimum temperature per H3 cell\ntemps = spark.read.format(\"gdal\").load(\"/data/temp_min.tif\")\n\nmin_temps = temps.select(\n rx.rst_h3_rastertogridmin(f.col(\"tile\"), f.lit(6)).alias(\"h3_min\")\n).select(\n f.explode(f.col(\"h3_min\").getItem(0)).alias(\"cell\")\n).select(\n f.col(\"cell.cellID\").alias(\"h3_cell\"),\n f.col(\"cell.measure\").alias(\"min_temp_c\")\n)\n\n# Cold spots\nfreezing = min_temps.filter(f.col(\"min_temp_c\") <= 0)\n", - "line_number": 3882, - "length_lines": 18, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": 
false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n cell.cellID as h3_cell,\n cell.measure as min_value\nFROM rasters\nLATERAL VIEW explode(gbx_rst_h3_rastertogridmin(tile, 7)[0]) AS cell;\n", - "line_number": 3902, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_h3_rastertogridmedian(tile: Column, resolution: Column): Column\n", - "line_number": 3921, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Robust aggregation using median (less sensitive to outliers)\nrasters = spark.read.format(\"gdal\").load(\"/data/ndvi/*.tif\")\n\nmedian_ndvi = rasters.select(\n \"path\",\n rx.rst_h3_rastertogridmedian(f.col(\"tile\"), f.lit(7)).alias(\"h3_median\")\n).select(\n \"path\",\n 
f.explode(f.col(\"h3_median\").getItem(0)).alias(\"cell\")\n).select(\n \"path\",\n f.col(\"cell.cellID\").alias(\"h3_cell\"),\n f.col(\"cell.measure\").alias(\"median_ndvi\")\n)\n\n# Compare mean vs median for outlier detection\ncomparison = rasters.select(\n \"path\",\n rx.rst_h3_rastertogridavg(f.col(\"tile\"), f.lit(7)).alias(\"avg\"),\n rx.rst_h3_rastertogridmedian(f.col(\"tile\"), f.lit(7)).alias(\"median\")\n)\n", - "line_number": 3936, - "length_lines": 25, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n cell.cellID as h3_cell,\n cell.measure as median_value\nFROM rasters\nLATERAL VIEW explode(gbx_rst_h3_rastertogridmedian(tile, 7)[0]) AS cell;\n", - "line_number": 3963, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_maketiles(tile: Column, tileWidth: Column, tileHeight: Column): Column\n", - "line_number": 3995, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - 
"has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Load large rasters\nlarge_rasters = spark.read.format(\"gdal\").load(\"/data/large/*.tif\")\n\n# Subdivide into 512x512 tiles\ntiled = large_rasters.select(\n \"path\",\n f.explode(rx.rst_maketiles(\n f.col(\"tile\"),\n f.lit(512),\n f.lit(512)\n )).alias(\"subtile\")\n)\n\n# Process tiles individually\nprocessed_tiles = tiled.select(\n \"path\",\n rx.rst_avg(f.col(\"subtile\")).alias(\"tile_avg\"),\n rx.rst_width(f.col(\"subtile\")).alias(\"tile_width\"),\n rx.rst_height(f.col(\"subtile\")).alias(\"tile_height\")\n)\n\n# Count tiles per raster\ntile_counts = large_rasters.select(\n \"path\",\n f.size(rx.rst_maketiles(f.col(\"tile\"), f.lit(512), f.lit(512))).alias(\"num_tiles\")\n)\n\n# Different tile sizes for different processing needs\n# Smaller tiles for detailed processing\ndetailed = large_rasters.select(\n f.explode(rx.rst_maketiles(f.col(\"tile\"), f.lit(256), f.lit(256))).alias(\"tile_256\")\n)\n\n# Larger tiles for coarser processing\ncoarse = large_rasters.select(\n f.explode(rx.rst_maketiles(f.col(\"tile\"), f.lit(1024), f.lit(1024))).alias(\"tile_1024\")\n)\n\n# Parallel processing pattern\nfrom pyspark.sql import Window\n\ntiled_with_id = tiled.select(\n \"path\",\n \"subtile\",\n f.row_number().over(Window.partitionBy(\"path\").orderBy(\"path\")).alias(\"tile_id\")\n)\n\n# Process each tile independently\nresults = tiled_with_id.select(\n \"path\",\n \"tile_id\",\n rx.rst_ndvi(f.col(\"subtile\"), f.lit(4), f.lit(8)).alias(\"ndvi_tile\")\n)\n", - "line_number": 4009, - "length_lines": 57, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", 
- "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Subdivide into tiles\nval tiled = largeRasters.select(\n col(\"path\"),\n explode(rx.rst_maketiles(col(\"tile\"), lit(512), lit(512))).alias(\"subtile\")\n)\n\n// Count tiles\nval tileCounts = largeRasters.select(\n col(\"path\"),\n size(rx.rst_maketiles(col(\"tile\"), lit(512), lit(512))).alias(\"num_tiles\")\n)\n", - "line_number": 4068, - "length_lines": 15, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Subdivide and explode tiles\nSELECT\n path,\n tile_subtile as tile\nFROM rasters\nLATERAL VIEW explode(gbx_rst_maketiles(tile, 512, 512)) AS tile_subtile;\n\n-- Count tiles per raster\nSELECT\n path,\n SIZE(gbx_rst_maketiles(tile, 512, 512)) as num_tiles\nFROM large_rasters;\n", - "line_number": 4085, - "length_lines": 13, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_retile(tile: Column, tileWidth: Column, tileHeight: Column): Column\n", - "line_number": 4119, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/*.tif\")\n\n# Retile to standard 256x256 tiles\nretiled = rasters.select(\n \"path\",\n f.explode(rx.rst_retile(f.col(\"tile\"), f.lit(256), f.lit(256))).alias(\"tile_256\")\n)\n\n# Process with consistent tile sizes\nuniform_processing = retiled.select(\n \"path\",\n rx.rst_width(f.col(\"tile_256\")).alias(\"width\"),\n rx.rst_height(f.col(\"tile_256\")).alias(\"height\"),\n rx.rst_avg(f.col(\"tile_256\")).alias(\"avg\")\n)\n\n# Retile for web mapping (standard tile sizes)\nweb_tiles = rasters.select(\n f.explode(rx.rst_retile(f.col(\"tile\"), f.lit(256), f.lit(256))).alias(\"web_tile\")\n)\n\n# Variable tile sizes\nrectangular_tiles = rasters.select(\n f.explode(rx.rst_retile(f.col(\"tile\"), f.lit(512), f.lit(256))).alias(\"rect_tile\")\n)\n", - "line_number": 4133, - "length_lines": 29, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - 
"has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Retile to 256x256\nval retiled = rasters.select(\n explode(rx.rst_retile(col(\"tile\"), lit(256), lit(256))).alias(\"tile\")\n)\n", - "line_number": 4164, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n tile\nFROM rasters\nLATERAL VIEW explode(gbx_rst_retile(tile, 256, 256)) AS tile;\n", - "line_number": 4174, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_tooverlappingtiles(tile: Column, tileWidth: Column, tileHeight: Column, overlap: Column): Column\n", - "line_number": 4196, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - 
"has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\nrasters = spark.read.format(\"gdal\").load(\"/data/*.tif\")\n\n# Create overlapping tiles for convolution\n# 10 pixel overlap ensures edge pixels have full neighborhood\noverlapping = rasters.select(\n \"path\",\n f.explode(rx.rst_tooverlappingtiles(\n f.col(\"tile\"),\n f.lit(256), # tile width\n f.lit(256), # tile height\n f.lit(10) # 10 pixel overlap\n )).alias(\"overlap_tile\")\n)\n\n# Apply convolution to overlapping tiles\n# Overlap prevents edge artifacts\nsobel_kernel = [\n [-1.0, 0.0, 1.0],\n [-2.0, 0.0, 2.0],\n [-1.0, 0.0, 1.0]\n]\n\nedge_detected = overlapping.select(\n \"path\",\n rx.rst_convolve(\n f.col(\"overlap_tile\"),\n f.array(*[f.array(*[f.lit(v) for v in row]) for row in sobel_kernel])\n ).alias(\"edges\")\n)\n\n# Larger overlap for larger kernels\n# 5x5 kernel needs at least 2 pixel overlap\n# Use larger overlap for safety\nlarge_overlap = rasters.select(\n f.explode(rx.rst_tooverlappingtiles(\n f.col(\"tile\"),\n f.lit(512),\n f.lit(512),\n f.lit(5) # 5 pixel overlap for 5x5 kernels\n )).alias(\"tile\")\n)\n\n# Filter operations benefit from overlap\nfiltered = overlapping.select(\n rx.rst_filter(f.col(\"overlap_tile\"), f.lit(3), f.lit(\"median\")).alias(\"filtered\")\n)\n\n# Compare with and without overlap\nno_overlap = rasters.select(\n f.explode(rx.rst_maketiles(f.col(\"tile\"), f.lit(256), f.lit(256))).alias(\"tile\")\n)\n\nwith_overlap = rasters.select(\n f.explode(rx.rst_tooverlappingtiles(\n f.col(\"tile\"), f.lit(256), f.lit(256), f.lit(10)\n )).alias(\"tile\")\n)\n", - "line_number": 4211, - "length_lines": 61, - 
"source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Create overlapping tiles\nval overlapping = rasters.select(\n explode(rx.rst_tooverlappingtiles(col(\"tile\"), lit(256), lit(256), lit(10))).alias(\"tile\")\n)\n", - "line_number": 4274, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n tile\nFROM rasters\nLATERAL VIEW explode(gbx_rst_tooverlappingtiles(tile, 256, 256, 10)) AS tile;\n", - "line_number": 4284, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_separatebands(tile: Column): Column\n", - 
"line_number": 4318, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Separate RGB bands\nrgb_rasters = spark.read.format(\"gdal\").load(\"/data/rgb/*.tif\")\n\nseparated = rgb_rasters.select(\n \"path\",\n rx.rst_separatebands(f.col(\"tile\")).alias(\"bands\")\n).select(\n \"path\",\n f.col(\"bands\").getItem(0).alias(\"red_band\"),\n f.col(\"bands\").getItem(1).alias(\"green_band\"),\n f.col(\"bands\").getItem(2).alias(\"blue_band\")\n)\n\n# Process each band independently\nred_stats = separated.select(\n \"path\",\n rx.rst_avg(f.col(\"red_band\")).alias(\"red_avg\"),\n rx.rst_min(f.col(\"red_band\")).alias(\"red_min\"),\n rx.rst_max(f.col(\"red_band\")).alias(\"red_max\")\n)\n\n# Explode to one row per band\nbands_exploded = rgb_rasters.select(\n \"path\",\n f.posexplode(rx.rst_separatebands(f.col(\"tile\"))).alias(\"band_index\", \"band\")\n)\n", - "line_number": 4330, - "length_lines": 30, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import 
com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval separated = rasters.select(\n col(\"path\"),\n rx.rst_separatebands(col(\"tile\")).alias(\"bands\")\n)\n", - "line_number": 4362, - "length_lines": 8, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n path,\n bands[0] as red_band,\n bands[1] as green_band,\n bands[2] as blue_band\nFROM (\n SELECT path, gbx_rst_separatebands(tile) as bands\n FROM rgb_rasters\n);\n", - "line_number": 4372, - "length_lines": 10, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_combineavg(tiles: Column): Column\n", - "line_number": 4401, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - 
"language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Average multiple rasters (e.g., time series)\ndf = spark.createDataFrame([\n (1, \"/data/2024-01-01.tif\"),\n (2, \"/data/2024-01-02.tif\"),\n (3, \"/data/2024-01-03.tif\")\n], [\"id\", \"path\"])\n\n# Load and combine with average\ndf = df.withColumn(\n \"tile\",\n rx.rst_fromfile(f.col(\"path\"), f.lit(\"GTiff\"))\n)\n\naveraged = df.select(\n rx.rst_combineavg(f.collect_list(\"tile\")).alias(\"avg_tile\")\n)\n\n# Create monthly average from daily rasters\ndaily_rasters = spark.table(\"daily_temperature_rasters\")\n\nmonthly_avg = daily_rasters.select(\n f.date_trunc(\"month\", \"date\").alias(\"month\"),\n \"tile\"\n).groupBy(\"month\").agg(\n rx.rst_combineavg(f.collect_list(\"tile\")).alias(\"monthly_avg_tile\")\n)\n", - "line_number": 4413, - "length_lines": 30, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Combine rasters using average\nval tiles = Seq(\n \"/data/2024-01-01.tif\",\n \"/data/2024-01-02.tif\",\n \"/data/2024-01-03.tif\"\n).toDF(\"path\")\n .withColumn(\"tile\", rx.rst_fromfile(col(\"path\"), lit(\"GTiff\")))\n\nval avgTile = tiles.select(\n rx.rst_combineavg(collect_list(\"tile\")).alias(\"avg_tile\")\n)\n", - "line_number": 4445, - "length_lines": 15, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - 
"has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Average rasters for temporal composite\nWITH loaded_tiles AS (\n SELECT \n date_trunc('week', date) as week,\n gbx_rst_fromfile(path, 'GTiff') as tile\n FROM daily_rasters\n WHERE date >= '2024-01-01'\n)\nSELECT \n week,\n gbx_rst_combineavg(collect_list(tile)) as weekly_avg\nFROM loaded_tiles\nGROUP BY week;\n", - "line_number": 4462, - "length_lines": 14, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_combineavg_agg(tile: Column): Column\n", - "line_number": 4491, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Average rasters by region\nrasters_with_regions = 
spark.table(\"regional_rasters\")\n\nregional_avg = rasters_with_regions.groupBy(\"region\").agg(\n rx.rst_combineavg_agg(f.col(\"tile\")).alias(\"avg_tile\")\n)\n\n# Monthly temporal composites\ndaily_images = spark.table(\"daily_satellite\")\n\nmonthly_composites = daily_images.groupBy(\n f.date_trunc(\"month\", \"date\").alias(\"month\")\n).agg(\n rx.rst_combineavg_agg(f.col(\"tile\")).alias(\"monthly_avg\")\n)\n\n# Average by land cover type\nland_cover_avg = rasters.groupBy(\"land_cover_class\").agg(\n rx.rst_combineavg_agg(f.col(\"tile\")).alias(\"class_avg_tile\")\n)\n", - "line_number": 4503, - "length_lines": 24, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval regionAvg = rasters.groupBy(\"region\").agg(\n rx.rst_combineavg_agg(col(\"tile\")).alias(\"avg_tile\")\n)\n", - "line_number": 4529, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Group by region and average\nSELECT\n region,\n gbx_rst_combineavg_agg(tile) as regional_average\nFROM rasters\nGROUP BY 
region;\n", - "line_number": 4538, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_derivedband_agg(tile: Column, pythonFunc: String, funcName: String): Column\n", - "line_number": 4559, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\navg_func = \"\"\"\nimport numpy as np\ndef band_average(in_ar, out_ar, xoff, yoff, xsize, ysize,\n raster_xsize, raster_ysize, buf_radius, gt, **kwargs):\n out_ar[:] = np.mean(in_ar, axis=0)\n\"\"\"\n\n# Apply custom Python function during aggregation\nregional = rasters.groupBy(\"region\").agg(\n rx.rst_derivedband_agg(f.col(\"tile\"), f.lit(avg_func), f.lit(\"band_average\")).alias(\"result\")\n)\n", - "line_number": 4573, - "length_lines": 15, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - 
"uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "rst_merge_agg(tile: Column): Column\n", - "line_number": 4601, - "length_lines": 2, - "source_file": "api/rasterx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Merge tiles by scene ID\nscenes = spark.table(\"satellite_tiles\")\n\nmerged_scenes = scenes.groupBy(\"scene_id\").agg(\n rx.rst_merge_agg(f.col(\"tile\")).alias(\"merged_scene\")\n)\n\n# Mosaic by date\ndaily_mosaic = scenes.groupBy(\n f.to_date(\"acquisition_time\").alias(\"date\")\n).agg(\n rx.rst_merge_agg(f.col(\"tile\")).alias(\"daily_mosaic\")\n)\n\n# Merge by region\nregional_mosaic = scenes.groupBy(\"region\").agg(\n rx.rst_merge_agg(f.col(\"tile\")).alias(\"regional_tile\"),\n f.count(\"*\").alias(\"num_tiles\")\n)\n", - "line_number": 4613, - "length_lines": 23, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import 
com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nval merged = tiles.groupBy(\"scene_id\").agg(\n rx.rst_merge_agg(col(\"tile\")).alias(\"merged\")\n)\n", - "line_number": 4638, - "length_lines": 7, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n scene_id,\n gbx_rst_merge_agg(tile) as merged_scene\nFROM satellite_tiles\nGROUP BY scene_id;\n", - "line_number": 4647, - "length_lines": 6, - "source_file": "api/rasterx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql import functions as f\n\n# Register functions\nrx.register(spark)\n\n# Read rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite\")\n\n# Process pipeline\nresult = (\n rasters\n # Extract metadata\n .select(\n \"path\",\n \"tile\",\n rx.rst_boundingbox(f.col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(f.col(\"tile\")).alias(\"width\"),\n rx.rst_height(f.col(\"tile\")).alias(\"height\"),\n rx.rst_numbands(f.col(\"tile\")).alias(\"bands\"),\n rx.rst_srid(f.col(\"tile\")).alias(\"srid\")\n )\n # Filter by size\n .filter(\"width 
> 1000 AND height > 1000\")\n # Reproject to WGS84\n .select(\n \"path\",\n f.col(\"tile\"),\n rx.rst_transform(f.col(\"tile\"), f.lit(4326)).alias(\"wgs84_tile\")\n )\n # Clip to area of interest (WKT polygon)\n .select(\n \"path\",\n rx.rst_clip(\n f.col(\"wgs84_tile\"),\n f.lit(\"POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))\"),\n f.lit(True)\n ).alias(\"clipped\")\n )\n # Tessellate to H3 grid\n .select(\n \"path\",\n f.explode(rx.rst_h3_tessellate(f.col(\"clipped\"), f.lit(7))).alias(\"h3_tile\")\n )\n # Extract cell ID and compute statistics\n .select(\n \"path\",\n f.col(\"h3_tile.cellid\").alias(\"h3_cell\"),\n rx.rst_avg(f.col(\"h3_tile\")).alias(\"avg_value\"),\n rx.rst_min(f.col(\"h3_tile\")).alias(\"min_value\"),\n rx.rst_max(f.col(\"h3_tile\")).alias(\"max_value\")\n )\n)\n\n# Save results\nresult.write.mode(\"overwrite\").saveAsTable(\"processed_rasters\")\n", - "line_number": 4667, - "length_lines": 57, - "source_file": "api/rasterx-functions.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\n\n// Register functions\nrx.register(spark)\n\n// Use functions\nval df = rasters.select(rx.rst_boundingbox(col(\"tile\")))\n", - "line_number": 13, - "length_lines": 8, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": 
false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.{functions => bx}\n\n// Register functions\nbx.register(spark)\n\n// Use functions\nval df = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000)\")\n", - "line_number": 25, - "length_lines": 8, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.vectorx.{functions => vx}\n\n// Register functions\nvx.register(spark)\n\n// Use functions\nval df = legacyData.select(vx.st_legacyaswkb(col(\"mosaic_geom\")))\n", - "line_number": 37, - "length_lines": 8, - "source_file": "api/scala.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nrx.register(spark)\n\n// Read rasters\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\n// Extract metadata\nval metadata = rasters.select(\n col(\"path\"),\n rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n 
rx.rst_height(col(\"tile\")).alias(\"height\"),\n rx.rst_numbands(col(\"tile\")).alias(\"num_bands\"),\n rx.rst_metadata(col(\"tile\")).alias(\"metadata\")\n)\n\nmetadata.show()\n", - "line_number": 51, - "length_lines": 21, - "source_file": "api/scala.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nrx.register(spark)\n\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\n// Clip raster\nval clipped = rasters.select(\n col(\"path\"),\n rx.rst_clip(\n col(\"tile\"),\n expr(\"st_geomfromtext('POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))')\")\n ).alias(\"clipped_tile\")\n)\n\n// Resample raster\nval resampled = rasters.select(\n col(\"path\"),\n rx.rst_resample(col(\"tile\"), lit(1024), lit(1024)).alias(\"resampled_tile\")\n)\n", - "line_number": 76, - "length_lines": 22, - "source_file": "api/scala.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nrx.register(spark)\n\n// Read rasters\nval rasters = 
spark.read.format(\"gdal\").load(\"/data/satellite\")\n\n// Extract metadata and filter\nval catalog = rasters.select(\n col(\"path\"),\n rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n rx.rst_height(col(\"tile\")).alias(\"height\"),\n rx.rst_numbands(col(\"tile\")).alias(\"bands\"),\n rx.rst_metadata(col(\"tile\")).alias(\"metadata\")\n).filter(\n col(\"width\") > 1000 && col(\"height\") > 1000\n)\n\n// Write to Delta\ncatalog.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 102, - "length_lines": 24, - "source_file": "api/scala.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.{functions => bx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nbx.register(spark)\n\n// Calculate cell area\nval area = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000) as area_sqm\")\narea.show()\n\n// Convert points to BNG cells\nval points = spark.table(\"uk_locations\")\nval bngCells = points.select(\n col(\"location_id\"),\n expr(\"gbx_bng_pointtocell(st_point(longitude, latitude), 1000)\").alias(\"bng_cell\")\n)\n\nbngCells.show()\n", - "line_number": 132, - "length_lines": 19, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": 
true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.{functions => bx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nbx.register(spark)\n\n// Aggregate by BNG cell\nval aggregated = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as point_count,\n AVG(value) as avg_value\n FROM measurements\n WHERE country = 'GB'\n GROUP BY bng_cell\n\"\"\")\n\naggregated.write.mode(\"overwrite\").saveAsTable(\"bng_aggregated\")\n", - "line_number": 155, - "length_lines": 19, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.vectorx.{functions => vx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nvx.register(spark)\n\n// Convert legacy geometries\nval legacy = spark.table(\"legacy_mosaic_table\")\nval converted = legacy.select(\n col(\"feature_id\"),\n vx.st_legacyaswkb(col(\"mosaic_geom\")).alias(\"wkb_geom\")\n)\n\n// Convert to Databricks GEOMETRY type\nval geometryDf = converted.select(\n col(\"feature_id\"),\n col(\"wkb_geom\"),\n expr(\"st_geomfromwkb(wkb_geom)\").alias(\"geometry\")\n)\n\ngeometryDf.write.mode(\"overwrite\").saveAsTable(\"converted_features\")\n", - "line_number": 180, - "length_lines": 22, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - 
"has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.vectorx.{functions => vx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nvx.register(spark)\n\n// Full migration workflow\nval legacyTable = spark.table(\"legacy_mosaic_geometries\")\n\n// Convert and validate\nval migrated = legacyTable\n .select(\n col(\"*\"),\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n )\n .select(\n col(\"feature_id\"),\n col(\"properties\"),\n col(\"geometry\"),\n expr(\"st_isvalid(geometry)\").alias(\"is_valid\"),\n expr(\"st_area(geometry)\").alias(\"area\")\n )\n .filter(col(\"is_valid\") === true)\n\n// Save to Delta\nmigrated.write.mode(\"overwrite\").saveAsTable(\"migrated_features\")\n", - "line_number": 206, - "length_lines": 27, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.functions._\n\nrx.register(spark)\n\n// Single function\nval result = df.select(rx.rst_boundingbox(col(\"tile\")))\n\n// Multiple functions\nval result = df.select(\n rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n rx.rst_height(col(\"tile\")).alias(\"height\")\n)\n\n// With additional columns\nval result = df.select(\n col(\"path\"),\n 
rx.rst_metadata(col(\"tile\")).alias(\"raster_metadata\")\n)\n", - "line_number": 239, - "length_lines": 21, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Filter based on function results\nval result = df.filter(rx.rst_width(col(\"tile\")) > 1000)\n\n// Complex filters\nval result = df.filter(\n rx.rst_width(col(\"tile\")) > 1000 &&\n rx.rst_height(col(\"tile\")) > 1000 &&\n rx.rst_numbands(col(\"tile\")) >= 3\n)\n", - "line_number": 264, - "length_lines": 10, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Add new columns\nval result = df\n .withColumn(\"bbox\", rx.rst_boundingbox(col(\"tile\")))\n .withColumn(\"width\", rx.rst_width(col(\"tile\")))\n .withColumn(\"height\", rx.rst_height(col(\"tile\")))\n", - "line_number": 278, - "length_lines": 6, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - 
"uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\n\n// Register functions\nrx.register(spark)\n\n// Create temp view\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nrasters.createOrReplaceTempView(\"rasters\")\n\n// Use in SQL\nval result = spark.sql(\"\"\"\n SELECT\n path,\n gbx_rst_boundingbox(tile) as bbox,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height\n FROM rasters\n WHERE gbx_rst_width(tile) > 1000\n\"\"\")\n\nresult.show()\n", - "line_number": 288, - "length_lines": 22, - "source_file": "api/scala.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport org.apache.spark.sql.{DataFrame, SparkSession}\nimport org.apache.spark.sql.functions._\n\ndef processRasters(spark: SparkSession, df: DataFrame): DataFrame = {\n rx.register(spark)\n \n df.select(\n col(\"path\"),\n rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n rx.rst_height(col(\"tile\")).alias(\"height\")\n )\n}\n\n// Usage\nval rasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nval processed = processRasters(spark, rasters)\n", - "line_number": 316, - "length_lines": 19, - "source_file": "api/scala.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": 
true, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport scala.util.{Try, Success, Failure}\n\nTry {\n rx.register(spark)\n val result = df.select(rx.rst_boundingbox(col(\"tile\")))\n result.show()\n} match {\n case Success(_) => println(\"Processing successful\")\n case Failure(exception) => println(s\"Error: ${exception.getMessage}\")\n}\n", - "line_number": 339, - "length_lines": 12, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import org.apache.spark.sql.functions.col\n\n// Good: Use col() references\nval result = df.select(rx.rst_boundingbox(col(\"tile\")))\n\n// Avoid: String column names in Scala\n// val result = df.select(rx.rst_boundingbox(\"tile\"))\n", - "line_number": 357, - "length_lines": 8, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Process multiple operations at once\nval result = df.select(\n col(\"path\"),\n 
rx.rst_boundingbox(col(\"tile\")).alias(\"bbox\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n rx.rst_height(col(\"tile\")).alias(\"height\"),\n rx.rst_metadata(col(\"tile\")).alias(\"metadata\")\n)\n", - "line_number": 369, - "length_lines": 9, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Cache expensive operations\nval enriched = df\n .withColumn(\"bbox\", rx.rst_boundingbox(col(\"tile\")))\n .withColumn(\"metadata\", rx.rst_metadata(col(\"tile\")))\n .cache()\n\n// Use multiple times\nval filtered1 = enriched.filter(col(\"width\") > 1000)\nval filtered2 = enriched.filter(col(\"bands\") > 3)\n", - "line_number": 382, - "length_lines": 10, - "source_file": "api/scala.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom databricks.labs.gbx.gridx.bng import functions as bx\nfrom databricks.labs.gbx.vectorx import functions as vx\n\nrx.register(spark)\nbx.register(spark)\nvx.register(spark)\n", - "line_number": 15, - "length_lines": 8, - "source_file": "api/sql.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - 
"has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\nimport com.databricks.labs.gbx.gridx.bng.{functions => bx}\nimport com.databricks.labs.gbx.vectorx.{functions => vx}\n\nrx.register(spark)\nbx.register(spark)\nvx.register(spark)\n", - "line_number": 27, - "length_lines": 8, - "source_file": "api/sql.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- List all GeoBrix functions\nSHOW FUNCTIONS LIKE 'gbx_*';\n\n-- List RasterX functions\nSHOW FUNCTIONS LIKE 'gbx_rst_*';\n\n-- List GridX functions\nSHOW FUNCTIONS LIKE 'gbx_bng_*';\n\n-- List VectorX functions\nSHOW FUNCTIONS LIKE 'gbx_st_*';\n", - "line_number": 47, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Get function details\nDESCRIBE FUNCTION gbx_rst_boundingbox;\n\n-- Get extended information\nDESCRIBE FUNCTION 
EXTENDED gbx_rst_boundingbox;\n", - "line_number": 63, - "length_lines": 6, - "source_file": "api/sql.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read rasters\nCREATE OR REPLACE TEMP VIEW rasters AS\nSELECT * FROM gdal.`/data/rasters`;\n\n-- Extract metadata\nSELECT\n path,\n gbx_rst_boundingbox(tile) as bbox,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height,\n gbx_rst_numbands(tile) as num_bands,\n gbx_rst_metadata(tile) as metadata\nFROM rasters;\n", - "line_number": 75, - "length_lines": 14, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Filter by dimensions\nSELECT *\nFROM rasters\nWHERE gbx_rst_width(tile) > 1000\n AND gbx_rst_height(tile) > 1000;\n\n-- Filter by band count\nSELECT *\nFROM rasters\nWHERE gbx_rst_numbands(tile) >= 3;\n", - "line_number": 93, - "length_lines": 11, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": 
false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Clip raster\nSELECT\n path,\n gbx_rst_clip(\n tile,\n st_geomfromtext('POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))')\n ) as clipped_tile\nFROM rasters;\n\n-- Create raster catalog\nCREATE OR REPLACE TABLE raster_catalog AS\nSELECT\n path,\n gbx_rst_boundingbox(tile) as bounds,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height,\n gbx_rst_numbands(tile) as bands,\n gbx_rst_metadata(tile) as metadata\nFROM rasters;\n", - "line_number": 108, - "length_lines": 20, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Calculate cell area\nSELECT gbx_bng_cellarea('TQ', 1000) as area_sqm;\n\n-- Different precisions\nSELECT\n 'TQ' as grid,\n gbx_bng_cellarea('TQ', 10000) as area_10km,\n gbx_bng_cellarea('TQ', 1000) as area_1km,\n gbx_bng_cellarea('TQ', 100) as area_100m;\n", - "line_number": 134, - "length_lines": 10, - "source_file": "api/sql.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Convert points to BNG cells\nCREATE OR REPLACE TEMP VIEW uk_points_bng AS\nSELECT\n 
location_id,\n latitude,\n longitude,\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell_1km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 100) as bng_cell_100m\nFROM uk_locations;\n\nSELECT * FROM uk_points_bng;\n", - "line_number": 148, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Aggregate by BNG cell\nCREATE OR REPLACE TABLE bng_aggregated AS\nSELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as point_count,\n AVG(temperature) as avg_temp,\n MAX(temperature) as max_temp,\n MIN(temperature) as min_temp\nFROM weather_stations\nWHERE country = 'GB'\nGROUP BY bng_cell;\n\nSELECT * FROM bng_aggregated ORDER BY point_count DESC LIMIT 10;\n", - "line_number": 164, - "length_lines": 14, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Analyze at multiple resolutions\nCREATE OR REPLACE VIEW multi_resolution AS\nSELECT\n location_id,\n gbx_bng_pointtocell(location, 10000) as bng_10km,\n gbx_bng_pointtocell(location, 1000) as bng_1km,\n gbx_bng_pointtocell(location, 100) as bng_100m\nFROM locations;\n\n-- Count by 
resolution\nSELECT '10km' as resolution, COUNT(DISTINCT bng_10km) as cell_count FROM multi_resolution\nUNION ALL\nSELECT '1km', COUNT(DISTINCT bng_1km) FROM multi_resolution\nUNION ALL\nSELECT '100m', COUNT(DISTINCT bng_100m) FROM multi_resolution;\n", - "line_number": 182, - "length_lines": 16, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Convert legacy Mosaic geometries\nCREATE OR REPLACE TEMP VIEW converted_geometries AS\nSELECT\n feature_id,\n properties,\n gbx_st_legacyaswkb(mosaic_geom) as wkb_geom,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\nFROM legacy_mosaic_table;\n\nSELECT * FROM converted_geometries;\n", - "line_number": 204, - "length_lines": 11, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Full table migration\nCREATE OR REPLACE TABLE migrated_features AS\nSELECT\n feature_id,\n properties,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\nFROM legacy_mosaic_table;\n\n-- Validate results\nSELECT\n COUNT(*) as total,\n COUNT(geometry) as with_geometry,\n COUNT(CASE WHEN st_isvalid(geometry) THEN 1 END) as valid_geometries\nFROM migrated_features;\n", - 
"line_number": 219, - "length_lines": 15, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Convert and analyze\nCREATE OR REPLACE VIEW features_analyzed AS\nSELECT\n feature_id,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry,\n st_area(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as area,\n st_length(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as perimeter,\n st_centroid(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as centroid\nFROM legacy_features;\n\nSELECT * FROM features_analyzed WHERE area > 1000;\n", - "line_number": 238, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read shapefile\nCREATE OR REPLACE TEMP VIEW shapes AS\nSELECT * FROM shapefile.`/data/boundaries.shp`;\n\n-- Convert geometry\nCREATE OR REPLACE VIEW shapes_geom AS\nSELECT\n *,\n st_geomfromwkb(geom_0) as geometry\nFROM shapes;\n\nSELECT\n name,\n st_area(geometry) as area,\n st_centroid(geometry) as center\nFROM shapes_geom;\n", - "line_number": 256, - "length_lines": 17, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoJSON\nCREATE OR REPLACE TEMP VIEW features AS\nSELECT * FROM geojson.`/data/features.geojson`;\n\n-- Query with geometry\nSELECT\n name,\n type,\n st_area(st_geomfromwkb(geom_0)) as area\nFROM features\nWHERE st_area(st_geomfromwkb(geom_0)) > 1000;\n", - "line_number": 277, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoPackage (note: specify layer in Python/Scala first)\n-- gpkg = spark.read.format(\"gpkg\").option(\"layerName\", \"buildings\").load(\"/data/city.gpkg\")\n-- gpkg.createOrReplaceTempView(\"buildings\")\n\nSELECT\n building_id,\n name,\n st_area(st_geomfromwkb(shape)) as floor_area,\n st_centroid(st_geomfromwkb(shape)) as center_point\nFROM buildings\nWHERE st_area(st_geomfromwkb(shape)) > 500;\n", - "line_number": 293, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": 
true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Join based on spatial relationship\nSELECT\n p.parcel_id,\n p.owner,\n z.zone_name,\n z.zone_type\nFROM\n (SELECT *, st_geomfromwkb(geom_0) as geometry FROM parcels) p\nJOIN\n (SELECT *, st_geomfromwkb(geom_0) as geometry FROM zones) z\n ON st_contains(z.geometry, st_centroid(p.geometry));\n", - "line_number": 311, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Aggregate by zone\nSELECT\n z.zone_name,\n COUNT(p.parcel_id) as parcel_count,\n SUM(st_area(p.geometry)) as total_area\nFROM\n (SELECT *, st_geomfromwkb(geom_0) as geometry FROM parcels) p\nJOIN\n (SELECT *, st_geomfromwkb(geom_0) as geometry FROM zones) z\n ON st_contains(z.geometry, p.geometry)\nGROUP BY z.zone_name;\n", - "line_number": 327, - "length_lines": 12, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Combine raster and vector\nWITH raster_catalog AS (\n SELECT\n path,\n gbx_rst_boundingbox(tile) as raster_bounds\n FROM gdal.`/data/rasters`\n),\nvector_features AS (\n SELECT\n feature_id,\n st_geomfromwkb(geom_0) as 
geometry\n FROM shapefile.`/data/features.shp`\n)\nSELECT\n r.path,\n v.feature_id,\n st_intersects(v.geometry, r.raster_bounds) as intersects\nFROM raster_catalog r\nCROSS JOIN vector_features v\nWHERE st_intersects(v.geometry, r.raster_bounds);\n", - "line_number": 343, - "length_lines": 21, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create catalog table\nCREATE OR REPLACE TABLE catalog.schema.raster_metadata AS\nSELECT\n path,\n gbx_rst_boundingbox(tile) as bounds,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height,\n gbx_rst_numbands(tile) as bands\nFROM gdal.`/data/rasters`;\n\n-- Query table\nSELECT * FROM catalog.schema.raster_metadata\nWHERE width > 1000;\n", - "line_number": 370, - "length_lines": 14, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create temp view\nCREATE OR REPLACE TEMP VIEW processed_features AS\nSELECT\n feature_id,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry,\n st_area(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as area\nFROM legacy_data;\n\n-- Use in session\nSELECT * FROM processed_features WHERE area > 5000;\n", - "line_number": 388, - 
"length_lines": 11, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Good: Filter before processing\nSELECT\n path,\n gbx_rst_metadata(tile) as metadata\nFROM rasters\nWHERE path LIKE '%2024%';\n\n-- Less efficient: Process then filter\n-- SELECT * FROM (\n-- SELECT path, gbx_rst_metadata(tile) as metadata\n-- FROM rasters\n-- ) WHERE path LIKE '%2024%';\n", - "line_number": 405, - "length_lines": 13, - "source_file": "api/sql.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view once\nCREATE OR REPLACE VIEW raster_catalog AS\nSELECT\n path,\n gbx_rst_boundingbox(tile) as bounds,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height\nFROM gdal.`/data/rasters`;\n\n-- Query multiple times\nSELECT * FROM raster_catalog WHERE width > 1000;\nSELECT * FROM raster_catalog WHERE height > 1000;\n", - "line_number": 422, - "length_lines": 13, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create partitioned table\nCREATE TABLE catalog.schema.features\nPARTITIONED BY (region)\nAS\nSELECT\n *,\n st_geomfromwkb(geom_0) as geometry,\n region\nFROM shapefile.`/data/features.shp`;\n\n-- Efficient regional queries\nSELECT * FROM catalog.schema.features\nWHERE region = 'northeast';\n", - "line_number": 439, - "length_lines": 14, - "source_file": "api/sql.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Non-tessellated: cellid is null\nSELECT tile.cellid \nFROM gdal.`/path/to/raster.tif`;\n-- Returns: null\n\n-- Tessellated: cellid contains H3 cell ID\nSELECT tile.cellid \nFROM (\n SELECT explode(gbx_rst_h3_tessellate(tile, 7)) as tile\n FROM gdal.`/path/to/raster.tif`\n);\n-- Returns: 604189641255419903, 604189641255420159, ...\n", - "line_number": 51, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Access path from rst_fromfile\ndf = spark.range(1).select(\n 
rx.rst_fromfile(f.lit(\"/path/to/raster.tif\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\npath_df = df.select(f.col(\"tile.raster\").alias(\"raster_path\"))\n# Returns: \"/path/to/raster.tif\"\n\n# Access binary from GDAL reader\ndf = spark.read.format(\"gdal\").load(\"/path/to/rasters/*.tif\")\nbinary_df = df.select(f.col(\"tile.raster\").alias(\"raster_binary\"))\n# Returns: b'\\x4d\\x4d\\x00\\x2a...' (binary GeoTIFF data)\n", - "line_number": 93, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "metadata_df = df.select(\n f.col(\"tile.metadata\").alias(\"metadata\"),\n f.col(\"tile.metadata.driver\").alias(\"driver\"),\n f.col(\"tile.metadata.extension\").alias(\"extension\"),\n f.col(\"tile.metadata.size\").alias(\"size\")\n)\n\n# Returns:\n# metadata: {\"driver\": \"GTiff\", \"extension\": \".tif\", \"size\": \"2345678\"}\n# driver: \"GTiff\"\n# extension: \".tif\"\n# size: \"2345678\"\n", - "line_number": 124, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql import functions as f\nfrom databricks.labs.gbx.rasterx import functions as 
rx\n\ndf = spark.read.format(\"gdal\").load(\"/path/to/rasters/*.tif\")\n\n# Access individual fields\ndf.select(\n f.col(\"tile.cellid\"),\n f.col(\"tile.raster\"),\n f.col(\"tile.metadata\"),\n f.col(\"tile.metadata.driver\")\n)\n", - "line_number": 148, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import org.apache.spark.sql.functions._\nimport com.databricks.labs.gbx.rasterx.{functions => rx}\n\nval df = spark.read.format(\"gdal\").load(\"/path/to/rasters/*.tif\")\n\n// Access individual fields\ndf.select(\n col(\"tile.cellid\"),\n col(\"tile.raster\"),\n col(\"tile.metadata\"),\n col(\"tile.metadata.driver\")\n)\n", - "line_number": 164, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT \n tile.cellid,\n tile.raster,\n tile.metadata,\n tile.metadata['driver'] as driver\nFROM gdal.`/path/to/rasters/*.tif`;\n", - "line_number": 180, - "length_lines": 7, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": 
false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Filter by driver\ngtiff_only = df.filter(f.col(\"tile.metadata.driver\") == \"GTiff\")\n\n# Filter by file extension\ntif_files = df.filter(f.col(\"tile.metadata.extension\") == \".tif\")\n", - "line_number": 193, - "length_lines": 6, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import udf\nfrom pyspark.sql.types import IntegerType\n\n@udf(IntegerType())\ndef get_raster_size(raster_binary, metadata):\n \"\"\"Get size of raster data\"\"\"\n if metadata and \"size\" in metadata:\n return int(metadata[\"size\"])\n elif raster_binary:\n return len(raster_binary)\n return 0\n\ndf_with_size = df.withColumn(\n \"data_size\",\n get_raster_size(f.col(\"tile.raster\"), f.col(\"tile.metadata\"))\n)\n", - "line_number": 205, - "length_lines": 17, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": 
"python", - "code": "from rasterio.io import MemoryFile\n\n@udf(DoubleType())\ndef compute_mean_from_tile(raster_binary):\n \"\"\"Compute mean from binary raster data\"\"\"\n import numpy as np\n \n if raster_binary is None:\n return None\n \n # Convert to bytes if needed\n tile_data = bytes(raster_binary)\n \n # Open with rasterio\n with MemoryFile(tile_data) as memfile:\n with memfile.open() as src:\n data = src.read(1)\n return float(np.mean(data))\n\n# Use with tiles from content or GDAL reader\ndf = spark.read.format(\"gdal\").load(\"/path/to/rasters/*.tif\")\nstats_df = df.withColumn(\n \"mean_value\",\n compute_mean_from_tile(f.col(\"tile.raster\"))\n)\n", - "line_number": 228, - "length_lines": 26, - "source_file": "api/tile-structure.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# File-based tile\nfile_tile = spark.range(1).select(\n rx.rst_fromfile(f.lit(\"/path/raster.tif\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\nfile_tile.select(f.col(\"tile.raster\")).show(truncate=False)\n# +---------------------------+\n# |raster |\n# +---------------------------+\n# |/path/raster.tif |\n# +---------------------------+\n\n# Binary tile\nbinary_tile = spark.read.format(\"binaryFile\").load(\"/path/raster.tif\").select(\n rx.rst_fromcontent(f.col(\"content\"), f.lit(\"GTiff\")).alias(\"tile\")\n)\n\nbinary_tile.select(f.length(f.col(\"tile.raster\")).alias(\"size_bytes\")).show()\n# +-----------+\n# |size_bytes |\n# +-----------+\n# |2345678 |\n# +-----------+\n", - "line_number": 258, - "length_lines": 
26, - "source_file": "api/tile-structure.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"gdal\").load(\"/path/to/raster.tif\")\n\ndf.select(\n f.col(\"tile.cellid\"), # null\n f.col(\"tile.raster\"), # binary data\n f.col(\"tile.metadata\") # {driver: \"GTiff\", ...}\n).show()\n", - "line_number": 294, - "length_lines": 8, - "source_file": "api/tile-structure.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"gdal\").load(\"/path/to/raster.tif\").select(\n f.explode(rx.rst_h3_tessellate(f.col(\"tile\"), f.lit(7))).alias(\"tile\")\n)\n\ndf.select(\n f.col(\"tile.cellid\"), # H3 cell ID (e.g., 604189641255419903)\n f.col(\"tile.raster\"), # binary data (clipped to cell)\n f.col(\"tile.metadata\") # {driver: \"GTiff\", RASTERX_CELL_ID: \"604...\", ...}\n).show()\n", - "line_number": 313, - "length_lines": 10, - "source_file": "api/tile-structure.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - 
"uses_rx_without_import": true, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# \u2705 Good: Access multiple fields in one select\ndf.select(\n f.col(\"tile.cellid\"),\n f.col(\"tile.metadata.driver\"),\n f.col(\"tile.metadata.extension\")\n)\n\n# \u274c Avoid: Multiple separate selects\ndf.select(f.col(\"tile.cellid\"))\ndf.select(f.col(\"tile.metadata.driver\"))\n", - "line_number": 350, - "length_lines": 11, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# \u2705 Good: Filter before expensive operations\ndf.filter(f.col(\"tile.metadata.driver\") == \"GTiff\") \\\n .select(rx.rst_clip(f.col(\"tile\"), boundary))\n\n# \u274c Avoid: Process then filter\ndf.select(rx.rst_clip(f.col(\"tile\"), boundary)) \\\n .filter(f.col(\"tile.metadata.driver\") == \"GTiff\")\n", - "line_number": 365, - "length_lines": 8, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# \u2705 Preferred: Use accessor functions\ndf.select(rx.rst_metadata(f.col(\"tile\")))\n\n# \u2705 Also fine: Direct field 
access\ndf.select(f.col(\"tile.metadata\"))\n", - "line_number": 379, - "length_lines": 6, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import when\n\ndf = spark.read.format(\"gdal\").load(\"/path/to/rasters/*\")\n\nprocessed = df.withColumn(\n \"result\",\n when(f.col(\"tile.metadata.driver\") == \"GTiff\", \n rx.rst_clip(f.col(\"tile\"), aoi))\n .when(f.col(\"tile.metadata.driver\") == \"NetCDF\",\n rx.rst_subdatasets(f.col(\"tile\")))\n .otherwise(f.col(\"tile\"))\n)\n", - "line_number": 395, - "length_lines": 13, - "source_file": "api/tile-structure.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Tessellate two rasters to same grid\nrasters1 = spark.read.format(\"gdal\").load(\"/path/set1/*\").select(\n f.explode(rx.rst_h3_tessellate(f.col(\"tile\"), f.lit(7))).alias(\"tile1\")\n)\n\nrasters2 = spark.read.format(\"gdal\").load(\"/path/set2/*\").select(\n f.explode(rx.rst_h3_tessellate(f.col(\"tile\"), f.lit(7))).alias(\"tile2\")\n)\n\n# Join on cellid\njoined = rasters1.join(\n rasters2,\n f.col(\"tile1.cellid\") == f.col(\"tile2.cellid\")\n)\n", - "line_number": 412, - "length_lines": 15, - 
"source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": true, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Extract binary rasters for download or external processing\nexport_df = df.select(\n f.col(\"path\"),\n f.col(\"tile.raster\").alias(\"raster_bytes\")\n).write.parquet(\"/Volumes//export/\")\n\n# Or save as files\nfor row in df.select(\"path\", \"tile.raster\").collect():\n with open(f\"/tmp/{row.path}\", \"wb\") as f:\n f.write(row[\"tile.raster\"])\n", - "line_number": 431, - "length_lines": 11, - "source_file": "api/tile-structure.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Pattern: File-based for initial read, binary for processing\ndf = spark.read.format(\"gdal\").load(\"/Volumes//rasters/*\") # File-based\n\n# Operations automatically handle deserialization\nprocessed = df.select(\n rx.rst_clip(f.col(\"tile\"), aoi) # Reads from file as needed\n)\n\n# Materialize to binary for repeated operations\ncached = processed.select(\n rx.rst_fromcontent(\n rx.rst_tobinary(f.col(\"tile\")), # Convert to binary\n f.col(\"tile.metadata.driver\")\n ).alias(\"tile\")\n).cache()\n", - "line_number": 455, - "length_lines": 16, - "source_file": "api/tile-structure.md", - "category": 
"EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check raster type\ndf.select(f.col(\"tile.raster\").cast(\"string\")).show(truncate=False)\n# If shows path \u2192 file-based\n# If shows [binary] \u2192 binary-based\n", - "line_number": 482, - "length_lines": 5, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Filter out non-tessellated tiles\ntessellated_only = df.filter(f.col(\"tile.cellid\").isNotNull())\n", - "line_number": 494, - "length_lines": 3, - "source_file": "api/tile-structure.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "st_legacyaswkb(legacyGeometry: Column): Column\n", - "line_number": 20, - "length_lines": 2, - "source_file": "api/vectorx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": true - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Read legacy Mosaic table\nlegacy_data = spark.table(\"legacy_mosaic_geometries\")\n\n# Convert to WKB\nwkb_data = legacy_data.select(\n \"feature_id\",\n \"properties\",\n vx.st_legacyaswkb(\"mosaic_geom\").alias(\"wkb_geom\")\n)\n\n# Convert WKB to Databricks GEOMETRY type\ngeometry_data = wkb_data.select(\n \"feature_id\",\n \"properties\",\n \"wkb_geom\",\n expr(\"st_geomfromwkb(wkb_geom)\").alias(\"geometry\")\n)\n\ngeometry_data.show()\n", - "line_number": 37, - "length_lines": 25, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.vectorx.{functions => vx}\nimport org.apache.spark.sql.functions._\n\nvx.register(spark)\n\nval legacyData = spark.table(\"legacy_mosaic_geometries\")\n\nval wkbData = legacyData.select(\n col(\"feature_id\"),\n col(\"properties\"),\n vx.st_legacyaswkb(col(\"mosaic_geom\")).alias(\"wkb_geom\")\n)\n\nval geometryData = wkbData.select(\n col(\"feature_id\"),\n col(\"properties\"),\n col(\"wkb_geom\"),\n 
expr(\"st_geomfromwkb(wkb_geom)\").alias(\"geometry\")\n)\n\ngeometryData.show()\n", - "line_number": 65, - "length_lines": 22, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- First register functions in Python/Scala\n-- Then use in SQL\n\n-- Convert legacy geometries\nSELECT\n feature_id,\n properties,\n gbx_st_legacyaswkb(mosaic_geom) as wkb_geom,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\nFROM legacy_mosaic_table;\n", - "line_number": 90, - "length_lines": 11, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Read legacy table\nlegacy_table = spark.table(\"catalog.schema.legacy_mosaic_features\")\n\n# Convert all geometries\nmigrated = legacy_table.select(\n \"*\", # Keep all original columns\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n).drop(\"mosaic_geom\") # Remove legacy column\n\n# Write to new table with GEOMETRY type\nmigrated.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.migrated_features\")\n\n# 
Verify migration\nverification = spark.sql(\"\"\"\n SELECT\n COUNT(*) as total_records,\n COUNT(geometry) as records_with_geometry,\n SUM(CASE WHEN st_isvalid(geometry) THEN 1 ELSE 0 END) as valid_geometries\n FROM catalog.schema.migrated_features\n\"\"\")\n\nverification.show()\n", - "line_number": 111, - "length_lines": 28, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr, col, when\n\nvx.register(spark)\n\n# Read and convert\nlegacy = spark.table(\"legacy_features\")\n\nconverted = legacy.select(\n \"*\",\n expr(\"gbx_st_legacyaswkb(mosaic_geom)\").alias(\"wkb_geom\"),\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n)\n\n# Add validation columns\nvalidated = converted.select(\n \"*\",\n expr(\"st_isvalid(geometry)\").alias(\"is_valid\"),\n expr(\"st_geometrytype(geometry)\").alias(\"geom_type\"),\n expr(\"st_srid(geometry)\").alias(\"srid\"),\n when(col(\"geometry\").isNull(), \"NULL_GEOMETRY\")\n .when(~expr(\"st_isvalid(geometry)\"), \"INVALID_GEOMETRY\")\n .otherwise(\"VALID\")\n .alias(\"validation_status\")\n)\n\n# Generate validation report\nvalidation_report = validated.groupBy(\"validation_status\", \"geom_type\").count()\nvalidation_report.show()\n\n# Save valid geometries only\nvalid_features = validated.filter(\"validation_status = 'VALID'\")\nvalid_features.write.mode(\"overwrite\").saveAsTable(\"validated_migrated_features\")\n\n# Save problematic records for review\ninvalid_features = 
validated.filter(\"validation_status != 'VALID'\")\ninvalid_features.write.mode(\"overwrite\").saveAsTable(\"migration_issues\")\n", - "line_number": 147, - "length_lines": 38, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Configuration\nbatch_size = 100000\nsource_table = \"legacy_mosaic_table\"\ntarget_table = \"migrated_table\"\n\n# Get total count\ntotal_count = spark.table(source_table).count()\nnum_batches = (total_count // batch_size) + 1\n\nprint(f\"Migrating {total_count} records in {num_batches} batches...\")\n\n# Process in batches\nfor batch_id in range(num_batches):\n offset = batch_id * batch_size\n \n print(f\"Processing batch {batch_id + 1}/{num_batches} (offset: {offset})\")\n \n # Read batch\n batch = spark.sql(f\"\"\"\n SELECT *\n FROM {source_table}\n LIMIT {batch_size} OFFSET {offset}\n \"\"\")\n \n # Convert\n converted = batch.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n ).drop(\"mosaic_geom\")\n \n # Append to target table\n if batch_id == 0:\n converted.write.mode(\"overwrite\").saveAsTable(target_table)\n else:\n converted.write.mode(\"append\").saveAsTable(target_table)\n \n print(f\"Batch {batch_id + 1} completed\")\n\nprint(\"Migration complete!\")\n\n# Optimize table\nspark.sql(f\"OPTIMIZE {target_table} ZORDER BY (geometry)\")\n", - "line_number": 193, - "length_lines": 48, - "source_file": "api/vectorx-functions.md", - 
"category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Migrate and analyze\nmigrated_with_analysis = spark.sql(\"\"\"\n SELECT\n feature_id,\n feature_name,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry,\n -- Spatial metrics\n st_area(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as area_sqm,\n st_length(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as perimeter_m,\n st_centroid(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as centroid,\n st_envelope(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as bbox,\n -- Geometry properties\n st_geometrytype(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as geom_type,\n st_numgeometries(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as num_parts\n FROM legacy_features\n\"\"\")\n\nmigrated_with_analysis.write.mode(\"overwrite\").saveAsTable(\"features_with_metrics\")\n", - "line_number": 249, - "length_lines": 24, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import 
expr\n\nvx.register(spark)\n\n# Migrate and transform to WGS84\ntransformed = spark.sql(\"\"\"\n SELECT\n feature_id,\n original_crs,\n -- Original geometry\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as original_geom,\n -- Transform to WGS84\n st_transform(\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)),\n original_crs,\n 'EPSG:4326'\n ) as wgs84_geom\n FROM legacy_features_with_crs\n\"\"\")\n\ntransformed.write.mode(\"overwrite\").saveAsTable(\"features_wgs84\")\n", - "line_number": 281, - "length_lines": 23, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr, col\n\nvx.register(spark)\n\n# Create comparison\ncomparison = spark.sql(\"\"\"\n SELECT\n feature_id,\n -- Legacy\n mosaic_geom as legacy_geom,\n -- Converted\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as new_geom,\n -- Comparison metrics\n st_area(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as new_area,\n st_isvalid(st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))) as is_valid\n FROM legacy_features\n LIMIT 100\n\"\"\")\n\ncomparison.show(truncate=False)\n", - "line_number": 312, - "length_lines": 22, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Migrate legacy data\nmigrated_parcels = spark.sql(\"\"\"\n SELECT\n parcel_id,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\n FROM legacy_parcels\n\"\"\")\n\n# Read zones (already in GEOMETRY format)\nzones = spark.sql(\"\"\"\n SELECT\n zone_id,\n zone_name,\n geometry\n FROM planning_zones\n\"\"\")\n\n# Spatial join\nparcels_in_zones = migrated_parcels.join(\n zones,\n expr(\"st_intersects(migrated_parcels.geometry, zones.geometry)\"),\n \"inner\"\n)\n\nparcels_in_zones.select(\n \"parcel_id\",\n \"zone_id\",\n \"zone_name\"\n).write.mode(\"overwrite\").saveAsTable(\"parcel_zone_assignments\")\n", - "line_number": 342, - "length_lines": 35, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n feature_id,\n geometry,\n st_area(geometry) as area,\n st_length(geometry) as length,\n st_perimeter(geometry) as perimeter,\n st_distance(geometry, st_geomfromtext('POINT(0 0)')) as distance_from_origin\nFROM migrated_features;\n", - "line_number": 387, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n a.id as feature_a,\n b.id as feature_b,\n st_intersects(a.geometry, b.geometry) as intersects,\n st_contains(a.geometry, b.geometry) as a_contains_b,\n st_within(a.geometry, b.geometry) as a_within_b,\n st_overlaps(a.geometry, b.geometry) as overlaps\nFROM migrated_features a\nCROSS JOIN migrated_features b\nWHERE a.id < b.id;\n", - "line_number": 400, - "length_lines": 11, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n feature_id,\n geometry as original,\n st_buffer(geometry, 100) as buffered_100m,\n st_centroid(geometry) as center,\n st_envelope(geometry) as bbox,\n st_convexhull(geometry) as convex_hull\nFROM migrated_features;\n", - "line_number": 415, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "SELECT\n region,\n st_union_agg(geometry) as merged_geometry,\n st_envelope_agg(geometry) as region_bbox,\n COUNT(*) as feature_count,\n 
SUM(st_area(geometry)) as total_area\nFROM migrated_features\nGROUP BY region;\n", - "line_number": 428, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Test migration on sample first\nsample = spark.table(\"legacy_table\").sample(0.01)\ntest_migrated = sample.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n)\ntest_migrated.show()\n", - "line_number": 445, - "length_lines": 8, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For large tables, use repartitioning\nlegacy_data = spark.table(\"large_legacy_table\")\nlegacy_data = legacy_data.repartition(200)\n\nmigrated = legacy_data.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n)\n", - "line_number": 457, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Backup before migration\nspark.sql(\"CREATE TABLE legacy_backup AS SELECT * FROM legacy_table\")\n\n# Proceed with migration\n# ...\n\n# Verify before dropping backup\nspark.sql(\"DROP TABLE IF EXISTS legacy_backup\")\n", - "line_number": 470, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Add migration metadata\nfrom pyspark.sql.functions import current_timestamp, lit\n\nmigrated = migrated.withColumn(\"migration_date\", current_timestamp())\nmigrated = migrated.withColumn(\"source_system\", lit(\"mosaic\"))\nmigrated = migrated.withColumn(\"migration_version\", lit(\"1.0\"))\n", - "line_number": 483, - "length_lines": 7, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Identify NULL geometries\nnulls = converted.filter(\"geometry IS NULL\")\nnull_count = nulls.count()\n\nif null_count > 0:\n print(f\"Warning: {null_count} NULL geometries found\")\n 
nulls.select(\"feature_id\").show()\n", - "line_number": 498, - "length_lines": 8, - "source_file": "api/vectorx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Find and fix invalid geometries\ninvalid = converted.filter(\"NOT st_isvalid(geometry)\")\n\n# Attempt to fix with buffer(0)\nfixed = invalid.select(\n \"*\",\n expr(\"st_buffer(geometry, 0)\").alias(\"fixed_geometry\")\n)\n", - "line_number": 510, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Optimize for large tables\nconverted.write \\\n .mode(\"overwrite\") \\\n .option(\"optimizeWrite\", \"true\") \\\n .saveAsTable(\"migrated_features\")\n\n# Z-order for spatial queries\nspark.sql(\"OPTIMIZE migrated_features ZORDER BY (geometry)\")\n", - "line_number": 523, - "length_lines": 9, - "source_file": "api/vectorx-functions.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\nrx.register(spark)\n\n# Read rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite\")\n\n# Build catalog\ncatalog = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bounds\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n\ncatalog.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 45, - "length_lines": 18, - "source_file": "examples/overview.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\n\nbx.register(spark)\n\n# Aggregate points by BNG cell\nresult = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as count,\n AVG(value) as avg_value\n FROM measurements\n WHERE country = 'GB'\n GROUP BY bng_cell\n\"\"\")\n\nresult.write.mode(\"overwrite\").saveAsTable(\"bng_aggregated\")\n", - "line_number": 67, - "length_lines": 17, - "source_file": "examples/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\n# Convert legacy geometries\nlegacy = spark.table(\"legacy_mosaic_table\")\n\nmigrated = legacy.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n).drop(\"mosaic_geom\")\n\nmigrated.write.mode(\"overwrite\").saveAsTable(\"migrated_table\")\n", - "line_number": 88, - "length_lines": 15, - "source_file": "examples/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "bash", - "code": "#!/bin/bash\n\nsudo add-apt-repository -y \"deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-backports main universe multiverse restricted\"\nsudo add-apt-repository -y \"deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-updates main universe multiverse restricted\"\nsudo add-apt-repository -y \"deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-security main multiverse restricted universe\"\nsudo add-apt-repository -y \"deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main multiverse restricted universe\"\n# - add ubuntugis PPA with GPG key\nsudo apt-get install -y software-properties-common\nsudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable\nsudo apt-get update -y\n\n# update to your actual volume path\nVOL_DIR=\"/Volumes/geospatial_docs/gdal_artifacts/noble/geobrix\"\n\n# install natives\n# https://gdal.org/en/stable/api/python/python_bindings.html\n# 
https://medium.com/@felipempfreelancer/install-gdal-for-python-on-ubuntu-24-04-9ed65dd39cac\nsudo apt-get -o DPkg::Lock::Timeout=-1 install -y unixodbc libcurl3-gnutls libsnappy-dev libopenjp2-7\nsudo apt-get -o DPkg::Lock::Timeout=-1 install -y libgdal-dev gdal-bin python3-gdal\n\n# pip install GDAL (match deps to DBR17.3)\npip install --upgrade pip setuptools wheel cython\npip install wheel setuptools==74.0.0 numpy==2.1.3\nexport GDAL_CONFIG=/usr/bin/gdal-config\npip install --no-cache-dir --force-reinstall GDAL[numpy]==\"$(gdal-config --version).*\"\n\n# copy JNI and JAR\ncp $VOL_DIR/libgdalalljni.so /usr/lib/libgdalalljni.so\ncp $VOL_DIR/geobrix-*-jar-with-dependencies.jar /databricks/jars\n", - "line_number": 45, - "length_lines": 30, - "source_file": "installation.md", - "category": "SHELL_COMMAND", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Register functions\nrx.register(spark)\n\n# List registered functions\nspark.sql(\"SHOW FUNCTIONS LIKE 'gbx_rst_*'\").show()\n", - "line_number": 123, - "length_lines": 8, - "source_file": "installation.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- List all GeoBrix functions\nSHOW FUNCTIONS LIKE 
'gbx_*';\n\n-- Describe a specific function\nDESCRIBE FUNCTION EXTENDED gbx_rst_boundingbox;\n", - "line_number": 135, - "length_lines": 6, - "source_file": "installation.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read with GeoBrix\ndf = spark.read.format(\"shapefile\").load(\"/data/shapes.shp\")\n\n# Convert to GEOMETRY type\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Now use built-in ST functions\nresult = geometry_df.select(\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\")\n)\n", - "line_number": 19, - "length_lines": 17, - "source_file": "limitations.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\n\n# Register BNG functions\nbx.register(spark)\n\n# Calculate cell area for TQ grid square at 1km precision\narea_df = spark.sql(\"\"\"\n SELECT gbx_bng_cellarea('TQ', 1000) as cell_area\n\"\"\")\narea_df.show()\n\n# Convert point to BNG cell\nfrom pyspark.sql.functions import expr\n\npoints_df = spark.createDataFrame([\n (51.5074, -0.1278), # London coordinates (lat, lon)\n], [\"lat\", 
\"lon\"])\n\nbng_cells = points_df.select(\n \"lat\",\n \"lon\",\n expr(\"gbx_bng_pointtocell(st_point(lon, lat), 1000)\").alias(\"bng_cell\")\n)\n\nbng_cells.show()\n", - "line_number": 58, - "length_lines": 26, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.gridx.bng.{functions => bx}\nimport org.apache.spark.sql.functions._\n\n// Register functions\nbx.register(spark)\n\n// Calculate cell area\nval areaDf = spark.sql(\"SELECT gbx_bng_cellarea('TQ', 1000) as area\")\nareaDf.show()\n\n// Create BNG cells from points\nval pointsDf = Seq(\n (51.5074, -0.1278)\n).toDF(\"lat\", \"lon\")\n\nval bngCells = pointsDf.select(\n col(\"lat\"),\n col(\"lon\"),\n expr(\"gbx_bng_pointtocell(st_point(lon, lat), 1000)\").alias(\"bng_cell\")\n)\n\nbngCells.show()\n", - "line_number": 88, - "length_lines": 23, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": true, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Register functions first in Python/Scala notebook\n\n-- Calculate area of a BNG grid cell\nSELECT gbx_bng_cellarea('TQ', 1000) as area_sqm;\n\n-- Convert points to BNG cells\nSELECT\n location_id,\n latitude,\n longitude,\n 
gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell_1km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 100) as bng_cell_100m\nFROM locations\nWHERE country = 'GB';\n\n-- Get cell boundaries\nSELECT\n bng_reference,\n gbx_bng_celltoboundary(bng_reference) as cell_boundary,\n gbx_bng_celltopoint(bng_reference) as cell_center\nFROM bng_cells;\n\n-- Generate k-ring around a location\nSELECT\n origin_cell,\n gbx_bng_cellkring(origin_cell, 2) as neighboring_cells\nFROM important_locations;\n", - "line_number": 115, - "length_lines": 28, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr, count, avg\nbx.register(spark)\n\n# Aggregate points by BNG cell\naggregated = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_cell,\n COUNT(*) as point_count,\n AVG(value) as avg_value\n FROM measurements\n WHERE country = 'GB'\n GROUP BY bng_cell\n\"\"\")\n\naggregated.show()\n", - "line_number": 151, - "length_lines": 17, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - 
"code": "from databricks.labs.gbx.gridx.bng import functions as bx\nbx.register(spark)\n\n# Index both datasets with BNG\nlocations_indexed = spark.sql(\"\"\"\n SELECT\n *,\n gbx_bng_pointtocell(st_point(lon, lat), 1000) as bng_cell\n FROM locations\n\"\"\")\n\npoi_indexed = spark.sql(\"\"\"\n SELECT\n *,\n gbx_bng_pointtocell(st_point(lon, lat), 1000) as bng_cell\n FROM points_of_interest\n\"\"\")\n\n# Join on BNG cell\njoined = locations_indexed.join(\n poi_indexed,\n on=\"bng_cell\",\n how=\"inner\"\n)\n\njoined.show()\n", - "line_number": 174, - "length_lines": 27, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr\nbx.register(spark)\n\n# Create multi-resolution grid\nmulti_res = spark.sql(\"\"\"\n SELECT\n location_id,\n latitude,\n longitude,\n gbx_bng_pointtocell(st_point(longitude, latitude), 10000) as bng_10km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 1000) as bng_1km,\n gbx_bng_pointtocell(st_point(longitude, latitude), 100) as bng_100m\n FROM uk_locations\n\"\"\")\n\n# Aggregate at different resolutions\nagg_10km = multi_res.groupBy(\"bng_10km\").count()\nagg_1km = multi_res.groupBy(\"bng_1km\").count()\nagg_100m = multi_res.groupBy(\"bng_100m\").count()\n", - "line_number": 207, - "length_lines": 21, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, 
- "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom pyspark.sql.functions import expr, explode\nbx.register(spark)\n\n# Get all cells within k-ring\nnearby_cells = spark.sql(\"\"\"\n SELECT\n location_id,\n center_bng_cell,\n gbx_bng_cellkring(center_bng_cell, 3) as nearby_cells\n FROM important_sites\n\"\"\")\n\n# Explode array to individual cells\nexpanded = nearby_cells.select(\n \"location_id\",\n \"center_bng_cell\",\n explode(\"nearby_cells\").alias(\"nearby_cell\")\n)\n\n# Join with data in those cells\nresults = expanded.join(\n data_by_cell,\n expanded.nearby_cell == data_by_cell.bng_cell,\n \"inner\"\n)\n", - "line_number": 234, - "length_lines": 27, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Partition data by BNG grid\ndf.repartition(\"bng_cell\").write.partitionBy(\"bng_cell\").saveAsTable(\"data_by_bng\")\n", - "line_number": 292, - "length_lines": 3, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": true, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Z-order by BNG cell for better performance\nspark.sql(\"\"\"\n OPTIMIZE uk_locations\n ZORDER BY (bng_cell)\n\"\"\")\n", - "line_number": 310, - "length_lines": 6, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom databricks.labs.gbx.rasterx import functions as rx\n\nbx.register(spark)\nrx.register(spark)\n\n# Aggregate raster values by BNG cells\nraster_by_bng = spark.sql(\"\"\"\n SELECT\n gbx_bng_pointtocell(centroid, 1000) as bng_cell,\n AVG(pixel_value) as avg_value\n FROM raster_pixels\n GROUP BY bng_cell\n\"\"\")\n", - "line_number": 324, - "length_lines": 15, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\nfrom databricks.labs.gbx.vectorx import functions as vx\n\nbx.register(spark)\nvx.register(spark)\n\n# Index vector data with BNG\nindexed_vectors = spark.sql(\"\"\"\n SELECT\n feature_id,\n gbx_st_legacyaswkb(geom) as geometry_wkb,\n 
gbx_bng_pointtocell(st_centroid(st_geomfromwkb(gbx_st_legacyaswkb(geom))), 1000) as bng_cell\n FROM vector_features\n\"\"\")\n", - "line_number": 345, - "length_lines": 15, - "source_file": "packages/gridx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Register all packages\nfrom databricks.labs.gbx.rasterx import functions as rx\nfrom databricks.labs.gbx.gridx.bng import functions as bx\nfrom databricks.labs.gbx.vectorx import functions as vx\n\nrx.register(spark)\nbx.register(spark)\nvx.register(spark)\n", - "line_number": 102, - "length_lines": 9, - "source_file": "packages/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Only register RasterX\nfrom databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n", - "line_number": 115, - "length_lines": 4, - "source_file": "packages/overview.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Register RasterX functions\nrx.register(spark)\n\n# Read raster files\nraster_df = spark.read.format(\"gdal\").load(\"/path/to/geotiffs\")\n\n# Get bounding box\nbbox_df = raster_df.select(\n rx.rst_boundingbox(\"tile\").alias(\"bbox\")\n)\n\n# Get metadata\nmetadata_df = raster_df.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"num_bands\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n\nmetadata_df.show()\n\n# Clip raster by geometry\nfrom pyspark.sql.functions import expr\n\nclipped_df = raster_df.select(\n rx.rst_clip(\"tile\", expr(\"st_geomfromtext('POLYGON((...))')\")).alias(\"clipped_tile\")\n)\n", - "line_number": 92, - "length_lines": 31, - "source_file": "packages/rasterx.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\n\n// Register functions\nrx.register(spark)\n\n// Read raster files\nval rasterDf = spark.read.format(\"gdal\").load(\"/path/to/geotiffs\")\n\n// Get metadata\nval metadataDf = rasterDf.select(\n col(\"path\"),\n rx.rst_width(col(\"tile\")).alias(\"width\"),\n rx.rst_height(col(\"tile\")).alias(\"height\"),\n rx.rst_numbands(col(\"tile\")).alias(\"num_bands\")\n)\n\nmetadataDf.show()\n", - "line_number": 127, - "length_lines": 18, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "high", - 
"analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Register functions first in Python/Scala notebook\n-- Then use in SQL\n\n-- Read raster data\nCREATE OR REPLACE TEMP VIEW rasters AS\nSELECT * FROM gdal.`/path/to/geotiffs`;\n\n-- Extract metadata\nSELECT\n path,\n gbx_rst_boundingbox(tile) as bbox,\n gbx_rst_width(tile) as width,\n gbx_rst_height(tile) as height,\n gbx_rst_numbands(tile) as num_bands,\n gbx_rst_metadata(tile) as metadata\nFROM rasters;\n\n-- Clip raster\nSELECT\n path,\n gbx_rst_clip(\n tile,\n st_geomfromtext('POLYGON((-122.5 37.5, -122.5 38.5, -121.5 38.5, -121.5 37.5, -122.5 37.5))')\n ) as clipped_tile\nFROM rasters;\n", - "line_number": 149, - "length_lines": 26, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read all rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite_imagery\")\n\n# Build catalog\ncatalog = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bounds\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"bands\"),\n rx.rst_srid(\"tile\").alias(\"crs\"),\n 
rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n\n# Save as Delta table\ncatalog.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 183, - "length_lines": 20, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql.functions import expr\nrx.register(spark)\n\n# Read rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/input\")\n\n# Process: clip, resample, extract statistics\nprocessed = rasters.select(\n \"path\",\n rx.rst_clip(\"tile\", expr(\"st_geomfromwkt(aoi_wkt)\")).alias(\"clipped\")\n).select(\n \"path\",\n rx.rst_resample(\"clipped\", 30, 30).alias(\"resampled\")\n).select(\n \"path\",\n \"resampled\",\n rx.rst_metadata(\"resampled\").alias(\"output_metadata\")\n)\n\n# Write results\nprocessed.write.mode(\"overwrite\").format(\"delta\").save(\"/data/processed\")\n", - "line_number": 207, - "length_lines": 23, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read multi-band raster (e.g., Landsat)\nlandsat = 
spark.read.format(\"gdal\").load(\"/data/landsat\")\n\n# Separate bands\nbands = landsat.select(\n \"path\",\n rx.rst_separatebands(\"tile\").alias(\"bands\")\n)\n\n# Extract individual bands\nred_band = bands.select(\n \"path\",\n rx.rst_getband(\"bands\", 3).alias(\"red\")\n)\n\nnir_band = bands.select(\n \"path\",\n rx.rst_getband(\"bands\", 4).alias(\"nir\")\n)\n", - "line_number": 234, - "length_lines": 23, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read large raster\nlarge_raster = spark.read.format(\"gdal\").option(\"sizeInMB\", \"16\").load(\"/data/large.tif\")\n\n# Tessellate into smaller tiles\ntiles = large_raster.select(\n rx.rst_tessellate(\"tile\", 256).alias(\"small_tile\")\n)\n\n# Process tiles in parallel\nprocessed_tiles = tiles.select(\n # Your processing here\n rx.rst_boundingbox(\"small_tile\").alias(\"tile_bounds\")\n)\n", - "line_number": 265, - "length_lines": 17, - "source_file": "packages/rasterx.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Save raster metadata to 
Delta\ncatalog.write.mode(\"overwrite\").format(\"delta\").saveAsTable(\"raster_metadata\")\n\n# Save binary raster data\nrasters.write.mode(\"overwrite\").format(\"delta\").save(\"/data/rasters_delta\")\n", - "line_number": 315, - "length_lines": 6, - "source_file": "packages/rasterx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Write to Unity Catalog\ncatalog.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.raster_catalog\")\n", - "line_number": 327, - "length_lines": 3, - "source_file": "packages/rasterx.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\n# Register VectorX functions\nvx.register(spark)\n\n# Convert legacy Mosaic geometries\nlegacy_table = spark.table(\"old_mosaic_features\")\n\nmigrated = legacy_table.select(\n \"*\",\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n).drop(\"mosaic_geom\")\n\n# Now use Databricks built-in functions\nresult = migrated.select(\n \"feature_id\",\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n 
expr(\"st_centroid(geometry)\").alias(\"centroid\")\n)\n\nresult.write.mode(\"overwrite\").saveAsTable(\"migrated_features\")\n", - "line_number": 37, - "length_lines": 24, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- In Python/Scala, register functions first\n-- Then use in SQL:\n\nCREATE OR REPLACE TABLE modern_features AS\nSELECT\n feature_id,\n properties,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\nFROM legacy_mosaic_table;\n", - "line_number": 69, - "length_lines": 10, - "source_file": "packages/vectorx.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Keep both for validation\ntransitional = legacy_table.select(\n \"*\",\n vx.st_legacyaswkb(\"mosaic_geom\").alias(\"wkb_geom\"),\n expr(\"st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom))\").alias(\"geometry\")\n)\n\n# Validate conversion\nvalidation = transitional.select(\n \"feature_id\",\n expr(\"st_isvalid(geometry)\").alias(\"is_valid\"),\n expr(\"st_geometrytype(geometry)\").alias(\"geom_type\")\n)\n", - "line_number": 85, - "length_lines": 14, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": 
{ - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": true, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Spatial joins\nSELECT a.id, b.id\nFROM migrated_features a\nJOIN other_features b\n ON st_intersects(a.geometry, b.geometry);\n\n-- Spatial aggregations \nSELECT \n region,\n st_union_agg(geometry) as merged_geometry,\n COUNT(*) as feature_count\nFROM migrated_features\nGROUP BY region;\n\n-- Spatial operations\nSELECT\n feature_id,\n st_buffer(geometry, 100) as buffered,\n st_envelope(geometry) as bbox,\n st_area(geometry) as area_sqm\nFROM migrated_features;\n", - "line_number": 105, - "length_lines": 22, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\n\n# Register\nvx.register(spark)\n\n# Use in DataFrames\ndf = df.select(vx.st_legacyaswkb(\"mosaic_column\"))\n", - "line_number": 151, - "length_lines": 8, - "source_file": "packages/vectorx.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - 
"has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.vectorx.{functions => vx}\n\n// Register\nvx.register(spark)\n\n// Use in DataFrames\nval df = df.select(vx.st_legacyaswkb(col(\"mosaic_column\")))\n", - "line_number": 163, - "length_lines": 8, - "source_file": "packages/vectorx.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Register in Python/Scala first, then:\nSELECT gbx_st_legacyaswkb(mosaic_geom) as wkb_geom\nFROM legacy_table;\n", - "line_number": 175, - "length_lines": 4, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Create backup of legacy table\nspark.sql(\"CREATE TABLE legacy_backup AS SELECT * FROM legacy_table\")\n", - "line_number": 185, - "length_lines": 3, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, 
- "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\nvx.register(spark)\n\nconverted = spark.sql(\"\"\"\n SELECT\n *,\n st_geomfromwkb(gbx_st_legacyaswkb(mosaic_geom)) as geometry\n FROM legacy_table\n\"\"\")\n", - "line_number": 192, - "length_lines": 12, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check for issues\nvalidation = converted.select(\n expr(\"COUNT(*) as total\"),\n expr(\"COUNT(geometry) as with_geometry\"),\n expr(\"SUM(CASE WHEN st_isvalid(geometry) THEN 1 ELSE 0 END) as valid\")\n)\nvalidation.show()\n", - "line_number": 208, - "length_lines": 8, - "source_file": "packages/vectorx.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Save migrated table\nconverted.drop(\"mosaic_geom\").write.mode(\"overwrite\").saveAsTable(\"migrated_table\")\n\n# Optimize\nspark.sql(\"OPTIMIZE migrated_table ZORDER BY (geometry)\")\n", - "line_number": 220, - "length_lines": 6, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\nfrom pyspark.sql.functions import expr\n\n# Register\nvx.register(spark)\n\n# Read legacy data\nlegacy = spark.table(\"production.legacy_mosaic_parcels\")\n\n# Convert and enrich\nmigrated = legacy.select(\n \"parcel_id\",\n \"owner\",\n \"land_use\",\n \"assessed_value\",\n vx.st_legacyaswkb(\"mosaic_geom\").alias(\"wkb_geom\")\n).select(\n \"*\",\n expr(\"st_geomfromwkb(wkb_geom)\").alias(\"geometry\")\n).select(\n \"parcel_id\",\n \"owner\", \n \"land_use\",\n \"assessed_value\",\n \"geometry\",\n # Add spatial metrics\n expr(\"st_area(geometry)\").alias(\"area_sqm\"),\n expr(\"st_perimeter(geometry)\").alias(\"perimeter_m\"),\n expr(\"st_centroid(geometry)\").alias(\"centroid\")\n)\n\n# Validate\nprint(f\"Total records: {migrated.count()}\")\nprint(f\"Valid geometries: {migrated.filter('st_isvalid(geometry)').count()}\")\n\n# Save\nmigrated.write.mode(\"overwrite\").saveAsTable(\"production.modern_parcels\")\n\n# Optimize for spatial queries\nspark.sql(\"OPTIMIZE production.modern_parcels ZORDER BY (geometry)\")\n\nprint(\"\u2705 Migration complete!\")\n", - "line_number": 256, - "length_lines": 43, - "source_file": "packages/vectorx.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - 
}, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Register RasterX functions with Spark\nrx.register(spark)\n", - "line_number": 23, - "length_lines": 5, - "source_file": "quick-start.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "import com.databricks.labs.gbx.rasterx.{functions => rx}\n\n// Register RasterX functions with Spark\nrx.register(spark)\n", - "line_number": 32, - "length_lines": 5, - "source_file": "quick-start.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- List all RasterX functions\nSHOW FUNCTIONS LIKE 'gbx_rst_*';\n\n-- List all GridX functions\nSHOW FUNCTIONS LIKE 'gbx_bng_*';\n\n-- List all VectorX functions\nSHOW FUNCTIONS LIKE 'gbx_st_*';\n\n-- List ALL GeoBrix functions\nSHOW FUNCTIONS LIKE 'gbx_*';\n", - "line_number": 45, - "length_lines": 12, - "source_file": "quick-start.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Get function description\nDESCRIBE FUNCTION EXTENDED gbx_rst_boundingbox;\n", - "line_number": 65, - "length_lines": 3, - "source_file": "quick-start.md", - "category": "EXAMPLE_ONLY", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoTiff raster files\ndf = (\n spark\n .read\n .format(\"gdal\")\n .load(\"/path/to/geotiff/files\")\n)\n\ndf.show()\n", - "line_number": 78, - "length_lines": 10, - "source_file": "quick-start.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read shapefiles\ndf = (\n spark\n .read\n .format(\"shapefile\")\n .load(\"/path/to/shapefiles\")\n)\n\ndf.show()\n", - "line_number": 92, - "length_lines": 10, - "source_file": "quick-start.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } 
- }, - { - "language": "python", - "code": "# Read GeoJSON files\ndf = (\n spark\n .read\n .format(\"geojson\")\n .option(\"multi\", \"false\")\n .load(\"/path/to/geojson/files\")\n)\n\ndf.show()\n", - "line_number": 106, - "length_lines": 11, - "source_file": "quick-start.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\n\n# Register functions\nrx.register(spark)\n\n# Read raster data\nraster_df = spark.read.format(\"gdal\").load(\"/path/to/rasters\")\n\n# Get bounding box of rasters\nresult = raster_df.select(\n rx.rst_boundingbox(\"tile\").alias(\"bbox\")\n)\n\nresult.show()\n", - "line_number": 121, - "length_lines": 15, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.gridx.bng import functions as bx\n\n# Register BNG functions\nbx.register(spark)\n\n# Calculate cell area\ndf = spark.sql(\"\"\"\n SELECT gbx_bng_cellarea('TQ', 1000) as area\n\"\"\")\n\ndf.show()\n", - "line_number": 140, - "length_lines": 12, - "source_file": "quick-start.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": 
false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.vectorx import functions as vx\n\n# Register VectorX functions\nvx.register(spark)\n\n# Convert legacy Mosaic geometry to WKB\ndf = spark.sql(\"\"\"\n SELECT gbx_st_legacyaswkb(legacy_geom) as wkb_geom\n FROM legacy_table\n\"\"\")\n\ndf.show()\n", - "line_number": 156, - "length_lines": 13, - "source_file": "quick-start.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read a shapefile and query it\nCREATE OR REPLACE TEMP VIEW my_shapes AS\nSELECT * FROM shapefile.`/path/to/shapefiles`;\n\n-- Use VectorX functions\nSELECT \n shape_id,\n gbx_st_legacyaswkb(geom_0) as geometry_wkb\nFROM my_shapes;\n\n-- Read GeoTiff and use RasterX functions\nCREATE OR REPLACE TEMP VIEW my_rasters AS\nSELECT * FROM gdal.`/path/to/geotiffs`;\n\nSELECT\n tile_id,\n gbx_rst_boundingbox(tile) as bbox,\n gbx_rst_metadata(tile) as metadata\nFROM my_rasters;\n", - "line_number": 175, - "length_lines": 20, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": 
false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read shapefile\ndf = spark.read.format(\"shapefile\").load(\"/path/to/shapefiles\")\n\n# Convert WKB to Databricks GEOMETRY type\nfrom pyspark.sql.functions import expr\n\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Now you can use built-in ST functions\nresult = geometry_df.select(\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_length(geometry)\").alias(\"length\")\n)\n\nresult.show()\n", - "line_number": 201, - "length_lines": 20, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read shapefile\nCREATE OR REPLACE TEMP VIEW shapes AS\nSELECT * FROM shapefile.`/path/to/shapefiles`;\n\n-- Convert to GEOMETRY type\nCREATE OR REPLACE TEMP VIEW shapes_with_geom AS\nSELECT \n *,\n st_geomfromwkb(geom_0) as geometry\nFROM shapes;\n\n-- Use built-in spatial functions\nSELECT\n shape_id,\n st_area(geometry) as area,\n st_centroid(geometry) as centroid,\n st_envelope(geometry) as envelope\nFROM shapes_with_geom;\n", - "line_number": 225, - "length_lines": 19, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# 1. Read data with GeoBrix reader\ndf = spark.read.format(\"shapefile\").load(\"/data/shapes\")\n\n# 2. Process with GeoBrix functions\nfrom databricks.labs.gbx.vectorx import functions as vx\nvx.register(spark)\n\n# 3. Convert to Databricks types for further analysis\nresult = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n", - "line_number": 260, - "length_lines": 13, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read different formats\ngeotiffs = spark.read.format(\"gdal\").load(\"/data/rasters\")\nshapefiles = spark.read.format(\"shapefile\").load(\"/data/vectors\")\ngeojson = spark.read.format(\"geojson\").load(\"/data/json\")\ngeopackage = spark.read.format(\"gpkg\").load(\"/data/packages\")\n", - "line_number": 277, - "length_lines": 6, - "source_file": "quick-start.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Process multiple rasters in parallel\nrasters = 
spark.read.format(\"gdal\").load(\"/data/many_rasters\")\n\nresults = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\")\n)\n\nresults.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 287, - "length_lines": 16, - "source_file": "quick-start.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read File Geodatabase\ndf = spark.read.format(\"file_gdb\").load(\"/path/to/database.gdb\")\n\ndf.show()\n", - "line_number": 28, - "length_lines": 5, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Read File Geodatabase\nval df = spark.read.format(\"file_gdb\").load(\"/path/to/database.gdb\")\n\ndf.show()\n", - "line_number": 37, - "length_lines": 5, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, 
- "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read File Geodatabase\nCREATE OR REPLACE TEMP VIEW features AS\nSELECT * FROM file_gdb.`/path/to/database.gdb`;\n\nSELECT * FROM features;\n", - "line_number": 46, - "length_lines": 6, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read specific feature class by name\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Buildings\") \\\n .load(\"/path/to/database.gdb\")\n\n# Read specific feature class by index (0-based)\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerN\", \"2\") \\\n .load(\"/path/to/database.gdb\")\n", - "line_number": 74, - "length_lines": 10, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read File Geodatabase (reads first/default feature class)\nbuildings = spark.read.format(\"file_gdb\").load(\"/data/city.gdb\")\n\n# Show attributes (note: geometry column is typically 'SHAPE')\nbuildings.select(\"OBJECTID\", \"NAME\", \"HEIGHT\", \"SHAPE_srid\").show()\n", - "line_number": 99, - 
"length_lines": 6, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# File Geodatabase with multiple feature classes\n# Read Buildings feature class\nbuildings = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Buildings\") \\\n .load(\"/data/city.gdb\")\n\n# Read Roads feature class\nroads = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Roads\") \\\n .load(\"/data/city.gdb\")\n\n# Read Parcels feature class\nparcels = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Parcels\") \\\n .load(\"/data/city.gdb\")\n\nbuildings.show()\nroads.show()\nparcels.show()\n", - "line_number": 109, - "length_lines": 20, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read File Geodatabase\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Boundaries\") \\\n .load(\"/data/admin.gdb\")\n\n# Convert to GEOMETRY type (SHAPE column)\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n)\n\n# Use Databricks ST functions\nresult = geometry_df.select(\n \"NAME\",\n \"geometry\",\n 
expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_centroid(geometry)\").alias(\"center\")\n)\n\nresult.show()\n", - "line_number": 133, - "length_lines": 23, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read from S3\ns3_gdb = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Features\") \\\n .load(\"s3://bucket/path/database.gdb\")\n\n# Read from Azure Blob Storage\nazure_gdb = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Features\") \\\n .load(\"wasbs://container@account.blob.core.windows.net/database.gdb\")\n\n# Read from Unity Catalog Volume\nvolume_gdb = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Features\") \\\n .load(\"/Volumes/catalog/schema/volume/database.gdb\")\n\ns3_gdb.show()\n", - "line_number": 160, - "length_lines": 17, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# File Geodatabase columns are case-insensitive\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Parcels\") \\\n .load(\"/data/cadastral.gdb\")\n\n# These all work (adjust to your schema)\ndf.select(\"OBJECTID\", \"shape\", 
\"SHAPE_srid\").show()\ndf.select(\"objectid\", \"SHAPE\", \"shape_srid\").show()\n", - "line_number": 181, - "length_lines": 9, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view from File Geodatabase\n-- Note: Need to read specific layer in Python first, then register\n\n-- In Python first:\n-- parcels = spark.read.format(\"file_gdb\").option(\"layerName\", \"Parcels\").load(\"/data/cadastral.gdb\")\n-- parcels.createOrReplaceTempView(\"parcels\")\n\n-- Then in SQL:\nSELECT\n OBJECTID,\n PARCEL_ID,\n OWNER,\n st_area(st_geomfromwkb(SHAPE)) as area_sqm,\n st_perimeter(st_geomfromwkb(SHAPE)) as perimeter_m\nFROM parcels\nWHERE st_area(st_geomfromwkb(SHAPE)) > 5000;\n", - "line_number": 196, - "length_lines": 17, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- In Python first, read and register views:\n-- buildings = spark.read.format(\"file_gdb\").option(\"layerName\", \"Buildings\").load(\"/data/city.gdb\")\n-- buildings.createOrReplaceTempView(\"buildings\")\n-- zones = spark.read.format(\"file_gdb\").option(\"layerName\", \"Zones\").load(\"/data/city.gdb\")\n-- 
zones.createOrReplaceTempView(\"zones\")\n\n-- Then in SQL:\nSELECT\n b.BUILDING_ID,\n b.BUILDING_NAME,\n z.ZONE_NAME,\n z.ZONE_TYPE\nFROM buildings b\nJOIN zones z\n ON st_contains(\n st_geomfromwkb(z.SHAPE),\n st_centroid(st_geomfromwkb(b.SHAPE))\n );\n", - "line_number": 217, - "length_lines": 19, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use GDAL/OGR command line tools\nimport subprocess\n\nresult = subprocess.run(\n ['ogrinfo', '-al', '-so', '/path/to/database.gdb'],\n capture_output=True,\n text=True\n)\nprint(result.stdout)\n", - "line_number": 242, - "length_lines": 10, - "source_file": "readers/filegdb.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Define feature class names (from ogrinfo or prior knowledge)\nfeature_classes = [\"Buildings\", \"Roads\", \"Parcels\", \"Zones\", \"Points_of_Interest\"]\n\n# Read each feature class\nlayers = {}\nfor fc_name in feature_classes:\n layers[fc_name] = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", fc_name) \\\n .load(\"/data/city.gdb\")\n\n# Access each layer\nbuildings_df = layers[\"Buildings\"]\nroads_df = layers[\"Roads\"]\n", - "line_number": 256, - 
"length_lines": 14, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read File Geodatabase feature class\ngdb_df = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Features\") \\\n .load(\"/data/source.gdb\")\n\n# Convert to GEOMETRY type\ndelta_df = gdb_df.select(\n \"*\",\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n).drop(\"SHAPE\", \"SHAPE_srid\", \"SHAPE_srid_proj\")\n\n# Write to Delta Lake\ndelta_df.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.features\")\n\n# Optimize\nspark.sql(\"\"\"\n OPTIMIZE catalog.schema.features\n ZORDER BY (geometry)\n\"\"\")\n", - "line_number": 276, - "length_lines": 22, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr, lit, col\n\n# Read multiple feature classes and combine\nbuildings = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Buildings\") \\\n .load(\"/data/city.gdb\") \\\n .withColumn(\"feature_type\", lit(\"building\"))\n\nroads = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Roads\") \\\n 
.load(\"/data/city.gdb\") \\\n .withColumn(\"feature_type\", lit(\"road\"))\n\n# Standardize schema\nbuildings_std = buildings.select(\n col(\"OBJECTID\").alias(\"feature_id\"),\n col(\"NAME\").alias(\"name\"),\n col(\"feature_type\"),\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n)\n\nroads_std = roads.select(\n col(\"OBJECTID\").alias(\"feature_id\"),\n col(\"NAME\").alias(\"name\"),\n col(\"feature_type\"),\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n)\n\n# Combine\nall_features = buildings_std.union(roads_std)\nall_features.write.mode(\"overwrite\").saveAsTable(\"combined_features\")\n", - "line_number": 302, - "length_lines": 32, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Feature classes to migrate\nfeature_classes = [\"Buildings\", \"Roads\", \"Parcels\", \"Zones\"]\n\n# Migrate each to Delta table\nfor fc in feature_classes:\n # Read from File GeoDatabase\n df = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", fc) \\\n .load(\"/data/legacy.gdb\")\n \n # Convert geometry\n converted = df.select(\n \"*\",\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n ).drop(\"SHAPE\", \"SHAPE_srid\", \"SHAPE_srid_proj\")\n \n # Write to Delta\n table_name = f\"migrated_{fc.lower()}\"\n converted.write.mode(\"overwrite\").saveAsTable(table_name)\n \n print(f\"Migrated {fc} to {table_name}\")\n", - "line_number": 338, - "length_lines": 24, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - 
"has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read File Geodatabase\nparcels = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"TaxParcels\") \\\n .load(\"/data/cadastral.gdb\")\n\n# Add geometry and spatial metrics\nanalyzed = parcels.select(\n \"*\",\n expr(\"st_geomfromwkb(SHAPE)\").alias(\"geometry\")\n).select(\n \"OBJECTID\",\n \"PARCEL_ID\",\n \"OWNER\",\n \"LAND_USE\",\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area_sqm\"),\n expr(\"st_perimeter(geometry)\").alias(\"perimeter_m\"),\n expr(\"st_centroid(geometry)\").alias(\"centroid\"),\n expr(\"st_envelope(geometry)\").alias(\"bbox\")\n)\n\n# Calculate derived metrics\nfrom pyspark.sql.functions import col\n\nanalyzed = analyzed.withColumn(\n \"shape_complexity\",\n col(\"perimeter_m\") * col(\"perimeter_m\") / col(\"area_sqm\")\n)\n\n# Save results\nanalyzed.write.mode(\"overwrite\").saveAsTable(\"parcel_analysis\")\n", - "line_number": 366, - "length_lines": 34, - "source_file": "readers/filegdb.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Always specify the feature class you need\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"specific_feature_class\") \\\n .load(\"/data/large.gdb\")\n", - 
"line_number": 406, - "length_lines": 5, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For large feature classes\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"large_features\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/data/database.gdb\")\n", - "line_number": 415, - "length_lines": 6, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Cache feature class data\nfc_df = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"important_features\") \\\n .load(\"/data/database.gdb\")\n\nfc_df.cache()\n", - "line_number": 425, - "length_lines": 7, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Repartition large feature classes\ndf = 
spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"large_features\") \\\n .load(\"/data/database.gdb\")\n\ndf = df.repartition(200)\n", - "line_number": 436, - "length_lines": 7, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check feature class name (case matters in option, but not in columns)\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Buildings\") \\\n .load(\"/data/city.gdb\")\n", - "line_number": 463, - "length_lines": 5, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# File Geodatabase columns are case-insensitive\n# Use consistent casing in your code\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"Features\") \\\n .load(\"/data/database.gdb\")\n\n# Either of these work:\ndf.select(\"OBJECTID\", \"SHAPE\").show()\ndf.select(\"objectid\", \"shape\").show()\n", - "line_number": 472, - "length_lines": 10, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - 
"has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Increase chunk size and repartition\ndf = spark.read.format(\"file_gdb\") \\\n .option(\"layerName\", \"large_features\") \\\n .option(\"chunkSize\", \"100000\") \\\n .load(\"/data/large.gdb\")\n\ndf = df.repartition(100)\n", - "line_number": 486, - "length_lines": 8, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Ensure the .gdb directory is accessible\n# File Geodatabase is a directory, not a single file\nfrom pyspark.dbutils import DBUtils\ndbutils = DBUtils(spark)\ndbutils.fs.ls(\"/path/to/database.gdb\")\n", - "line_number": 498, - "length_lines": 6, - "source_file": "readers/filegdb.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoTIFF files\ndf = spark.read.format(\"gdal\").load(\"/path/to/geotiffs\")\n\ndf.show()\n", - "line_number": 31, - "length_lines": 5, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - 
"analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Read GeoTIFF files\nval df = spark.read.format(\"gdal\").load(\"/path/to/geotiffs\")\n\ndf.show()\n", - "line_number": 40, - "length_lines": 5, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoTIFF files\nCREATE OR REPLACE TEMP VIEW rasters AS\nSELECT * FROM gdal.`/path/to/geotiffs`;\n\nSELECT * FROM rasters;\n", - "line_number": 49, - "length_lines": 6, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": true, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Increase split threshold for larger tiles\ndf = spark.read.format(\"gdal\").option(\"sizeInMB\", \"32\").load(\"/path/to/large_rasters\")\n\n# Decrease for smaller tiles (more parallelism)\ndf = spark.read.format(\"gdal\").option(\"sizeInMB\", \"8\").load(\"/path/to/rasters\")\n", - "line_number": 65, - 
"length_lines": 6, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read only files from 2024\ndf = spark.read.format(\"gdal\").option(\"filterRegex\", \".*_2024_.*\\\\.tif\").load(\"/data/all_years\")\n\n# Read specific satellite scenes\ndf = spark.read.format(\"gdal\").option(\"filterRegex\", \"LC08.*\\\\.tif\").load(\"/data/landsat\")\n", - "line_number": 79, - "length_lines": 6, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Explicitly use GeoTIFF driver\ndf = spark.read.format(\"gdal\").option(\"driverName\", \"GTiff\").load(\"/path/to/files\")\n\n# Use NetCDF driver\ndf = spark.read.format(\"gdal\").option(\"driverName\", \"NetCDF\").load(\"/path/to/netcdf\")\n\n# Use HDF5 driver\ndf = spark.read.format(\"gdal\").option(\"driverName\", \"HDF5\").load(\"/path/to/hdf\")\n", - "line_number": 93, - "length_lines": 9, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"gdal\").load(\"/data/sample.tif\")\ndf.printSchema()\n\n# Output:\n# root\n# |-- path: string (nullable = true)\n# |-- tile: binary (nullable = true)\n# |-- metadata: map (nullable = true)\n# | |-- key: string\n# | |-- value: string (valueContainsNull = true)\n", - "line_number": 115, - "length_lines": 11, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read a single GeoTIFF file\ndf = spark.read.format(\"gdal\").load(\"/data/elevation.tif\")\n\ndf.select(\"path\", \"metadata\").show(truncate=False)\n", - "line_number": 132, - "length_lines": 5, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read all GeoTIFF files in a directory\ndf = spark.read.format(\"gdal\").load(\"/data/satellite_imagery/\")\n\n# Check how many files were loaded\nprint(f\"Loaded {df.count()} raster tiles\")\n", - "line_number": 141, - "length_lines": 6, - "source_file": "readers/gdal.md", - "category": 
"EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read only specific files\ndf = spark.read.format(\"gdal\") \\\n .option(\"filterRegex\", \".*_B[0-9]+\\\\.tif\") \\\n .load(\"/data/landsat_scene\")\n\n# Show file paths\ndf.select(\"path\").distinct().show(truncate=False)\n", - "line_number": 151, - "length_lines": 8, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read large rasters with 64MB tiles\nlarge_rasters = spark.read.format(\"gdal\") \\\n .option(\"sizeInMB\", \"64\") \\\n .load(\"/data/large_elevation_models\")\n\nlarge_rasters.show()\n", - "line_number": 163, - "length_lines": 7, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read from S3\ns3_rasters = 
spark.read.format(\"gdal\").load(\"s3://bucket/path/to/rasters/*.tif\")\n\n# Read from Azure Blob Storage\nazure_rasters = spark.read.format(\"gdal\") \\\n .load(\"wasbs://container@account.blob.core.windows.net/rasters/\")\n\n# Read from Unity Catalog Volume\nvolume_rasters = spark.read.format(\"gdal\") \\\n .load(\"/Volumes/catalog/schema/volume_name/rasters/\")\n", - "line_number": 174, - "length_lines": 11, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\n# Extract raster properties\nmetadata = rasters.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"num_bands\"),\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n\nmetadata.show(truncate=False)\n", - "line_number": 191, - "length_lines": 18, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom 
pyspark.sql.functions import expr\nrx.register(spark)\n\n# Read and process\nrasters = spark.read.format(\"gdal\").load(\"/data/input\")\n\n# Clip to area of interest\nclipped = rasters.select(\n \"path\",\n rx.rst_clip(\n \"tile\",\n expr(\"st_geomfromtext('POLYGON((-122 37, -122 38, -121 38, -121 37, -122 37))')\")\n ).alias(\"clipped_tile\")\n)\n\nclipped.show()\n", - "line_number": 213, - "length_lines": 18, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read all rasters\nrasters = spark.read.format(\"gdal\").load(\"/data/satellite/\")\n\n# Build catalog\ncatalog = rasters.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bounds\"),\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_numbands(\"tile\").alias(\"bands\"),\n rx.rst_srid(\"tile\").alias(\"crs\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n)\n\n# Save as Delta table\ncatalog.write.mode(\"overwrite\").saveAsTable(\"raster_catalog\")\n", - "line_number": 235, - "length_lines": 20, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": 
"python", - "code": "# Small rasters (< 10MB): Use default or smaller\ndf = spark.read.format(\"gdal\").option(\"sizeInMB\", \"8\").load(\"/data/small_tiles\")\n\n# Medium rasters (10-100MB): Use default\ndf = spark.read.format(\"gdal\").load(\"/data/medium_rasters\")\n\n# Large rasters (> 100MB): Use larger split size\ndf = spark.read.format(\"gdal\").option(\"sizeInMB\", \"64\").load(\"/data/large_rasters\")\n", - "line_number": 263, - "length_lines": 9, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read and repartition for processing\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\n\n# Repartition to match cluster size\nnum_executors = spark.sparkContext.defaultParallelism\nrasters_partitioned = rasters.repartition(num_executors)\n\n# Process in parallel\nprocessed = rasters_partitioned.select(\n \"path\",\n # Your processing here\n)\n", - "line_number": 276, - "length_lines": 13, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Cache raster catalog for repeated queries\ncatalog = spark.read.format(\"gdal\").load(\"/data/rasters\")\ncatalog.cache()\n\n# Query catalog multiple 
times\nlandsat_scenes = catalog.filter(\"path like '%LC08%'\")\nsentinel_scenes = catalog.filter(\"path like '%S2%'\")\n", - "line_number": 293, - "length_lines": 8, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read all satellite imagery\nimagery = spark.read.format(\"gdal\") \\\n .option(\"filterRegex\", \".*\\\\.(tif|TIF)\") \\\n .load(\"/data/satellite/\")\n\n# Create searchable catalog\ncatalog = imagery.select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"footprint\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\"),\n rx.rst_numbands(\"tile\").alias(\"bands\")\n)\n\n# Extract acquisition date from metadata\nfrom pyspark.sql.functions import col\ncatalog = catalog.withColumn(\n \"acquisition_date\",\n col(\"metadata\").getItem(\"ACQUISITION_DATE\")\n)\n\ncatalog.write.mode(\"overwrite\").saveAsTable(\"satellite_catalog\")\n", - "line_number": 307, - "length_lines": 25, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read elevation 
models\ndems = spark.read.format(\"gdal\").load(\"/data/dems/\")\n\n# Calculate statistics\nstats = dems.select(\n \"path\",\n rx.rst_width(\"tile\").alias(\"width\"),\n rx.rst_height(\"tile\").alias(\"height\"),\n rx.rst_boundingbox(\"tile\").alias(\"extent\")\n)\n\nstats.show()\n", - "line_number": 336, - "length_lines": 16, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql.functions import regexp_extract\nrx.register(spark)\n\n# Read time series of rasters\ntime_series = spark.read.format(\"gdal\") \\\n .option(\"filterRegex\", \".*_NDVI_.*\\\\.tif\") \\\n .load(\"/data/time_series/\")\n\n# Extract date from filename\ntime_series = time_series.withColumn(\n \"date\",\n regexp_extract(\"path\", r\"(\\d{8})\", 1)\n)\n\n# Build temporal catalog\ncatalog = time_series.select(\n \"date\",\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"extent\")\n)\n\ncatalog.orderBy(\"date\").show()\n", - "line_number": 356, - "length_lines": 24, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Explicitly specify driver\ndf = spark.read.format(\"gdal\") 
\\\n .option(\"driverName\", \"GTiff\") \\\n .load(\"/path/to/files\")\n", - "line_number": 386, - "length_lines": 5, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Reduce split size for better parallelism\ndf = spark.read.format(\"gdal\") \\\n .option(\"sizeInMB\", \"8\") \\\n .load(\"/path/to/large/files\")\n", - "line_number": 395, - "length_lines": 5, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Process in smaller batches\ndf = spark.read.format(\"gdal\") \\\n .option(\"sizeInMB\", \"16\") \\\n .load(\"/path/to/files\")\n\n# Don't cache large raster data\ndf.select(\"path\", \"metadata\").cache() # Only cache metadata\n", - "line_number": 404, - "length_lines": 8, - "source_file": "readers/gdal.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } 
- }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nrx.register(spark)\n\n# Read -> Process -> Save pipeline\nresult = (\n spark.read.format(\"gdal\")\n .load(\"/data/input\")\n .select(\n \"path\",\n rx.rst_boundingbox(\"tile\").alias(\"bbox\"),\n rx.rst_metadata(\"tile\").alias(\"metadata\")\n )\n)\n\nresult.write.mode(\"overwrite\").saveAsTable(\"raster_metadata\")\n", - "line_number": 418, - "length_lines": 16, - "source_file": "readers/gdal.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoJSON files (uses GeoJSONSeq by default)\ndf = spark.read.format(\"geojson\").load(\"/path/to/geojson\")\n\ndf.show()\n", - "line_number": 26, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Read GeoJSON files\nval df = spark.read.format(\"geojson\").load(\"/path/to/geojson\")\n\ndf.show()\n", - "line_number": 35, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - 
"has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoJSON files\nCREATE OR REPLACE TEMP VIEW features AS\nSELECT * FROM geojson.`/path/to/geojson`;\n\nSELECT * FROM features;\n", - "line_number": 44, - "length_lines": 6, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read standard GeoJSON\ndf = spark.read.format(\"geojson\") \\\n .option(\"multi\", \"false\") \\\n .load(\"/path/to/standard.geojson\")\n\n# Read GeoJSONSeq (default)\ndf = spark.read.format(\"geojson\") \\\n .option(\"multi\", \"true\") \\\n .load(\"/path/to/features.geojsonl\")\n\n# Or simply (multi=true is default)\ndf = spark.read.format(\"geojson\").load(\"/path/to/features.geojsonl\")\n", - "line_number": 62, - "length_lines": 13, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"geojson\").load(\"/data/sample.geojson\")\ndf.printSchema()\n\n# Output:\n# root\n# |-- geom_0: binary (nullable 
= true)\n# |-- geom_0_srid: integer (nullable = true)\n# |-- geom_0_srid_proj: string (nullable = true)\n# |-- id: long (nullable = true)\n# |-- name: string (nullable = true)\n# |-- type: string (nullable = true)\n", - "line_number": 98, - "length_lines": 12, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read a FeatureCollection GeoJSON file\nfeatures = spark.read.format(\"geojson\") \\\n .option(\"multi\", \"false\") \\\n .load(\"/data/cities.geojson\")\n\nfeatures.select(\"name\", \"population\", \"geom_0_srid\").show()\n", - "line_number": 116, - "length_lines": 7, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read newline-delimited GeoJSON (better for large files)\nfeatures = spark.read.format(\"geojson\").load(\"/data/features.geojsonl\")\n\nprint(f\"Loaded {features.count()} features\")\n", - "line_number": 127, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - 
"uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read GeoJSON\ndf = spark.read.format(\"geojson\").load(\"/data/boundaries.geojson\")\n\n# Convert to GEOMETRY type\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Use Databricks ST functions\nresult = geometry_df.select(\n \"name\",\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_centroid(geometry)\").alias(\"center\")\n)\n\nresult.show()\n", - "line_number": 136, - "length_lines": 21, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# If you've saved API response as GeoJSON\napi_data = spark.read.format(\"geojson\") \\\n .option(\"multi\", \"false\") \\\n .load(\"/data/api_response.geojson\")\n\napi_data.show()\n", - "line_number": 161, - "length_lines": 7, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read all GeoJSON files in a directory\nall_features 
= spark.read.format(\"geojson\").load(\"/data/geojson_files/\")\n\n# Check distinct feature types\nall_features.groupBy(\"type\").count().show()\n", - "line_number": 172, - "length_lines": 6, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view from GeoJSON\nCREATE OR REPLACE TEMP VIEW places AS\nSELECT\n *,\n st_geomfromwkb(geom_0) as geometry\nFROM geojson.`/data/places.geojson`;\n\n-- Query with spatial functions\nSELECT\n name,\n category,\n st_area(geometry) as area,\n st_x(st_centroid(geometry)) as longitude,\n st_y(st_centroid(geometry)) as latitude\nFROM places\nWHERE category = 'park';\n", - "line_number": 184, - "length_lines": 17, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoJSON files\nCREATE OR REPLACE TEMP VIEW points AS\nSELECT *, st_geomfromwkb(geom_0) as geometry\nFROM geojson.`/data/points.geojson`;\n\nCREATE OR REPLACE TEMP VIEW polygons AS\nSELECT *, st_geomfromwkb(geom_0) as geometry\nFROM geojson.`/data/polygons.geojson`;\n\n-- Spatial join\nSELECT\n pt.name as point_name,\n poly.name as polygon_name\nFROM points pt\nJOIN polygons poly\n ON 
st_contains(poly.geometry, pt.geometry);\n", - "line_number": 205, - "length_lines": 17, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"geojson\").option(\"multi\", \"false\").load(\"/data/standard.geojson\")\n", - "line_number": 248, - "length_lines": 2, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"geojson\").load(\"/data/features.geojsonl\")\n# or explicitly:\ndf = spark.read.format(\"geojson\").option(\"multi\", \"true\").load(\"/data/features.geojsonl\")\n", - "line_number": 266, - "length_lines": 4, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read 
GeoJSON\ngeojson_df = spark.read.format(\"geojson\").load(\"/data/source.geojson\")\n\n# Convert to GEOMETRY type\ndelta_df = geojson_df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n).drop(\"geom_0\", \"geom_0_srid\", \"geom_0_srid_proj\")\n\n# Write to Delta Lake\ndelta_df.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.features\")\n", - "line_number": 282, - "length_lines": 14, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr, to_json, struct\n\n# Read shapefile\nshapefile_df = spark.read.format(\"shapefile\").load(\"/data/input.shp\")\n\n# Convert to GeoJSON structure\ngeojson_df = shapefile_df.select(\n to_json(struct(\n expr(\"'Feature'\").alias(\"type\"),\n expr(\"st_asgeojson(st_geomfromwkb(geom_0))\").alias(\"geometry\"),\n struct(\"*\").alias(\"properties\")\n )).alias(\"feature\")\n)\n\n# Write as newline-delimited GeoJSON\ngeojson_df.write.mode(\"overwrite\").text(\"/data/output.geojsonl\")\n", - "line_number": 300, - "length_lines": 17, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import 
expr\n\n# Read GeoJSON\nfeatures = spark.read.format(\"geojson\").load(\"/data/all_features.geojson\")\n\n# Convert to GEOMETRY\nwith_geom = features.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Filter by spatial criteria\nfiltered = with_geom.filter(\n expr(\"st_area(geometry) > 1000\")\n)\n\n# Save to Delta\nfiltered.write.mode(\"overwrite\").saveAsTable(\"large_features\")\n", - "line_number": 321, - "length_lines": 19, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import input_file_name\n\n# Read multiple GeoJSON files\nall_files = spark.read.format(\"geojson\").load(\"/data/geojson/*.geojson\")\n\n# Add source file tracking\nwith_source = all_files.withColumn(\"source\", input_file_name())\n\n# Aggregate by source\nsummary = with_source.groupBy(\"source\").count()\nsummary.show()\n", - "line_number": 344, - "length_lines": 12, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For large files, use GeoJSONSeq (default)\nlarge_df = spark.read.format(\"geojson\").load(\"/data/large.geojsonl\")\n\n# Better performance than standard 
GeoJSON\n", - "line_number": 362, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For files with many features\ndf = spark.read.format(\"geojson\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/data/many_features.geojsonl\")\n", - "line_number": 371, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Partition by attribute\ndf = spark.read.format(\"geojson\").load(\"/data/features.geojson\")\n\ndf.write.partitionBy(\"category\").saveAsTable(\"features_by_category\")\n", - "line_number": 380, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Try specifying the format explicitly\ndf = 
spark.read.format(\"geojson\") \\\n .option(\"multi\", \"false\") \\\n .load(\"/data/problematic.geojson\")\n", - "line_number": 391, - "length_lines": 5, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Convert to newline-delimited for better performance\n# Use standard tools to convert:\n# jq -c '.features[]' input.geojson > output.geojsonl\n\n# Then read with GeoBrix\ndf = spark.read.format(\"geojson\").load(\"/data/output.geojsonl\")\n", - "line_number": 400, - "length_lines": 7, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check schema\ndf = spark.read.format(\"geojson\").load(\"/data/features.geojson\")\ndf.printSchema()\n\n# Some properties may be nested\ndf.select(\"properties.*\").show()\n", - "line_number": 411, - "length_lines": 7, - "source_file": "readers/geojson.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read any format\ndf = spark.read.format(\"shapefile\").load(\"/data/input.shp\")\n\n# Convert geometry to GeoJSON\ngeojson_geom = df.select(\n \"*\",\n expr(\"st_asgeojson(st_geomfromwkb(geom_0))\").alias(\"geometry_json\")\n)\n\ngeojson_geom.show()\n", - "line_number": 424, - "length_lines": 13, - "source_file": "readers/geojson.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoPackage\ndf = spark.read.format(\"gpkg\").load(\"/path/to/file.gpkg\")\n\ndf.show()\n", - "line_number": 29, - "length_lines": 5, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Read GeoPackage\nval df = spark.read.format(\"gpkg\").load(\"/path/to/file.gpkg\")\n\ndf.show()\n", - "line_number": 38, - "length_lines": 5, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, 
- "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read GeoPackage\nCREATE OR REPLACE TEMP VIEW features AS\nSELECT * FROM gpkg.`/path/to/file.gpkg`;\n\nSELECT * FROM features;\n", - "line_number": 47, - "length_lines": 6, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read specific layer by name\ndf = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"buildings\") \\\n .load(\"/path/to/data.gpkg\")\n\n# Read specific layer by index (0-based)\ndf = spark.read.format(\"gpkg\") \\\n .option(\"layerN\", \"1\") \\\n .load(\"/path/to/data.gpkg\")\n", - "line_number": 73, - "length_lines": 10, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoPackage (reads first/default layer)\nbuildings = spark.read.format(\"gpkg\").load(\"/data/city.gpkg\")\n\n# Show attributes (note: geometry column may be named 'shape')\nbuildings.select(\"building_id\", \"name\", \"height\", \"shape_srid\").show()\n", - 
"line_number": 98, - "length_lines": 6, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# GeoPackage with multiple layers\n# Read buildings layer\nbuildings = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"buildings\") \\\n .load(\"/data/city.gpkg\")\n\n# Read roads layer\nroads = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"roads\") \\\n .load(\"/data/city.gpkg\")\n\n# Read parcels layer\nparcels = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"parcels\") \\\n .load(\"/data/city.gpkg\")\n\nbuildings.show()\nroads.show()\nparcels.show()\n", - "line_number": 108, - "length_lines": 20, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read GeoPackage\ndf = spark.read.format(\"gpkg\").load(\"/data/boundaries.gpkg\")\n\n# Convert to GEOMETRY type (check actual column name: might be 'shape' or 'geom_0')\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(shape)\").alias(\"geometry\")\n)\n\n# Use Databricks ST functions\nresult = geometry_df.select(\n \"name\",\n \"geometry\",\n 
expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_centroid(geometry)\").alias(\"center\")\n)\n\nresult.show()\n", - "line_number": 132, - "length_lines": 21, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read from S3\ns3_gpkg = spark.read.format(\"gpkg\").load(\"s3://bucket/path/data.gpkg\")\n\n# Read from Azure Blob Storage\nazure_gpkg = spark.read.format(\"gpkg\") \\\n .load(\"wasbs://container@account.blob.core.windows.net/data.gpkg\")\n\n# Read from Unity Catalog Volume\nvolume_gpkg = spark.read.format(\"gpkg\") \\\n .load(\"/Volumes/catalog/schema/volume/data.gpkg\")\n\ns3_gpkg.show()\n", - "line_number": 157, - "length_lines": 13, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read multiple GeoPackage files\nall_data = spark.read.format(\"gpkg\").load(\"/data/geopackages/*.gpkg\")\n\n# Show count from each file\nfrom pyspark.sql.functions import input_file_name\n\nwith_source = all_data.withColumn(\"source\", input_file_name())\nwith_source.groupBy(\"source\").count().show(truncate=False)\n", - "line_number": 174, - "length_lines": 9, - "source_file": 
"readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view from GeoPackage\nCREATE OR REPLACE TEMP VIEW parcels AS\nSELECT\n *,\n st_geomfromwkb(shape) as geometry\nFROM gpkg.`/data/parcels.gpkg`;\n\n-- Query with spatial functions\nSELECT\n parcel_id,\n owner,\n st_area(geometry) as area_sqm,\n st_perimeter(geometry) as perimeter_m\nFROM parcels\nWHERE st_area(geometry) > 5000;\n", - "line_number": 189, - "length_lines": 16, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read different layers from same GeoPackage\n-- Note: You need to specify layer in Python/Scala first, then register as views\n\n-- In Python first:\n-- buildings = spark.read.format(\"gpkg\").option(\"layerName\", \"buildings\").load(\"/data/city.gpkg\")\n-- buildings.createOrReplaceTempView(\"buildings\")\n-- \n-- roads = spark.read.format(\"gpkg\").option(\"layerName\", \"roads\").load(\"/data/city.gpkg\")\n-- roads.createOrReplaceTempView(\"roads\")\n\n-- Then in SQL:\nSELECT\n b.building_name,\n r.road_name,\n st_distance(\n st_geomfromwkb(b.shape),\n st_geomfromwkb(r.shape)\n ) as distance_m\nFROM buildings b\nCROSS JOIN roads 
r\nWHERE st_distance(\n st_geomfromwkb(b.shape),\n st_geomfromwkb(r.shape)\n) < 100;\n", - "line_number": 209, - "length_lines": 25, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use GDAL/OGR command line tools or Python GDAL bindings\n# In notebook:\nimport subprocess\n\nresult = subprocess.run(\n ['ogrinfo', '-al', '-so', '/path/to/data.gpkg'],\n capture_output=True,\n text=True\n)\nprint(result.stdout)\n", - "line_number": 240, - "length_lines": 11, - "source_file": "readers/geopackage.md", - "category": "NEEDS_REVIEW", - "confidence": "low", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Define layer names (you need to know them beforehand or query with ogrinfo)\nlayer_names = [\"buildings\", \"roads\", \"parcels\", \"zones\"]\n\n# Read each layer\nlayers = {}\nfor layer_name in layer_names:\n layers[layer_name] = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", layer_name) \\\n .load(\"/data/city.gpkg\")\n\n# Access each layer\nbuildings_df = layers[\"buildings\"]\nroads_df = layers[\"roads\"]\n", - "line_number": 255, - "length_lines": 14, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, 
- "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read GeoPackage layer\ngpkg_df = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"features\") \\\n .load(\"/data/source.gpkg\")\n\n# Convert to GEOMETRY type\ndelta_df = gpkg_df.select(\n \"*\",\n expr(\"st_geomfromwkb(shape)\").alias(\"geometry\")\n).drop(\"shape\", \"shape_srid\", \"shape_srid_proj\")\n\n# Write to Delta Lake\ndelta_df.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.features\")\n\n# Optimize\nspark.sql(\"\"\"\n OPTIMIZE catalog.schema.features\n ZORDER BY (geometry)\n\"\"\")\n", - "line_number": 275, - "length_lines": 22, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr, lit\n\n# Read multiple layers and combine\nbuildings = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"buildings\") \\\n .load(\"/data/city.gpkg\") \\\n .withColumn(\"layer_type\", lit(\"building\"))\n\nroads = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"roads\") \\\n .load(\"/data/city.gpkg\") \\\n .withColumn(\"layer_type\", lit(\"road\"))\n\n# Standardize schema and union\nfrom pyspark.sql.functions import col\n\nbuildings_std = buildings.select(\n 
col(\"building_id\").alias(\"feature_id\"),\n col(\"name\"),\n col(\"layer_type\"),\n expr(\"st_geomfromwkb(shape)\").alias(\"geometry\")\n)\n\nroads_std = roads.select(\n col(\"road_id\").alias(\"feature_id\"),\n col(\"name\"),\n col(\"layer_type\"),\n expr(\"st_geomfromwkb(shape)\").alias(\"geometry\")\n)\n\n# Combine\nall_features = buildings_std.union(roads_std)\nall_features.show()\n", - "line_number": 301, - "length_lines": 34, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read GeoPackage\nparcels = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"parcels\") \\\n .load(\"/data/city.gpkg\")\n\n# Add geometry and spatial attributes\nanalyzed = parcels.select(\n \"*\",\n expr(\"st_geomfromwkb(shape)\").alias(\"geometry\")\n).select(\n \"parcel_id\",\n \"owner\",\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_perimeter(geometry)\").alias(\"perimeter\"),\n expr(\"st_centroid(geometry)\").alias(\"centroid\"),\n expr(\"st_envelope(geometry)\").alias(\"bbox\")\n)\n\n# Save results\nanalyzed.write.mode(\"overwrite\").saveAsTable(\"parcel_analysis\")\n", - "line_number": 339, - "length_lines": 24, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Don't read default layer if you need a specific one\ndf = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"specific_layer\") \\\n .load(\"/data/large.gpkg\")\n", - "line_number": 369, - "length_lines": 5, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For large layers\ndf = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"large_layer\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/data/data.gpkg\")\n", - "line_number": 378, - "length_lines": 6, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Cache layer data\nlayer_df = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"important_layer\") \\\n .load(\"/data/data.gpkg\")\n\nlayer_df.cache()\n", - "line_number": 388, - "length_lines": 7, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - 
"has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check layer name spelling and case\ndf = spark.read.format(\"gpkg\") \\\n .option(\"layerName\", \"Buildings\") \\\n .load(\"/data/city.gpkg\")\n", - "line_number": 410, - "length_lines": 5, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# GeoPackage may use 'shape', 'geom', or 'geometry' as column name\ndf = spark.read.format(\"gpkg\").load(\"/data/file.gpkg\")\ndf.columns # Check actual column names\n\n# Adjust accordingly\nfrom pyspark.sql.functions import expr\ngeometry_df = df.select(\"*\", expr(\"st_geomfromwkb(shape)\").alias(\"geometry\"))\n", - "line_number": 419, - "length_lines": 8, - "source_file": "readers/geopackage.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Increase chunk size and repartition\ndf = spark.read.format(\"gpkg\") \\\n .option(\"chunkSize\", \"100000\") \\\n .load(\"/data/large.gpkg\")\n\ndf = df.repartition(100)\n", - "line_number": 
431, - "length_lines": 7, - "source_file": "readers/geopackage.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read with auto-detected driver\ndf = spark.read.format(\"ogr\").load(\"/path/to/vector/files\")\n\ndf.show()\n", - "line_number": 33, - "length_lines": 5, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Explicitly use Shapefile driver\ndf = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"ESRI Shapefile\") \\\n .load(\"/path/to/shapefiles\")\n\n# Use GeoJSON driver\ndf = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"GeoJSON\") \\\n .load(\"/path/to/geojson\")\n", - "line_number": 48, - "length_lines": 10, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Increase chunk size for 
large features\ndf = spark.read.format(\"ogr\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/path/to/large/file\")\n\n# Decrease for more parallelism\ndf = spark.read.format(\"ogr\") \\\n .option(\"chunkSize\", \"5000\") \\\n .load(\"/path/to/files\")\n", - "line_number": 66, - "length_lines": 10, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read second layer\ndf = spark.read.format(\"ogr\") \\\n .option(\"layerN\", \"1\") \\\n .load(\"/path/to/multi_layer.gpkg\")\n", - "line_number": 84, - "length_lines": 5, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read specific layer by name\ndf = spark.read.format(\"ogr\") \\\n .option(\"layerName\", \"buildings\") \\\n .load(\"/path/to/geodatabase.gdb\")\n", - "line_number": 97, - "length_lines": 5, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Output as WKT instead of WKB\ndf = spark.read.format(\"ogr\") \\\n .option(\"asWKB\", \"false\") \\\n .load(\"/path/to/vectors\")\n", - "line_number": 110, - "length_lines": 5, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read KML files\ndf = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"KML\") \\\n .load(\"/path/to/file.kml\")\n\ndf.select(\"Name\", \"Description\", \"geom_0_srid\").show()\n", - "line_number": 133, - "length_lines": 7, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read specific layer from GeoPackage\nbuildings = spark.read.format(\"ogr\") \\\n .option(\"layerName\", \"buildings\") \\\n .load(\"/path/to/data.gpkg\")\n\nroads = spark.read.format(\"ogr\") \\\n .option(\"layerName\", \"roads\") \\\n .load(\"/path/to/data.gpkg\")\n\nbuildings.show()\nroads.show()\n", - "line_number": 144, - "length_lines": 12, - "source_file": "readers/ogr.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - 
"has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read large shapefile with custom chunk size\nlarge_file = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"ESRI Shapefile\") \\\n .option(\"chunkSize\", \"100000\") \\\n .load(\"/path/to/large_shapefile.shp\")\n\nprint(f\"Loaded {large_file.count()} features\")\n", - "line_number": 160, - "length_lines": 8, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read and convert to GEOMETRY type\ndf = spark.read.format(\"ogr\").load(\"/path/to/vectors\")\n\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Use Databricks spatial functions\nresult = geometry_df.select(\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_centroid(geometry)\").alias(\"centroid\")\n)\n\nresult.show()\n", - "line_number": 172, - "length_lines": 19, - "source_file": "readers/ogr.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - 
"uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "kml_df = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"KML\") \\\n .load(\"/path/to/file.kml\")\n\nkml_df.show()\n", - "line_number": 197, - "length_lines": 6, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "gml_df = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"GML\") \\\n .load(\"/path/to/file.gml\")\n\ngml_df.show()\n", - "line_number": 207, - "length_lines": 6, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "csv_df = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"CSV\") \\\n .load(\"/path/to/points.csv\")\n\ncsv_df.show()\n", - "line_number": 217, - "length_lines": 6, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": 
true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# PostGIS (requires connection string)\npostgis_df = spark.read.format(\"ogr\") \\\n .option(\"driverName\", \"PostgreSQL\") \\\n .load(\"PG:host=localhost dbname=gis user=postgres\")\n", - "line_number": 227, - "length_lines": 5, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For files with small features\ndf = spark.read.format(\"ogr\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/path/to/points\")\n\n# For files with large/complex features\ndf = spark.read.format(\"ogr\") \\\n .option(\"chunkSize\", \"1000\") \\\n .load(\"/path/to/complex_polygons\")\n", - "line_number": 238, - "length_lines": 10, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read multiple files in parallel\ndf = spark.read.format(\"ogr\").load(\"/path/to/directory/*.shp\")\n\n# Repartition for processing\ndf.repartition(100).write.saveAsTable(\"processed_vectors\")\n", - "line_number": 252, - "length_lines": 6, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - 
"has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# These are equivalent:\ndf1 = spark.read.format(\"ogr\").option(\"driverName\", \"ESRI Shapefile\").load(\"/path\")\ndf2 = spark.read.format(\"shapefile\").load(\"/path\")\n\n# Named readers set appropriate defaults\ndf3 = spark.read.format(\"geojson\").load(\"/path\") # Sets GeoJSONSeq by default\n", - "line_number": 264, - "length_lines": 7, - "source_file": "readers/ogr.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# No explicit registration needed\ndf = spark.read.format(\"shapefile\").load(\"/path/to/files\")\n", - "line_number": 37, - "length_lines": 3, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Generic pattern\ndf = (\n spark\n .read\n .format(\"\")\n .option(\"\", \"\")\n .load(\"\")\n)\n\ndf.show()\n", - "line_number": 67, - "length_lines": 11, - 
"source_file": "readers/overview.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read GeoTIFF\nrasters = spark.read.format(\"gdal\").load(\"/data/geotiffs\")\n\n# Read Shapefile\nshapes = spark.read.format(\"shapefile\").load(\"/data/shapefiles\")\n\n# Read GeoJSON\ngeojson = spark.read.format(\"geojson\").load(\"/data/geojson\")\n\n# Read GeoPackage\ngpkg = spark.read.format(\"gpkg\").load(\"/data/packages\")\n", - "line_number": 82, - "length_lines": 12, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"shapefile\").load(\"/path/to/file.shp\")\n", - "line_number": 102, - "length_lines": 2, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Reads all compatible files in 
directory\ndf = spark.read.format(\"shapefile\").load(\"/path/to/directory\")\n", - "line_number": 108, - "length_lines": 3, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read specific files\ndf = spark.read.format(\"gdal\").load(\"/path/to/*.tif\")\n", - "line_number": 115, - "length_lines": 3, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# S3\ndf = spark.read.format(\"shapefile\").load(\"s3://bucket/path/to/shapefiles\")\n\n# Azure Blob Storage\ndf = spark.read.format(\"gdal\").load(\"wasbs://container@account.blob.core.windows.net/path\")\n\n# Google Cloud Storage\ndf = spark.read.format(\"geojson\").load(\"gs://bucket/path/to/geojson\")\n\n# Unity Catalog Volumes (Recommended for Databricks)\ndf = spark.read.format(\"shapefile\").load(\"/Volumes//shapefiles\")\n", - "line_number": 122, - "length_lines": 12, - "source_file": "readers/overview.md", - "category": "EXAMPLE_ONLY", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - 
"uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": true, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read with GeoBrix\ndf = spark.read.format(\"shapefile\").load(\"/path/to/shapefiles\")\n\n# Convert to GEOMETRY type\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Use Databricks ST functions\nresult = geometry_df.select(\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area\"),\n expr(\"st_centroid(geometry)\").alias(\"centroid\")\n)\n", - "line_number": 181, - "length_lines": 18, - "source_file": "readers/overview.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view with converted geometries\nCREATE OR REPLACE TEMP VIEW shapes AS\nSELECT\n *,\n st_geomfromwkb(geom_0) as geometry\nFROM shapefile.`/path/to/shapefiles`;\n\n-- Use Databricks ST functions\nSELECT\n feature_id,\n st_area(geometry) as area,\n st_length(geometry) as perimeter\nFROM shapes;\n", - "line_number": 203, - "length_lines": 14, - "source_file": "readers/overview.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": 
"python", - "code": "# For large rasters, adjust split size\nrasters = spark.read.format(\"gdal\").option(\"sizeInMB\", \"32\").load(\"/data/large_rasters\")\n\n# For large vector files, adjust chunk size\nvectors = spark.read.format(\"shapefile\").option(\"chunkSize\", \"50000\").load(\"/data/large_shapes\")\n", - "line_number": 223, - "length_lines": 6, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Use filterRegex to read only specific files\ndf = spark.read.format(\"gdal\").option(\"filterRegex\", \".*_2024_.*\\\\.tif\").load(\"/data/all_rasters\")\n", - "line_number": 233, - "length_lines": 3, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Partition by a spatial attribute\ndf = spark.read.format(\"shapefile\").load(\"/data/shapes\")\ndf.repartition(\"region\").write.partitionBy(\"region\").saveAsTable(\"shapes_by_region\")\n", - "line_number": 240, - "length_lines": 4, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, 
- "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Cache converted geometries\ndf = spark.read.format(\"geojson\").load(\"/data/boundaries\")\ngeometry_df = df.select(\"*\", expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\"))\ngeometry_df.cache()\n", - "line_number": 248, - "length_lines": 5, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read different formats\nrasters = spark.read.format(\"gdal\").load(\"/data/rasters\")\nshapefiles = spark.read.format(\"shapefile\").load(\"/data/vectors\")\ngeojson = spark.read.format(\"geojson\").load(\"/data/boundaries\")\n\n# Process each format\nraster_catalog = rasters.select(\"path\", \"metadata\")\nvector_features = shapefiles.select(\"*\", expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\"))\nboundaries = geojson.select(\"*\", expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\"))\n\n# Combine or join as needed\n", - "line_number": 259, - "length_lines": 12, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": false, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, 
- "is_short": false - } - }, - { - "language": "python", - "code": "# Check file paths\nfrom pyspark.dbutils import DBUtils\ndbutils = DBUtils(spark)\ndbutils.fs.ls(\"/path/to/check\")\n", - "line_number": 277, - "length_lines": 5, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Explicitly specify driver\ndf = spark.read.format(\"ogr\").option(\"driverName\", \"ESRI Shapefile\").load(\"/path\")\n", - "line_number": 286, - "length_lines": 3, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Adjust split size for rasters\ndf = spark.read.format(\"gdal\").option(\"sizeInMB\", \"8\").load(\"/path\")\n\n# Adjust chunk size for vectors\ndf = spark.read.format(\"shapefile\").option(\"chunkSize\", \"5000\").load(\"/path\")\n", - "line_number": 293, - "length_lines": 6, - "source_file": "readers/overview.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": false, - 
"uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Validate geometries after reading\nfrom pyspark.sql.functions import expr\n\ndf = spark.read.format(\"shapefile\").load(\"/path\")\nvalidated = df.select(\n \"*\",\n expr(\"st_isvalid(st_geomfromwkb(geom_0))\").alias(\"is_valid\")\n)\nvalidated.filter(\"is_valid = false\").show()\n", - "line_number": 303, - "length_lines": 10, - "source_file": "readers/overview.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read shapefile(s)\ndf = spark.read.format(\"shapefile\").load(\"/path/to/shapefiles\")\n\ndf.show()\n", - "line_number": 30, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "scala", - "code": "// Read shapefile(s)\nval df = spark.read.format(\"shapefile\").load(\"/path/to/shapefiles\")\n\ndf.show()\n", - "line_number": 39, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": 
true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read shapefiles\nCREATE OR REPLACE TEMP VIEW shapes AS\nSELECT * FROM shapefile.`/path/to/shapefiles`;\n\nSELECT * FROM shapes;\n", - "line_number": 48, - "length_lines": 6, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"shapefile\").load(\"/data/sample.shp\")\ndf.printSchema()\n\n# Output:\n# root\n# |-- geom_0: binary (nullable = true)\n# |-- geom_0_srid: integer (nullable = true)\n# |-- geom_0_srid_proj: string (nullable = true)\n# |-- ID: long (nullable = true)\n# |-- NAME: string (nullable = true)\n# |-- POPULATION: long (nullable = true)\n", - "line_number": 70, - "length_lines": 12, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Adjust chunk size for performance\ndf = spark.read.format(\"shapefile\") \\\n .option(\"chunkSize\", \"50000\") \\\n .load(\"/path/to/large/shapefile\")\n", - 
"line_number": 91, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read a single shapefile\nbuildings = spark.read.format(\"shapefile\").load(\"/data/buildings.shp\")\n\n# Show attributes\nbuildings.select(\"ID\", \"NAME\", \"HEIGHT\", \"geom_0_srid\").show()\n", - "line_number": 102, - "length_lines": 6, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read all shapefiles in a directory\nall_shapes = spark.read.format(\"shapefile\").load(\"/data/vector/\")\n\n# Show distinct file sources\nall_shapes.select(\"geom_0_srid\").distinct().show()\n", - "line_number": 112, - "length_lines": 6, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read 
from ZIP files\nzipped = spark.read.format(\"shapefile\").load(\"/data/shapes.zip\")\n\n# Or directory of ZIP files\nmulti_zipped = spark.read.format(\"shapefile\").load(\"/data/zipped_shapefiles/\")\n\nzipped.show()\n", - "line_number": 122, - "length_lines": 8, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read shapefile\ndf = spark.read.format(\"shapefile\").load(\"/data/boundaries.shp\")\n\n# Convert to GEOMETRY type\ngeometry_df = df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\n# Use Databricks ST functions\nresult = geometry_df.select(\n \"NAME\",\n \"geometry\",\n expr(\"st_area(geometry)\").alias(\"area_sqm\"),\n expr(\"st_length(geometry)\").alias(\"perimeter_m\"),\n expr(\"st_centroid(geometry)\").alias(\"center_point\")\n)\n\nresult.show()\n", - "line_number": 134, - "length_lines": 22, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read shapefiles\nparcels = spark.read.format(\"shapefile\").load(\"/data/parcels.shp\")\nzones = 
spark.read.format(\"shapefile\").load(\"/data/zones.shp\")\n\n# Convert to GEOMETRY\nparcels_geom = parcels.select(\n \"parcel_id\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"parcel_geom\")\n)\n\nzones_geom = zones.select(\n \"zone_name\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"zone_geom\")\n)\n\n# Spatial join\nresult = parcels_geom.join(\n zones_geom,\n expr(\"st_intersects(parcel_geom, zone_geom)\")\n)\n\nresult.select(\"parcel_id\", \"zone_name\").show()\n", - "line_number": 160, - "length_lines": 25, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Create view from shapefile\nCREATE OR REPLACE TEMP VIEW buildings AS\nSELECT\n *,\n st_geomfromwkb(geom_0) as geometry\nFROM shapefile.`/data/buildings.shp`;\n\n-- Query with spatial functions\nSELECT\n building_id,\n building_name,\n st_area(geometry) as floor_area,\n st_centroid(geometry) as center_point\nFROM buildings\nWHERE st_area(geometry) > 1000;\n", - "line_number": 191, - "length_lines": 16, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "sql", - "code": "-- Read both shapefiles\nCREATE OR REPLACE TEMP VIEW properties AS\nSELECT *, 
st_geomfromwkb(geom_0) as geometry\nFROM shapefile.`/data/properties.shp`;\n\nCREATE OR REPLACE TEMP VIEW flood_zones AS\nSELECT *, st_geomfromwkb(geom_0) as geometry\nFROM shapefile.`/data/flood_zones.shp`;\n\n-- Find properties in flood zones\nSELECT\n p.property_id,\n p.address,\n f.zone_level,\n f.risk_category\nFROM properties p\nJOIN flood_zones f\n ON st_intersects(p.geometry, f.geometry);\n", - "line_number": 211, - "length_lines": 19, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": false, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read shapefile\ndf = spark.read.format(\"shapefile\").load(\"/data/parcels.shp\")\n\n# Check SRID and projection\ndf.select(\"geom_0_srid\", \"geom_0_srid_proj\").distinct().show(truncate=False)\n", - "line_number": 236, - "length_lines": 6, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from databricks.labs.gbx.rasterx import functions as rx\nfrom pyspark.sql.functions import expr\nrx.register(spark)\n\n# Read shapefile\ndf = spark.read.format(\"shapefile\").load(\"/data/state_plane.shp\")\n\n# Convert to GEOMETRY and reproject\nreprojected = df.select(\n \"*\",\n 
expr(\"st_transform(st_geomfromwkb(geom_0), 'EPSG:' || geom_0_srid, 'EPSG:4326')\").alias(\"wgs84_geom\")\n)\n\nreprojected.show()\n", - "line_number": 246, - "length_lines": 15, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": true, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import expr\n\n# Read shapefile\nshapefile_df = spark.read.format(\"shapefile\").load(\"/data/source.shp\")\n\n# Convert to GEOMETRY type\ndelta_df = shapefile_df.select(\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n).drop(\"geom_0\", \"geom_0_srid\", \"geom_0_srid_proj\")\n\n# Write to Delta Lake\ndelta_df.write.mode(\"overwrite\").saveAsTable(\"catalog.schema.spatial_table\")\n\n# Optimize with Z-ordering\nspark.sql(\"\"\"\n OPTIMIZE catalog.schema.spatial_table\n ZORDER BY (geometry)\n\"\"\")\n", - "line_number": 267, - "length_lines": 20, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "from pyspark.sql.functions import input_file_name, expr\n\n# Read all shapefiles in directory\nall_files = spark.read.format(\"shapefile\").load(\"/data/shapefiles/*.shp\")\n\n# Add source filename\nwith_source = 
all_files.withColumn(\"source_file\", input_file_name())\n\n# Process each file's features\nprocessed = with_source.select(\n \"source_file\",\n \"*\",\n expr(\"st_geomfromwkb(geom_0)\").alias(\"geometry\")\n)\n\nprocessed.show()\n", - "line_number": 291, - "length_lines": 17, - "source_file": "readers/shapefile.md", - "category": "SELF_CONTAINED", - "confidence": "medium", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read shapefile\nbuildings = spark.read.format(\"shapefile\").load(\"/data/buildings.shp\")\n\n# Filter by attributes\nhigh_rise = buildings.filter(\"HEIGHT > 100\")\ncommercial = buildings.filter(\"USE_TYPE = 'Commercial'\")\n\n# Save filtered results\nhigh_rise.write.mode(\"overwrite\").saveAsTable(\"high_rise_buildings\")\ncommercial.write.mode(\"overwrite\").saveAsTable(\"commercial_buildings\")\n", - "line_number": 312, - "length_lines": 11, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# For large shapefiles\nlarge_df = spark.read.format(\"shapefile\") \\\n .option(\"chunkSize\", \"100000\") \\\n .load(\"/data/large_shapefile.shp\")\n", - "line_number": 329, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": 
"high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Partition by attribute for better query performance\ndf = spark.read.format(\"shapefile\").load(\"/data/parcels.shp\")\n\ndf.write.partitionBy(\"county_code\").saveAsTable(\"parcels_by_county\")\n", - "line_number": 338, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Cache converted geometries\nshapes = spark.read.format(\"shapefile\").load(\"/data/boundaries.shp\")\nshapes_cached = shapes.cache()\n\n# Query multiple times\nresult1 = shapes_cached.filter(\"AREA > 1000\")\nresult2 = shapes_cached.filter(\"TYPE = 'Park'\")\n", - "line_number": 347, - "length_lines": 8, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Check files\nfrom pyspark.dbutils import DBUtils\ndbutils = 
DBUtils(spark)\ndbutils.fs.ls(\"/data/shapefile_folder/\")\n", - "line_number": 363, - "length_lines": 5, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": true, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": false, - "has_rx_registration": false, - "uses_existing_df": null, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Read with encoding awareness\ndf = spark.read.format(\"shapefile\").load(\"/data/international.shp\")\n\n# Check for encoding issues in attributes\ndf.select(\"NAME\").show(truncate=False)\n", - "line_number": 374, - "length_lines": 6, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - }, - { - "language": "python", - "code": "# Split large shapefile reading\ndf = spark.read.format(\"shapefile\") \\\n .option(\"chunkSize\", \"10000\") \\\n .load(\"/data/large.shp\")\n\n# Repartition and cache\ndf.repartition(100).cache()\n", - "line_number": 384, - "length_lines": 8, - "source_file": "readers/shapefile.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - 
"is_short": false - } - }, - { - "language": "python", - "code": "df = spark.read.format(\"gdal\").load(\"/data/large.tif\")\ndf.count()\n", - "line_number": 68, - "length_lines": 3, - "source_file": "support.md", - "category": "EXTENDS_SETUP", - "confidence": "high", - "analysis": { - "has_imports": false, - "has_spark_creation": false, - "has_data_creation": false, - "has_file_reading": true, - "has_rx_registration": false, - "uses_existing_df": false, - "uses_existing_rasters": null, - "uses_rx_without_import": false, - "uses_spark_without_creation": true, - "has_placeholders": false, - "is_short": false - } - } - ] + "file_summaries": [], + "snippets": [] } \ No newline at end of file diff --git a/pom.xml b/pom.xml index 9bb3c0d..b2cfaf3 100644 --- a/pom.xml +++ b/pom.xml @@ -19,11 +19,29 @@ 17 17 + + UTF-8 + UTF-8 UTF-8 false 80 + + -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005 @@ -278,10 +296,25 @@ -XX:ReservedCodeCacheSize=512m -XX:InitialCodeCacheSize=256m -XX:MaxMetaspaceSize=1024m + -Dgbx.suites=${suites} ${agentLib} - - FS + + FSD + + com.databricks.labs.gbx.util.ProgressReporter @@ -340,6 +373,36 @@ + + + org.apache.maven.plugins + maven-compiler-plugin + 3.15.0 + + 17 + + + + org.apache.maven.plugins + maven-install-plugin + 3.1.4 + + + org.apache.maven.plugins + maven-deploy-plugin + 3.1.4 + @@ -490,9 +553,17 @@ - true true - true + true true true diff --git a/python/geobrix/pyproject.toml b/python/geobrix/pyproject.toml index dd8afe7..1eba140 100644 --- a/python/geobrix/pyproject.toml +++ b/python/geobrix/pyproject.toml @@ -10,9 +10,15 @@ description = "GeoBriX: A high-performance spatial processing library for Apache readme = "README.md" requires-python = ">=3.10, <3.13" -license = { text = "Databricks License" } +# PEP 639 SPDX expression form. setuptools >=77 deprecated the table form +# (`{ text = ... }` / `{ file = ... }`) and warns on every build by 2027-Feb-18. 
+# Proprietary licenses use the `LicenseRef-` namespace per PEP 639; +# canonical license text lives in ../../LICENSE at the repo root. +license = "LicenseRef-Databricks-Proprietary" classifiers = [ - "License :: Other/Proprietary License", + # PEP 639: License classifiers are forbidden once `project.license` is a SPDX + # expression — setuptools >=77 raises InvalidConfigError on overlap. + # The license is expressed above via `license = "LicenseRef-Databricks-Proprietary"`. "Topic :: Scientific/Engineering :: GIS", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", diff --git a/python/geobrix/requirements-ci.in b/python/geobrix/requirements-ci.in index 0936ed3..d26a9cd 100644 --- a/python/geobrix/requirements-ci.in +++ b/python/geobrix/requirements-ci.in @@ -15,7 +15,7 @@ numpy==2.1.3 pyspark==4.0.0 pytest==8.4.2 pytest-cov==7.1.0 -setuptools==74.0.0 +setuptools==80.9.0 # >= 77.0.0 required to parse PEP 639 SPDX license strings (GDAL 3.11+ sdist's `license = "MIT"`); 74.0.0 fails GDAL build with `project.license must be valid exactly by one definition` wheel==0.45.1 # from pyproject.toml [dev] extra (lint tooling) diff --git a/python/geobrix/requirements-ci.txt b/python/geobrix/requirements-ci.txt index da108c0..8b5ce36 100644 --- a/python/geobrix/requirements-ci.txt +++ b/python/geobrix/requirements-ci.txt @@ -391,9 +391,9 @@ pytokens==0.4.1 \ --hash=sha256:ee44d0f85b803321710f9239f335aafe16553b39106384cef8e6de40cb4ef2f6 \ --hash=sha256:f66a6bbe741bd431f6d741e617e0f39ec7257ca1f89089593479347cc4d13324 # via black -setuptools==74.0.0 \ - --hash=sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f \ - --hash=sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e +setuptools==80.9.0 \ + --hash=sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922 \ + --hash=sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c # via -r requirements-ci.in wheel==0.45.1 \ 
--hash=sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729 \ diff --git a/python/geobrix/src/databricks/labs/gbx/gridx/bng/functions.py b/python/geobrix/src/databricks/labs/gbx/gridx/bng/functions.py index bf6ae7b..e88b069 100644 --- a/python/geobrix/src/databricks/labs/gbx/gridx/bng/functions.py +++ b/python/geobrix/src/databricks/labs/gbx/gridx/bng/functions.py @@ -160,9 +160,7 @@ def bng_euclideandistance(cell_id1: ColLike, cell_id2: ColLike) -> Column: Returns: Column of long (distance in metres). """ - return f.call_function( - "gbx_bng_euclideandistance", _col(cell_id1), _col(cell_id2) - ) + return f.call_function("gbx_bng_euclideandistance", _col(cell_id1), _col(cell_id2)) def bng_geomkloop(geom: ColLike, resolution: ColLike, k: ColLike) -> Column: @@ -176,9 +174,7 @@ def bng_geomkloop(geom: ColLike, resolution: ColLike, k: ColLike) -> Column: Returns: Column of array of BNG cell identifiers. """ - return f.call_function( - "gbx_bng_geomkloop", _col(geom), _col(resolution), _col(k) - ) + return f.call_function("gbx_bng_geomkloop", _col(geom), _col(resolution), _col(k)) def bng_geomkring(geom: ColLike, resolution: ColLike, k: ColLike) -> Column: @@ -192,9 +188,7 @@ def bng_geomkring(geom: ColLike, resolution: ColLike, k: ColLike) -> Column: Returns: Column of array of BNG cell identifiers. 
""" - return f.call_function( - "gbx_bng_geomkring", _col(geom), _col(resolution), _col(k) - ) + return f.call_function("gbx_bng_geomkring", _col(geom), _col(resolution), _col(k)) def bng_kloop(cell_id: ColLike, k: ColLike) -> Column: diff --git a/python/geobrix/src/databricks/labs/gbx/rasterx/functions.py b/python/geobrix/src/databricks/labs/gbx/rasterx/functions.py index cb091d4..4b0d138 100644 --- a/python/geobrix/src/databricks/labs/gbx/rasterx/functions.py +++ b/python/geobrix/src/databricks/labs/gbx/rasterx/functions.py @@ -412,9 +412,7 @@ def rst_combineavg_agg(tile: ColLike) -> Column: return f.call_function("gbx_rst_combineavg_agg", _col(tile)) -def rst_derivedband_agg( - tile: ColLike, pyfunc: ColLike, func_name: ColLike -) -> Column: +def rst_derivedband_agg(tile: ColLike, pyfunc: ColLike, func_name: ColLike) -> Column: """Aggregate tiles and apply a Python UDF per band (use with groupBy). Args: @@ -576,9 +574,7 @@ def rst_h3_rastertogridavg(tile: ColLike, resolution: ColLike) -> Column: Returns: Column of grid values (e.g. struct with H3 index and avg). """ - return f.call_function( - "gbx_rst_h3_rastertogridavg", _col(tile), _col(resolution) - ) + return f.call_function("gbx_rst_h3_rastertogridavg", _col(tile), _col(resolution)) def rst_h3_rastertogridcount(tile: ColLike, resolution: ColLike) -> Column: @@ -591,9 +587,7 @@ def rst_h3_rastertogridcount(tile: ColLike, resolution: ColLike) -> Column: Returns: Column of grid values (e.g. struct with H3 index and count). """ - return f.call_function( - "gbx_rst_h3_rastertogridcount", _col(tile), _col(resolution) - ) + return f.call_function("gbx_rst_h3_rastertogridcount", _col(tile), _col(resolution)) def rst_h3_rastertogridmax(tile: ColLike, resolution: ColLike) -> Column: @@ -606,9 +600,7 @@ def rst_h3_rastertogridmax(tile: ColLike, resolution: ColLike) -> Column: Returns: Column of grid values (e.g. struct with H3 index and max). 
""" - return f.call_function( - "gbx_rst_h3_rastertogridmax", _col(tile), _col(resolution) - ) + return f.call_function("gbx_rst_h3_rastertogridmax", _col(tile), _col(resolution)) def rst_h3_rastertogridmin(tile: ColLike, resolution: ColLike) -> Column: @@ -621,9 +613,7 @@ def rst_h3_rastertogridmin(tile: ColLike, resolution: ColLike) -> Column: Returns: Column of grid values (e.g. struct with H3 index and min). """ - return f.call_function( - "gbx_rst_h3_rastertogridmin", _col(tile), _col(resolution) - ) + return f.call_function("gbx_rst_h3_rastertogridmin", _col(tile), _col(resolution)) def rst_h3_rastertogridmedian(tile: ColLike, resolution: ColLike) -> Column: @@ -698,9 +688,7 @@ def rst_convolve(tile: ColLike, kernel: ColLike) -> Column: return f.call_function("gbx_rst_convolve", _col(tile), _col(kernel)) -def rst_derivedband( - tile_expr: ColLike, pyfunc: ColLike, func_name: ColLike -) -> Column: +def rst_derivedband(tile_expr: ColLike, pyfunc: ColLike, func_name: ColLike) -> Column: """Apply a Python UDF to each pixel (or band) to produce a derived band. Args: @@ -792,14 +780,10 @@ def rst_ndvi(tile: ColLike, red_band: ColLike, nir_band: ColLike) -> Column: Returns: Column of raster tile (single-band NDVI). """ - return f.call_function( - "gbx_rst_ndvi", _col(tile), _col(red_band), _col(nir_band) - ) + return f.call_function("gbx_rst_ndvi", _col(tile), _col(red_band), _col(nir_band)) -def rst_rastertoworldcoord( - tile: ColLike, pixel_x: ColLike, pixel_y: ColLike -) -> Column: +def rst_rastertoworldcoord(tile: ColLike, pixel_x: ColLike, pixel_y: ColLike) -> Column: """Convert pixel (x, y) to world (x, y) in the CRS of the raster. 
Args: @@ -889,9 +873,7 @@ def rst_updatetype(tile: ColLike, new_type: ColLike) -> Column: return f.call_function("gbx_rst_updatetype", _col(tile), _col(new_type)) -def rst_worldtorastercoord( - tile: ColLike, world_x: ColLike, world_y: ColLike -) -> Column: +def rst_worldtorastercoord(tile: ColLike, world_x: ColLike, world_y: ColLike) -> Column: """Convert world (x, y) to pixel (x, y) in the raster. Args: diff --git a/python/geobrix/test/gridx/test_bng_functions.py b/python/geobrix/test/gridx/test_bng_functions.py index 2984fc8..5fb16a4 100644 --- a/python/geobrix/test/gridx/test_bng_functions.py +++ b/python/geobrix/test/gridx/test_bng_functions.py @@ -640,10 +640,12 @@ def test_bng_scalar_literal_args(spark, bng_registered): """ polygon_wkt = "POLYGON ((530000 180000, 530500 180000, 530500 180500, 530000 180500, 530000 180000))" df = spark.range(1).select( - bng_registered.bng_pointascell(f.lit("POINT (400000 400000)"), 1).alias("cell_int_res"), - bng_registered.bng_pointascell(f.lit("POINT (400000 400000)"), f.lit("1km")).alias( - "cell_str_res" + bng_registered.bng_pointascell(f.lit("POINT (400000 400000)"), 1).alias( + "cell_int_res" ), + bng_registered.bng_pointascell( + f.lit("POINT (400000 400000)"), f.lit("1km") + ).alias("cell_str_res"), bng_registered.bng_polyfill(f.lit(polygon_wkt), 1).alias("cells_int_res"), bng_registered.bng_kloop(f.lit("TQ388791"), 1).alias("kloop_int_k"), ) diff --git a/python/geobrix/test/sample/test_sample_bundle.py b/python/geobrix/test/sample/test_sample_bundle.py index 888afad..432c87a 100644 --- a/python/geobrix/test/sample/test_sample_bundle.py +++ b/python/geobrix/test/sample/test_sample_bundle.py @@ -14,16 +14,14 @@ import pytest -# Public API from package +# Public API + the _bundle module itself (needed for internal-helper coverage tests below). 
#!/bin/bash
#
# Diagnose Category B PGP failures: "PGP Signature INVALID" on .pom files
# only (never .jar). Suspected cause: db-maven JFrog mirror mutating POM
# bytes (line endings, encoding, metadata) between Maven Central and CI,
# breaking the upstream signature.
#
# This script fetches each suspect POM + .asc from the configured Maven
# mirror (db-maven via NETRC auth — same path Maven uses in CI) and
# compares its sha256 against a known reference from Maven Central (fetched
# locally via maven-proxy.dev.databricks.com, recorded below).
#
# Outputs per artifact:
#   * size + sha256 of the .pom and .pom.asc as served by db-maven
#   * MATCH/MISMATCH vs the embedded Maven Central reference sha256s
#   * if the .pom mismatches: an `od -c` dump of the first 256 and last 64
#     bytes that db-maven served (line endings are the usual suspect)
#   * gpg --verify result if gpg is available
#
# Exit status: number of mismatch / fetch-failure events, capped at 255
# (0 means every artifact matched its reference).
#
# Usage (in CI / geobrix-dev container):
#   ./scripts/security/diag-pgpverify-pom-transit
#
# Requires: curl, shasum/sha256sum. Optional: gpg.
# Requires NETRC env var pointing at a netrc with db-maven creds — set
# by .github/actions/jfrog-auth or .github/actions/jfrog-pip-bootstrap.

set -eu

# db-maven Maven repo root (artifactory db-maven proxy of Maven Central).
JFROG_URL="https://databricks.jfrog.io/artifactory/db-maven"

# Each line: "path|pom_sha256|asc_sha256"
# Reference sha256s captured 2026-05-18 from maven-proxy.dev.databricks.com,
# which is a transparent mirror of repo1.maven.org. If those upstream bytes
# ever change, regenerate this table.
ARTIFACTS=$(cat <<'EOF'
com/fasterxml/jackson/core/jackson-annotations/2.18.3/jackson-annotations-2.18.3.pom|b9c98ba9e29ea61b693d4c8c801968feaae220b2b2002364a7bcec524998384b|0f752e188ec3c8f0a3bd6060b67243750d01eda3adc9fbfa7c3c114f784eae5e
com/fasterxml/jackson/core/jackson-core/2.18.3/jackson-core-2.18.3.pom|37dc6b8f6391a4709ac215ca98fa2341c6dd65613c2277d96a4fd82a2f50e3c9|81f47a9b1d9b40d6679888ea23f42b076f79c923f0a31ad27bde68faa3429851
com/fasterxml/jackson/core/jackson-databind/2.18.3/jackson-databind-2.18.3.pom|e58f48ac14e4dbd48595d69e20bad35019ec1281514ef7ef155e18072ada617f|da0ea4a494ed5368e3e5579fffe677ba2208faf82f88df890169bfa08817c7b3
net/alchim31/maven/scala-maven-plugin/4.9.9/scala-maven-plugin-4.9.9.pom|6fc2ece5857f70bcb3fc08a41c5a9f6dcb23c359cab649a48a838b7948a72ed5|b8e4a5cbb07e2492ff72f2d8c0b00a05749082ee49d70e2fd7622de261191e2e
org/iq80/snappy/snappy/0.4/snappy-0.4.pom|a709ce17111e4149d9b79a5295644e0cd5a8355aec4b2ef4c0436aba7b25d08a|1dcdccaf5d6eb766e4daa9b9a386fa697594ede0f5e273062813aea534e13bc6
javax/servlet/javax.servlet-api/3.1.0/javax.servlet-api-3.1.0.pom|b31109e22ea3f2df1ad7955432e718a35def50ae6c19698034afa8a0cf9e9069|c7130f17422295faabf1885daa6f1c8a1a8c05d6a573c512148119f2a4aa42f3
EOF
)

# SHA <file> -> prints the lowercase hex sha256 of <file>.
if command -v sha256sum >/dev/null 2>&1; then
  SHA() { sha256sum "$1" | awk '{print $1}'; }
elif command -v shasum >/dev/null 2>&1; then
  SHA() { shasum -a 256 "$1" | awk '{print $1}'; }
else
  echo "ERROR: need sha256sum or shasum on PATH" >&2
  exit 1
fi

# curl auth arguments, kept as an array so a netrc path containing
# whitespace is passed to curl as a single argument.
NETRC_ARGS=()
if [ -n "${NETRC:-}" ] && [ -r "${NETRC}" ]; then
  NETRC_ARGS=(--netrc-file "${NETRC}")
elif [ -r "${HOME}/.netrc" ]; then
  NETRC_ARGS=(--netrc)
fi

# fetch <url> <dest> -> downloads <url> to <dest> and prints the HTTP status
# code, or "curl-error(exit N)" when curl itself fails (DNS, TLS, timeout).
# Always returns 0 so `set -e` does not abort the whole diagnostic.
fetch() {
  local url="$1" dest="$2" code
  # ${arr[@]+...} keeps an empty array safe under `set -u` on older bash.
  code=$(curl -sSL ${NETRC_ARGS[@]+"${NETRC_ARGS[@]}"} --max-time 20 \
    -w '%{http_code}' -o "${dest}" "${url}") || {
    printf 'curl-error(exit %s)' "$?"
    return 0
  }
  printf '%s' "${code}"
}

TMP=$(mktemp -d)
trap 'rm -rf "${TMP}"' EXIT

mismatch=0

printf '%s\n' "===== POM transit diagnostic: db-maven vs Maven Central ====="
printf 'JFrog base: %s\n' "${JFROG_URL}"
printf 'Netrc: %s\n' "${NETRC:-${HOME}/.netrc (if present)}"
printf '\n'

# The loop reads from a here-string rather than a pipe so it runs in the
# current shell and the `mismatch` counter survives past `done`.
while IFS='|' read -r path expected_pom_sha expected_asc_sha; do
  [ -z "${path}" ] && continue
  fname=$(basename "${path}")
  pom="${TMP}/${fname}"
  asc="${TMP}/${fname}.asc"

  printf '────── %s ──────\n' "${fname}"
  http_pom=$(fetch "${JFROG_URL}/${path}" "${pom}")
  http_asc=$(fetch "${JFROG_URL}/${path}.asc" "${asc}")

  if [ "${http_pom}" != "200" ] || [ "${http_asc}" != "200" ]; then
    printf ' ⚠ fetch failed (.pom=%s .asc=%s); skipping\n\n' "${http_pom}" "${http_asc}"
    mismatch=$((mismatch+1))
    continue
  fi

  pom_size=$(wc -c < "${pom}" | tr -d ' ')
  asc_size=$(wc -c < "${asc}" | tr -d ' ')
  pom_sha=$(SHA "${pom}")
  asc_sha=$(SHA "${asc}")

  pom_status="MATCH"
  asc_status="MATCH"
  if [ "${pom_sha}" != "${expected_pom_sha}" ]; then
    pom_status="MISMATCH"
    mismatch=$((mismatch+1))
  fi
  if [ "${asc_sha}" != "${expected_asc_sha}" ]; then
    asc_status="MISMATCH"
    mismatch=$((mismatch+1))
  fi

  printf ' .pom size=%s sha256=%s [%s]\n' "${pom_size}" "${pom_sha}" "${pom_status}"
  printf ' .pom.asc size=%s sha256=%s [%s]\n' "${asc_size}" "${asc_sha}" "${asc_status}"

  # If the POM bytes differ from the Maven Central reference, dump the head
  # and tail of what db-maven served for byte-level inspection (line endings
  # are the usual culprit). No reference copy is fetched here, so this is a
  # raw dump of the served bytes rather than a side-by-side diff.
  if [ "${pom_status}" = "MISMATCH" ]; then
    printf ' (db-maven bytes, first 256):\n'
    head -c 256 "${pom}" | od -An -c | sed 's/^/ /'
    printf ' (db-maven bytes, last 64):\n'
    tail -c 64 "${pom}" | od -An -c | sed 's/^/ /'
  fi

  # Standalone gpg verify (does the .asc verify the .pom regardless of keysmap?).
  # A FAILED result can also mean the signer's public key is not in the local
  # keyring; the captured gpg output printed below tells the two apart.
  if command -v gpg >/dev/null 2>&1; then
    if gpg --verify "${asc}" "${pom}" >"${TMP}/gpg.out" 2>&1; then
      printf ' gpg --verify: OK\n'
    else
      printf ' gpg --verify: FAILED\n'
      sed 's/^/ /' "${TMP}/gpg.out"
    fi
  fi

  printf '\n'
done <<< "${ARTIFACTS}"

printf '===== summary: %s mismatch event(s) =====\n' "${mismatch}"
# Exit statuses wrap modulo 256; cap so a large count can never read as 0.
exit $(( mismatch > 255 ? 255 : mismatch ))
package com.databricks.labs.gbx.util

import org.scalatest.Reporter
import org.scalatest.events._

import java.io.File
import java.lang.reflect.Modifier
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicInteger
import scala.util.control.NonFatal

/**
 * Custom ScalaTest reporter that prints a one-line progress marker after every
 * `SuiteCompleted` event so that long mvn test runs surface "how far in are we"
 * without needing to count test names in the log.
 *
 * Wired in pom.xml via scalatest-maven-plugin's `reporters` config. Loaded by
 * reflection; must have a public no-arg constructor and live on the test
 * classpath.
 *
 * Sample lines:
 *
 *   [progress] suite #12/64 done · SpatialRefOpsTest · 215 ms · tests=6 (0 failed)
 *     · totals: 312 tests, 0 failed · elapsed 3m 24s
 *   [progress] RUN COMPLETE · 64 suites · 1,247 tests · 0 failed · elapsed 18m 03s
 *
 * The "/M" denominator is computed by walking `target/test-classes/` for
 * every `*Test.class` and keeping only the classes that:
 *
 *   1. Pass the structural Suite filter (concrete + public + extends
 *      `org.scalatest.Suite` + has a public no-arg constructor), and
 *   2. Match the suites filter (comma-separated list of exact FQCNs and/or
 *      `package.*` wildcards). The filter is read from the `gbx.suites`
 *      system property, which pom.xml forwards into the test JVM through
 *      the plugin's argLine, falling back to a bare `suites` property.
 *      Without this, classes compiled from secondary test sources or
 *      living outside the selected namespace would inflate the count even
 *      though the runner skips them.
 *
 * As a belt-and-braces guard, every `SuiteCompleted` that recorded 0 tests is
 * subtracted from M at runtime — that handles classes that slipped past the
 * static filter but registered no `test("…")` blocks.
 *
 * The discovery path can be overridden with `-DgbxTestClassesDir=…`. If the
 * directory is missing, discovery fails, or the remaining denominator would
 * be smaller than the current suite number, M is suppressed and only `#N`
 * is printed.
 *
 * Thread-safety: run-wide counters are `AtomicInteger` and per-suite counters
 * live in a concurrent map keyed by the event's `suiteId`, so the tallies stay
 * correct whichever thread delivers an event and however events from
 * different suites interleave.
 */
class ProgressReporter extends Reporter {

  /** Test counters for one in-flight suite. */
  private final class SuiteTally {
    val tests = new AtomicInteger(0)
    val failed = new AtomicInteger(0)
  }

  private val suitesCompleted = new AtomicInteger(0)
  private val testsTotal = new AtomicInteger(0)
  private val testsFailedTotal = new AtomicInteger(0)
  // Suites that completed without recording a single test; subtracted from
  // the discovered count when rendering the denominator.
  private val emptySuites = new AtomicInteger(0)
  // In-flight suites keyed by suiteId. Entries are dropped on
  // SuiteCompleted / SuiteAborted.
  private val tallies = new ConcurrentHashMap[String, SuiteTally]()
  private val startTimeMs = System.currentTimeMillis()

  // Evaluated once, on first use, so the class scan is paid for when the first
  // progress line is rendered rather than in the constructor. `lazy val`
  // guarantees a single evaluation even if two suites complete concurrently.
  // 0 means "no usable denominator".
  private lazy val discoveredSuites: Int = discoverTotalSuites()

  /** Returns the tally for `suiteId`, creating it if no SuiteStarting was seen. */
  private def tallyFor(suiteId: String): SuiteTally = {
    val fresh = new SuiteTally
    // putIfAbsent returns null when `fresh` was installed.
    Option(tallies.putIfAbsent(suiteId, fresh)).getOrElse(fresh)
  }

  /**
   * Consumes one ScalaTest event: updates the counters on test events and
   * prints a progress line on `SuiteCompleted` (for suites that ran at least
   * one test) and on `RunCompleted`. All other events are ignored.
   *
   * @param event the event delivered by the ScalaTest runner
   */
  override def apply(event: Event): Unit = event match {
    case e: SuiteStarting =>
      tallies.put(e.suiteId, new SuiteTally)

    case e: TestSucceeded =>
      testsTotal.incrementAndGet()
      tallyFor(e.suiteId).tests.incrementAndGet()

    case e: TestFailed =>
      testsTotal.incrementAndGet()
      testsFailedTotal.incrementAndGet()
      val tally = tallyFor(e.suiteId)
      tally.tests.incrementAndGet()
      tally.failed.incrementAndGet()

    case e: SuiteCompleted =>
      val tally = Option(tallies.remove(e.suiteId)).getOrElse(new SuiteTally)
      val suiteTests = tally.tests.get()
      if (suiteTests == 0) {
        // Empty suite (Suite-extending class with no `test(...)` blocks
        // registered): discovery counted it structurally; the run produced
        // zero work. Shrink M and skip the progress line so #N stays aligned
        // with real work.
        emptySuites.incrementAndGet()
      } else {
        val n = suitesCompleted.incrementAndGet()
        val suiteMs = e.duration.getOrElse(0L)
        val suiteFailed = tally.failed.get()
        val totalTests = testsTotal.get()
        val totalFailed = testsFailedTotal.get()
        Console.out.println(
          f"[progress] suite ${suiteIndex(n)} done · ${e.suiteName} · $suiteMs%,d ms · " +
            f"tests=$suiteTests ($suiteFailed failed) · " +
            f"totals: $totalTests%,d tests, $totalFailed failed · elapsed ${elapsedHuman()}"
        )
        Console.out.flush()
      }

    case e: SuiteAborted =>
      // No SuiteCompleted is handled for this suite; just release its tally.
      tallies.remove(e.suiteId)

    case _: RunCompleted =>
      Console.out.println(
        f"[progress] RUN COMPLETE · ${suitesCompleted.get}%,d suites · " +
          f"${testsTotal.get}%,d tests · ${testsFailedTotal.get}%,d failed · " +
          f"elapsed ${elapsedHuman()}"
      )
      Console.out.flush()

    case _ => // ignore other events
  }

  /**
   * Renders the suite counter as `#N/M`, or just `#N` when no trustworthy
   * denominator is available (discovery found nothing, or M has fallen below
   * N, which would otherwise print something like `#13/12`).
   */
  private def suiteIndex(n: Int): String = {
    val m = discoveredSuites - emptySuites.get()
    if (m >= n) s"#$n/$m" else s"#$n"
  }

  /** Wall-clock time since this reporter was constructed, as `Xm YYs` or `Ys`. */
  private def elapsedHuman(): String = {
    val totalSeconds = (System.currentTimeMillis() - startTimeMs) / 1000
    val minutes = totalSeconds / 60
    val seconds = totalSeconds % 60
    if (minutes > 0) f"${minutes}m ${seconds}%02ds" else f"${totalSeconds}s"
  }

  /**
   * Walks `target/test-classes/` (or `-DgbxTestClassesDir=…`) and counts the
   * `*Test.class` files that are structurally a runnable Suite AND matched by
   * the suites filter. Returns 0 if the directory doesn't exist or discovery
   * fails — the caller treats 0 as "no denominator, print just #N".
   */
  private def discoverTotalSuites(): Int = {
    val root = new File(sys.props.getOrElse("gbxTestClassesDir", "target/test-classes"))
    if (!root.isDirectory) 0
    else
      try {
        val suiteCls =
          Class.forName("org.scalatest.Suite", false, Thread.currentThread().getContextClassLoader)
        // Prefer the explicitly forwarded `gbx.suites` property; the bare
        // `suites` key is only present when it was passed at the JVM level.
        val rawPattern =
          sys.props.get("gbx.suites").orElse(sys.props.get("suites")).getOrElse("")
        countRunnableSuites(root, root, suiteCls, compileSuitesMatcher(rawPattern))
      } catch {
        // LinkageError (e.g. NoClassDefFoundError) is not NonFatal but is an
        // expected failure mode of reflective class loading, so it is handled
        // explicitly; OOM, interrupts and similar still propagate.
        case NonFatal(_) | _: LinkageError => 0
      }
  }

  /**
   * Recursively counts the runnable suites under `dir`.
   *
   * @param root     the test-classes root, used to derive class names
   * @param dir      the directory currently being walked
   * @param suiteCls the loaded `org.scalatest.Suite` class
   * @param matcher  FQCN filter built by [[compileSuitesMatcher]]
   * @return number of `*Test.class` files (excluding synthetic `$` classes)
   *         that pass [[isRunnableSuite]]
   */
  private def countRunnableSuites(
      root: File,
      dir: File,
      suiteCls: Class[_],
      matcher: String => Boolean
  ): Int = {
    // listFiles returns null on I/O error or if `dir` is not a directory.
    val entries = Option(dir.listFiles()).getOrElse(Array.empty[File])
    entries.foldLeft(0) { (count, entry) =>
      val name = entry.getName
      if (entry.isDirectory) count + countRunnableSuites(root, entry, suiteCls, matcher)
      else if (name.endsWith("Test.class") && !name.contains("$") &&
        isRunnableSuite(root, entry, suiteCls, matcher)) count + 1
      else count
    }
  }

  /**
   * Reflective check for "concrete, public Suite with a public no-arg
   * constructor whose FQCN passes `matcher`". Uses
   * `Class.forName(name, initialize = false, ...)` so static initializers
   * don't run during counting. Any reflection failure counts the class as
   * "not runnable" so a transient issue can't inflate the denominator.
   */
  private def isRunnableSuite(
      root: File,
      classFile: File,
      suiteCls: Class[_],
      matcher: String => Boolean
  ): Boolean =
    try {
      val relativePath = root.toURI.relativize(classFile.toURI).getPath
      val className = relativePath.stripSuffix(".class").replace('/', '.')
      // The matcher runs first so filtered-out classes are never loaded.
      matcher(className) && {
        val cls = Class.forName(className, false, Thread.currentThread().getContextClassLoader)
        val mods = cls.getModifiers
        val concretePublic =
          Modifier.isPublic(mods) && !Modifier.isAbstract(mods) && !Modifier.isInterface(mods)
        concretePublic &&
        suiteCls.isAssignableFrom(cls) &&
        cls.getConstructors.exists(_.getParameterCount == 0)
      }
    } catch {
      case NonFatal(_) | _: LinkageError => false
    }

  /**
   * Compiles the comma-separated suites value into a single FQCN matcher.
   * Each entry is either an exact class name (`com.x.YTest`) or a package
   * wildcard ending in `.*` (`com.x.*` matches everything under `com.x.`).
   * An empty / unset value accepts everything (no filter active).
   *
   * Entries shaped like an unresolved `${...}` placeholder are ignored:
   * Maven leaves an undefined property as literal text, which would
   * otherwise match no class at all.
   */
  private def compileSuitesMatcher(suitesProp: String): String => Boolean = {
    def unresolvedPlaceholder(p: String): Boolean = p.startsWith("${") && p.endsWith("}")

    val patterns = suitesProp
      .split(",")
      .map(_.trim)
      .filter(p => p.nonEmpty && !unresolvedPlaceholder(p))
      .toList

    if (patterns.isEmpty) (_: String) => true
    else {
      val checks: List[String => Boolean] = patterns.map {
        case p if p.endsWith(".*") =>
          val prefix = p.stripSuffix(".*") + "."
          (fqcn: String) => fqcn.startsWith(prefix)
        case exact =>
          (fqcn: String) => fqcn == exact
      }
      (fqcn: String) => checks.exists(_(fqcn))
    }
  }
}