diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..d7e0ee2
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Keep Cargo's workspace output out of `target/` so `mvn clean` (which deletes
+# the root `target/`) does not nuke the Rust build cache.
+[build]
+target-dir = "rust-target"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c5db936..da8e65a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -83,8 +83,8 @@ jobs:
path: |
~/.cargo/registry
~/.cargo/git
- native/target
- key: ${{ runner.os }}-cargo-${{ hashFiles('native/Cargo.lock') }}
+ rust-target
+ key: ${{ runner.os }}-cargo-${{ hashFiles('Cargo.lock') }}
restore-keys: ${{ runner.os }}-cargo-
- name: Build native and run tests
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 4cf628f..952bf34 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -54,7 +54,7 @@ jobs:
run: ./mvnw -q spotless:check
- name: Check Rust formatting
- run: cd native && cargo fmt --all -- --check
+ run: cargo fmt --all -- --check
clippy:
name: Clippy
@@ -81,9 +81,9 @@ jobs:
path: |
~/.cargo/registry
~/.cargo/git
- native/target
- key: ${{ runner.os }}-clippy-${{ hashFiles('native/Cargo.lock') }}
+ rust-target
+ key: ${{ runner.os }}-clippy-${{ hashFiles('Cargo.lock') }}
restore-keys: ${{ runner.os }}-clippy-
- name: Run clippy
- run: cd native && cargo clippy --all-targets -- -D warnings
+ run: cargo clippy --workspace --all-targets -- -D warnings
diff --git a/.gitignore b/.gitignore
index 719a2a4..25c9216 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
target/
+rust-target/
*.class
.idea/
.vscode/
diff --git a/native/Cargo.lock b/Cargo.lock
similarity index 94%
rename from native/Cargo.lock
rename to Cargo.lock
index 8c56280..286f96f 100644
--- a/native/Cargo.lock
+++ b/Cargo.lock
@@ -98,9 +98,9 @@ dependencies = [
[[package]]
name = "ar_archive_writer"
-version = "0.5.1"
+version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b"
+checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348"
dependencies = [
"object",
]
@@ -119,9 +119,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "arrow"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54"
+checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -140,9 +140,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118"
+checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -154,9 +154,9 @@ dependencies = [
[[package]]
name = "arrow-array"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579"
+checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e"
dependencies = [
"ahash",
"arrow-buffer",
@@ -173,9 +173,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f955dfb73fae000425f49c8226d2044dab60fb7ad4af1e24f961756354d996c9"
+checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0"
dependencies = [
"bytes",
"half",
@@ -185,9 +185,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663"
+checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -207,9 +207,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48"
+checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de"
dependencies = [
"arrow-array",
"arrow-cast",
@@ -222,9 +222,9 @@ dependencies = [
[[package]]
name = "arrow-data"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373"
+checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -235,9 +235,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5"
+checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -251,9 +251,9 @@ dependencies = [
[[package]]
name = "arrow-json"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502"
+checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -276,9 +276,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f"
+checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -289,9 +289,9 @@ dependencies = [
[[package]]
name = "arrow-row"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4"
+checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -302,9 +302,9 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18aa020f6bc8e5201dcd2d4b7f98c68f8a410ef37128263243e6ff2a47a67d4f"
+checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed"
dependencies = [
"bitflags",
"serde_core",
@@ -313,9 +313,9 @@ dependencies = [
[[package]]
name = "arrow-select"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c"
+checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222"
dependencies = [
"ahash",
"arrow-array",
@@ -327,9 +327,9 @@ dependencies = [
[[package]]
name = "arrow-string"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8"
+checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -393,9 +393,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "autocfg"
-version = "1.5.0"
+version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
[[package]]
name = "base64"
@@ -419,9 +419,9 @@ dependencies = [
[[package]]
name = "bitflags"
-version = "2.11.1"
+version = "2.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
[[package]]
name = "blake2"
@@ -457,9 +457,9 @@ dependencies = [
[[package]]
name = "bon"
-version = "3.9.1"
+version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe"
+checksum = "b2f04f6fef12d70d42a77b1433c9e0f065238479a6cefc4f5bab105e9873a3c3"
dependencies = [
"bon-macros",
"rustversion",
@@ -467,9 +467,9 @@ dependencies = [
[[package]]
name = "bon-macros"
-version = "3.9.1"
+version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c"
+checksum = "7d0bd4c2f75335ad98052a37efb54f428b492f64340257143b3429c8a508fa7b"
dependencies = [
"darling",
"ident_case",
@@ -482,9 +482,9 @@ dependencies = [
[[package]]
name = "brotli"
-version = "8.0.2"
+version = "8.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
+checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
@@ -493,9 +493,9 @@ dependencies = [
[[package]]
name = "brotli-decompressor"
-version = "5.0.0"
+version = "5.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
+checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
@@ -503,9 +503,9 @@ dependencies = [
[[package]]
name = "bumpalo"
-version = "3.20.2"
+version = "3.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
[[package]]
name = "byteorder"
@@ -530,9 +530,9 @@ dependencies = [
[[package]]
name = "cc"
-version = "1.2.62"
+version = "1.2.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
+checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f"
dependencies = [
"find-msvc-tools",
"jobserver",
@@ -571,9 +571,9 @@ dependencies = [
[[package]]
name = "chrono"
-version = "0.4.44"
+version = "0.4.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327"
dependencies = [
"iana-time-zone",
"num-traits",
@@ -789,9 +789,9 @@ dependencies = [
[[package]]
name = "dashmap"
-version = "6.1.0"
+version = "6.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c"
dependencies = [
"cfg-if",
"crossbeam-utils",
@@ -1299,6 +1299,16 @@ dependencies = [
"datafusion-physical-expr-common",
]
+[[package]]
+name = "datafusion-java-example-bridge"
+version = "0.1.0"
+dependencies = [
+ "arrow",
+ "datafusion",
+ "datafusion-spark-bridge",
+ "tokio",
+]
+
[[package]]
name = "datafusion-jni"
version = "0.1.0"
@@ -1306,6 +1316,7 @@ dependencies = [
"arrow",
"async-trait",
"datafusion",
+ "datafusion-jni-common",
"datafusion-proto",
"datafusion-substrait",
"futures",
@@ -1319,6 +1330,16 @@ dependencies = [
"url",
]
+[[package]]
+name = "datafusion-jni-common"
+version = "0.1.0"
+dependencies = [
+ "datafusion",
+ "futures",
+ "jni",
+ "tokio",
+]
+
[[package]]
name = "datafusion-macros"
version = "53.1.0"
@@ -1527,6 +1548,21 @@ dependencies = [
"parking_lot",
]
+[[package]]
+name = "datafusion-spark-bridge"
+version = "0.1.0"
+dependencies = [
+ "arrow",
+ "async-trait",
+ "datafusion",
+ "datafusion-jni-common",
+ "datafusion-proto",
+ "futures",
+ "jni",
+ "prost",
+ "tokio",
+]
+
[[package]]
name = "datafusion-sql"
version = "53.1.0"
@@ -1579,9 +1615,9 @@ dependencies = [
[[package]]
name = "displaydoc"
-version = "0.2.5"
+version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
dependencies = [
"proc-macro2",
"quote",
@@ -1596,9 +1632,9 @@ checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
[[package]]
name = "either"
-version = "1.15.0"
+version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
[[package]]
name = "equivalent"
@@ -1904,9 +1940,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "http"
-version = "1.4.0"
+version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
+checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425"
dependencies = [
"bytes",
"itoa",
@@ -1949,9 +1985,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"
[[package]]
name = "hyper"
-version = "1.9.0"
+version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca"
+checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498"
dependencies = [
"atomic-waker",
"bytes",
@@ -2241,13 +2277,12 @@ dependencies = [
[[package]]
name = "js-sys"
-version = "0.3.98"
+version = "0.3.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08"
+checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162"
dependencies = [
"cfg-if",
"futures-util",
- "once_cell",
"wasm-bindgen",
]
@@ -2316,9 +2351,9 @@ dependencies = [
[[package]]
name = "libbz2-rs-sys"
-version = "0.2.3"
+version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f"
+checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c"
[[package]]
name = "libc"
@@ -2375,9 +2410,9 @@ dependencies = [
[[package]]
name = "log"
-version = "0.4.29"
+version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
[[package]]
name = "lru-slab"
@@ -2406,9 +2441,9 @@ dependencies = [
[[package]]
name = "memchr"
-version = "2.8.0"
+version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
[[package]]
name = "miniz_oxide"
@@ -2422,9 +2457,9 @@ dependencies = [
[[package]]
name = "mio"
-version = "1.2.0"
+version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1"
+checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda"
dependencies = [
"libc",
"wasi",
@@ -2570,9 +2605,9 @@ dependencies = [
[[package]]
name = "parquet"
-version = "58.2.0"
+version = "58.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43d7efd3052f7d6ef601085559a246bc991e9a8cc77e02753737df6322ce35f1"
+checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908"
dependencies = [
"ahash",
"arrow-array",
@@ -2734,9 +2769,9 @@ dependencies = [
[[package]]
name = "prost"
-version = "0.14.3"
+version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
+checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1"
dependencies = [
"bytes",
"prost-derive",
@@ -2744,9 +2779,9 @@ dependencies = [
[[package]]
name = "prost-build"
-version = "0.14.3"
+version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
+checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042"
dependencies = [
"heck",
"itertools",
@@ -2763,9 +2798,9 @@ dependencies = [
[[package]]
name = "prost-derive"
-version = "0.14.3"
+version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
+checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf"
dependencies = [
"anyhow",
"itertools",
@@ -2776,9 +2811,9 @@ dependencies = [
[[package]]
name = "prost-types"
-version = "0.14.3"
+version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7"
+checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a"
dependencies = [
"prost",
]
@@ -3035,9 +3070,9 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.12.3"
+version = "1.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba"
dependencies = [
"aho-corasick",
"memchr",
@@ -3064,9 +3099,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
[[package]]
name = "regex-syntax"
-version = "0.8.10"
+version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
[[package]]
name = "regress"
@@ -3178,9 +3213,9 @@ dependencies = [
[[package]]
name = "rustls-native-certs"
-version = "0.8.3"
+version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
+checksum = "dab5152771c58876a2146916e53e35057e1a4dfa2b9df0f0305b07f611fdea4d"
dependencies = [
"openssl-probe",
"rustls-pki-types",
@@ -3361,9 +3396,9 @@ dependencies = [
[[package]]
name = "serde_json"
-version = "1.0.149"
+version = "1.0.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
dependencies = [
"itoa",
"memchr",
@@ -3422,9 +3457,9 @@ dependencies = [
[[package]]
name = "shlex"
-version = "1.3.0"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba"
[[package]]
name = "simd-adler32"
@@ -3464,9 +3499,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
[[package]]
name = "socket2"
-version = "0.6.3"
+version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
+checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51"
dependencies = [
"libc",
"windows-sys 0.61.2",
@@ -3861,9 +3896,9 @@ checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c"
[[package]]
name = "typenum"
-version = "1.20.0"
+version = "1.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
[[package]]
name = "typify"
@@ -3920,9 +3955,9 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "unicode-segmentation"
-version = "1.13.2"
+version = "1.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
+checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8"
[[package]]
name = "unicode-width"
@@ -3968,9 +4003,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "uuid"
-version = "1.23.1"
+version = "1.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
+checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7"
dependencies = [
"getrandom 0.4.2",
"js-sys",
@@ -4029,9 +4064,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
-version = "0.2.121"
+version = "0.2.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790"
+checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563"
dependencies = [
"cfg-if",
"once_cell",
@@ -4042,9 +4077,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
-version = "0.4.71"
+version = "0.4.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8"
+checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf"
dependencies = [
"js-sys",
"wasm-bindgen",
@@ -4052,9 +4087,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
-version = "0.2.121"
+version = "0.2.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578"
+checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@@ -4062,9 +4097,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
-version = "0.2.121"
+version = "0.2.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2"
+checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b"
dependencies = [
"bumpalo",
"proc-macro2",
@@ -4075,9 +4110,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
-version = "0.2.121"
+version = "0.2.123"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441"
+checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92"
dependencies = [
"unicode-ident",
]
@@ -4131,9 +4166,9 @@ dependencies = [
[[package]]
name = "web-sys"
-version = "0.3.98"
+version = "0.3.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa"
+checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69"
dependencies = [
"js-sys",
"wasm-bindgen",
@@ -4541,9 +4576,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
[[package]]
name = "yoke"
-version = "0.8.2"
+version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5"
dependencies = [
"stable_deref_trait",
"yoke-derive",
@@ -4564,18 +4599,18 @@ dependencies = [
[[package]]
name = "zerocopy"
-version = "0.8.48"
+version = "0.8.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
-version = "0.8.48"
+version = "0.8.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930"
dependencies = [
"proc-macro2",
"quote",
@@ -4584,9 +4619,9 @@ dependencies = [
[[package]]
name = "zerofrom"
-version = "0.1.7"
+version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df"
+checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272"
dependencies = [
"zerofrom-derive",
]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..be906aa
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[workspace]
+resolver = "2"
+members = [
+ "native",
+ "native-common",
+ "examples/native",
+ "spark/bridge",
+]
+
+# Every dependency used by any workspace member is declared here so version
+# bumps live in one place and the resolver picks a single version of each
+# crate across the workspace. Members reference these via `{ workspace = true }`
+# and add per-crate flags (optional, features, default-features) at the use
+# site.
+[workspace.dependencies]
+arrow = { version = "58", features = ["ffi"] }
+async-trait = "0.1"
+datafusion = { version = "53.1.0" }
+datafusion-proto = "53.1.0"
+datafusion-substrait = "53.1.0"
+futures = "0.3"
+jni = "0.21"
+# Pinned to the same 0.13.x release line that DataFusion 53.1 pulls in
+# transitively, so we share the same `dyn ObjectStore` vtable and don't double-link.
+object_store = { version = "0.13", default-features = false }
+prost = "0.14"
+prost-build = "0.14"
+protoc-bin-vendored = "3"
+tokio = { version = "1", features = ["rt-multi-thread"] }
+# Optional, cfg-gated. See `native/Cargo.toml` for the build-flag dance.
+tokio-metrics = "0.5"
+url = "2"
diff --git a/Makefile b/Makefile
index 6d9b0ae..d6bcf2c 100644
--- a/Makefile
+++ b/Makefile
@@ -20,14 +20,14 @@
all: native jvm
native:
- cd native && cargo build
+ cargo build --workspace
-# Build the native crate with the `runtime-metrics` Cargo feature enabled.
+# Build the JNI crate with the `runtime-metrics` Cargo feature enabled.
# Requires `--cfg tokio_unstable` because tokio-metrics gates its API there.
# Default `make native` does not pull this in; callers who need
# SessionContext.runtimeStats() pick this target explicitly.
native-runtime-metrics:
- cd native && RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics
+ RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics
jvm:
./mvnw package -DskipTests
@@ -39,10 +39,10 @@ test: native
# `:check` form inline in .github/workflows/lint.yml.
format:
./mvnw -q spotless:apply
- cd native && cargo fmt --all
+ cargo fmt --all
clean:
- cd native && cargo clean
+ cargo clean
./mvnw clean
tpch-data:
diff --git a/core/pom.xml b/core/pom.xml
index 5ddf107..1e25736 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -102,8 +102,8 @@ under the License.
-
+ value="${maven.multiModuleProjectDirectory}/rust-target/${datafusion.native.profile}/${datafusion.lib.filename}"/>
+
diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java
index ffc58dd..27d2b16 100644
--- a/core/src/main/java/org/apache/datafusion/SessionContext.java
+++ b/core/src/main/java/org/apache/datafusion/SessionContext.java
@@ -113,10 +113,11 @@ public DataFrame fromProto(byte[] planBytes) {
* other Substrait-emitting tool — and hand them to DataFusion without round-tripping through SQL.
*
*
Substrait support is gated behind the {@code substrait} Cargo feature on the native crate
- * and is off by default. Rebuild the native crate with {@code cargo build
- * --features substrait} (or {@code cargo build --features substrait,protoc} for hermetic builds
- * that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native binary
- * built without the feature, this method throws {@link RuntimeException} pointing at the flag.
+ * and is off by default. Rebuild the native crate with {@code cargo build -p
+ * datafusion-jni --features substrait} (or {@code ... --features substrait,protoc} for hermetic
+ * builds that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native
+ * binary built without the feature, this method throws {@link RuntimeException} pointing at the
+ * flag.
*
* @throws IllegalArgumentException if {@code planBytes} is {@code null}.
* @throws IllegalStateException if this context is closed.
@@ -183,7 +184,7 @@ public MemoryUsage memoryUsage() {
* Rebuild with:
*
*
If invoked against a native binary built without the feature, this method throws {@link
diff --git a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java
index 120d179..d567275 100644
--- a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java
+++ b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java
@@ -37,7 +37,7 @@
* #checkFeatureEnabled}. Run
*
*
*
* before {@code ./mvnw test} to exercise this class.
diff --git a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java
index 34db3b5..a2cfb0a 100644
--- a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java
+++ b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java
@@ -50,7 +50,7 @@
*
*
The {@code substrait} Cargo feature is off by default in {@code native/Cargo.toml}; if the
* native crate was built without it, every test here is skipped (see {@link #checkFeatureEnabled}).
- * Run {@code (cd native && cargo build --features substrait)} before {@code ./mvnw test} to
+ * Run {@code cargo build -p datafusion-jni --features substrait} before {@code ./mvnw test} to
* exercise this class.
*/
class SessionContextSubstraitTest {
diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md
index 984d77c..fdb00f4 100644
--- a/docs/source/contributor-guide/development.md
+++ b/docs/source/contributor-guide/development.md
@@ -42,7 +42,7 @@ This builds the native Rust crate and runs the JUnit tests. The steps can
be run individually:
```sh
-cd native && cargo build
+cargo build --workspace
./mvnw test
```
@@ -74,14 +74,25 @@ disk space.
The repository is a multi-module Maven build:
-- `pom.xml` — parent POM declaring the `core` and `examples` modules and
- shared plugin/dependency versions.
+- `Cargo.toml` — Rust workspace root declaring the four crate members
+ (`native`, `native-common`, `examples/native`, `spark/bridge`) and `[workspace.dependencies]`
+ that pin shared versions in one place. Cargo writes artifacts to
+ `rust-target/` (overridden in `.cargo/config.toml`) so `mvn clean` at the
+ repo root does not nuke the Rust build cache.
+- `pom.xml` — parent POM declaring the `core`, `spark`, and `examples`
+ modules and shared plugin/dependency versions.
- `core/` — `datafusion-java` library module (Java sources, tests, and
generated protobuf classes).
+- `spark/` — `datafusion-java-spark` Spark DataSource V2 connector
+ (Scala + Java, pure JVM) and its `spark/bridge/` Rust SDK crate
+ (`datafusion-spark-bridge`: widening, scan machinery, `export_bridge!`).
- `examples/` — `datafusion-java-examples` module containing runnable
examples that depend on the library; built alongside the library so they
- cannot fall out of sync with the API.
-- `native/` — Rust crate (JNI + Arrow C Data Interface).
+ cannot fall out of sync with the API. Includes `examples/native/`, a
+ small `export_bridge!` cdylib used by the Spark connector demo
+ (`ExampleBridgeProviderFactory` + the pyspark script under
+ `examples/python/`).
+- `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface).
- `proto/` — Protobuf definitions shared between Java and Rust.
- `Makefile` — top-level build orchestration (`make test`, `make format`,
`make tpch-data`).
diff --git a/docs/source/contributor-guide/updating-datafusion-version.md b/docs/source/contributor-guide/updating-datafusion-version.md
index 56d50dc..ef6cd10 100644
--- a/docs/source/contributor-guide/updating-datafusion-version.md
+++ b/docs/source/contributor-guide/updating-datafusion-version.md
@@ -21,7 +21,9 @@ under the License.
Three things must move together when bumping DataFusion:
-1. `native/Cargo.toml` — the `datafusion` crate dependency.
+1. `Cargo.toml` (workspace root) — the `datafusion`, `datafusion-ffi`,
+ `datafusion-proto`, and `datafusion-substrait` entries in
+ `[workspace.dependencies]`. Members inherit from there.
2. `pom.xml` — the `` Maven property. **Must equal
the Cargo version**; a mismatch means JVM-built protobuf plans won't
deserialize on the native side.
@@ -32,9 +34,9 @@ Three things must move together when bumping DataFusion:
## Recipe
```sh
-# 1. Bump the Cargo dep
-$EDITOR native/Cargo.toml # set datafusion = ""
-(cd native && cargo update -p datafusion)
+# 1. Bump the workspace dep
+$EDITOR Cargo.toml # set datafusion = "" in [workspace.dependencies]
+cargo update -p datafusion
# 2. Bump the Maven property to match
$EDITOR pom.xml # set
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..da9fec7
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,99 @@
+# DataFusion-Java examples
+
+Small, self-contained programs that each demonstrate one feature of the
+DataFusion-Java API. Every example is a Java class with a `main` method that
+builds a query against an in-process DataFusion engine and prints its result
+(as tab-separated rows) to stdout. They are the fastest way to see what the
+library can do and to copy a working starting point.
+
+## Prerequisites
+
+- JDK 17+
+- Maven (the repo ships `./mvnw`, no install needed)
+- Rust toolchain (`cargo`) — the library calls into a native DataFusion
+ build, so the Rust side must be compiled once first
+
+## Build once
+
+From the repo root:
+
+```bash
+# 1. Compile the native libraries (DataFusion + JNI glue).
+cargo build --release
+
+# 2. Build the Java/Scala modules and install them into your local Maven repo.
+./mvnw -B install -DskipTests -Drat.skip=true -Ddatafusion.native.profile=release
+```
+
+Step 2 must be `install`, not `package`: running an example below starts a
+fresh Maven invocation that resolves `datafusion-java` from your local Maven
+repository (`~/.m2/repository`), and only `install` publishes the jar there.
+If you skip it you'll see
+`Could not find artifact org.apache.datafusion:datafusion-java:...` —
+that error means "run step 2".
+
+(If your local Maven repo lives somewhere non-standard, add
+`-Dmaven.repo.local=/path/to/repo` to step 2 **and** to every run command.)
+
+## Run your first example
+
+```bash
+./mvnw -B -pl examples exec:exec \
+ -Dexec.mainClass=org.apache.datafusion.examples.SqlQueryExample
+```
+
+This registers a small CSV file, runs a SQL aggregation over it, and prints
+the result rows. Swap `SqlQueryExample` for any class in the table below.
+
+> Why `exec:exec` and not `exec:java`? Each example runs in a fresh `java`
+> process so the JVM flag the native Arrow integration needs
+> (`--add-opens=java.base/java.nio=ALL-UNNAMED`) actually applies. `exec:java`
+> would run inside Maven's own JVM without it.
+
+## The examples
+
+| Entry point (`-Dexec.mainClass=org.apache.datafusion.examples.<…>`) | Demonstrates | What you'll see |
+| --- | --- | --- |
+| `SqlQueryExample` | Register a CSV file, run a SQL aggregation | The aggregated rows printed as TSV |
+| `DataFrameExample` | The DataFrame API: filter, group, sort — no SQL strings | The transformed rows |
+| `ProtoPlanExample` | Build a DataFusion `LogicalPlanNode` protobuf in Java and execute it via `SessionContext.fromProto` — the wire-format path used by query frontends | The plan's result rows |
+| `JdbcExample` | Pull rows from a JDBC source (in-memory H2) into Arrow, register them as a table, query them | Rows that originated in H2, queried through DataFusion |
+| `AddOneExample` | Write a scalar UDF in Java and call it from SQL | Each input value, plus one |
+| `NestedTypeUdfExample` | A scalar UDF whose input and output are nested Arrow types (`List`) | The transformed list column |
+
+## The Spark connector example
+
+One example is not a standalone `main`:
+`ExampleBridgeProviderFactory` implements the Spark connector's
+`BridgeProviderFactory` interface over a tiny in-memory table built inside
+the example bridge cdylib (the `export_bridge!` crate under
+[`native/`](native/)). It exists to be loaded *by Spark* — the runnable
+end-to-end version is the PySpark demo under [`python/`](python/), and the
+guide to building your own connector is
+[`../spark/README.md`](../spark/README.md).
+
+To build its cdylib (workspace member, buildable from anywhere in the tree):
+
+```bash
+cargo build -p datafusion-java-example-bridge --release
+```
+
+Building the examples jar then bundles the cdylib inside it (under
+`org/apache/datafusion/examples/<os>/<arch>/`), and the factory loads it from
+there at runtime via the connector's `NativeLibraryLoader` — the same
+packaging recipe a real bridge uses (see "Packaging your bridge" in
+[`../spark/README.md`](../spark/README.md)). To run against an unpackaged
+local build instead, pass
+`-Dexample.bridge.lib.path=/abs/path/to/libdatafusion_example_bridge.{so,dylib}`.
+
+## Troubleshooting
+
+- **`Could not find artifact org.apache.datafusion:datafusion-java`** — the
+ parent wasn't installed to your local Maven repo. Re-run build step 2
+ (`install`, not `package`).
+- **`Native library not found ...`** — the Rust side wasn't built, or was
+ built in a different profile than Maven expects. Re-run build step 1 and
+ keep `-Ddatafusion.native.profile=release` consistent between the cargo
+ profile (`--release`) and the Maven flag.
+- **`UnsatisfiedLinkError ... datafusion_example_bridge`** — only the example
+ bridge cdylib is missing; see "The Spark connector example" above.
diff --git a/examples/native/Cargo.toml b/examples/native/Cargo.toml
new file mode 100644
index 0000000..1e362cc
--- /dev/null
+++ b/examples/native/Cargo.toml
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
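+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+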
+[package]
+name = "datafusion-java-example-bridge"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[lib]
+name = "datafusion_example_bridge"
+# Built as a cdylib so the JVM loads it via NativeLibraryLoader; `rlib` keeps
+# the Rust-level unit tests (options decoding, partition layout) runnable.
+crate-type = ["cdylib", "rlib"]
+
+[dependencies]
+arrow = { workspace = true }
+datafusion = { workspace = true }
+datafusion-spark-bridge = { path = "../../spark/bridge" }
+
+[dev-dependencies]
+tokio = { workspace = true }
diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs
new file mode 100644
index 0000000..b0b17e8
--- /dev/null
+++ b/examples/native/src/lib.rs
@@ -0,0 +1,279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Example bridge cdylib: a small DataFusion `MemTable` exposed to Spark
+//! through the `datafusion-spark-bridge` SDK. `export_bridge!` generates the
+//! whole JNI surface for `org.apache.datafusion.examples.ExampleBridgeNative`;
+//! this crate only decodes the options blob and builds the provider.
+//!
+//! The same pattern is what domain bridges (HDF5, custom Iceberg, in-house
+//! formats) use to expose their TableProviders to Spark via the connector's
+//! DataSource V2 plumbing.
+//!
+//! ## Options wire format
+//!
+//! The provider builder accepts an opaque `byte[]` that the JVM-side
+//! `ExampleBridgeProviderFactory.encodeOptions` produces. Layout (little-endian):
+//!
+//! ```text
+//! [u32 name_prefix_len][name_prefix UTF-8 bytes][u32 num_rows][u32 num_batches]
+//! [u32 num_partitions][u8 shared_scan] <- optional trailing fields
+//! ```
+//!
+//! Empty/`null` bytes decode as all defaults: `name_prefix="row"`, `num_rows=4`,
+//! `num_batches=1`, `num_partitions=1`, `shared_scan=false`. The trailing
+//! fields are optional so blobs from older encoders keep decoding. The
+//! `shared_scan` flag is consumed JVM-side (`ExampleBridgeProviderFactory.sharedScan`);
+//! this decoder never reads it; it is listed so one blob format serves both sides. Real
+//! bridges can use the connector's default `OptionsCodec` instead (decoded via
+//! `datafusion_spark_bridge::options`); this example hand-rolls the encoding
+//! to show a custom wire layer.
+
+use std::sync::Arc;
+
+use arrow::array::{Float64Array, Int64Array, RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use datafusion::catalog::TableProvider;
+use datafusion::datasource::MemTable;
+use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult};
+
+/// Decoded form of the opaque options blob handed to the provider builder
+/// (layout described under "Options wire format" in the module docs).
+#[derive(Debug)]
+struct Options {
+ /// UTF-8 prefix read from the blob; `"row"` when the blob is empty.
+ name_prefix: String,
+ /// Row count used to size each generated batch; decoding rejects 0.
+ num_rows: u32,
+ /// Number of in-memory batches to generate; decoding rejects 0.
+ num_batches: u32,
+ /// Optional trailing field; 1 when the blob omits it, decoding rejects 0.
+ num_partitions: u32,
+}
+
+/// Defaults returned for an empty options blob: `name_prefix="row"`,
+/// `num_rows=4`, `num_batches=1`, `num_partitions=1`.
+impl Default for Options {
+ fn default() -> Self {
+ Self {
+ name_prefix: "row".to_string(),
+ num_rows: 4,
+ num_batches: 1,
+ num_partitions: 1,
+ }
+ }
+}
+
+/// Decode the little-endian options blob into [`Options`].
+///
+/// An empty slice yields `Options::default()`. Otherwise the blob must hold a
+/// `u32` length prefix, that many UTF-8 `name_prefix` bytes, then `num_rows`
+/// and `num_batches` as `u32`s. A further `u32` `num_partitions` is read only
+/// when at least four more bytes follow; anything after that (such as the
+/// `shared_scan` flag byte) is not inspected here.
+///
+/// # Errors
+///
+/// Fails when the blob is shorter than the fields it declares, when
+/// `name_prefix` is not valid UTF-8, or when `num_rows`, `num_batches`, or
+/// `num_partitions` is zero.
+fn decode_options(bytes: &[u8]) -> Result> {
+ if bytes.is_empty() {
+ return Ok(Options::default());
+ }
+ if bytes.len() < 4 {
+ return Err("options blob too short for name_prefix length prefix".into());
+ }
+ // The slice is exactly four bytes (length checked above), so `try_into` cannot fail.
+ let name_len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize;
+ // NOTE(review): `4 + name_len` / `name_end + 8` are unchecked; fine when `usize`
+ // is 64-bit, but a huge length prefix could overflow a 32-bit `usize` —
+ // consider `checked_add` if this ever targets such platforms.
+ let name_end = 4 + name_len;
+ if bytes.len() < name_end + 8 {
+ return Err("options blob truncated: missing name_prefix bytes or trailing ints".into());
+ }
+ let name_prefix = std::str::from_utf8(&bytes[4..name_end])
+ .map_err(|e| format!("name_prefix is not valid UTF-8: {e}"))?
+ .to_string();
+ // Both four-byte windows are in bounds thanks to the `name_end + 8` check above.
+ let num_rows = u32::from_le_bytes(bytes[name_end..name_end + 4].try_into().unwrap());
+ let num_batches = u32::from_le_bytes(bytes[name_end + 4..name_end + 8].try_into().unwrap());
+ if num_rows == 0 || num_batches == 0 {
+ return Err("num_rows and num_batches must both be > 0".into());
+ }
+ // Optional trailing fields (older encoders omit them): num_partitions,
+ // then the shared_scan flag byte, which only the JVM side interprets.
+ let num_partitions = if bytes.len() >= name_end + 12 {
+ u32::from_le_bytes(bytes[name_end + 8..name_end + 12].try_into().unwrap())
+ } else {
+ 1
+ };
+ if num_partitions == 0 {
+ return Err("num_partitions must be > 0".into());
+ }
+ Ok(Options {
+ name_prefix,
+ num_rows,
+ num_batches,
+ num_partitions,
+ })
+}
+
+/// Build the example schema + a multi-batch in-memory table sized per `opts`.
+/// Row `r` in batch `b` gets `id = b * num_rows + r`, `name = ""`,
+/// `value = id * 1.5` (with `value` left null for every fourth row so the demo
+/// still exercises null handling).
+fn build_mem_table(
+ opts: &Options,
+) -> Result, Box> {
+ let schema = Arc::new(ArrowSchema::new(vec![
+ Field::new("id", DataType::Int64, false),
+ Field::new("name", DataType::Utf8, true),
+ Field::new("value", DataType::Float64, true),
+ ]));
+
+ let mut batches = Vec::with_capacity(opts.num_batches as usize);
+ for b in 0..opts.num_batches {
+ let mut ids = Vec::with_capacity(opts.num_rows as usize);
+ let mut names: Vec
+
+
+ org.apache.maven.plugins
+ maven-antrun-plugin
+ 3.1.0
+
+
+ copy-example-bridge-cdylib
+ process-classes
+ run
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ native-linux-amd64
+
+ unixlinuxamd64
+
+
+ linux
+ x86_64
+ libdatafusion_example_bridge.so
+
+
+
+ native-linux-x86_64
+
+ unixlinuxx86_64
+
+
+ linux
+ x86_64
+ libdatafusion_example_bridge.so
+
+
+
+ native-linux-aarch64
+
+ unixlinuxaarch64
+
+
+ linux
+ aarch64
+ libdatafusion_example_bridge.so
+
+
+
+ native-mac-x86_64
+
+ macx86_64
+
+
+ darwin
+ x86_64
+ libdatafusion_example_bridge.dylib
+
+
+
+ native-mac-aarch64
+
+ macaarch64
+
+
+ darwin
+ aarch64
+ libdatafusion_example_bridge.dylib
+
+
+
diff --git a/examples/python/README.md b/examples/python/README.md
new file mode 100644
index 0000000..c9a335d
--- /dev/null
+++ b/examples/python/README.md
@@ -0,0 +1,132 @@
+# PySpark end-to-end demo
+
+`bridge_demo.py` proves the full DataFusion → Spark path:
+
+```
+examples/native (export_bridge! cdylib) <-- in-memory MemTable + scan machinery
+ ^ byte[] options / FFI_ArrowArrayStream
+ |
+ExampleBridgeProviderFactory <-- implements BridgeProviderFactory
+ | Class.forName(...)
+ v
+datafusion-java-spark <-- DSv2 plumbing, predicate xlate
+ | spark.read.format("datafusion")
+ v
+PySpark DataFrame <-- printSchema / show / filter / select
+```
+
+## Prerequisites
+
+1. **Java 17.** `JAVA_HOME` must point at a JDK 17 install.
+
+2. **The example bridge cdylib** built from this repo:
+
+ ```bash
+ cargo build -p datafusion-java-example-bridge --release
+ ```
+
+3. **Maven artifacts installed into a side-loaded local repository.**
+
+ The script reads the Arrow (`arrow-format`, `arrow-vector`, `arrow-memory-*`,
+ `arrow-c-data`), `flatbuffers-java`, and `protobuf-java` jars from
+ `${DATAFUSION_DEMO_M2:-/tmp/m2-datafusion}` (Spark's bundled
+ versions are too old, so the demo prepends our copies on
+ `spark.driver/executor.extraClassPath`). Tell Maven to install there:
+
+ ```bash
+ mvn install -DskipTests \
+ -Ddatafusion.native.profile=release \
+ -Dmaven.repo.local=/tmp/m2-datafusion
+ ```
+
+ If you already use `~/.m2`, point `DATAFUSION_DEMO_M2` at it instead and
+ skip `-Dmaven.repo.local`.
+
+4. **A Scala 2.13 Spark distribution.** The PyPI `pyspark` wheel embeds
+ Scala 2.12 jars; the connector is compiled against 2.13, so we override
+ `SPARK_HOME` before importing pyspark. Download once:
+
+ ```bash
+ cd /tmp
+ curl -L -o spark-2.13.tgz \
+ https://archive.apache.org/dist/spark/spark-3.5.7/spark-3.5.7-bin-hadoop3-scala2.13.tgz
+ tar xzf spark-2.13.tgz
+ ```
+
+ The script defaults `SPARK_HOME` to
+ `/tmp/spark-3.5.7-bin-hadoop3-scala2.13`; set the env var if you put it
+ elsewhere.
+
+5. **A self-contained Python venv with `pyspark==3.5.7`** (uv keeps it
+ isolated from system site-packages):
+
+ ```bash
+ cd examples/python
+ uv venv --python 3.11 .venv
+ uv pip install --python .venv/bin/python "pyspark==3.5.7"
+ cd ../..
+ ```
+
+## Run
+
+```bash
+examples/python/.venv/bin/python examples/python/bridge_demo.py
+```
+
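+Expected output (illustrative and abridged — `bridge_demo.py` currently passes
+`name_prefix=user`, `num_rows=5`, `num_batches=3`, filters on `value > 5.0`, and
+then repeats the scan in shared-scan mode, so a real run prints more rows and
+sections than shown here):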
+
+```
+=== schema ===
+root
+ |-- id: long (nullable = false)
+ |-- name: string (nullable = true)
+ |-- value: double (nullable = true)
+
+=== full scan ===
++---+-----+-----+
+|id |name |value|
++---+-----+-----+
+|1 |alice|1.5 |
+|2 |bob |2.5 |
+|3 |NULL |3.5 |
+|4 |dave |NULL |
++---+-----+-----+
+
+=== filter pushdown: value > 2.0 ===
++---+----+-----+
+|id |name|value|
++---+----+-----+
+|2 |bob |2.5 |
+|3 |NULL|3.5 |
++---+----+-----+
+
+=== projection: id, name ===
++---+-----+
+|id |name |
++---+-----+
+|1 |alice|
+|2 |bob |
+|3 |NULL |
+|4 |dave |
++---+-----+
+```
+
+Filter row count drops from 4 → 2 because the predicate is pushed into the
+bridge cdylib as a `LogicalExprNode` proto and applied inside DataFusion
+before Arrow batches cross back to Spark.
+
+## Notes
+
+- `master("local[2]")` keeps driver + executor in one JVM so the example
+ cdylib loads once. In cluster mode nothing extra is needed: the bridge
+ cdylib travels inside the examples jar and `NativeLibraryLoader` extracts
+ it on every worker.
+- `extraClassPath` (not `--packages` / `userClassPathFirst`) is used because
+ the Spark distro ships Arrow 12, flatbuffers 1.12, and protobuf 2.5, all
+ of which we need to override; userClassPathFirst splits Netty across two
+ class loaders and the `arrow-memory-netty-buffer-patch` shim breaks.
+- The `datafusion` format short name resolves via the SPI file in
+ `spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister`.
+ You can also use the FQCN: `format("io.datafusion.spark.DatafusionSource")`.
+- To swap in your own bridge, write a `BridgeProviderFactory` against your own
+ cdylib (mirroring `ExampleBridgeProviderFactory`) and pass its FQCN via
+ `option("df.factory", ...)`.
diff --git a/examples/python/bridge_demo.py b/examples/python/bridge_demo.py
new file mode 100644
index 0000000..a630224
--- /dev/null
+++ b/examples/python/bridge_demo.py
@@ -0,0 +1,237 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+"""End-to-end PySpark demo of a DataFusion table provider exposed as a Spark data source.
+
+Wires the in-memory example MemTable produced by ``examples/native`` into a
+Spark DataSource V2 scan through the generic connector in ``spark/``.
+
+Prerequisites (run from the repo root):
+
+ cargo build --release --workspace
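+ mvn install -Ddatafusion.native.profile=release -DskipTests -Dmaven.repo.local=/tmp/m2-datafusion
+
+(Dependency jars are read from ``DATAFUSION_DEMO_M2``, default ``/tmp/m2-datafusion``;
+point it at your own local Maven repository if you installed without ``-Dmaven.repo.local``.)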
+
+Run:
+
+ python3 examples/python/bridge_demo.py
+"""
+
+import glob
+import os
+import sys
+from pathlib import Path
+
+# The PyPI ``pyspark`` wheel embeds a Scala 2.12 Spark distribution; this
+# connector is compiled against Scala 2.13. Override SPARK_HOME (before the
+# pyspark import so the wheel honours it) to a side-loaded 2.13 distribution.
+_SPARK_HOME_2_13 = os.environ.get(
+ "SPARK_HOME",
+ "/tmp/spark-3.5.7-bin-hadoop3-scala2.13",
+)
+# Fail fast with download instructions when the distribution is missing.
+# NOTE(review): the probe pins scala-library-2.13.8.jar exactly, so a Scala 2.13
+# distribution that ships a different 2.13.x patch release is rejected as well.
+if not Path(_SPARK_HOME_2_13, "jars", "scala-library-2.13.8.jar").exists():
+ sys.exit(
+ f"missing Scala 2.13 Spark distribution at {_SPARK_HOME_2_13}. "
+ "Download from https://archive.apache.org/dist/spark/spark-3.5.7/"
+ "spark-3.5.7-bin-hadoop3-scala2.13.tgz and extract to that path "
+ "(or set SPARK_HOME to your own 2.13 distro)."
+ )
+# Export the resolved path so the pyspark import below picks it up.
+os.environ["SPARK_HOME"] = _SPARK_HOME_2_13
+
+from pyspark.sql import SparkSession
+
+
+# Repository root: two levels above the directory that holds this script.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+# Versions baked into the jar file names looked up below.
+# NOTE(review): presumably these must track the Maven build — verify against
+# pom.xml whenever the project or its Arrow/flatbuffers/protobuf deps are bumped.
+VERSION = "0.2.0-SNAPSHOT"
+ARROW_VERSION = "19.0.0"
+FLATBUFFERS_VERSION = "25.2.10"
+PROTOBUF_VERSION = "3.25.5"
+# Local maven repository populated by ``mvn install -Dmaven.repo.local=...``.
+M2_REPO = Path(os.environ.get("DATAFUSION_DEMO_M2", "/tmp/m2-datafusion"))
+
+
+# Return the path of ``<module>/target/<artifact>-<VERSION>.jar`` under the repo
+# root, or exit the process with a build hint when that jar does not exist.
+# The glob pattern contains no wildcard, so it acts as an existence check.
+def _resolve_jar(module: str, artifact: str) -> str:
+ candidates = glob.glob(str(REPO_ROOT / module / "target" / f"{artifact}-{VERSION}.jar"))
+ if not candidates:
+ sys.exit(
+ f"missing jar for {artifact} under {module}/target/. "
+ f"Run 'mvn install -DskipTests' from {REPO_ROOT} first."
+ )
+ return candidates[0]
+
+
+# Return the path of a dependency jar inside M2_REPO, laid out as
+# ``<group_path>/<artifact>/<version>/<artifact>-<version>.jar``, or exit the
+# process with a re-install hint when the file is missing.
+def _m2_jar(group_path: str, artifact: str, version: str) -> str:
+ path = M2_REPO / group_path / artifact / version / f"{artifact}-{version}.jar"
+ if not path.exists():
+ sys.exit(
+ f"missing dependency jar {path}. "
+ f"Re-run 'mvn install -DskipTests -Dmaven.repo.local={M2_REPO}'."
+ )
+ return str(path)
+
+
+# Demo driver: start a local Spark session with the connector jars, read the
+# example bridge in legacy and shared-scan mode, and assert both modes agree.
+def main() -> None:
+ # Spark 3.5.7 bundles Arrow 12.0.1; datafusion-java is compiled against
+ # Arrow 19, which needs ArrowArrayStream (added after 12) and a much newer
+ # flatbuffers runtime. Ship our copies on spark.jars and also prepend them via
+ # extraClassPath (below) so they win over the bundled jars on driver and executor.
+ arrow_jars = [
+ _m2_jar("org/apache/arrow", "arrow-format", ARROW_VERSION),
+ _m2_jar("org/apache/arrow", "arrow-vector", ARROW_VERSION),
+ _m2_jar("org/apache/arrow", "arrow-memory-core", ARROW_VERSION),
+ _m2_jar("org/apache/arrow", "arrow-memory-netty", ARROW_VERSION),
+ _m2_jar(
+ "org/apache/arrow",
+ "arrow-memory-netty-buffer-patch",
+ ARROW_VERSION,
+ ),
+ _m2_jar("org/apache/arrow", "arrow-c-data", ARROW_VERSION),
+ _m2_jar(
+ "com/google/flatbuffers", "flatbuffers-java", FLATBUFFERS_VERSION
+ ),
+ # Spark ships protobuf-java 2.5.0 (sans MessageOrBuilder). The proto
+ # surface in core (LogicalExprNode etc.) needs 3.25.x.
+ _m2_jar("com/google/protobuf", "protobuf-java", PROTOBUF_VERSION),
+ ]
+ app_jars = [
+ _resolve_jar("core", "datafusion-java"),
+ _resolve_jar("spark", "datafusion-java-spark_2.13"),
+ _resolve_jar("examples", "datafusion-java-examples"),
+ *arrow_jars,
+ ]
+ jars = ",".join(app_jars)
+ # Prepend the same jars onto the JVM classpath (extraClassPath below, not
+ # userClassPathFirst) so Arrow 19's classes are loaded by the system class
+ # loader — avoids the
+ # ``UnsafeDirectLittleEndian cannot access superclass WrappedByteBuf``
+ # IllegalAccessError that ChildFirstURLClassLoader produces when the
+ # buffer-patch class lands in the child loader while Netty stays in the app
+ # loader.
+ extra_classpath = ":".join(app_jars)
+
+ spark = (
+ SparkSession.builder.appName("datafusion-bridge-demo")
+ .master("local[2]")
+ .config("spark.jars", jars)
+ .config("spark.driver.extraClassPath", extra_classpath)
+ .config("spark.executor.extraClassPath", extra_classpath)
+ .config(
+ "spark.driver.extraJavaOptions",
+ "--add-opens=java.base/java.nio=ALL-UNNAMED",
+ )
+ .config(
+ "spark.executor.extraJavaOptions",
+ "--add-opens=java.base/java.nio=ALL-UNNAMED",
+ )
+ .getOrCreate()
+ )
+
+ # The example cdylib is bundled inside the examples jar and extracted by
+ # NativeLibraryLoader at first use; no working-directory or path setup is
+ # needed. (-Dexample.bridge.lib.path via extraJavaOptions overrides it for
+ # unpackaged local builds.)
+
+ # `name_prefix`, `num_rows`, `num_batches` are interpreted by
+ # ExampleBridgeProviderFactory.encodeOptions and decoded on the Rust side
+ # in examples/native/src/lib.rs. They demonstrate driver-side options
+ # flowing through to the native MemTable build.
+ name_prefix = "user"
+ num_rows = 5
+ num_batches = 3
+ df = (
+ spark.read.format("datafusion")
+ .option(
+ "df.factory",
+ "org.apache.datafusion.examples.ExampleBridgeProviderFactory",
+ )
+ .option("name_prefix", name_prefix)
+ .option("num_rows", str(num_rows))
+ .option("num_batches", str(num_batches))
+ .load()
+ )
+
+ total_rows = num_rows * num_batches
+ print(f"=== options: name_prefix={name_prefix} num_rows={num_rows} num_batches={num_batches} ===")
+ print(f"=== expecting {total_rows} rows across {num_batches} Arrow batches ===")
+
+ print("=== schema ===")
+ df.printSchema()
+
+ print(f"=== full scan (first {total_rows} rows) ===")
+ df.show(n=total_rows, truncate=False)
+
+ print("=== filter pushdown: value > 5.0 ===")
+ df.filter("value > 5.0").show(n=total_rows, truncate=False)
+
+ print("=== projection: id, name ===")
+ df.select("id", "name").show(n=total_rows, truncate=False)
+
+ # Baseline row set from the legacy (non-shared) scan, compared further down.
+ legacy_rows = {tuple(r) for r in df.collect()}
+
+ # --- shared-scan mode -------------------------------------------------
+ # `shared_scan=true` flips ExampleBridgeProviderFactory.sharedScan: one
+ # provider + plan cached per executor, one Spark task per MemTable
+ # partition (num_partitions=4), each task streaming one DataFusion plan
+ # partition. Results must be identical to the legacy run above.
+ num_partitions = 4
+ shared = (
+ spark.read.format("datafusion")
+ .option(
+ "df.factory",
+ "org.apache.datafusion.examples.ExampleBridgeProviderFactory",
+ )
+ .option("name_prefix", name_prefix)
+ .option("num_rows", str(num_rows))
+ .option("num_batches", str(num_batches))
+ .option("num_partitions", str(num_partitions))
+ .option("shared_scan", "true")
+ .load()
+ )
+
+ print(f"=== shared-scan mode: num_partitions={num_partitions} ===")
+ shared_partitions = shared.rdd.getNumPartitions()
+ print(f"=== shared-scan Spark partitions: {shared_partitions} ===")
+ assert shared_partitions == num_partitions, (
+ f"expected {num_partitions} Spark partitions in shared-scan mode, "
+ f"got {shared_partitions}"
+ )
+
+ shared.show(n=total_rows, truncate=False)
+ shared_rows = {tuple(r) for r in shared.collect()}
+ assert shared_rows == legacy_rows, (
+ "shared-scan rows diverge from legacy mode: "
+ f"only-legacy={legacy_rows - shared_rows} only-shared={shared_rows - legacy_rows}"
+ )
+ print(f"=== shared-scan returned the same {len(shared_rows)} rows as legacy mode ===")
+
+ print("=== shared-scan filter pushdown: value > 5.0 ===")
+ shared.filter("value > 5.0").show(n=total_rows, truncate=False)
+
+ # Note on cache scope: the executor cache is keyed by a per-query scanId,
+ # so sharing happens across the TASKS of one query (4 tasks above -> one
+ # provider build per executor JVM, in the bridge's native build_provider),
+ # not across separate actions. Each new
+ # action plans a new scan with a fresh scanId; its entry simply joins the
+ # cache until the idle TTL evicts it.
+ count_again = shared.count()
+ assert count_again == total_rows, f"expected {total_rows} rows, got {count_again}"
+ print("=== shared-scan count() as a separate action also succeeded ===")
+
+ spark.stop()
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java
new file mode 100644
index 0000000..dff42ee
--- /dev/null
+++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datafusion.examples;
+
+import io.datafusion.spark.NativeLibraryLoader;
+
+/**
+ * JNI surface generated on the Rust side by {@code export_bridge!} in {@code
+ * examples/native/src/lib.rs} with {@code jni_class =
+ * "org_apache_datafusion_examples_ExampleBridgeNative"} — the mangled binary name of THIS class.
+ * Renaming or moving this class requires regenerating the Rust macro invocation to match.
+ *
+ * <p>The cdylib is bundled inside this jar at {@code org/apache/datafusion/examples/<os>/<arch>/}
+ * (see the antrun execution in {@code examples/pom.xml}). For local hacking against an unpackaged
+ * build, {@code -Dexample.bridge.lib.path=/abs/path/to/libdatafusion_example_bridge.dylib} bypasses
+ * the bundled copy.
+ */
+final class ExampleBridgeNative {
+
+ // Static-only holder for the native methods below; never instantiated.
+ private ExampleBridgeNative() {}
+
+ // Load the cdylib once, when the class is initialized: an explicit, non-empty
+ // -Dexample.bridge.lib.path is loaded directly; otherwise NativeLibraryLoader
+ // is asked to load "datafusion_example_bridge" for this class.
+ static {
+ String explicit = System.getProperty("example.bridge.lib.path");
+ if (explicit != null && !explicit.isEmpty()) {
+ System.load(explicit);
+ } else {
+ NativeLibraryLoader.load(
+ ExampleBridgeNative.class, "org/apache/datafusion/examples", "datafusion_example_bridge");
+ }
+ }
+
+ // Native entry points implemented by the example bridge cdylib.
+ // NOTE(review): their exact contracts (handle ownership, error signalling) are
+ // defined by export_bridge! on the Rust side and are not visible from this file.
+
+ static native byte[] providerSchemaIpc(byte[] options, byte[] partition);
+
+ static native long createScan(
+ byte[] options,
+ byte[] partition,
+ int targetPartitions,
+ int batchSize,
+ String[] optionKeys,
+ String[] optionValues,
+ String[] projectionColumns,
+ byte[][] filterProtos);
+
+ static native int partitionCount(long scanHandle);
+
+ static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr);
+
+ static native void executeStream(long scanHandle, long ffiStreamAddr);
+
+ static native void closeScan(long scanHandle);
+}
diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java
new file mode 100644
index 0000000..5b4c921
--- /dev/null
+++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datafusion.examples;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+
+import io.datafusion.spark.BridgeProviderFactory;
+import io.datafusion.spark.PartitionInfo;
+import io.datafusion.spark.ScanBackend;
+
+/**
+ * Minimal {@link BridgeProviderFactory} that exposes the example {@code MemTable} built inside the
+ * example bridge cdylib (see {@code examples/native}) as a Spark DataSource V2 source.
+ *
+ * <p>Recognised Spark options:
+ *
+ * <ul>
+ *   <li>{@code name_prefix} — string stored in the options blob as UTF-8. Default {@code "row"}.
+ *   <li>{@code num_rows} — positive integer stored in the options blob. Default {@code 4}.
+ *   <li>{@code num_batches} — number of in-memory {@code RecordBatch}es composing the table.
+ *       Default {@code 1}.
+ *   <li>{@code num_partitions} — number of DataFusion-native MemTable partitions the batches are
+ *       distributed across (round-robin). Default {@code 1}. Mostly interesting together with
+ *       {@code shared_scan}.
+ *   <li>{@code shared_scan} — {@code true} opts into the connector's shared-scan mode: one cached
+ *       provider + plan per executor, one Spark task per MemTable partition. Default {@code false}
+ *       (single task via {@link #listPartitions(byte[])}).
+ * </ul>
+ *
+ * <p>Real bridges (HDF5, custom Iceberg, in-house formats) use a protobuf schema for {@code
+ * optionsBytes}; this example uses a hand-rolled length-prefixed binary format to keep the
+ * wire layer obvious:
+ *
+ * <pre>
+ *   [u32 LE name_prefix_len][name_prefix UTF-8 bytes][u32 LE num_rows][u32 LE num_batches]
+ *   [u32 LE num_partitions][u8 shared_scan]
+ * </pre>
+ *
+ * <p>An empty {@code byte[]} is also accepted by the native side and decoded as all defaults; the
+ * two trailing fields are optional so older blobs keep decoding.
+ *
+ * <p>In the default mode a single partition (id {@code "p0"}, empty {@code partitionBytes}, no
+ * preferred host) is reported so Spark spawns one task; the executor hands the options bytes to
+ * {@code ExampleBridgeNative.createScan}, which builds the {@code MemTable} provider in process and
+ * streams the resulting Arrow record batches back into the Spark scan.
+ */
+public final class ExampleBridgeProviderFactory implements BridgeProviderFactory {
+
+  static final String OPT_NAME_PREFIX = "name_prefix";
+  static final String OPT_NUM_ROWS = "num_rows";
+  static final String OPT_NUM_BATCHES = "num_batches";
+  static final String OPT_NUM_PARTITIONS = "num_partitions";
+  static final String OPT_SHARED_SCAN = "shared_scan";
+
+  static final String DEFAULT_NAME_PREFIX = "row";
+  static final int DEFAULT_NUM_ROWS = 4;
+  static final int DEFAULT_NUM_BATCHES = 1;
+  static final int DEFAULT_NUM_PARTITIONS = 1;
+
+  public ExampleBridgeProviderFactory() {}
+
+  /**
+   * Validates the Spark options and serialises them into the little-endian, length-prefixed
+   * layout described in the class documentation.
+   *
+   * @param sparkOptions options supplied by the reader; absent keys fall back to the defaults
+   * @return the encoded options blob, always including both trailing fields
+   * @throws IllegalArgumentException if a numeric option is not a positive integer
+   */
+  @Override
+  public byte[] encodeOptions(Map<String, String> sparkOptions) {
+    String namePrefix = sparkOptions.getOrDefault(OPT_NAME_PREFIX, DEFAULT_NAME_PREFIX);
+    int numRows = parsePositiveInt(sparkOptions, OPT_NUM_ROWS, DEFAULT_NUM_ROWS);
+    int numBatches = parsePositiveInt(sparkOptions, OPT_NUM_BATCHES, DEFAULT_NUM_BATCHES);
+    int numPartitions = parsePositiveInt(sparkOptions, OPT_NUM_PARTITIONS, DEFAULT_NUM_PARTITIONS);
+    boolean sharedScan = Boolean.parseBoolean(sparkOptions.getOrDefault(OPT_SHARED_SCAN, "false"));
+
+    byte[] nameBytes = namePrefix.getBytes(StandardCharsets.UTF_8);
+    // 4 (name length) + name + 4 (num_rows) + 4 (num_batches) + 4 (num_partitions) + 1 (flag).
+    ByteBuffer buf =
+        ByteBuffer.allocate(4 + nameBytes.length + 4 + 4 + 4 + 1).order(ByteOrder.LITTLE_ENDIAN);
+    buf.putInt(nameBytes.length);
+    buf.put(nameBytes);
+    buf.putInt(numRows);
+    buf.putInt(numBatches);
+    buf.putInt(numPartitions);
+    buf.put((byte) (sharedScan ? 1 : 0));
+    return buf.array();
+  }
+
+  /**
+   * Reports the single partition used in the default (non-shared) mode.
+   *
+   * @param optionsBytes the encoded options blob; not inspected here
+   * @return one partition with id {@code "p0"}, empty payload and no preferred locations
+   */
+  @Override
+  public PartitionInfo[] listPartitions(byte[] optionsBytes) {
+    // Single partition; the example MemTable is not actually sliced. A real bridge would
+    // populate `partitionBytes` per slice and `preferredLocations` with the hosts holding it.
+    return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])};
+  }
+
+  /**
+   * Filter-aware variant: logs how many pushed filters arrived, then returns the same single
+   * partition as {@link #listPartitions(byte[])}.
+   *
+   * @param optionsBytes the encoded options blob
+   * @param filterProtoBytes the pushed filters, one serialized blob per filter
+   * @return the unpruned partition list
+   */
+  @Override
+  public PartitionInfo[] listPartitions(byte[] optionsBytes, byte[][] filterProtoBytes) {
+    // The example cannot prune its single partition, but a real bridge would inspect the
+    // pushed predicates here and drop partitions that cannot match.
+    System.out.println(
+        "ExampleBridgeProviderFactory.listPartitions received "
+            + filterProtoBytes.length
+            + " pushed filter(s)");
+    return listPartitions(optionsBytes);
+  }
+
+  /**
+   * Tells the connector whether the blob opts into shared-scan mode.
+   *
+   * @param optionsBytes the encoded options blob; may be {@code null}, empty, or an older blob
+   *     without the trailing fields
+   * @return {@code true} only when the trailing fields are present and the flag byte is {@code 1}
+   */
+  @Override
+  public boolean sharedScan(byte[] optionsBytes) {
+    // The flag is the final byte of the options blob (present only when the encoder wrote the
+    // trailing fields). The bridge owns its wire format, so decoding it here is fair game.
+    return optionsBytes != null
+        && hasTrailingFields(optionsBytes)
+        && optionsBytes[optionsBytes.length - 1] == 1;
+  }
+
+  /**
+   * Checks whether the blob is long enough to carry {@code num_partitions} and {@code
+   * shared_scan} after the base fields. A malformed blob (too short, or with a negative declared
+   * name length) is reported as not having them.
+   */
+  private static boolean hasTrailingFields(byte[] bytes) {
+    if (bytes.length < 4) {
+      return false;
+    }
+    int nameLen = ByteBuffer.wrap(bytes, 0, 4).order(ByteOrder.LITTLE_ENDIAN).getInt();
+    if (nameLen < 0) {
+      // The wire field is unsigned; a value that reads as negative cannot fit in a Java array.
+      return false;
+    }
+    // base layout: 4 (len) + name + 4 (num_rows) + 4 (num_batches); trailing adds 4 + 1.
+    // Summed as long so a huge declared length cannot wrap around to a small requirement.
+    long required = 4L + nameLen + 8L + 5L;
+    return bytes.length >= required;
+  }
+
+  /** Returns a fresh backend that routes scan calls to the example bridge's native library. */
+  @Override
+  public ScanBackend scanBackend() {
+    return new ExampleScanBackend();
+  }
+
+  /**
+   * Reads an integer option that must be strictly positive.
+   *
+   * @param opts the Spark options
+   * @param key the option name
+   * @param defaultValue value used when the option is absent or empty
+   * @return the parsed value, or {@code defaultValue}
+   * @throws IllegalArgumentException if the value is not an integer or is {@code <= 0}
+   */
+  private static int parsePositiveInt(Map<String, String> opts, String key, int defaultValue) {
+    String raw = opts.get(key);
+    if (raw == null || raw.isEmpty()) {
+      return defaultValue;
+    }
+    int parsed;
+    try {
+      parsed = Integer.parseInt(raw.trim());
+    } catch (NumberFormatException e) {
+      // Keep the parse failure as the cause so the original diagnostic is not lost.
+      throw new IllegalArgumentException(
+          "ExampleBridgeProviderFactory: option '" + key + "' must be an integer, got: " + raw, e);
+    }
+    if (parsed <= 0) {
+      throw new IllegalArgumentException(
+          "ExampleBridgeProviderFactory: option '" + key + "' must be > 0, got: " + parsed);
+    }
+    return parsed;
+  }
+}
diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java
new file mode 100644
index 0000000..9854817
--- /dev/null
+++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datafusion.examples;
+
+import io.datafusion.spark.ScanBackend;
+
+/**
+ * Routes the connector's scan calls to the example bridge cdylib. Pure delegation: the class has
+ * no fields, and every method passes its arguments unchanged to the static native method of the
+ * same name on {@code ExampleBridgeNative}, returning whatever that call returns.
+ */
+final class ExampleScanBackend implements ScanBackend {
+
+ /** Forwards to {@code ExampleBridgeNative.providerSchemaIpc}. */
+ @Override
+ public byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes) {
+ return ExampleBridgeNative.providerSchemaIpc(options, partitionBytes);
+ }
+
+ /** Forwards to {@code ExampleBridgeNative.createScan}; the returned value is the scan handle. */
+ @Override
+ public long createScan(
+ byte[] options,
+ byte[] partitionBytes,
+ int targetPartitions,
+ int batchSize,
+ String[] optionKeys,
+ String[] optionValues,
+ String[] projectionColumns,
+ byte[][] filterProtos) {
+ return ExampleBridgeNative.createScan(
+ options,
+ partitionBytes,
+ targetPartitions,
+ batchSize,
+ optionKeys,
+ optionValues,
+ projectionColumns,
+ filterProtos);
+ }
+
+ /** Forwards to {@code ExampleBridgeNative.partitionCount}. */
+ @Override
+ public int partitionCount(long scanHandle) {
+ return ExampleBridgeNative.partitionCount(scanHandle);
+ }
+
+ /** Forwards to {@code ExampleBridgeNative.executeStreamPartition}. */
+ @Override
+ public void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr) {
+ ExampleBridgeNative.executeStreamPartition(scanHandle, partition, ffiStreamAddr);
+ }
+
+ /** Forwards to {@code ExampleBridgeNative.executeStream}. */
+ @Override
+ public void executeStream(long scanHandle, long ffiStreamAddr) {
+ ExampleBridgeNative.executeStream(scanHandle, ffiStreamAddr);
+ }
+
+ /** Forwards to {@code ExampleBridgeNative.closeScan}. */
+ @Override
+ public void closeScan(long scanHandle) {
+ ExampleBridgeNative.closeScan(scanHandle);
+ }
+}
diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml
new file mode 100644
index 0000000..0a797b4
--- /dev/null
+++ b/native-common/Cargo.toml
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion-jni-common"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[features]
+# `datafusion-jni` builds DataFusion with `avro`, which adds the
+# `DataFusionError::AvroError` variant our classifier maps to IoException.
+# Feature-forwarded so consumers that don't read Avro (the Spark helper)
+# don't pull the apache-avro stack into their cdylib.
+avro = ["datafusion/avro"]
+
+[dependencies]
+datafusion = { workspace = true }
+futures = { workspace = true }
+jni = { workspace = true }
+tokio = { workspace = true }
diff --git a/native/src/errors.rs b/native-common/src/errors.rs
similarity index 97%
rename from native/src/errors.rs
rename to native-common/src/errors.rs
index d926544..caa2540 100644
--- a/native/src/errors.rs
+++ b/native-common/src/errors.rs
@@ -96,8 +96,11 @@ fn classify(err: &DataFusionError) -> &'static str {
}
DataFusionError::IoError(_)
| DataFusionError::ObjectStore(_)
- | DataFusionError::ParquetError(_)
- | DataFusionError::AvroError(_) => "org/apache/datafusion/IoException",
+ | DataFusionError::ParquetError(_) => "org/apache/datafusion/IoException",
+ // The AvroError variant only exists when DataFusion is built with its
+ // `avro` feature, forwarded by this crate's own `avro` feature.
+ #[cfg(feature = "avro")]
+ DataFusionError::AvroError(_) => "org/apache/datafusion/IoException",
// ArrowError is a 21-variant grab bag -- only some of those variants
// are actually IO-shaped. DivideByZero / ArithmeticOverflow / Compute
// / Cast / InvalidArgument / Memory etc. are execution-time failures
diff --git a/native-common/src/lib.rs b/native-common/src/lib.rs
new file mode 100644
index 0000000..f143d43
--- /dev/null
+++ b/native-common/src/lib.rs
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! JNI plumbing shared by this workspace's native crates (`datafusion-jni`
+//! and `datafusion-spark-bridge`, and through the latter every bridge
+//! cdylib): the error-to-Java-exception mapping, the per-cdylib Tokio
+//! runtime singleton, and the async-stream-to-`FFI_ArrowArrayStream`
+//! bridge.
+//!
+//! Each cdylib statically links its own copy of this rlib, so [`runtime`] is
+//! a per-cdylib singleton -- exactly the behaviour each crate had when this
+//! code lived inline. Nothing here is exported with `#[no_mangle]`, so
+//! linking this crate into several cdylibs loaded in one JVM cannot collide.
+
+pub mod errors;
+
+use std::panic::{catch_unwind, AssertUnwindSafe};
+use std::sync::OnceLock;
+
+use datafusion::arrow::array::RecordBatch;
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::arrow::error::ArrowError;
+use datafusion::arrow::record_batch::RecordBatchReader;
+use datafusion::execution::SendableRecordBatchStream;
+use futures::StreamExt;
+use tokio::runtime::{Handle, Runtime};
+
+/// Backing cell for the runtime singleton; populated on first use by
+/// [`runtime_with_init`].
+static RT: OnceLock<Runtime> = OnceLock::new();
+
+/// Returns the cdylib-wide Tokio runtime, creating it on first use.
+///
+/// Equivalent to [`runtime_with_init`] with a hook that does nothing.
+pub fn runtime() -> &'static Runtime {
+    runtime_with_init(|_handle: &Handle| {})
+}
+
+/// Returns the same singleton as [`runtime`], running `init` against the
+/// runtime's handle if -- and only if -- this call is the one that creates
+/// the runtime. `datafusion-jni` uses the hook to install its
+/// runtime-metrics accumulator so the sampling baseline coincides with
+/// runtime start. Once the runtime exists, every later call (through either
+/// entry point) returns it without invoking the hook.
+///
+/// # Panics
+///
+/// Panics if the Tokio runtime cannot be created.
+pub fn runtime_with_init(init: impl FnOnce(&Handle)) -> &'static Runtime {
+    let build = move || {
+        let runtime = Runtime::new().expect("failed to create Tokio runtime");
+        init(runtime.handle());
+        runtime
+    };
+    RT.get_or_init(build)
+}
+
+/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous
+/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore
+/// the Java `ArrowReader`) consumes. Each call to `next()` drives one
+/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the
+/// executor pipeline plus a single in-flight batch.
+pub struct StreamingReader {
+ /// Schema handed out by the [`RecordBatchReader::schema`] implementation.
+ pub schema: SchemaRef,
+ /// The async stream that each `next()` call blocks on for one item.
+ pub stream: SendableRecordBatchStream,
+}
+
+impl Iterator for StreamingReader {
+    type Item = Result<RecordBatch, ArrowError>;
+
+    /// Blocks on the shared runtime until the stream yields its next item.
+    ///
+    /// Returns `None` when the stream is exhausted. Stream errors are wrapped
+    /// in [`ArrowError::ExternalError`]; a panic raised while polling is
+    /// caught and reported the same way instead of unwinding.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's
+        // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic
+        // here (buggy UDF, arrow cast that panics, runtime poison) would
+        // unwind across C/FFI -- undefined behaviour. Catch it and surface as
+        // an ArrowError so the Java side sees a normal exception instead.
+        let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next())));
+        match next {
+            Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))),
+            Err(panic) => {
+                // Panic payloads are `String` for formatted messages and
+                // `&'static str` for literal ones; anything else is opaque.
+                let msg = if let Some(s) = panic.downcast_ref::<String>() {
+                    s.clone()
+                } else if let Some(s) = panic.downcast_ref::<&str>() {
+                    (*s).to_string()
+                } else {
+                    "rust panic with non-string payload".to_string()
+                };
+                Some(Err(ArrowError::ExternalError(
+                    format!("panic in DataFrame stream: {msg}").into(),
+                )))
+            }
+        }
+    }
+}
+
+impl RecordBatchReader for StreamingReader {
+    /// Hands out another reference to the schema stored on the reader.
+    fn schema(&self) -> SchemaRef {
+        SchemaRef::clone(&self.schema)
+    }
+}
diff --git a/native/Cargo.toml b/native/Cargo.toml
index c462408..0f4ca83 100644
--- a/native/Cargo.toml
+++ b/native/Cargo.toml
@@ -23,8 +23,8 @@ publish = false
[lib]
# `rlib` alongside `cdylib` so `cargo test` has a Rust-level harness for
-# native-only invariants (e.g. error-classification routing through wrapped
-# DataFusionError chains). The `cdylib` is still the artifact the JVM loads.
+# native-only invariants (the error-classification tests now live in
+# `datafusion-jni-common`). The `cdylib` is still the artifact the JVM loads.
crate-type = ["cdylib", "rlib"]
[features]
@@ -69,24 +69,23 @@ protoc = ["datafusion-substrait?/protoc"]
runtime-metrics = ["dep:tokio-metrics"]
[dependencies]
-arrow = { version = "58", features = ["ffi"] }
-async-trait = "0.1"
-datafusion = { version = "53.1.0", features = ["avro"] }
-datafusion-proto = "53.1.0"
-datafusion-substrait = { version = "53.1.0", optional = true }
-futures = "0.3"
-jni = "0.21"
-# Pin to the same major as DataFusion 53.1 pulls in transitively (0.13.x)
-# so we share the same `dyn ObjectStore` vtable and don't double-link.
-object_store = { version = "0.13", default-features = false }
-prost = "0.14"
-tokio = { version = "1", features = ["rt-multi-thread"] }
-# Tokio runtime metrics. Optional + cfg-gated: this crate's API surface lives
-# behind `--cfg tokio_unstable`, so enabling the `runtime-metrics` feature also
-# requires the caller to set `RUSTFLAGS="--cfg tokio_unstable"` at build time.
-tokio-metrics = { version = "0.5", optional = true }
-url = "2"
+arrow = { workspace = true }
+async-trait = { workspace = true }
+datafusion = { workspace = true, features = ["avro"] }
+# Shared JNI plumbing (error->exception mapping, runtime singleton,
+# StreamingReader). `avro` keeps the classifier's AvroError->IoException arm
+# in sync with the `avro` feature on `datafusion` above.
+datafusion-jni-common = { path = "../native-common", features = ["avro"] }
+datafusion-proto = { workspace = true }
+datafusion-substrait = { workspace = true, optional = true }
+futures = { workspace = true }
+jni = { workspace = true }
+object_store = { workspace = true }
+prost = { workspace = true }
+tokio = { workspace = true }
+tokio-metrics = { workspace = true, optional = true }
+url = { workspace = true }
[build-dependencies]
-prost-build = "0.14"
-protoc-bin-vendored = "3"
+prost-build = { workspace = true }
+protoc-bin-vendored = { workspace = true }
diff --git a/native/src/arrow.rs b/native/src/arrow.rs
index 2bbe7b0..67e5caf 100644
--- a/native/src/arrow.rs
+++ b/native/src/arrow.rs
@@ -23,10 +23,10 @@ use jni::sys::jlong;
use jni::JNIEnv;
use prost::Message;
-use crate::errors::{try_unwrap_or_throw, JniResult};
use crate::proto_gen::ArrowReadOptionsProto;
use crate::runtime;
use crate::schema::decode_optional_schema;
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
fn with_arrow_options(
env: &mut JNIEnv,
diff --git a/native/src/avro.rs b/native/src/avro.rs
index 85d4a07..257ae32 100644
--- a/native/src/avro.rs
+++ b/native/src/avro.rs
@@ -23,10 +23,10 @@ use jni::sys::jlong;
use jni::JNIEnv;
use prost::Message;
-use crate::errors::{try_unwrap_or_throw, JniResult};
use crate::proto_gen::AvroReadOptionsProto;
use crate::runtime;
use crate::schema::decode_optional_schema;
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
fn with_avro_options(
env: &mut JNIEnv,
diff --git a/native/src/cache_manager.rs b/native/src/cache_manager.rs
index 3b9e286..ec38dc8 100644
--- a/native/src/cache_manager.rs
+++ b/native/src/cache_manager.rs
@@ -34,8 +34,8 @@ use datafusion::execution::cache::cache_unit::{
};
use datafusion::execution::cache::DefaultListFilesCache;
-use crate::errors::JniResult;
use crate::proto_gen::CacheManagerOptionsProto;
+use datafusion_jni_common::errors::JniResult;
/// Build a [`CacheManagerConfig`] from the proto. Returns `Ok(None)` if the
/// caller did not set any cache-manager field, so the JNI layer can skip the
diff --git a/native/src/csv.rs b/native/src/csv.rs
index 3ae4627..b79ed59 100644
--- a/native/src/csv.rs
+++ b/native/src/csv.rs
@@ -26,12 +26,12 @@ use jni::sys::jlong;
use jni::JNIEnv;
use prost::Message;
-use crate::errors::{try_unwrap_or_throw, JniResult};
use crate::proto_gen::{
CsvReadOptionsProto, CsvWriteOptionsProto, FileCompressionType as ProtoFileCompressionType,
};
use crate::runtime;
use crate::schema::decode_optional_schema;
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
fn with_csv_options(
env: &mut JNIEnv,
diff --git a/native/src/json.rs b/native/src/json.rs
index 8eea32f..b87be78 100644
--- a/native/src/json.rs
+++ b/native/src/json.rs
@@ -27,12 +27,12 @@ use jni::sys::jlong;
use jni::JNIEnv;
use prost::Message;
-use crate::errors::{try_unwrap_or_throw, JniResult};
use crate::proto_gen::{
FileCompressionType as ProtoFileCompressionType, JsonWriteOptionsProto, NdJsonReadOptionsProto,
};
use crate::runtime;
use crate::schema::decode_optional_schema;
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
fn with_json_options(
env: &mut JNIEnv,
diff --git a/native/src/lib.rs b/native/src/lib.rs
index 4fd7a8a..6e1a79f 100644
--- a/native/src/lib.rs
+++ b/native/src/lib.rs
@@ -19,7 +19,6 @@ mod arrow;
mod avro;
mod cache_manager;
mod csv;
-mod errors;
mod jni_util;
mod json;
mod memory;
@@ -34,16 +33,13 @@ pub(crate) mod proto_gen {
include!(concat!(env!("OUT_DIR"), "/datafusion_java.rs"));
}
-use std::panic::{catch_unwind, AssertUnwindSafe};
use std::path::PathBuf;
use std::sync::{Arc, OnceLock};
-use datafusion::arrow::array::RecordBatch;
use datafusion::arrow::datatypes::SchemaRef;
-use datafusion::arrow::error::ArrowError;
use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream;
use datafusion::arrow::ipc::writer::StreamWriter;
-use datafusion::arrow::record_batch::{RecordBatchIterator, RecordBatchReader};
+use datafusion::arrow::record_batch::RecordBatchIterator;
use datafusion::common::{JoinType, UnnestOptions};
use datafusion::config::TableParquetOptions;
use datafusion::dataframe::DataFrame;
@@ -51,11 +47,9 @@ use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::error::DataFusionError;
use datafusion::execution::disk_manager::{DiskManagerBuilder, DiskManagerMode};
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
-use datafusion::execution::SendableRecordBatchStream;
use datafusion::logical_expr::Expr;
use datafusion::logical_expr::{col, Partitioning, ScalarUDF, Signature, SortExpr};
use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
-use futures::StreamExt;
use jni::objects::{JBooleanArray, JByteArray, JClass, JObject, JObjectArray, JString};
use jni::sys::{jboolean, jbyte, jbyteArray, jint, jlong};
use jni::JNIEnv;
@@ -63,7 +57,10 @@ use jni::JavaVM;
use prost::Message;
use tokio::runtime::Runtime;
-use crate::errors::{try_unwrap_or_throw, JniResult};
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
+// Re-exported so sibling modules keep their crate-local `crate::StreamingReader` path.
+pub(crate) use datafusion_jni_common::StreamingReader;
+
use crate::proto_gen::ParquetReadOptionsProto;
use crate::proto_gen::SessionOptions;
use crate::schema::decode_optional_schema;
@@ -84,18 +81,15 @@ pub(crate) fn jvm() -> &'static JavaVM {
}
pub(crate) fn runtime() -> &'static Runtime {
- static RT: OnceLock = OnceLock::new();
- RT.get_or_init(|| {
- let rt = Runtime::new().expect("failed to create Tokio runtime");
- // Eagerly install the runtime-metrics accumulator (no-op when the
- // `runtime-metrics` Cargo feature is off). Initialising here -- not
- // lazily on the first `runtimeStats()` call -- means the
- // RuntimeMonitor's sampling baseline coincides with runtime start, so
- // poll/park/busy totals reflect activity from the first query onward
- // rather than from the first observation.
- crate::runtime_metrics::init(rt.handle());
- rt
- })
+ // The singleton itself lives in datafusion-jni-common (shared with the
+ // datafusion-spark-bridge SDK; each cdylib statically links its own
+ // copy, so the runtime stays per-library). The init hook eagerly installs the
+ // runtime-metrics accumulator (no-op when the `runtime-metrics` Cargo
+ // feature is off). Initialising here -- not lazily on the first
+ // `runtimeStats()` call -- means the RuntimeMonitor's sampling baseline
+ // coincides with runtime start, so poll/park/busy totals reflect activity
+ // from the first query onward rather than from the first observation.
+ datafusion_jni_common::runtime_with_init(crate::runtime_metrics::init)
}
/// Wrap the (already-built) `RuntimeEnvBuilder`'s memory pool with a
@@ -289,50 +283,6 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_collectDataFrame<'lo
})
}
-/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous
-/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore
-/// the Java `ArrowReader`) consumes. Each call to `next()` drives one
-/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the
-/// executor pipeline plus a single in-flight batch.
-struct StreamingReader {
- schema: SchemaRef,
- stream: SendableRecordBatchStream,
-}
-
-impl Iterator for StreamingReader {
- type Item = Result;
-
- fn next(&mut self) -> Option {
- // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's
- // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic
- // here (buggy UDF, arrow cast that panics, runtime poison) would
- // unwind across C/FFI -- undefined behaviour. Catch it and surface as
- // an ArrowError so the Java side sees a normal exception instead.
- let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next())));
- match next {
- Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))),
- Err(panic) => {
- let msg = if let Some(s) = panic.downcast_ref::() {
- s.clone()
- } else if let Some(s) = panic.downcast_ref::<&str>() {
- (*s).to_string()
- } else {
- "rust panic with non-string payload".to_string()
- };
- Some(Err(ArrowError::ExternalError(
- format!("panic in DataFrame stream: {msg}").into(),
- )))
- }
- }
- }
-}
-
-impl RecordBatchReader for StreamingReader {
- fn schema(&self) -> SchemaRef {
- self.schema.clone()
- }
-}
-
#[no_mangle]
pub extern "system" fn Java_org_apache_datafusion_DataFrame_executeStreamDataFrame<'local>(
mut env: JNIEnv<'local>,
diff --git a/native/src/object_store.rs b/native/src/object_store.rs
index eefccf2..985d721 100644
--- a/native/src/object_store.rs
+++ b/native/src/object_store.rs
@@ -28,9 +28,9 @@ use std::sync::Arc;
use datafusion::prelude::SessionContext;
use url::Url;
-use crate::errors::JniResult;
use crate::proto_gen::object_store_registration::Backend;
use crate::proto_gen::ObjectStoreRegistration;
+use datafusion_jni_common::errors::JniResult;
#[cfg(feature = "object-store-gcp")]
use crate::proto_gen::GcsOptions;
diff --git a/native/src/proto.rs b/native/src/proto.rs
index 4f187bc..c1315f9 100644
--- a/native/src/proto.rs
+++ b/native/src/proto.rs
@@ -28,8 +28,8 @@ use jni::sys::{jbyteArray, jlong};
use jni::JNIEnv;
use prost::Message;
-use crate::errors::{try_unwrap_or_throw, JniResult};
use crate::runtime;
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
#[no_mangle]
pub extern "system" fn Java_org_apache_datafusion_SessionContext_createDataFrameFromProto<
diff --git a/native/src/runtime_metrics.rs b/native/src/runtime_metrics.rs
index e69410e..dd60dcb 100644
--- a/native/src/runtime_metrics.rs
+++ b/native/src/runtime_metrics.rs
@@ -38,7 +38,7 @@
//! 10 totalOverflowCount
#[cfg(not(feature = "runtime-metrics"))]
-use crate::errors::JniResult;
+use datafusion_jni_common::errors::JniResult;
/// Number of i64 values in the snapshot array; kept here so the Java side and
/// the feature-off stub agree on the layout.
@@ -51,7 +51,7 @@ mod imp {
use tokio_metrics::{RuntimeIntervals, RuntimeMonitor};
use super::STATS_FIELD_COUNT;
- use crate::errors::JniResult;
+ use datafusion_jni_common::errors::JniResult;
/// `RuntimeMonitor::intervals().next()` returns *delta* metrics covering
/// the period since the previous call (or, on the very first call, since
@@ -196,7 +196,7 @@ pub fn runtime_stats() -> JniResult<[i64; STATS_FIELD_COUNT]> {
Err(
"datafusion-jni was built without the `runtime-metrics` Cargo feature; \
rebuild the native crate with \
- `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build --features runtime-metrics` \
+ `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build -p datafusion-jni --features runtime-metrics` \
to enable SessionContext.runtimeStats"
.into(),
)
diff --git a/native/src/schema.rs b/native/src/schema.rs
index 968a73a..0c3c7ab 100644
--- a/native/src/schema.rs
+++ b/native/src/schema.rs
@@ -20,7 +20,7 @@ use datafusion::arrow::ipc::reader::StreamReader;
use jni::objects::JByteArray;
use jni::JNIEnv;
-use crate::errors::JniResult;
+use datafusion_jni_common::errors::JniResult;
/// Decode an optional Arrow-IPC schema byte array passed in from Java.
/// Returns `None` if the byte-array reference is null.
diff --git a/pom.xml b/pom.xml
index 6210841..6baeb94 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,6 +32,7 @@ under the License.
core
+ sparkexamples
@@ -95,6 +96,11 @@ under the License.
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.13.0
+ org.apache.maven.pluginsmaven-surefire-plugin
@@ -159,6 +165,7 @@ under the License.
README.mdCONTRIBUTING.mddocs/**
+ **/*.md.gitignore.idea/**
@@ -173,12 +180,17 @@ under the License.
.mvn/****/target/**
- native/target/**
+ rust-target/**tpch-data/**
-
- native/Cargo.lock
+
+ Cargo.lock
+
+ **/META-INF/services/**dev/release/rat_exclude_files.txt
+
+ spark/scaffold/bridge-template/**
diff --git a/spark/README.md b/spark/README.md
new file mode 100644
index 0000000..5cc3d3c
--- /dev/null
+++ b/spark/README.md
@@ -0,0 +1,454 @@
+# DataFusion Spark Connector
+
+This module (`datafusion-java-spark`) lets you expose a [DataFusion
+`TableProvider`](https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html)
+written in Rust as an [Apache Spark DataSource
+V2](https://spark.apache.org/docs/latest/sql-data-sources.html) table. If you
+have data that DataFusion can already read — an in-house file format, a custom
+catalog, a remote service — this connector is the bridge that makes
+`spark.read.format(...)` work against it, with predicate pushdown, column
+pruning, and partitioned parallel reads.
+
+You write two small pieces (a Rust function and a Java class); the connector
+supplies everything else.
+
+## How it fits together
+
+Two layers, one of which already exists:
+
+```
+   your bridge (you write this)         this module (already written)
++--------------------------------+   +----------------------------------+
+| cdylib on datafusion-spark-    |   | Scala/Java DSv2 plumbing         |
+| bridge (spark/bridge SDK):     |   | (spark/src) schema inference,    |
+| your TableProvider + one       |<--| pushdown, task planning,         |
+| export_bridge! invocation;     |-->| shared-scan cache                |
+| the SDK supplies widening,     |   |                                  |
+| session, filters, planning,    |   | (pure JVM — all native code      |
+| partition streams              |   | ships inside YOUR jar)           |
++--------------------------------+   +----------------------------------+
+                                                       |
+                                                       v
+                                        spark.read.format("...").load()
+```
+
+The only things that cross between the JVM and your cdylib are opaque
+`byte[]` blobs that *you* define (options and per-partition payloads — the
+connector never inspects them) going in, and Arrow C streams coming back.
+Everything DataFusion-side (planning, filter application, execution) happens
+inside your bridge's native library. There is no DataFusion session on the
+JVM side at all, and no `FFI_TableProvider` boundary anywhere — your
+concrete provider is linked into the same cdylib as the scan machinery.
+
+## Getting started: generate a bridge
+
+Don't hand-assemble the pieces below — stamp them out:
+
+```bash
+python3 spark/scaffold/new_bridge.py --name acme --package com.example.acme
+```
+
+generates a standalone project (Rust cdylib with a working demo provider,
+the four Java classes, service registration, shaded-jar pom with the cdylib
+bundled, pyspark smoke test, README with the build commands). Replace the
+demo `MemTable` in its `native/src/lib.rs` and you have a connector. The
+sections below explain what each generated piece is for.
+
+## What you implement
+
+| # | Piece | Language | Contract lives at | Working example |
+|---|-------|----------|-------------------|-----------------|
+| 1 | A provider builder + one `export_bridge!` invocation | Rust | [`bridge/src/lib.rs`](bridge/src/lib.rs) (macro rustdoc) | [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) |
+| 2 | A `BridgeProviderFactory` implementation (one required method) + the JNI/backend boilerplate | Java | [`src/main/java/io/datafusion/spark/BridgeProviderFactory.java`](src/main/java/io/datafusion/spark/BridgeProviderFactory.java) | [`examples/.../ExampleBridgeProviderFactory.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java) |
+| 3 | (optional) A `DatafusionSource` subclass giving your source a short name | Scala/Java | [`src/main/scala/io/datafusion/spark/DatafusionSource.scala`](src/main/scala/io/datafusion/spark/DatafusionSource.scala) | see "Wiring it into Spark" below |
+
+An end-to-end runnable version of all three — in-memory table, factory, and a
+PySpark script that scans, filters, and projects it — lives under
+[`examples/python/`](../examples/python/).
+
+### 1. The Rust side
+
+Depend on the [`datafusion-spark-bridge`](bridge/) SDK crate and let it
+generate the JNI surface; you supply one builder turning your option /
+partition bytes into a concrete `TableProvider`:
+
+```rust
+use std::sync::Arc;
+use datafusion_spark_bridge::datafusion::catalog::TableProvider;
+use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult};
+
+fn build_provider(
+ ctx: &BridgeContext,
+ options: &[u8],
+ partition: &[u8],
+) -> JniResult<Arc<dyn TableProvider>> {
+ let opts = MyOptions::decode(options)?;
+ Ok(ctx.block_on(MyProvider::connect(opts, partition))?)
+}
+
+export_bridge! {
+ // Underscore-mangled name of YOUR Java class declaring the native
+ // methods (dots -> underscores). Per-bridge names let several bridges
+ // coexist in one Spark JVM.
+ jni_class: "com_example_mybridge_BridgeNative",
+ build_provider: build_provider,
+}
+```
+
+The macro's rustdoc lists the exact `static native` method set the named
+Java class must declare; your factory routes the connector to it by
+overriding `scanBackend()` (see section 2). One cdylib total: your provider
+and the SDK's scan machinery are the same library, so there is no provider
+hand-off across a binary boundary and no `datafusion-ffi` anywhere. The
+builder receives empty partition bytes for the driver-side schema probe —
+schema must not depend on per-partition state.
+
+[`examples/native/src/lib.rs`](../examples/native/src/lib.rs)
+is a complete, commented version of this for a `MemTable`.
+
+### 2. The Java factory
+
+`BridgeProviderFactory` is the contract between Spark and your bridge. It
+must have a no-arg constructor (executors instantiate it reflectively by
+class name). The single required method is `scanBackend()` — Spark options
+are encoded with `OptionsCodec` by default (decode them in Rust via
+`datafusion_spark_bridge::options::decode_options`), and `listPartitions`
+defaults to one whole-dataset partition:
+
+```java
+public final class MyBridgeProviderFactory implements BridgeProviderFactory {
+
+ @Override
+ public ScanBackend scanBackend() {
+ return new MyBridgeBackend(); // six one-line delegations to BridgeNative
+ }
+}
+
+/** Declares the native methods generated by export_bridge! and loads the cdylib. */
+final class BridgeNative {
+ static {
+ NativeLibraryLoader.load(BridgeNative.class, "com/example/mybridge", "my_bridge");
+ }
+ static native byte[] providerSchemaIpc(byte[] options, byte[] partition);
+ static native long createScan(byte[] options, byte[] partition,
+ int targetPartitions, int batchSize, String[] optionKeys,
+ String[] optionValues, String[] projectionColumns, byte[][] filterProtos);
+ static native int partitionCount(long scanHandle);
+ static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr);
+ static native void executeStream(long scanHandle, long ffiStreamAddr);
+ static native void closeScan(long scanHandle);
+}
+```
+
+(`MyBridgeBackend implements ScanBackend` forwards each method to
+`BridgeNative` — pure boilerplate the scaffold generates.)
+
+Override `encodeOptions` only if the bridge already has its own options
+schema (e.g. a protobuf), and `listPartitions` when the dataset should split
+into more than one Spark task:
+
+```java
+ @Override
+ public PartitionInfo[] listPartitions(byte[] optionsBytes) {
+ MySlice[] slices = MyBridgeNative.listSlices(optionsBytes);
+ PartitionInfo[] out = new PartitionInfo[slices.length];
+ for (int i = 0; i < slices.length; i++) {
+ out[i] = new PartitionInfo(slices[i].id(), slices[i].payload(), slices[i].hosts());
+ }
+ return out;
+ }
+```
+
+The remaining optional methods — `sharedScan`, `reportPartitioning`, and the
+filter-aware `listPartitions(opts, filters)` overload — are covered in their
+own sections below. Their javadoc in
+[`BridgeProviderFactory.java`](src/main/java/io/datafusion/spark/BridgeProviderFactory.java)
+is the authoritative contract.
+
+### 3. Wiring it into Spark
+
+Either pass your factory class per read:
+
+```python
+df = (spark.read.format("datafusion")
+ .option("df.factory", "com.example.MyBridgeProviderFactory")
+ .option("url", "...")
+ .option("table", "my_dataset")
+ .load())
+```
+
+or ship a ~10-line subclass so users get a short format name:
+
+```scala
+class MyDataSource extends DatafusionSource {
+ override def shortName(): String = "my_format"
+ override protected def factoryFqcn(opts: CaseInsensitiveStringMap): String =
+ "com.example.MyBridgeProviderFactory"
+}
+```
+
+registered via a
+`META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` file
+(this module registers `datafusion` the same way — see
+[`src/main/resources/META-INF/services/`](src/main/resources/META-INF/services/)).
+
+## Packaging your bridge
+
+The end-user experience to aim for is one artifact:
+
+```python
+# spark.jars (or --packages) gets exactly one jar, then:
+df = spark.read.format("my_format").option("url", "...").load()
+```
+
+Three pieces make that work:
+
+**Bundle your cdylib inside the jar.** Copy it into your jar's resources at
+`<resource-prefix>/<os>/<arch>/<library-file>` and load it from your native
+class's static initializer with the connector's loader — no hand-rolled
+extraction code:
+
+```java
+static {
+ NativeLibraryLoader.load(BridgeNative.class, "com/example/mybridge", "my_bridge");
+}
+```
+
+The pom side is one antrun copy execution plus per-host profiles; the
+examples module is a complete working copy of the pattern (see the
+`copy-example-bridge-cdylib` execution and the `native-*` profiles in
+[`examples/pom.xml`](../examples/pom.xml), and the loader call in
+[`ExampleBridgeNative.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java)).
+For a multi-platform jar, build the cdylib per platform in CI and copy each
+into its own `<resource-prefix>/<os>/<arch>` directory before `mvn package` — the layout
+supports them side by side.
+
+**Shade your dependencies into one fat jar** with `maven-shade-plugin`, so
+users don't assemble a jar list:
+
+```xml
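+<plugin>
+  <groupId>org.apache.maven.plugins</groupId>
+  <artifactId>maven-shade-plugin</artifactId>
+  <executions>
+    <execution>
+      <phase>package</phase>
+      <goals><goal>shade</goal></goals>
+      <configuration>
+        <transformers>
+          <!-- merge META-INF/services files so every DataSourceRegister entry survives shading -->
+          <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+        </transformers>
+        <!-- drop dependency signature files; they no longer match the merged jar -->
+        <filters>
+          <filter>
+            <artifact>*:*</artifact>
+            <excludes>
+              <exclude>META-INF/*.SF</exclude>
+              <exclude>META-INF/*.DSA</exclude>
+              <exclude>META-INF/*.RSA</exclude>
+            </excludes>
+          </filter>
+        </filters>
+      </configuration>
+    </execution>
+  </executions>
+</plugin>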
+```
+
+Include in the shaded jar: this connector (`datafusion-java-spark`), the core
+jar (`datafusion-java` — exception classes and, if you push predicates, the
+generated proto classes), the Arrow Java artifacts you compile against, and
+your own classes + cdylib. Keep `spark-sql`/`scala-library` `provided` — the
+cluster supplies them.
+
+**Do NOT relocate JNI-bound or JNI-loading packages.** JNI binds native
+methods by the class's fully-qualified name; `arrow-c-data` and the Arrow
+memory modules likewise load their own natives. Relocating
+`io.datafusion.spark`, `org.apache.arrow`, or your own native class breaks
+the symbol lookup at runtime. Practical consequences:
+
+- Ship a plain (unrelocated) fat jar. Two bridges in one Spark app then share
+ one copy of the connector classes — fine when they're built against the
+ same connector version, which is the only configuration we support anyway
+ (their cdylibs stay distinct via per-bridge JNI class names).
+- Spark bundles its own (often older) Arrow. Since yours can't be relocated
+ away, have users set `spark.executor.userClassPathFirst=true` and
+ `spark.driver.userClassPathFirst=true` (the pyspark demo under
+ [`examples/python/`](../examples/python/) shows the working incantation),
+ or build with Arrow pinned to the cluster's version.
+
+## Spark tasks vs. DataFusion partitions
+
+This is the most important design decision when building a connector, so it
+gets its own section.
+
+Spark parallelism and DataFusion parallelism are different things:
+
+- A **Spark task** is the unit Spark schedules onto an executor core. Each
+ task carries fixed overhead: scheduling on the driver, (de)serializing the
+ task, instantiating your factory, building a provider, planning a scan.
+- A **DataFusion partition** is one output stream of a planned physical
+ query. A single plan usually has several.
+
+The connector supports two ways of mapping one onto the other:
+
+### Default mode: one Spark task per `PartitionInfo`
+
+`listPartitions` returns N entries → Spark runs N tasks. Each task calls
+`createScan(opts, partitionBytes, ...)` with *its own* entry's payload, so each
+task plans and scans only its slice. If DataFusion happens to plan that slice
+into multiple internal partitions, they are merged into one stream for the
+task — within a task there is no extra parallelism, by design (the
+parallelism budget belongs to Spark).
+
+You control the mapping entirely through what you return from
+`listPartitions`. Sizing guidance:
+
+- **Don't emit one `PartitionInfo` per tiny fragment.** A Spark task should
+ do meaningfully more work than its overhead — as a rule of thumb at least
+ ~100 ms of scan time, or order-100 MB of data (Spark's own file sources
+ default to 128 MB per task for the same reason). If your natural unit is a
+ small chunk (an object-store key, a time slice, a recording segment),
+ **bin-pack several into one entry**: `partitionBytes` is opaque, so encode
+ a *list* of chunk ids and have your provider builder materialise all of
+ them in one provider.
+- **Watch the total task count.** The Spark driver schedules and tracks every
+ task; beyond the low thousands of tasks per stage you pay growing driver
+ CPU/memory and UI lag for no extra throughput once the cluster's cores are
+ saturated. A healthy target is roughly 2–3 tasks per available core, and
+ rarely more than a few thousand per scan. Tens of thousands of
+ single-digit-megabyte tasks is a smell — bin-pack first.
+- **Locality and partition keys only exist here.** `preferredLocations`
+ (host affinity) and `HasPartitionKey`/`reportPartitioning` (shuffle
+ elision) are properties of `PartitionInfo` entries. If you need either,
+ use this mode.
+
+### Shared-scan mode: one Spark task per DataFusion partition
+
+When provider construction itself is expensive (remote metadata, connection
+setup) or the dataset has thousands of small natural partitions, per-task
+provider builds dominate. Opting in via
+
+```java
+@Override
+public boolean sharedScan(byte[] optionsBytes) { return true; }
+```
+
+flips the mapping: the provider is built **once per executor JVM per query**
+(with empty `partitionBytes`), planned once, and Spark runs one task per
+*DataFusion output partition* — task `i` streams plan partition `i` from the
+executor-local cached plan. `listPartitions` is not called at all.
+
+The DataFusion partition count — and therefore the Spark task count — is
+pinned by `spark.datafusion.sharedScan.targetPartitions` (default 8). The
+value is resolved on the driver and shipped to executors, because
+DataFusion's default would vary with each machine's core count and the
+partition indices must mean the same thing everywhere.
+
+Choosing between the modes:
+
+| Choose | When |
+|--------|------|
+| Default (per-partition payload) | slices have host affinity, you want partition-key semantics, per-slice provider construction is cheap. Bin-pack small slices before abandoning this mode. |
+| Shared-scan | provider construction is expensive, there are thousands of small partitions with no locality story, the workload is scan + filter + projection. Provider builds drop from one-per-task to one-per-executor (plus one driver probe per query). |
+
+Shared-scan's price of admission is a **determinism contract**: the
+provider's schema, partitioning, and per-partition contents must be a pure
+function of `optionsBytes`. Remote sources must pin a snapshot
+(version/timestamp) inside the options. The connector fails tasks when an
+executor's partition count diverges from the driver's, but equal counts with
+different contents are undetectable by construction. The provider's
+`ExecutionPlan` must also tolerate `execute(i)` being called more than once
+per plan instance (Spark retries and speculatively re-executes tasks). Full
+contract: `BridgeProviderFactory.sharedScan` javadoc.
+
+Shared-scan operational details:
+
+- Executor cache ([`SharedScanCache.scala`](src/main/scala/io/datafusion/spark/SharedScanCache.scala)):
+ entries keyed per query (`scanId`), refcounted by open readers, evicted
+ after an idle TTL. Build failures are not cached; eviction between task
+ waves just rebuilds.
+- Spark conf (read on the driver at planning time, shipped to executors):
+ - `spark.datafusion.sharedScan.targetPartitions` (default 8)
+ - `spark.datafusion.sharedScan.batchSize` (default 8192)
+ - `spark.datafusion.sharedScan.idleTtlMs` (default 120000)
+
+## What the connector does for you
+
+- **Schema inference** — your provider's Arrow schema, widened, becomes the
+ Spark schema. Driver-side, one probe build with empty `partitionBytes`.
+- **Type widening** — Spark's columnar readers reject several Arrow types
+ DataFusion happily produces. The SDK (inside your bridge's cdylib)
+ transparently casts
+ unsigned ints → wider signed, `Float16` → `Float32`, `Time*` → wider ints,
+ any-unit/tz `Timestamp` → microsecond, recursively through
+ `List`/`LargeList`/`FixedSizeList` (see
+ [`bridge/src/widening.rs`](bridge/src/widening.rs)). Caveat: unsigned types
+ nested inside `Struct`/`Map` are not yet covered.
+- **Predicate pushdown** — Spark V2 `Predicate`s are translated to DataFusion
+ expressions ([`SparkPredicateTranslator.scala`](src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala)),
+ shipped as `datafusion-proto` bytes, and applied inside the native plan, so
+ your provider's `supports_filters_pushdown`/`scan` sees real Rust `Expr`s.
+ Anything untranslatable stays in Spark as a residual filter — over-claiming
+ is impossible by construction.
+- **Column pruning** — Spark's required-columns projection becomes a
+ DataFusion projection on the native plan.
+- **Partition-aware joins/aggregations** (default mode, optional) — declare
+ `reportPartitioning` + per-partition key values and Spark can elide
+ shuffles. See the javadoc on
+ [`ReportedPartitioning.java`](src/main/java/io/datafusion/spark/ReportedPartitioning.java)
+ and [`PartitionInfo.java`](src/main/java/io/datafusion/spark/PartitionInfo.java);
+ note Spark 3.3+ additionally requires
+ `spark.sql.sources.v2.bucketing.enabled=true` for storage-partitioned
+ joins.
+
+## What runs where
+
+| Phase | Where | Path |
+| ----- | ----- | ---- |
+| Schema inference | Driver | `factory.encodeOptions` → `backend.providerSchemaIpc(opts, EMPTY)` — bridge cdylib builds + widens the provider, returns the Arrow schema |
+| Scan planning (default mode) | Driver | `factory.listPartitions(opts[, filters])` → one task per entry, with its `partitionBytes` + `preferredLocations` |
+| Scan planning (shared-scan) | Driver | probe build (same code path executors use) → plan partition count `N` → `N` tasks |
+| Predicate translation | Driver | `SparkPredicateTranslator` → proto bytes per pushed predicate |
+| Per-task scan (default mode) | Executor | `backend.createScan(opts, partitionBytes, ...)` (build provider, widen, project, filter, plan) → stream whole plan |
+| Per-task scan (shared-scan) | Executor | cache-acquire by `scanId` (first task builds) → stream plan partition `i` → release |
+
+The native machinery backing all of this is
+[`bridge/src/scan.rs`](bridge/src/scan.rs), exported into each bridge's
+cdylib by `export_bridge!` and reached through its [`ScanBackend`](src/main/java/io/datafusion/spark/ScanBackend.java).
+
+## Module layout
+
+```
+spark/
+├── src/main/java/io/datafusion/spark/    public SPI (Java on purpose:
+│                                         bridge jars stay Scala-free)
+│   BridgeProviderFactory.java            <- the contract you implement
+│   ScanBackend.java                      <- native scan surface (delegations
+│                                            to your bridge's JNI class)
+│   NativeLibraryLoader.java              <- bundled-cdylib extraction/loading
+│   PartitionInfo.java                    <- one entry = one Spark task
+│   ReportedPartitioning.java             <- optional shuffle-elision declaration
+├── src/main/scala/io/datafusion/spark/   connector internals (DSv2 wiring,
+│                                         readers, pushdown, shared-scan cache)
+└── bridge/                               datafusion-spark-bridge SDK rlib:
+                                          widening + scan machinery +
+                                          export_bridge! (the native side of
+                                          every bridge cdylib)
+```
+
+## Caveats
+
+- Pushed filter expressions are deserialized with DataFusion's default
+ logical-extension codec, which covers columns, literals, and built-in
+ functions. Anything the Spark-side translator can't express stays in Spark
+ as a residual filter, so coverage gaps cost performance, never
+ correctness.
+- The bridge cdylib's DataFusion version is the SDK's: cargo resolves one
+ `datafusion` for your provider and the scan machinery together, pinned in
+ this repo's workspace [`Cargo.toml`](../Cargo.toml). Upgrading DataFusion
+ means rebuilding the bridge against a newer SDK.
+- The SDK's Tokio runtime is per-cdylib and `Once`-gated; TLS-using bridges
+ should `Once`-gate their rustls install the same way.
diff --git a/spark/bridge/Cargo.toml b/spark/bridge/Cargo.toml
new file mode 100644
index 0000000..8ed4684
--- /dev/null
+++ b/spark/bridge/Cargo.toml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion-spark-bridge"
+version = "0.1.0"
+edition = "2021"
+publish = false
+description = "SDK for building Spark connector bridges over DataFusion TableProviders"
+
+[dependencies]
+arrow = { workspace = true }
+async-trait = { workspace = true }
+datafusion = { workspace = true }
+datafusion-jni-common = { path = "../../native-common" }
+datafusion-proto = { workspace = true }
+futures = { workspace = true }
+jni = { workspace = true }
+prost = { workspace = true }
+tokio = { workspace = true }
diff --git a/spark/bridge/src/lib.rs b/spark/bridge/src/lib.rs
new file mode 100644
index 0000000..52ef1c1
--- /dev/null
+++ b/spark/bridge/src/lib.rs
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! SDK for building Spark connector bridges over DataFusion `TableProvider`s.
+//!
+//! Everything the Spark connector needs DataFusion-side lives here: the
+//! Spark-type [`widening`] layer, and the [`scan`] machinery (session from
+//! pinned config, projection, proto filters, planning, partition streams).
+//! A bridge cdylib depends on this crate and invokes [`export_bridge!`] with
+//! a builder that constructs its concrete `TableProvider` from option /
+//! partition bytes — one cdylib, no FFI provider boundary; the only foreign
+//! interface is JNI plus Arrow's C stream for the results.
+
+pub mod options;
+pub mod scan;
+pub mod widening;
+
+// Re-exported so `export_bridge!` expansions resolve these crates inside the
+// bridge author's crate without extra dependencies, and so builder signatures
+// can be written against `datafusion_spark_bridge::datafusion::...`.
+pub use datafusion;
+pub use datafusion_jni_common::errors::JniResult;
+pub use jni;
+
+use tokio::runtime::Handle;
+
+/// Execution environment handed to a bridge's provider builder.
+///
+/// Provider construction frequently needs async IO (remote catalogs,
+/// object-store metadata); run it on the bridge runtime via [`block_on`]
+/// rather than creating a runtime of your own.
+///
+/// [`block_on`]: BridgeContext::block_on
+pub struct BridgeContext {
+ handle: &'static Handle, // filled from `runtime_handle()` by `BridgeContext::get`
+}
+
+impl BridgeContext {
+    /// Used by `export_bridge!` expansions; not part of the public API.
+    #[doc(hidden)]
+    pub fn get() -> Self {
+        Self { handle: runtime_handle() }
+    }
+
+    /// The cdylib-wide Tokio runtime handle (also the runtime scans run on).
+    pub fn handle(&self) -> &Handle {
+        self.handle
+    }
+
+    /// Block the current (JVM) thread on `fut`, driving it on the bridge
+    /// runtime, and return the future's output. Call it only from a
+    /// non-async thread: Tokio's `Handle::block_on` panics when invoked
+    /// from inside a runtime context.
+    pub fn block_on<F: std::future::Future>(&self, fut: F) -> F::Output {
+        self.handle.block_on(fut)
+    }
+}
+
+/// Handle to the per-cdylib Tokio runtime (the singleton from `datafusion-jni-common`).
+pub(crate) fn runtime_handle() -> &'static Handle {
+ datafusion_jni_common::runtime().handle()
+}
+
+/// Generate the JNI entry points for a bridge cdylib.
+///
+/// `jni_class` is the **underscore-mangled** binary name of the Java class
+/// declaring the matching `native` methods: dots become underscores
+/// (`com.example.mybridge.BridgeNative` → `"com_example_mybridge_BridgeNative"`).
+/// If the class or package name itself contains an underscore, JNI mangling
+/// requires it written as `_1`. Per-bridge class names are what let several
+/// bridges coexist in one Spark JVM.
+///
+/// `build_provider` is anything callable as
+/// `Fn(&BridgeContext, &[u8], &[u8]) -> JniResult<Arc<dyn TableProvider>>`,
+/// receiving the options bytes and partition bytes your JVM factory encoded.
+/// The schema probe calls it with empty partition bytes; the scan path passes
+/// each task's payload. Return errors boxed from `DataFusionError` to surface
+/// as the typed `org.apache.datafusion.*` exception hierarchy.
+///
+/// The generated Java-side surface (declare these as `static native` on the
+/// class named by `jni_class`):
+///
+/// ```java
+/// static native byte[] providerSchemaIpc(byte[] options, byte[] partition);
+/// static native long createScan(byte[] options, byte[] partition,
+///     int targetPartitions, int batchSize, String[] optionKeys,
+///     String[] optionValues, String[] projectionColumns, byte[][] filterProtos);
+/// static native int partitionCount(long scanHandle);
+/// static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr);
+/// static native void executeStream(long scanHandle, long ffiStreamAddr);
+/// static native void closeScan(long scanHandle);
+/// ```
+#[macro_export]
+macro_rules! export_bridge {
+    (jni_class: $cls:literal, build_provider: $builder:expr $(,)?) => {
+        const _: () = {
+            use $crate::jni::objects::{JByteArray, JClass, JObjectArray};
+            use $crate::jni::sys::{jbyteArray, jint, jlong};
+            use $crate::jni::JNIEnv;
+
+            fn __df_bridge_build(
+                env: &mut JNIEnv,
+                options: &JByteArray,
+                partition: &JByteArray,
+            ) -> $crate::JniResult<::std::sync::Arc<dyn $crate::datafusion::catalog::TableProvider>>
+            {
+                let opts: Vec<u8> = if options.is_null() {
+                    Vec::new() // a null Java byte[] is handed to the builder as empty bytes
+                } else {
+                    env.convert_byte_array(options)?
+                };
+                let part: Vec<u8> = if partition.is_null() {
+                    Vec::new() // same null-to-empty rule for the partition payload
+                } else {
+                    env.convert_byte_array(partition)?
+                };
+                let ctx = $crate::BridgeContext::get();
+                ($builder)(&ctx, opts.as_slice(), part.as_slice())
+            }
+
+            #[export_name = concat!("Java_", $cls, "_providerSchemaIpc")]
+            extern "system" fn __df_bridge_provider_schema_ipc<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                options: JByteArray<'local>,
+                partition: JByteArray<'local>,
+            ) -> jbyteArray {
+                $crate::scan::provider_schema_ipc(&mut env, |env| {
+                    __df_bridge_build(env, &options, &partition)
+                })
+            }
+
+            #[export_name = concat!("Java_", $cls, "_createScan")]
+            #[allow(clippy::too_many_arguments)] // parameter list mirrors the Java `createScan` signature
+            extern "system" fn __df_bridge_create_scan<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                options: JByteArray<'local>,
+                partition: JByteArray<'local>,
+                target_partitions: jint,
+                batch_size: jint,
+                option_keys: JObjectArray<'local>,
+                option_values: JObjectArray<'local>,
+                projection_columns: JObjectArray<'local>,
+                filter_protos: JObjectArray<'local>,
+            ) -> jlong {
+                $crate::scan::create_scan(
+                    &mut env,
+                    |env| __df_bridge_build(env, &options, &partition),
+                    target_partitions,
+                    batch_size,
+                    &option_keys,
+                    &option_values,
+                    &projection_columns,
+                    &filter_protos,
+                )
+            }
+
+            #[export_name = concat!("Java_", $cls, "_partitionCount")]
+            extern "system" fn __df_bridge_partition_count<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                handle: jlong,
+            ) -> jint {
+                $crate::scan::partition_count(&mut env, handle)
+            }
+
+            #[export_name = concat!("Java_", $cls, "_executeStreamPartition")]
+            extern "system" fn __df_bridge_execute_stream_partition<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                handle: jlong,
+                partition: jint,
+                ffi_stream_addr: jlong,
+            ) {
+                $crate::scan::execute_stream_partition(&mut env, handle, partition, ffi_stream_addr)
+            }
+
+            #[export_name = concat!("Java_", $cls, "_executeStream")]
+            extern "system" fn __df_bridge_execute_stream<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                handle: jlong,
+                ffi_stream_addr: jlong,
+            ) {
+                $crate::scan::execute_stream(&mut env, handle, ffi_stream_addr)
+            }
+
+            #[export_name = concat!("Java_", $cls, "_closeScan")]
+            extern "system" fn __df_bridge_close_scan<'local>(
+                mut env: JNIEnv<'local>,
+                _class: JClass<'local>,
+                handle: jlong,
+            ) {
+                $crate::scan::close_scan(&mut env, handle)
+            }
+        };
+    };
+}
diff --git a/spark/bridge/src/options.rs b/spark/bridge/src/options.rs
new file mode 100644
index 0000000..117ca9d
--- /dev/null
+++ b/spark/bridge/src/options.rs
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Decoder for the connector's default options wire format.
+//!
+//! `BridgeProviderFactory.encodeOptions`'s default (`OptionsCodec` on the JVM
+//! side) encodes the Spark options map as length-prefixed UTF-8 pairs,
+//! sorted by key: big-endian `i32` entry count, then per entry key length,
+//! key bytes, value length, value bytes. Key-sorting makes the bytes a pure
+//! function of the map contents — the shared-scan determinism contract uses
+//! the options bytes as the scan identity.
+//!
+//! Bridges using the default JVM encoding read their options here:
+//!
+//! ```ignore
+//! let opts = datafusion_spark_bridge::options::decode_options(options_bytes)?;
+//! let url = opts.get("url").ok_or("missing required option 'url'")?;
+//! ```
+//!
+//! The two implementations are pinned to each other by the shared fixture in
+//! the tests below; `OptionsCodecTest` on the JVM side asserts the same
+//! bytes.
+
+use std::collections::BTreeMap;
+
+/// Decode bytes produced by the JVM `OptionsCodec.encode` (or
+/// [`encode_options`]). Empty input decodes as an empty map; truncation, a
+/// negative length, non-UTF-8 text, or trailing bytes yield an `Err` message.
+pub fn decode_options(bytes: &[u8]) -> Result<BTreeMap<String, String>, String> {
+    let mut out = BTreeMap::new();
+    if bytes.is_empty() {
+        return Ok(out);
+    }
+    let mut cursor = Cursor { bytes, pos: 0 };
+    let count = cursor.read_len("entry count")?;
+    for i in 0..count {
+        let key = cursor.read_string(&format!("key of entry {i}"))?;
+        let value = cursor.read_string(&format!("value of entry {i}"))?;
+        out.insert(key, value);
+    }
+    // Leftover bytes mean the declared entry count disagrees with the payload.
+    let trailing = bytes.len() - cursor.pos;
+    if trailing != 0 {
+        return Err(format!("options blob has {trailing} trailing byte(s) after {count} entries"));
+    }
+    Ok(out)
+}
+
+/// Encode in the same format (key-sorted via `BTreeMap`). Primarily for
+/// tests and Rust-side tooling; production encoding normally happens on the
+/// JVM driver. Counts and lengths are narrowed to big-endian `i32` with `as`.
+pub fn encode_options(options: &BTreeMap<String, String>) -> Vec<u8> {
+    let mut out = Vec::new();
+    out.extend_from_slice(&(options.len() as i32).to_be_bytes());
+    for (key, value) in options {
+        for text in [key, value] {
+            out.extend_from_slice(&(text.len() as i32).to_be_bytes());
+            out.extend_from_slice(text.as_bytes());
+        }
+    }
+    out
+}
+
+/// Read position over the options blob; `pos` never passes `bytes.len()`.
+struct Cursor<'a> {
+    bytes: &'a [u8],
+    pos: usize,
+}
+
+impl Cursor<'_> {
+    /// Read a big-endian `i32` length; errors name `what` on truncation or a negative value.
+    fn read_len(&mut self, what: &str) -> Result<usize, String> {
+        let head = self.bytes[self.pos..].get(..4);
+        let head = head.ok_or_else(|| format!("options blob truncated reading {what}"))?;
+        let raw = i32::from_be_bytes(head.try_into().expect("slice is exactly 4 bytes"));
+        self.pos += 4;
+        usize::try_from(raw).map_err(|_| format!("negative length for {what}: {raw}"))
+    }
+
+    /// Read a length-prefixed string; errors name `what` on truncation or invalid UTF-8.
+    fn read_string(&mut self, what: &str) -> Result<String, String> {
+        let len = self.read_len(&format!("length of {what}"))?;
+        let raw = self.bytes[self.pos..].get(..len);
+        let raw = raw.ok_or_else(|| format!("options blob truncated reading {what}"))?;
+        self.pos += len;
+        String::from_utf8(raw.to_vec()).map_err(|e| format!("{what} is not UTF-8: {e}"))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Shared fixture: must stay byte-identical to the one asserted by the
+    /// JVM-side `OptionsCodecTest`. {"table": "t1", "url": "grpc://h:1"}
+    /// encodes (sorted: table < url) as below.
+    fn fixture_bytes() -> Vec<u8> {
+        let mut b = Vec::new();
+        b.extend_from_slice(&2i32.to_be_bytes());
+        for (k, v) in [("table", "t1"), ("url", "grpc://h:1")] {
+            b.extend_from_slice(&(k.len() as i32).to_be_bytes());
+            b.extend_from_slice(k.as_bytes());
+            b.extend_from_slice(&(v.len() as i32).to_be_bytes());
+            b.extend_from_slice(v.as_bytes());
+        }
+        b
+    }
+
+    #[test]
+    fn decodes_fixture() {
+        let map = decode_options(&fixture_bytes()).unwrap();
+        assert_eq!(map.len(), 2);
+        assert_eq!(map.get("table").map(String::as_str), Some("t1"));
+        assert_eq!(map.get("url").map(String::as_str), Some("grpc://h:1"));
+        assert_eq!(encode_options(&map), fixture_bytes()); // pins the encoder too
+    }
+
+    #[test]
+    fn round_trips() {
+        let mut map = BTreeMap::new();
+        map.insert("b".to_string(), "2".to_string());
+        map.insert("a".to_string(), "1".to_string());
+        map.insert("unicode".to_string(), "héllo→world".to_string());
+        let bytes = encode_options(&map);
+        assert_eq!(decode_options(&bytes).unwrap(), map);
+    }
+
+    #[test]
+    fn empty_input_is_empty_map() {
+        assert!(decode_options(&[]).unwrap().is_empty());
+        let empty = encode_options(&BTreeMap::new());
+        assert!(decode_options(&empty).unwrap().is_empty());
+    }
+
+    #[test]
+    fn rejects_truncation_and_trailing_bytes() {
+        let bytes = fixture_bytes();
+        let cut_short = decode_options(&bytes[..bytes.len() - 1]).unwrap_err();
+        assert!(cut_short.contains("truncated"));
+        let mut extended = bytes;
+        extended.push(0);
+        assert!(decode_options(&extended).unwrap_err().contains("trailing"));
+    }
+}
diff --git a/spark/bridge/src/scan.rs b/spark/bridge/src/scan.rs
new file mode 100644
index 0000000..ad27dff
--- /dev/null
+++ b/spark/bridge/src/scan.rs
@@ -0,0 +1,325 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Planning and execution of a Spark scan.
+//!
+//! Every function here is the body of one JNI entry point generated by a
+//! bridge's `export_bridge!` expansion, which supplies only how the provider
+//! is obtained, as a `make` closure. The provider is wrapped in a
+//! [`WideningTableProvider`] here, so every bridge gets identical
+//! Spark-compatible Arrow types.
+//!
+//! [`create_scan`] registers the widened provider on a private
+//! `SessionContext` built from the caller-pinned config, applies the pruned
+//! projection and the proto-encoded pushed filters, and plans exactly once.
+//! The returned handle supports:
+//!
+//! - [`partition_count`] — output partitions of the physical plan
+//! (shared-scan mode probes this on the driver and indexes tasks by it);
+//! - [`execute_stream_partition`] — an independent stream over ONE plan
+//! partition, concurrently callable from multiple JVM threads
+//! (`ExecutionPlan` and `TaskContext` are `Send + Sync`; each call only
+//! clones their `Arc`s). Re-executing the same partition index (Spark
+//! task retry / speculative execution) opens its own stream, but only
+//! succeeds when every operator in that partition's pipeline supports
+//! repeated `execute()` — stateless scans do, `RepartitionExec`
+//! pipelines do not;
+//! - [`execute_stream`] — the whole plan as one stream (per-partition
+//! mode, where the provider itself is the task's slice);
+//! - [`close_scan`] — drop the plan. The single unsafe interleaving is
+//! closing a handle that still has an in-flight call; the Java consumer
+//! (the shared-scan cache) prevents it with a refcount covering every
+//! open reader.
+//!
+//! Pinned-config determinism: the driver resolves `target_partitions` /
+//! `batch_size` / option overrides once and ships them to every executor, so
+//! a plan that yields N partitions on the driver yields N everywhere. This
+//! module applies whatever it is handed and stays policy-free.
+
+use std::sync::Arc;
+
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream;
+use datafusion::arrow::ipc::writer::StreamWriter;
+use datafusion::catalog::TableProvider;
+use datafusion::dataframe::DataFrame;
+use datafusion::execution::TaskContext;
+use datafusion::physical_plan::{execute_stream as df_execute_stream, ExecutionPlan};
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult};
+use datafusion_jni_common::StreamingReader;
+use datafusion_proto::logical_plan::from_proto::parse_expr;
+use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec;
+use datafusion_proto::protobuf::LogicalExprNode;
+use jni::objects::{JByteArray, JObjectArray, JString};
+use jni::sys::{jbyteArray, jint, jlong};
+use jni::JNIEnv;
+use prost::Message;
+
+use crate::runtime_handle;
+use crate::widening::WideningTableProvider;
+
+/// Name under which the scan's single provider is registered on its private
+/// `SessionContext`. It never surfaces in SQL — the plan is built through the
+/// DataFrame API — so quoting and name collisions are not a concern.
+const SCAN_TABLE_NAME: &str = "df_spark_scan";
+
+/// One planned scan, owned through the raw `jlong` handle `create_scan` returns.
+struct ScanState {
+    /// Kept alive for the plan's lifetime (registered provider, runtime env).
+    _ctx: SessionContext,
+    plan: Arc<dyn ExecutionPlan>,
+    task_ctx: Arc<TaskContext>,
+}
+
+fn widen(provider: Arc<dyn TableProvider>) -> Arc<dyn TableProvider> {
+    Arc::new(WideningTableProvider::new(provider)) // every bridge gets the same wrapper
+}
+
+fn collect_string_array(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult<Vec<String>> {
+    if arr.is_null() {
+        return Ok(Vec::new()); // a null Java array reads as empty
+    }
+    let len = env.get_array_length(arr)?;
+    let mut owned: Vec<String> = Vec::with_capacity(len as usize);
+    for i in 0..len {
+        let elem = env.get_object_array_element(arr, i)?;
+        let jstr = env.auto_local(JString::from(elem)); // local ref freed every pass
+        owned.push(env.get_string(&jstr)?.into());
+    }
+    Ok(owned)
+}
+
+fn collect_byte_arrays(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult<Vec<Vec<u8>>> {
+    if arr.is_null() {
+        return Ok(Vec::new()); // a null Java array reads as empty
+    }
+    let len = env.get_array_length(arr)?;
+    let mut owned: Vec<Vec<u8>> = Vec::with_capacity(len as usize);
+    for i in 0..len {
+        let elem = env.get_object_array_element(arr, i)?;
+        let bytes = env.auto_local(JByteArray::from(elem)); // local ref freed every pass
+        owned.push(env.convert_byte_array(&*bytes)?);
+    }
+    Ok(owned)
+}
+
+/// Driver-side schema probe: widened Arrow schema of the provider, as IPC
+/// bytes (deserialized JVM-side with `MessageSerializer.deserializeSchema`).
+/// `make` runs once; the provider drops before returning.
+pub fn provider_schema_ipc(
+    env: &mut JNIEnv,
+    make: impl FnOnce(&mut JNIEnv) -> JniResult<Arc<dyn TableProvider>>,
+) -> jbyteArray {
+    try_unwrap_or_throw(env, std::ptr::null_mut(), |env| -> JniResult<jbyteArray> {
+        let schema = widen(make(env)?).schema();
+        let mut buf: Vec<u8> = Vec::new();
+        {
+            // Scoped so the writer's borrow of `buf` ends before the bytes are copied.
+            let mut writer = StreamWriter::try_new(&mut buf, schema.as_ref())?;
+            writer.finish()?;
+        }
+        let arr = env.byte_array_from_slice(&buf)?;
+        Ok(arr.into_raw())
+    })
+}
+
+/// Build the scan: widen the provider from `make`, register it on a private
+/// context with the pinned config, apply projection + pushed filters, plan
+/// once.
+///
+/// `target_partitions` / `batch_size` <= 0 leave the DataFusion defaults;
+/// `option_keys`/`option_values` are parallel arrays of config overrides;
+/// empty `projection_columns` selects all columns; each element of
+/// `filter_protos` is a serialized `datafusion.LogicalExprNode`.
+#[allow(clippy::too_many_arguments)]
+pub fn create_scan(
+    env: &mut JNIEnv,
+    make: impl FnOnce(&mut JNIEnv) -> JniResult<Arc<dyn TableProvider>>,
+    target_partitions: jint,
+    batch_size: jint,
+    option_keys: &JObjectArray,
+    option_values: &JObjectArray,
+    projection_columns: &JObjectArray,
+    filter_protos: &JObjectArray,
+) -> jlong {
+    try_unwrap_or_throw(env, 0, |env| -> JniResult<jlong> {
+        let widened = widen(make(env)?);
+
+        let keys = collect_string_array(env, option_keys)?;
+        let values = collect_string_array(env, option_values)?;
+        if keys.len() != values.len() {
+            return Err(format!(
+                "option key/value arrays differ in length: {} vs {}",
+                keys.len(),
+                values.len()
+            )
+            .into());
+        }
+        let projection = collect_string_array(env, projection_columns)?;
+        let filters = collect_byte_arrays(env, filter_protos)?;
+
+        let mut config = SessionConfig::new();
+        if target_partitions > 0 {
+            config = config.with_target_partitions(target_partitions as usize);
+        }
+        if batch_size > 0 {
+            config = config.with_batch_size(batch_size as usize);
+        }
+        for (key, value) in keys.iter().zip(values.iter()) {
+            config.options_mut().set(key, value)?;
+        }
+
+        let ctx = SessionContext::new_with_config(config);
+        ctx.register_table(SCAN_TABLE_NAME, widened)?;
+
+        let mut df: DataFrame = runtime_handle().block_on(ctx.table(SCAN_TABLE_NAME))?;
+        if !projection.is_empty() {
+            let refs: Vec<&str> = projection.iter().map(String::as_str).collect();
+            df = df.select_columns(&refs)?;
+        }
+        // TaskContext implements FunctionRegistry; the default codec is
+        // enough because the translator only emits column/literal/builtin
+        // expressions. One context is built up front and reused for every
+        // filter and for execution, before filter()/planning consume df.
+        let task_ctx = df.task_ctx();
+        for bytes in &filters {
+            let node = LogicalExprNode::decode(bytes.as_slice())?;
+            let expr = parse_expr(&node, &task_ctx, &DefaultLogicalExtensionCodec {})?;
+            df = df.filter(expr)?;
+        }
+
+        let task_ctx = Arc::new(task_ctx);
+        let plan = runtime_handle().block_on(df.create_physical_plan())?;
+
+        let state = ScanState {
+            _ctx: ctx,
+            plan,
+            task_ctx,
+        };
+        Ok(Box::into_raw(Box::new(state)) as jlong)
+    })
+}
+
+/// Output partition count of the planned physical plan.
+///
+/// A zero `handle` is reported through `try_unwrap_or_throw` (fallback value 0).
+pub fn partition_count(env: &mut JNIEnv, handle: jlong) -> jint {
+    try_unwrap_or_throw(env, 0, |_env| -> JniResult<jint> {
+        if handle == 0 {
+            return Err("scan handle is null".into());
+        }
+        // SAFETY: relies on `handle` being a live pointer from `create_scan`.
+        let state = unsafe { &*(handle as *const ScanState) };
+        let partitioning = state.plan.properties().output_partitioning();
+        Ok(partitioning.partition_count() as jint)
+    })
+}
+
+/// Stream a single plan partition: export it as an `FFI_ArrowArrayStream`
+/// written into the caller-allocated struct at `ffi_stream_addr`. A zero
+/// handle, a zero address, or an out-of-range `partition` is returned as an
+/// error to `try_unwrap_or_throw`.
+pub fn execute_stream_partition(
+    env: &mut JNIEnv,
+    handle: jlong,
+    partition: jint,
+    ffi_stream_addr: jlong,
+) {
+    try_unwrap_or_throw(env, (), |_env| -> JniResult<()> {
+        if handle == 0 {
+            return Err("scan handle is null".into());
+        }
+        if ffi_stream_addr == 0 {
+            return Err("ffi stream address is null".into());
+        }
+        // SAFETY: assumes `handle` is a live `ScanState` pointer from `create_scan`.
+        let ScanState { plan, task_ctx, .. } = unsafe { &*(handle as *const ScanState) };
+
+        let total = plan.properties().output_partitioning().partition_count();
+        // A negative index fails the conversion; a too-large one fails the guard.
+        let index = match usize::try_from(partition) {
+            Ok(index) if index < total => index,
+            _ => {
+                let msg = format!(
+                    "partition index {partition} out of range: plan has {total} partition(s)"
+                );
+                return Err(msg.into());
+            }
+        };
+
+        let schema: SchemaRef = plan.schema();
+
+        // `execute` itself is synchronous, yet operators may tokio::spawn while
+        // it runs (RepartitionExec et al.), which needs an entered runtime
+        // context; the guard is scoped to just that call.
+        let stream = {
+            let _guard = runtime_handle().enter();
+            plan.execute(index, Arc::clone(task_ctx))?
+        };
+
+        let reader = StreamingReader { schema, stream };
+        let out = ffi_stream_addr as *mut FFI_ArrowArrayStream;
+        // SAFETY: assumes the address is a writable, aligned `FFI_ArrowArrayStream`
+        // slot owned by the caller; `write` does not read or drop the old bytes.
+        unsafe { out.write(FFI_ArrowArrayStream::new(Box::new(reader))) };
+        Ok(())
+    })
+}
+
+/// Per-partition mode: run the whole plan as ONE stream (the provider itself
+/// is the task's slice, so all plan partitions merge into one reader) and
+/// export it into the caller-allocated struct at `ffi_stream_addr`.
+pub fn execute_stream(env: &mut JNIEnv, handle: jlong, ffi_stream_addr: jlong) {
+    try_unwrap_or_throw(env, (), |_env| -> JniResult<()> {
+        if handle == 0 {
+            return Err("scan handle is null".into());
+        }
+        if ffi_stream_addr == 0 {
+            return Err("ffi stream address is null".into());
+        }
+
+        // SAFETY: assumes `handle` is a live `ScanState` pointer from `create_scan`.
+        let ScanState { plan, task_ctx, .. } = unsafe { &*(handle as *const ScanState) };
+        let schema: SchemaRef = plan.schema();
+
+        // The helper coalesces multi-partition plans behind one stream; the
+        // runtime context is entered only for the duration of that call.
+        let stream = {
+            let _guard = runtime_handle().enter();
+            df_execute_stream(Arc::clone(plan), Arc::clone(task_ctx))?
+        };
+
+        let reader = StreamingReader { schema, stream };
+        let out = ffi_stream_addr as *mut FFI_ArrowArrayStream;
+        // SAFETY: assumes the address is a writable, aligned slot owned by the caller.
+        unsafe { out.write(FFI_ArrowArrayStream::new(Box::new(reader))) };
+        Ok(())
+    })
+}
+
+/// Release the planned scan. The caller must not race an in-flight stream-open
+/// on this handle (the Java consumer's refcount enforces it) nor close it twice.
+pub fn close_scan(env: &mut JNIEnv, handle: jlong) {
+    try_unwrap_or_throw(env, (), |_env| -> JniResult<()> {
+        if handle == 0 {
+            return Err("scan handle is null".into());
+        }
+        unsafe { drop(Box::from_raw(handle as *mut ScanState)) };
+        Ok(())
+    })
+}
diff --git a/spark/bridge/src/widening.rs b/spark/bridge/src/widening.rs
new file mode 100644
index 0000000..86c4abf
--- /dev/null
+++ b/spark/bridge/src/widening.rs
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Kernel-level Arrow type widening for Spark consumption.
+//!
+//! Spark 3.5's `ArrowColumnVector` has no accessor for unsigned ints, Time*,
+//! Float16, or non-microsecond Timestamp. The widening machinery here wraps
+//! an inner `TableProvider` with one that exposes a "widened" schema —
+//! UInt8/16/32→next wider Int, UInt64→Int64, Float16→Float32, Time32/64→
+//! Int32/64, Timestamp(*, tz)→Timestamp(Microsecond, tz), recursing into
+//! List/LargeList children, FixedSizeList→List — and applies `arrow::compute::cast`
+//! to each produced RecordBatch column-wise. No SQL, no SessionContext, no view machinery.
+
+use std::any::Any;
+use std::fmt;
+use std::sync::Arc;
+
+use arrow::array::RecordBatch;
+use arrow::compute::cast;
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef, TimeUnit};
+use async_trait::async_trait;
+use datafusion::catalog::{Session, TableProvider};
+use datafusion::common::{DataFusionError, Result};
+use datafusion::execution::TaskContext;
+use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType};
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::{
+ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream,
+};
+use futures::stream::StreamExt;
+
+/// Compute the cast-target DataType for an Arrow type not directly readable
+/// by Spark's `ArrowColumnVector`. Returns `None` if the type passes through.
+///
+/// Only List/LargeList/FixedSizeList children are inspected; every other
+/// nested type (structs, maps, ...) hits the catch-all arm and passes through.
+pub fn arrow_cast_widening(dt: &DataType) -> Option<DataType> {
+    match dt {
+        DataType::UInt8 => Some(DataType::Int16),
+        DataType::UInt16 => Some(DataType::Int32),
+        DataType::UInt32 => Some(DataType::Int64),
+        // UInt64 → Int64: lossy for values ≥ 2^63. Documented in REARCHITECTURE.md.
+        DataType::UInt64 => Some(DataType::Int64),
+        DataType::Float16 => Some(DataType::Float32),
+        DataType::Time32(_) => Some(DataType::Int32),
+        DataType::Time64(_) => Some(DataType::Int64),
+        // Microsecond timestamps pass through; other units retarget to it, zone kept.
+        DataType::Timestamp(TimeUnit::Microsecond, _) => None,
+        DataType::Timestamp(_, tz) => Some(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())),
+        // Lists keep their container kind and only change when the child widens.
+        DataType::List(field) => arrow_cast_widening(field.data_type())
+            .map(|inner| DataType::List(widened_child(field, inner))),
+        DataType::LargeList(field) => arrow_cast_widening(field.data_type())
+            .map(|inner| DataType::LargeList(widened_child(field, inner))),
+        // Spark 3.5's ArrowColumnVector cannot read FixedSizeList at all, so
+        // always convert it to a (variable) List — which Spark maps to
+        // ArrayType — widening the child element type when needed too.
+        DataType::FixedSizeList(field, _size) => {
+            let child = match arrow_cast_widening(field.data_type()) {
+                Some(inner) => widened_child(field, inner),
+                None => Arc::clone(field),
+            };
+            Some(DataType::List(child))
+        }
+        _ => None,
+    }
+}
+
+fn widened_child(field: &Arc<Field>, new_type: DataType) -> Arc<Field> {
+    Arc::new(field.as_ref().clone().with_data_type(new_type)) // keeps name, nullability, metadata
+}
+
+/// Build the widened schema by walking inner fields and replacing types.
+/// Returns the widened schema plus per-column target types (None where no cast).
+fn widened_schema(inner: &ArrowSchema) -> (SchemaRef, Vec